diff --git a/.gitignore b/.gitignore index 922116d12..9350e0616 100644 --- a/.gitignore +++ b/.gitignore @@ -188,3 +188,4 @@ cov.xml *-structure.json *.dot .python-version +/data/inference diff --git a/sub_swebench_dataset/readme.md b/benchmark/sub_swebench_dataset/readme.md similarity index 100% rename from sub_swebench_dataset/readme.md rename to benchmark/sub_swebench_dataset/readme.md diff --git a/sub_swebench_dataset/scikit-learn-68.csv b/benchmark/sub_swebench_dataset/scikit-learn-68.csv similarity index 100% rename from sub_swebench_dataset/scikit-learn-68.csv rename to benchmark/sub_swebench_dataset/scikit-learn-68.csv diff --git a/sub_swebench_dataset/sub_swebench.csv b/benchmark/sub_swebench_dataset/sub_swebench.csv similarity index 100% rename from sub_swebench_dataset/sub_swebench.csv rename to benchmark/sub_swebench_dataset/sub_swebench.csv diff --git a/swe_bench/__init__.py b/benchmark/swe_bench/__init__.py similarity index 100% rename from swe_bench/__init__.py rename to benchmark/swe_bench/__init__.py diff --git a/benchmark/swe_bench/data/load_dataset.py b/benchmark/swe_bench/data/load_dataset.py new file mode 100644 index 000000000..715d33c2f --- /dev/null +++ b/benchmark/swe_bench/data/load_dataset.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +# @Author : stellahong (stellahong@fuzhi.ai) +# @Desc : +from pathlib import Path + +import numpy as np +from datasets import load_dataset, load_from_disk + +from benchmark.swe_bench.inference.const import SCIKIT_LEARN_IDS + + +def load_oracle_dataset(dataset_name_or_path: str = "", split: str = "test", existing_ids: list = []): + if Path(dataset_name_or_path).exists(): + dataset = load_from_disk(dataset_name_or_path) + else: + dataset = load_dataset(dataset_name_or_path) + if split not in dataset: + raise ValueError(f"Invalid split {split} for dataset {dataset_name_or_path}") + dataset = dataset[split] + lens = np.array(list(map(len, dataset["text"]))) + dataset = dataset.select(np.argsort(lens)) + + if 
existing_ids: + dataset = dataset.filter( + lambda x: x["instance_id"] not in existing_ids, + desc="Filtering out existing ids", + load_from_cache_file=False, + ) + if SCIKIT_LEARN_IDS: + dataset = dataset.filter( + lambda x: x["instance_id"] in SCIKIT_LEARN_IDS, + desc="Filtering out subset_instance_ids", + load_from_cache_file=False, + ) + return dataset diff --git a/swe_bench/inference/__init__.py b/benchmark/swe_bench/data/repos/__init__.py similarity index 100% rename from swe_bench/inference/__init__.py rename to benchmark/swe_bench/data/repos/__init__.py diff --git a/swe_bench/gitagent.py b/benchmark/swe_bench/gitagent.py similarity index 100% rename from swe_bench/gitagent.py rename to benchmark/swe_bench/gitagent.py diff --git a/swe_bench/make_datasets/__init__.py b/benchmark/swe_bench/inference/__init__.py similarity index 100% rename from swe_bench/make_datasets/__init__.py rename to benchmark/swe_bench/inference/__init__.py diff --git a/data/inference/const.py b/benchmark/swe_bench/inference/const.py similarity index 85% rename from data/inference/const.py rename to benchmark/swe_bench/inference/const.py index 69a274310..1183c1d7c 100644 --- a/data/inference/const.py +++ b/benchmark/swe_bench/inference/const.py @@ -3,11 +3,11 @@ # @Desc : import pandas as pd -from metagpt.const import DATA_PATH, METAGPT_ROOT +from metagpt.const import METAGPT_ROOT -SUBSET_DATASET = METAGPT_ROOT / "sub_swebench_dataset" / "sub_swebench.csv" -SUBSET_DATASET_SKLERARN = METAGPT_ROOT / "sub_swebench_dataset" / "scikit-learn-68.csv" -TESTBED = DATA_PATH / "repos" +SUBSET_DATASET = METAGPT_ROOT / "benchmark" / "sub_swebench_dataset" / "sub_swebench.csv" +SUBSET_DATASET_SKLERARN = METAGPT_ROOT / "benchmark" / "sub_swebench_dataset" / "scikit-learn-68.csv" +TESTBED = METAGPT_ROOT / "benchmark" / "swe_bench" / "data" / "repos" # SCIKIT_LEARN_IDS: A list of instance identifiers from 'sub_swebench.csv' within SUBSET_DATASET. 
# This collection represents a subset specifically related to scikit-learn content. diff --git a/swe_bench/inference/run.py b/benchmark/swe_bench/inference/run.py similarity index 100% rename from swe_bench/inference/run.py rename to benchmark/swe_bench/inference/run.py diff --git a/swe_bench/inference/run_agent.py b/benchmark/swe_bench/inference/run_agent.py similarity index 93% rename from swe_bench/inference/run_agent.py rename to benchmark/swe_bench/inference/run_agent.py index 2e8c381ab..bdcad0bd9 100644 --- a/swe_bench/inference/run_agent.py +++ b/benchmark/swe_bench/inference/run_agent.py @@ -5,12 +5,12 @@ import re from tenacity import retry, stop_after_attempt, wait_random_exponential +from benchmark.swe_bench.gitagent import GitAgent +from benchmark.swe_bench.make_datasets.make_dataset import reset_task_env +from benchmark.swe_bench.utils.utils import extract_scripts_from_codetext from metagpt.logs import logger from metagpt.utils.exceptions import handle_exception from metagpt.utils.recovery_util import save_history -from swe_bench.gitagent import GitAgent -from swe_bench.make_datasets.make_dataset import reset_task_env -from swe_bench.utils.utils import extract_scripts_from_codetext PATCH_FORMAT = """ ```diff diff --git a/swe_bench/inference/run_api.py b/benchmark/swe_bench/inference/run_api.py similarity index 92% rename from swe_bench/inference/run_api.py rename to benchmark/swe_bench/inference/run_api.py index 5fc71f516..e9d57f1a5 100644 --- a/swe_bench/inference/run_api.py +++ b/benchmark/swe_bench/inference/run_api.py @@ -2,14 +2,14 @@ import json from pathlib import Path import fire -from data.load_dataset import load_oracle_dataset from tqdm.auto import tqdm +from benchmark.swe_bench.data.load_dataset import load_oracle_dataset +from benchmark.swe_bench.inference.run_agent import run_instance +from benchmark.swe_bench.utils.utils import check_existing_ids, extract_diff from metagpt.config2 import config from metagpt.logs import logger from 
metagpt.utils import count_string_tokens -from swe_bench.inference.run_agent import run_instance -from swe_bench.utils.utils import check_existing_ids, extract_diff # Replace with your own MAX_TOKEN = 128000 @@ -56,7 +56,7 @@ async def openai_inference( logger.info(f"{repo_prefix}_{version}") data.append(f"{repo_prefix}_{version}") - response = await run_instance(instance=datum) + response = await run_instance(instance=datum, use_reflection=use_reflection) if response is None: continue logger.info(f"Final response: {response}") diff --git a/swe_bench/utils/__init__.py b/benchmark/swe_bench/make_datasets/__init__.py similarity index 100% rename from swe_bench/utils/__init__.py rename to benchmark/swe_bench/make_datasets/__init__.py diff --git a/swe_bench/make_datasets/make_dataset.py b/benchmark/swe_bench/make_datasets/make_dataset.py similarity index 86% rename from swe_bench/make_datasets/make_dataset.py rename to benchmark/swe_bench/make_datasets/make_dataset.py index ee4fc8c41..60c54181b 100644 --- a/swe_bench/make_datasets/make_dataset.py +++ b/benchmark/swe_bench/make_datasets/make_dataset.py @@ -6,11 +6,11 @@ from pathlib import Path from tqdm.auto import tqdm -from data.inference.const import TESTBED +from benchmark.swe_bench.inference.const import TESTBED +from benchmark.swe_bench.make_datasets.make_instance import prompt_style_2_edits_only +from benchmark.swe_bench.utils.parse_diff import filter_changed_line +from benchmark.swe_bench.utils.repo_utils import EnvManager from metagpt.logs import logger -from swe_bench.make_datasets.make_instance import prompt_style_2_edits_only -from swe_bench.utils.parse_diff import filter_changed_line -from swe_bench.utils.repo_utils import EnvManager def reset_task_env(instance: dict = {}): diff --git a/swe_bench/make_datasets/make_instance.py b/benchmark/swe_bench/make_datasets/make_instance.py similarity index 100% rename from swe_bench/make_datasets/make_instance.py rename to 
benchmark/swe_bench/make_datasets/make_instance.py diff --git a/benchmark/swe_bench/utils/__init__.py b/benchmark/swe_bench/utils/__init__.py new file mode 100644 index 000000000..f12b94354 --- /dev/null +++ b/benchmark/swe_bench/utils/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- +# @Author : stellahong (stellahong@fuzhi.ai) +# @Desc : diff --git a/benchmark/swe_bench/utils/enums.py b/benchmark/swe_bench/utils/enums.py new file mode 100644 index 000000000..7c052a8ff --- /dev/null +++ b/benchmark/swe_bench/utils/enums.py @@ -0,0 +1,12 @@ +from enum import Enum, auto + + +class FileChangeMode(Enum): + create = auto() + delete = auto() + change = auto() + + +class LineChangeType(Enum): + addition = auto() + deletion = auto() diff --git a/benchmark/swe_bench/utils/parse_diff.py b/benchmark/swe_bench/utils/parse_diff.py new file mode 100644 index 000000000..67c22d33c --- /dev/null +++ b/benchmark/swe_bench/utils/parse_diff.py @@ -0,0 +1,115 @@ +import re +from typing import Dict, List + +from benchmark.swe_bench.utils.enums import FileChangeMode, LineChangeType +from metagpt.logs import logger + + +def extract_changes_from_patch(diff: str) -> List[Dict[str, any]]: + """Parses the patch text through the standard syntax of git diff, outputs the information of added and deleted lines. + + Extracts detailed information about file changes based on the output content of git diff. + + Args: + diff: A string containing the output of git diff. + + Returns: + A list of dictionaries containing information about each file change. 
+ """ + changes = [] + current_file = None + + file_pattern = re.compile(r"^diff --git a/(.+) b/(.+)$") + line_change_pattern = re.compile(r"^@@ -(\d+),\d+ \+(\d+),\d+ @@.*$") + new_file_flag_line = "--- /dev/null" + deleted_file_flag_line = "+++ /dev/null" + + for line in diff.splitlines(): + file_section_start = file_pattern.match(line) + if file_section_start: + if current_file: + changes.append(current_file) + file_a, file_b = file_section_start.groups() + current_file = start_new_file_section(file_a, file_b) + current_file["mode"] = FileChangeMode.change + elif current_file: + # New-file marker matched: flag the current file as created + if line == new_file_flag_line: + current_file["mode"] = FileChangeMode.create + # Deleted-file marker matched: flag the current file as deleted + elif line == deleted_file_flag_line: + current_file["mode"] = FileChangeMode.delete + update_file_changes(current_file, line, line_change_pattern) + + if current_file: + changes.append(current_file) + + return changes + + +def start_new_file_section(file_before_change: str, file_after_change: str) -> Dict[str, any]: + """Function to initialize a new file section + + When encountering a new file change, this function is called to initialize a dictionary recording the file change information. + + Args: + file_before_change: The file name before the change + file_after_change: The file name after the change, or "/dev/null" if the file was deleted. + + Returns: + A dictionary representing the file change. + """ + return { + "file_before_change": file_before_change, + "file_after_change": file_after_change, + "changes": [], + } + + +def update_file_changes(current_file: Dict[str, any], line: str, line_change_pattern: re.Pattern): + """Updates the current file change information + + Updates the current file's change record based on a line from the diff. 
+ + Args: + current_file: The current file information being processed + line: The current line from the diff + line_change_pattern: The regex pattern used to identify line changes + """ + line_change_match = line_change_pattern.match(line) + if line_change_match: + current_file["base_line"], current_file["changed_line"] = map(int, line_change_match.groups()) + elif line.startswith("+"): + current_file["changes"].append( + {"type": LineChangeType.addition, "line": current_file.get("changed_line", 1), "content": line[1:]} + ) + current_file["changed_line"] = current_file.get("changed_line", 0) + 1 + elif line.startswith("-"): + current_file["changes"].append( + {"type": LineChangeType.deletion, "line": current_file.get("base_line", 1), "content": line[1:]} + ) + current_file["base_line"] = current_file.get("base_line", 0) + 1 + + +def filter_changed_line(patch): + """Filters changed lines + + Filters the part of the change record of the current file that needs to be used. + + Args: + patch: The git diff text + """ + parsed_changes = extract_changes_from_patch(patch) + res = {} + for change in parsed_changes: + file_name = change["file_before_change"] + res[file_name] = [] + # Skip files that were newly created + if change["mode"] is FileChangeMode.create: + continue + for c in change["changes"]: + if c["type"] is LineChangeType.addition: + continue + logger.debug(f" {c['type']} at line {c['line']}: {c['content']}") + res[file_name].append(c) + return res diff --git a/swe_bench/utils/repo_utils.py b/benchmark/swe_bench/utils/repo_utils.py similarity index 100% rename from swe_bench/utils/repo_utils.py rename to benchmark/swe_bench/utils/repo_utils.py diff --git a/swe_bench/utils/utils.py b/benchmark/swe_bench/utils/utils.py similarity index 100% rename from swe_bench/utils/utils.py rename to benchmark/swe_bench/utils/utils.py