mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-23 15:48:11 +02:00
update code, change data path
This commit is contained in:
parent
f26a5cd1de
commit
91db2ef112
5 changed files with 7 additions and 5 deletions
35
swe_bench/data/load_dataset.py
Normal file
35
swe_bench/data/load_dataset.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Author : stellahong (stellahong@fuzhi.ai)
|
||||
# @Desc :
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from datasets import load_dataset, load_from_disk
|
||||
|
||||
from swe_bench.inference.const import SCIKIT_LEARN_IDS
|
||||
|
||||
|
||||
def load_oracle_dataset(dataset_name_or_path: str = "", split: str = "test", existing_ids: list = []):
    """Load one split of an oracle dataset, shortest prompts first, filtered by instance id.

    Args:
        dataset_name_or_path: Handed to ``load_from_disk`` when it names an existing
            filesystem path, otherwise handed to ``load_dataset`` unchanged.
        split: Split to select; it must be a key of the loaded dataset.
        existing_ids: Instance ids to remove from the result. The default list is only
            read, never mutated; ``None`` is treated like an empty list.

    Returns:
        The selected split, ordered by ascending length of its "text" column, without
        the rows whose "instance_id" is in ``existing_ids`` and, when
        ``SCIKIT_LEARN_IDS`` is non-empty, restricted to the ids in that constant.

    Raises:
        ValueError: If ``split`` is not present in the loaded dataset.
    """
    # NOTE: Path("") is the current directory, so the default empty string takes the
    # load_from_disk branch rather than load_dataset.
    if Path(dataset_name_or_path).exists():
        dataset = load_from_disk(dataset_name_or_path)
    else:
        dataset = load_dataset(dataset_name_or_path)

    if split not in dataset:
        raise ValueError(f"Invalid split {split} for dataset {dataset_name_or_path}")
    dataset = dataset[split]

    # Reorder rows so that the shortest "text" entries come first.
    lens = np.array(list(map(len, dataset["text"])))
    dataset = dataset.select(np.argsort(lens))

    # Build the lookup sets once: the predicates below run once per row, and a list
    # membership test would rescan the whole id list every time.
    skip_ids = frozenset(existing_ids) if existing_ids is not None else frozenset()
    if skip_ids:
        dataset = dataset.filter(
            lambda x: x["instance_id"] not in skip_ids,
            desc="Filtering out existing ids",
            load_from_cache_file=False,
        )

    keep_ids = frozenset(SCIKIT_LEARN_IDS)
    if keep_ids:
        # Keeps only the rows listed in SCIKIT_LEARN_IDS (the progress label says
        # "Filtering out", but the predicate is an inclusion test).
        dataset = dataset.filter(
            lambda x: x["instance_id"] in keep_ids,
            desc="Filtering out subset_instance_ids",
            load_from_cache_file=False,
        )
    return dataset
|
||||
60
swe_bench/inference/const.py
Normal file
60
swe_bench/inference/const.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Author : stellahong (stellahong@fuzhi.ai)
|
||||
# @Desc :
|
||||
import pandas as pd
|
||||
|
||||
from metagpt.const import DATA_PATH, METAGPT_ROOT
|
||||
|
||||
# All locations below are built relative to METAGPT_ROOT (imported from metagpt.const).
# CSV file used as the default `path` argument of read_subset_instance().
SUBSET_DATASET = METAGPT_ROOT / "sub_swebench_dataset" / "sub_swebench.csv"
|
||||
# NOTE(review): "SKLERARN" looks like a typo of "SKLEARN"; the name is left as-is here -
# check for importers before renaming.
SUBSET_DATASET_SKLERARN = METAGPT_ROOT / "sub_swebench_dataset" / "scikit-learn-68.csv"
|
||||
# NOTE(review): this path uses "swe-bench" (hyphen) while the package directory is
# "swe_bench" (underscore) - confirm which directory actually exists.
TESTBED = METAGPT_ROOT / "swe-bench" / "data" / "repos"
|
||||
|
||||
# SCIKIT_LEARN_IDS: instance identifiers of scikit-learn tasks, taken from the
# 'sub_swebench.csv' file that SUBSET_DATASET points to.
|
||||
# load_oracle_dataset() (swe_bench/data/load_dataset.py) keeps only the dataset rows
# whose "instance_id" appears in this list.
|
||||
SCIKIT_LEARN_IDS = [
|
||||
"scikit-learn__scikit-learn-11578",
|
||||
"scikit-learn__scikit-learn-10297",
|
||||
"scikit-learn__scikit-learn-25747",
|
||||
"scikit-learn__scikit-learn-15512",
|
||||
"scikit-learn__scikit-learn-15119",
|
||||
"scikit-learn__scikit-learn-10870",
|
||||
"scikit-learn__scikit-learn-15100",
|
||||
"scikit-learn__scikit-learn-14496",
|
||||
"scikit-learn__scikit-learn-14890",
|
||||
"scikit-learn__scikit-learn-10428",
|
||||
"scikit-learn__scikit-learn-25744",
|
||||
"scikit-learn__scikit-learn-11542",
|
||||
"scikit-learn__scikit-learn-10198",
|
||||
"scikit-learn__scikit-learn-10459",
|
||||
]
|
||||
|
||||
# MATPLOTLIB_IDS: instance identifiers of matplotlib tasks. The two entries that are
# commented out at the end of the list are not part of it.
MATPLOTLIB_IDS = [
|
||||
"matplotlib__matplotlib-24362",
|
||||
"matplotlib__matplotlib-20584",
|
||||
"matplotlib__matplotlib-23188",
|
||||
"matplotlib__matplotlib-24403",
|
||||
# 'matplotlib__matplotlib-21443',
|
||||
# 'matplotlib__matplotlib-23047'
|
||||
]
|
||||
|
||||
|
||||
def read_subset_instance(path=SUBSET_DATASET, tag="scikit-learn"):
    """Collect the instance ids in the subset table that contain ``tag``.

    Args:
        path: Table with "instance_id_pass" and "instance_id_fail" columns. Files whose
            name ends in ".csv" (such as the default) are read with ``pd.read_csv``;
            anything else is read with ``pd.read_excel`` as before.
        tag: Substring an instance id must contain to be selected.

    Returns:
        Matching ids from the "instance_id_pass" column followed by those from the
        "instance_id_fail" column. An empty list is returned, after printing a
        message, when the file is missing or any other error occurs.
    """
    try:
        # The default SUBSET_DATASET is a .csv file, which read_excel cannot parse.
        if str(path).lower().endswith(".csv"):
            df = pd.read_csv(path)
        else:
            df = pd.read_excel(path)

        subset_instance = []
        for column in ("instance_id_pass", "instance_id_fail"):
            # When the two columns differ in length pandas pads the shorter one with
            # NaN (a float); `tag in nan` would raise, so only strings are tested.
            subset_instance.extend(s for s in df[column].tolist() if isinstance(s, str) and tag in s)

        return subset_instance
    except FileNotFoundError:
        print(f"File not found: {path}")
        return []
    except Exception as e:
        # Deliberate best-effort: callers get an empty list rather than an exception.
        print(f"An error occurred: {e}")
        return []
|
||||
|
||||
|
||||
# When run as a script, print the ids from the default subset file that contain
# "matplotlib__matplotlib".
if __name__ == "__main__":
|
||||
print(read_subset_instance(tag="matplotlib__matplotlib"))
|
||||
|
|
@ -2,7 +2,7 @@ import json
|
|||
from pathlib import Path
|
||||
|
||||
import fire
|
||||
from data.load_dataset import load_oracle_dataset
|
||||
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from metagpt.config2 import config
|
||||
|
|
@ -10,6 +10,7 @@ from metagpt.logs import logger
|
|||
from metagpt.utils import count_string_tokens
|
||||
from swe_bench.inference.run_agent import run_instance
|
||||
from swe_bench.utils.utils import check_existing_ids, extract_diff
|
||||
from swe_bench.data.load_dataset import load_oracle_dataset
|
||||
|
||||
# Replace with your own
|
||||
MAX_TOKEN = 128000
|
||||
|
|
@ -56,7 +57,7 @@ async def openai_inference(
|
|||
logger.info(f"{repo_prefix}_{version}")
|
||||
data.append(f"{repo_prefix}_{version}")
|
||||
|
||||
response = await run_instance(instance=datum)
|
||||
response = await run_instance(instance=datum, use_reflection=use_reflection)
|
||||
if response is None:
|
||||
continue
|
||||
logger.info(f"Final response: {response}")
|
||||
|
|
|
|||
|
|
@ -6,11 +6,11 @@ from pathlib import Path
|
|||
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from data.inference.const import TESTBED
|
||||
from metagpt.logs import logger
|
||||
from swe_bench.make_datasets.make_instance import prompt_style_2_edits_only
|
||||
from swe_bench.utils.parse_diff import filter_changed_line
|
||||
from swe_bench.utils.repo_utils import EnvManager
|
||||
from swe_bench.inference.const import TESTBED
|
||||
|
||||
|
||||
def reset_task_env(instance: dict = {}):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue