update code, change data path

This commit is contained in:
stellahsr 2024-03-26 14:44:19 +08:00
parent f26a5cd1de
commit 91db2ef112
5 changed files with 7 additions and 5 deletions

View file

@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
# @Author : stellahong (stellahong@fuzhi.ai)
# @Desc : Load an oracle dataset split, ordered by text length and filtered by instance id.
from pathlib import Path
import numpy as np
from datasets import load_dataset, load_from_disk
from swe_bench.inference.const import SCIKIT_LEARN_IDS
def load_oracle_dataset(dataset_name_or_path: str = "", split: str = "test", existing_ids: "list | None" = None):
    """Load one split of a dataset, shortest ``text`` first, filtered by instance id.

    Args:
        dataset_name_or_path: If this path exists on the local filesystem the
            dataset is read with ``load_from_disk``; otherwise the value is
            handed to ``load_dataset``.
            NOTE(review): the default ``""`` resolves to the current directory,
            which exists, so the default takes the ``load_from_disk`` branch.
        split: Name of the split to return.
        existing_ids: Instance ids to drop from the result. ``None`` (the
            default) or an empty collection drops nothing.

    Returns:
        The selected split, reordered by ascending length of its ``text``
        column, without the rows whose ``instance_id`` is in ``existing_ids``
        and, when ``SCIKIT_LEARN_IDS`` is non-empty, restricted to the rows
        whose ``instance_id`` is listed there.

    Raises:
        ValueError: If ``split`` is not present in the loaded dataset.
    """
    if Path(dataset_name_or_path).exists():
        dataset = load_from_disk(dataset_name_or_path)
    else:
        dataset = load_dataset(dataset_name_or_path)
    if split not in dataset:
        raise ValueError(f"Invalid split {split} for dataset {dataset_name_or_path}")
    dataset = dataset[split]

    # Reorder rows by the length of their "text" field, shortest first.
    lens = np.array(list(map(len, dataset["text"])))
    dataset = dataset.select(np.argsort(lens))

    # Build the lookup sets once: the filter callbacks run for every row, and a
    # list membership test there would rescan the whole id list each time.
    excluded_ids = set(existing_ids) if existing_ids is not None else set()
    if excluded_ids:
        dataset = dataset.filter(
            lambda x: x["instance_id"] not in excluded_ids,
            desc="Filtering out existing ids",
            load_from_cache_file=False,
        )

    subset_ids = set(SCIKIT_LEARN_IDS)
    if subset_ids:
        dataset = dataset.filter(
            lambda x: x["instance_id"] in subset_ids,
            desc="Filtering out subset_instance_ids",
            load_from_cache_file=False,
        )
    return dataset

View file

@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
# @Author : stellahong (stellahong@fuzhi.ai)
# @Desc : Dataset/testbed paths, instance-id subsets, and a reader for the subset table.
import pandas as pd
from metagpt.const import DATA_PATH, METAGPT_ROOT
# Default table read by read_subset_instance(); located under METAGPT_ROOT/sub_swebench_dataset.
SUBSET_DATASET = METAGPT_ROOT / "sub_swebench_dataset" / "sub_swebench.csv"
# A second table in the same directory, named for scikit-learn.
# NOTE(review): the constant name is misspelled ("SKLERARN"); it is left unchanged
# here because other modules may import it under this name.
SUBSET_DATASET_SKLERARN = METAGPT_ROOT / "sub_swebench_dataset" / "scikit-learn-68.csv"
# NOTE(review): presumably the directory holding the checked-out target repositories —
# confirm against the callers that import TESTBED.
TESTBED = METAGPT_ROOT / "swe-bench" / "data" / "repos"
# SCIKIT_LEARN_IDS: A list of instance identifiers from 'sub_swebench.csv' within SUBSET_DATASET.
# This collection represents a subset specifically related to scikit-learn content.
SCIKIT_LEARN_IDS = [
"scikit-learn__scikit-learn-11578",
"scikit-learn__scikit-learn-10297",
"scikit-learn__scikit-learn-25747",
"scikit-learn__scikit-learn-15512",
"scikit-learn__scikit-learn-15119",
"scikit-learn__scikit-learn-10870",
"scikit-learn__scikit-learn-15100",
"scikit-learn__scikit-learn-14496",
"scikit-learn__scikit-learn-14890",
"scikit-learn__scikit-learn-10428",
"scikit-learn__scikit-learn-25744",
"scikit-learn__scikit-learn-11542",
"scikit-learn__scikit-learn-10198",
"scikit-learn__scikit-learn-10459",
]
# MATPLOTLIB_IDS: matplotlib instance identifiers. The two commented-out entries
# at the end of the list are currently excluded from it.
MATPLOTLIB_IDS = [
"matplotlib__matplotlib-24362",
"matplotlib__matplotlib-20584",
"matplotlib__matplotlib-23188",
"matplotlib__matplotlib-24403",
# 'matplotlib__matplotlib-21443',
# 'matplotlib__matplotlib-23047'
]
def _read_subset_table(path):
    """Read the subset table at *path* as a workbook, falling back to CSV."""
    try:
        return pd.read_excel(path)
    except ValueError:
        # read_excel raises ValueError when the file is not a recognisable
        # workbook, which is the case for a plain-text ``.csv`` such as the
        # default path. Retry with the CSV reader instead of giving up.
        return pd.read_csv(path)


def read_subset_instance(path=SUBSET_DATASET, tag="scikit-learn"):
    """Return the instance ids in the subset table whose id contains *tag*.

    The table is expected to have the columns ``instance_id_pass`` and
    ``instance_id_fail``. Matching ids from the pass column come first,
    followed by matching ids from the fail column.

    Args:
        path: Location of the table. It is tried as an Excel workbook first
            and as a CSV file if that fails.
        tag: Substring an instance id must contain to be returned.

    Returns:
        A list of matching instance ids. An empty list is returned, after
        printing a message, when the file is missing or any other error
        occurs while reading it (best-effort behaviour).
    """
    try:
        df = _read_subset_table(path)
        pass_ids = df["instance_id_pass"].tolist()
        fail_ids = df["instance_id_fail"].tolist()
        # Empty cells come back from pandas as NaN (a float); testing
        # ``tag in NaN`` would raise and discard every valid id, so only
        # string cells are considered.
        return [s for s in pass_ids + fail_ids if isinstance(s, str) and tag in s]
    except FileNotFoundError:
        print(f"File not found: {path}")
        return []
    except Exception as e:
        # Deliberate catch-all: callers rely on getting a list back.
        print(f"An error occurred: {e}")
        return []
if __name__ == "__main__":
    # Manual check: list the matplotlib instances found in the default subset table.
    matplotlib_instances = read_subset_instance(tag="matplotlib__matplotlib")
    print(matplotlib_instances)

View file

@ -2,7 +2,7 @@ import json
from pathlib import Path
import fire
from data.load_dataset import load_oracle_dataset
from tqdm.auto import tqdm
from metagpt.config2 import config
@ -10,6 +10,7 @@ from metagpt.logs import logger
from metagpt.utils import count_string_tokens
from swe_bench.inference.run_agent import run_instance
from swe_bench.utils.utils import check_existing_ids, extract_diff
from swe_bench.data.load_dataset import load_oracle_dataset
# Replace with your own
MAX_TOKEN = 128000
@ -56,7 +57,7 @@ async def openai_inference(
logger.info(f"{repo_prefix}_{version}")
data.append(f"{repo_prefix}_{version}")
response = await run_instance(instance=datum)
response = await run_instance(instance=datum, use_reflection=use_reflection)
if response is None:
continue
logger.info(f"Final response: {response}")

View file

@ -6,11 +6,11 @@ from pathlib import Path
from tqdm.auto import tqdm
from data.inference.const import TESTBED
from metagpt.logs import logger
from swe_bench.make_datasets.make_instance import prompt_style_2_edits_only
from swe_bench.utils.parse_diff import filter_changed_line
from swe_bench.utils.repo_utils import EnvManager
from swe_bench.inference.const import TESTBED
def reset_task_env(instance: dict = {}):