From 0704f341dec48f1b49152b677dd238132a6abc46 Mon Sep 17 00:00:00 2001 From: didi <84363704+didiforgithub@users.noreply.github.com> Date: Wed, 11 Sep 2024 17:53:52 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BA=86eval=E7=B4=A2?= =?UTF-8?q?=E5=BC=95=E7=9A=84=E5=85=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/ags/benchmark/drop.py | 8 +++--- examples/ags/benchmark/gsm8k.py | 40 ++++++++++++----------------- examples/ags/benchmark/hotpotqa.py | 8 +++--- examples/ags/benchmark/humaneval.py | 8 +++--- examples/ags/benchmark/math.py | 8 +++--- examples/ags/benchmark/mbpp.py | 8 +++--- 6 files changed, 36 insertions(+), 44 deletions(-) diff --git a/examples/ags/benchmark/drop.py b/examples/ags/benchmark/drop.py index ff3a8065b..89c619755 100644 --- a/examples/ags/benchmark/drop.py +++ b/examples/ags/benchmark/drop.py @@ -210,12 +210,12 @@ def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}" ) -def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]: +def load_data(file_path: str, samples: int, test=False) -> List[Tuple[str, Dict[str, Any]]]: with open(file_path, mode="r") as file: data = json.load(file) data = list(data.items()) - random_indices = generate_random_indices(len(data), samples) + random_indices = generate_random_indices(len(data), samples, test) data = [data[i] for i in random_indices] return data @@ -287,8 +287,8 @@ def save_results_to_csv(results: List[List[Any]], path: str) -> float: return average_score -async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float: - data = load_data(file_path, samples) +async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> float: + data = load_data(file_path, samples, test=test) results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20) average_score = save_results_to_csv(results, path=path) print(f"Average score on DROP dataset: {average_score:.5f}") diff --git a/examples/ags/benchmark/gsm8k.py b/examples/ags/benchmark/gsm8k.py index ea02fd0e0..fc79823a5 100644 --- a/examples/ags/benchmark/gsm8k.py +++ b/examples/ags/benchmark/gsm8k.py @@ -64,33 +64,25 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) -> prompt = input max_retries = 5 retries = 0 - prediction = await graph(prompt) - cost = prediction[1] - output = prediction[0]["solution"] + while retries < max_retries: + try: + prediction = await graph(prompt) + cost = prediction[1] + output = prediction[0]["solution"] - print(output) + score = loose_match_score(expected_output, output) + break - score = loose_match_score(expected_output, output) - # break - # while retries < max_retries: - # try: - # prediction = await graph(prompt) - # cost = prediction[1] - # output = prediction[0]["solution"] + except Exception as e: + retries += 1 + print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})") - # score = loose_match_score(expected_output, output) - # break - - # except Exception as e: - # retries += 1 - # print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})") - - # if retries == max_retries: - # print("Maximum retries reached. Skipping this sample.") - # output = None - # cost = None - # score = 0 - # break + if retries == max_retries: + print("Maximum retries reached. Skipping this sample.") + output = None + cost = None + score = 0 + break return input, output, expected_output, score, cost diff --git a/examples/ags/benchmark/hotpotqa.py b/examples/ags/benchmark/hotpotqa.py index 19873aa37..44e71024f 100644 --- a/examples/ags/benchmark/hotpotqa.py +++ b/examples/ags/benchmark/hotpotqa.py @@ -59,13 +59,13 @@ def f1_score(prediction, ground_truth): return f1 -async def load_data(file_path: str, samples=20, total_length=1000) -> List[dict]: +async def load_data(file_path: str, samples=20, total_length=1000, test=False) -> List[dict]: data = [] async with aiofiles.open(file_path, mode="r") as file: async for line in file: data.append(json.loads(line)) data = data[:total_length] - random_indices = generate_random_indices(len(data), samples) + random_indices = generate_random_indices(len(data), samples, test) data = [data[i] for i in random_indices] return data @@ -119,8 +119,8 @@ def save_results_to_csv(results: List[Tuple[str, str, str, float]], path: str) - return average_score -async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float: - data = await load_data(file_path, samples) +async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> float: + data = await load_data(file_path, samples, test=test) results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20) average_score = save_results_to_csv(results, path=path) print(f"Average score on HotpotQA dataset: {average_score:.5f}") diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py index 65a7f3f16..ee61d5992 100644 --- a/examples/ags/benchmark/humaneval.py +++ b/examples/ags/benchmark/humaneval.py @@ -10,12 +10,12 @@ from examples.ags.benchmark.utils import generate_random_indices PASS = "pass" FAIL = "fail" -async def load_data(file_path: str, samples=1) -> List[dict]: +async def load_data(file_path: str, samples=1, test=False) -> List[dict]: data = [] async with aiofiles.open(file_path, mode="r") as file: async for line in file: data.append(json.loads(line)) - random_indices = generate_random_indices(len(data), samples) + random_indices = generate_random_indices(len(data), samples, test) data = [data[i] for i in random_indices] return data @@ -118,8 +118,8 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int, str]], path: s return round(avg_score, 5), round(total_cost, 5) # 修改返回值以包含total_cost -async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]: - data = await load_data(file_path, samples) +async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]: + data = await load_data(file_path, samples, test=test) results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20) average_score, total_cost = save_results_to_jsonl(results, path=path) print(f"Average score on HumanEval dataset: {average_score:.5f}") diff --git a/examples/ags/benchmark/math.py b/examples/ags/benchmark/math.py index 2a6a5cf4c..6c0ddccf7 100644 --- a/examples/ags/benchmark/math.py +++ b/examples/ags/benchmark/math.py @@ -212,12 +212,12 @@ def calculate_score(expected_output: str, prediction: str) -> int: return 1 if math_equal(predicted_answer, expected_answer) else 0 -async def load_data(file_path: str, samples: int = 200) -> List[dict]: +async def load_data(file_path: str, samples: int = 200, test=False) -> List[dict]: data = [] async with aiofiles.open(file_path, mode="r") as file: async for line in file: data.append(json.loads(line)) - random_indices = generate_random_indices(len(data), samples) + random_indices = generate_random_indices(len(data), samples, test) data = [data[i] for i in random_indices] return data @@ -270,8 +270,8 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren return await tqdm_asyncio.gather(*tasks, desc="Evaluating MATH problems", total=len(data)) -async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]: - data = await load_data(file_path, samples) +async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]: + data = await load_data(file_path, samples, test=test) results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20) average_score, total_cost = save_results_to_csv(results, path=path) print(f"Average score on MATH dataset: {average_score:.5f}") diff --git a/examples/ags/benchmark/mbpp.py b/examples/ags/benchmark/mbpp.py index b0722eb9c..a33144e68 100644 --- a/examples/ags/benchmark/mbpp.py +++ b/examples/ags/benchmark/mbpp.py @@ -10,12 +10,12 @@ from examples.ags.benchmark.utils import generate_random_indices PASS = "pass" FAIL = "fail" -async def load_data(file_path: str, samples=1) -> List[dict]: +async def load_data(file_path: str, samples=1, test=False) -> List[dict]: data = [] async with aiofiles.open(file_path, mode="r") as file: async for line in file: data.append(json.loads(line)) - random_indices = generate_random_indices(len(data), samples) + random_indices = generate_random_indices(len(data), samples, test) data = [data[i] for i in random_indices] return data @@ -99,10 +99,10 @@ def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str print(f"Results saved to {output_file}") return average_score, total_cost -async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]: +async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]: data = await load_data(file_path, samples) results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20) - average_score, total_cost = save_results_to_csv(results, path=path) + average_score, total_cost = save_results_to_csv(results, path=path, test=test) print(f"Average score on MBPP dataset: {average_score:.5f}") print(f"Total Cost: {total_cost:.5f}") return average_score, total_cost