From 0704f341dec48f1b49152b677dd238132a6abc46 Mon Sep 17 00:00:00 2001
From: didi <84363704+didiforgithub@users.noreply.github.com>
Date: Wed, 11 Sep 2024 17:53:52 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BA=86eval=E7=B4=A2?=
 =?UTF-8?q?=E5=BC=95=E7=9A=84=E5=85=A5=E5=8F=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/ags/benchmark/drop.py      |  8 +++---
 examples/ags/benchmark/gsm8k.py     | 40 ++++++++++++-----------------
 examples/ags/benchmark/hotpotqa.py  |  8 +++---
 examples/ags/benchmark/humaneval.py |  8 +++---
 examples/ags/benchmark/math.py      |  8 +++---
 examples/ags/benchmark/mbpp.py      |  8 +++---
 6 files changed, 36 insertions(+), 44 deletions(-)

diff --git a/examples/ags/benchmark/drop.py b/examples/ags/benchmark/drop.py
index ff3a8065b..89c619755 100644
--- a/examples/ags/benchmark/drop.py
+++ b/examples/ags/benchmark/drop.py
@@ -210,12 +210,12 @@ def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str
             f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}"
         )
 
-def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]:
+def load_data(file_path: str, samples: int, test=False) -> List[Tuple[str, Dict[str, Any]]]:
     with open(file_path, mode="r") as file:
         data = json.load(file)
         data = list(data.items())
 
-    random_indices = generate_random_indices(len(data), samples)
+    random_indices = generate_random_indices(len(data), samples, test)
     data = [data[i] for i in random_indices]
     return data
 
@@ -287,8 +287,8 @@ def save_results_to_csv(results: List[List[Any]], path: str) -> float:
 
     return average_score
 
-async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
-    data = load_data(file_path, samples)
+async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> float:
+    data = load_data(file_path, samples, test=test)
     results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20)
     average_score = save_results_to_csv(results, path=path)
     print(f"Average score on DROP dataset: {average_score:.5f}")
diff --git a/examples/ags/benchmark/gsm8k.py b/examples/ags/benchmark/gsm8k.py
index ea02fd0e0..fc79823a5 100644
--- a/examples/ags/benchmark/gsm8k.py
+++ b/examples/ags/benchmark/gsm8k.py
@@ -64,33 +64,25 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
     prompt = input
     max_retries = 5
     retries = 0
-    prediction = await graph(prompt)
-    cost = prediction[1]
-    output = prediction[0]["solution"]
+    while retries < max_retries:
+        try:
+            prediction = await graph(prompt)
+            cost = prediction[1]
+            output = prediction[0]["solution"]
 
-    print(output)
+            score = loose_match_score(expected_output, output)
+            break
 
-    score = loose_match_score(expected_output, output)
-    # break
-    # while retries < max_retries:
-    #     try:
-    #         prediction = await graph(prompt)
-    #         cost = prediction[1]
-    #         output = prediction[0]["solution"]
+        except Exception as e:
+            retries += 1
+            print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
 
-    #         score = loose_match_score(expected_output, output)
-    #         break
-
-    #     except Exception as e:
-    #         retries += 1
-    #         print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
-
-    #         if retries == max_retries:
-    #             print("Maximum retries reached. Skipping this sample.")
-    #             output = None
-    #             cost = None
-    #             score = 0
-    #             break
+            if retries == max_retries:
+                print("Maximum retries reached. Skipping this sample.")
+                output = None
+                cost = None
+                score = 0
+                break
 
     return input, output, expected_output, score, cost
 
diff --git a/examples/ags/benchmark/hotpotqa.py b/examples/ags/benchmark/hotpotqa.py
index 19873aa37..44e71024f 100644
--- a/examples/ags/benchmark/hotpotqa.py
+++ b/examples/ags/benchmark/hotpotqa.py
@@ -59,13 +59,13 @@ def f1_score(prediction, ground_truth):
     return f1
 
 
-async def load_data(file_path: str, samples=20, total_length=1000) -> List[dict]:
+async def load_data(file_path: str, samples=20, total_length=1000, test=False) -> List[dict]:
     data = []
     async with aiofiles.open(file_path, mode="r") as file:
         async for line in file:
             data.append(json.loads(line))
     data = data[:total_length] 
-    random_indices = generate_random_indices(len(data), samples)
+    random_indices = generate_random_indices(len(data), samples, test)
     data = [data[i] for i in random_indices]
     return data
 
@@ -119,8 +119,8 @@ def save_results_to_csv(results: List[Tuple[str, str, str, float]], path: str) -
 
     return average_score
 
-async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
-    data = await load_data(file_path, samples)
+async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> float:
+    data = await load_data(file_path, samples, test=test)
     results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
     average_score = save_results_to_csv(results, path=path)
     print(f"Average score on HotpotQA dataset: {average_score:.5f}")
diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py
index 65a7f3f16..ee61d5992 100644
--- a/examples/ags/benchmark/humaneval.py
+++ b/examples/ags/benchmark/humaneval.py
@@ -10,12 +10,12 @@ from examples.ags.benchmark.utils import generate_random_indices
 PASS = "pass"
 FAIL = "fail"
 
-async def load_data(file_path: str, samples=1) -> List[dict]:
+async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
     data = []
     async with aiofiles.open(file_path, mode="r") as file:
         async for line in file:
             data.append(json.loads(line))
-    random_indices = generate_random_indices(len(data), samples)
+    random_indices = generate_random_indices(len(data), samples, test)
     data = [data[i] for i in random_indices]
     return data
 
@@ -118,8 +118,8 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int, str]], path: s
 
     return round(avg_score, 5), round(total_cost, 5)  # 修改返回值以包含total_cost
 
-async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
-    data = await load_data(file_path, samples)
+async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
+    data = await load_data(file_path, samples, test=test)
     results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
     average_score, total_cost = save_results_to_jsonl(results, path=path)
     print(f"Average score on HumanEval dataset: {average_score:.5f}")
diff --git a/examples/ags/benchmark/math.py b/examples/ags/benchmark/math.py
index 2a6a5cf4c..6c0ddccf7 100644
--- a/examples/ags/benchmark/math.py
+++ b/examples/ags/benchmark/math.py
@@ -212,12 +212,12 @@ def calculate_score(expected_output: str, prediction: str) -> int:
 
     return 1 if math_equal(predicted_answer, expected_answer) else 0
 
-async def load_data(file_path: str, samples: int = 200) -> List[dict]:
+async def load_data(file_path: str, samples: int = 200, test=False) -> List[dict]:
     data = []
     async with aiofiles.open(file_path, mode="r") as file:
         async for line in file:
             data.append(json.loads(line))
-    random_indices = generate_random_indices(len(data), samples)
+    random_indices = generate_random_indices(len(data), samples, test)
     data = [data[i] for i in random_indices]
     return data
 
@@ -270,8 +270,8 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
 
     return await tqdm_asyncio.gather(*tasks, desc="Evaluating MATH problems", total=len(data))
 
-async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
-    data = await load_data(file_path, samples)
+async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
+    data = await load_data(file_path, samples, test=test)
     results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
     average_score, total_cost = save_results_to_csv(results, path=path)
     print(f"Average score on MATH dataset: {average_score:.5f}")
diff --git a/examples/ags/benchmark/mbpp.py b/examples/ags/benchmark/mbpp.py
index b0722eb9c..a33144e68 100644
--- a/examples/ags/benchmark/mbpp.py
+++ b/examples/ags/benchmark/mbpp.py
@@ -10,12 +10,12 @@ from examples.ags.benchmark.utils import generate_random_indices
 PASS = "pass"
 FAIL = "fail"
 
-async def load_data(file_path: str, samples=1) -> List[dict]:
+async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
     data = []
     async with aiofiles.open(file_path, mode="r") as file:
         async for line in file:
             data.append(json.loads(line))
-    random_indices = generate_random_indices(len(data), samples)
+    random_indices = generate_random_indices(len(data), samples, test)
     data = [data[i] for i in random_indices]
     return data
 
@@ -99,10 +99,10 @@ def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str
     print(f"Results saved to {output_file}")
     return average_score, total_cost
 
-async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
-    data = await load_data(file_path, samples)
+async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
+    data = await load_data(file_path, samples, test=test)
     results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
     average_score, total_cost = save_results_to_csv(results, path=path)
     print(f"Average score on MBPP dataset: {average_score:.5f}")
     print(f"Total Cost: {total_cost:.5f}")
     return average_score, total_cost