更新了eval索引的入口 (Updated the entry point for eval index selection)

2026-06-08 15:05:17 +02:00 · 2024-09-11 17:53:52 +08:00 · 2024-09-11 17:53:52 +08:00 · 0704f341de
commit 0704f341de
parent b805da0bbe
6 changed files with 36 additions and 44 deletions
--- a/examples/ags/benchmark/drop.py
+++ b/examples/ags/benchmark/drop.py
@@ -210,12 +210,12 @@ def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str
            f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}"
        )

-def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]:
+def load_data(file_path: str, samples: int, test=False) -> List[Tuple[str, Dict[str, Any]]]:
    with open(file_path, mode="r") as file:
        data = json.load(file)
        data = list(data.items())

-    random_indices = generate_random_indices(len(data), samples)
+    random_indices = generate_random_indices(len(data), samples, test)
    data = [data[i] for i in random_indices]
    return data

@@ -287,8 +287,8 @@ def save_results_to_csv(results: List[List[Any]], path: str) -> float:

    return average_score

-async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
-    data = load_data(file_path, samples)
+async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> float:
+    data = load_data(file_path, samples, test=test)
    results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20)
    average_score = save_results_to_csv(results, path=path)
    print(f"Average score on DROP dataset: {average_score:.5f}")
--- a/examples/ags/benchmark/gsm8k.py
+++ b/examples/ags/benchmark/gsm8k.py
@@ -64,33 +64,25 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
    prompt = input
    max_retries = 5
    retries = 0
-    prediction = await graph(prompt)
-    cost = prediction[1]
-    output = prediction[0]["solution"]
+    while retries < max_retries:
+        try:
+            prediction = await graph(prompt)
+            cost = prediction[1]
+            output = prediction[0]["solution"]

-    print(output)
+            score = loose_match_score(expected_output, output)
+            break

-    score = loose_match_score(expected_output, output)
-    # break
-    # while retries < max_retries:
-    #     try:
-    #         prediction = await graph(prompt)
-    #         cost = prediction[1]
-    #         output = prediction[0]["solution"]
+        except Exception as e:
+            retries += 1
+            print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")

-    #         score = loose_match_score(expected_output, output)
-    #         break
-
-    #     except Exception as e:
-    #         retries += 1
-    #         print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
-
-    #         if retries == max_retries:
-    #             print("Maximum retries reached. Skipping this sample.")
-    #             output = None
-    #             cost = None
-    #             score = 0
-    #             break
+            if retries == max_retries:
+                print("Maximum retries reached. Skipping this sample.")
+                output = None
+                cost = None
+                score = 0
+                break

    return input, output, expected_output, score, cost

--- a/examples/ags/benchmark/hotpotqa.py
+++ b/examples/ags/benchmark/hotpotqa.py
@@ -59,13 +59,13 @@ def f1_score(prediction, ground_truth):
    return f1


-async def load_data(file_path: str, samples=20, total_length=1000) -> List[dict]:
+async def load_data(file_path: str, samples=20, total_length=1000, test=False) -> List[dict]:
    data = []
    async with aiofiles.open(file_path, mode="r") as file:
        async for line in file:
            data.append(json.loads(line))
    data = data[:total_length] 
-    random_indices = generate_random_indices(len(data), samples)
+    random_indices = generate_random_indices(len(data), samples, test)
    data = [data[i] for i in random_indices]
    return data

@@ -119,8 +119,8 @@ def save_results_to_csv(results: List[Tuple[str, str, str, float]], path: str) -

    return average_score

-async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
-    data = await load_data(file_path, samples)
+async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> float:
+    data = await load_data(file_path, samples, test=test)
    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
    average_score = save_results_to_csv(results, path=path)
    print(f"Average score on HotpotQA dataset: {average_score:.5f}")
--- a/examples/ags/benchmark/humaneval.py
+++ b/examples/ags/benchmark/humaneval.py
@@ -10,12 +10,12 @@ from examples.ags.benchmark.utils import generate_random_indices
 PASS = "pass"
 FAIL = "fail"

-async def load_data(file_path: str, samples=1) -> List[dict]:
+async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
    data = []
    async with aiofiles.open(file_path, mode="r") as file:
        async for line in file:
            data.append(json.loads(line))
-    random_indices = generate_random_indices(len(data), samples)
+    random_indices = generate_random_indices(len(data), samples, test)
    data = [data[i] for i in random_indices]
    return data

@@ -118,8 +118,8 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int, str]], path: s

    return round(avg_score, 5), round(total_cost, 5)  # 修改返回值以包含total_cost

-async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
-    data = await load_data(file_path, samples)
+async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
+    data = await load_data(file_path, samples, test=test)
    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
    average_score, total_cost = save_results_to_jsonl(results, path=path)
    print(f"Average score on HumanEval dataset: {average_score:.5f}")
--- a/examples/ags/benchmark/math.py
+++ b/examples/ags/benchmark/math.py
@@ -212,12 +212,12 @@ def calculate_score(expected_output: str, prediction: str) -> int:

    return 1 if math_equal(predicted_answer, expected_answer) else 0

-async def load_data(file_path: str, samples: int = 200) -> List[dict]:
+async def load_data(file_path: str, samples: int = 200, test=False) -> List[dict]:
    data = []
    async with aiofiles.open(file_path, mode="r") as file:
        async for line in file:
            data.append(json.loads(line))
-    random_indices = generate_random_indices(len(data), samples)
+    random_indices = generate_random_indices(len(data), samples, test)
    data = [data[i] for i in random_indices]
    return data

@@ -270,8 +270,8 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren

    return await tqdm_asyncio.gather(*tasks, desc="Evaluating MATH problems", total=len(data))

-async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
-    data = await load_data(file_path, samples)
+async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
+    data = await load_data(file_path, samples, test=test)
    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
    average_score, total_cost = save_results_to_csv(results, path=path)
    print(f"Average score on MATH dataset: {average_score:.5f}")
--- a/examples/ags/benchmark/mbpp.py
+++ b/examples/ags/benchmark/mbpp.py
@@ -10,12 +10,12 @@ from examples.ags.benchmark.utils import generate_random_indices
 PASS = "pass"
 FAIL = "fail"

-async def load_data(file_path: str, samples=1) -> List[dict]:
+async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
    data = []
    async with aiofiles.open(file_path, mode="r") as file:
        async for line in file:
            data.append(json.loads(line))
-    random_indices = generate_random_indices(len(data), samples)
+    random_indices = generate_random_indices(len(data), samples, test)
    data = [data[i] for i in random_indices]
    return data

@@ -99,10 +99,10 @@ def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str
    print(f"Results saved to {output_file}")
    return average_score, total_cost

-async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
+async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
    data = await load_data(file_path, samples)
    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
-    average_score, total_cost = save_results_to_csv(results, path=path)
+    average_score, total_cost = save_results_to_csv(results, path=path, test=test)
    print(f"Average score on MBPP dataset: {average_score:.5f}")
    print(f"Total Cost: {total_cost:.5f}")
    return average_score, total_cost