Update HumanEval evaluation (timestamped result files, rounded average score)

2026-06-08 15:05:17 +02:00 · 2024-09-10 10:58:39 +08:00 · 2024-09-10 10:58:39 +08:00 · c7c34cda7d
commit c7c34cda7d
parent 62ffa730e0
2 changed files with 13 additions and 5 deletions
--- a/examples/ags/benchmark/humaneval.py
+++ b/examples/ags/benchmark/humaneval.py
@ -82,10 +82,17 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren

    return await tqdm_asyncio.gather(*tasks, desc="Evaluating HumanEval problems", total=len(data))

+import os
+import time
+import json
+
 def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -> float:
    avg_score = 0
+    timestamp = int(time.time())
+    filename = f"humaneval_results_{timestamp}.jsonl"
+    full_path = os.path.join(path, filename)

-    with open(path, "w") as f:
+    with open(full_path, "w") as f:
        for result in results:
            f.write(
                json.dumps(
@ -99,10 +106,10 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -
                + "\n"
            )
            avg_score += result[3]
-    print(f"Results saved to {path}")
+    print(f"save to {full_path}")
    avg_score /= len(results)

-    return avg_score
+    return round(avg_score, 5)

 async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
    data = await load_data(file_path, samples)
--- a/examples/ags/experiments/baselines/cot_humaneval.py
+++ b/examples/ags/experiments/baselines/cot_humaneval.py
@ -38,11 +38,12 @@ class CoTSolveGraph(SolveGraph):

 if __name__ == "__main__":
    async def main():
+        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
        llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
        graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="HumanEval")
        file_path = "examples/ags/data/human-eval-new.jsonl"
-        samples = 1 # 33/131  
-        path = "examples/ags/data/baselines/general"
+        samples = 131 # 33/131  
+        path = "examples/ags/data/baselines/general/humaneval"
        score = await humaneval_evaluation(graph, file_path, samples, path)
        return score