diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py index 4d9ab0480..fd28cff26 100644 --- a/examples/ags/benchmark/humaneval.py +++ b/examples/ags/benchmark/humaneval.py @@ -82,10 +82,17 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren return await tqdm_asyncio.gather(*tasks, desc="Evaluating HumanEval problems", total=len(data)) +import os +import time +import json + def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -> float: avg_score = 0 + timestamp = int(time.time()) + filename = f"humaneval_results_{timestamp}.jsonl" + full_path = os.path.join(path, filename) - with open(path, "w") as f: + with open(full_path, "w") as f: for result in results: f.write( json.dumps( @@ -99,10 +106,10 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) - + "\n" ) avg_score += result[3] - print(f"Results saved to {path}") + print(f"save to {full_path}") avg_score /= len(results) - return avg_score + return round(avg_score, 5) async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float: data = await load_data(file_path, samples) diff --git a/examples/ags/experiments/baselines/cot_humaneval.py b/examples/ags/experiments/baselines/cot_humaneval.py index 8f0db1066..27d5f8801 100644 --- a/examples/ags/experiments/baselines/cot_humaneval.py +++ b/examples/ags/experiments/baselines/cot_humaneval.py @@ -38,11 +38,12 @@ class CoTSolveGraph(SolveGraph): if __name__ == "__main__": async def main(): + # llm_config = ModelsConfig.default().get("gpt-4o-mini") llm_config = ModelsConfig.default().get("gpt-35-turbo-1106") graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="HumanEval") file_path = "examples/ags/data/human-eval-new.jsonl" - samples = 1 # 33/131 - path = "examples/ags/data/baselines/general" + samples = 131 # 33/131 + path = "examples/ags/data/baselines/general/humaneval" score = await humaneval_evaluation(graph, file_path, samples, path) return score