Update

2026-06-05 14:55:18 +02:00 · 2024-07-25 10:47:17 +08:00 · 2024-07-25 10:47:17 +08:00 · 772d2aea56
commit 772d2aea56
parent ca1c8f8c5c
9 changed files with 583 additions and 65 deletions
--- a/he_test.py
+++ b/he_test.py
@ -1,20 +1,46 @@
 import asyncio
+import json
 from metagpt.llm import LLM
+from evalplus.data import get_human_eval_plus, write_jsonl
 from examples.ags.benchmark.humaneval import sample_generate, samples_generate, extract_failure_tests, automatic_evalplus
-from examples.ags.w_action_node.utils import jsonl_ranker
-
+from examples.ags.w_action_node.utils import jsonl_ranker, llm_extract_test_case
+from examples.ags.w_action_node.graph import HumanEvalGraph
 # 132 141 136 80 73
-# asyncio.run(sample_generate('HumanEval/118',result_path="llm_based_4.jsonl",mode="llm"))
-# asyncio.run(samples_generate(mode='ags',result_path="ags_based_1.jsonl"))
+# asyncio.run(sample_generate('HumanEval/118',result_path="llm_based_8.jsonl",mode="llm"))
+asyncio.run(samples_generate(mode='llm',result_path="llm_based_100.jsonl"))
 # jsonl_ranker("samples.jsonl", "samples.jsonl")

-result_path = "ags_based_2.jsonl"
-if automatic_evalplus(result_path):
-    unpassed_exapmle = extract_failure_tests(result_path[:-6]+"_eval_results.json")
-    print(unpassed_exapmle)
+# result_path = "ags_based_6.jsonl"
+# if automatic_evalplus(result_path):
+#     unpassed_exapmle = extract_failure_tests(result_path[:-6]+"_eval_results.json")
+#     print(unpassed_exapmle)

 # unpassed_exapmle = extract_failure_tests(file_path="2_eval_results.json")
 # print(unpassed_exapmle)

 # for example in failure_list:
-#     asyncio.run(sample_generate(example))
+#     asyncio.run(sample_generate(example))
+
+# TODO 抽取Public Test没搞完，先用几个测试跑一下流程
+# from evalplus.data import get_human_eval_plus
+
+# id_list = [87, 95, 107, 112, 127, 136, 148, 155]
+# id_list = [155]
+# cases_id = [f"HumanEval/{case_id}" for case_id in id_list]
+# cases = {case_id: get_human_eval_plus()[case_id]['prompt'] for case_id in cases_id}
+# async def main(cases):
+#     try:
+#         tasks = [llm_extract_test_case(case_id, case) for case_id, case in cases.items()]
+#         results = await asyncio.gather(*tasks)
+#     except:
+#         failed_tasks = [task_id for task_id in results if task_id is not None]
+#         print(failed_tasks)
+#     return results
+
+# asyncio.run(main(cases))
+
+# [72, 80, 82, 87, 90, 95, 107, 109, 112, 124, 126, 127, 128, 132, 134, 136, 137, 138, 148, 154, 155]
+
+# case_prompt= get_human_eval_plus()["HumanEval/136"]['prompt']
+# solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1)
+# result = asyncio.run(solver.alpha_codium(problem_id="HumanEval/136", problem=case_prompt, ensemble_count=1))