diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py index f2c1d5bb2..5273e84e5 100644 --- a/examples/ags/benchmark/humaneval.py +++ b/examples/ags/benchmark/humaneval.py @@ -21,7 +21,7 @@ solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficie async def sample_generate(id): case = get_human_eval_plus()[f"{id}"] - solution_result = await solver(case['prompt'],ensemble_count=5) + solution_result = await solver(case['prompt'],ensemble_count=3) sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution']) with open("samples.jsonl", mode='a') as f: f.write(json.dumps(sample_dict) + '\n') @@ -153,6 +153,7 @@ def extract_failure_tests(file_path:str = "/Users/trl/Github_project/MetaGPT-Mat # asyncio.run(sample_generate('HumanEval/101')) -# asyncio.run(samples_generate(mode='llm')) +# asyncio.run(samples_generate(mode='ags')) # jsonl_ranker("samples.jsonl", "samples.jsonl") -# {"task_id": "HumanEval/101", "solution": "def words_string(s):\n import re\n return re.split(r'[,\\s]\\s*', s)"} \ No newline at end of file +# {"task_id": "HumanEval/101", "solution": "def words_string(s):\n import re\n return re.split(r'[,\\s]\\s*', s)"} + diff --git a/examples/ags/w_action_node/operator_an.py b/examples/ags/w_action_node/operator_an.py index 66d8baf08..673f635b4 100644 --- a/examples/ags/w_action_node/operator_an.py +++ b/examples/ags/w_action_node/operator_an.py @@ -25,9 +25,5 @@ class EnsembleOp(BaseModel): final_solution: str = Field(default="", description="Final ensemble solution for this problem") class MdEnsembleOp(BaseModel): - thought: str = Field(default="", - description="Analyze the solutions and think what's the best step by step.") - solution_letter: str = Field(default="", - description=""" - Based on the problem and solution candidates, carefully analyze which is the best answer. Focus solely on the correctness of the solution in addressing the problem. - Provide your final decision by writing the chosen solution number. (eg.A) """) \ No newline at end of file + thought: str = Field(default="", description="Analyze the solutions and think what's the best step by step.") + solution_letter: str = Field(default="", description="Choose The Best Solution, and output only one solution letter") \ No newline at end of file diff --git a/examples/ags/w_action_node/prompt.py b/examples/ags/w_action_node/prompt.py index f81331a55..00e83ec74 100644 --- a/examples/ags/w_action_node/prompt.py +++ b/examples/ags/w_action_node/prompt.py @@ -66,4 +66,7 @@ MD_ENSEMBLE_PROMPT = """ {solutions} + +### Instructions +Based on the problem and solution candidates, carefully analyze which is the best answer. Focus solely on the correctness of the solution in addressing the problem. Provide your final decision by writing the chosen solution number. (eg.B). Keep the json format. """ \ No newline at end of file diff --git a/he_test.py b/he_test.py index 2102dee83..23fc9139f 100644 --- a/he_test.py +++ b/he_test.py @@ -5,7 +5,7 @@ from examples.ags.w_action_node.utils import jsonl_ranker # asyncio.run(sample_generate('HumanEval/101')) # asyncio.run(sample_generate('HumanEval/1')) -# asyncio.run(samples_generate(mode='llm')) +asyncio.run(samples_generate(mode='ags')) # jsonl_ranker("samples.jsonl", "samples.jsonl") @@ -14,4 +14,9 @@ from examples.ags.w_action_node.utils import jsonl_ranker # print(unpassed_exapmle) # unpassed_exapmle = extract_failure_tests() -# print(unpassed_exapmle) \ No newline at end of file +# print(unpassed_exapmle) + +# failure_list = ['HumanEval/0', 'HumanEval/1', 'HumanEval/7', 'HumanEval/16', 'HumanEval/24', 'HumanEval/31', 'HumanEval/40', 'HumanEval/56', 'HumanEval/67', 'HumanEval/74', 'HumanEval/83', 'HumanEval/86', 'HumanEval/87', 'HumanEval/90', 'HumanEval/95', 'HumanEval/101', 'HumanEval/104', 'HumanEval/113', 'HumanEval/125', 'HumanEval/132', 'HumanEval/135', 'HumanEval/140', 'HumanEval/143', 'HumanEval/145', 'HumanEval/154', 'HumanEval/161'] + +# for example in failure_list: +# asyncio.run(sample_generate(example)) \ No newline at end of file