diff --git a/examples/ags/w_action_node/operator.py b/examples/ags/w_action_node/operator.py
index 031ef84b4..d49ac119c 100644
--- a/examples/ags/w_action_node/operator.py
+++ b/examples/ags/w_action_node/operator.py
@@ -71,7 +71,7 @@ class Ensemble(Operator):
     async def __call__(self, solutions:List, problem_description):
         solution_text = ""
         for solution in solutions:
-            solution_text += solution + "\n"
+            solution_text += str(solution) + "\n"
         prompt = ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
         node = await ActionNode.from_pydantic(EnsembleOp).fill(context=prompt, llm=self.llm)
         response = node.instruct_content.model_dump()
diff --git a/examples/ags/w_action_node/prompt.py b/examples/ags/w_action_node/prompt.py
index 46d851923..44d854bd8 100644
--- a/examples/ags/w_action_node/prompt.py
+++ b/examples/ags/w_action_node/prompt.py
@@ -20,7 +20,7 @@ If you believe the solution is capable of resolving the issue, return True; othe

 REVISE_PROMPT = """
 For the question described as {problem_description},
-please evaluate and revise the solution provided: {solution}, taking into account the review comments: {comment}."
+please evaluate and revise the solution provided: {solution}, taking into account the review feedback: {feedback}.
 Then output the revised solution.
 """

diff --git a/he_test.py b/he_test.py
new file mode 100644
index 000000000..36919433b
--- /dev/null
+++ b/he_test.py
@@ -0,0 +1,68 @@
+import asyncio
+
+from metagpt.llm import LLM
+from evalplus.data import get_human_eval_plus, write_jsonl
+from examples.ags.w_action_node.graph import HumanEvalGraph
+from examples.ags.w_action_node.operator import GenerateCode
+
+generate_code = GenerateCode(llm=LLM())
+solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability')
+
+
+
+async def samples_generate_sequence():
+    sample_list = []
+    for case in get_human_eval_plus().values():
+        solution_result = await solver(case['prompt'])
+        sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
+        sample_list.append(sample_dict)
+    write_jsonl("samples.jsonl", sample_list)
+
+async def samples_generate_ags():
+    sample_list = []
+    cases = list(get_human_eval_plus().values())
+
+    async def solve_with_id(case):
+        solution_result = await solver(case['prompt'])
+        return case['task_id'], solution_result['final_solution']
+
+    tasks = [solve_with_id(case) for case in cases]
+    results = await asyncio.gather(*tasks)
+
+    for task_id, solution in results:
+        sample_dict = dict(task_id=task_id, solution=solution)
+        sample_list.append(sample_dict)
+
+    write_jsonl("samples.jsonl", sample_list)
+
+    # humanevalgraph result (review & revise -> ensemble)
+    # humaneval (base tests)
+    # pass@1: 0.823
+    # humaneval+ (base + extra tests)
+    # pass@1: 0.774
+
+    # deepseek result
+    # humaneval (base tests)
+    # pass@1: 0.841
+    # humaneval+ (base + extra tests)
+    # pass@1: 0.780
+
+async def samples_generate_llm():
+    sample_list = []
+    cases = list(get_human_eval_plus().values())
+
+    async def solve_with_id(case):
+        solution_result = await generate_code(case['prompt'])
+        return case['task_id'], solution_result['code_solution']
+
+    tasks = [solve_with_id(case) for case in cases]
+    results = await asyncio.gather(*tasks)
+
+    for task_id, solution in results:
+        sample_dict = dict(task_id=task_id, solution=solution)
+        sample_list.append(sample_dict)
+
+    write_jsonl("samples.jsonl", sample_list)
+
+asyncio.run(samples_generate_llm())
+
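
Reviewer note (not part of the patch): samples_generate_ags and samples_generate_llm fire one request per HumanEval case through a single asyncio.gather, which can run into provider rate limits. Below is a minimal sketch of bounding concurrency with asyncio.Semaphore; the limit of 8 and the name samples_generate_ags_bounded are illustrative assumptions, and it reuses the module-level solver plus the evalplus helpers from he_test.py.

    import asyncio
    from evalplus.data import get_human_eval_plus, write_jsonl

    async def samples_generate_ags_bounded(limit: int = 8):
        # Cap in-flight solver calls; the limit is an assumed value, tune to the provider quota.
        semaphore = asyncio.Semaphore(limit)
        cases = list(get_human_eval_plus().values())

        async def bounded_solve(case):
            async with semaphore:
                solution_result = await solver(case['prompt'])
                return case['task_id'], solution_result['final_solution']

        results = await asyncio.gather(*(bounded_solve(case) for case in cases))
        write_jsonl("samples.jsonl", [dict(task_id=t, solution=s) for t, s in results])

The pass@1 figures in the inline comments presumably come from scoring the emitted samples.jsonl with evalplus (e.g. evalplus.evaluate --dataset humaneval --samples samples.jsonl); recording that command next to the numbers would make them reproducible.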