Update he (HumanEval)

2026-07-05 16:02:14 +02:00 · 2024-07-01 14:16:35 +08:00 · 2024-07-01 14:16:35 +08:00 · f1ce1330d7
commit f1ce1330d7
parent fd432fa132
3 changed files with 70 additions and 2 deletions
--- a/examples/ags/w_action_node/operator.py
+++ b/examples/ags/w_action_node/operator.py
@@ -71,7 +71,7 @@ class Ensemble(Operator):
    async def __call__(self, solutions:List, problem_description):
        solution_text = ""
        for solution in solutions:
-            solution_text += solution + "\n"
+            solution_text += str(solution) + "\n"
        prompt = ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
        node = await ActionNode.from_pydantic(EnsembleOp).fill(context=prompt, llm=self.llm)
        response = node.instruct_content.model_dump()
--- a/examples/ags/w_action_node/prompt.py
+++ b/examples/ags/w_action_node/prompt.py
@@ -20,7 +20,7 @@ If you believe the solution is capable of resolving the issue, return True; othe

 REVISE_PROMPT = """
 For the question described as {problem_description},
-please evaluate and revise the solution provided: {solution}, taking into account the review comments: {comment}."
+please evaluate and revise the solution provided: {solution}, taking into account the review feedbacks: {feedback}."
 Then output the revised solution.
 """

--- a/he_test.py
+++ b/he_test.py
@@ -0,0 +1,68 @@
+import asyncio
+
+from metagpt.llm import LLM
+from evalplus.data import get_human_eval_plus, write_jsonl
+from examples.ags.w_action_node.graph import HumanEvalGraph
+from examples.ags.w_action_node.operator import GenerateCode
+
+# Module-level operator and graph instances referenced by the sample
+# generators in this file. Each receives its own newly constructed LLM().
+generate_code = GenerateCode(
+    llm=LLM(),
+)
+solver = HumanEvalGraph(
+    name="solver",
+    llm=LLM(),
+    criteria="correctness, efficiency, readability",
+)
+
+
+
+async def samples_generate_sequence():
+    """Run ``solver`` over the evalplus cases one after another and save the output.
+
+    For every case returned by ``get_human_eval_plus()`` the case's
+    ``prompt`` is awaited through ``solver`` before the next case starts.
+    The ``final_solution`` entry of each result is stored together with the
+    case's ``task_id``, and the collected records are written to
+    ``samples.jsonl`` via ``write_jsonl``.
+    """
+    records = []
+    problems = get_human_eval_plus()
+    for problem in problems.values():
+        # Strictly sequential: one solver call is in flight at a time.
+        outcome = await solver(problem["prompt"])
+        records.append(
+            {"task_id": problem["task_id"], "solution": outcome["final_solution"]}
+        )
+    write_jsonl("samples.jsonl", records)
+
+async def samples_generate_ags():
+    """Run ``solver`` over all evalplus cases concurrently and save the output.
+
+    One coroutine per case is created and all of them are awaited together
+    with ``asyncio.gather``, which returns results in the same order as the
+    cases. Each ``(task_id, final_solution)`` pair becomes one record, and
+    the records are written to ``samples.jsonl`` via ``write_jsonl``.
+    """
+    problems = list(get_human_eval_plus().values())
+
+    async def _solve(problem):
+        # Return the id alongside the solution so records can be built afterwards.
+        outcome = await solver(problem["prompt"])
+        return problem["task_id"], outcome["final_solution"]
+
+    pending = [_solve(problem) for problem in problems]
+    solved = await asyncio.gather(*pending)
+
+    records = [
+        {"task_id": task_id, "solution": solution}
+        for task_id, solution in solved
+    ]
+    write_jsonl("samples.jsonl", records)
+
+    # humanevalgraph result (review & revise -> ensemble)
+    # humaneval (base tests)
+    # pass@1: 0.823
+    # humaneval+ (base + extra tests)
+    # pass@1: 0.774
+
+    # deepseek result
+    # humaneval (base tests)
+    # pass@1: 0.841
+    # humaneval+ (base + extra tests)
+    # pass@1: 0.780
+
+async def samples_generate_llm():
+    """Run ``generate_code`` over all evalplus cases concurrently and save the output.
+
+    One coroutine per case is created and all of them are awaited together
+    with ``asyncio.gather``, which returns results in the same order as the
+    cases. Each ``(task_id, code_solution)`` pair becomes one record, and
+    the records are written to ``samples.jsonl`` via ``write_jsonl``.
+    """
+    problems = list(get_human_eval_plus().values())
+
+    async def _generate(problem):
+        # Return the id alongside the code so records can be built afterwards.
+        outcome = await generate_code(problem["prompt"])
+        return problem["task_id"], outcome["code_solution"]
+
+    pending = [_generate(problem) for problem in problems]
+    generated = await asyncio.gather(*pending)
+
+    records = [
+        {"task_id": task_id, "solution": solution}
+        for task_id, solution in generated
+    ]
+    write_jsonl("samples.jsonl", records)
+
+# Start the generation run only when this file is executed as a script.
+# Without the guard, merely importing the module would launch it; the file
+# name matches pytest's default ``*_test.py`` collection pattern, so test
+# discovery would import it too.
+if __name__ == "__main__":
+    asyncio.run(samples_generate_llm())
+