diff --git a/examples/ags/w_action_node/operator.py b/examples/ags/w_action_node/operator.py
index 031ef84b4..d49ac119c 100644
--- a/examples/ags/w_action_node/operator.py
+++ b/examples/ags/w_action_node/operator.py
@@ -71,7 +71,7 @@ class Ensemble(Operator):
     async def __call__(self, solutions:List, problem_description):
         solution_text = ""
         for solution in solutions:
-            solution_text += solution + "\n"
+            solution_text += str(solution) + "\n"
         prompt = ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
         node = await ActionNode.from_pydantic(EnsembleOp).fill(context=prompt, llm=self.llm)
         response = node.instruct_content.model_dump()
diff --git a/examples/ags/w_action_node/prompt.py b/examples/ags/w_action_node/prompt.py
index 46d851923..44d854bd8 100644
--- a/examples/ags/w_action_node/prompt.py
+++ b/examples/ags/w_action_node/prompt.py
@@ -20,7 +20,7 @@ If you believe the solution is capable of resolving the issue, return True; othe

 REVISE_PROMPT = """
 For the question described as {problem_description},
-please evaluate and revise the solution provided: {solution}, taking into account the review comments: {comment}."
+please evaluate and revise the solution provided: {solution}, taking into account the review feedback: {feedback}.
 Then output the revised solution.
 """

diff --git a/he_test.py b/he_test.py
new file mode 100644
index 000000000..36919433b
--- /dev/null
+++ b/he_test.py
@@ -0,0 +1,68 @@
+import asyncio
+
+from metagpt.llm import LLM
+from evalplus.data import get_human_eval_plus, write_jsonl
+from examples.ags.w_action_node.graph import HumanEvalGraph
+from examples.ags.w_action_node.operator import GenerateCode
+
+generate_code = GenerateCode(llm=LLM())
+solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability')
+
+
+
+async def samples_generate_sequence():
+    sample_list = []
+    for case in get_human_eval_plus().values():
+        solution_result = await solver(case['prompt'])
+        sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
+        sample_list.append(sample_dict)
+    write_jsonl("samples.jsonl", sample_list)
+
+async def samples_generate_ags():
+    sample_list = []
+    cases = list(get_human_eval_plus().values())
+
+    async def solve_with_id(case):
+        solution_result = await solver(case['prompt'])
+        return case['task_id'], solution_result['final_solution']
+
+    tasks = [solve_with_id(case) for case in cases]
+    results = await asyncio.gather(*tasks)
+
+    for task_id, solution in results:
+        sample_dict = dict(task_id=task_id, solution=solution)
+        sample_list.append(sample_dict)
+
+    write_jsonl("samples.jsonl", sample_list)
+
+    # humanevalgraph result (review & revise -> ensemble)
+    # humaneval (base tests)
+    # pass@1: 0.823
+    # humaneval+ (base + extra tests)
+    # pass@1: 0.774
+
+    # deepseek result
+    # humaneval (base tests)
+    # pass@1: 0.841
+    # humaneval+ (base + extra tests)
+    # pass@1: 0.780
+
+async def samples_generate_llm():
+    sample_list = []
+    cases = list(get_human_eval_plus().values())
+
+    async def solve_with_id(case):
+        solution_result = await generate_code(case['prompt'])
+        return case['task_id'], solution_result['code_solution']
+
+    tasks = [solve_with_id(case) for case in cases]
+    results = await asyncio.gather(*tasks)
+
+    for task_id, solution in results:
+        sample_dict = dict(task_id=task_id, solution=solution)
+        sample_list.append(sample_dict)
+
+    write_jsonl("samples.jsonl", sample_list)
+
+asyncio.run(samples_generate_llm())
+
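
Reviewer note (not part of the patch): samples_generate_ags and samples_generate_llm fire one request per HumanEval case through a single asyncio.gather, which can run into provider rate limits. Below is a minimal sketch of bounding concurrency with asyncio.Semaphore; the limit of 8 and the name samples_generate_ags_bounded are illustrative assumptions, and it reuses the module-level solver plus the evalplus helpers from he_test.py.

    import asyncio
    from evalplus.data import get_human_eval_plus, write_jsonl

    async def samples_generate_ags_bounded(limit: int = 8):
        # Cap in-flight solver calls; the limit is an assumed value, tune to the provider quota.
        semaphore = asyncio.Semaphore(limit)
        cases = list(get_human_eval_plus().values())

        async def bounded_solve(case):
            async with semaphore:
                solution_result = await solver(case['prompt'])
                return case['task_id'], solution_result['final_solution']

        results = await asyncio.gather(*(bounded_solve(case) for case in cases))
        write_jsonl("samples.jsonl", [dict(task_id=t, solution=s) for t, s in results])

The pass@1 figures in the inline comments presumably come from scoring the emitted samples.jsonl with evalplus (e.g. evalplus.evaluate --dataset humaneval --samples samples.jsonl); recording that command next to the numbers would make them reproducible.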