diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py
index 638a64021..7e639b1ab 100644
--- a/examples/ags/benchmark/humaneval.py
+++ b/examples/ags/benchmark/humaneval.py
@@ -3,6 +3,11 @@
 # @Author  : didi
 # @Desc    : test on human eval graph
 
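+# 1. Get results out.
+# 2. Code side: formatting problems; more format handling -> better results.
+# 3. GSM8k ->
+# 4. I will write the most basic GSM8k code; you will need to adapt the GSM8k experiment code.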
+
 import os
 import json
 import subprocess
@@ -17,7 +22,7 @@ from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock
 
 generate_code = GenerateCode(llm=LLM())
 generate_code_block = GenerateCodeBlock(llm=LLM())
-solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5)
+solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1)
 
 async def sample_generate(id, result_path:str="samples.jsonl",mode:str="ags"):
     case = get_human_eval_plus()[f"{id}"]
@@ -55,7 +60,7 @@ async def samples_generate(mode:str, result_path:str="samples.jsonl"):
                 'solution': solution_result['final_solution']
                 }
             elif mode == "alpha":
-                solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=5)
+                solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=1)
                 sample_dict = {
                 'task_id': case['task_id'],
                 'solution': solution_result['final_solution']
@@ -164,6 +169,7 @@ def automatic_sanitize(result_path: str = "samples.jsonl"):
     sanitized_path = f"{base_name}-sanitized.jsonl"
     
     return sanitized_path
+
 def automatic_evalplus(result_path:str ="samples.jsonl"):
     """
     在命令行中自动执行 evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only
diff --git a/examples/ags/w_action_node/graph.py b/examples/ags/w_action_node/graph.py
index 7bd32bdc9..217371cb9 100644
--- a/examples/ags/w_action_node/graph.py
+++ b/examples/ags/w_action_node/graph.py
@@ -7,6 +7,7 @@ from metagpt.llm import LLM
 from typing import List
 from examples.ags.w_action_node.operator import Generate, GenerateCode, GenerateCodeBlock, Review, Revise, FuEnsemble, MdEnsemble, DbEnsemble, Rephrase, Test
 from examples.ags.w_action_node.utils import extract_test_cases_from_jsonl
+from evalplus.data import get_human_eval_plus
 class Graph:
     def __init__(self, name:str, llm:LLM) -> None:
         self.name = name
@@ -46,24 +47,23 @@ class HumanEvalGraph(Graph):
         solution = await self.mdensemble("code", solution_list, problem)
         return solution
     
-    
     async def alpha_codium(self, problem_id:str, problem:str, ensemble_count:int = 3):
     # async def __call__(self,problem_id, problem:str, ensemble_count:int = 3):
         test_cases = extract_test_cases_from_jsonl(problem_id)
+        entry_point = get_human_eval_plus()[problem_id]['entry_point']  # the task's 'entry_point' field; forwarded below as function_name
         rephrase_problem = await self.rephrase(problem) # 在rephrase 中拼接原始的问题描述
         solution_list = []
         for _ in range(ensemble_count):
             for retry_count in range(5):
                 try:
-                    solution = await self.generate_code_block(problem, rephrase_problem)
+                    solution = await self.generate_code_block.rephrase_generate(problem, rephrase_problem, function_name=entry_point)
                     solution = solution.get('code_solution')
                     solution_list.append(solution)
                     break
                 except Exception as e:
                     print(e)
         solution = await self.mdensemble("code", solution_list, problem)
-        print("here",solution)
-        solution = await self.tester(problem, rephrase_problem, solution, test_cases)
+        solution = await self.tester(problem_id, problem, rephrase_problem, solution, test_cases)  # problem_id now comes first, matching Test.__call__
         return solution
 
     async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2):
@@ -96,6 +96,18 @@ class HumanEvalGraph(Graph):
             solution = solution.get('revised_solution')
         return solution
     
-
 class Gsm8kGraph(Graph):
-    pass
\ No newline at end of file
+    """Minimal GSM8k graph: answers a problem with a single Generate operator call."""
+    def __init__(self, name:str, llm: LLM) -> None:
+        super().__init__(name, llm)
+        self.generate = Generate(llm=llm)
+        self.rephrase = Rephrase(llm=llm)  # only referenced by the commented-out variant below
+
+    async def __call__(self, problem:str):
+        """Return the Generate result for `problem`; awaited so callers get a value, not a coroutine (NOTE(review): assumes Generate.__call__ is async)."""
+        return await self.generate(problem)
+
+    # async def __call__(self, problem:str):
+    # The prompt has not been adapted for this variant yet; it can be changed following the HumanEval one.
+    #     problem = await self.rephrase(problem)
+    #     return await self.generate(problem)
\ No newline at end of file
diff --git a/examples/ags/w_action_node/operator.py b/examples/ags/w_action_node/operator.py
index f7cf9b4b5..1069b73e5 100644
--- a/examples/ags/w_action_node/operator.py
+++ b/examples/ags/w_action_node/operator.py
@@ -3,6 +3,8 @@
 # @Author  : didi
 # @Desc    : operator demo of ags
 import ast
+import sys
+import traceback
 import random
 from typing import List, Tuple, Any, Dict
 from collections import Counter
@@ -115,6 +117,7 @@ class MdEnsemble(Operator):
         return shuffled_solutions, answer_mapping
 
     async def __call__(self, solution_type:str ,solutions:List[str], problem_description:str):
+        print(solutions)
         all_responses = []
         # 如果Solution方案是Code，我们利用AST去重
         if solution_type == "code":
@@ -132,6 +135,7 @@ class MdEnsemble(Operator):
                         updated_solutions.append(solution)
                 except SyntaxError:
                     # If the solution has a syntax error, we'll skip it
+                    print("here",solution)
                     continue
             solutions = updated_solutions
             updated_length = len(solutions)
@@ -316,44 +320,46 @@ class Rephrase(Operator):
 class Test(Operator):
     def __init__(self, name:str ="Tester", llm: LLM = LLM()):
         super().__init__(name, llm)
-
-    def test_cases_2_assert(self, test_cases):
-        return f"assert {test_cases[0]}({test_cases[1]}) == {test_cases[2]} \n"
     
-    def exec_code(self, solution, test_cases):
+    def exec_code(self, solution, test_cases, problem_id):
+        """Run the solution with its public tests; return [] on success, else a dict keyed 'test_fail_case' or 'exec_fail_case'."""
+        # TODO: revise later so that each test case is executed and reported individually.
         solution = solution["final_solution"]
-        pass_case = []
-        fail_case = []
-        for test_case in test_cases:
-            test_code = test_cases_2_test_functions(solution,test_case)
-            try:
-                exec(test_code)
-                pass_case.append(self.test_cases_2_assert(test_case))
-            except AssertionError as e:
-                fail_case.append(self.test_cases_2_assert(test_case))
-            except Exception as e:
-                with open("tester.txt", "a") as f:
-                    f.write(test_case[0] + "\n")
-                print(e)
-                return {"error":e}
-        if fail_case != []:
-            return fail_case
+        test_code = test_cases_2_test_functions(solution, test_cases)
+        print("test_code", test_code)
+        try:
+            exec(test_code, {})  # fresh namespace: generated code cannot overwrite or reuse this module's globals; NOTE: still unsandboxed
+        except AssertionError as e:
+            tb_str = traceback.format_exception(type(e), e, e.__traceback__)  # same triple sys.exc_info() yields here
+            with open("tester.txt", "a") as f:
+                f.write("test_error" +problem_id + "\n")
+            error_information = {"test_fail_case": {
+                "error_type": "AssertionError",
+                "error_message": str(e),
+                "traceback": tb_str
+            }}
+            print("error here", error_information)
+            return error_information
+        except Exception as e:
+            with open("tester.txt", "a") as f:
+                f.write(problem_id + "\n")  # record the id of the problem whose code failed to execute
+            return {"exec_fail_case":str(e)}
         return []
 
-    async def __call__(self, problem, rephrase_problem, solution, test_cases):
-        result = self.exec_code(solution, test_cases)
-        # 处理通过Public Tests的代码
-        # TODO 这里的问题是，如果Test直接通过了就没有办法Check Multi Tests了
+    async def __call__(self, problem_id, problem, rephrase_problem, solution, test_cases):
+        result = self.exec_code(solution, test_cases, problem_id)
+        print("result here", result)
         if result == []:
             return solution
         # 处理代码执行失败的代码
-        elif type(result) == dict:
-            result = result["error"]
+        elif "exec_fail_case" in result:
+            result = result["exec_fail_case"]
             prompt = REFLECTION_ON_PUBILIC_TEST_PROMPT.format(problem_description=problem, rephrase_problem=rephrase_problem, code_solution=solution, exec_pass=f"executed unsuccessfully, error: \n {result}", test_fail="executed unsucessfully")
             node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm)
             response = node.instruct_content.model_dump()
             return {"final_solution":response["refined_solution"]}
         else:
+            result = result["test_fail_case"]
             prompt = REFLECTION_ON_PUBILIC_TEST_PROMPT.format(problem_description=problem, rephrase_problem=rephrase_problem, code_solution=solution, exec_pass="executed successfully", test_fail=result)
             node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm)
             response = node.instruct_content.model_dump()
diff --git a/examples/ags/w_action_node/prompt.py b/examples/ags/w_action_node/prompt.py
index 1ed40bfbb..5d82d1a3c 100644
--- a/examples/ags/w_action_node/prompt.py
+++ b/examples/ags/w_action_node/prompt.py
@@ -43,7 +43,7 @@ Please maintain the JSON format in your response.
 # """
 
 GENERATE_CODEBLOCK_REPHRASE_PROMPT = """
-You are given a code contest problem, and a self-reflection on the problem: 
+Please provide a self-contained  Python script that solves the following problem in a markdown code block:
 
 ### Problem Description:
 {problem_description}
@@ -51,8 +51,11 @@ You are given a code contest problem, and a self-reflection on the problem:
 ### self reflection on the problem
 {rephrase_problem}
 
-=======================
-The above is an incomplete Python code fragment and reflection on it. Return the complete and correct code with no additional text.
+When creating your solution:
+1. Consider all edge cases and boundary conditions.
+2. Avoid oversimplification - address all aspects of the problem.
+3. Ensure your logic covers all stated requirements.
+4. Avoid adding additional test cases beyond those provided in the problem description.
 """
 
 # GENERATE_CODEBLOCK_PROMPT = """
@@ -67,10 +70,9 @@ Please provide a self-contained  Python script that solves the following problem
 
 When creating your solution:
 1. Consider all edge cases and boundary conditions.
-2. Consider the order of operations in your solution and how each step affects subsequent steps.
-3. Avoid oversimplification - address all aspects of the problem.
-4. Ensure your logic covers all stated requirements.
-5. Avoid adding additional test cases beyond those provided in the problem description.
+2. Avoid oversimplification - address all aspects of the problem.
+3. Ensure your logic covers all stated requirements.
+4. Avoid adding additional test cases beyond those provided in the problem description.
 """
 
 REVIEW_PROMPT = """
diff --git a/examples/ags/w_action_node/utils.py b/examples/ags/w_action_node/utils.py
index df757ba73..366cbb13e 100644
--- a/examples/ags/w_action_node/utils.py
+++ b/examples/ags/w_action_node/utils.py
@@ -71,17 +71,12 @@ def parse_python_literal(s):
     except (ValueError, SyntaxError):
         return s
 
-def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test.jsonl"):
+def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test_reflexion.jsonl"):
     # 保留原有的硬编码测试用例
     hardcoded_cases = {
-        "HumanEval/87": [ ["get_row", [[[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 1, 6], [1, 2, 3, 4, 5, 1]], 1], [(0, 0), (1, 4), (1, 0), (2, 5), (2, 0)]], ["get_row", [[], 1], []], ["get_row", [[[], [1], [1, 2, 3]], 3], [(2, 2)]] ],
-        "HumanEval/95": [ ["check_dict_case", [{"a": "apple", "b": "banana"}], True], ["check_dict_case", [{"a": "apple", "A": "banana", "B": "banana"}], False], ["check_dict_case", [{"a": "apple", "8": "banana", "a": "apple"}], False], ["check_dict_case", [{"Name": "John", "Age": "36", "City": "Houston"}], False], ["check_dict_case", [{"STATE": "NC", "ZIP": "12345"}], True] ],
-        "HumanEval/107": [ ["even_odd_palindrome", [3], (1, 2)], ["even_odd_palindrome", [12], (4, 6)] ],
-        "HumanEval/112": [ ["reverse_delete", ["abcde", "ae"], ("bcd", False)], ["reverse_delete", ["abcdef", "b"], ("acdef", False)], ["reverse_delete", ["abcdedcba", "ab"], ("cdedc", True)] ],
-        "HumanEval/127": [ ["intersection", [(1, 2), (2, 3)], "NO"], ["intersection", [(-1, 1), (0, 4)], "NO"], ["intersection", [(-3, -1), (-5, 5)], "YES"] ],
-        "HumanEval/136": [ ["largest_smallest_integers", [2, 4, 1, 3, 5, 7], (None, 1)], ["largest_smallest_integers", [], (None, None)], ["largest_smallest_integers", [0], (None, None)] ],
-        "HumanEval/148": [ ["bf", ["Jupiter", "Neptune"], ("Saturn", "Uranus")], ["bf", ["Earth", "Mercury"], ("Venus",)], ["bf", ["Mercury", "Uranus"], ("Venus", "Earth", "Mars", "Jupiter", "Saturn")], ["bf", ["InvalidPlanet", "Neptune"], ()], ["bf", ["Jupiter", "InvalidPlanet"], ()], ["bf", ["Mercury", "Mercury"], ()] ],
-        "HumanEval/155": [ ["even_odd_count", [-12], (1, 1)], ["even_odd_count", [123], (1, 2)] ]
+        "HumanEval/32": "",
+        "HumanEval/38": "",
+        "HumanEval/50": "",
     }
 
     # 检查是否有硬编码的测试用例
@@ -92,16 +87,8 @@ def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test.jso
     with open(file_path, 'r') as file:
         for line in file:
             data = json.loads(line)
-            if problem_id in data:
-                problem_data = data[problem_id]
-                # 处理测试用例
-                for i, test_case in enumerate(problem_data):
-                    # 函数名保持不变
-                    # 参数列表需要解析
-                    test_case[1] = [parse_python_literal(arg) for arg in test_case[1]]
-                    # 预期输出需要解析
-                    test_case[2] = parse_python_literal(test_case[2])
-                return problem_data
+            if data.get("id") == problem_id:
+                return data.get("test")
 
     return None  # 如果没有找到问题，返回 None
 
@@ -158,45 +145,53 @@ async def llm_extract_test_case(id, problem_description: str, file_path:str="pub
 
 import json
 
-def test_cases_2_test_functions(solution: str, test_case: List):
-    print("here",solution)
-    function_name = test_case[0]
+# def test_cases_2_test_functions(solution: str, test_case: List):
+#     print("test_case", test_case)
+#     function_name = test_case[0]
     
-    def format_param(param):
-        if isinstance(param, str):
-            return repr(param)
-        elif isinstance(param, (int, float, bool)):
-            return str(param)
-        elif isinstance(param, list):
-            return '[' + ', '.join(format_param(item) for item in param) + ']'
-        elif isinstance(param, tuple):
-            return '(' + ', '.join(format_param(item) for item in param) + ')'
-        elif isinstance(param, dict):
-            return '{' + ', '.join(f'{format_param(k)}: {format_param(v)}' for k, v in param.items()) + '}'
-        elif isinstance(param, type(None)):
-            return 'None'
-        else:
-            raise ValueError(f"Unsupported parameter type: {type(param)}")
+#     def format_param(param):
+#         if isinstance(param, str):
+#             return repr(param)
+#         elif isinstance(param, (int, float, bool)):
+#             return str(param)
+#         elif isinstance(param, list):
+#             return '[' + ', '.join(format_param(item) for item in param) + ']'
+#         elif isinstance(param, tuple):
+#             return '(' + ', '.join(format_param(item) for item in param) + ')'
+#         elif isinstance(param, dict):
+#             return '{' + ', '.join(f'{format_param(k)}: {format_param(v)}' for k, v in param.items()) + '}'
+#         elif isinstance(param, type(None)):
+#             return 'None'
+#         else:
+#             raise ValueError(f"Unsupported parameter type: {type(param)}")
 
-    parameters = ', '.join(format_param(item) for item in test_case[1])
-    print(type(test_case[2]), test_case[2])
-    expected_output = format_param(test_case[2])
-    print(expected_output)
+#     parameters = ', '.join(format_param(item) for item in test_case[1])
+#     print(test_case[1], parameters)
 
+#     expected_output = format_param(test_case[2])
+#     print(type(test_case[2]), test_case[2], expected_output)
     
+#     tester_function = f"""
+# {solution}
+
+# def check(candidate):
+#     assert candidate({parameters}) == {expected_output}
+
+# check({function_name})
+#     """
+    
+#     print(f"""
+#     Generated test function:
+#     {tester_function}
+#     """)
+    
+#     return tester_function
+    
+
+def test_cases_2_test_functions(solution: str, test_cases: str) -> str:  # joins the solution source and the test source into one script string
     tester_function = f"""
 {solution}
 
-def check(candidate):
-    assert candidate({parameters}) == {expected_output}
-
-check({function_name})
-    """
-    
-    print(f"""
-    Generated test function:
-    {tester_function}
-    """)
-    
-    return tester_function
-    
\ No newline at end of file
+{test_cases}
+""" 
+    return tester_function
\ No newline at end of file
diff --git a/he_test.py b/he_test.py
index 567e592e3..a8d750d68 100644
--- a/he_test.py
+++ b/he_test.py
@@ -5,6 +5,7 @@ from evalplus.data import get_human_eval_plus, write_jsonl
 from examples.ags.benchmark.humaneval import sample_generate, samples_generate, extract_failure_tests, automatic_evalplus
 from examples.ags.w_action_node.utils import jsonl_ranker, llm_extract_test_case
 from examples.ags.w_action_node.graph import HumanEvalGraph
+from examples.ags.w_action_node.utils import extract_test_cases_from_jsonl
 # 132 141 136 80 73
 # asyncio.run(sample_generate('HumanEval/140',result_path="llm_based_1000.jsonl",mode="llm")) 
 # asyncio.run(sample_generate('HumanEval/140',result_path="llm_based_1000.jsonl",mode="llm")) 
@@ -12,7 +13,7 @@ from examples.ags.w_action_node.graph import HumanEvalGraph
 # asyncio.run(sample_generate('HumanEval/67',result_path="llm_based_1000.jsonl",mode="llm"))
 # asyncio.run(sample_generate('HumanEval/108',result_path="llm_based_1000.jsonl",mode="llm"))
 # asyncio.run(sample_generate('HumanEval/110',result_path="llm_based_1000.jsonl",mode="llm"))
-# asyncio.run(samples_generate(mode='alpha',result_path="alpha_based_100.jsonl"))
+asyncio.run(samples_generate(mode='alpha',result_path="alpha_based_104.jsonl"))
 # jsonl_ranker("llm_based_137.jsonl", "llm_based_137.jsonl")
 
 # result_path = "ags_based_6.jsonl"
@@ -47,6 +48,10 @@ from examples.ags.w_action_node.graph import HumanEvalGraph
 # [72, 80, 82, 87, 90, 95, 107, 109, 112, 124, 126, 127, 128, 132, 134, 136, 137, 138, 148, 154, 155]
 
 # TODO 代码问题，改动了一个地方导致Solution 没有了
-case_prompt= get_human_eval_plus()["HumanEval/76"]['prompt']
-solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1)
-result = asyncio.run(solver.alpha_codium(problem_id="HumanEval/136", problem=case_prompt, ensemble_count=1))
\ No newline at end of file
+# case_prompt= get_human_eval_plus()["HumanEval/140"]['prompt']
+# solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1)
+# result = asyncio.run(solver.alpha_codium(problem_id="HumanEval/140", problem=case_prompt, ensemble_count=1))
+
+# 1. The public test dataset is wrong.
+# 2. Revise the concrete content of the two prompts.
+# 3. Try adding the ability to revise the solution after a test failure.
\ No newline at end of file
diff --git a/metagpt/actions/action_node.py b/metagpt/actions/action_node.py
index 904ada70f..20a73a433 100644
--- a/metagpt/actions/action_node.py
+++ b/metagpt/actions/action_node.py
@@ -512,12 +512,13 @@ class ActionNode:
         import re
         field_name = self.get_field_name()
         prompt = context
-        print(f"prompt: \n{prompt}")
+        # print("generate prompt", "\n", prompt)
         content = await self.llm.aask(prompt, timeout=timeout)
-        # TODO 在前置逻辑中完成entrypoint的提取就可以
+        # print("generate content", "\n", content)
         extracted_code = sanitize(code=content, entrypoint=function_name)
         # extracted_code = extract_code_from_response(content)    
         result = {field_name: extracted_code}
+        # print("final_result", "\n", result)
         return result
     
     async def messages_fill(