Update

2026-05-03 04:42:38 +02:00 · 2024-07-29 22:00:07 +08:00 · 2024-07-29 22:00:07 +08:00 · 3fc3d217a8
commit 3fc3d217a8
parent eac4b6c3e6
7 changed files with 128 additions and 101 deletions
--- a/examples/ags/benchmark/humaneval.py
+++ b/examples/ags/benchmark/humaneval.py
@ -3,6 +3,11 @@
 # @Author  : didi
 # @Desc    : test on human eval graph

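+# 1. Get working results first.
+# 2. On the code side: formatting issues; add more output-format handling -> improves results.
+# 3. GSM8k ->
+# 4. I will write the most basic GSM8k code; the GSM8k experiment code needs to be rewritten by you.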
+
 import os
 import json
 import subprocess
@ -17,7 +22,7 @@ from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock

 generate_code = GenerateCode(llm=LLM())
 generate_code_block = GenerateCodeBlock(llm=LLM())
-solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5)
+solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1)

 async def sample_generate(id, result_path:str="samples.jsonl",mode:str="ags"):
    case = get_human_eval_plus()[f"{id}"]
@ -55,7 +60,7 @@ async def samples_generate(mode:str, result_path:str="samples.jsonl"):
                'solution': solution_result['final_solution']
                }
            elif mode == "alpha":
-                solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=5)
+                solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=1)
                sample_dict = {
                'task_id': case['task_id'],
                'solution': solution_result['final_solution']
@ -164,6 +169,7 @@ def automatic_sanitize(result_path: str = "samples.jsonl"):
    sanitized_path = f"{base_name}-sanitized.jsonl"
    
    return sanitized_path
+
 def automatic_evalplus(result_path:str ="samples.jsonl"):
    """
    在命令行中自动执行 evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only
--- a/examples/ags/w_action_node/graph.py
+++ b/examples/ags/w_action_node/graph.py
@ -7,6 +7,7 @@ from metagpt.llm import LLM
 from typing import List
 from examples.ags.w_action_node.operator import Generate, GenerateCode, GenerateCodeBlock, Review, Revise, FuEnsemble, MdEnsemble, DbEnsemble, Rephrase, Test
 from examples.ags.w_action_node.utils import extract_test_cases_from_jsonl
+from evalplus.data import get_human_eval_plus
 class Graph:
    def __init__(self, name:str, llm:LLM) -> None:
        self.name = name
@ -46,24 +47,23 @@ class HumanEvalGraph(Graph):
        solution = await self.mdensemble("code", solution_list, problem)
        return solution
    
-    
    async def alpha_codium(self, problem_id:str, problem:str, ensemble_count:int = 3):
    # async def __call__(self,problem_id, problem:str, ensemble_count:int = 3):
        test_cases = extract_test_cases_from_jsonl(problem_id)
+        entry_point = get_human_eval_plus()[problem_id]['entry_point']
        rephrase_problem = await self.rephrase(problem) # 在rephrase 中拼接原始的问题描述
        solution_list = []
        for _ in range(ensemble_count):
            for retry_count in range(5):
                try:
-                    solution = await self.generate_code_block(problem, rephrase_problem)
+                    solution = await self.generate_code_block.rephrase_generate(problem, rephrase_problem, function_name=entry_point)
                    solution = solution.get('code_solution')
                    solution_list.append(solution)
                    break
                except Exception as e:
                    print(e)
        solution = await self.mdensemble("code", solution_list, problem)
-        print("here",solution)
-        solution = await self.tester(problem, rephrase_problem, solution, test_cases)
+        solution = await self.tester(problem_id, problem, rephrase_problem, solution, test_cases)
        return solution

    async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2):
@ -96,6 +96,18 @@ class HumanEvalGraph(Graph):
            solution = solution.get('revised_solution')
        return solution
    
-
 class Gsm8kGraph(Graph):
-    pass
+    def __init__(self, name:str, llm: LLM) -> None:
+        super().__init__(name, llm)
+        self.generate = Generate(llm=llm)
+        self.rephrase = Rephrase(llm=llm)
+    
+    async def __call__(self, problem:str):
+        solution = self.generate(problem)
+        return solution
+    
+    # async def __call__(self, problem:str):
+    # NOTE: the corresponding prompt has not been adapted for this variant yet; it can be revised following the HumanEval version.
+    #     problem = await self.rephrase(problem)
+    #     solution = self.generate(problem)
+    #     return solution
--- a/examples/ags/w_action_node/operator.py
+++ b/examples/ags/w_action_node/operator.py
@ -3,6 +3,8 @@
 # @Author  : didi
 # @Desc    : operator demo of ags
 import ast
+import sys
+import traceback
 import random
 from typing import List, Tuple, Any, Dict
 from collections import Counter
@ -115,6 +117,7 @@ class MdEnsemble(Operator):
        return shuffled_solutions, answer_mapping

    async def __call__(self, solution_type:str ,solutions:List[str], problem_description:str):
+        print(solutions)
        all_responses = []
        # 如果Solution方案是Code，我们利用AST去重
        if solution_type == "code":
@ -132,6 +135,7 @@ class MdEnsemble(Operator):
                        updated_solutions.append(solution)
                except SyntaxError:
                    # If the solution has a syntax error, we'll skip it
+                    print("here",solution)
                    continue
            solutions = updated_solutions
            updated_length = len(solutions)
@ -316,44 +320,46 @@ class Rephrase(Operator):
 class Test(Operator):
    def __init__(self, name:str ="Tester", llm: LLM = LLM()):
        super().__init__(name, llm)
-
-    def test_cases_2_assert(self, test_cases):
-        return f"assert {test_cases[0]}({test_cases[1]}) == {test_cases[2]} \n"
    
-    def exec_code(self, solution, test_cases):
+    def exec_code(self, solution, test_cases, problem_id):
+        # TODO: needs further rework; ideally each test case should be executed and checked individually.
        solution = solution["final_solution"]
-        pass_case = []
-        fail_case = []
-        for test_case in test_cases:
-            test_code = test_cases_2_test_functions(solution,test_case)
-            try:
-                exec(test_code)
-                pass_case.append(self.test_cases_2_assert(test_case))
-            except AssertionError as e:
-                fail_case.append(self.test_cases_2_assert(test_case))
-            except Exception as e:
-                with open("tester.txt", "a") as f:
-                    f.write(test_case[0] + "\n")
-                print(e)
-                return {"error":e}
-        if fail_case != []:
-            return fail_case
+        test_code = test_cases_2_test_functions(solution, test_cases)
+        print("test_code", test_code)
+        try:
+            exec(test_code, globals())
+        except AssertionError as e:
+            exc_type, exc_value, exc_traceback = sys.exc_info()
+            tb_str = traceback.format_exception(exc_type, exc_value, exc_traceback)
+            with open("tester.txt", "a") as f:
+                f.write("test_error" +problem_id + "\n")
+            error_infomation = {"test_fail_case": {
+                "error_type": "AssertionError",
+                "error_message": str(e),
+                "traceback": tb_str
+            }}
+            print("error here", error_infomation)
+            return error_infomation
+        except Exception as e:
+            with open("tester.txt", "a") as f:
+                f.write(problem_id + "\n")
+            return {"exec_fail_case":str(e)}
        return []

-    async def __call__(self, problem, rephrase_problem, solution, test_cases):
-        result = self.exec_code(solution, test_cases)
-        # 处理通过Public Tests的代码
-        # TODO 这里的问题是，如果Test直接通过了就没有办法Check Multi Tests了
+    async def __call__(self, problem_id, problem, rephrase_problem, solution, test_cases):
+        result = self.exec_code(solution, test_cases, problem_id)
+        print("result here", result)
        if result == []:
            return solution
        # 处理代码执行失败的代码
-        elif type(result) == dict:
-            result = result["error"]
+        elif "exec_fail_case" in result:
+            result = result["exec_fail_case"]
            prompt = REFLECTION_ON_PUBILIC_TEST_PROMPT.format(problem_description=problem, rephrase_problem=rephrase_problem, code_solution=solution, exec_pass=f"executed unsuccessfully, error: \n {result}", test_fail="executed unsucessfully")
            node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm)
            response = node.instruct_content.model_dump()
            return {"final_solution":response["refined_solution"]}
        else:
+            result = result["test_fail_case"]
            prompt = REFLECTION_ON_PUBILIC_TEST_PROMPT.format(problem_description=problem, rephrase_problem=rephrase_problem, code_solution=solution, exec_pass="executed successfully", test_fail=result)
            node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm)
            response = node.instruct_content.model_dump()
--- a/examples/ags/w_action_node/prompt.py
+++ b/examples/ags/w_action_node/prompt.py
@ -43,7 +43,7 @@ Please maintain the JSON format in your response.
 # """

 GENERATE_CODEBLOCK_REPHRASE_PROMPT = """
-You are given a code contest problem, and a self-reflection on the problem: 
+Please provide a self-contained  Python script that solves the following problem in a markdown code block:

 ### Problem Description:
 {problem_description}
@ -51,8 +51,11 @@ You are given a code contest problem, and a self-reflection on the problem:
 ### self reflection on the problem
 {rephrase_problem}

-=======================
-The above is an incomplete Python code fragment and reflection on it. Return the complete and correct code with no additional text.
+When creating your solution:
+1. Consider all edge cases and boundary conditions.
+2. Avoid oversimplification - address all aspects of the problem.
+3. Ensure your logic covers all stated requirements.
+4. Avoid adding additional test cases beyond those provided in the problem description.
 """

 # GENERATE_CODEBLOCK_PROMPT = """
@ -67,10 +70,9 @@ Please provide a self-contained  Python script that solves the following problem

 When creating your solution:
 1. Consider all edge cases and boundary conditions.
-2. Consider the order of operations in your solution and how each step affects subsequent steps.
-3. Avoid oversimplification - address all aspects of the problem.
-4. Ensure your logic covers all stated requirements.
-5. Avoid adding additional test cases beyond those provided in the problem description.
+2. Avoid oversimplification - address all aspects of the problem.
+3. Ensure your logic covers all stated requirements.
+4. Avoid adding additional test cases beyond those provided in the problem description.
 """

 REVIEW_PROMPT = """
--- a/examples/ags/w_action_node/utils.py
+++ b/examples/ags/w_action_node/utils.py
@ -71,17 +71,12 @@ def parse_python_literal(s):
    except (ValueError, SyntaxError):
        return s

-def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test.jsonl"):
+def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test_reflexion.jsonl"):
    # 保留原有的硬编码测试用例
    hardcoded_cases = {
-        "HumanEval/87": [ ["get_row", [[[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 1, 6], [1, 2, 3, 4, 5, 1]], 1], [(0, 0), (1, 4), (1, 0), (2, 5), (2, 0)]], ["get_row", [[], 1], []], ["get_row", [[[], [1], [1, 2, 3]], 3], [(2, 2)]] ],
-        "HumanEval/95": [ ["check_dict_case", [{"a": "apple", "b": "banana"}], True], ["check_dict_case", [{"a": "apple", "A": "banana", "B": "banana"}], False], ["check_dict_case", [{"a": "apple", "8": "banana", "a": "apple"}], False], ["check_dict_case", [{"Name": "John", "Age": "36", "City": "Houston"}], False], ["check_dict_case", [{"STATE": "NC", "ZIP": "12345"}], True] ],
-        "HumanEval/107": [ ["even_odd_palindrome", [3], (1, 2)], ["even_odd_palindrome", [12], (4, 6)] ],
-        "HumanEval/112": [ ["reverse_delete", ["abcde", "ae"], ("bcd", False)], ["reverse_delete", ["abcdef", "b"], ("acdef", False)], ["reverse_delete", ["abcdedcba", "ab"], ("cdedc", True)] ],
-        "HumanEval/127": [ ["intersection", [(1, 2), (2, 3)], "NO"], ["intersection", [(-1, 1), (0, 4)], "NO"], ["intersection", [(-3, -1), (-5, 5)], "YES"] ],
-        "HumanEval/136": [ ["largest_smallest_integers", [2, 4, 1, 3, 5, 7], (None, 1)], ["largest_smallest_integers", [], (None, None)], ["largest_smallest_integers", [0], (None, None)] ],
-        "HumanEval/148": [ ["bf", ["Jupiter", "Neptune"], ("Saturn", "Uranus")], ["bf", ["Earth", "Mercury"], ("Venus",)], ["bf", ["Mercury", "Uranus"], ("Venus", "Earth", "Mars", "Jupiter", "Saturn")], ["bf", ["InvalidPlanet", "Neptune"], ()], ["bf", ["Jupiter", "InvalidPlanet"], ()], ["bf", ["Mercury", "Mercury"], ()] ],
-        "HumanEval/155": [ ["even_odd_count", [-12], (1, 1)], ["even_odd_count", [123], (1, 2)] ]
+        "HumanEval/32": "",
+        "HumanEval/38": "",
+        "HumanEval/50": "",
    }

    # 检查是否有硬编码的测试用例
@ -92,16 +87,8 @@ def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test.jso
    with open(file_path, 'r') as file:
        for line in file:
            data = json.loads(line)
-            if problem_id in data:
-                problem_data = data[problem_id]
-                # 处理测试用例
-                for i, test_case in enumerate(problem_data):
-                    # 函数名保持不变
-                    # 参数列表需要解析
-                    test_case[1] = [parse_python_literal(arg) for arg in test_case[1]]
-                    # 预期输出需要解析
-                    test_case[2] = parse_python_literal(test_case[2])
-                return problem_data
+            if data.get("id") == problem_id:
+                return data.get("test")

    return None  # 如果没有找到问题，返回 None

@ -158,45 +145,53 @@ async def llm_extract_test_case(id, problem_description: str, file_path:str="pub

 import json

-def test_cases_2_test_functions(solution: str, test_case: List):
-    print("here",solution)
-    function_name = test_case[0]
+# def test_cases_2_test_functions(solution: str, test_case: List):
+#     print("test_case", test_case)
+#     function_name = test_case[0]
    
-    def format_param(param):
-        if isinstance(param, str):
-            return repr(param)
-        elif isinstance(param, (int, float, bool)):
-            return str(param)
-        elif isinstance(param, list):
-            return '[' + ', '.join(format_param(item) for item in param) + ']'
-        elif isinstance(param, tuple):
-            return '(' + ', '.join(format_param(item) for item in param) + ')'
-        elif isinstance(param, dict):
-            return '{' + ', '.join(f'{format_param(k)}: {format_param(v)}' for k, v in param.items()) + '}'
-        elif isinstance(param, type(None)):
-            return 'None'
-        else:
-            raise ValueError(f"Unsupported parameter type: {type(param)}")
+#     def format_param(param):
+#         if isinstance(param, str):
+#             return repr(param)
+#         elif isinstance(param, (int, float, bool)):
+#             return str(param)
+#         elif isinstance(param, list):
+#             return '[' + ', '.join(format_param(item) for item in param) + ']'
+#         elif isinstance(param, tuple):
+#             return '(' + ', '.join(format_param(item) for item in param) + ')'
+#         elif isinstance(param, dict):
+#             return '{' + ', '.join(f'{format_param(k)}: {format_param(v)}' for k, v in param.items()) + '}'
+#         elif isinstance(param, type(None)):
+#             return 'None'
+#         else:
+#             raise ValueError(f"Unsupported parameter type: {type(param)}")

-    parameters = ', '.join(format_param(item) for item in test_case[1])
-    print(type(test_case[2]), test_case[2])
-    expected_output = format_param(test_case[2])
-    print(expected_output)
+#     parameters = ', '.join(format_param(item) for item in test_case[1])
+#     print(test_case[1], parameters)

+#     expected_output = format_param(test_case[2])
+#     print(type(test_case[2]), test_case[2], expected_output)
    
+#     tester_function = f"""
+# {solution}
+
+# def check(candidate):
+#     assert candidate({parameters}) == {expected_output}
+
+# check({function_name})
+#     """
+    
+#     print(f"""
+#     Generated test function:
+#     {tester_function}
+#     """)
+    
+#     return tester_function
+    
+
+def test_cases_2_test_functions(solution: str, test_cases: str):
    tester_function = f"""
 {solution}

-def check(candidate):
-    assert candidate({parameters}) == {expected_output}
-
-check({function_name})
-    """
-    
-    print(f"""
-    Generated test function:
-    {tester_function}
-    """)
-    
-    return tester_function
-    
+{test_cases}
+""" 
+    return tester_function