diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py index 638a64021..7e639b1ab 100644 --- a/examples/ags/benchmark/humaneval.py +++ b/examples/ags/benchmark/humaneval.py @@ -3,6 +3,11 @@ # @Author : didi # @Desc : test on human eval graph +# 1. 出效果 +# 2. 代码方面,格式问题,很多格式处理 ->增加效果 +# 3. GSM8k -> +# 4. 我来写一个GSM8k最基础代码,GSM8k实验代码需要你来改写 + import os import json import subprocess @@ -17,7 +22,7 @@ from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock generate_code = GenerateCode(llm=LLM()) generate_code_block = GenerateCodeBlock(llm=LLM()) -solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5) +solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1) async def sample_generate(id, result_path:str="samples.jsonl",mode:str="ags"): case = get_human_eval_plus()[f"{id}"] @@ -55,7 +60,7 @@ async def samples_generate(mode:str, result_path:str="samples.jsonl"): 'solution': solution_result['final_solution'] } elif mode == "alpha": - solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=5) + solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=1) sample_dict = { 'task_id': case['task_id'], 'solution': solution_result['final_solution'] @@ -164,6 +169,7 @@ def automatic_sanitize(result_path: str = "samples.jsonl"): sanitized_path = f"{base_name}-sanitized.jsonl" return sanitized_path + def automatic_evalplus(result_path:str ="samples.jsonl"): """ 在命令行中自动执行 evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only diff --git a/examples/ags/w_action_node/graph.py b/examples/ags/w_action_node/graph.py index 7bd32bdc9..217371cb9 100644 --- a/examples/ags/w_action_node/graph.py +++ b/examples/ags/w_action_node/graph.py @@ -7,6 +7,7 @@ from metagpt.llm import LLM from typing import List from examples.ags.w_action_node.operator import Generate, GenerateCode, GenerateCodeBlock, Review, Revise, FuEnsemble, MdEnsemble, DbEnsemble, Rephrase, Test from examples.ags.w_action_node.utils import extract_test_cases_from_jsonl +from evalplus.data import get_human_eval_plus class Graph: def __init__(self, name:str, llm:LLM) -> None: self.name = name @@ -46,24 +47,23 @@ class HumanEvalGraph(Graph): solution = await self.mdensemble("code", solution_list, problem) return solution - async def alpha_codium(self, problem_id:str, problem:str, ensemble_count:int = 3): # async def __call__(self,problem_id, problem:str, ensemble_count:int = 3): test_cases = extract_test_cases_from_jsonl(problem_id) + entry_point = get_human_eval_plus()[problem_id]['entry_point'] rephrase_problem = await self.rephrase(problem) # 在rephrase 中拼接原始的问题描述 solution_list = [] for _ in range(ensemble_count): for retry_count in range(5): try: - solution = await self.generate_code_block(problem, rephrase_problem) + solution = await self.generate_code_block.rephrase_generate(problem, rephrase_problem, function_name=entry_point) solution = solution.get('code_solution') solution_list.append(solution) break except Exception as e: print(e) solution = await self.mdensemble("code", solution_list, problem) - print("here",solution) - solution = await self.tester(problem, rephrase_problem, solution, test_cases) + solution = await self.tester(problem_id, problem, rephrase_problem, solution, test_cases) return solution async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2): @@ -96,6 +96,18 @@ class HumanEvalGraph(Graph): solution = solution.get('revised_solution') return solution - class Gsm8kGraph(Graph): - pass \ No newline at end of file + def __init__(self, name:str, llm: LLM) -> None: + super().__init__(name, llm) + self.generate = Generate(llm=llm) + self.rephrase = Rephrase(llm=llm) + + async def __call__(self, problem:str): + solution = self.generate(problem) + return solution + + # async def __call__(self, problem:str): + # 这个地方没有修改对应的prompt,可以对应着humaneval改一下 + # problem = await self.rephrase(problem) + # solution = self.generate(problem) + # return solution \ No newline at end of file diff --git a/examples/ags/w_action_node/operator.py b/examples/ags/w_action_node/operator.py index f7cf9b4b5..1069b73e5 100644 --- a/examples/ags/w_action_node/operator.py +++ b/examples/ags/w_action_node/operator.py @@ -3,6 +3,8 @@ # @Author : didi # @Desc : operator demo of ags import ast +import sys +import traceback import random from typing import List, Tuple, Any, Dict from collections import Counter @@ -115,6 +117,7 @@ class MdEnsemble(Operator): return shuffled_solutions, answer_mapping async def __call__(self, solution_type:str ,solutions:List[str], problem_description:str): + print(solutions) all_responses = [] # 如果Solution方案是Code,我们利用AST去重 if solution_type == "code": @@ -132,6 +135,7 @@ class MdEnsemble(Operator): updated_solutions.append(solution) except SyntaxError: # If the solution has a syntax error, we'll skip it + print("here",solution) continue solutions = updated_solutions updated_length = len(solutions) @@ -316,44 +320,46 @@ class Rephrase(Operator): class Test(Operator): def __init__(self, name:str ="Tester", llm: LLM = LLM()): super().__init__(name, llm) - - def test_cases_2_assert(self, test_cases): - return f"assert {test_cases[0]}({test_cases[1]}) == {test_cases[2]} \n" - def exec_code(self, solution, test_cases): + def exec_code(self, solution, test_cases, problem_id): + # TODO 未来还要做修改,最好能做到一个样例一测 solution = solution["final_solution"] - pass_case = [] - fail_case = [] - for test_case in test_cases: - test_code = test_cases_2_test_functions(solution,test_case) - try: - exec(test_code) - pass_case.append(self.test_cases_2_assert(test_case)) - except AssertionError as e: - fail_case.append(self.test_cases_2_assert(test_case)) - except Exception as e: - with open("tester.txt", "a") as f: - f.write(test_case[0] + "\n") - print(e) - return {"error":e} - if fail_case != []: - return fail_case + test_code = test_cases_2_test_functions(solution, test_cases) + print("test_code", test_code) + try: + exec(test_code, globals()) + except AssertionError as e: + exc_type, exc_value, exc_traceback = sys.exc_info() + tb_str = traceback.format_exception(exc_type, exc_value, exc_traceback) + with open("tester.txt", "a") as f: + f.write("test_error" +problem_id + "\n") + error_infomation = {"test_fail_case": { + "error_type": "AssertionError", + "error_message": str(e), + "traceback": tb_str + }} + print("error here", error_infomation) + return error_infomation + except Exception as e: + with open("tester.txt", "a") as f: + f.write(problem_id + "\n") + return {"exec_fail_case":str(e)} return [] - async def __call__(self, problem, rephrase_problem, solution, test_cases): - result = self.exec_code(solution, test_cases) - # 处理通过Public Tests的代码 - # TODO 这里的问题是,如果Test直接通过了就没有办法Check Multi Tests了 + async def __call__(self, problem_id, problem, rephrase_problem, solution, test_cases): + result = self.exec_code(solution, test_cases, problem_id) + print("result here", result) if result == []: return solution # 处理代码执行失败的代码 - elif type(result) == dict: - result = result["error"] + elif "exec_fail_case" in result: + result = result["exec_fail_case"] prompt = REFLECTION_ON_PUBILIC_TEST_PROMPT.format(problem_description=problem, rephrase_problem=rephrase_problem, code_solution=solution, exec_pass=f"executed unsuccessfully, error: \n {result}", test_fail="executed unsucessfully") node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm) response = node.instruct_content.model_dump() return {"final_solution":response["refined_solution"]} else: + result = result["test_fail_case"] prompt = REFLECTION_ON_PUBILIC_TEST_PROMPT.format(problem_description=problem, rephrase_problem=rephrase_problem, code_solution=solution, exec_pass="executed successfully", test_fail=result) node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm) response = node.instruct_content.model_dump() diff --git a/examples/ags/w_action_node/prompt.py b/examples/ags/w_action_node/prompt.py index 1ed40bfbb..5d82d1a3c 100644 --- a/examples/ags/w_action_node/prompt.py +++ b/examples/ags/w_action_node/prompt.py @@ -43,7 +43,7 @@ Please maintain the JSON format in your response. # """ GENERATE_CODEBLOCK_REPHRASE_PROMPT = """ -You are given a code contest problem, and a self-reflection on the problem: +Please provide a self-contained Python script that solves the following problem in a markdown code block: ### Problem Description: {problem_description} @@ -51,8 +51,11 @@ You are given a code contest problem, and a self-reflection on the problem: ### self reflection on the problem {rephrase_problem} -======================= -The above is an incomplete Python code fragment and reflection on it. Return the complete and correct code with no additional text. +When creating your solution: +1. Consider all edge cases and boundary conditions. +2. Avoid oversimplification - address all aspects of the problem. +3. Ensure your logic covers all stated requirements. +4. Avoid adding additional test cases beyond those provided in the problem description. """ # GENERATE_CODEBLOCK_PROMPT = """ @@ -67,10 +70,9 @@ Please provide a self-contained Python script that solves the following problem When creating your solution: 1. Consider all edge cases and boundary conditions. -2. Consider the order of operations in your solution and how each step affects subsequent steps. -3. Avoid oversimplification - address all aspects of the problem. -4. Ensure your logic covers all stated requirements. -5. Avoid adding additional test cases beyond those provided in the problem description. +2. Avoid oversimplification - address all aspects of the problem. +3. Ensure your logic covers all stated requirements. +4. Avoid adding additional test cases beyond those provided in the problem description. """ REVIEW_PROMPT = """ diff --git a/examples/ags/w_action_node/utils.py b/examples/ags/w_action_node/utils.py index df757ba73..366cbb13e 100644 --- a/examples/ags/w_action_node/utils.py +++ b/examples/ags/w_action_node/utils.py @@ -71,17 +71,12 @@ def parse_python_literal(s): except (ValueError, SyntaxError): return s -def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test.jsonl"): +def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test_reflexion.jsonl"): # 保留原有的硬编码测试用例 hardcoded_cases = { - "HumanEval/87": [ ["get_row", [[[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 1, 6], [1, 2, 3, 4, 5, 1]], 1], [(0, 0), (1, 4), (1, 0), (2, 5), (2, 0)]], ["get_row", [[], 1], []], ["get_row", [[[], [1], [1, 2, 3]], 3], [(2, 2)]] ], - "HumanEval/95": [ ["check_dict_case", [{"a": "apple", "b": "banana"}], True], ["check_dict_case", [{"a": "apple", "A": "banana", "B": "banana"}], False], ["check_dict_case", [{"a": "apple", "8": "banana", "a": "apple"}], False], ["check_dict_case", [{"Name": "John", "Age": "36", "City": "Houston"}], False], ["check_dict_case", [{"STATE": "NC", "ZIP": "12345"}], True] ], - "HumanEval/107": [ ["even_odd_palindrome", [3], (1, 2)], ["even_odd_palindrome", [12], (4, 6)] ], - "HumanEval/112": [ ["reverse_delete", ["abcde", "ae"], ("bcd", False)], ["reverse_delete", ["abcdef", "b"], ("acdef", False)], ["reverse_delete", ["abcdedcba", "ab"], ("cdedc", True)] ], - "HumanEval/127": [ ["intersection", [(1, 2), (2, 3)], "NO"], ["intersection", [(-1, 1), (0, 4)], "NO"], ["intersection", [(-3, -1), (-5, 5)], "YES"] ], - "HumanEval/136": [ ["largest_smallest_integers", [2, 4, 1, 3, 5, 7], (None, 1)], ["largest_smallest_integers", [], (None, None)], ["largest_smallest_integers", [0], (None, None)] ], - "HumanEval/148": [ ["bf", ["Jupiter", "Neptune"], ("Saturn", "Uranus")], ["bf", ["Earth", "Mercury"], ("Venus",)], ["bf", ["Mercury", "Uranus"], ("Venus", "Earth", "Mars", "Jupiter", "Saturn")], ["bf", ["InvalidPlanet", "Neptune"], ()], ["bf", ["Jupiter", "InvalidPlanet"], ()], ["bf", ["Mercury", "Mercury"], ()] ], - "HumanEval/155": [ ["even_odd_count", [-12], (1, 1)], ["even_odd_count", [123], (1, 2)] ] + "HumanEval/32": "", + "HumanEval/38": "", + "HumanEval/50": "", } # 检查是否有硬编码的测试用例 @@ -92,16 +87,8 @@ def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test.jso with open(file_path, 'r') as file: for line in file: data = json.loads(line) - if problem_id in data: - problem_data = data[problem_id] - # 处理测试用例 - for i, test_case in enumerate(problem_data): - # 函数名保持不变 - # 参数列表需要解析 - test_case[1] = [parse_python_literal(arg) for arg in test_case[1]] - # 预期输出需要解析 - test_case[2] = parse_python_literal(test_case[2]) - return problem_data + if data.get("id") == problem_id: + return data.get("test") return None # 如果没有找到问题,返回 None @@ -158,45 +145,53 @@ async def llm_extract_test_case(id, problem_description: str, file_path:str="pub import json -def test_cases_2_test_functions(solution: str, test_case: List): - print("here",solution) - function_name = test_case[0] +# def test_cases_2_test_functions(solution: str, test_case: List): +# print("test_case", test_case) +# function_name = test_case[0] - def format_param(param): - if isinstance(param, str): - return repr(param) - elif isinstance(param, (int, float, bool)): - return str(param) - elif isinstance(param, list): - return '[' + ', '.join(format_param(item) for item in param) + ']' - elif isinstance(param, tuple): - return '(' + ', '.join(format_param(item) for item in param) + ')' - elif isinstance(param, dict): - return '{' + ', '.join(f'{format_param(k)}: {format_param(v)}' for k, v in param.items()) + '}' - elif isinstance(param, type(None)): - return 'None' - else: - raise ValueError(f"Unsupported parameter type: {type(param)}") +# def format_param(param): +# if isinstance(param, str): +# return repr(param) +# elif isinstance(param, (int, float, bool)): +# return str(param) +# elif isinstance(param, list): +# return '[' + ', '.join(format_param(item) for item in param) + ']' +# elif isinstance(param, tuple): +# return '(' + ', '.join(format_param(item) for item in param) + ')' +# elif isinstance(param, dict): +# return '{' + ', '.join(f'{format_param(k)}: {format_param(v)}' for k, v in param.items()) + '}' +# elif isinstance(param, type(None)): +# return 'None' +# else: +# raise ValueError(f"Unsupported parameter type: {type(param)}") - parameters = ', '.join(format_param(item) for item in test_case[1]) - print(type(test_case[2]), test_case[2]) - expected_output = format_param(test_case[2]) - print(expected_output) +# parameters = ', '.join(format_param(item) for item in test_case[1]) +# print(test_case[1], parameters) +# expected_output = format_param(test_case[2]) +# print(type(test_case[2]), test_case[2], expected_output) +# tester_function = f""" +# {solution} + +# def check(candidate): +# assert candidate({parameters}) == {expected_output} + +# check({function_name}) +# """ + +# print(f""" +# Generated test function: +# {tester_function} +# """) + +# return tester_function + + +def test_cases_2_test_functions(solution: str, test_cases: str): tester_function = f""" {solution} -def check(candidate): - assert candidate({parameters}) == {expected_output} - -check({function_name}) - """ - - print(f""" - Generated test function: - {tester_function} - """) - - return tester_function - \ No newline at end of file +{test_cases} +""" + return tester_function \ No newline at end of file diff --git a/he_test.py b/he_test.py index 567e592e3..a8d750d68 100644 --- a/he_test.py +++ b/he_test.py @@ -5,6 +5,7 @@ from evalplus.data import get_human_eval_plus, write_jsonl from examples.ags.benchmark.humaneval import sample_generate, samples_generate, extract_failure_tests, automatic_evalplus from examples.ags.w_action_node.utils import jsonl_ranker, llm_extract_test_case from examples.ags.w_action_node.graph import HumanEvalGraph +from examples.ags.w_action_node.utils import extract_test_cases_from_jsonl # 132 141 136 80 73 # asyncio.run(sample_generate('HumanEval/140',result_path="llm_based_1000.jsonl",mode="llm")) # asyncio.run(sample_generate('HumanEval/140',result_path="llm_based_1000.jsonl",mode="llm")) @@ -12,7 +13,7 @@ from examples.ags.w_action_node.graph import HumanEvalGraph # asyncio.run(sample_generate('HumanEval/67',result_path="llm_based_1000.jsonl",mode="llm")) # asyncio.run(sample_generate('HumanEval/108',result_path="llm_based_1000.jsonl",mode="llm")) # asyncio.run(sample_generate('HumanEval/110',result_path="llm_based_1000.jsonl",mode="llm")) -# asyncio.run(samples_generate(mode='alpha',result_path="alpha_based_100.jsonl")) +asyncio.run(samples_generate(mode='alpha',result_path="alpha_based_104.jsonl")) # jsonl_ranker("llm_based_137.jsonl", "llm_based_137.jsonl") # result_path = "ags_based_6.jsonl" @@ -47,6 +48,10 @@ from examples.ags.w_action_node.graph import HumanEvalGraph # [72, 80, 82, 87, 90, 95, 107, 109, 112, 124, 126, 127, 128, 132, 134, 136, 137, 138, 148, 154, 155] # TODO 代码问题,改动了一个地方导致Solution 没有了 -case_prompt= get_human_eval_plus()["HumanEval/76"]['prompt'] -solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1) -result = asyncio.run(solver.alpha_codium(problem_id="HumanEval/136", problem=case_prompt, ensemble_count=1)) \ No newline at end of file +# case_prompt= get_human_eval_plus()["HumanEval/140"]['prompt'] +# solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1) +# result = asyncio.run(solver.alpha_codium(problem_id="HumanEval/140", problem=case_prompt, ensemble_count=1)) + +# 1. Public Test 数据集不对 +# 2. 修改两个Prompt的具体内容 +# 3. 尝试增加Test错误之后的修改能力 \ No newline at end of file diff --git a/metagpt/actions/action_node.py b/metagpt/actions/action_node.py index 904ada70f..20a73a433 100644 --- a/metagpt/actions/action_node.py +++ b/metagpt/actions/action_node.py @@ -512,12 +512,13 @@ class ActionNode: import re field_name = self.get_field_name() prompt = context - print(f"prompt: \n{prompt}") + # print("generate prompt", "\n", prompt) content = await self.llm.aask(prompt, timeout=timeout) - # TODO 在前置逻辑中完成entrypoint的提取就可以 + # print("generate content", "\n", content) extracted_code = sanitize(code=content, entrypoint=function_name) # extracted_code = extract_code_from_response(content) result = {field_name: extracted_code} + # print("final_result", "\n", result) return result async def messages_fill(