mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-03 04:42:38 +02:00
Update
This commit is contained in:
parent
eac4b6c3e6
commit
3fc3d217a8
7 changed files with 128 additions and 101 deletions
|
|
@ -3,6 +3,11 @@
|
|||
# @Author : didi
|
||||
# @Desc : test on human eval graph
|
||||
|
||||
# 1. 出效果
|
||||
# 2. 代码方面,格式问题,很多格式处理 ->增加效果
|
||||
# 3. GSM8k ->
|
||||
# 4. 我来写一个GSM8k最基础代码,GSM8k实验代码需要你来改写
|
||||
|
||||
import os
|
||||
import json
|
||||
import subprocess
|
||||
|
|
@ -17,7 +22,7 @@ from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock
|
|||
|
||||
generate_code = GenerateCode(llm=LLM())
|
||||
generate_code_block = GenerateCodeBlock(llm=LLM())
|
||||
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5)
|
||||
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1)
|
||||
|
||||
async def sample_generate(id, result_path:str="samples.jsonl",mode:str="ags"):
|
||||
case = get_human_eval_plus()[f"{id}"]
|
||||
|
|
@ -55,7 +60,7 @@ async def samples_generate(mode:str, result_path:str="samples.jsonl"):
|
|||
'solution': solution_result['final_solution']
|
||||
}
|
||||
elif mode == "alpha":
|
||||
solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=5)
|
||||
solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=1)
|
||||
sample_dict = {
|
||||
'task_id': case['task_id'],
|
||||
'solution': solution_result['final_solution']
|
||||
|
|
@ -164,6 +169,7 @@ def automatic_sanitize(result_path: str = "samples.jsonl"):
|
|||
sanitized_path = f"{base_name}-sanitized.jsonl"
|
||||
|
||||
return sanitized_path
|
||||
|
||||
def automatic_evalplus(result_path:str ="samples.jsonl"):
|
||||
"""
|
||||
在命令行中自动执行 evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ from metagpt.llm import LLM
|
|||
from typing import List
|
||||
from examples.ags.w_action_node.operator import Generate, GenerateCode, GenerateCodeBlock, Review, Revise, FuEnsemble, MdEnsemble, DbEnsemble, Rephrase, Test
|
||||
from examples.ags.w_action_node.utils import extract_test_cases_from_jsonl
|
||||
from evalplus.data import get_human_eval_plus
|
||||
class Graph:
|
||||
def __init__(self, name:str, llm:LLM) -> None:
|
||||
self.name = name
|
||||
|
|
@ -46,24 +47,23 @@ class HumanEvalGraph(Graph):
|
|||
solution = await self.mdensemble("code", solution_list, problem)
|
||||
return solution
|
||||
|
||||
|
||||
async def alpha_codium(self, problem_id:str, problem:str, ensemble_count:int = 3):
    """
    Run the rephrase -> generate -> ensemble -> test pipeline for one problem.

    Args:
        problem_id: Key used to look up the public test cases and the
            entry point of the problem (e.g. a HumanEval task id).
        problem: Original problem description.
        ensemble_count: Number of candidate solutions to sample before
            ensembling.

    Returns:
        Whatever ``self.tester`` returns for the ensembled solution.
    """
    test_cases = extract_test_cases_from_jsonl(problem_id)
    entry_point = get_human_eval_plus()[problem_id]['entry_point']
    # The original problem description is concatenated inside rephrase.
    rephrase_problem = await self.rephrase(problem)

    solution_list = []
    for _ in range(ensemble_count):
        # Each candidate gets up to 5 attempts; a candidate that never
        # succeeds is simply left out of the ensemble (best effort).
        for _ in range(5):
            try:
                solution = await self.generate_code_block.rephrase_generate(
                    problem, rephrase_problem, function_name=entry_point
                )
                solution_list.append(solution.get('code_solution'))
                break
            except Exception as e:
                print(e)

    solution = await self.mdensemble("code", solution_list, problem)
    print("here",solution)
    # The tester takes the problem id as its first argument.
    solution = await self.tester(problem_id, problem, rephrase_problem, solution, test_cases)
    return solution
|
||||
|
||||
async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2):
|
||||
|
|
@ -96,6 +96,18 @@ class HumanEvalGraph(Graph):
|
|||
solution = solution.get('revised_solution')
|
||||
return solution
|
||||
|
||||
|
||||
class Gsm8kGraph(Graph):
    """Minimal graph for GSM8K problems: a single Generate step per problem."""

    def __init__(self, name:str, llm: LLM) -> None:
        """
        Args:
            name: Graph name, forwarded to ``Graph.__init__``.
            llm: LLM client shared by the operators of this graph.
        """
        super().__init__(name, llm)
        self.generate = Generate(llm=llm)
        self.rephrase = Rephrase(llm=llm)

    async def __call__(self, problem:str):
        """Generate a solution for ``problem`` and return the operator's result."""
        # Operators are awaited everywhere else in this module; without the
        # await the caller would receive an un-run coroutine object.
        # NOTE(review): assumes Generate.__call__ is async like the other operators - confirm.
        solution = await self.generate(problem)
        return solution

    # Alternative flow, kept for reference (rephrase first, then generate).
    # NOTE: the prompt used here has not been adapted yet; it can be modelled
    # on the HumanEval one.
    # async def __call__(self, problem:str):
    #     problem = await self.rephrase(problem)
    #     solution = await self.generate(problem)
    #     return solution
|
||||
|
|
@ -3,6 +3,8 @@
|
|||
# @Author : didi
|
||||
# @Desc : operator demo of ags
|
||||
import ast
|
||||
import sys
|
||||
import traceback
|
||||
import random
|
||||
from typing import List, Tuple, Any, Dict
|
||||
from collections import Counter
|
||||
|
|
@ -115,6 +117,7 @@ class MdEnsemble(Operator):
|
|||
return shuffled_solutions, answer_mapping
|
||||
|
||||
async def __call__(self, solution_type:str ,solutions:List[str], problem_description:str):
|
||||
print(solutions)
|
||||
all_responses = []
|
||||
# 如果Solution方案是Code,我们利用AST去重
|
||||
if solution_type == "code":
|
||||
|
|
@ -132,6 +135,7 @@ class MdEnsemble(Operator):
|
|||
updated_solutions.append(solution)
|
||||
except SyntaxError:
|
||||
# If the solution has a syntax error, we'll skip it
|
||||
print("here",solution)
|
||||
continue
|
||||
solutions = updated_solutions
|
||||
updated_length = len(solutions)
|
||||
|
|
@ -316,44 +320,46 @@ class Rephrase(Operator):
|
|||
class Test(Operator):
|
||||
def __init__(self, name:str ="Tester", llm: LLM = None):
    """
    Args:
        name: Operator name, forwarded to the base class.
        llm: LLM client to use. When omitted, a new ``LLM()`` is created
            for this instance.
    """
    # A default of ``LLM()`` in the signature would be evaluated once, when
    # the class body is executed, building a client at import time and
    # sharing it between every instance; create it on demand instead.
    if llm is None:
        llm = LLM()
    super().__init__(name, llm)
|
||||
|
||||
def test_cases_2_assert(self, test_cases):
|
||||
return f"assert {test_cases[0]}({test_cases[1]}) == {test_cases[2]} \n"
|
||||
|
||||
def exec_code(self, solution, test_cases, problem_id):
    """
    Execute a solution together with its test code and report the outcome.

    Args:
        solution: Mapping holding the candidate source under "final_solution".
        test_cases: Test code passed to ``test_cases_2_test_functions``
            together with the solution.
        problem_id: Identifier appended to "tester.txt" when a run fails.

    Returns:
        ``[]`` when the combined script runs without raising,
        ``{"test_fail_case": {...}}`` when it raises ``AssertionError``, or
        ``{"exec_fail_case": <message>}`` for any other exception.
    """
    # TODO: ideally every test case should be executed on its own.
    solution = solution["final_solution"]
    test_code = test_cases_2_test_functions(solution, test_cases)
    print("test_code", test_code)
    try:
        # NOTE(review): this executes generated code inside this module's
        # global namespace with no sandboxing - only run trusted input, or
        # move execution to an isolated process.
        exec(test_code, globals())
    except AssertionError as e:
        tb_str = traceback.format_exception(type(e), e, e.__traceback__)
        with open("tester.txt", "a") as f:
            f.write("test_error" +problem_id + "\n")
        error_infomation = {"test_fail_case": {
            "error_type": "AssertionError",
            "error_message": str(e),
            "traceback": tb_str
        }}
        print("error here", error_infomation)
        return error_infomation
    except Exception as e:
        # Anything other than a failed assert means the script itself could
        # not run (syntax error, missing name, runtime error, ...).
        with open("tester.txt", "a") as f:
            f.write(problem_id + "\n")
        return {"exec_fail_case":str(e)}
    return []
|
||||
|
||||
async def __call__(self, problem, rephrase_problem, solution, test_cases):
|
||||
result = self.exec_code(solution, test_cases)
|
||||
# 处理通过Public Tests的代码
|
||||
# TODO 这里的问题是,如果Test直接通过了就没有办法Check Multi Tests了
|
||||
async def __call__(self, problem_id, problem, rephrase_problem, solution, test_cases):
|
||||
result = self.exec_code(solution, test_cases, problem_id)
|
||||
print("result here", result)
|
||||
if result == []:
|
||||
return solution
|
||||
# 处理代码执行失败的代码
|
||||
elif type(result) == dict:
|
||||
result = result["error"]
|
||||
elif "exec_fail_case" in result:
|
||||
result = result["exec_fail_case"]
|
||||
prompt = REFLECTION_ON_PUBILIC_TEST_PROMPT.format(problem_description=problem, rephrase_problem=rephrase_problem, code_solution=solution, exec_pass=f"executed unsuccessfully, error: \n {result}", test_fail="executed unsucessfully")
|
||||
node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm)
|
||||
response = node.instruct_content.model_dump()
|
||||
return {"final_solution":response["refined_solution"]}
|
||||
else:
|
||||
result = result["test_fail_case"]
|
||||
prompt = REFLECTION_ON_PUBILIC_TEST_PROMPT.format(problem_description=problem, rephrase_problem=rephrase_problem, code_solution=solution, exec_pass="executed successfully", test_fail=result)
|
||||
node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm)
|
||||
response = node.instruct_content.model_dump()
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ Please maintain the JSON format in your response.
|
|||
# """
|
||||
|
||||
GENERATE_CODEBLOCK_REPHRASE_PROMPT = """
|
||||
You are given a code contest problem, and a self-reflection on the problem:
|
||||
Please provide a self-contained Python script that solves the following problem in a markdown code block:
|
||||
|
||||
### Problem Description:
|
||||
{problem_description}
|
||||
|
|
@ -51,8 +51,11 @@ You are given a code contest problem, and a self-reflection on the problem:
|
|||
### self reflection on the problem
|
||||
{rephrase_problem}
|
||||
|
||||
=======================
|
||||
The above is an incomplete Python code fragment and reflection on it. Return the complete and correct code with no additional text.
|
||||
When creating your solution:
|
||||
1. Consider all edge cases and boundary conditions.
|
||||
2. Avoid oversimplification - address all aspects of the problem.
|
||||
3. Ensure your logic covers all stated requirements.
|
||||
4. Avoid adding additional test cases beyond those provided in the problem description.
|
||||
"""
|
||||
|
||||
# GENERATE_CODEBLOCK_PROMPT = """
|
||||
|
|
@ -67,10 +70,9 @@ Please provide a self-contained Python script that solves the following problem
|
|||
|
||||
When creating your solution:
|
||||
1. Consider all edge cases and boundary conditions.
|
||||
2. Consider the order of operations in your solution and how each step affects subsequent steps.
|
||||
3. Avoid oversimplification - address all aspects of the problem.
|
||||
4. Ensure your logic covers all stated requirements.
|
||||
5. Avoid adding additional test cases beyond those provided in the problem description.
|
||||
2. Avoid oversimplification - address all aspects of the problem.
|
||||
3. Ensure your logic covers all stated requirements.
|
||||
4. Avoid adding additional test cases beyond those provided in the problem description.
|
||||
"""
|
||||
|
||||
REVIEW_PROMPT = """
|
||||
|
|
|
|||
|
|
@ -71,17 +71,12 @@ def parse_python_literal(s):
|
|||
except (ValueError, SyntaxError):
|
||||
return s
|
||||
|
||||
def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test.jsonl"):
|
||||
def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test_reflexion.jsonl"):
|
||||
# 保留原有的硬编码测试用例
|
||||
hardcoded_cases = {
|
||||
"HumanEval/87": [ ["get_row", [[[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 1, 6], [1, 2, 3, 4, 5, 1]], 1], [(0, 0), (1, 4), (1, 0), (2, 5), (2, 0)]], ["get_row", [[], 1], []], ["get_row", [[[], [1], [1, 2, 3]], 3], [(2, 2)]] ],
|
||||
"HumanEval/95": [ ["check_dict_case", [{"a": "apple", "b": "banana"}], True], ["check_dict_case", [{"a": "apple", "A": "banana", "B": "banana"}], False], ["check_dict_case", [{"a": "apple", "8": "banana", "a": "apple"}], False], ["check_dict_case", [{"Name": "John", "Age": "36", "City": "Houston"}], False], ["check_dict_case", [{"STATE": "NC", "ZIP": "12345"}], True] ],
|
||||
"HumanEval/107": [ ["even_odd_palindrome", [3], (1, 2)], ["even_odd_palindrome", [12], (4, 6)] ],
|
||||
"HumanEval/112": [ ["reverse_delete", ["abcde", "ae"], ("bcd", False)], ["reverse_delete", ["abcdef", "b"], ("acdef", False)], ["reverse_delete", ["abcdedcba", "ab"], ("cdedc", True)] ],
|
||||
"HumanEval/127": [ ["intersection", [(1, 2), (2, 3)], "NO"], ["intersection", [(-1, 1), (0, 4)], "NO"], ["intersection", [(-3, -1), (-5, 5)], "YES"] ],
|
||||
"HumanEval/136": [ ["largest_smallest_integers", [2, 4, 1, 3, 5, 7], (None, 1)], ["largest_smallest_integers", [], (None, None)], ["largest_smallest_integers", [0], (None, None)] ],
|
||||
"HumanEval/148": [ ["bf", ["Jupiter", "Neptune"], ("Saturn", "Uranus")], ["bf", ["Earth", "Mercury"], ("Venus",)], ["bf", ["Mercury", "Uranus"], ("Venus", "Earth", "Mars", "Jupiter", "Saturn")], ["bf", ["InvalidPlanet", "Neptune"], ()], ["bf", ["Jupiter", "InvalidPlanet"], ()], ["bf", ["Mercury", "Mercury"], ()] ],
|
||||
"HumanEval/155": [ ["even_odd_count", [-12], (1, 1)], ["even_odd_count", [123], (1, 2)] ]
|
||||
"HumanEval/32": "",
|
||||
"HumanEval/38": "",
|
||||
"HumanEval/50": "",
|
||||
}
|
||||
|
||||
# 检查是否有硬编码的测试用例
|
||||
|
|
@ -92,16 +87,8 @@ def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test.jso
|
|||
with open(file_path, 'r') as file:
|
||||
for line in file:
|
||||
data = json.loads(line)
|
||||
if problem_id in data:
|
||||
problem_data = data[problem_id]
|
||||
# 处理测试用例
|
||||
for i, test_case in enumerate(problem_data):
|
||||
# 函数名保持不变
|
||||
# 参数列表需要解析
|
||||
test_case[1] = [parse_python_literal(arg) for arg in test_case[1]]
|
||||
# 预期输出需要解析
|
||||
test_case[2] = parse_python_literal(test_case[2])
|
||||
return problem_data
|
||||
if data.get("id") == problem_id:
|
||||
return data.get("test")
|
||||
|
||||
return None # 如果没有找到问题,返回 None
|
||||
|
||||
|
|
@ -158,45 +145,53 @@ async def llm_extract_test_case(id, problem_description: str, file_path:str="pub
|
|||
|
||||
import json
|
||||
|
||||
def test_cases_2_test_functions(solution: str, test_case: List):
|
||||
print("here",solution)
|
||||
function_name = test_case[0]
|
||||
# def test_cases_2_test_functions(solution: str, test_case: List):
|
||||
# print("test_case", test_case)
|
||||
# function_name = test_case[0]
|
||||
|
||||
def format_param(param):
|
||||
if isinstance(param, str):
|
||||
return repr(param)
|
||||
elif isinstance(param, (int, float, bool)):
|
||||
return str(param)
|
||||
elif isinstance(param, list):
|
||||
return '[' + ', '.join(format_param(item) for item in param) + ']'
|
||||
elif isinstance(param, tuple):
|
||||
return '(' + ', '.join(format_param(item) for item in param) + ')'
|
||||
elif isinstance(param, dict):
|
||||
return '{' + ', '.join(f'{format_param(k)}: {format_param(v)}' for k, v in param.items()) + '}'
|
||||
elif isinstance(param, type(None)):
|
||||
return 'None'
|
||||
else:
|
||||
raise ValueError(f"Unsupported parameter type: {type(param)}")
|
||||
# def format_param(param):
|
||||
# if isinstance(param, str):
|
||||
# return repr(param)
|
||||
# elif isinstance(param, (int, float, bool)):
|
||||
# return str(param)
|
||||
# elif isinstance(param, list):
|
||||
# return '[' + ', '.join(format_param(item) for item in param) + ']'
|
||||
# elif isinstance(param, tuple):
|
||||
# return '(' + ', '.join(format_param(item) for item in param) + ')'
|
||||
# elif isinstance(param, dict):
|
||||
# return '{' + ', '.join(f'{format_param(k)}: {format_param(v)}' for k, v in param.items()) + '}'
|
||||
# elif isinstance(param, type(None)):
|
||||
# return 'None'
|
||||
# else:
|
||||
# raise ValueError(f"Unsupported parameter type: {type(param)}")
|
||||
|
||||
parameters = ', '.join(format_param(item) for item in test_case[1])
|
||||
print(type(test_case[2]), test_case[2])
|
||||
expected_output = format_param(test_case[2])
|
||||
print(expected_output)
|
||||
# parameters = ', '.join(format_param(item) for item in test_case[1])
|
||||
# print(test_case[1], parameters)
|
||||
|
||||
# expected_output = format_param(test_case[2])
|
||||
# print(type(test_case[2]), test_case[2], expected_output)
|
||||
|
||||
# tester_function = f"""
|
||||
# {solution}
|
||||
|
||||
# def check(candidate):
|
||||
# assert candidate({parameters}) == {expected_output}
|
||||
|
||||
# check({function_name})
|
||||
# """
|
||||
|
||||
# print(f"""
|
||||
# Generated test function:
|
||||
# {tester_function}
|
||||
# """)
|
||||
|
||||
# return tester_function
|
||||
|
||||
|
||||
def test_cases_2_test_functions(solution: str, test_cases: str):
|
||||
tester_function = f"""
|
||||
{solution}
|
||||
|
||||
def check(candidate):
|
||||
assert candidate({parameters}) == {expected_output}
|
||||
|
||||
check({function_name})
|
||||
"""
|
||||
|
||||
print(f"""
|
||||
Generated test function:
|
||||
{tester_function}
|
||||
""")
|
||||
|
||||
return tester_function
|
||||
|
||||
{test_cases}
|
||||
"""
|
||||
return tester_function
|
||||
Loading…
Add table
Add a link
Reference in a new issue