This commit is contained in:
didi 2024-07-29 22:00:07 +08:00
parent eac4b6c3e6
commit 3fc3d217a8
7 changed files with 128 additions and 101 deletions

View file

@ -3,6 +3,11 @@
# @Author : didi
# @Desc : test on human eval graph
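# 1. Get results working.
# 2. Code side: formatting issues; add more output-format handling -> better results.
# 3. GSM8k ->
# 4. I will write the most basic GSM8k code; the GSM8k experiment code needs to be rewritten by you.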
import os
import json
import subprocess
@ -17,7 +22,7 @@ from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock
generate_code = GenerateCode(llm=LLM())
generate_code_block = GenerateCodeBlock(llm=LLM())
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5)
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1)
async def sample_generate(id, result_path:str="samples.jsonl",mode:str="ags"):
case = get_human_eval_plus()[f"{id}"]
@ -55,7 +60,7 @@ async def samples_generate(mode:str, result_path:str="samples.jsonl"):
'solution': solution_result['final_solution']
}
elif mode == "alpha":
solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=5)
solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=1)
sample_dict = {
'task_id': case['task_id'],
'solution': solution_result['final_solution']
@ -164,6 +169,7 @@ def automatic_sanitize(result_path: str = "samples.jsonl"):
sanitized_path = f"{base_name}-sanitized.jsonl"
return sanitized_path
def automatic_evalplus(result_path:str ="samples.jsonl"):
"""
在命令行中自动执行 evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only

View file

@ -7,6 +7,7 @@ from metagpt.llm import LLM
from typing import List
from examples.ags.w_action_node.operator import Generate, GenerateCode, GenerateCodeBlock, Review, Revise, FuEnsemble, MdEnsemble, DbEnsemble, Rephrase, Test
from examples.ags.w_action_node.utils import extract_test_cases_from_jsonl
from evalplus.data import get_human_eval_plus
class Graph:
def __init__(self, name:str, llm:LLM) -> None:
self.name = name
@ -46,24 +47,23 @@ class HumanEvalGraph(Graph):
solution = await self.mdensemble("code", solution_list, problem)
return solution
async def alpha_codium(self, problem_id:str, problem:str, ensemble_count:int = 3):
    """
    Rephrase the problem, sample candidate solutions, ensemble them, then run the public tests.

    Args:
        problem_id: Task id; used to look up the public test cases and the entry point name.
        problem: Original problem description (the prompt).
        ensemble_count: Number of candidate solutions to sample before ensembling.

    Returns:
        Whatever ``self.tester`` returns for the ensembled solution.
    """
    test_cases = extract_test_cases_from_jsonl(problem_id)
    entry_point = get_human_eval_plus()[problem_id]['entry_point']
    # The original problem description is concatenated inside rephrase.
    rephrase_problem = await self.rephrase(problem)
    solution_list = []
    for _ in range(ensemble_count):
        # Up to 5 attempts per sample: a failed attempt is printed and retried,
        # a successful one is recorded and ends the retry loop.
        for _attempt in range(5):
            try:
                solution = await self.generate_code_block.rephrase_generate(problem, rephrase_problem, function_name=entry_point)
                solution = solution.get('code_solution')
                solution_list.append(solution)
                break
            except Exception as e:
                print(e)
    solution = await self.mdensemble("code", solution_list, problem)
    # The tester takes the problem id as its first argument.
    solution = await self.tester(problem_id, problem, rephrase_problem, solution, test_cases)
    return solution
async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2):
@ -96,6 +96,18 @@ class HumanEvalGraph(Graph):
solution = solution.get('revised_solution')
return solution
class Gsm8kGraph(Graph):
    """Graph for GSM8k problems: a single Generate call, with a Rephrase operator available."""

    def __init__(self, name:str, llm: LLM) -> None:
        super().__init__(name, llm)
        self.generate = Generate(llm=llm)
        self.rephrase = Rephrase(llm=llm)

    async def __call__(self, problem:str):
        """Generate a solution for ``problem`` and return the operator's result."""
        # Without the await the caller would receive an un-run coroutine object
        # instead of a solution.
        # NOTE(review): assumes Generate.__call__ is async like the other
        # operators used by the graphs - confirm against operator.py.
        solution = await self.generate(problem)
        return solution

    # Rephrase-first variant, currently disabled: the rephrase prompt has not
    # been adapted for GSM8k yet (it can be modelled on the HumanEval one).
    # async def __call__(self, problem:str):
    #     problem = await self.rephrase(problem)
    #     solution = await self.generate(problem)
    #     return solution

View file

@ -3,6 +3,8 @@
# @Author : didi
# @Desc : operator demo of ags
import ast
import sys
import traceback
import random
from typing import List, Tuple, Any, Dict
from collections import Counter
@ -115,6 +117,7 @@ class MdEnsemble(Operator):
return shuffled_solutions, answer_mapping
async def __call__(self, solution_type:str ,solutions:List[str], problem_description:str):
print(solutions)
all_responses = []
# 如果Solution方案是Code我们利用AST去重
if solution_type == "code":
@ -132,6 +135,7 @@ class MdEnsemble(Operator):
updated_solutions.append(solution)
except SyntaxError:
# If the solution has a syntax error, we'll skip it
print("here",solution)
continue
solutions = updated_solutions
updated_length = len(solutions)
@ -316,44 +320,46 @@ class Rephrase(Operator):
class Test(Operator):
def __init__(self, name:str ="Tester", llm: LLM = LLM()):
super().__init__(name, llm)
def test_cases_2_assert(self, test_cases):
return f"assert {test_cases[0]}({test_cases[1]}) == {test_cases[2]} \n"
def exec_code(self, solution, test_cases, problem_id):
    """
    Run the public tests against a candidate solution.

    Args:
        solution: Dict holding the candidate code under "final_solution".
        test_cases: Test input handed to test_cases_2_test_functions together
            with the solution (presumably test source code - verify against caller).
        problem_id: Task id, appended to tester.txt when the run fails.

    Returns:
        [] when the combined code ran without raising;
        {"test_fail_case": {...}} when an AssertionError was raised;
        {"exec_fail_case": "<message>"} when any other exception was raised.
    """
    # TODO: this should eventually be reworked to run one test case at a time.
    solution = solution["final_solution"]
    test_code = test_cases_2_test_functions(solution, test_cases)
    print("test_code", test_code)
    try:
        # NOTE(review): this executes generated code inside this module's
        # globals, so names defined by one run stay visible to later runs and
        # the code is not sandboxed. Consider an isolated namespace/process.
        exec(test_code, globals())
    except AssertionError as e:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        tb_str = traceback.format_exception(exc_type, exc_value, exc_traceback)
        with open("tester.txt", "a") as f:
            f.write("test_error" +problem_id + "\n")
        error_infomation = {"test_fail_case": {
            "error_type": "AssertionError",
            "error_message": str(e),
            "traceback": tb_str
        }}
        print("error here", error_infomation)
        return error_infomation
    except Exception as e:
        with open("tester.txt", "a") as f:
            f.write(problem_id + "\n")
        return {"exec_fail_case":str(e)}
    return []
async def __call__(self, problem, rephrase_problem, solution, test_cases):
result = self.exec_code(solution, test_cases)
# 处理通过Public Tests的代码
# TODO 这里的问题是如果Test直接通过了就没有办法Check Multi Tests了
async def __call__(self, problem_id, problem, rephrase_problem, solution, test_cases):
result = self.exec_code(solution, test_cases, problem_id)
print("result here", result)
if result == []:
return solution
# 处理代码执行失败的代码
elif type(result) == dict:
result = result["error"]
elif "exec_fail_case" in result:
result = result["exec_fail_case"]
prompt = REFLECTION_ON_PUBILIC_TEST_PROMPT.format(problem_description=problem, rephrase_problem=rephrase_problem, code_solution=solution, exec_pass=f"executed unsuccessfully, error: \n {result}", test_fail="executed unsucessfully")
node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm)
response = node.instruct_content.model_dump()
return {"final_solution":response["refined_solution"]}
else:
result = result["test_fail_case"]
prompt = REFLECTION_ON_PUBILIC_TEST_PROMPT.format(problem_description=problem, rephrase_problem=rephrase_problem, code_solution=solution, exec_pass="executed successfully", test_fail=result)
node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm)
response = node.instruct_content.model_dump()

View file

@ -43,7 +43,7 @@ Please maintain the JSON format in your response.
# """
GENERATE_CODEBLOCK_REPHRASE_PROMPT = """
You are given a code contest problem, and a self-reflection on the problem:
Please provide a self-contained Python script that solves the following problem in a markdown code block:
### Problem Description:
{problem_description}
@ -51,8 +51,11 @@ You are given a code contest problem, and a self-reflection on the problem:
### self reflection on the problem
{rephrase_problem}
=======================
The above is an incomplete Python code fragment and reflection on it. Return the complete and correct code with no additional text.
When creating your solution:
1. Consider all edge cases and boundary conditions.
2. Avoid oversimplification - address all aspects of the problem.
3. Ensure your logic covers all stated requirements.
4. Avoid adding additional test cases beyond those provided in the problem description.
"""
# GENERATE_CODEBLOCK_PROMPT = """
@ -67,10 +70,9 @@ Please provide a self-contained Python script that solves the following problem
When creating your solution:
1. Consider all edge cases and boundary conditions.
2. Consider the order of operations in your solution and how each step affects subsequent steps.
3. Avoid oversimplification - address all aspects of the problem.
4. Ensure your logic covers all stated requirements.
5. Avoid adding additional test cases beyond those provided in the problem description.
2. Avoid oversimplification - address all aspects of the problem.
3. Ensure your logic covers all stated requirements.
4. Avoid adding additional test cases beyond those provided in the problem description.
"""
REVIEW_PROMPT = """

View file

@ -71,17 +71,12 @@ def parse_python_literal(s):
except (ValueError, SyntaxError):
return s
def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test.jsonl"):
def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test_reflexion.jsonl"):
# 保留原有的硬编码测试用例
hardcoded_cases = {
"HumanEval/87": [ ["get_row", [[[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 1, 6], [1, 2, 3, 4, 5, 1]], 1], [(0, 0), (1, 4), (1, 0), (2, 5), (2, 0)]], ["get_row", [[], 1], []], ["get_row", [[[], [1], [1, 2, 3]], 3], [(2, 2)]] ],
"HumanEval/95": [ ["check_dict_case", [{"a": "apple", "b": "banana"}], True], ["check_dict_case", [{"a": "apple", "A": "banana", "B": "banana"}], False], ["check_dict_case", [{"a": "apple", "8": "banana", "a": "apple"}], False], ["check_dict_case", [{"Name": "John", "Age": "36", "City": "Houston"}], False], ["check_dict_case", [{"STATE": "NC", "ZIP": "12345"}], True] ],
"HumanEval/107": [ ["even_odd_palindrome", [3], (1, 2)], ["even_odd_palindrome", [12], (4, 6)] ],
"HumanEval/112": [ ["reverse_delete", ["abcde", "ae"], ("bcd", False)], ["reverse_delete", ["abcdef", "b"], ("acdef", False)], ["reverse_delete", ["abcdedcba", "ab"], ("cdedc", True)] ],
"HumanEval/127": [ ["intersection", [(1, 2), (2, 3)], "NO"], ["intersection", [(-1, 1), (0, 4)], "NO"], ["intersection", [(-3, -1), (-5, 5)], "YES"] ],
"HumanEval/136": [ ["largest_smallest_integers", [2, 4, 1, 3, 5, 7], (None, 1)], ["largest_smallest_integers", [], (None, None)], ["largest_smallest_integers", [0], (None, None)] ],
"HumanEval/148": [ ["bf", ["Jupiter", "Neptune"], ("Saturn", "Uranus")], ["bf", ["Earth", "Mercury"], ("Venus",)], ["bf", ["Mercury", "Uranus"], ("Venus", "Earth", "Mars", "Jupiter", "Saturn")], ["bf", ["InvalidPlanet", "Neptune"], ()], ["bf", ["Jupiter", "InvalidPlanet"], ()], ["bf", ["Mercury", "Mercury"], ()] ],
"HumanEval/155": [ ["even_odd_count", [-12], (1, 1)], ["even_odd_count", [123], (1, 2)] ]
"HumanEval/32": "",
"HumanEval/38": "",
"HumanEval/50": "",
}
# 检查是否有硬编码的测试用例
@ -92,16 +87,8 @@ def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test.jso
with open(file_path, 'r') as file:
for line in file:
data = json.loads(line)
if problem_id in data:
problem_data = data[problem_id]
# 处理测试用例
for i, test_case in enumerate(problem_data):
# 函数名保持不变
# 参数列表需要解析
test_case[1] = [parse_python_literal(arg) for arg in test_case[1]]
# 预期输出需要解析
test_case[2] = parse_python_literal(test_case[2])
return problem_data
if data.get("id") == problem_id:
return data.get("test")
return None # 如果没有找到问题,返回 None
@ -158,45 +145,53 @@ async def llm_extract_test_case(id, problem_description: str, file_path:str="pub
import json
def test_cases_2_test_functions(solution: str, test_case: List):
print("here",solution)
function_name = test_case[0]
# def test_cases_2_test_functions(solution: str, test_case: List):
# print("test_case", test_case)
# function_name = test_case[0]
def format_param(param):
if isinstance(param, str):
return repr(param)
elif isinstance(param, (int, float, bool)):
return str(param)
elif isinstance(param, list):
return '[' + ', '.join(format_param(item) for item in param) + ']'
elif isinstance(param, tuple):
return '(' + ', '.join(format_param(item) for item in param) + ')'
elif isinstance(param, dict):
return '{' + ', '.join(f'{format_param(k)}: {format_param(v)}' for k, v in param.items()) + '}'
elif isinstance(param, type(None)):
return 'None'
else:
raise ValueError(f"Unsupported parameter type: {type(param)}")
# def format_param(param):
# if isinstance(param, str):
# return repr(param)
# elif isinstance(param, (int, float, bool)):
# return str(param)
# elif isinstance(param, list):
# return '[' + ', '.join(format_param(item) for item in param) + ']'
# elif isinstance(param, tuple):
# return '(' + ', '.join(format_param(item) for item in param) + ')'
# elif isinstance(param, dict):
# return '{' + ', '.join(f'{format_param(k)}: {format_param(v)}' for k, v in param.items()) + '}'
# elif isinstance(param, type(None)):
# return 'None'
# else:
# raise ValueError(f"Unsupported parameter type: {type(param)}")
parameters = ', '.join(format_param(item) for item in test_case[1])
print(type(test_case[2]), test_case[2])
expected_output = format_param(test_case[2])
print(expected_output)
# parameters = ', '.join(format_param(item) for item in test_case[1])
# print(test_case[1], parameters)
# expected_output = format_param(test_case[2])
# print(type(test_case[2]), test_case[2], expected_output)
# tester_function = f"""
# {solution}
# def check(candidate):
# assert candidate({parameters}) == {expected_output}
# check({function_name})
# """
# print(f"""
# Generated test function:
# {tester_function}
# """)
# return tester_function
def test_cases_2_test_functions(solution: str, test_cases: str):
tester_function = f"""
{solution}
def check(candidate):
assert candidate({parameters}) == {expected_output}
check({function_name})
"""
print(f"""
Generated test function:
{tester_function}
""")
return tester_function
{test_cases}
"""
return tester_function