This commit is contained in:
didi 2024-07-09 14:51:27 +08:00
parent aeac3fe3f9
commit 86033a1037
6 changed files with 216 additions and 163 deletions

View file

@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-
# @Date : 7/7/2024 17:07 PM
# @Author : didi
# @Desc : test on human eval graph
import json
import subprocess
import sys
import asyncio
import aiofiles
from metagpt.llm import LLM
from evalplus.data import get_human_eval_plus, write_jsonl
from examples.ags.w_action_node.utils import jsonl_ranker
from examples.ags.w_action_node.graph import HumanEvalGraph
from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock
# Module-level operator and graph instances shared by the generators below.
# Every instance is constructed with its own LLM() call.
generate_code = GenerateCode(llm=LLM())
generate_code_block = GenerateCodeBlock(llm=LLM())
solver = HumanEvalGraph(
    name="solver",
    llm=LLM(),
    criteria='correctness, efficiency, readability',
    vote_count=5,
)
async def sample_generate(id):
    """Solve a single HumanEval+ task with the graph solver and record it.

    Args:
        id: Task identifier. It is formatted into a string and used as the
            key into the mapping returned by ``get_human_eval_plus()``.

    Side effects:
        Appends one JSON line with the keys ``task_id`` and ``solution`` to
        ``samples.jsonl``, then runs ``jsonl_ranker`` on that file in place.
    """
    task_key = f"{id}"
    problem = get_human_eval_plus()[task_key]
    outcome = await solver(problem['prompt'], ensemble_count=5)
    record = {
        'task_id': problem['task_id'],
        'solution': outcome['final_solution'],
    }
    with open("samples.jsonl", mode='a') as out:
        out.write(json.dumps(record) + '\n')
    jsonl_ranker("samples.jsonl", "samples.jsonl")
async def samples_generate(mode:str):
    """Generate a solution for every HumanEval+ task and append it to samples.jsonl.

    All tasks are solved concurrently. Tasks whose generation raised are
    retried one by one through ``sample_generate``. When no task remains
    failed, the EvalPlus evaluation is launched and the failing task ids it
    reports are printed.

    Args:
        mode: ``'llm'`` to call the ``generate_code`` operator directly, or
            ``'ags'`` to call the ``solver`` graph with ``ensemble_count=5``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values. Checked
            before any work starts; previously an unknown mode surfaced as a
            swallowed ``UnboundLocalError`` and marked every task as failed.
    """
    supported_modes = ('llm', 'ags')
    if mode not in supported_modes:
        raise ValueError(f"unsupported mode {mode!r}; expected one of {supported_modes}")

    cases = list(get_human_eval_plus().values())
    # Serialises the appends so concurrent tasks never interleave their lines.
    file_lock = asyncio.Lock()

    async def solve_and_write(case):
        """Solve one case and append it; return its task id on failure, else None."""
        try:
            if mode == 'llm':
                solution_result = await generate_code(case['prompt'])
                solution = solution_result['code_solution']
            else:  # mode == 'ags', guaranteed by the validation above
                solution_result = await solver(case['prompt'], ensemble_count=5)
                solution = solution_result['final_solution']
            sample_dict = {'task_id': case['task_id'], 'solution': solution}
            async with file_lock:
                async with aiofiles.open("samples.jsonl", mode='a') as f:
                    await f.write(json.dumps(sample_dict) + '\n')
            return None
        except Exception as e:
            # Best effort: report the error and let the retry pass handle it.
            print(e)
            return case['task_id']

    results = await asyncio.gather(*(solve_and_write(case) for case in cases))
    failed_tasks = [task_id for task_id in results if task_id is not None]

    # TODO: this retry step is still not automated enough.
    # NOTE(review): sample_generate always goes through the graph solver, so
    # retries in 'llm' mode do not use generate_code -- confirm this is intended.
    still_failed = []
    for task_id in failed_tasks:
        try:
            await sample_generate(task_id)
        except Exception as e:
            print(f"failure {task_id}: {e}")
            still_failed.append(task_id)

    # Presumably orders the sample file by task id -- verify against
    # jsonl_ranker. Called once here so the file is ranked before evaluation.
    jsonl_ranker("samples.jsonl", "samples.jsonl")

    # Evaluate only when every task ended up with a recorded solution,
    # including the ones recovered by the retry pass.
    if not still_failed:
        if automatic_evalplus():
            unpassed_examples = extract_failure_tests()
            print(unpassed_examples)
async def samples_generate_ags():
    """Solve every HumanEval+ task with the graph solver and write samples.jsonl.

    All tasks run concurrently through ``asyncio.gather``; the collected
    ``task_id``/``solution`` records are written in one ``write_jsonl`` call.
    """
    async def solve_with_id(case):
        # Pair the solver's final solution with the task it belongs to.
        outcome = await solver(case['prompt'], ensemble_count=3)
        return case['task_id'], outcome['final_solution']

    problems = list(get_human_eval_plus().values())
    solved = await asyncio.gather(*[solve_with_id(problem) for problem in problems])
    sample_list = [
        {'task_id': task_id, 'solution': solution}
        for task_id, solution in solved
    ]
    write_jsonl("samples.jsonl", sample_list)
async def samples_generate_llm():
    """Solve every HumanEval+ task with a single operator call and write samples.jsonl.

    Each task is sent to ``generate_code_block``; all calls run concurrently
    through ``asyncio.gather`` and the collected ``task_id``/``solution``
    records are written in one ``write_jsonl`` call.
    """
    async def solve_with_id(case):
        # Pair the operator's code solution with the task it belongs to.
        outcome = await generate_code_block(case['prompt'])
        return case['task_id'], outcome['code_solution']

    problems = list(get_human_eval_plus().values())
    solved = await asyncio.gather(*[solve_with_id(problem) for problem in problems])
    sample_list = [
        {'task_id': task_id, 'solution': solution}
        for task_id, solution in solved
    ]
    write_jsonl("samples.jsonl", sample_list)
def automatic_evalplus(result_path: str = "samples.jsonl"):
    """Run the EvalPlus evaluator on a samples file in a subprocess.

    Equivalent to executing on the command line:
    ``evalplus.evaluate --dataset humaneval --samples <result_path> --parallel 2 --base-only``

    Args:
        result_path: Path of the JSONL samples file to evaluate. Defaults to
            ``"samples.jsonl"``, the file the generators in this module write.

    Returns:
        True when the evaluator exits with status 0 (its stdout is printed),
        False when it exits with a non-zero status (its stderr is printed).
    """
    command = [
        sys.executable,  # use the current Python interpreter
        "-m",
        "evalplus.evaluate",
        "--dataset", "humaneval",
        "--samples", result_path,
        "--parallel", "2",
        "--base-only",
    ]
    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        result = subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print("错误输出:", e.stderr)
        return False
    print("输出:", result.stdout)
    return True
def extract_failure_tests(file_path:str = "/Users/trl/Github_project/MetaGPT-MathAI/samples_eval_results.json"):
    """Collect the ids of tasks whose first result failed the base tests.

    Args:
        file_path: JSON results file. It must contain an ``"eval"`` mapping
            whose values are lists; only the first entry of each list is
            inspected.
            NOTE(review): the default is an absolute path on one developer's
            machine -- callers elsewhere should pass the path explicitly.

    Returns:
        A list of ``{"task_id": ...}`` dicts, one per entry whose
        ``"base_status"`` equals ``"fail"``. The number of failures is also
        printed.
    """
    with open(file_path, 'r') as results_file:
        report = json.load(results_file)

    failures = [
        {"task_id": entries[0]["task_id"]}
        for entries in report['eval'].values()
        if entries[0]["base_status"] == "fail"
    ]
    print(len(failures))
    return failures
# asyncio.run(sample_generate('HumanEval/101'))
# asyncio.run(samples_generate(mode='llm'))
# jsonl_ranker("samples.jsonl", "samples.jsonl")
# {"task_id": "HumanEval/101", "solution": "def words_string(s):\n import re\n return re.split(r'[,\\s]\\s*', s)"}

View file

@ -16,7 +16,7 @@ class Graph:
NotImplementedError("Subclasses must implement __call__ method")
class HumanEvalGraph(Graph):
def __init__(self, name:str, llm: LLM, criteria:str, vote_count:int =3) -> None:
def __init__(self, name:str, llm: LLM, criteria:str, vote_count:int =5) -> None:
super().__init__(name, llm)
self.criteria = criteria # TODO 自动构建图时,图的初始参数与图所使用的算子要求的外部参数相匹配
self.generate_code = GenerateCode(llm=llm)
@ -29,11 +29,11 @@ class HumanEvalGraph(Graph):
async def __call__(self, problem:str, ensemble_count:int = 3):
solution_list = []
for _ in range(ensemble_count):
# solution = await self.generate_code(problem)
solution = await self.generate_code_block(problem)
solution = await self.generate_code(problem)
# solution = await self.generate_code_block(problem)
solution = solution.get('code_solution')
solution_list.append(solution)
solution = await self.mdensemble(solution_list, problem)
solution = await self.mdensemble("code", solution_list, problem)
return solution
async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2):
@ -44,14 +44,16 @@ class HumanEvalGraph(Graph):
solution = await self.ensemble(solution_list, problem)
return solution
async def simple_ensemble(self, problem:str):
solution_list = []
for _ in range(3):
solution = await self.generate_code(problem)
solution = solution.get('code_solution')
solution_list.append(solution)
solution = await self.ensemble(solution_list, problem)
return solution
# async def simple_ensemble(self, problem:str, ensemble_count:int = 3):
# async def __call__(self, problem:str, ensemble_count:int = 3):
# solution_list = []
# for _ in range(ensemble_count):
# solution = await self.generate_code(problem)
# # solution = await self.generate_code_block(problem)
# solution = solution.get('code_solution')
# solution_list.append(solution)
# solution = await self.ensemble(solution_list, problem)
# return solution
async def single_solve(self, problem:str, max_loop:int):
solution = await self.generate_code(problem)

View file

@ -2,7 +2,7 @@
# @Date : 6/27/2024 17:36 PM
# @Author : didi
# @Desc : operator demo of ags
import ast
import random
from typing import List, Tuple, Any, Dict
from collections import Counter
@ -90,7 +90,7 @@ class Ensemble(Operator):
response = node.instruct_content.model_dump()
return response
class MdEnsemble(Ensemble):
class MdEnsemble(Operator):
def __init__(self, name:str ="MdEnsembler", llm: LLM = LLM(), vote_count:int=3):
super().__init__(name, llm)
@ -100,21 +100,35 @@ class MdEnsemble(Ensemble):
def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, str]]:
shuffled_solutions = solutions.copy()
random.shuffle(shuffled_solutions)
answer_mapping = {
chr(65 + i): solutions.index(sol)
for i, sol in enumerate(shuffled_solutions)
}
# 这里的index方法会把检索到的放在第一个索引的位置。
answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)}
return shuffled_solutions, answer_mapping
@staticmethod
def most_frequent(lst: List[Any]) -> Tuple[Any, int]:
counter = Counter(lst)
most_common = counter.most_common(1)
return most_common[0] if most_common else (None, 0)
async def __call__(self, solutions:List[str], problem_description:str,):
async def __call__(self, solution_type:str ,solutions:List[str], problem_description:str):
all_responses = []
# 如果Solution方案是Code我们利用AST去重
if solution_type == "code":
original_length = len(solutions)
unique_structures = {}
updated_solutions = []
for solution in solutions:
try:
tree = ast.parse(solution)
structure_key = ast.dump(tree, annotate_fields=False, include_attributes=False)
if structure_key not in unique_structures:
unique_structures[structure_key] = solution
updated_solutions.append(solution)
except SyntaxError:
# If the solution has a syntax error, we'll skip it
continue
solutions = updated_solutions
updated_length = len(solutions)
print(f"Original number of solutions: {original_length}")
print(f"Updated number of solutions: {updated_length}")
if updated_length == 1:
return {"final_solution": solutions[0]}
for _ in range(self.vote_count):
shuffled_solutions, answer_mapping = self.shuffle_answers(solutions)
@ -131,38 +145,16 @@ class MdEnsemble(Ensemble):
if answer in answer_mapping:
original_index = answer_mapping[answer]
all_responses.append(solutions[original_index])
final_answer, frequency = self.most_frequent(all_responses)
print(f"original index: {original_index}")
all_responses.append(original_index)
most_frequent_index = Counter(all_responses).most_common(1)[0][0]
print(f"most frequent_index: {most_frequent_index}")
final_answer = solutions[most_frequent_index]
print(f"final answer: {final_answer}")
# final_answer, frequency = self.most_frequent(all_responses)
return {"final_solution": final_answer}
# def load_llm_configs(*config_names):
# """
# Load multiple LLM configurations and return a list of initialized LLMs.
# :param config_names: Variable number of configuration file names (without .yaml extension)
# :return: List of initialized LLM objects
# """
# llms = []
# for config_name in config_names:
# config_path = Path(f"~/.metagpt/{config_name}.yaml").expanduser()
# if config_path.exists():
# config = Config.from_yaml_file(config_path)
# llms.append(LLM(config.llm))
# else:
# print(f"Warning: Configuration file {config_path} not found. Skipping.")
# return llms
# 使用函数加载多个 LLM 配置
# llms = load_llm_configs("gpt-4o", "sonnet-35") # 你可以根据需要添加或删除配置
class ScEnsemble(Operator):
# TODO
pass

View file

@ -3,6 +3,10 @@
# @Author : didi
# @Desc : prompts of operators
# TODO PromptBreeder 评分是怎么做的?
# TODO 评估案例 GSM-8K 直接拿的DataSet
#
#
GENERATE_PROMPT = """
Generate Solution for the following problem: {problem_description}

View file

@ -1,103 +1,14 @@
import json
import asyncio
import aiofiles
from metagpt.llm import LLM
from evalplus.data import get_human_eval_plus, write_jsonl
from examples.ags.benchmark.humaneval import sample_generate, samples_generate, extract_failure_tests, automatic_evalplus
from examples.ags.w_action_node.utils import jsonl_ranker
from examples.ags.w_action_node.graph import HumanEvalGraph
from examples.ags.w_action_node.operator import GenerateCode
generate_code = GenerateCode(llm=LLM())
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5)
async def sample_generate(id):
case = get_human_eval_plus()[f"{id}"]
solution_result = await solver(case['prompt'],ensemble_count=3)
sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
with open("samples.jsonl", mode='a') as f:
f.write(json.dumps(sample_dict) + '\n')
jsonl_ranker("samples.jsonl", "samples.jsonl")
async def samples_generate(mode:str):
cases = list(get_human_eval_plus().values())
file_lock = asyncio.Lock()
async def solve_and_write(case, mode):
try:
if mode == 'llm':
solution_result = await generate_code(case['prompt'])
sample_dict = {
'task_id': case['task_id'],
'solution': solution_result['code_solution']
}
elif mode == "ags":
solution_result = await solver(case['prompt'], ensemble_count=3)
sample_dict = {
'task_id': case['task_id'],
'solution': solution_result['final_solution']
}
async with file_lock:
async with aiofiles.open("samples.jsonl", mode='a') as f:
await f.write(json.dumps(sample_dict) + '\n')
return None
except Exception as e:
print(e)
return case['task_id']
tasks = [solve_and_write(case, mode) for case in cases]
results = await asyncio.gather(*tasks)
failed_tasks = [task_id for task_id in results if task_id is not None]
# TODO 这个地方还是不够自动化
if failed_tasks:
for task_id in failed_tasks:
try:
await sample_generate(task_id)
except Exception as e:
print(f"failure {task_id}")
jsonl_ranker("samples.jsonl", "samples.jsonl")
async def samples_generate_ags():
sample_list = []
cases = list(get_human_eval_plus().values())
async def solve_with_id(case):
solution_result = await solver(case['prompt'], ensemble_count=3)
return case['task_id'], solution_result['final_solution']
tasks = [solve_with_id(case) for case in cases]
results = await asyncio.gather(*tasks)
for task_id, solution in results:
sample_dict = dict(task_id=task_id, solution=solution)
sample_list.append(sample_dict)
write_jsonl("samples.jsonl", sample_list)
async def samples_generate_llm():
sample_list = []
cases = list(get_human_eval_plus().values())
async def solve_with_id(case):
solution_result = await generate_code(case['prompt'])
return case['task_id'], solution_result['code_solution']
tasks = [solve_with_id(case) for case in cases]
results = await asyncio.gather(*tasks)
for task_id, solution in results:
sample_dict = dict(task_id=task_id, solution=solution)
sample_list.append(sample_dict)
write_jsonl("samples.jsonl", sample_list)
# asyncio.run(sample_generate('HumanEval/101'))
# asyncio.run(samples_generate_llm())
# asyncio.run(sample_generate('HumanEval/1'))
asyncio.run(samples_generate(mode='ags'))
# jsonl_ranker("samples.jsonl", "samples.jsonl")
# if automatic_evalplus():
# unpassed_exapmle = extract_failure_tests()
# print(unpassed_exapmle)

12
test.py
View file

@ -1,12 +0,0 @@
import asyncio
from examples.ags.w_action_node.graph import HumanEvalGraph
from metagpt.llm import LLM
human_eval_example = """
from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n
"""
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability')
final_result = asyncio.run(solver(human_eval_example))
print(final_result)