mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-08 15:05:17 +02:00
Update
This commit is contained in:
parent
aeac3fe3f9
commit
86033a1037
6 changed files with 216 additions and 163 deletions
156
examples/ags/benchmark/humaneval.py
Normal file
156
examples/ags/benchmark/humaneval.py
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Date : 7/7/2024 17:07 PM
|
||||
# @Author : didi
|
||||
# @Desc : test on human eval graph
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import asyncio
|
||||
import aiofiles
|
||||
from metagpt.llm import LLM
|
||||
from evalplus.data import get_human_eval_plus, write_jsonl
|
||||
from examples.ags.w_action_node.utils import jsonl_ranker
|
||||
from examples.ags.w_action_node.graph import HumanEvalGraph
|
||||
from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock
|
||||
|
||||
# Module-level operators and graph used by the generator functions in this
# file.  Each of them is constructed with its own ``LLM()`` call.
generate_code = GenerateCode(llm=LLM())
generate_code_block = GenerateCodeBlock(llm=LLM())

solver = HumanEvalGraph(
    name="solver",
    llm=LLM(),
    criteria='correctness, efficiency, readability',
    vote_count=5,
)
|
||||
|
||||
async def sample_generate(id):
    """Solve a single HumanEval+ task with ``solver`` and append it to ``samples.jsonl``.

    ``id`` is the dataset key of the task (for example ``'HumanEval/101'``).
    Once the record has been appended, ``samples.jsonl`` is handed to
    ``jsonl_ranker`` with the same path as input and output.
    """
    problem = get_human_eval_plus()[f"{id}"]
    outcome = await solver(problem['prompt'], ensemble_count=5)
    record = {
        'task_id': problem['task_id'],
        'solution': outcome['final_solution'],
    }
    with open("samples.jsonl", mode='a') as out:
        out.write(json.dumps(record) + '\n')
    jsonl_ranker("samples.jsonl", "samples.jsonl")
|
||||
|
||||
async def samples_generate(mode:str):
    """Generate a solution for every HumanEval+ task and append them to ``samples.jsonl``.

    All tasks are solved concurrently.  A task whose first attempt raises is
    retried once, sequentially, through ``sample_generate``.  When no task is
    left failing after the retries, the EvalPlus evaluation is started and the
    tasks that did not pass are printed.

    Args:
        mode: ``'llm'`` to call the ``generate_code`` operator directly, or
            ``'ags'`` to call the ``solver`` graph.

    Raises:
        ValueError: if ``mode`` is neither ``'llm'`` nor ``'ags'``.
    """
    if mode not in ('llm', 'ags'):
        # Without this check an unknown mode leaves the sample unbound inside
        # the worker, so every task "fails" and is silently retried.
        raise ValueError(f"unknown mode {mode!r}; expected 'llm' or 'ags'")

    cases = list(get_human_eval_plus().values())
    # Serialises the appends of the concurrently running workers.
    file_lock = asyncio.Lock()

    async def solve_and_write(case, mode):
        """Solve one case and append it; return its task id on failure, else None."""
        try:
            if mode == 'llm':
                solution_result = await generate_code(case['prompt'])
                solution = solution_result['code_solution']
            else:
                solution_result = await solver(case['prompt'], ensemble_count=5)
                solution = solution_result['final_solution']
            sample_dict = {'task_id': case['task_id'], 'solution': solution}

            async with file_lock:
                async with aiofiles.open("samples.jsonl", mode='a') as f:
                    await f.write(json.dumps(sample_dict) + '\n')
            return None

        except Exception as e:
            # Best effort: report the error and let the caller retry this task.
            print(e)
            return case['task_id']

    tasks = [solve_and_write(case, mode) for case in cases]
    results = await asyncio.gather(*tasks)
    failed_tasks = [task_id for task_id in results if task_id is not None]

    # TODO: this retry step is still not automated enough.
    # NOTE: sample_generate always goes through the graph solver, whatever
    # ``mode`` was requested here.
    still_failed = []
    for task_id in failed_tasks:
        try:
            await sample_generate(task_id)
        except Exception:
            print(f"failure {task_id}")
            still_failed.append(task_id)
    jsonl_ranker("samples.jsonl", "samples.jsonl")

    # Evaluate only when every task ended up with a sample, including the
    # ones that needed a retry.
    if not still_failed:
        if automatic_evalplus():
            unpassed_example = extract_failure_tests()
            print(unpassed_example)
|
||||
|
||||
async def samples_generate_ags():
    """Solve every HumanEval+ task with the ``solver`` graph and write ``samples.jsonl``.

    All tasks are awaited together through ``asyncio.gather``; the file is
    written once with ``write_jsonl`` after all of them have finished.
    """
    problems = list(get_human_eval_plus().values())

    async def solve_with_id(case):
        outcome = await solver(case['prompt'], ensemble_count=3)
        return case['task_id'], outcome['final_solution']

    solved = await asyncio.gather(*[solve_with_id(problem) for problem in problems])
    sample_list = [
        {'task_id': task_id, 'solution': solution}
        for task_id, solution in solved
    ]
    write_jsonl("samples.jsonl", sample_list)
|
||||
|
||||
async def samples_generate_llm():
    """Solve every HumanEval+ task with ``generate_code_block`` and write ``samples.jsonl``.

    All tasks are awaited together through ``asyncio.gather``; the file is
    written once with ``write_jsonl`` after all of them have finished.
    """
    problems = list(get_human_eval_plus().values())

    async def solve_with_id(case):
        # ``generate_code`` is the alternative operator available in this module.
        outcome = await generate_code_block(case['prompt'])
        return case['task_id'], outcome['code_solution']

    solved = await asyncio.gather(*[solve_with_id(problem) for problem in problems])
    sample_list = [
        {'task_id': task_id, 'solution': solution}
        for task_id, solution in solved
    ]
    write_jsonl("samples.jsonl", sample_list)
|
||||
|
||||
def automatic_evalplus():
    """Run the EvalPlus evaluation of ``samples.jsonl`` in a subprocess.

    Executes, with the current Python interpreter, the equivalent of
    ``evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only``.

    Returns:
        ``True`` when the command exits with status 0 (its stdout is printed),
        ``False`` otherwise (its stderr is printed).
    """
    # Use the interpreter that is running this script.
    command = [sys.executable, "-m", "evalplus.evaluate"]
    command += ["--dataset", "humaneval"]
    command += ["--samples", "samples.jsonl"]
    command += ["--parallel", "2"]
    command.append("--base-only")

    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode != 0:
        print("错误输出:", completed.stderr)
        return False
    print("输出:", completed.stdout)
    return True
|
||||
|
||||
def extract_failure_tests(file_path:str = "samples_eval_results.json"):
    """Collect the tasks whose base tests failed in an evaluation result file.

    Args:
        file_path: path of the JSON result file.  Defaults to
            ``samples_eval_results.json`` in the working directory, next to
            the ``samples.jsonl`` this module writes (presumably where
            ``evalplus.evaluate`` stores its results -- confirm against the
            installed EvalPlus version).

    Returns:
        A list of ``{"task_id": ...}`` dicts, one for every task whose first
        result entry has ``base_status == "fail"``.  The number of such tasks
        is also printed.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        task_results = json.load(f)

    # Each value under "eval" is a list of result entries; only the first
    # entry of every task is inspected.
    failed_tests = [
        {"task_id": task[0]["task_id"]}
        for task in task_results['eval'].values()
        if task[0]["base_status"] == "fail"
    ]
    print(len(failed_tests))

    return failed_tests
|
||||
|
||||
|
||||
# asyncio.run(sample_generate('HumanEval/101'))
|
||||
# asyncio.run(samples_generate(mode='llm'))
|
||||
# jsonl_ranker("samples.jsonl", "samples.jsonl")
|
||||
# {"task_id": "HumanEval/101", "solution": "def words_string(s):\n import re\n return re.split(r'[,\\s]\\s*', s)"}
|
||||
|
|
@ -16,7 +16,7 @@ class Graph:
|
|||
NotImplementedError("Subclasses must implement __call__ method")
|
||||
|
||||
class HumanEvalGraph(Graph):
|
||||
def __init__(self, name:str, llm: LLM, criteria:str, vote_count:int =3) -> None:
|
||||
def __init__(self, name:str, llm: LLM, criteria:str, vote_count:int =5) -> None:
|
||||
super().__init__(name, llm)
|
||||
self.criteria = criteria # TODO 自动构建图时,图的初始参数与图所使用的算子要求的外部参数相匹配
|
||||
self.generate_code = GenerateCode(llm=llm)
|
||||
|
|
@ -29,11 +29,11 @@ class HumanEvalGraph(Graph):
|
|||
async def __call__(self, problem:str, ensemble_count:int = 3):
|
||||
solution_list = []
|
||||
for _ in range(ensemble_count):
|
||||
# solution = await self.generate_code(problem)
|
||||
solution = await self.generate_code_block(problem)
|
||||
solution = await self.generate_code(problem)
|
||||
# solution = await self.generate_code_block(problem)
|
||||
solution = solution.get('code_solution')
|
||||
solution_list.append(solution)
|
||||
solution = await self.mdensemble(solution_list, problem)
|
||||
solution = await self.mdensemble("code", solution_list, problem)
|
||||
return solution
|
||||
|
||||
async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2):
|
||||
|
|
@ -44,14 +44,16 @@ class HumanEvalGraph(Graph):
|
|||
solution = await self.ensemble(solution_list, problem)
|
||||
return solution
|
||||
|
||||
async def simple_ensemble(self, problem:str):
|
||||
solution_list = []
|
||||
for _ in range(3):
|
||||
solution = await self.generate_code(problem)
|
||||
solution = solution.get('code_solution')
|
||||
solution_list.append(solution)
|
||||
solution = await self.ensemble(solution_list, problem)
|
||||
return solution
|
||||
# async def simple_ensemble(self, problem:str, ensemble_count:int = 3):
|
||||
# async def __call__(self, problem:str, ensemble_count:int = 3):
|
||||
# solution_list = []
|
||||
# for _ in range(ensemble_count):
|
||||
# solution = await self.generate_code(problem)
|
||||
# # solution = await self.generate_code_block(problem)
|
||||
# solution = solution.get('code_solution')
|
||||
# solution_list.append(solution)
|
||||
# solution = await self.ensemble(solution_list, problem)
|
||||
# return solution
|
||||
|
||||
async def single_solve(self, problem:str, max_loop:int):
|
||||
solution = await self.generate_code(problem)
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
# @Date : 6/27/2024 17:36 PM
|
||||
# @Author : didi
|
||||
# @Desc : operator demo of ags
|
||||
|
||||
import ast
|
||||
import random
|
||||
from typing import List, Tuple, Any, Dict
|
||||
from collections import Counter
|
||||
|
|
@ -90,7 +90,7 @@ class Ensemble(Operator):
|
|||
response = node.instruct_content.model_dump()
|
||||
return response
|
||||
|
||||
class MdEnsemble(Ensemble):
|
||||
class MdEnsemble(Operator):
|
||||
|
||||
def __init__(self, name:str ="MdEnsembler", llm: LLM = LLM(), vote_count:int=3):
|
||||
super().__init__(name, llm)
|
||||
|
|
@ -100,21 +100,35 @@ class MdEnsemble(Ensemble):
|
|||
def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, str]]:
|
||||
shuffled_solutions = solutions.copy()
|
||||
random.shuffle(shuffled_solutions)
|
||||
answer_mapping = {
|
||||
chr(65 + i): solutions.index(sol)
|
||||
for i, sol in enumerate(shuffled_solutions)
|
||||
}
|
||||
# 这里的index方法会把检索到的放在第一个索引的位置。
|
||||
answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)}
|
||||
return shuffled_solutions, answer_mapping
|
||||
|
||||
@staticmethod
|
||||
def most_frequent(lst: List[Any]) -> Tuple[Any, int]:
|
||||
counter = Counter(lst)
|
||||
most_common = counter.most_common(1)
|
||||
return most_common[0] if most_common else (None, 0)
|
||||
|
||||
async def __call__(self, solutions:List[str], problem_description:str,):
|
||||
async def __call__(self, solution_type:str ,solutions:List[str], problem_description:str):
|
||||
all_responses = []
|
||||
# 如果Solution方案是Code,我们利用AST去重
|
||||
if solution_type == "code":
|
||||
original_length = len(solutions)
|
||||
unique_structures = {}
|
||||
updated_solutions = []
|
||||
|
||||
for solution in solutions:
|
||||
try:
|
||||
tree = ast.parse(solution)
|
||||
structure_key = ast.dump(tree, annotate_fields=False, include_attributes=False)
|
||||
|
||||
if structure_key not in unique_structures:
|
||||
unique_structures[structure_key] = solution
|
||||
updated_solutions.append(solution)
|
||||
except SyntaxError:
|
||||
# If the solution has a syntax error, we'll skip it
|
||||
continue
|
||||
solutions = updated_solutions
|
||||
updated_length = len(solutions)
|
||||
print(f"Original number of solutions: {original_length}")
|
||||
print(f"Updated number of solutions: {updated_length}")
|
||||
if updated_length == 1:
|
||||
return {"final_solution": solutions[0]}
|
||||
for _ in range(self.vote_count):
|
||||
shuffled_solutions, answer_mapping = self.shuffle_answers(solutions)
|
||||
|
||||
|
|
@ -131,38 +145,16 @@ class MdEnsemble(Ensemble):
|
|||
|
||||
if answer in answer_mapping:
|
||||
original_index = answer_mapping[answer]
|
||||
all_responses.append(solutions[original_index])
|
||||
|
||||
final_answer, frequency = self.most_frequent(all_responses)
|
||||
print(f"original index: {original_index}")
|
||||
all_responses.append(original_index)
|
||||
|
||||
most_frequent_index = Counter(all_responses).most_common(1)[0][0]
|
||||
print(f"most frequent_index: {most_frequent_index}")
|
||||
final_answer = solutions[most_frequent_index]
|
||||
print(f"final answer: {final_answer}")
|
||||
# final_answer, frequency = self.most_frequent(all_responses)
|
||||
return {"final_solution": final_answer}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# def load_llm_configs(*config_names):
|
||||
# """
|
||||
# Load multiple LLM configurations and return a list of initialized LLMs.
|
||||
|
||||
# :param config_names: Variable number of configuration file names (without .yaml extension)
|
||||
# :return: List of initialized LLM objects
|
||||
# """
|
||||
# llms = []
|
||||
# for config_name in config_names:
|
||||
# config_path = Path(f"~/.metagpt/{config_name}.yaml").expanduser()
|
||||
# if config_path.exists():
|
||||
# config = Config.from_yaml_file(config_path)
|
||||
# llms.append(LLM(config.llm))
|
||||
# else:
|
||||
# print(f"Warning: Configuration file {config_path} not found. Skipping.")
|
||||
# return llms
|
||||
|
||||
|
||||
# 使用函数加载多个 LLM 配置
|
||||
# llms = load_llm_configs("gpt-4o", "sonnet-35") # 你可以根据需要添加或删除配置
|
||||
class ScEnsemble(Operator):
    """Placeholder operator; nothing is implemented on it yet."""

    # TODO: implement.
|
||||
|
|
@ -3,6 +3,10 @@
|
|||
# @Author : didi
|
||||
# @Desc : prompts of operators
|
||||
|
||||
# TODO PromptBreeder 评分是怎么做的?
|
||||
# TODO 评估案例 GSM-8K 直接拿的DataSet
|
||||
#
|
||||
#
|
||||
|
||||
GENERATE_PROMPT = """
|
||||
Generate Solution for the following problem: {problem_description}
|
||||
|
|
|
|||
99
he_test.py
99
he_test.py
|
|
@ -1,103 +1,14 @@
|
|||
import json
|
||||
import asyncio
|
||||
import aiofiles
|
||||
from metagpt.llm import LLM
|
||||
from evalplus.data import get_human_eval_plus, write_jsonl
|
||||
from examples.ags.benchmark.humaneval import sample_generate, samples_generate, extract_failure_tests, automatic_evalplus
|
||||
from examples.ags.w_action_node.utils import jsonl_ranker
|
||||
from examples.ags.w_action_node.graph import HumanEvalGraph
|
||||
from examples.ags.w_action_node.operator import GenerateCode
|
||||
|
||||
generate_code = GenerateCode(llm=LLM())
|
||||
|
||||
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5)
|
||||
|
||||
async def sample_generate(id):
|
||||
case = get_human_eval_plus()[f"{id}"]
|
||||
solution_result = await solver(case['prompt'],ensemble_count=3)
|
||||
sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
|
||||
with open("samples.jsonl", mode='a') as f:
|
||||
f.write(json.dumps(sample_dict) + '\n')
|
||||
jsonl_ranker("samples.jsonl", "samples.jsonl")
|
||||
|
||||
async def samples_generate(mode:str):
|
||||
cases = list(get_human_eval_plus().values())
|
||||
file_lock = asyncio.Lock()
|
||||
|
||||
async def solve_and_write(case, mode):
|
||||
try:
|
||||
if mode == 'llm':
|
||||
solution_result = await generate_code(case['prompt'])
|
||||
sample_dict = {
|
||||
'task_id': case['task_id'],
|
||||
'solution': solution_result['code_solution']
|
||||
}
|
||||
elif mode == "ags":
|
||||
solution_result = await solver(case['prompt'], ensemble_count=3)
|
||||
sample_dict = {
|
||||
'task_id': case['task_id'],
|
||||
'solution': solution_result['final_solution']
|
||||
}
|
||||
|
||||
async with file_lock:
|
||||
async with aiofiles.open("samples.jsonl", mode='a') as f:
|
||||
await f.write(json.dumps(sample_dict) + '\n')
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return case['task_id']
|
||||
|
||||
tasks = [solve_and_write(case, mode) for case in cases]
|
||||
results = await asyncio.gather(*tasks)
|
||||
failed_tasks = [task_id for task_id in results if task_id is not None]
|
||||
|
||||
# TODO 这个地方还是不够自动化
|
||||
if failed_tasks:
|
||||
for task_id in failed_tasks:
|
||||
try:
|
||||
await sample_generate(task_id)
|
||||
except Exception as e:
|
||||
print(f"failure {task_id}")
|
||||
jsonl_ranker("samples.jsonl", "samples.jsonl")
|
||||
|
||||
async def samples_generate_ags():
|
||||
sample_list = []
|
||||
cases = list(get_human_eval_plus().values())
|
||||
|
||||
async def solve_with_id(case):
|
||||
solution_result = await solver(case['prompt'], ensemble_count=3)
|
||||
return case['task_id'], solution_result['final_solution']
|
||||
|
||||
tasks = [solve_with_id(case) for case in cases]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
for task_id, solution in results:
|
||||
sample_dict = dict(task_id=task_id, solution=solution)
|
||||
sample_list.append(sample_dict)
|
||||
|
||||
write_jsonl("samples.jsonl", sample_list)
|
||||
|
||||
async def samples_generate_llm():
|
||||
sample_list = []
|
||||
cases = list(get_human_eval_plus().values())
|
||||
|
||||
async def solve_with_id(case):
|
||||
solution_result = await generate_code(case['prompt'])
|
||||
return case['task_id'], solution_result['code_solution']
|
||||
|
||||
tasks = [solve_with_id(case) for case in cases]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
for task_id, solution in results:
|
||||
sample_dict = dict(task_id=task_id, solution=solution)
|
||||
sample_list.append(sample_dict)
|
||||
|
||||
write_jsonl("samples.jsonl", sample_list)
|
||||
|
||||
# asyncio.run(sample_generate('HumanEval/101'))
|
||||
# asyncio.run(samples_generate_llm())
|
||||
# asyncio.run(sample_generate('HumanEval/1'))
|
||||
asyncio.run(samples_generate(mode='ags'))
|
||||
# jsonl_ranker("samples.jsonl", "samples.jsonl")
|
||||
|
||||
|
||||
|
||||
# if automatic_evalplus():
|
||||
# unpassed_exapmle = extract_failure_tests()
|
||||
# print(unpassed_exapmle)
|
||||
12
test.py
12
test.py
|
|
@ -1,12 +0,0 @@
|
|||
import asyncio
|
||||
from examples.ags.w_action_node.graph import HumanEvalGraph
|
||||
from metagpt.llm import LLM
|
||||
|
||||
human_eval_example = """
|
||||
from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n
|
||||
"""
|
||||
|
||||
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability')
|
||||
|
||||
final_result = asyncio.run(solver(human_eval_example))
|
||||
print(final_result)
|
||||
Loading…
Add table
Add a link
Reference in a new issue