From 86033a10372263acc808eb412dd66e7a6c9758b4 Mon Sep 17 00:00:00 2001 From: didi <84363704+didiforgithub@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:51:27 +0800 Subject: [PATCH] Update --- examples/ags/benchmark/humaneval.py | 156 +++++++++++++++++++++++++ examples/ags/w_action_node/graph.py | 26 +++-- examples/ags/w_action_node/operator.py | 82 ++++++------- examples/ags/w_action_node/prompt.py | 4 + he_test.py | 99 +--------------- test.py | 12 -- 6 files changed, 216 insertions(+), 163 deletions(-) create mode 100644 examples/ags/benchmark/humaneval.py delete mode 100644 test.py diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py new file mode 100644 index 000000000..ea7b9dedb --- /dev/null +++ b/examples/ags/benchmark/humaneval.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- +# @Date : 7/7/2024 17:07 PM +# @Author : didi +# @Desc : test on human eval graph + +import json +import subprocess +import sys +import asyncio +import aiofiles +from metagpt.llm import LLM +from evalplus.data import get_human_eval_plus, write_jsonl +from examples.ags.w_action_node.utils import jsonl_ranker +from examples.ags.w_action_node.graph import HumanEvalGraph +from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock + +generate_code = GenerateCode(llm=LLM()) +generate_code_block = GenerateCodeBlock(llm=LLM()) + +solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5) + +async def sample_generate(id): + case = get_human_eval_plus()[f"{id}"] + solution_result = await solver(case['prompt'],ensemble_count=5) + sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution']) + with open("samples.jsonl", mode='a') as f: + f.write(json.dumps(sample_dict) + '\n') + jsonl_ranker("samples.jsonl", "samples.jsonl") + +async def samples_generate(mode:str): + cases = list(get_human_eval_plus().values()) + file_lock = asyncio.Lock() + + async def solve_and_write(case, mode): + try: + if mode == 'llm': + # solution_result = await generate_code_block(case['prompt']) + solution_result = await generate_code(case['prompt']) + sample_dict = { + 'task_id': case['task_id'], + 'solution': solution_result['code_solution'] + } + elif mode == "ags": + solution_result = await solver(case['prompt'], ensemble_count=5) + sample_dict = { + 'task_id': case['task_id'], + 'solution': solution_result['final_solution'] + } + + async with file_lock: + async with aiofiles.open("samples.jsonl", mode='a') as f: + await f.write(json.dumps(sample_dict) + '\n') + return None + + except Exception as e: + print(e) + return case['task_id'] + + tasks = [solve_and_write(case, mode) for case in cases] + results = await asyncio.gather(*tasks) + failed_tasks = [task_id for task_id in results if task_id is not None] + + # TODO 这个地方还是不够自动化 + if failed_tasks: + for task_id in failed_tasks: + try: + await sample_generate(task_id) + except Exception as e: + print(f"failure {task_id}") + jsonl_ranker("samples.jsonl", "samples.jsonl") + + if not failed_tasks: + if automatic_evalplus(): + unpassed_exapmle = extract_failure_tests() + print(unpassed_exapmle) + +async def samples_generate_ags(): + sample_list = [] + cases = list(get_human_eval_plus().values()) + + async def solve_with_id(case): + solution_result = await solver(case['prompt'], ensemble_count=3) + return case['task_id'], solution_result['final_solution'] + + tasks = [solve_with_id(case) for case in cases] + results = await asyncio.gather(*tasks) + + for task_id, solution in results: + sample_dict = dict(task_id=task_id, solution=solution) + sample_list.append(sample_dict) + + write_jsonl("samples.jsonl", sample_list) + +async def samples_generate_llm(): + sample_list = [] + cases = list(get_human_eval_plus().values()) + + async def solve_with_id(case): + solution_result = await generate_code_block(case['prompt']) + # solution_result = await generate_code(case['prompt']) + return case['task_id'], solution_result['code_solution'] + + tasks = [solve_with_id(case) for case in cases] + results = await asyncio.gather(*tasks) + + for task_id, solution in results: + sample_dict = dict(task_id=task_id, solution=solution) + sample_list.append(sample_dict) + + write_jsonl("samples.jsonl", sample_list) + +def automatic_evalplus(): + """ + 在命令行中自动执行 evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only + """ + command = [ + sys.executable, # 使用当前 Python 解释器 + "-m", + "evalplus.evaluate", + "--dataset", "humaneval", + "--samples", "samples.jsonl", + "--parallel", "2", + "--base-only" + ] + + try: + result = subprocess.run(command, check=True, capture_output=True, text=True) + print("输出:", result.stdout) + return True + except subprocess.CalledProcessError as e: + print("错误输出:", e.stderr) + return False + +def extract_failure_tests(file_path:str = "/Users/trl/Github_project/MetaGPT-MathAI/samples_eval_results.json"): + with open(file_path, 'r') as f: + task_results = json.load(f) + + failed_tests = [] + + for task in task_results['eval'].values(): + if task[0]["base_status"] == "fail": + failed_test = { + "task_id": task[0]["task_id"], + # "solution": task["solution"], + # "fail_tests": task["base_fail_tests"] + } + failed_tests.append(failed_test) + print(len(failed_tests)) + + return failed_tests + + +# asyncio.run(sample_generate('HumanEval/101')) +# asyncio.run(samples_generate(mode='llm')) +# jsonl_ranker("samples.jsonl", "samples.jsonl") +# {"task_id": "HumanEval/101", "solution": "def words_string(s):\n import re\n return re.split(r'[,\\s]\\s*', s)"} \ No newline at end of file diff --git a/examples/ags/w_action_node/graph.py b/examples/ags/w_action_node/graph.py index f9bc2e1b3..3870bfd6d 100644 --- a/examples/ags/w_action_node/graph.py +++ b/examples/ags/w_action_node/graph.py @@ -16,7 +16,7 @@ class Graph: NotImplementedError("Subclasses must implement __call__ method") class HumanEvalGraph(Graph): - def __init__(self, name:str, llm: LLM, criteria:str, vote_count:int =3) -> None: + def __init__(self, name:str, llm: LLM, criteria:str, vote_count:int =5) -> None: super().__init__(name, llm) self.criteria = criteria # TODO 自动构建图时,图的初始参数与图所使用的算子要求的外部参数相匹配 self.generate_code = GenerateCode(llm=llm) @@ -29,11 +29,11 @@ class HumanEvalGraph(Graph): async def __call__(self, problem:str, ensemble_count:int = 3): solution_list = [] for _ in range(ensemble_count): - # solution = await self.generate_code(problem) - solution = await self.generate_code_block(problem) + solution = await self.generate_code(problem) + # solution = await self.generate_code_block(problem) solution = solution.get('code_solution') solution_list.append(solution) - solution = await self.mdensemble(solution_list, problem) + solution = await self.mdensemble("code", solution_list, problem) return solution async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2): @@ -44,14 +44,16 @@ class HumanEvalGraph(Graph): solution = await self.ensemble(solution_list, problem) return solution - async def simple_ensemble(self, problem:str): - solution_list = [] - for _ in range(3): - solution = await self.generate_code(problem) - solution = solution.get('code_solution') - solution_list.append(solution) - solution = await self.ensemble(solution_list, problem) - return solution + # async def simple_ensemble(self, problem:str, ensemble_count:int = 3): + # async def __call__(self, problem:str, ensemble_count:int = 3): + # solution_list = [] + # for _ in range(ensemble_count): + # solution = await self.generate_code(problem) + # # solution = await self.generate_code_block(problem) + # solution = solution.get('code_solution') + # solution_list.append(solution) + # solution = await self.ensemble(solution_list, problem) + # return solution async def single_solve(self, problem:str, max_loop:int): solution = await self.generate_code(problem) diff --git a/examples/ags/w_action_node/operator.py b/examples/ags/w_action_node/operator.py index b6b6e1901..3b832cc18 100644 --- a/examples/ags/w_action_node/operator.py +++ b/examples/ags/w_action_node/operator.py @@ -2,7 +2,7 @@ # @Date : 6/27/2024 17:36 PM # @Author : didi # @Desc : operator demo of ags - +import ast import random from typing import List, Tuple, Any, Dict from collections import Counter @@ -90,7 +90,7 @@ class Ensemble(Operator): response = node.instruct_content.model_dump() return response -class MdEnsemble(Ensemble): +class MdEnsemble(Operator): def __init__(self, name:str ="MdEnsembler", llm: LLM = LLM(), vote_count:int=3): super().__init__(name, llm) @@ -100,21 +100,35 @@ class MdEnsemble(Ensemble): def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, str]]: shuffled_solutions = solutions.copy() random.shuffle(shuffled_solutions) - answer_mapping = { - chr(65 + i): solutions.index(sol) - for i, sol in enumerate(shuffled_solutions) - } + # 这里的index方法会把检索到的放在第一个索引的位置。 + answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)} return shuffled_solutions, answer_mapping - - @staticmethod - def most_frequent(lst: List[Any]) -> Tuple[Any, int]: - counter = Counter(lst) - most_common = counter.most_common(1) - return most_common[0] if most_common else (None, 0) - async def __call__(self, solutions:List[str], problem_description:str,): + async def __call__(self, solution_type:str ,solutions:List[str], problem_description:str): all_responses = [] + # 如果Solution方案是Code,我们利用AST去重 + if solution_type == "code": + original_length = len(solutions) + unique_structures = {} + updated_solutions = [] + for solution in solutions: + try: + tree = ast.parse(solution) + structure_key = ast.dump(tree, annotate_fields=False, include_attributes=False) + + if structure_key not in unique_structures: + unique_structures[structure_key] = solution + updated_solutions.append(solution) + except SyntaxError: + # If the solution has a syntax error, we'll skip it + continue + solutions = updated_solutions + updated_length = len(solutions) + print(f"Original number of solutions: {original_length}") + print(f"Updated number of solutions: {updated_length}") + if updated_length == 1: + return {"final_solution": solutions[0]} for _ in range(self.vote_count): shuffled_solutions, answer_mapping = self.shuffle_answers(solutions) @@ -131,38 +145,16 @@ class MdEnsemble(Ensemble): if answer in answer_mapping: original_index = answer_mapping[answer] - all_responses.append(solutions[original_index]) - - final_answer, frequency = self.most_frequent(all_responses) + print(f"original index: {original_index}") + all_responses.append(original_index) + most_frequent_index = Counter(all_responses).most_common(1)[0][0] + print(f"most frequent_index: {most_frequent_index}") + final_answer = solutions[most_frequent_index] + print(f"final answer: {final_answer}") + # final_answer, frequency = self.most_frequent(all_responses) return {"final_solution": final_answer} - - - - - - - - - -# def load_llm_configs(*config_names): -# """ -# Load multiple LLM configurations and return a list of initialized LLMs. - -# :param config_names: Variable number of configuration file names (without .yaml extension) -# :return: List of initialized LLM objects -# """ -# llms = [] -# for config_name in config_names: -# config_path = Path(f"~/.metagpt/{config_name}.yaml").expanduser() -# if config_path.exists(): -# config = Config.from_yaml_file(config_path) -# llms.append(LLM(config.llm)) -# else: -# print(f"Warning: Configuration file {config_path} not found. Skipping.") -# return llms - - -# 使用函数加载多个 LLM 配置 -# llms = load_llm_configs("gpt-4o", "sonnet-35") # 你可以根据需要添加或删除配置 \ No newline at end of file +class ScEnsemble(Operator): + # TODO + pass \ No newline at end of file diff --git a/examples/ags/w_action_node/prompt.py b/examples/ags/w_action_node/prompt.py index 8b815715d..dcc1428de 100644 --- a/examples/ags/w_action_node/prompt.py +++ b/examples/ags/w_action_node/prompt.py @@ -3,6 +3,10 @@ # @Author : didi # @Desc : prompts of operators +# TODO PromptBreeder 评分是怎么做的? +# TODO 评估案例 GSM-8K 直接拿的DataSet +# +# GENERATE_PROMPT = """ Generate Solution for the following problem: {problem_description} diff --git a/he_test.py b/he_test.py index b410f86a7..fa827a4c1 100644 --- a/he_test.py +++ b/he_test.py @@ -1,103 +1,14 @@ -import json import asyncio -import aiofiles from metagpt.llm import LLM -from evalplus.data import get_human_eval_plus, write_jsonl +from examples.ags.benchmark.humaneval import sample_generate, samples_generate, extract_failure_tests, automatic_evalplus from examples.ags.w_action_node.utils import jsonl_ranker -from examples.ags.w_action_node.graph import HumanEvalGraph -from examples.ags.w_action_node.operator import GenerateCode - -generate_code = GenerateCode(llm=LLM()) - -solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5) - -async def sample_generate(id): - case = get_human_eval_plus()[f"{id}"] - solution_result = await solver(case['prompt'],ensemble_count=3) - sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution']) - with open("samples.jsonl", mode='a') as f: - f.write(json.dumps(sample_dict) + '\n') - jsonl_ranker("samples.jsonl", "samples.jsonl") - -async def samples_generate(mode:str): - cases = list(get_human_eval_plus().values()) - file_lock = asyncio.Lock() - - async def solve_and_write(case, mode): - try: - if mode == 'llm': - solution_result = await generate_code(case['prompt']) - sample_dict = { - 'task_id': case['task_id'], - 'solution': solution_result['code_solution'] - } - elif mode == "ags": - solution_result = await solver(case['prompt'], ensemble_count=3) - sample_dict = { - 'task_id': case['task_id'], - 'solution': solution_result['final_solution'] - } - - async with file_lock: - async with aiofiles.open("samples.jsonl", mode='a') as f: - await f.write(json.dumps(sample_dict) + '\n') - return None - - except Exception as e: - print(e) - return case['task_id'] - - tasks = [solve_and_write(case, mode) for case in cases] - results = await asyncio.gather(*tasks) - failed_tasks = [task_id for task_id in results if task_id is not None] - - # TODO 这个地方还是不够自动化 - if failed_tasks: - for task_id in failed_tasks: - try: - await sample_generate(task_id) - except Exception as e: - print(f"failure {task_id}") - jsonl_ranker("samples.jsonl", "samples.jsonl") - -async def samples_generate_ags(): - sample_list = [] - cases = list(get_human_eval_plus().values()) - - async def solve_with_id(case): - solution_result = await solver(case['prompt'], ensemble_count=3) - return case['task_id'], solution_result['final_solution'] - - tasks = [solve_with_id(case) for case in cases] - results = await asyncio.gather(*tasks) - - for task_id, solution in results: - sample_dict = dict(task_id=task_id, solution=solution) - sample_list.append(sample_dict) - - write_jsonl("samples.jsonl", sample_list) - -async def samples_generate_llm(): - sample_list = [] - cases = list(get_human_eval_plus().values()) - - async def solve_with_id(case): - solution_result = await generate_code(case['prompt']) - return case['task_id'], solution_result['code_solution'] - - tasks = [solve_with_id(case) for case in cases] - results = await asyncio.gather(*tasks) - - for task_id, solution in results: - sample_dict = dict(task_id=task_id, solution=solution) - sample_list.append(sample_dict) - - write_jsonl("samples.jsonl", sample_list) # asyncio.run(sample_generate('HumanEval/101')) -# asyncio.run(samples_generate_llm()) +# asyncio.run(sample_generate('HumanEval/1')) asyncio.run(samples_generate(mode='ags')) # jsonl_ranker("samples.jsonl", "samples.jsonl") - +# if automatic_evalplus(): +# unpassed_exapmle = extract_failure_tests() +# print(unpassed_exapmle) \ No newline at end of file diff --git a/test.py b/test.py deleted file mode 100644 index 78db8c0b4..000000000 --- a/test.py +++ /dev/null @@ -1,12 +0,0 @@ -import asyncio -from examples.ags.w_action_node.graph import HumanEvalGraph -from metagpt.llm import LLM - -human_eval_example = """ -from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n -""" - -solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability') - -final_result = asyncio.run(solver(human_eval_example)) -print(final_result) \ No newline at end of file