From 4e0a896bdcb8fcf70ea3b454558beb923ac4b1cf Mon Sep 17 00:00:00 2001
From: didi <84363704+didiforgithub@users.noreply.github.com>
Date: Mon, 9 Sep 2024 17:17:15 +0800
Subject: [PATCH] Add baseline examples; change how the context-fill format is
 recognized
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/ags/benchmark/gsm8k.py               | 117 ++++++++++--
 examples/ags/benchmark/utils.py               |  17 ++
 .../ags/experiments/baselines/cot_gsm8k.py    |  73 ++++++++
 examples/ags/experiments/baselines/io.py      |   0
 .../ags/experiments/baselines/llm_debate.py   |   0
 .../ags/experiments/baselines/medprompt.py    |   0
 .../ags/experiments/baselines/reflexion.py    |   0
 .../experiments/baselines/self_consistency.py |   0
 .../ags/experiments/baselines/self_refine.py  |   0
 examples/ags/scripts/evaluator.py             | 171 ++----------------
 examples/ags/scripts/graph.py                 |   3 +-
 examples/ags/scripts/prompt.py                |  12 ++
 metagpt/actions/action_node.py                |  43 ++++-
 13 files changed, 254 insertions(+), 182 deletions(-)
 create mode 100644 examples/ags/benchmark/utils.py
 create mode 100644 examples/ags/experiments/baselines/cot_gsm8k.py
 create mode 100644 examples/ags/experiments/baselines/io.py
 create mode 100644 examples/ags/experiments/baselines/llm_debate.py
 create mode 100644 examples/ags/experiments/baselines/medprompt.py
 create mode 100644 examples/ags/experiments/baselines/reflexion.py
 create mode 100644 examples/ags/experiments/baselines/self_consistency.py
 create mode 100644 examples/ags/experiments/baselines/self_refine.py

diff --git a/examples/ags/benchmark/gsm8k.py b/examples/ags/benchmark/gsm8k.py
index d4566217c..97dce7bff 100644
--- a/examples/ags/benchmark/gsm8k.py
+++ b/examples/ags/benchmark/gsm8k.py
@@ -1,29 +1,110 @@
 # -*- coding: utf-8 -*-
 # @Date    :
-# @Author  : issac
+# @Author  : all
 # @Desc    : test on gsm8k
+
+import re
+import json
 import asyncio
+import aiofiles
+import pandas as pd
+from typing import Optional, List, Tuple, Callable
+from tqdm.asyncio import tqdm_asyncio

-from deepeval.models.base_model import DeepEvalBaseLLM
+from examples.ags.benchmark.utils import generate_random_indices
+
+def extract_number(text: str) -> Optional[float]:
+    """Clean the text and extract a single number."""
+    matches = re.findall(r"[-+]?\d+(?:,\d{3})*(?:\.\d+)?|\d+\.\d+", text)
+    if matches:
+        last_number = matches[-1].replace(",", "")
+        try:
+            return float(last_number)
+        except ValueError:
+            return None
+    else:
+        return None
+
+def loose_match_score(expected_output: str, prediction: str, tolerance: float = 1e-6) -> int:
+    """Loose-match scoring: compare the extracted numbers within a tolerance."""
+    expected_number = extract_number(expected_output)
+    predicted_number = extract_number(prediction)
+
+    if expected_number is None or predicted_number is None:
+        return 0
+
+    if abs(expected_number - predicted_number) <= tolerance:
+        return 1
+    else:
+        return 0

-# This is the base model format mandated by DeepEval; it does not need to be modified, only called
-class GraphModel(DeepEvalBaseLLM):
-    def __init__(self, graph):
-        self.solver = graph
+async def load_data(file_path: str, samples=1) -> List[dict]:
+    data = []
+    async with aiofiles.open(file_path, mode="r") as file:
+        async for line in file:
+            data.append(json.loads(line))
+    random_indices = generate_random_indices(len(data), samples)
+    data = [data[i] for i in random_indices]
+    return data
+
+def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> float:
+    """Save the results to a CSV file."""
+    df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
"score", "cost"]) + average_score = df["score"].mean() - def load_model(self): - pass + output_file = f"{path}/{average_score:.5f}.csv" + df.to_csv(output_file, index=False) + print(f"Results saved to {output_file}") + return average_score - async def a_generate(self, prompt: str) -> str: - # TODO 还需要在这里继续整合Cost - solution_result, total_cost = await self.solver(prompt) - return solution_result +async def evaluate_problem(input: str, graph: Callable, expected_output: str) -> Tuple[str, str, str, int, str]: + """评估单个问题""" + prompt = input + max_retries = 5 + retries = 0 - def generate(self, prompt: str) -> str: - loop = asyncio.get_event_loop() - solution_result = loop.run_until_complete(self.a_generate(prompt)) # 等待 a_generate 方法完成 - return solution_result + while retries < max_retries: + try: + prediction = await graph(prompt) + cost = prediction[1] + output = prediction[0]["solution"] - def get_model_name(self): - return "Custom Azure OpenAI Model" + score = loose_match_score(expected_output, output) + break + + except Exception as e: + retries += 1 + print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})") + + if retries == max_retries: + print("Maximum retries reached. Skipping this sample.") + output = None + cost = None + score = 0 + break + + return input, output, expected_output, score, cost + +async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 20) -> List[Tuple[str, str, str, int, str]]: + """评估所有问题""" + semaphore = asyncio.Semaphore(max_concurrent_tasks) + + async def sem_evaluate(problem): + async with semaphore: + input_text = problem["question"] + expected_output = problem["answer"] + return await evaluate_problem(input_text, graph, expected_output) + + tasks = [sem_evaluate(problem) for problem in data] + + return await tqdm_asyncio.gather(*tasks, desc="Evaluating problems", total=len(data)) + +async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float: + """GSM8K评估主函数""" + data = await load_data(file_path, samples) + results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5) + print(results) + average_score = save_results_to_csv(results, path=path) + print(f"Average score: {average_score:.5f}") + return average_score diff --git a/examples/ags/benchmark/utils.py b/examples/ags/benchmark/utils.py new file mode 100644 index 000000000..871d5e50a --- /dev/null +++ b/examples/ags/benchmark/utils.py @@ -0,0 +1,17 @@ +import numpy as np + +def generate_random_indices(n, n_samples, test=False): + """ + 生成随机索引 + """ + + def _set_seed(seed=42): + np.random.seed(seed) + + _set_seed() + indices = np.arange(n) + np.random.shuffle(indices) + if test: + return indices[n_samples:] + else: + return indices[:n_samples] \ No newline at end of file diff --git a/examples/ags/experiments/baselines/cot_gsm8k.py b/examples/ags/experiments/baselines/cot_gsm8k.py new file mode 100644 index 000000000..3e08ff47d --- /dev/null +++ b/examples/ags/experiments/baselines/cot_gsm8k.py @@ -0,0 +1,73 @@ +from examples.ags.scripts.operator import Operator +from examples.ags.scripts.graph import SolveGraph +from examples.ags.benchmark.gsm8k import gsm8k_evaluation +from examples.ags.scripts.operator_an import GenerateOp +from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig +from metagpt.llm import LLM +from pydantic import BaseModel, Field +from typing import Dict, Any + +GSM8K_PROMPT_GPT = """ +{question}\nPlease reason step by step, and put 
+"""
+
+GSM8K_PROMPT_DS = """
+{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
+"""
+
+class GenerateOp(BaseModel):
+    solution: str = Field(default="", description="solution for the problem")
+
+class CoTGenerate(Operator):
+    def __init__(self, llm: LLM, name: str = "Generate"):
+        super().__init__(name, llm)
+
+    async def __call__(self, problem, mode: str = None):
+        prompt = GSM8K_PROMPT_GPT.format(question=problem)
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
+        response = node.instruct_content.model_dump()
+        return response
+
+class CoTSolveGraph(SolveGraph):
+    def __init__(self, name: str, llm_config, dataset: str):
+        super().__init__(name, llm_config, dataset)
+        self.cot_generate = CoTGenerate(self.llm)
+
+    async def __call__(self, problem):
+        solution = await self.cot_generate(problem, mode="context_fill")
+        return solution, self.llm.cost_manager.total_cost
+
+if __name__ == "__main__":
+    async def main():
+        # llm_config = ModelsConfig.default().get("deepseek-coder")
+        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
+        llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K")
+        file_path = "examples/ags/data/gsm8k.jsonl"
+        samples = 1055
+        # samples = 100
+        path = "examples/ags/data/baselines/general"
+        score = await gsm8k_evaluation(graph, file_path, samples, path)
+        return score
+
+    import asyncio
+    asyncio.run(main())
+
+
+# self consistency operator; universal self consistency;
+
+# IO means no tricks at all: it measures the LLM's own raw performance, using the prompt that the model publisher used on the corresponding dataset.
+
+# deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106
+
+
+
+GENERATE_PROMPT = """
+Generate Solution for the following problem: {problem_description}
+"""
+
+# med ensemble
\ No newline at end of file
diff --git a/examples/ags/experiments/baselines/io.py b/examples/ags/experiments/baselines/io.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/ags/experiments/baselines/llm_debate.py b/examples/ags/experiments/baselines/llm_debate.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/ags/experiments/baselines/medprompt.py b/examples/ags/experiments/baselines/medprompt.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/ags/experiments/baselines/reflexion.py b/examples/ags/experiments/baselines/reflexion.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/ags/experiments/baselines/self_consistency.py b/examples/ags/experiments/baselines/self_consistency.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/ags/experiments/baselines/self_refine.py b/examples/ags/experiments/baselines/self_refine.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/ags/scripts/evaluator.py b/examples/ags/scripts/evaluator.py
index 898c0c2f1..b6b01875f 100644
--- a/examples/ags/scripts/evaluator.py
+++ b/examples/ags/scripts/evaluator.py
@@ -20,6 +20,9 @@ from sympy.parsing.latex import parse_latex
 from sympy.parsing.sympy_parser import parse_expr
 from tqdm.asyncio import tqdm_asyncio

+from examples.ags.benchmark.gsm8k import gsm8k_evaluation
+from examples.ags.benchmark.utils import generate_random_indices
+
 DatasetType = Literal["HumanEval", "MBPP", "Gsm8K", "MATH", "HotpotQA", "DROP"]


@@ -31,22 +34,6 @@ class Evaluator:
     def __init__(self, eval_path: str):
         self.eval_path = eval_path

-    def _generate_random_indices(self, n, n_samples, test=False):
-        """
-        Generate random indices.
-        """
-
-        def _set_seed(seed=42):
-            np.random.seed(seed)
-
-        _set_seed()
-        indices = np.arange(n)
-        np.random.shuffle(indices)
-        if test:
-            return indices[n_samples:]
-        else:
-            return indices[:n_samples]
-
     def validation_evaluate(self, dataset: DatasetType, graph, params: dict, path):
         """
         Evaluates on validation dataset.
@@ -74,131 +61,16 @@ class Evaluator:
         """
         Evaluate on GSM8K dataset.
         """
-
-        # Function that mocks loading the model
         async def load_graph():
             dataset = params["dataset"]
             llm_config = params["llm_config"]
+            return graph_class(name="Gsm8K", llm_config=llm_config, dataset=dataset)

-            graph = graph_class(name="Gsm8K", llm_config=llm_config, dataset=dataset)
-            return graph
-
-        # Clean the text and extract a single number
-        def extract_number(text: str) -> Optional[float]:
-            # Use a regular expression to extract numbers, both integers and floats
-            matches = re.findall(r"[-+]?\d+(?:,\d{3})*(?:\.\d+)?|\d+\.\d+", text)
-            print(matches)
-            if matches:
-                # Take the last matched number
-                last_number = matches[-1]
-
-                # Remove commas to normalize the format
-                last_number = last_number.replace(",", "")
-
-                try:
-                    return float(last_number)
-                except ValueError:
-                    return None
-            else:
-                return None
-
-        # Loose-match scoring function
-        def loose_match_score(expected_output: str, prediction: str, tolerance: float = 1e-6) -> int:
-            expected_number = extract_number(expected_output)
-            predicted_number = extract_number(prediction)
-
-            print(predicted_number)
-
-            # If either the expected or the predicted number is missing, count it as a mismatch
-            if expected_number is None or predicted_number is None:
-                return 0
-
-            # Compare the two extracted numbers, allowing some tolerance
-            if abs(expected_number - predicted_number) <= tolerance:
-                return 1  # numbers are close enough; treat as a match
-            else:
-                return 0  # numbers do not match
-
-        # Asynchronously evaluate a single problem
-        async def _evaluate_problem(input: str, graph, expected_output: str) -> Tuple[str, str, str, int, str]:
-            prompt = input
-            max_retries = 5
-            retries = 0
-
-            while retries < max_retries:
-                try:
-                    # Assume the model exposes an async generation function
-                    prediction = await graph(prompt) if graph else "None"  # placeholder; replace with the actual model generation logic
-                    cost = prediction[1]
-                    output = prediction[0]["solution"]
-
-                    score = loose_match_score(expected_output, prediction[0]["solution"])
-                    break
-
-                except Exception as e:
-                    retries += 1
-                    print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
-
-                    if retries == max_retries:
-                        print("Maximum retries reached. Skipping this sample.")
-                        output = None
-                        cost = None
-                        score = 0
-                        break
-
-            return input, output, expected_output, score, cost
-
-        # Asynchronously read the JSONL file
-        async def load_data(file_path: str) -> List[dict]:
-            data = []
-            async with aiofiles.open(file_path, mode="r") as file:
-                async for line in file:
-                    data.append(json.loads(line))
-            return data[:samples]
-
-        # Evaluate all problems in parallel
-        async def evaluate_all_problems(data: List[dict], graph, max_concurrent_tasks: int = 300):
-            semaphore = asyncio.Semaphore(max_concurrent_tasks)
-
-            async def sem_evaluate(problem):
-                async with semaphore:
-                    input_text = problem["question"]
-                    expected_output = problem["answer"]
-                    return await _evaluate_problem(input_text, graph, expected_output)
-
-            tasks = [sem_evaluate(problem) for problem in data]
-
-            # Use tqdm.gather to display a progress bar
-            return await tqdm_asyncio.gather(*tasks, desc="Evaluating problems", total=len(data))
-
-        # Save the results to a CSV file
-        def save_results_to_csv(results: List[Tuple[str, str, str, int]], path):
-            df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
-            average_score = df["score"].mean()
-
-            # Build the file name, keeping five decimal places
-            output_file = f"{path}/{average_score:.5f}.csv"
-            df.to_csv(output_file, index=False)
-            print(f"Results saved to {output_file}")
-
-            return average_score
-
-        async def gsm8k():
-            file_path = "examples/ags/w_action_node/data/gsm8k.jsonl"  # replace with the path to your JSONL file
-            data = await load_data(file_path)
-
-            graph = await load_graph()
-
-            results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
-
-            # Save the results to a CSV file and get the average score
-            average_score = save_results_to_csv(results, path=path)
-
-            print(f"Average score: {average_score:.5f}")
-            return average_score
-
-        score = await gsm8k()
-
+        graph = await load_graph()
+        file_path = "examples/ags/data/gsm8k.jsonl"
+
+        score = await gsm8k_evaluation(graph, file_path, samples, path)
+
         return score

     async def _math_eval(self, graph_class, params, path, samples: int = 200):
@@ -457,7 +329,7 @@ class Evaluator:

         return await tqdm_asyncio.gather(*tasks, desc="Evaluating MATH problems", total=len(data))

-        def save_results_to_csv(results: List[Tuple[str, str, str, int]], path):
+        def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path):
             df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
             average_score = df["score"].mean()
@@ -503,7 +375,7 @@ class Evaluator:
             async with aiofiles.open(file_path, mode="r") as file:
                 async for line in file:
                     data.append(json.loads(line))
-            random_indices = self._generate_random_indices(len(data), samples)
+            random_indices = generate_random_indices(len(data), samples)
             data = [data[i] for i in random_indices]
             return data

@@ -656,9 +528,6 @@ class Evaluator:
             normalized = " ".join(parts).strip()
             return normalized

-        # def exact_match_score(prediction, ground_truth):
-        #     return int(normalize_answer(prediction) == normalize_answer(ground_truth))
-
         def answer_to_bags(answer: str) -> Set[str]:
             raw_spans = [answer]

@@ -725,7 +594,7 @@ class Evaluator:
             async with aiofiles.open(file_path, mode="r") as file:
                 async for line in file:
                     data.append(json.loads(line))
-            random_indices = self._generate_random_indices(len(data), samples)
+            random_indices = generate_random_indices(len(data), samples)
             data = [data[i] for i in random_indices]
             return data

@@ -778,18 +647,6 @@ class Evaluator:

         return await tqdm_asyncio.gather(*tasks, desc="Evaluating problems", total=len(data))

-        # def save_results_to_jsonl(results: List[Tuple[str, str, str, str, int]], path):
-        #     avg_score = 0
-
-        #     with open(path, "w") as f:
-        #         for result in results:
-        #             f.write(json.dumps({"question": result[0], "prediction": result[1], "expected_output": result[2], "supporting_sentences": result[3], "score": result[4]}) + "\n")
-        #         avg_score += result[4]
-        #     print(f"Results saved to {path}")
-        #     avg_score /= len(results)
-
-        #     return avg_score
-
         def save_results_to_csv(results: List[Tuple[str, str, str, str, int]], path):
             df = pd.DataFrame(
                 results, columns=["question", "prediction", "expected_output", "supporting_sentences", "score"]
             )
@@ -834,7 +691,7 @@ class Evaluator:
             async with aiofiles.open(file_path, mode="r") as file:
                 async for line in file:
                     data.append(json.loads(line))
-            random_indices = self._generate_random_indices(len(data), samples)
+            random_indices = generate_random_indices(len(data), samples)
             data = [data[i] for i in random_indices]
             return data

@@ -1056,7 +913,7 @@ class Evaluator:
         with open(file_path, mode="r") as file:
             data = json.load(file)
         data = list(data.items())
-        random_indices = self._generate_random_indices(len(data), samples)
+        random_indices = generate_random_indices(len(data), samples)
         data = [data[i] for i in random_indices]
         return data
diff --git a/examples/ags/scripts/graph.py b/examples/ags/scripts/graph.py
index 1a49ee442..c8828f646 100644
--- a/examples/ags/scripts/graph.py
+++ b/examples/ags/scripts/graph.py
@@ -14,6 +14,7 @@ DatasetType = Literal["HumanEval", "MBPP", "Gsm8K", "MATH", "HotpotQa", "MMLU"]

 cost_manager = CostManager()

+# TODO This class should act as a base class; it should not be used directly like this
 class SolveGraph:
     def __init__(
         self,
@@ -25,7 +26,7 @@ class SolveGraph:
         self.dataset = dataset
         self.llm = create_llm_instance(llm_config)
         self.llm.cost_manager = CostManager()
-        self.generate = Generate()
+        self.generate = Generate(self.llm)

     async def __call__(self, problem: str):
         """
diff --git a/examples/ags/scripts/prompt.py b/examples/ags/scripts/prompt.py
index 6194ddfcb..2ddd73ed2 100644
--- a/examples/ags/scripts/prompt.py
+++ b/examples/ags/scripts/prompt.py
@@ -322,3 +322,15 @@ MATH_ANSWER_FORMAT_PROMPT = """
 ### Instructions
 Provide the answer as a numerical value only, without units or any additional text.
 """
+
+PYTHON_CODE_SOLVER_PROMPT = """You are a professional Python programmer. Your task is to write Python code based on the user's request. Make sure to add appropriate explanations and your personal thought process to your code. Additionally, all code should be encapsulated in Python code blocks.
+
+The packages you can use include: numpy, scipy, pandas, sympy, statsmodels, scikit-learn. If you attempt to import another external package and encounter an error, do not say it cannot be imported. Instead, try to write new code that avoids this issue.
+
+Always output complete code rather than just giving suggestions or partial modifications, as your code will be executed directly. If immediate execution is required to check for possible errors, include test cases in the code.
+
+In your response, only the code that needs to be run should be wrapped in multi-line code blocks. No other multi-line code blocks should appear. Your code needs to print the output after execution. Your code should not print error messages.
+
+Problem description: {problem}
+Please write Python code to solve this problem.
+""" diff --git a/metagpt/actions/action_node.py b/metagpt/actions/action_node.py index 6249343aa..4f04ef8f7 100644 --- a/metagpt/actions/action_node.py +++ b/metagpt/actions/action_node.py @@ -486,11 +486,18 @@ class ActionNode: def get_field_names(self): """ - Get the field names from the Pydantic model associated with this ActionNode. + 获取与此ActionNode关联的Pydantic模型的字段名称。 """ model_class = self.create_class() return model_class.model_fields.keys() + def get_field_types(self): + """ + 获取与此ActionNode关联的Pydantic模型的字段类型。 + """ + model_class = self.create_class() + return {field_name: field.annotation for field_name, field in model_class.model_fields.items()} + def xml_compile(self, context): # TODO 再来一版 @@ -529,20 +536,44 @@ class ActionNode: async def context_fill(self, context): """ - Fill Context with XML TAG + 使用XML标签填充上下文并根据字段类型进行转换,包括字符串、整数、布尔值、列表和字典类型 """ field_names = self.get_field_names() + field_types = self.get_field_types() + extracted_data = {} content = await self.llm.aask(context) - # TODO 自动解析类型标注的功能 - for field_name in field_names: - # Use regex to find content within XML tags matching the field name pattern = rf"<{field_name}>(.*?)" match = re.search(pattern, content, re.DOTALL) if match: - extracted_data[field_name] = match.group(1).strip() + raw_value = match.group(1).strip() + field_type = field_types.get(field_name) + + if field_type == str: + extracted_data[field_name] = raw_value + elif field_type == int: + try: + extracted_data[field_name] = int(raw_value) + except ValueError: + extracted_data[field_name] = 0 # 或者其他默认值 + elif field_type == bool: + extracted_data[field_name] = raw_value.lower() in ('true', 'yes', '1', 'on', 'True') + elif field_type == list: + try: + extracted_data[field_name] = eval(raw_value) + if not isinstance(extracted_data[field_name], list): + raise ValueError + except: + extracted_data[field_name] = [] # 默认空列表 + elif field_type == dict: + try: + extracted_data[field_name] = eval(raw_value) + if not isinstance(extracted_data[field_name], dict): + raise ValueError + except: + extracted_data[field_name] = {} # 默认空字典 return extracted_data