def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]:
    """Load a JSON object from disk and return a sampled subset of its items.

    Args:
        file_path: Path to a JSON file whose top-level value is an object
            (its ``.items()`` are taken).
        samples: Sample count forwarded to ``generate_random_indices``.

    Returns:
        The ``(key, value)`` pairs located at the indices that
        ``generate_random_indices(len(items), samples)`` returns, in the
        order the helper returns them.
    """
    # JSON text is UTF-8 by specification; pin the encoding so the result
    # does not depend on the platform's locale default.
    with open(file_path, mode="r", encoding="utf-8") as file:
        records = list(json.load(file).items())

    # NOTE(review): how generate_random_indices behaves when
    # samples > len(records) is not visible from here - confirm in utils.
    random_indices = generate_random_indices(len(records), samples)
    return [records[i] for i in random_indices]
class GenerateOp(BaseModel):
    # Output schema passed to ActionNode.from_pydantic: a single text field
    # holding the answer to the question.
    answer: str = Field(default="", description="问题的答案")


class CoTGenerate(Operator):
    """Operator that issues one chain-of-thought prompt built from DROP_PROMPT."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        # Operator.__init__ is called as (name, llm); presumably it stores
        # the llm as self.llm, which __call__ reads - confirm in Operator.
        super().__init__(name, llm)

    async def __call__(self, question: str, context: str, mode: str = None) -> str:
        """Fill a GenerateOp node for one question and return its answer text.

        Args:
            question: Substituted into the ``{question}`` slot of DROP_PROMPT.
            context: Substituted into the ``{context}`` slot of DROP_PROMPT.
            mode: Forwarded to ``ActionNode.fill`` as ``mode`` only when truthy.

        Returns:
            The ``"answer"`` entry of the filled node's ``model_dump()``.
            (The previous ``Tuple[str, str]`` annotation did not match this.)
        """
        prompt = DROP_PROMPT.format(question=question, context=context)
        fill_kwargs = {"context": prompt, "llm": self.llm}
        if mode:
            fill_kwargs["mode"] = mode
        node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
        response = node.instruct_content.model_dump()
        return response["answer"]


class CoTSolveGraph(SolveGraph):
    """Graph with a single CoTGenerate step."""

    def __init__(self, name: str, llm_config, dataset: str):
        super().__init__(name, llm_config, dataset)
        # self.llm is presumably created by SolveGraph.__init__ - confirm.
        self.cot_generate = CoTGenerate(self.llm)

    async def __call__(self, question: str, context: str) -> str:
        """Run the generate step in ``context_fill`` mode and return its answer.

        Returns a single string, not a tuple as previously annotated.
        """
        answer = await self.cot_generate(question, context, mode="context_fill")
        return answer
class GenerateOp(BaseModel):
    # Output schema passed to ActionNode.from_pydantic: one text field.
    solution: str = Field(default="", description="solution for the problem")


class CoTGenerate(Operator):
    """Operator that sends one MATH_PROMPT_GPT prompt per problem."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        super().__init__(name, llm)

    async def __call__(self, problem, mode: str = None):
        """Fill a GenerateOp node for ``problem``; return its ``model_dump()`` dict.

        ``mode`` is forwarded to ``ActionNode.fill`` only when it is truthy.
        """
        text = MATH_PROMPT_GPT.format(question=problem)
        request = dict(context=text, llm=self.llm)
        if mode:
            request.update(mode=mode)
        filled = await ActionNode.from_pydantic(GenerateOp).fill(**request)
        return filled.instruct_content.model_dump()


class CoTSolveGraph(SolveGraph):
    """Graph with a single CoTGenerate step that also reports LLM cost."""

    def __init__(self, name: str, llm_config, dataset: str):
        super().__init__(name, llm_config, dataset)
        self.cot_generate = CoTGenerate(self.llm)

    async def __call__(self, problem):
        """Return ``(generate_output, self.llm.cost_manager.total_cost)``."""
        result = await self.cot_generate(problem, mode="context_fill")
        spent = self.llm.cost_manager.total_cost
        return result, spent


if __name__ == "__main__":
    import asyncio

    async def main():
        # Alternative model configs kept for quick switching:
        # llm_config = ModelsConfig.default().get("deepseek-coder")
        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
        model_cfg = ModelsConfig.default().get("gpt-4o-mini")
        # NOTE(review): dataset is "Gsm8K" while math.jsonl and
        # math_evaluation are used below - confirm this is intended.
        solver = CoTSolveGraph(name="CoT", llm_config=model_cfg, dataset="Gsm8K")
        return await math_evaluation(
            solver,
            "examples/ags/data/math.jsonl",
            100,
            "examples/ags/data/baselines/general/math",
        )

    asyncio.run(main())


# Notes:
# - self consistency operator; universal self consistency;
# - "IO" means no tricks at all: it shows the LLM's own raw performance,
#   using the prompt the model's publisher used on the corresponding dataset.
# - deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106

GENERATE_PROMPT = """
Generate Solution for the following problem: {problem_description}
"""

# med ensemble
class GenerateOp(BaseModel):
    # Output schema passed to ActionNode.from_pydantic: one text field
    # holding the Python function that solves the task.
    solution: str = Field(default="", description="问题的Python函数实现")


class CoTGenerate(Operator):
    """Operator that sends one MBPP_PROMPT prompt per question."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        super().__init__(name, llm)

    async def __call__(self, question: str, mode: str = None) -> dict:
        """Fill a GenerateOp node for ``question`` and return the dumped fields.

        Args:
            question: Substituted into the ``{question}`` slot of MBPP_PROMPT.
            mode: Forwarded to ``ActionNode.fill`` as ``mode`` only when truthy.

        Returns:
            The ``model_dump()`` dict of the filled node. (The previous
            ``Tuple[str, str]`` annotation did not match this.)
        """
        prompt = MBPP_PROMPT.format(question=question)
        fill_kwargs = {"context": prompt, "llm": self.llm}
        if mode:
            fill_kwargs["mode"] = mode
        node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
        response = node.instruct_content.model_dump()
        return response


class CoTSolveGraph(SolveGraph):
    """Graph with a single CoTGenerate step."""

    def __init__(self, name: str, llm_config, dataset: str):
        super().__init__(name, llm_config, dataset)
        # self.llm is presumably created by SolveGraph.__init__ - confirm.
        self.cot_generate = CoTGenerate(self.llm)

    async def __call__(self, question: str) -> str:
        """Run the generate step in ``context_fill`` mode; return its ``"solution"``.

        Returns a single string, not a tuple as previously annotated.
        """
        response = await self.cot_generate(question, mode="context_fill")
        return response["solution"]


if __name__ == "__main__":
    async def main():
        llm_config = ModelsConfig.default().get("gpt-4o-mini")
        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
        graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="MBPP")
        file_path = "examples/ags/data/mbpp-new.jsonl"
        samples = 30
        path = "examples/ags/data/baselines/general/mbpp"
        score = await mbpp_evaluation(graph, file_path, samples, path)
        return score

    import asyncio
    asyncio.run(main())