From b9a2d94da286b0cb33badbb0ceb13515b9a33c55 Mon Sep 17 00:00:00 2001
From: didi <84363704+didiforgithub@users.noreply.github.com>
Date: Wed, 11 Sep 2024 15:21:33 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BA=86xml-compile=E6=96=B9?=
 =?UTF-8?q?=E6=B3=95=EF=BC=8C=E6=9B=B4=E6=96=B0=E4=BA=86=E5=89=A9=E4=BD=99?=
 =?UTF-8?q?Baseline?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/ags/benchmark/gsm8k.py               |   2 +-
 .../ags/experiments/baselines/llm_debate.py   |   0
 .../baselines/multi_persona_gsm8k.py          | 144 ++++++++++++++++++
 .../ags/experiments/baselines/reflexion.py    |   0
 .../ags/experiments/baselines/self_refine.py  |   0
 .../baselines/self_refine_gsm8k.py            | 133 +++++++++++++++++
 metagpt/actions/action_node.py                |   2 +-
 7 files changed, 279 insertions(+), 2 deletions(-)
 delete mode 100644 examples/ags/experiments/baselines/llm_debate.py
 create mode 100644 examples/ags/experiments/baselines/multi_persona_gsm8k.py
 delete mode 100644 examples/ags/experiments/baselines/reflexion.py
 delete mode 100644 examples/ags/experiments/baselines/self_refine.py
 create mode 100644 examples/ags/experiments/baselines/self_refine_gsm8k.py

diff --git a/examples/ags/benchmark/gsm8k.py b/examples/ags/benchmark/gsm8k.py
index ddc4a0169..3e63ce914 100644
--- a/examples/ags/benchmark/gsm8k.py
+++ b/examples/ags/benchmark/gsm8k.py
@@ -64,7 +64,7 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
     prompt = input
     max_retries = 5
     retries = 0
-    
+
     while retries < max_retries:
         try:
             prediction = await graph(prompt)
diff --git a/examples/ags/experiments/baselines/llm_debate.py b/examples/ags/experiments/baselines/llm_debate.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/ags/experiments/baselines/multi_persona_gsm8k.py b/examples/ags/experiments/baselines/multi_persona_gsm8k.py
new file mode 100644
index 000000000..6737c9405
--- /dev/null
+++ b/examples/ags/experiments/baselines/multi_persona_gsm8k.py
@@ -0,0 +1,144 @@
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.gsm8k import gsm8k_evaluation
+from metagpt.actions.action_node import ActionNode
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+from typing import List
+
+# Round-one prompt: a persona answers the question without any other agent's input.
+DEBATE_INITIAL_PROMPT = """
+{question}
+Please think step by step and then solve this task.
+"""
+
+# Later-round prompt: DebateAgent appends the joined context lines right after it.
+DEBATE_PROMPT = """
+{question}
+Considering the solutions provided by other agents as additional suggestions. Please think carefully and provide an updated answer.
+"""
+
+# Aggregation prompt filled in by FinalDecisionAgent.
+FINAL_DECISION_PROMPT = """
+{question}
+Considering all the thinking processes and answers:
+{all_thinking}
+{all_answers}
+Please reason carefully and provide the final answer. To ensure accuracy, only provide the answer in the solution, without any steps.
+"""
+
+# Structured output of one debate turn.
+class DebateOp(BaseModel):
+    thinking: str = Field(default="", description="thinking process")
+    answer: str = Field(default="", description="answer")
+
+# Structured output of the final decision step.
+class FinalDecisionOp(BaseModel):
+    thinking: str = Field(default="", description="final thinking process")
+    solution: str = Field(default="", description="final answer")
+
+class DebateAgent(Operator):
+    """Operator that answers a problem in the voice of one persona (``role``)."""
+
+    def __init__(self, llm: LLM, name: str, role: str):
+        super().__init__(name, llm)
+        self.role = role
+
+    async def __call__(self, problem: str, context: List[str] = None, mode: str = None):
+        """Produce this persona's thinking and answer for ``problem``.
+
+        Args:
+            problem: Question text substituted into the prompt.
+            context: Statements carried over from the previous round; ``None``
+                selects the initial prompt instead of the debate prompt.
+            mode: Forwarded to ``ActionNode.fill`` only when truthy.
+
+        Returns:
+            The filled ``DebateOp`` dumped to a dict (``thinking``, ``answer``).
+        """
+        role_prompt = f"You are a {self.role}. Based on your professional knowledge and thinking style,"
+        if context is None:
+            prompt = role_prompt + DEBATE_INITIAL_PROMPT.format(question=problem)
+        else:
+            prompt = role_prompt + DEBATE_PROMPT.format(question=problem) + "\n".join(context)
+
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(DebateOp).fill(**fill_kwargs)
+        return node.instruct_content.model_dump()
+
+class FinalDecisionAgent(Operator):
+    """Operator that condenses the personas' thinking and answers into one solution."""
+
+    def __init__(self, llm: LLM, name: str = "FinalDecision"):
+        super().__init__(name, llm)
+
+    async def __call__(self, problem: str, all_thinking: List[str], all_answers: List[str], mode: str = None):
+        """Return the filled ``FinalDecisionOp`` as a dict (``thinking``, ``solution``)."""
+        prompt = FINAL_DECISION_PROMPT.format(
+            question=problem,
+            all_thinking="\n".join(all_thinking),
+            all_answers="\n".join(all_answers)
+        )
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(FinalDecisionOp).fill(**fill_kwargs)
+        return node.instruct_content.model_dump()
+
+class MultiPersonaGraph(SolveGraph):
+    """Two debate rounds between four fixed personas, followed by a final decision."""
+
+    def __init__(self, name: str, llm_config, dataset: str):
+        super().__init__(name, llm_config, dataset)
+        self.debate_agents = [
+            DebateAgent(self.llm, f"Debate Agent {i}", role)
+            for i, role in enumerate([
+                'Math Competition Champion',
+                'Elementary School Math Teacher',
+                'Math Professor',
+                'Computer Scientist'
+            ])
+        ]
+        self.final_decision_agent = FinalDecisionAgent(self.llm)
+
+    async def __call__(self, problem):
+        """Run the debate on ``problem`` and return ``(final_decision_dict, total_cost)``."""
+        max_round = 2
+        all_thinking = [[] for _ in range(max_round)]
+        all_answers = [[] for _ in range(max_round)]
+
+        for r in range(max_round):
+            for i, agent in enumerate(self.debate_agents):
+                if r == 0:
+                    result = await agent(problem, mode="context_fill")
+                else:
+                    # The agent's own previous thinking first, then every other persona's.
+                    context = [f"{agent.role}'s previous round thinking: {all_thinking[r-1][i]}"] + \
+                        [f"{self.debate_agents[j].role}'s thinking: {all_thinking[r-1][j]}" for j in range(len(self.debate_agents)) if j != i]
+                    result = await agent(problem, context, mode="context_fill")
+                all_thinking[r].append(result["thinking"])
+                all_answers[r].append(result["answer"])
+
+        final_result = await self.final_decision_agent(
+            problem,
+            [f"{agent.role}'s final thinking: {thinking}" for agent, thinking in zip(self.debate_agents, all_thinking[-1])],
+            [f"{agent.role}'s final answer: {answer}" for agent, answer in zip(self.debate_agents, all_answers[-1])],
+            mode="context_fill"
+        )
+        return final_result, self.llm.cost_manager.total_cost
+
+if __name__ == "__main__":
+    async def main():
+        llm_config = ModelsConfig.default().get("deepseek-coder")
+        graph = MultiPersonaGraph(name="multi-persona", llm_config=llm_config, dataset="Gsm8K")
+        file_path = "examples/ags/data/gsm8k.jsonl"
+        samples = 1
+        path = "examples/ags/data/baselines/general"
+        score, cost = await gsm8k_evaluation(graph, file_path, samples, path)
+        return score, cost
+
+    import asyncio
+    asyncio.run(main())
diff --git a/examples/ags/experiments/baselines/reflexion.py b/examples/ags/experiments/baselines/reflexion.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/ags/experiments/baselines/self_refine.py b/examples/ags/experiments/baselines/self_refine.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/ags/experiments/baselines/self_refine_gsm8k.py b/examples/ags/experiments/baselines/self_refine_gsm8k.py
new file mode 100644
index 000000000..4ab23be94
--- /dev/null
+++ b/examples/ags/experiments/baselines/self_refine_gsm8k.py
@@ -0,0 +1,133 @@
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.gsm8k import gsm8k_evaluation
+from metagpt.actions.action_node import ActionNode
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+from typing import Dict, Any
+
+# Generation prompt used by CoTGenerate; asks for XML-wrapped content.
+GSM8K_PROMPT_GPT = """
+{question}\nPlease reason step by step, and put your final answer in the end. Wrap content using xml tags.
+"""
+
+# Alternative generation prompt asking for a \boxed{} answer; not referenced elsewhere in this file.
+GSM8K_PROMPT_DS = """
+{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
+"""
+
+REVIEW_PROMPT = """
+For the question described as {question},
+please review the following solution: {solution}, and criticize on where might be wrong. You should provide a review result in boolean format.
+If you believe the solution is capable of resolving the issue, return True; otherwise, return False, and include your feedback.
+"""
+
+REVISE_PROMPT = """
+For the question described as {question}, \nand an error solution: {solution}, \nwith the feedback: {feedback},
+Given the previous solution and feedback, carefully refine the solution to solve the question and ensure it aligns with the original format.
+"""
+
+class GenerateOp(BaseModel):
+    solution: str = Field(default="", description="solution for the problem")
+
+class ReviewOp(BaseModel):
+    review_result: bool = Field(
+        default=False,
+        description="The Review Result (Bool). If you think this solution looks good for you, return 'true'; If not, return 'false'",
+    )
+    feedback: str = Field(
+        default="",
+        description="Your FeedBack for this problem based on the criteria. If the review result is true, you can put it 'nothing here'.",
+    )
+
+
+class ReviseOp(BaseModel):
+    solution: str = Field(default="", description="Based on the feedback, revised solution for this problem")
+
+
+class CoTGenerate(Operator):
+    """Operator that drafts a step-by-step solution from ``GSM8K_PROMPT_GPT``."""
+
+    def __init__(self, llm: LLM, name: str = "Generate"):
+        super().__init__(name, llm)
+
+    async def __call__(self, problem, mode: str = None):
+        """Return the filled ``GenerateOp`` as a dict with the key ``solution``."""
+        prompt = GSM8K_PROMPT_GPT.format(question=problem)
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
+        response = node.instruct_content.model_dump()
+        return response
+
+class Review(Operator):
+    """Operator that judges a candidate solution and supplies feedback."""
+
+    def __init__(self, llm: LLM, name: str = "Review"):
+        super().__init__(name, llm)
+
+    async def __call__(self, problem, solution, mode: str = None):
+        """Return the filled ``ReviewOp`` as a dict (``review_result``, ``feedback``)."""
+        prompt = REVIEW_PROMPT.format(question=problem, solution=solution)
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(ReviewOp).fill(**fill_kwargs)
+        response = node.instruct_content.model_dump()
+        return response
+
+class Revise(Operator):
+    """Operator that rewrites a rejected solution using the reviewer's feedback."""
+
+    def __init__(self, name: str = "Revise", llm: LLM = None):
+        # Build the fallback client per instance: an ``LLM()`` default in the
+        # signature is evaluated once, when the module is imported, and shared.
+        super().__init__(name, llm if llm is not None else LLM())
+
+    async def __call__(self, problem, solution, feedback, mode: str = None):
+        """Return the filled ``ReviseOp`` as a dict with the key ``solution``."""
+        prompt = REVISE_PROMPT.format(question=problem, solution=solution, feedback=feedback)
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(ReviseOp).fill(**fill_kwargs)
+        response = node.instruct_content.model_dump()
+        return response
+
+class SelfRefineGraph(SolveGraph):
+    """Generate a solution, then review and revise it for at most five rounds."""
+
+    def __init__(self, name: str, llm_config, dataset: str):
+        super().__init__(name, llm_config, dataset)
+        self.cot_generate = CoTGenerate(self.llm)
+        self.review = Review(self.llm)
+        # Revise takes ``name`` first, so the client has to be passed by keyword.
+        self.revise = Revise(llm=self.llm)
+
+    async def __call__(self, problem):
+        """Return ``(solution_dict, total_cost)`` for ``problem``."""
+        solution = await self.cot_generate(problem, mode="context_fill")
+        for _ in range(5):
+            # Operators return dicts; the prompts should see the solution text itself.
+            review = await self.review(problem, solution["solution"])
+            if review["review_result"]:
+                break
+            solution = await self.revise(problem, solution["solution"], review["feedback"])
+        return solution, self.llm.cost_manager.total_cost
+
+if __name__ == "__main__":
+    async def main():
+        llm_config = ModelsConfig.default().get("deepseek-coder")
+        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
+        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        graph = SelfRefineGraph(name="self-refine", llm_config=llm_config, dataset="Gsm8K")
+        file_path = "examples/ags/data/gsm8k.jsonl"
+        samples = 10
+        path = "examples/ags/data/baselines/general"
+        score, cost = await gsm8k_evaluation(graph, file_path, samples, path)
+        return score, cost
+
+    import asyncio
+    asyncio.run(main())
diff --git a/metagpt/actions/action_node.py b/metagpt/actions/action_node.py
index 4f04ef8f7..0ab8c7207 100644
--- a/metagpt/actions/action_node.py
+++ b/metagpt/actions/action_node.py
@@ -511,7 +511,7 @@ class ActionNode:
         example_str = "\n".join(examples)
         # Add the example to the context
         context += f"""
-### response format (must be strictly followed) (do not include any other formats except for the given XML format): \n
+### Response format (must be strictly followed): All content must be enclosed in the given XML tags, ensuring each opening <tag> has a corresponding closing </tag>, with no incomplete or self-closing tags allowed.\n
 {example_str}
 """
         return context