From b9a2d94da286b0cb33badbb0ceb13515b9a33c55 Mon Sep 17 00:00:00 2001
From: didi <84363704+didiforgithub@users.noreply.github.com>
Date: Wed, 11 Sep 2024 15:21:33 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BA=86xml-compile=E6=96=B9?=
 =?UTF-8?q?=E6=B3=95=EF=BC=8C=E6=9B=B4=E6=96=B0=E4=BA=86=E5=89=A9=E4=BD=99?=
 =?UTF-8?q?Baseline?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/ags/benchmark/gsm8k.py               |   2 +-
 .../ags/experiments/baselines/llm_debate.py   |   0
 .../baselines/multi_persona_gsm8k.py          | 119 ++++++++++++++++++
 .../ags/experiments/baselines/reflexion.py    |   0
 .../ags/experiments/baselines/self_refine.py  |   0
 .../baselines/self_refine_gsm8k.py            | 115 +++++++++++++++++
 metagpt/actions/action_node.py                |   2 +-
 7 files changed, 236 insertions(+), 2 deletions(-)
 delete mode 100644 examples/ags/experiments/baselines/llm_debate.py
 create mode 100644 examples/ags/experiments/baselines/multi_persona_gsm8k.py
 delete mode 100644 examples/ags/experiments/baselines/reflexion.py
 delete mode 100644 examples/ags/experiments/baselines/self_refine.py
 create mode 100644 examples/ags/experiments/baselines/self_refine_gsm8k.py

diff --git a/examples/ags/benchmark/gsm8k.py b/examples/ags/benchmark/gsm8k.py
index ddc4a0169..3e63ce914 100644
--- a/examples/ags/benchmark/gsm8k.py
+++ b/examples/ags/benchmark/gsm8k.py
@@ -64,7 +64,7 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
     prompt = input
     max_retries = 5
     retries = 0
-
+    
     while retries < max_retries:
         try:
             prediction = await graph(prompt)
diff --git a/examples/ags/experiments/baselines/llm_debate.py b/examples/ags/experiments/baselines/llm_debate.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/ags/experiments/baselines/multi_persona_gsm8k.py b/examples/ags/experiments/baselines/multi_persona_gsm8k.py
new file mode 100644
index 000000000..6737c9405
--- /dev/null
+++ b/examples/ags/experiments/baselines/multi_persona_gsm8k.py
@@ -0,0 +1,119 @@
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.gsm8k import gsm8k_evaluation
+from metagpt.actions.action_node import ActionNode 
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+from typing import List
+
+DEBATE_INITIAL_PROMPT = """
+{question}
+Please think step by step and then solve this task.
+"""  # First-round template: DebateAgent prepends its role sentence and fills {question}.
+
+DEBATE_PROMPT = """
+{question}
+Considering the solutions provided by other agents as additional suggestions. Please think carefully and provide an updated answer.
+"""  # Template used when context is given: DebateAgent appends the newline-joined context after it.
+
+FINAL_DECISION_PROMPT = """
+{question}
+Considering all the thinking processes and answers:
+{all_thinking}
+{all_answers}
+Please reason carefully and provide the final answer. To ensure accuracy, only provide the answer in the solution, without any steps.
+"""  # FinalDecisionAgent template; {all_thinking} and {all_answers} receive newline-joined lists.
+
+class DebateOp(BaseModel):  # Reply schema that DebateAgent asks ActionNode to fill.
+    thinking: str = Field(default="", description="thinking process")  # reasoning text; read later as result["thinking"]
+    answer: str = Field(default="", description="answer")  # answer text; read later as result["answer"]
+
+class FinalDecisionOp(BaseModel):  # Reply schema that FinalDecisionAgent asks ActionNode to fill.
+    thinking: str = Field(default="", description="final thinking process")  # closing reasoning text
+    solution: str = Field(default="", description="final answer")  # final answer text
+
+class DebateAgent(Operator):
+    """Operator that answers a problem in the voice of one fixed persona (``role``)."""
+    def __init__(self, llm: LLM, name: str, role: str):
+        super().__init__(name, llm)
+        self.role = role  # persona label interpolated into every prompt
+
+    async def __call__(self, problem: str, context: List[str] = None, mode: str = None):
+        """Prompt the LLM as this persona and return the filled DebateOp as a dict."""
+        persona = f"You are a {self.role}. Based on your professional knowledge and thinking style,"
+        if context is not None:
+            # Context supplied: its lines are newline-joined and appended after the template.
+            body = DEBATE_PROMPT.format(question=problem) + "\n".join(context)
+        else:
+            body = DEBATE_INITIAL_PROMPT.format(question=problem)
+        optional = {"mode": mode} if mode else {}  # forward mode only when truthy
+        filled = await ActionNode.from_pydantic(DebateOp).fill(context=persona + body, llm=self.llm, **optional)
+        return filled.instruct_content.model_dump()
+
+class FinalDecisionAgent(Operator):
+    """Operator that condenses the debaters' thinking and answers into one FinalDecisionOp."""
+    def __init__(self, llm: LLM, name: str = "FinalDecision"):
+        super().__init__(name, llm)
+
+    async def __call__(self, problem: str, all_thinking: List[str], all_answers: List[str], mode: str = None):
+        """Fill FINAL_DECISION_PROMPT and return the model's FinalDecisionOp as a dict."""
+        joined_thinking = "\n".join(all_thinking)
+        joined_answers = "\n".join(all_answers)
+        request = FINAL_DECISION_PROMPT.format(
+            question=problem, all_thinking=joined_thinking, all_answers=joined_answers
+        )
+        optional = {"mode": mode} if mode else {}  # forward mode only when truthy
+        filled = await ActionNode.from_pydantic(FinalDecisionOp).fill(context=request, llm=self.llm, **optional)
+        return filled.instruct_content.model_dump()
+
+class MultiPersonaGraph(SolveGraph):
+    """Baseline graph: four persona agents debate for two rounds, then one agent decides."""
+    def __init__(self, name: str, llm_config, dataset: str):
+        super().__init__(name, llm_config, dataset)
+        personas = [
+            'Math Competition Champion',
+            'Elementary School Math Teacher',
+            'Math Professor',
+            'Computer Scientist',
+        ]
+        self.debate_agents = [
+            DebateAgent(self.llm, f"Debate Agent {idx}", persona) for idx, persona in enumerate(personas)
+        ]
+        self.final_decision_agent = FinalDecisionAgent(self.llm)
+
+    async def __call__(self, problem):
+        """Run the debate on ``problem``; return (final decision dict, ``self.llm.cost_manager.total_cost``)."""
+        agents = self.debate_agents
+        thinking_prev, answers_prev = [], []  # outputs of the round that just finished
+        for round_no in range(2):
+            thinking_now, answers_now = [], []
+            for me, agent in enumerate(agents):
+                if round_no == 0:
+                    reply = await agent(problem, mode="context_fill")
+                else:
+                    # Own previous reasoning first, then every peer's, in agent order.
+                    notes = [f"{agent.role}'s previous round thinking: {thinking_prev[me]}"]
+                    notes += [f"{peer.role}'s thinking: {thinking_prev[k]}" for k, peer in enumerate(agents) if k != me]
+                    reply = await agent(problem, notes, mode="context_fill")
+                thinking_now.append(reply["thinking"])
+                answers_now.append(reply["answer"])
+            thinking_prev, answers_prev = thinking_now, answers_now
+
+        final_thinking = [f"{a.role}'s final thinking: {t}" for a, t in zip(agents, thinking_prev)]
+        final_answers = [f"{a.role}'s final answer: {t}" for a, t in zip(agents, answers_prev)]
+        verdict = await self.final_decision_agent(problem, final_thinking, final_answers, mode="context_fill")
+        return verdict, self.llm.cost_manager.total_cost
+
+if __name__ == "__main__":
+    import asyncio
+
+    async def main():
+        """Score the multi-persona baseline on a single GSM8K sample."""
+        config = ModelsConfig.default().get("deepseek-coder")
+        solver = MultiPersonaGraph(name="multi-persona", llm_config=config, dataset="Gsm8K")
+        data_file, eval_path = "examples/ags/data/gsm8k.jsonl", "examples/ags/data/baselines/general"
+        score, cost = await gsm8k_evaluation(solver, data_file, 1, eval_path)
+        return score, cost
+
+    asyncio.run(main())
\ No newline at end of file
diff --git a/examples/ags/experiments/baselines/reflexion.py b/examples/ags/experiments/baselines/reflexion.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/ags/experiments/baselines/self_refine.py b/examples/ags/experiments/baselines/self_refine.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/ags/experiments/baselines/self_refine_gsm8k.py b/examples/ags/experiments/baselines/self_refine_gsm8k.py
new file mode 100644
index 000000000..4ab23be94
--- /dev/null
+++ b/examples/ags/experiments/baselines/self_refine_gsm8k.py
@@ -0,0 +1,115 @@
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.gsm8k import gsm8k_evaluation
+from metagpt.actions.action_node import ActionNode 
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+from typing import Dict, Any
+
+GSM8K_PROMPT_GPT = """
+{question}\nPlease reason step by step, and put your final answer in the end. Wrap content using xml tags.
+"""  # Template filled by CoTGenerate; asks for XML-tagged output.
+
+GSM8K_PROMPT_DS = """
+{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
+"""  # Boxed-answer variant; not referenced in the visible part of this file.
+
+REVIEW_PROMPT = """
+For the question described as {question},
+please review the following solution: {solution}, and criticize on where might be wrong. You should provide a review result in boolean format.
+If you believe the solution is capable of resolving the issue, return True; otherwise, return False, and include your feedback.
+"""  # Template filled by Review with {question} and {solution}.
+
+REVISE_PROMPT = """
+For the question described as {question}, \nand an error solution: {solution}, \nwith the feedback: {feedback},
+Given the previous solution and feedback, carefully refine the solution to solve the question and ensure it aligns with the original format.
+"""  # Template filled by Revise with {question}, {solution} and {feedback}.
+
+class GenerateOp(BaseModel):  # Reply schema that CoTGenerate asks ActionNode to fill.
+    solution: str = Field(default="", description="solution for the problem")  # generated solution text
+
+class ReviewOp(BaseModel):  # Reply schema that the Review operator asks ActionNode to fill.
+    review_result: bool = Field(  # SelfRefineGraph stops refining once this is truthy
+        default=False,
+        description="The Review Result (Bool). If you think this solution looks good for you, return 'true'; If not, return 'false'",
+    )
+    feedback: str = Field(  # handed to Revise when review_result is falsy
+        default="",
+        description="Your FeedBack for this problem based on the criteria. If the review result is true, you can put it 'nothing here'.",
+    )
+
+
+class ReviseOp(BaseModel):  # Reply schema that the Revise operator asks ActionNode to fill.
+    solution: str = Field(default="", description="Based on the feedback, revised solution for this problem")  # revised solution text
+
+
+class CoTGenerate(Operator):
+    """Operator that requests a step-by-step solution shaped like GenerateOp."""
+    def __init__(self, llm: LLM, name: str = "Generate"):
+        super().__init__(name, llm)
+
+    async def __call__(self, problem, mode: str = None):
+        """Fill GSM8K_PROMPT_GPT with ``problem`` and return the GenerateOp as a dict."""
+        request = GSM8K_PROMPT_GPT.format(question=problem)
+        optional = {"mode": mode} if mode else {}  # forward mode only when truthy
+        filler = ActionNode.from_pydantic(GenerateOp)
+        filled = await filler.fill(context=request, llm=self.llm, **optional)
+        return filled.instruct_content.model_dump()
+    
+class Review(Operator):
+    """Operator that asks the model to judge a candidate solution, shaped like ReviewOp."""
+    def __init__(self, llm: LLM, name: str = "Review"):
+        super().__init__(name, llm)
+
+    async def __call__(self, problem, solution, mode: str = None):
+        """Fill REVIEW_PROMPT and return the model's ReviewOp as a dict."""
+        request = REVIEW_PROMPT.format(question=problem, solution=solution)
+        optional = {"mode": mode} if mode else {}  # forward mode only when truthy
+        filler = ActionNode.from_pydantic(ReviewOp)
+        filled = await filler.fill(context=request, llm=self.llm, **optional)
+        return filled.instruct_content.model_dump()
+
+class Revise(Operator):
+    """Rewrite a rejected solution from feedback; ``name`` precedes ``llm`` here, so pass ``llm=`` by keyword."""
+    def __init__(self, name: str = "Revise", llm: LLM = None):
+        # Create the fallback client per instance; an ``LLM()`` default is evaluated once, at import time.
+        super().__init__(name, LLM() if llm is None else llm)
+
+    async def __call__(self, problem, solution, feedback, mode: str = None):
+        """Fill REVISE_PROMPT and return the model's ReviseOp as a dict."""
+        request = REVISE_PROMPT.format(question=problem, solution=solution, feedback=feedback)
+        optional = {"mode": mode} if mode else {}  # forward mode only when truthy
+        filled = await ActionNode.from_pydantic(ReviseOp).fill(context=request, llm=self.llm, **optional)
+        return filled.instruct_content.model_dump()
+
+class SelfRefineGraph(SolveGraph):  # Baseline: generate a solution, then review/revise it for at most five rounds.
+    def __init__(self, name: str, llm_config, dataset: str):
+        super().__init__(name, llm_config, dataset)
+        self.cot_generate, self.review = CoTGenerate(self.llm), Review(self.llm)
+        self.revise = Revise(llm=self.llm)  # keyword: Revise's first positional parameter is ``name``, not ``llm``
+
+    async def __call__(self, problem):
+        """Return (latest solution dict, ``self.llm.cost_manager.total_cost``) for ``problem``."""
+        solution = await self.cot_generate(problem, mode="context_fill")
+        for _ in range(5):
+            verdict = await self.review(problem, solution)  # NOTE(review): no mode passed, unlike cot_generate - confirm
+            if verdict["review_result"]:
+                break
+            solution = await self.revise(problem, solution, verdict["feedback"])
+        return solution, self.llm.cost_manager.total_cost
+
+if __name__ == "__main__":
+    import asyncio
+
+    async def main():
+        """Score the self-refine baseline on ten GSM8K samples."""
+        # Alternative model keys left by the author: "gpt-4o-mini", "gpt-35-turbo-1106".
+        config = ModelsConfig.default().get("deepseek-coder")
+        solver = SelfRefineGraph(name="self-refine", llm_config=config, dataset="Gsm8K")
+        data_file = "examples/ags/data/gsm8k.jsonl"
+        eval_path = "examples/ags/data/baselines/general"
+        score, cost = await gsm8k_evaluation(solver, data_file, 10, eval_path)
+        return score, cost
+
+    asyncio.run(main())
diff --git a/metagpt/actions/action_node.py b/metagpt/actions/action_node.py
index 4f04ef8f7..0ab8c7207 100644
--- a/metagpt/actions/action_node.py
+++ b/metagpt/actions/action_node.py
@@ -511,7 +511,7 @@ class ActionNode:
         example_str = "\n".join(examples)
         # Add the example to the context
         context += f"""
-### response format (must be strictly followed) (do not include any other formats except for the given XML format): \n
+### Response format (must be strictly followed): All content must be enclosed in the given XML tags, ensuring each opening <tag> has a corresponding closing </tag>, with no incomplete or self-closing tags allowed.\n
 {example_str}
 """
         return context