更新了xml-compile方法,更新了剩余Baseline

This commit is contained in:
didi 2024-09-11 15:21:33 +08:00
parent f691c5f439
commit b9a2d94da2
7 changed files with 236 additions and 2 deletions

View file

@ -64,7 +64,7 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
prompt = input
max_retries = 5
retries = 0
while retries < max_retries:
try:
prediction = await graph(prompt)

View file

@ -0,0 +1,119 @@
from examples.ags.scripts.operator import Operator
from examples.ags.scripts.graph import SolveGraph
from examples.ags.benchmark.gsm8k import gsm8k_evaluation
from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.llm import LLM
from pydantic import BaseModel, Field
from typing import List
# Prompt templates for the multi-persona debate baseline.
# These strings are sent to the LLM verbatim — do not edit casually.

# Round 1: each persona solves the task independently.
DEBATE_INITIAL_PROMPT = """
{question}
Please think step by step and then solve this task.
"""

# Later rounds: each persona updates its answer; the peers' contributions
# are appended to this prompt by the caller (DebateAgent.__call__).
DEBATE_PROMPT = """
{question}
Considering the solutions provided by other agents as additional suggestions. Please think carefully and provide an updated answer.
"""

# Final aggregation: the judge reads every persona's thinking and answers
# and must emit only the final answer, without intermediate steps.
FINAL_DECISION_PROMPT = """
{question}
Considering all the thinking processes and answers:
{all_thinking}
{all_answers}
Please reason carefully and provide the final answer. To ensure accuracy, only provide the answer in the solution, without any steps.
"""
class DebateOp(BaseModel):
    """Structured output of one debate turn: reasoning trace plus an answer.

    NOTE(review): the Field descriptions are presumably used by ActionNode
    when building the response schema shown to the LLM — treat them as
    runtime behavior, not documentation; verify before changing.
    """

    # Step-by-step reasoning produced by the persona.
    thinking: str = Field(default="", description="thinking process")
    # The persona's answer for this round.
    answer: str = Field(default="", description="answer")
class FinalDecisionOp(BaseModel):
    """Structured output of the final-decision (judge) step.

    NOTE(review): the Field descriptions are presumably used by ActionNode
    when building the response schema shown to the LLM — treat them as
    runtime behavior; verify before changing.
    """

    # The judge's aggregated reasoning over all personas.
    thinking: str = Field(default="", description="final thinking process")
    # The final answer, expected without intermediate steps.
    solution: str = Field(default="", description="final answer")
class DebateAgent(Operator):
    """A role-played debater: answers independently in round 1, then revises
    its answer given the other agents' output in later rounds."""

    def __init__(self, llm: LLM, name: str, role: str):
        super().__init__(name, llm)
        self.role = role  # persona injected into every prompt

    async def __call__(self, problem: str, context: List[str] = None, mode: str = None):
        """Return the DebateOp fields (``thinking``/``answer``) as a plain dict."""
        persona = f"You are a {self.role}. Based on your professional knowledge and thinking style,"
        if context is None:
            # First round: no peer output to consider yet.
            full_prompt = persona + DEBATE_INITIAL_PROMPT.format(question=problem)
        else:
            # Later rounds: append the peers' contributions verbatim.
            full_prompt = persona + DEBATE_PROMPT.format(question=problem) + "\n".join(context)
        kwargs = dict(context=full_prompt, llm=self.llm)
        if mode:
            kwargs["mode"] = mode
        filled = await ActionNode.from_pydantic(DebateOp).fill(**kwargs)
        return filled.instruct_content.model_dump()
class FinalDecisionAgent(Operator):
    """Judge agent: reads every persona's thinking and answers and produces
    the single final answer."""

    def __init__(self, llm: LLM, name: str = "FinalDecision"):
        super().__init__(name, llm)

    async def __call__(self, problem: str, all_thinking: List[str], all_answers: List[str], mode: str = None):
        """Return the FinalDecisionOp fields (``thinking``/``solution``) as a dict."""
        joined_thinking = "\n".join(all_thinking)
        joined_answers = "\n".join(all_answers)
        decision_prompt = FINAL_DECISION_PROMPT.format(
            question=problem,
            all_thinking=joined_thinking,
            all_answers=joined_answers,
        )
        kwargs = dict(context=decision_prompt, llm=self.llm)
        if mode:
            kwargs["mode"] = mode
        filled = await ActionNode.from_pydantic(FinalDecisionOp).fill(**kwargs)
        return filled.instruct_content.model_dump()
class MultiPersonaGraph(SolveGraph):
    """Multi-persona debate baseline.

    Four role-played agents debate the problem for a fixed number of rounds;
    a judge agent then aggregates the final round into one answer.
    """

    # The personas debating each problem; each string is interpolated into
    # the agents' prompts, so the exact wording is part of runtime behavior.
    ROLES = [
        "Math Competition Champion",
        "Elementary School Math Teacher",
        "Math Professor",
        "Computer Scientist",
    ]

    def __init__(self, name: str, llm_config, dataset: str):
        super().__init__(name, llm_config, dataset)
        self.debate_agents = [
            DebateAgent(self.llm, f"Debate Agent {i}", role)
            for i, role in enumerate(self.ROLES)
        ]
        self.final_decision_agent = FinalDecisionAgent(self.llm)

    async def __call__(self, problem, max_round: int = 2):
        """Run the debate and return ``(final_result_dict, total_llm_cost)``.

        Args:
            problem: the question to solve.
            max_round: number of debate rounds. Was hard-coded to 2; kept as
                the default so existing callers behave identically.

        Raises:
            ValueError: if ``max_round`` is less than 1 (the judge needs at
                least one completed round to aggregate).
        """
        if max_round < 1:
            raise ValueError("max_round must be >= 1")
        all_thinking = [[] for _ in range(max_round)]
        all_answers = [[] for _ in range(max_round)]
        for r in range(max_round):
            for i, agent in enumerate(self.debate_agents):
                if r == 0:
                    # First round: independent answers, no peer context.
                    result = await agent(problem, mode="context_fill")
                else:
                    # Each agent sees its own and its peers' previous-round thinking.
                    context = [f"{agent.role}'s previous round thinking: {all_thinking[r-1][i]}"] + \
                        [f"{self.debate_agents[j].role}'s thinking: {all_thinking[r-1][j]}" for j in range(len(self.debate_agents)) if j != i]
                    result = await agent(problem, context, mode="context_fill")
                all_thinking[r].append(result["thinking"])
                all_answers[r].append(result["answer"])
        # The judge aggregates only the LAST round's thinking and answers.
        final_result = await self.final_decision_agent(
            problem,
            [f"{agent.role}'s final thinking: {thinking}" for agent, thinking in zip(self.debate_agents, all_thinking[-1])],
            [f"{agent.role}'s final answer: {answer}" for agent, answer in zip(self.debate_agents, all_answers[-1])],
            mode="context_fill",
        )
        return final_result, self.llm.cost_manager.total_cost
if __name__ == "__main__":
    import asyncio

    async def main():
        """Evaluate the multi-persona baseline on one GSM8K sample."""
        llm_config = ModelsConfig.default().get("deepseek-coder")
        graph = MultiPersonaGraph(name="multi-persona", llm_config=llm_config, dataset="Gsm8K")
        data_file = "examples/ags/data/gsm8k.jsonl"
        result_dir = "examples/ags/data/baselines/general"
        # One sample only — quick smoke run of the full pipeline.
        return await gsm8k_evaluation(graph, data_file, 1, result_dir)

    asyncio.run(main())

View file

@ -0,0 +1,115 @@
from examples.ags.scripts.operator import Operator
from examples.ags.scripts.graph import SolveGraph
from examples.ags.benchmark.gsm8k import gsm8k_evaluation
from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.llm import LLM
from pydantic import BaseModel, Field
from typing import Dict, Any
# Prompt templates for the self-refine baseline.
# These strings are sent to the LLM verbatim — do not edit casually.

# Generation prompt (GPT variant): answer wrapped in XML tags, matching the
# ActionNode xml-compile response format.
GSM8K_PROMPT_GPT = """
{question}\nPlease reason step by step, and put your final answer in the end. Wrap content using xml tags.
"""

# Generation prompt (DeepSeek variant): final answer inside \boxed{}.
# NOTE(review): currently unused by CoTGenerate below — confirm it is kept
# deliberately for switching models.
GSM8K_PROMPT_DS = """
{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
"""

# Asks the reviewer for a boolean verdict plus feedback on a candidate solution.
REVIEW_PROMPT = """
For the question described as {question},
please review the following solution: {solution}, and criticize on where might be wrong. You should provide a review result in boolean format.
If you believe the solution is capable of resolving the issue, return True; otherwise, return False, and include your feedback.
"""

# Asks for a refined solution given the previous attempt and reviewer feedback.
REVISE_PROMPT = """
For the question described as {question}, \nand an error solution: {solution}, \nwith the feedback: {feedback},
Given the previous solution and feedback, carefully refine the solution to solve the question and ensure it aligns with the original format.
"""
class GenerateOp(BaseModel):
    """Structured output of the CoT generation step.

    NOTE(review): the Field description is presumably shown to the LLM via
    ActionNode's response schema — treat it as runtime behavior.
    """

    # The full generated solution text.
    solution: str = Field(default="", description="solution for the problem")
class ReviewOp(BaseModel):
    """Structured output of the review step: verdict plus feedback.

    NOTE(review): the Field descriptions are presumably shown to the LLM via
    ActionNode's response schema — treat them as runtime behavior.
    """

    # True = solution accepted (stops the refine loop); False = needs revision.
    review_result: bool = Field(
        default=False,
        description="The Review Result (Bool). If you think this solution looks good for you, return 'true'; If not, return 'false'",
    )
    # Criticism used by the Revise step when review_result is False.
    feedback: str = Field(
        default="",
        description="Your FeedBack for this problem based on the criteria. If the review result is true, you can put it 'nothing here'.",
    )
class ReviseOp(BaseModel):
    """Structured output of the revise step.

    NOTE(review): the Field description is presumably shown to the LLM via
    ActionNode's response schema — treat it as runtime behavior.
    """

    # The revised solution produced from the feedback.
    solution: str = Field(default="", description="Based on the feedback, revised solution for this problem")
class CoTGenerate(Operator):
    """Chain-of-thought generator: prompts the LLM to reason step by step.

    NOTE(review): always uses GSM8K_PROMPT_GPT (XML-tag format) even when the
    configured model is deepseek — confirm this is intended.
    """

    def __init__(self, llm: LLM, name: str = "Generate"):
        super().__init__(name, llm)

    async def __call__(self, problem, mode: str = None):
        """Return the GenerateOp fields (``solution``) as a plain dict."""
        generation_prompt = GSM8K_PROMPT_GPT.format(question=problem)
        kwargs = dict(context=generation_prompt, llm=self.llm)
        if mode:
            kwargs["mode"] = mode
        filled = await ActionNode.from_pydantic(GenerateOp).fill(**kwargs)
        return filled.instruct_content.model_dump()
class Review(Operator):
    """Asks the LLM to accept or reject a candidate solution, with feedback."""

    def __init__(self, llm: LLM, name: str = "Review"):
        super().__init__(name, llm)

    async def __call__(self, problem, solution, mode: str = None):
        """Return the ReviewOp fields (``review_result``/``feedback``) as a dict."""
        review_prompt = REVIEW_PROMPT.format(question=problem, solution=solution)
        kwargs = dict(context=review_prompt, llm=self.llm)
        if mode:
            kwargs["mode"] = mode
        filled = await ActionNode.from_pydantic(ReviewOp).fill(**kwargs)
        return filled.instruct_content.model_dump()
class Revise(Operator):
    """Refines a rejected solution using the reviewer's feedback.

    Bug fix: the original signature was ``__init__(self, name="Revise",
    llm=LLM())``, which (a) eagerly instantiated a default ``LLM()`` at
    import time (default args are evaluated when the ``def`` runs) and
    (b) reversed the parameter order relative to the sibling operators, so
    the positional call ``Revise(self.llm)`` bound the configured LLM to
    ``name`` and silently ran revisions on a fresh default LLM. The
    signature now matches ``Review``/``CoTGenerate``.
    """

    def __init__(self, llm: LLM, name: str = "Revise"):
        super().__init__(name, llm)

    async def __call__(self, problem, solution, feedback, mode: str = None):
        """Return the ReviseOp fields (``solution``) as a plain dict."""
        prompt = REVISE_PROMPT.format(question=problem, solution=solution, feedback=feedback)
        fill_kwargs = {"context": prompt, "llm": self.llm}
        if mode:
            fill_kwargs["mode"] = mode
        node = await ActionNode.from_pydantic(ReviseOp).fill(**fill_kwargs)
        return node.instruct_content.model_dump()
class SelfRefineGraph(SolveGraph):
    """Self-refine baseline: generate once, then review/revise up to 5 times."""

    def __init__(self, name: str, llm_config, dataset: str):
        super().__init__(name, llm_config, dataset)
        self.cot_generate = CoTGenerate(self.llm)
        self.review = Review(self.llm)
        self.revise = Revise(self.llm)

    async def __call__(self, problem):
        """Return ``(solution_dict, total_llm_cost)`` for one problem."""
        # NOTE(review): `solution` is the dict returned by the operator
        # (e.g. {"solution": ...}); its repr is what gets interpolated into
        # the review/revise prompts — confirm this is intended.
        solution = await self.cot_generate(problem, mode="context_fill")
        for _ in range(5):  # at most five refine rounds
            verdict = await self.review(problem, solution)
            if verdict["review_result"]:
                break  # reviewer accepted the solution
            solution = await self.revise(problem, solution, verdict["feedback"])
        return solution, self.llm.cost_manager.total_cost
if __name__ == "__main__":
    import asyncio

    async def main():
        """Evaluate the self-refine baseline on 10 GSM8K samples."""
        llm_config = ModelsConfig.default().get("deepseek-coder")
        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
        graph = SelfRefineGraph(name="self-refine", llm_config=llm_config, dataset="Gsm8K")
        data_file = "examples/ags/data/gsm8k.jsonl"
        result_dir = "examples/ags/data/baselines/general"
        return await gsm8k_evaluation(graph, data_file, 10, result_dir)

    asyncio.run(main())

View file

@ -511,7 +511,7 @@ class ActionNode:
example_str = "\n".join(examples)
# Add the example to the context
context += f"""
### response format (must be strictly followed) (do not include any other formats except for the given XML format): \n
### Response format (must be strictly followed): All content must be enclosed in the given XML tags, ensuring each opening <tag> has a corresponding closing </tag>, with no incomplete or self-closing tags allowed.\n
{example_str}
"""
return context