更新了eval BUG，同时更新了新的baseline

2026-06-08 15:05:17 +02:00 · 2024-09-11 17:00:14 +08:00 · 2024-09-11 17:00:14 +08:00 · b805da0bbe
commit b805da0bbe
parent b9a2d94da2
20 changed files with 274 additions and 40 deletions
--- a/examples/ags/benchmark/gsm8k.py
+++ b/examples/ags/benchmark/gsm8k.py
@ -39,12 +39,12 @@ def loose_match_score(expected_output: str, prediction: str, tolerance: float =
        return 0


-async def load_data(file_path: str, samples=1) -> List[dict]:
+async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
    data = []
    async with aiofiles.open(file_path, mode="r") as file:
        async for line in file:
            data.append(json.loads(line))
-    random_indices = generate_random_indices(len(data), samples)
+    random_indices = generate_random_indices(len(data), samples, test=test)
    data = [data[i] for i in random_indices]
    return data
        
@ -64,26 +64,33 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
    prompt = input
    max_retries = 5
    retries = 0
-    
-    while retries < max_retries:
-        try:
-            prediction = await graph(prompt)
-            cost = prediction[1]
-            output = prediction[0]["solution"]
+    prediction = await graph(prompt)
+    cost = prediction[1]
+    output = prediction[0]["solution"]

-            score = loose_match_score(expected_output, output)
-            break
+    print(output)

-        except Exception as e:
-            retries += 1
-            print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
+    score = loose_match_score(expected_output, output)
+    # break
+    # while retries < max_retries:
+    #     try:
+    #         prediction = await graph(prompt)
+    #         cost = prediction[1]
+    #         output = prediction[0]["solution"]

-            if retries == max_retries:
-                print("Maximum retries reached. Skipping this sample.")
-                output = None
-                cost = None
-                score = 0
-                break
+    #         score = loose_match_score(expected_output, output)
+    #         break
+
+    #     except Exception as e:
+    #         retries += 1
+    #         print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
+
+    #         if retries == max_retries:
+    #             print("Maximum retries reached. Skipping this sample.")
+    #             output = None
+    #             cost = None
+    #             score = 0
+    #             break

    return input, output, expected_output, score, cost

@ -101,9 +108,9 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren

    return await tqdm_asyncio.gather(*tasks, desc="Evaluating problems", total=len(data))

-async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
+async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
    """GSM8K evaluation main function"""
-    data = await load_data(file_path, samples)
+    data = await load_data(file_path, samples, test=test)
    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5)
    average_score, total_cost = save_results_to_csv(results, path=path)
    print(f"Average score: {average_score:.5f}")
--- a/examples/ags/experiments/baselines/medprompt_gsm8k.py
+++ b/examples/ags/experiments/baselines/medprompt_gsm8k.py
@ -0,0 +1,133 @@
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.gsm8k import gsm8k_evaluation
+from examples.ags.scripts.operator_an import GenerateOp
+from metagpt.actions.action_node import ActionNode 
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+from typing import Dict, Any, List, Tuple
+from collections import Counter
+
+import random
+
+# Prompt template formatted by CoTGenerate; {question} is replaced with the problem text.
+GSM8K_PROMPT_GPT = """
+{question}\nPlease reason step by step, and to ensure accuracy, provide the correct answer in the final, without any additional text.
+"""
+
+# Variant asking for the final answer inside \boxed{}; not referenced elsewhere in this file.
+GSM8K_PROMPT_DS = """
+{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
+"""
+
+# NOTE(review): this local class shadows the GenerateOp imported from
+# examples.ags.scripts.operator_an at the top of this file.
+class GenerateOp(BaseModel):
+    """Structured-output schema with a single free-text ``solution`` field."""
+
+    solution: str = Field(default="", description="solution for the problem")
+
+class CoTGenerate(Operator):
+    """Operator that asks the LLM for one step-by-step solution to a problem."""
+
+    def __init__(self, llm: LLM, name: str = "Generate"):
+        super().__init__(name, llm)
+
+    async def __call__(self, problem, mode: str = None):
+        """
+        Format GSM8K_PROMPT_GPT with ``problem`` and fill a GenerateOp action node.
+
+        ``mode`` is forwarded to ``ActionNode.fill`` only when it is truthy.
+        Returns the filled model dumped to a dict.
+        """
+        fill_kwargs = {"context": GSM8K_PROMPT_GPT.format(question=problem), "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        filled_node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
+        return filled_node.instruct_content.model_dump()
+
+# Prompt template formatted by MdEnsemble; placeholders are {question} and {solutions}.
+MD_ENSEMBLE_PROMPT = """
+You are given a problem:
+{question}
+
+Here is a list of possible solutions to the problem:
+{solutions}
+
+Using the inputs above, your goal is to choose the best solution to the problem.
+The main consideration is that the solution can fully solve the problem in a correct and robust manner.
+Provide your final decision by writing the chosen solution letter.
+
+Please follow the required format in your response.
+"""
+
+class MdEnsembleOp(BaseModel):
+    """Structured-output schema for one MdEnsemble vote: free-text reasoning plus the chosen letter."""
+
+    thought: str = Field(
+        default="",
+        description="Step-by-step analysis of the solutions to determine the best one.",
+    )
+    # MdEnsemble strips and upper-cases this value before looking it up in its letter mapping.
+    solution_letter: str = Field(default="", description="The letter of the chosen best solution (only one letter).")
+
+
+class MdEnsemble(Operator):
+    """
+    Choose one candidate solution by repeated LLM voting over shuffled option lists.
+
+    Paper: Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine
+    Link: https://arxiv.org/abs/2311.16452
+    """
+
+    # NOTE(review): the ``LLM()`` default is evaluated once, when this class body is executed.
+    # ``name`` is the first positional parameter, so callers should pass the model as ``llm=...``.
+    def __init__(self, name: str = "MdEnsemble", llm: LLM = LLM(), vote_count: int = 3):
+        super().__init__(name, llm)
+        self.vote_count = vote_count
+
+    @staticmethod
+    def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, int]]:
+        """
+        Return a shuffled copy of ``solutions`` and a mapping from option letter to original index.
+
+        Letters start at "A" and follow the shuffled order; the input list is not modified.
+        """
+        shuffled_solutions = solutions.copy()
+        random.shuffle(shuffled_solutions)
+        # ``list.index`` returns the first match, so identical solutions share one index
+        # and their votes are counted together.
+        answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)}
+        return shuffled_solutions, answer_mapping
+
+    async def __call__(self, solutions: List[str], problem: str, mode: str = None):
+        """
+        Run ``vote_count`` voting rounds and return ``{"solution": <most voted candidate>}``.
+
+        Each round reshuffles the candidates, asks the LLM for a letter, and records the
+        original index of that letter. Rounds whose answer is not a known letter are ignored.
+        If no round produces a valid letter, the first candidate is returned.
+        """
+        print(f"solution count: {len(solutions)}")
+        all_responses = []
+
+        for _ in range(self.vote_count):
+            shuffled_solutions, answer_mapping = self.shuffle_answers(solutions)
+
+            solution_text = "".join(
+                f"{chr(65 + index)}: \n{str(solution)}\n\n\n" for index, solution in enumerate(shuffled_solutions)
+            )
+
+            prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem)
+            fill_kwargs = {"context": prompt, "llm": self.llm}
+            if mode:
+                fill_kwargs["mode"] = mode
+            node = await ActionNode.from_pydantic(MdEnsembleOp).fill(**fill_kwargs)
+            response = node.instruct_content.model_dump()
+
+            answer = response.get("solution_letter", "")
+            answer = answer.strip().upper()
+
+            if answer in answer_mapping:
+                original_index = answer_mapping[answer]
+                all_responses.append(original_index)
+
+        if all_responses:
+            most_frequent_index = Counter(all_responses).most_common(1)[0][0]
+        else:
+            # Every vote was unparseable: most_common(1) would be empty and [0] would raise
+            # IndexError, so fall back to the first candidate instead.
+            most_frequent_index = 0
+        final_answer = solutions[most_frequent_index]
+        return {"solution": final_answer}
+
+class MedPromptGraph(SolveGraph):
+    """Baseline graph: generate two chain-of-thought solutions, then pick one with MdEnsemble."""
+
+    def __init__(self, name: str, llm_config, dataset: str, vote_count: int = 3):
+        # self.llm is presumably created by SolveGraph.__init__ from llm_config; verify against the base class.
+        super().__init__(name, llm_config, dataset)
+        self.cot_generate = CoTGenerate(self.llm)
+        # MdEnsemble's signature is (name, llm, vote_count). Passing the LLM positionally would
+        # bind it to ``name`` and leave the ensemble on its default LLM, so pass it by keyword.
+        self.md_ensemble = MdEnsemble(llm=self.llm, vote_count=vote_count)
+
+    async def __call__(self, problem):
+        """Return ``(solution_dict, total_cost)`` for ``problem``; the dict has a "solution" key."""
+        solutions = []
+        for _ in range(2):
+            solution = await self.cot_generate(problem, mode="context_fill")
+            solutions.append(solution["solution"])
+        solution = await self.md_ensemble(solutions, problem, mode="context_fill")
+        return solution, self.llm.cost_manager.total_cost
+
+if __name__ == "__main__":
+    # Script entry point: evaluate the MedPrompt baseline on the GSM8K JSONL file.
+    async def main():
+        """Build the graph, run gsm8k_evaluation on it, and return its (score, cost) pair."""
+        llm_config = ModelsConfig.default().get("deepseek-coder")
+        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
+        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        graph = MedPromptGraph(name="MedPrompt", llm_config=llm_config, dataset="Gsm8K", vote_count=2)
+        file_path = "examples/ags/data/gsm8k.jsonl"
+        samples = 264
+        path = "examples/ags/data/baselines/general"
+        # NOTE(review): test=True is passed through to the data loader's index sampling;
+        # its exact effect is defined outside this file — confirm before relying on it.
+        score, cost = await gsm8k_evaluation(graph, file_path, samples, path, test=True)
+        return score, cost
+
+    # asyncio is imported here, immediately before its only use.
+    import asyncio
+    asyncio.run(main())
--- a/examples/ags/experiments/baselines/self_consistency.py
+++ b/examples/ags/experiments/baselines/self_consistency.py
--- a/examples/ags/experiments/baselines/self_consistency_gsm8k.py
+++ b/examples/ags/experiments/baselines/self_consistency_gsm8k.py
@ -0,0 +1,109 @@
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.gsm8k import gsm8k_evaluation
+from examples.ags.scripts.operator_an import GenerateOp
+from metagpt.actions.action_node import ActionNode 
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+from typing import Dict, Any, List, Tuple
+from collections import Counter
+
+import random
+
+# Prompt template formatted by CoTGenerate; {question} is replaced with the problem text.
+GSM8K_PROMPT_GPT = """
+{question}\nPlease reason step by step, and to ensure accuracy, provide the correct answer in the final, without any additional text.
+"""
+
+# Variant asking for the final answer inside \boxed{}; not referenced elsewhere in this file.
+GSM8K_PROMPT_DS = """
+{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
+"""
+
+# NOTE(review): this local class shadows the GenerateOp imported from
+# examples.ags.scripts.operator_an at the top of this file.
+class GenerateOp(BaseModel):
+    """Structured-output schema with a single free-text ``solution`` field."""
+
+    solution: str = Field(default="", description="solution for the problem")
+
+class CoTGenerate(Operator):
+    """Operator that asks the LLM for one step-by-step solution to a problem."""
+
+    def __init__(self, llm: LLM, name: str = "Generate"):
+        super().__init__(name, llm)
+
+    async def __call__(self, problem, mode: str = None):
+        """
+        Format GSM8K_PROMPT_GPT with ``problem`` and fill a GenerateOp action node.
+
+        ``mode`` is forwarded to ``ActionNode.fill`` only when it is truthy.
+        Returns the filled model dumped to a dict.
+        """
+        fill_kwargs = {"context": GSM8K_PROMPT_GPT.format(question=problem), "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        filled_node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
+        return filled_node.instruct_content.model_dump()
+
+# Prompt template formatted by ScEnsemble; placeholders are {question} and {solutions}.
+SC_ENSEMBLE_PROMPT = """
+Given the question descripted as follows: {question}
+some solutions are generated to solve the question as follows:
+{solutions}
+
+Evaluate these solutions and select the most consistent solution based on majority consensus.
+Give your answer with a single id of solution (without anything else).
+"""
+
+class ScEnsembleOp(BaseModel):
+    """Structured-output schema for ScEnsemble: the letter of the selected solution."""
+
+    # ScEnsemble strips and upper-cases this value before looking it up in its letter mapping.
+    solution_letter: str = Field(default="", description="The letter of most consistent solution.")
+
+
+class ScEnsemble(Operator):
+    """
+    Ask the LLM to pick the most consistent candidate solution from a lettered list.
+
+    Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
+    Link: https://arxiv.org/abs/2203.11171
+    Paper: Universal Self-Consistency for Large Language Model Generation
+    Link: https://arxiv.org/abs/2311.17311
+    """
+
+    # NOTE(review): the ``LLM()`` default is evaluated once, when this class body is executed.
+    # ``name`` is the first positional parameter, so callers should pass the model as ``llm=...``.
+    def __init__(self, name: str = "ScEnsemble", llm: LLM = LLM()):
+        super().__init__(name, llm)
+
+    async def __call__(self, solutions: List[str], problem: str, mode: str = None):
+        """
+        Return ``{"solution": <selected candidate>}`` for ``problem``.
+
+        Candidates are labelled "A", "B", ... in input order. If the model's answer is not
+        one of those letters, the first candidate is returned.
+        """
+        answer_mapping = {chr(65 + index): index for index in range(len(solutions))}
+        solution_text = "".join(
+            f"{chr(65 + index)}: \n{str(solution)}\n\n\n" for index, solution in enumerate(solutions)
+        )
+
+        prompt = SC_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem)
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(ScEnsembleOp).fill(**fill_kwargs)
+        response = node.instruct_content.model_dump()
+
+        answer = response.get("solution_letter", "")
+        answer = answer.strip().upper()
+
+        # An empty or malformed letter would raise KeyError on a direct lookup;
+        # fall back to the first candidate instead.
+        chosen_index = answer_mapping.get(answer, 0)
+        return {"solution": solutions[chosen_index]}
+
+
+class SelfConsistencyGraph(SolveGraph):
+    """Baseline graph: generate two chain-of-thought solutions, then pick one with ScEnsemble."""
+
+    def __init__(self, name: str, llm_config, dataset: str):
+        # self.llm is presumably created by SolveGraph.__init__ from llm_config; verify against the base class.
+        super().__init__(name, llm_config, dataset)
+        self.cot_generate = CoTGenerate(self.llm)
+        # ScEnsemble's signature is (name, llm). Passing the LLM positionally would bind it
+        # to ``name`` and leave the ensemble on its default LLM, so pass it by keyword.
+        self.sc_ensemble = ScEnsemble(llm=self.llm)
+
+    async def __call__(self, problem):
+        """Return ``(solution_dict, total_cost)`` for ``problem``; the dict has a "solution" key."""
+        solutions = []
+        for _ in range(2):
+            solution = await self.cot_generate(problem, mode="context_fill")
+            solutions.append(solution["solution"])
+        solution = await self.sc_ensemble(solutions, problem, mode="context_fill")
+        return solution, self.llm.cost_manager.total_cost
+
+if __name__ == "__main__":
+    # Script entry point: evaluate the self-consistency baseline on the GSM8K JSONL file.
+    async def main():
+        """Build the graph, run gsm8k_evaluation on it, and return its (score, cost) pair."""
+        llm_config = ModelsConfig.default().get("deepseek-coder")
+        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
+        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        graph = SelfConsistencyGraph(name="SelfConsistency", llm_config=llm_config, dataset="Gsm8K")
+        file_path = "examples/ags/data/gsm8k.jsonl"
+        samples = 1
+        path = "examples/ags/data/baselines/general"
+        # NOTE(review): test=False is passed through to the data loader's index sampling;
+        # its exact effect is defined outside this file — confirm before relying on it.
+        score, cost = await gsm8k_evaluation(graph, file_path, samples, path, test=False)
+        return score, cost
+
+    # asyncio is imported here, immediately before its only use.
+    import asyncio
+    asyncio.run(main())
--- a/examples/ags/scripts/prompt.py
+++ b/examples/ags/scripts/prompt.py
@ -127,17 +127,17 @@ Based on the given problem and solution candidates:
 """

 MD_ENSEMBLE_PROMPT = """
-You are given a coding problem:
+You are given a problem:
 {problem_description}

 Here is a list of possible solutions to the problem:
 {solutions}

-Using the inputs above, your goal is to choose the best solution to the code contest problem.
-Don't just pick the most efficient solution. The main consideration is that the solution can fully solve the problem in a correct and robust manner.
+Using the inputs above, your goal is to choose the best solution to the problem.
+The main consideration is that the solution can fully solve the problem in a correct and robust manner.
 Provide your final decision by writing the chosen solution letter.

-Please maintain the JSON format in your response.
+Please follow the required format in your response.
 """

 SC_ENSEMBLE_PROMPT = """
--- a/llm_based_120_eval_results.json
+++ b/llm_based_120_eval_results.json
--- a/llm_based_121_eval_results.json
+++ b/llm_based_121_eval_results.json
--- a/llm_based_122_eval_results.json
+++ b/llm_based_122_eval_results.json
--- a/llm_based_123_eval_results.json
+++ b/llm_based_123_eval_results.json
--- a/llm_based_125_eval_results.json
+++ b/llm_based_125_eval_results.json
--- a/llm_based_126_eval_results.json
+++ b/llm_based_126_eval_results.json
--- a/llm_based_127_eval_results.json
+++ b/llm_based_127_eval_results.json
--- a/llm_based_128_eval_results.json
+++ b/llm_based_128_eval_results.json
--- a/llm_based_129_eval_results.json
+++ b/llm_based_129_eval_results.json
--- a/llm_based_130_eval_results.json
+++ b/llm_based_130_eval_results.json
--- a/llm_based_135_eval_results.json
+++ b/llm_based_135_eval_results.json
--- a/llm_based_136_eval_results.json
+++ b/llm_based_136_eval_results.json
--- a/llm_based_137_eval_results.json
+++ b/llm_based_137_eval_results.json
--- a/llm_based_138_eval_results.json
+++ b/llm_based_138_eval_results.json
--- a/llm_based_139_eval_results.json
+++ b/llm_based_139_eval_results.json