Update

2026-06-17 15:35:21 +02:00 · 2024-07-14 09:12:33 +08:00 · 2024-07-14 09:12:33 +08:00 · 8a241054c7
commit 8a241054c7
parent 7fa68d5649
8 changed files with 301 additions and 120 deletions
--- a/examples/ags/benchmark/humaneval.py
+++ b/examples/ags/benchmark/humaneval.py
@ -19,15 +19,15 @@ generate_code_block = GenerateCodeBlock(llm=LLM())

 solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5)

-async def sample_generate(id):
+async def sample_generate(id, result_path:str="samples.jsonl"):
    case = get_human_eval_plus()[f"{id}"]
-    solution_result = await solver(case['prompt'],ensemble_count=3)
+    solution_result = await solver(case['prompt'],ensemble_count=5)
    sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
-    with open("samples.jsonl", mode='a') as f:
+    with open(result_path, mode='a') as f:
        f.write(json.dumps(sample_dict) + '\n')
-    jsonl_ranker("samples.jsonl", "samples.jsonl")
+    jsonl_ranker(result_path, result_path)

-async def samples_generate(mode:str):
+async def samples_generate(mode:str, result_path:str="samples.jsonl"):
    cases = list(get_human_eval_plus().values())
    file_lock = asyncio.Lock()
    
@ -48,7 +48,7 @@ async def samples_generate(mode:str):
                }

            async with file_lock:
-                async with aiofiles.open("samples.jsonl", mode='a') as f:
+                async with aiofiles.open(result_path, mode='a') as f:
                    await f.write(json.dumps(sample_dict) + '\n')
            return None

@ -67,11 +67,12 @@ async def samples_generate(mode:str):
                await sample_generate(task_id) 
            except Exception as e:
                print(f"failure {task_id}")
-    jsonl_ranker("samples.jsonl", "samples.jsonl")
+    jsonl_ranker(result_path, result_path)
    
    if not failed_tasks:
-        if automatic_evalplus():
-            unpassed_exapmle = extract_failure_tests()
+        if automatic_evalplus(result_path):
+            eval_path = result_path[:-6]+"_eval_results.json"
+            unpassed_exapmle = extract_failure_tests(eval_path)
            print(unpassed_exapmle)
    else:
        print(failed_tasks)
@ -111,7 +112,7 @@ async def samples_generate_llm():
    
    write_jsonl("samples.jsonl", sample_list)

-def automatic_evalplus():
+def automatic_evalplus(result_path:str ="samples.jsonl"):
    """
    Automatically run on the command line: evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only
    """
@ -120,7 +121,7 @@ def automatic_evalplus():
        "-m",
        "evalplus.evaluate",
        "--dataset", "humaneval",
-        "--samples", "samples.jsonl",
+        "--samples", result_path,
        "--parallel", "2",
        "--base-only"
    ]
@ -133,7 +134,7 @@ def automatic_evalplus():
        print("错误输出:", e.stderr)
        return False
    
-def extract_failure_tests(file_path:str = "/Users/trl/Github_project/MetaGPT-MathAI/samples_eval_results.json"):
+def extract_failure_tests(file_path:str = "samples_eval_results.json"):
    with open(file_path, 'r') as f:
        task_results = json.load(f)

--- a/examples/ags/w_action_node/graph.py
+++ b/examples/ags/w_action_node/graph.py
@ -5,7 +5,7 @@

 from metagpt.llm import LLM 

-from examples.ags.w_action_node.operator import Generate, GenerateCode, GenerateCodeBlock, Review, Revise, Ensemble, MdEnsemble
+from examples.ags.w_action_node.operator import Generate, GenerateCode, GenerateCodeBlock, Review, Revise, FuEnsemble, MdEnsemble

 class Graph:
    def __init__(self, name:str, llm:LLM) -> None:
@ -23,18 +23,18 @@ class HumanEvalGraph(Graph):
        self.generate_code_block = GenerateCodeBlock(llm=llm)
        self.review = Review(llm=llm, criteria=criteria)
        self.revise = Revise(llm=llm)
-        self.ensemble = Ensemble(llm=llm)
+        self.fuensemble = FuEnsemble(llm=llm)
        self.mdensemble = MdEnsemble(llm=llm, vote_count=vote_count)

-    async def __call__(self, problem:str, ensemble_count:int = 3):
-        solution_list = []
-        for _ in range(ensemble_count):
-            solution = await self.generate_code(problem)
-            # solution = await self.generate_code_block(problem)
-            solution = solution.get('code_solution')
-            solution_list.append(solution)
-        solution = await self.mdensemble("code", solution_list, problem)
-        return solution
+    # async def __call__(self, problem:str, ensemble_count:int = 3):
+    #     solution_list = []
+    #     for _ in range(ensemble_count):
+    #         solution = await self.generate_code(problem)
+    #         # solution = await self.generate_code_block(problem)
+    #         solution = solution.get('code_solution')
+    #         solution_list.append(solution)
+    #     solution = await self.mdensemble("code", solution_list, problem)
+    #     return solution
    
    async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2):
        solution_list = []
@ -45,15 +45,15 @@ class HumanEvalGraph(Graph):
        return solution

    # async def simple_ensemble(self, problem:str, ensemble_count:int = 3):
-    # async def __call__(self, problem:str, ensemble_count:int = 3):
-    #     solution_list = []
-    #     for _ in range(ensemble_count):
-    #         solution = await self.generate_code(problem)
-    #         # solution = await self.generate_code_block(problem)
-    #         solution = solution.get('code_solution')
-    #         solution_list.append(solution)
-    #     solution = await self.ensemble(solution_list, problem)
-    #     return solution
+    async def __call__(self, problem:str, ensemble_count:int = 3):
+        solution_list = []
+        for _ in range(ensemble_count):
+            solution = await self.generate_code(problem)
+            # solution = await self.generate_code_block(problem)
+            solution = solution.get('code_solution')
+            solution_list.append(solution)
+        solution = await self.fuensemble(solution_list, problem)
+        return solution
    
    async def single_solve(self, problem:str, max_loop:int):
        solution = await self.generate_code(problem)
@ -65,5 +65,4 @@ class HumanEvalGraph(Graph):
            solution = await self.revise(problem, solution, review_feedback['feedback'])
            solution = solution.get('revised_solution')
        return solution
-    
-
+    
--- a/examples/ags/w_action_node/operator.py
+++ b/examples/ags/w_action_node/operator.py
@ -10,8 +10,8 @@ from collections import Counter
 from metagpt.actions.action_node import ActionNode
 from metagpt.llm import LLM 

-from examples.ags.w_action_node.operator_an import GenerateOp, GenerateCodeOp, GenerateCodeBlockOp ,ReviewOp, ReviseOp, EnsembleOp, MdEnsembleOp
-from examples.ags.w_action_node.prompt import GENERATE_PROMPT, GENERATE_CODE_PROMPT, REVIEW_PROMPT, REVISE_PROMPT, ENSEMBLE_PROMPT, MD_ENSEMBLE_PROMPT
+from examples.ags.w_action_node.operator_an import GenerateOp, GenerateCodeOp, GenerateCodeBlockOp ,ReviewOp, ReviseOp, FuEnsembleOp, MdEnsembleOp
+from examples.ags.w_action_node.prompt import GENERATE_PROMPT, GENERATE_CODE_PROMPT, REVIEW_PROMPT, REVISE_PROMPT, FU_ENSEMBLE_PROMPT, MD_ENSEMBLE_PROMPT, DE_ENSEMBLE_ANGEL_PROMPT, DE_ENSEMBLE_DEVIL_PROMPT, DE_ENSEMBLE_JUDGE_PROMPT

 class Operator:
    def __init__(self, name, llm:LLM=None):
@ -76,23 +76,22 @@ class Revise(Operator):
        response = node.instruct_content.model_dump()
        return response

-class Ensemble(Operator):
+class FuEnsemble(Operator):

-    def __init__(self, name:str ="Ensembler", llm: LLM = LLM()):
+    def __init__(self, name:str ="FuseEnsembler", llm: LLM = LLM()):
        super().__init__(name, llm)

    async def __call__(self, solutions:List, problem_description):
        solution_text = ""
        for solution in solutions:
            solution_text += str(solution) + "\n"
-        prompt = ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
-        node = await ActionNode.from_pydantic(EnsembleOp).fill(context=prompt, llm=self.llm)
+        prompt = FU_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
+        node = await ActionNode.from_pydantic(FuEnsembleOp).fill(context=prompt, llm=self.llm)
        response = node.instruct_content.model_dump()
        return response
    
 class MdEnsemble(Operator):
-
-    def __init__(self, name:str ="MdEnsembler", llm: LLM = LLM(), vote_count:int=3):
+    def __init__(self, name:str ="MedEnsembler", llm: LLM = LLM(), vote_count:int=3):
        super().__init__(name, llm)
        self.vote_count = vote_count
    
@ -100,7 +99,6 @@ class MdEnsemble(Operator):
    def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, str]]:
        shuffled_solutions = solutions.copy()
        random.shuffle(shuffled_solutions)
-        # 这里的index方法会把检索到的放在第一个索引的位置。
        answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)}
        return shuffled_solutions, answer_mapping

@ -156,15 +154,126 @@ class MdEnsemble(Operator):
        return {"final_solution": final_answer}

 class ScEnsemble(Operator):
-    # TODO
+    """
+    Self-consistency ensemble.
+    """
+
+    # ScEnsemble should be relatively easy to build; roughly 30 minutes of work.
    pass

-class Debate(Operator):
-    # TODO
+class DbEnsemble(Operator):
    """
-    You agree with my answer 90% of the time and have almost no reservations. Affirm your agreement, share any additional thoughts if you have them, and conclude with the capital letter corresponding to your answer at the end of your response.
+    (Should we be going MAD? A Look at Multi-Agent Debate Strategies for LLMs)
+    The system is a multi-round debate system where each agent is given the
+    question and responses generated by all agents. For each round, a judge
+    analyzes the responses provided and determines whether to terminate the
+    debate or keep going. At the end of the debate the judge is also responsible
+    for determining the final answer.
+    """
+    def __init__(self, name:str ="DebateEnsemble", llm: LLM = LLM()):
+        super().__init__(name, llm)
+        self.agents = [
+        ]
+
+    async def debate_answer(self, message_history:List, role:str):
+        """
+        async def lowlevel_api_example(llm: LLM):
+            logger.info("low level api example")
+            logger.info(await llm.aask_batch(["hi", "write python hello world."]))
+
+            hello_msg = [{"role": "user", "content": "count from 1 to 10. split by newline."}]
+            logger.info(await llm.acompletion(hello_msg))
+            logger.info(await llm.acompletion_text(hello_msg))
+
+            # streaming mode, much slower
+            await llm.acompletion_text(hello_msg, stream=True)
+
+            # check completion if exist to test llm complete functions
+            if hasattr(llm, "completion"):
+                logger.info(llm.completion(hello_msg))
+        """
+        if role == "angel":
+            prompt = DE_ENSEMBLE_ANGEL_PROMPT.format()
+            Op = ""
+        else:
+            prompt = DE_ENSEMBLE_DEVIL_PROMPT.format()
+            Op = ""
+        
+        node = await ActionNode.from_pydantic(Op).messages_fill(messages=message_history,llm=self.llm)
+        node = await ActionNode.from_pydantic(FuEnsembleOp).fill(context=prompt, llm=self.llm)
+        response = node.instruct_content.model_dump()
+        return response
+
+    async def judge_answer(message_histroy:List):
+        """
+
+        """
+        pass
+
+    async def __call__(self, origin_solution:str, problem_description:str, max_round:int = 3):
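+        # Idea: take an original answer as input and build one agent that argues on behalf of that answer; another agent (the devil) argues using the debate LLM's output; the judge decides each round whether to terminate, and if the debate has not ended by max_round the judge produces the final summary.
+        # The following describes how the LLM is called.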
+        """
+        1. Judge messages are visible only to the judge.
+        2. Agent answer messages are visible to everyone; the concrete logic lives in debate.
+        """
+        # Where are Messages passed for multi-turn dialogue in MG? Estimated effort: about 1 hour.
+
+        angel_prompt = DE_ENSEMBLE_ANGEL_PROMPT.format()
+        devil_prompt = DE_ENSEMBLE_DEVIL_PROMPT.format()
+        judge_prompt = DE_ENSEMBLE_JUDGE_PROMPT.format()
+        '''
+            Devil
+            You agree with my answer 90% of the time and have almost no reservations. Affirm your agreement, share any additional thoughts if you have them, and conclude with the capital letter corresponding to your answer at the end of your response.
+            
+            Angel
+            Do you agree with my perspective? Please provide your reasons and answer.
+
+            Judge
+            final_mode: "You, as the moderator, will evaluate both sides' answers and determine your
+            preference for an answer candidate. Please summarize your reasons for supporting affirmative/negative side and
+            give the final answer that you think is correct to conclude the debate. Now please output your answer in json format, with the format as follows:
+            {\"Reason\": \"\", \"debate_answer\": \"the capital letter corresponding to the answer\"}.
+            Please strictly output in JSON format, do not output irrelevant content."
+
+            universal_mode: "You, as the moderator, will evaluate both sides' answers and determine if there is a clear
+            preference for an answer candidate. If so, please summarize your reasons for supporting affirmative/negative side and
+            give the final answer that you think is correct, and the debate will conclude. If not, the debate will continue to
+            the next round. Now please output your answer in json format, with the format as follows:
+            {\"Whether there is a preference\": \"Yes or No\", \"Supported Side\": \"Affirmative or Negative\",
+            \"Reason\": \"\", \"debate_answer\": \"the capital letter corresponding to the answer\"}.
+            Please strictly output in JSON format, do not output irrelevant content."
+        '''
+
+        # Build a method in the action node that can pass along the message history.
+        for _ in max_round:
+            for agent in self.agents:
+                pass
+
+        node = await ActionNode.from_pydantic(FuEnsembleOp).fill(context=prompt, llm=self.llm)
+        response = node.instruct_content.model_dump()
+        return response
+
+class Rephrase(Operator):
+    """
+
+    https://arxiv.org/abs/2404.14963
    """
    pass

-class CriticalThinkingAbstract(Operator):
-    pass
+class FindFact(Operator):
+    pass
+
+class SelfAsk(Operator):
+    pass
+
+class CodeReflection(Operator):
+    """
+    Interpreter Part
+    We run code here to get error information.
+    """
+
+class Verify(Operator):
+    """
+    ? Not decided yet.
+    """
--- a/examples/ags/w_action_node/operator_an.py
+++ b/examples/ags/w_action_node/operator_an.py
@ -9,10 +9,10 @@ class GenerateOp(BaseModel):
    solution: str = Field(default="", description="Your Solution for this problem")

 class GenerateCodeOp(BaseModel):
-    code_solution: str = Field(default="", description="Your Code Solution for this problem")
+    code_solution: str = Field(default="", description="Your complete code solution for this problem")

 class GenerateCodeBlockOp(BaseModel):
-    code_solution: str = Field(default="", description="Your Code Solution for this problem")
+    code_solution: str = Field(default="", description="Your complete code solution for this problem")

 class ReviewOp(BaseModel):
    review_result: bool = Field(default=False, description="The Review Result (Bool). If you think this solution looks good for you, return 'true'; If not, return 'false'")
@ -21,9 +21,24 @@ class ReviewOp(BaseModel):
 class ReviseOp(BaseModel):
    revised_solution: str = Field(default="", description="Based on the feedback, revised solution for this problem")

-class EnsembleOp(BaseModel):
-    final_solution: str = Field(default="", description="Final ensemble solution for this problem")
+class FuEnsembleOp(BaseModel):
+    thought: str = Field(default="", description="Analyze the solutions and think how to combine the advantages of various solutions to form the best possible solution.")
+    final_solution: str = Field(default="", description="Output the final solution after analysis and integration")

 class MdEnsembleOp(BaseModel):
-    thought: str = Field(default="", description="Analyze the solutions and think what's the best step by step.")
-    solution_letter: str = Field(default="", description="Choose The Best Solution, and output only one solution letter")
+    thought: str = Field(
+        default="""Example thought process:
+                1. Examined the 'compare_one' function.
+                2. The function correctly handles both numeric and string inputs by converting strings to floats.
+                3. It properly compares two values and returns the larger one.
+                4. The function returns None if the values are equal, which might be useful in some contexts but could be improved by returning either value.
+                5. The use of 'isinstance' for type checking is a good practice.
+                6. The function handles decimal separators well by replacing ',' with '.'.
+                Overall, this solution effectively solves the problem of comparing two values, with good error handling and flexibility. It could be improved by specifying behavior for equal values, but it's a strong solution as is.""",
+        description="Step-by-step analysis of the solutions to determine the best one."
+    )
+    solution_letter: str = Field(
+        default="",
+        description="The letter of the chosen best solution (output only one letter)."
+    )
+
--- a/examples/ags/w_action_node/prompt.py
+++ b/examples/ags/w_action_node/prompt.py
@ -12,17 +12,36 @@ GENERATE_PROMPT = """
 Generate Solution for the following problem: {problem_description}
 """

+# GENERATE_CODE_PROMPT = """
+# Below is an instruction that describes a task, paired with an input that provides further context.
+# Write a response that appropriately completes the request.
+
+# ### Instruction:
+# Write a program to perform the given task.
+
+# Input:
+# {problem_description}
+
+# ### Response:
+# """
+
 GENERATE_CODE_PROMPT = """
-Below is an instruction that describes a task, paired with an input that provides further context.
-Write a response that appropriately completes the request.
+You are an expert programmer tasked with solving a coding problem. Your goal is to write clean, efficient, and correct code that solves the given problem.

-### Instruction:
-Write a program to perform the given task.
-
-Input:
+### Problem Description:
 {problem_description}

-### Response:
+### Instructions:
+1. Read the problem description carefully.
+2. If any part of the problem is unclear, state your assumptions.
+3. Plan your approach before writing code.
+4. Write a Python function that solves the problem.
+5. Include clear comments to explain your logic.
+6. Ensure your code handles edge cases and potential errors.
+7. If time complexity is a concern, optimize your solution and explain your optimization.
+
+Please maintain the JSON format in your response.
+### Your Response: 
 """
 # GENERATE_CODE_PROMPT = """
 # Generate Code Solution for the following problem: {problem_description}
@ -40,22 +59,25 @@ please evaluate and revise the solution provided: {solution}, taking into accoun
 Then output the revised solution.
 """

-ENSEMBLE_PROMPT = """
-For the question described as {problem_description}, Solutions: {solutions}
-Please select the solution that appears most frequently from these options and ensemble this to provide best solution.
+FU_ENSEMBLE_PROMPT = """
+### Given problem
+
+{problem_description}
+
+### We've got a list of solutions
+
+<solutions>
+{solutions}
+</solutions>
+
+### Instructions
+Based on the given problem and solution candidates:
+
+1. Analyze the pros and cons of each candidate solution
+2. Consider how to integrate reasonable parts from different solutions
+3. Formulate a more comprehensive and effective solution
 """

-# MD_ENSEMBLE_PROMPT = """
-# # Context
-# For the question described as {problem_description}, 
-# Solutions can be seen below: 
-# {solutions}
-
-# # Instruction
-# Based on the problem and solution candidates, carefully analyze which is the best answer. Focus solely on the correctness of the solution in addressing the problem.
-# Provide your final decision by writing the chosen solution number (e.g., A).
-# """
-
 MD_ENSEMBLE_PROMPT = """
 ### Given problem

@ -68,5 +90,13 @@ MD_ENSEMBLE_PROMPT = """
 </solutions>

 ### Instructions
-Based on the problem and solution candidates, carefully analyze which is the best answer. Focus solely on the correctness of the solution in addressing the problem. Provide your final decision by writing the chosen solution number. (eg.B). Keep the json format.
-"""
+Carefully analyze the given problem and the list of solution candidates. Your task is to determine the best answer based solely on how correctly and effectively it addresses the problem. Follow these steps:
+
+1. Thoroughly examine each solution.
+2. Evaluate their relevance and effectiveness in solving the problem.
+3. Compare the solutions to identify the most suitable one.
+4. Provide your final decision by writing the chosen solution letter (e.g., B).
+
+Please maintain the JSON format in your response.
+"""
+