From 86033a10372263acc808eb412dd66e7a6c9758b4 Mon Sep 17 00:00:00 2001
From: didi <84363704+didiforgithub@users.noreply.github.com>
Date: Tue, 9 Jul 2024 14:51:27 +0800
Subject: [PATCH] Update

---
 examples/ags/benchmark/humaneval.py    | 156 +++++++++++++++++++++++++
 examples/ags/w_action_node/graph.py    |  26 +++--
 examples/ags/w_action_node/operator.py |  82 ++++++-------
 examples/ags/w_action_node/prompt.py   |   4 +
 he_test.py                             |  99 +---------------
 test.py                                |  12 --
 6 files changed, 216 insertions(+), 163 deletions(-)
 create mode 100644 examples/ags/benchmark/humaneval.py
 delete mode 100644 test.py

diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py
new file mode 100644
index 000000000..ea7b9dedb
--- /dev/null
+++ b/examples/ags/benchmark/humaneval.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+# @Date    : 7/7/2024 17:07 PM
+# @Author  : didi
+# @Desc    : test on human eval graph
+
+import json
+import subprocess
+import sys
+import asyncio
+import aiofiles
+from metagpt.llm import LLM
+from evalplus.data import get_human_eval_plus, write_jsonl
+from examples.ags.w_action_node.utils import jsonl_ranker
+from examples.ags.w_action_node.graph import HumanEvalGraph
+from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock
+
+generate_code = GenerateCode(llm=LLM())  # built at import time; used by samples_generate(mode='llm')
+generate_code_block = GenerateCodeBlock(llm=LLM())  # built at import time; used by samples_generate_llm
+
+solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5)  # shared graph solver for every 'ags' path below
+
+async def sample_generate(id):  # Solve a single task, identified by its task id (e.g. 'HumanEval/101'), with the graph solver.
+    case = get_human_eval_plus()[f"{id}"]  # the id is formatted to a string before the lookup
+    solution_result = await solver(case['prompt'],ensemble_count=5)
+    sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
+    with open("samples.jsonl", mode='a') as f:  # append mode: existing samples are kept
+        f.write(json.dumps(sample_dict) + '\n')  # one JSON object per line
+    jsonl_ranker("samples.jsonl", "samples.jsonl")  # same path for input and output; presumably re-orders the file in place -- confirm in utils.jsonl_ranker
+
+async def samples_generate(mode:str):
+    """Solve every HumanEval+ task concurrently, appending each result to samples.jsonl.
+
+    mode 'llm' calls generate_code once per task; mode 'ags' runs the graph solver.
+    Tasks that raise are retried one at a time through sample_generate (graph solver).
+    """
+    if mode not in ('llm', 'ags'):
+        # Fail fast here; the broad except in solve_and_write would otherwise hide a bad mode.
+        raise ValueError(f"unsupported mode: {mode!r}")
+    cases = list(get_human_eval_plus().values())
+    file_lock = asyncio.Lock()
+
+    async def solve_and_write(case, mode):
+        try:
+            if mode == 'llm':
+                solution_result = await generate_code(case['prompt'])
+                solution = solution_result['code_solution']
+            else:
+                solution_result = await solver(case['prompt'], ensemble_count=5)
+                solution = solution_result['final_solution']
+            sample_dict = {'task_id': case['task_id'], 'solution': solution}
+            # The lock keeps concurrent appends from interleaving lines.
+            async with file_lock:
+                async with aiofiles.open("samples.jsonl", mode='a') as f:
+                    await f.write(json.dumps(sample_dict) + '\n')
+            return None
+        except Exception as e:
+            print(e)
+            return case['task_id']
+
+    tasks = [solve_and_write(case, mode) for case in cases]
+    results = await asyncio.gather(*tasks)
+    failed_tasks = [task_id for task_id in results if task_id is not None]
+
+    # TODO: this retry step is still not automated enough
+    for task_id in failed_tasks:
+        try:
+            await sample_generate(task_id)
+        except Exception as e:
+            print(f"failure {task_id}: {e}")
+    jsonl_ranker("samples.jsonl", "samples.jsonl")
+
+    # Evaluation only runs when the first pass had no failures at all.
+    if not failed_tasks and automatic_evalplus():
+        unpassed_example = extract_failure_tests()
+        print(unpassed_example)
+
+async def samples_generate_ags():  # Batch variant: solve every task with the graph solver, then write all samples in one call.
+    sample_list = []
+    cases = list(get_human_eval_plus().values())
+    
+    async def solve_with_id(case):  # returns (task_id, solution) so each result stays paired with its task
+        solution_result = await solver(case['prompt'], ensemble_count=3)
+        return case['task_id'], solution_result['final_solution']
+    
+    tasks = [solve_with_id(case) for case in cases]
+    results = await asyncio.gather(*tasks)  # no per-task error handling: the first exception propagates and write_jsonl is never reached
+    
+    for task_id, solution in results:
+        sample_dict = dict(task_id=task_id, solution=solution)
+        sample_list.append(sample_dict)
+    
+    write_jsonl("samples.jsonl", sample_list)
+
+async def samples_generate_llm():  # Batch variant: one direct GenerateCodeBlock call per task, then write all samples in one call.
+    sample_list = []
+    cases = list(get_human_eval_plus().values())
+    
+    async def solve_with_id(case):  # returns (task_id, solution) so each result stays paired with its task
+        solution_result =  await generate_code_block(case['prompt'])
+        # solution_result =  await generate_code(case['prompt'])
+        return case['task_id'], solution_result['code_solution']
+    
+    tasks = [solve_with_id(case) for case in cases]
+    results = await asyncio.gather(*tasks)  # no per-task error handling: the first exception propagates and write_jsonl is never reached
+    
+    for task_id, solution in results:
+        sample_dict = dict(task_id=task_id, solution=solution)
+        sample_list.append(sample_dict)
+    
+    write_jsonl("samples.jsonl", sample_list)
+
+def automatic_evalplus():
+    """
+    Run `evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only` as a subprocess; return True if it exits with code 0, False otherwise.
+    """
+    command = [
+        sys.executable,  # use the current Python interpreter
+        "-m",
+        "evalplus.evaluate",
+        "--dataset", "humaneval",
+        "--samples", "samples.jsonl",
+        "--parallel", "2",
+        "--base-only"
+    ]
+    
+    try:
+        result = subprocess.run(command, check=True, capture_output=True, text=True)  # check=True raises CalledProcessError on a non-zero exit code
+        print("输出:", result.stdout)  # label means "output:"
+        return True
+    except subprocess.CalledProcessError as e:
+        print("错误输出:", e.stderr)  # label means "error output:"
+        return False
+    
+def extract_failure_tests(file_path:str = "samples_eval_results.json"):
+    """Return [{"task_id": ...}] for every task whose first result has base_status == "fail".
+
+    The default path is relative to the working directory, i.e. next to the "samples.jsonl"
+    this module writes (assumes evalplus stores its results there -- confirm).
+    """
+    with open(file_path, 'r', encoding='utf-8') as f:
+        task_results = json.load(f)
+
+    failed_tests = [
+        {"task_id": task[0]["task_id"]}
+        for task in task_results['eval'].values()
+        # Only the first record of each task is inspected.
+        if task[0]["base_status"] == "fail"
+    ]
+    print(len(failed_tests))
+    return failed_tests
+
+
+# asyncio.run(sample_generate('HumanEval/101'))
+# asyncio.run(samples_generate(mode='llm'))
+# jsonl_ranker("samples.jsonl", "samples.jsonl")
+# {"task_id": "HumanEval/101", "solution": "def words_string(s):\n    import re\n    return re.split(r'[,\\s]\\s*', s)"}
\ No newline at end of file
diff --git a/examples/ags/w_action_node/graph.py b/examples/ags/w_action_node/graph.py
index f9bc2e1b3..3870bfd6d 100644
--- a/examples/ags/w_action_node/graph.py
+++ b/examples/ags/w_action_node/graph.py
@@ -16,7 +16,7 @@ class Graph:
         NotImplementedError("Subclasses must implement __call__ method")
 
 class HumanEvalGraph(Graph):
-    def __init__(self, name:str, llm: LLM, criteria:str, vote_count:int =3) -> None:
+    def __init__(self, name:str, llm: LLM, criteria:str, vote_count:int =5) -> None:
         super().__init__(name, llm)
         self.criteria = criteria # TODO 自动构建图时，图的初始参数与图所使用的算子要求的外部参数相匹配
         self.generate_code = GenerateCode(llm=llm)
@@ -29,11 +29,11 @@ class HumanEvalGraph(Graph):
     async def __call__(self, problem:str, ensemble_count:int = 3):
         solution_list = []
         for _ in range(ensemble_count):
-            # solution = await self.generate_code(problem)
-            solution = await self.generate_code_block(problem)
+            solution = await self.generate_code(problem)
+            # solution = await self.generate_code_block(problem)
             solution = solution.get('code_solution')
             solution_list.append(solution)
-        solution = await self.mdensemble(solution_list, problem)
+        solution = await self.mdensemble("code", solution_list, problem)
         return solution
     
     async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2):
@@ -44,14 +44,16 @@ class HumanEvalGraph(Graph):
         solution = await self.ensemble(solution_list, problem)
         return solution
 
-    async def simple_ensemble(self, problem:str):
-        solution_list = []
-        for _ in range(3):
-            solution = await self.generate_code(problem)
-            solution = solution.get('code_solution')
-            solution_list.append(solution)
-        solution = await self.ensemble(solution_list, problem)
-        return solution
+    # async def simple_ensemble(self, problem:str, ensemble_count:int = 3):
+    # async def __call__(self, problem:str, ensemble_count:int = 3):
+    #     solution_list = []
+    #     for _ in range(ensemble_count):
+    #         solution = await self.generate_code(problem)
+    #         # solution = await self.generate_code_block(problem)
+    #         solution = solution.get('code_solution')
+    #         solution_list.append(solution)
+    #     solution = await self.ensemble(solution_list, problem)
+    #     return solution
     
     async def single_solve(self, problem:str, max_loop:int):
         solution = await self.generate_code(problem)
diff --git a/examples/ags/w_action_node/operator.py b/examples/ags/w_action_node/operator.py
index b6b6e1901..3b832cc18 100644
--- a/examples/ags/w_action_node/operator.py
+++ b/examples/ags/w_action_node/operator.py
@@ -2,7 +2,7 @@
 # @Date    : 6/27/2024 17:36 PM
 # @Author  : didi
 # @Desc    : operator demo of ags
-
+import ast
 import random
 from typing import List, Tuple, Any, Dict
 from collections import Counter
@@ -90,7 +90,7 @@ class Ensemble(Operator):
         response = node.instruct_content.model_dump()
         return response
     
-class MdEnsemble(Ensemble):
+class MdEnsemble(Operator):
 
     def __init__(self, name:str ="MdEnsembler", llm: LLM = LLM(), vote_count:int=3):
         super().__init__(name, llm)
@@ -100,21 +100,35 @@ class MdEnsemble(Ensemble):
     def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, str]]:
         shuffled_solutions = solutions.copy()
         random.shuffle(shuffled_solutions)
-        answer_mapping = {
-            chr(65 + i): solutions.index(sol)
-            for i, sol in enumerate(shuffled_solutions)
-        }
+        # NOTE: list.index returns the first match, so duplicate solutions all map to the index of their first occurrence.
+        answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)}
         return shuffled_solutions, answer_mapping
-    
-    @staticmethod
-    def most_frequent(lst: List[Any]) -> Tuple[Any, int]:
-        counter = Counter(lst)
-        most_common = counter.most_common(1)
-        return most_common[0] if most_common else (None, 0)
 
-    async def __call__(self, solutions:List[str], problem_description:str,):
+    async def __call__(self, solution_type:str ,solutions:List[str], problem_description:str):
         all_responses = []
+        # If the solution type is "code", de-duplicate the candidate solutions by their AST
+        if solution_type == "code":
+            original_length = len(solutions)
+            unique_structures = {}
+            updated_solutions = []
 
+            for solution in solutions:
+                try:
+                    tree = ast.parse(solution)
+                    structure_key = ast.dump(tree, annotate_fields=False, include_attributes=False)
+                    
+                    if structure_key not in unique_structures:
+                        unique_structures[structure_key] = solution
+                        updated_solutions.append(solution)
+                except SyntaxError:
+                    # If the solution has a syntax error, we'll skip it
+                    continue
+            solutions = updated_solutions
+            updated_length = len(solutions)
+            print(f"Original number of solutions: {original_length}")
+            print(f"Updated number of solutions: {updated_length}")
+            if updated_length == 1:
+                return {"final_solution": solutions[0]}
         for _ in range(self.vote_count):
             shuffled_solutions, answer_mapping = self.shuffle_answers(solutions)
             
@@ -131,38 +145,16 @@ class MdEnsemble(Ensemble):
             
             if answer in answer_mapping:
                 original_index = answer_mapping[answer]
-                all_responses.append(solutions[original_index])
-            
-        final_answer, frequency = self.most_frequent(all_responses)
+                print(f"original index: {original_index}")
+                all_responses.append(original_index)
         
+        most_frequent_index = Counter(all_responses).most_common(1)[0][0]
+        print(f"most frequent_index: {most_frequent_index}") 
+        final_answer = solutions[most_frequent_index]
+        print(f"final answer: {final_answer}")
+        # final_answer, frequency = self.most_frequent(all_responses)
         return {"final_solution": final_answer}
 
-
-
-
-
-
-
-
-
-
-# def load_llm_configs(*config_names):
-#     """
-#     Load multiple LLM configurations and return a list of initialized LLMs.
-
-#     :param config_names: Variable number of configuration file names (without .yaml extension)
-#     :return: List of initialized LLM objects
-#     """
-#     llms = []
-#     for config_name in config_names:
-#         config_path = Path(f"~/.metagpt/{config_name}.yaml").expanduser()
-#         if config_path.exists():
-#             config = Config.from_yaml_file(config_path)
-#             llms.append(LLM(config.llm))
-#         else:
-#             print(f"Warning: Configuration file {config_path} not found. Skipping.")
-#     return llms
-
-
-# 使用函数加载多个 LLM 配置
-# llms = load_llm_configs("gpt-4o", "sonnet-35")  # 你可以根据需要添加或删除配置
\ No newline at end of file
+class ScEnsemble(Operator):
+    # TODO: placeholder operator, nothing implemented yet (name suggests self-consistency ensembling -- confirm)
+    pass
\ No newline at end of file
diff --git a/examples/ags/w_action_node/prompt.py b/examples/ags/w_action_node/prompt.py
index 8b815715d..dcc1428de 100644
--- a/examples/ags/w_action_node/prompt.py
+++ b/examples/ags/w_action_node/prompt.py
@@ -3,6 +3,10 @@
 # @Author  : didi
 # @Desc    : prompts of operators
 
+# TODO: How does PromptBreeder compute its scores?
+# TODO: Evaluation cases: GSM-8K, taken directly from the dataset
+# 
+# 
 
 GENERATE_PROMPT = """
 Generate Solution for the following problem: {problem_description}
diff --git a/he_test.py b/he_test.py
index b410f86a7..fa827a4c1 100644
--- a/he_test.py
+++ b/he_test.py
@@ -1,103 +1,14 @@
-import json
 import asyncio
-import aiofiles
 from metagpt.llm import LLM
-from evalplus.data import get_human_eval_plus, write_jsonl
+from examples.ags.benchmark.humaneval import sample_generate, samples_generate, extract_failure_tests, automatic_evalplus
 from examples.ags.w_action_node.utils import jsonl_ranker
-from examples.ags.w_action_node.graph import HumanEvalGraph
-from examples.ags.w_action_node.operator import GenerateCode
-
-generate_code = GenerateCode(llm=LLM())
-
-solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5)
-
-async def sample_generate(id):
-    case = get_human_eval_plus()[f"{id}"]
-    solution_result = await solver(case['prompt'],ensemble_count=3)
-    sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
-    with open("samples.jsonl", mode='a') as f:
-        f.write(json.dumps(sample_dict) + '\n')
-    jsonl_ranker("samples.jsonl", "samples.jsonl")
-
-async def samples_generate(mode:str):
-    cases = list(get_human_eval_plus().values())
-    file_lock = asyncio.Lock()
-    
-    async def solve_and_write(case, mode):
-        try:
-            if mode == 'llm':
-                solution_result = await generate_code(case['prompt'])
-                sample_dict = {
-                'task_id': case['task_id'],
-                'solution': solution_result['code_solution']
-                }
-            elif mode == "ags":
-                solution_result = await solver(case['prompt'], ensemble_count=3)
-                sample_dict = {
-                'task_id': case['task_id'],
-                'solution': solution_result['final_solution']
-                }
-
-            async with file_lock:
-                async with aiofiles.open("samples.jsonl", mode='a') as f:
-                    await f.write(json.dumps(sample_dict) + '\n')
-            return None
-
-        except Exception as e: 
-            print(e)
-            return case['task_id']
-
-    tasks = [solve_and_write(case, mode) for case in cases]
-    results = await asyncio.gather(*tasks)
-    failed_tasks = [task_id for task_id in results if task_id is not None]
-
-    # TODO 这个地方还是不够自动化
-    if failed_tasks:
-        for task_id in failed_tasks:
-            try:
-                await sample_generate(task_id) 
-            except Exception as e:
-                print(f"failure {task_id}")
-    jsonl_ranker("samples.jsonl", "samples.jsonl")
-
-async def samples_generate_ags():
-    sample_list = []
-    cases = list(get_human_eval_plus().values())
-    
-    async def solve_with_id(case):
-        solution_result = await solver(case['prompt'], ensemble_count=3)
-        return case['task_id'], solution_result['final_solution']
-    
-    tasks = [solve_with_id(case) for case in cases]
-    results = await asyncio.gather(*tasks)
-    
-    for task_id, solution in results:
-        sample_dict = dict(task_id=task_id, solution=solution)
-        sample_list.append(sample_dict)
-    
-    write_jsonl("samples.jsonl", sample_list)
-
-async def samples_generate_llm():
-    sample_list = []
-    cases = list(get_human_eval_plus().values())
-    
-    async def solve_with_id(case):
-        solution_result =  await generate_code(case['prompt'])
-        return case['task_id'], solution_result['code_solution']
-    
-    tasks = [solve_with_id(case) for case in cases]
-    results = await asyncio.gather(*tasks)
-    
-    for task_id, solution in results:
-        sample_dict = dict(task_id=task_id, solution=solution)
-        sample_list.append(sample_dict)
-    
-    write_jsonl("samples.jsonl", sample_list)
 
 # asyncio.run(sample_generate('HumanEval/101'))
-# asyncio.run(samples_generate_llm())
+# asyncio.run(sample_generate('HumanEval/1'))
 asyncio.run(samples_generate(mode='ags'))
 # jsonl_ranker("samples.jsonl", "samples.jsonl")
 
 
-
+# if automatic_evalplus():
+#     unpassed_exapmle = extract_failure_tests()
+#     print(unpassed_exapmle)
\ No newline at end of file
diff --git a/test.py b/test.py
deleted file mode 100644
index 78db8c0b4..000000000
--- a/test.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import asyncio
-from examples.ags.w_action_node.graph import HumanEvalGraph
-from metagpt.llm import LLM 
-
-human_eval_example = """
-from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    \"\"\"\n
-"""
-
-solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability')
-
-final_result = asyncio.run(solver(human_eval_example))
-print(final_result)
\ No newline at end of file