Update humaneval

2026-06-08 15:05:17 +02:00 · 2024-07-10 16:23:38 +08:00 · 2024-07-10 16:23:38 +08:00 · 4af2315c77
commit 4af2315c77
parent 86033a1037
6 changed files with 46 additions and 13 deletions
--- a/examples/ags/benchmark/humaneval.py
+++ b/examples/ags/benchmark/humaneval.py
@@ -73,6 +73,8 @@ async def samples_generate(mode:str):
        if automatic_evalplus():
            unpassed_exapmle = extract_failure_tests()
            print(unpassed_exapmle)
+    else:
+        print(failed_tasks)

 async def samples_generate_ags():
    sample_list = []
--- a/examples/ags/w_action_node/graph.py
+++ b/examples/ags/w_action_node/graph.py
@@ -64,4 +64,6 @@ class HumanEvalGraph(Graph):
                break
            solution = await self.revise(problem, solution, review_feedback['feedback'])
            solution = solution.get('revised_solution')
-        return solution
+        return solution
+    
+
--- a/examples/ags/w_action_node/operator.py
+++ b/examples/ags/w_action_node/operator.py
@@ -157,4 +157,14 @@ class MdEnsemble(Operator):

 class ScEnsemble(Operator):
    # TODO
+    pass
+
+class Debate(Operator):
+    # TODO
+    """
+    You agree with my answer 90% of the time and have almost no reservations. Affirm your agreement, share any additional thoughts if you have them, and conclude with the capital letter corresponding to your answer at the end of your response.
+    """
+    pass
+
+class CriticalThinkingAbstract(Operator):
    pass
--- a/examples/ags/w_action_node/operator_an.py
+++ b/examples/ags/w_action_node/operator_an.py
@@ -25,5 +25,9 @@ class EnsembleOp(BaseModel):
    final_solution: str = Field(default="", description="Final ensemble solution for this problem")

 class MdEnsembleOp(BaseModel):
-    thought: str = Field(default="", description="Analyze the solutions and think what's the best step by step.")
-    solution_letter: str = Field(default="", description="Choose The Best Solution, and output the solution letter")
+    thought: str = Field(default="",
+                          description="Analyze the solutions and think what's the best step by step.")
+    solution_letter: str = Field(default="",
+                                 description="""
+        Based on the problem and solution candidates, carefully analyze which is the best answer. Focus solely on the correctness of the solution in addressing the problem.
+        Provide your final decision by writing the chosen solution number. (eg.A) """)
--- a/examples/ags/w_action_node/prompt.py
+++ b/examples/ags/w_action_node/prompt.py
@@ -45,13 +45,25 @@ For the question described as {problem_description}, Solutions: {solutions}
 Please select the solution that appears most frequently from these options and ensemble this to provide best solution.
 """

-MD_ENSEMBLE_PROMPT = """
-# Context
-For the question described as {problem_description}, 
-Solutions can be seen below: 
-{solutions}
+# MD_ENSEMBLE_PROMPT = """
+# # Context
+# For the question described as {problem_description}, 
+# Solutions can be seen below: 
+# {solutions}

-# Instruction
-Based on the problem and solution candidates, carefully analyze which is the best answer. Focus solely on the correctness of the solution in addressing the problem.
-Provide your final decision by writing the chosen solution number (e.g., A).
+# # Instruction
+# Based on the problem and solution candidates, carefully analyze which is the best answer. Focus solely on the correctness of the solution in addressing the problem.
+# Provide your final decision by writing the chosen solution number (e.g., A).
+# """
+
+MD_ENSEMBLE_PROMPT = """
+### Given problem
+
+{problem_description}
+
+### We've got a list of solutions
+
+<solutions>
+{solutions}
+</solutions>
 """
--- a/he_test.py
+++ b/he_test.py
@@ -5,10 +5,13 @@ from examples.ags.w_action_node.utils import jsonl_ranker

 # asyncio.run(sample_generate('HumanEval/101'))
 # asyncio.run(sample_generate('HumanEval/1'))
-asyncio.run(samples_generate(mode='ags'))
+# asyncio.run(samples_generate(mode='llm'))
 # jsonl_ranker("samples.jsonl", "samples.jsonl")


 # if automatic_evalplus():
 #     unpassed_exapmle = extract_failure_tests()
-#     print(unpassed_exapmle)
+#     print(unpassed_exapmle)
+
+# unpassed_exapmle = extract_failure_tests()
+# print(unpassed_exapmle)