diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py
index 5273e84e5..678102f17 100644
--- a/examples/ags/benchmark/humaneval.py
+++ b/examples/ags/benchmark/humaneval.py
@@ -19,15 +19,15 @@
 generate_code_block = GenerateCodeBlock(llm=LLM())
 solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5)
 
-async def sample_generate(id):
+async def sample_generate(id, result_path:str="samples.jsonl"):
     case = get_human_eval_plus()[f"{id}"]
-    solution_result = await solver(case['prompt'],ensemble_count=3)
+    solution_result = await solver(case['prompt'], ensemble_count=5)
     sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
-    with open("samples.jsonl", mode='a') as f:
+    with open(result_path, mode='a') as f:
         f.write(json.dumps(sample_dict) + '\n')
-    jsonl_ranker("samples.jsonl", "samples.jsonl")
+    jsonl_ranker(result_path, result_path)
 
-async def samples_generate(mode:str):
+async def samples_generate(mode:str, result_path:str="samples.jsonl"):
     cases = list(get_human_eval_plus().values())
     file_lock = asyncio.Lock()
 
@@ -48,7 +48,7 @@ async def samples_generate(mode:str):
             }
 
             async with file_lock:
-                async with aiofiles.open("samples.jsonl", mode='a') as f:
+                async with aiofiles.open(result_path, mode='a') as f:
                     await f.write(json.dumps(sample_dict) + '\n')
             return None
 
@@ -67,11 +67,12 @@ async def samples_generate(mode:str):
                 await sample_generate(task_id)
             except Exception as e:
                 print(f"failure {task_id}")
-        jsonl_ranker("samples.jsonl", "samples.jsonl")
+        jsonl_ranker(result_path, result_path)
 
     if not failed_tasks:
-        if automatic_evalplus():
-            unpassed_exapmle = extract_failure_tests()
-            print(unpassed_exapmle)
+        if automatic_evalplus(result_path):
+            eval_path = result_path[:-6] + "_eval_results.json"
+            unpassed_example = extract_failure_tests(eval_path)
+            print(unpassed_example)
     else:
         print(failed_tasks)
 
@@ -111,7 +112,7 @@ async def samples_generate_llm():
 
     write_jsonl("samples.jsonl", sample_list)
 
-def automatic_evalplus():
+def automatic_evalplus(result_path:str="samples.jsonl"):
     """
     Automatically run `evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only` from the command line.
     """
@@ -120,7 +121,7 @@ def automatic_evalplus():
         "-m", "evalplus.evaluate",
         "--dataset", "humaneval",
-        "--samples", "samples.jsonl",
+        "--samples", result_path,
         "--parallel", "2",
         "--base-only"
     ]
@@ -133,7 +134,7 @@ def automatic_evalplus():
         print("Error output:", e.stderr)
         return False
 
-def extract_failure_tests(file_path:str = "/Users/trl/Github_project/MetaGPT-MathAI/samples_eval_results.json"):
+def extract_failure_tests(file_path:str = "samples_eval_results.json"):
     with open(file_path, 'r') as f:
         task_results = json.load(f)
 
diff --git a/examples/ags/w_action_node/graph.py b/examples/ags/w_action_node/graph.py
index e5b04a874..3db637188 100644
--- a/examples/ags/w_action_node/graph.py
+++ b/examples/ags/w_action_node/graph.py
@@ -5,7 +5,7 @@
 from metagpt.llm import LLM
 
-from examples.ags.w_action_node.operator import Generate, GenerateCode, GenerateCodeBlock, Review, Revise, Ensemble, MdEnsemble
+from examples.ags.w_action_node.operator import Generate, GenerateCode, GenerateCodeBlock, Review, Revise, FuEnsemble, MdEnsemble
 
 class Graph:
     def __init__(self, name:str, llm:LLM) -> None:
@@ -23,18 +23,18 @@ class HumanEvalGraph(Graph):
         self.generate_code_block = GenerateCodeBlock(llm=llm)
         self.review = Review(llm=llm, criteria=criteria)
         self.revise = Revise(llm=llm)
-        self.ensemble = Ensemble(llm=llm)
+        self.fuensemble = FuEnsemble(llm=llm)
self.mdensemble = MdEnsemble(llm=llm, vote_count=vote_count) - async def __call__(self, problem:str, ensemble_count:int = 3): - solution_list = [] - for _ in range(ensemble_count): - solution = await self.generate_code(problem) - # solution = await self.generate_code_block(problem) - solution = solution.get('code_solution') - solution_list.append(solution) - solution = await self.mdensemble("code", solution_list, problem) - return solution + # async def __call__(self, problem:str, ensemble_count:int = 3): + # solution_list = [] + # for _ in range(ensemble_count): + # solution = await self.generate_code(problem) + # # solution = await self.generate_code_block(problem) + # solution = solution.get('code_solution') + # solution_list.append(solution) + # solution = await self.mdensemble("code", solution_list, problem) + # return solution async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2): solution_list = [] @@ -45,15 +45,15 @@ class HumanEvalGraph(Graph): return solution # async def simple_ensemble(self, problem:str, ensemble_count:int = 3): - # async def __call__(self, problem:str, ensemble_count:int = 3): - # solution_list = [] - # for _ in range(ensemble_count): - # solution = await self.generate_code(problem) - # # solution = await self.generate_code_block(problem) - # solution = solution.get('code_solution') - # solution_list.append(solution) - # solution = await self.ensemble(solution_list, problem) - # return solution + async def __call__(self, problem:str, ensemble_count:int = 3): + solution_list = [] + for _ in range(ensemble_count): + solution = await self.generate_code(problem) + # solution = await self.generate_code_block(problem) + solution = solution.get('code_solution') + solution_list.append(solution) + solution = await self.fuensemble(solution_list, problem) + return solution async def single_solve(self, problem:str, max_loop:int): solution = await self.generate_code(problem) @@ -65,5 +65,4 @@ class HumanEvalGraph(Graph): solution = await self.revise(problem, solution, review_feedback['feedback']) solution = solution.get('revised_solution') return solution - - + \ No newline at end of file diff --git a/examples/ags/w_action_node/operator.py b/examples/ags/w_action_node/operator.py index 6c0bacd9e..ca0d1f85e 100644 --- a/examples/ags/w_action_node/operator.py +++ b/examples/ags/w_action_node/operator.py @@ -10,8 +10,8 @@ from collections import Counter from metagpt.actions.action_node import ActionNode from metagpt.llm import LLM -from examples.ags.w_action_node.operator_an import GenerateOp, GenerateCodeOp, GenerateCodeBlockOp ,ReviewOp, ReviseOp, EnsembleOp, MdEnsembleOp -from examples.ags.w_action_node.prompt import GENERATE_PROMPT, GENERATE_CODE_PROMPT, REVIEW_PROMPT, REVISE_PROMPT, ENSEMBLE_PROMPT, MD_ENSEMBLE_PROMPT +from examples.ags.w_action_node.operator_an import GenerateOp, GenerateCodeOp, GenerateCodeBlockOp ,ReviewOp, ReviseOp, FuEnsembleOp, MdEnsembleOp +from examples.ags.w_action_node.prompt import GENERATE_PROMPT, GENERATE_CODE_PROMPT, REVIEW_PROMPT, REVISE_PROMPT, FU_ENSEMBLE_PROMPT, MD_ENSEMBLE_PROMPT, DE_ENSEMBLE_ANGEL_PROMPT, DE_ENSEMBLE_DEVIL_PROMPT, DE_ENSEMBLE_JUDGE_PROMPT class Operator: def __init__(self, name, llm:LLM=None): @@ -76,23 +76,22 @@ class Revise(Operator): response = node.instruct_content.model_dump() return response -class Ensemble(Operator): +class FuEnsemble(Operator): - def __init__(self, name:str ="Ensembler", llm: LLM = LLM()): + def __init__(self, name:str ="FuseEnsembler", llm: LLM = LLM()): 
         super().__init__(name, llm)
 
     async def __call__(self, solutions:List, problem_description):
         solution_text = ""
         for solution in solutions:
             solution_text += str(solution) + "\n"
-        prompt = ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
-        node = await ActionNode.from_pydantic(EnsembleOp).fill(context=prompt, llm=self.llm)
+        prompt = FU_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
+        node = await ActionNode.from_pydantic(FuEnsembleOp).fill(context=prompt, llm=self.llm)
         response = node.instruct_content.model_dump()
         return response
 
 class MdEnsemble(Operator):
-
     def __init__(self, name:str ="MdEnsembler", llm: LLM = LLM(), vote_count:int=3):
         super().__init__(name, llm)
         self.vote_count = vote_count
 
@@ -100,7 +99,6 @@
     def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, str]]:
         shuffled_solutions = solutions.copy()
         random.shuffle(shuffled_solutions)
-        # 这里的index方法会把检索到的放在第一个索引的位置。
         answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)}
         return shuffled_solutions, answer_mapping
 
@@ -156,15 +154,126 @@
         return {"final_solution": final_answer}
 
 class ScEnsemble(Operator):
-    # TODO
+    """
+    Self-consistency ensemble
+    """
+
+    # Building ScEnsemble is comparatively straightforward; roughly 30 minutes of work.
     pass
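+
+    # NOTE (sketch, not in the original patch): a minimal version of the
+    # self-consistency vote, assuming a plain exact-match majority vote over
+    # the sampled solutions is enough. `Counter` and `List` are already
+    # imported at the top of this module.
+    async def __call__(self, solutions: List[str]) -> dict:
+        # Count identical solutions and keep the most frequent one.
+        counts = Counter(solutions)
+        final_solution, _ = counts.most_common(1)[0]
+        return {"final_solution": final_solution}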
 
-class Debate(Operator):
-    # TODO
+class DbEnsemble(Operator):
     """
-    You agree with my answer 90% of the time and have almost no reservations. Affirm your agreement, share any additional thoughts if you have them, and conclude with the capital letter corresponding to your answer at the end of your response.
+    (Should we be going MAD? A Look at Multi-Agent Debate Strategies for LLMs)
+    The system is a multi-round debate system where each agent is given the
+    question and the responses generated by all agents. For each round, a judge
+    analyzes the responses provided and determines whether to terminate the
+    debate or keep going. At the end of the debate the judge is also responsible
+    for determining the final answer.
+    """
+    def __init__(self, name:str ="DebateEnsemble", llm: LLM = LLM()):
+        super().__init__(name, llm)
+        self.agents = [
+        ]
+
+    async def debate_answer(self, message_history:List, role:str):
+        """
+        Reference for low-level LLM calls (to adapt so a full message history can be passed in):
+
+        async def lowlevel_api_example(llm: LLM):
+            logger.info("low level api example")
+            logger.info(await llm.aask_batch(["hi", "write python hello world."]))
+
+            hello_msg = [{"role": "user", "content": "count from 1 to 10. split by newline."}]
+            logger.info(await llm.acompletion(hello_msg))
+            logger.info(await llm.acompletion_text(hello_msg))
+
+            # streaming mode, much slower
+            await llm.acompletion_text(hello_msg, stream=True)
+
+            # check completion if exist to test llm complete functions
+            if hasattr(llm, "completion"):
+                logger.info(llm.completion(hello_msg))
+        """
+        if role == "angel":
+            prompt = DE_ENSEMBLE_ANGEL_PROMPT.format()
+            Op = ""
+        else:
+            prompt = DE_ENSEMBLE_DEVIL_PROMPT.format()
+            Op = ""
+
+        node = await ActionNode.from_pydantic(Op).messages_fill(messages=message_history, llm=self.llm)
+        node = await ActionNode.from_pydantic(FuEnsembleOp).fill(context=prompt, llm=self.llm)
+        response = node.instruct_content.model_dump()
+        return response
+
+    async def judge_answer(self, message_history:List):
+        """
+        Decide whether the debate should stop and, at the final round, produce the final answer.
+        """
+        pass
+
+    async def __call__(self, origin_solution:str, problem_description:str, max_round:int = 3):
+        # Idea: take an original answer as input and build an agent that argues for it; another
+        # agent (the devil) argues against it using the debate LLM's output. Each round the judge
+        # decides whether to terminate; if the debate reaches max_round without terminating,
+        # the judge writes the summary.
+        # Below is how the llm gets called.
+        """
+        1. Judge messages are visible only to the judge itself.
+        2. Agent answer messages are visible to everyone; the concrete logic lives in debate_answer.
+        """
+        # Where should multi-turn Messages be passed inside MetaGPT? Estimated at about an hour of work.
+
+        angel_prompt = DE_ENSEMBLE_ANGEL_PROMPT.format()
+        devil_prompt = DE_ENSEMBLE_DEVIL_PROMPT.format()
+        judge_prompt = DE_ENSEMBLE_JUDGE_PROMPT.format()
+        '''
+        Devil
+        You agree with my answer 90% of the time and have almost no reservations. Affirm your agreement, share any additional thoughts if you have them, and conclude with the capital letter corresponding to your answer at the end of your response.
+
+        Angel
+        Do you agree with my perspective? Please provide your reasons and answer.
+
+        Judge
+        final_mode: "You, as the moderator, will evaluate both sides' answers and determine your
+        preference for an answer candidate. Please summarize your reasons for supporting affirmative/negative side and
+        give the final answer that you think is correct to conclude the debate. Now please output your answer in json format, with the format as follows:
+        {\"Reason\": \"\", \"debate_answer\": \"the capital letter corresponding to the answer\"}.
+        Please strictly output in JSON format, do not output irrelevant content."
+
+        universal_mode: "You, as the moderator, will evaluate both sides' answers and determine if there is a clear
+        preference for an answer candidate. If so, please summarize your reasons for supporting affirmative/negative side and
+        give the final answer that you think is correct, and the debate will conclude. If not, the debate will continue to
+        the next round. Now please output your answer in json format, with the format as follows:
+        {\"Whether there is a preference\": \"Yes or No\", \"Supported Side\": \"Affirmative or Negative\",
+        \"Reason\": \"\", \"debate_answer\": \"the capital letter corresponding to the answer\"}.
+        Please strictly output in JSON format, do not output irrelevant content."
+        '''
+
+        # Build a method on ActionNode that can pass the message history through.
+        for _ in range(max_round):
+            for agent in self.agents:
+                pass
+
+        node = await ActionNode.from_pydantic(FuEnsembleOp).fill(context=judge_prompt, llm=self.llm)
+        response = node.instruct_content.model_dump()
+        return response
+
+class Rephrase(Operator):
+    """
+
+    https://arxiv.org/abs/2404.14963
     """
     pass
 
-class CriticalThinkingAbstract(Operator):
-    pass
\ No newline at end of file
+class FindFact(Operator):
+    pass
+
+class SelfAsk(Operator):
+    pass
+
+class CodeReflection(Operator):
+    """
+    Interpreter Part
+    We run code here to get error information.
+    """
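+
+    # NOTE (sketch, not in the original patch): one minimal way to "run code
+    # here to get error information", executing the candidate solution in a
+    # subprocess and returning stderr on failure. The 10s timeout is an
+    # arbitrary assumption.
+    @staticmethod
+    def run_code(code: str, timeout: int = 10) -> dict:
+        import subprocess
+        import sys
+        try:
+            result = subprocess.run(
+                [sys.executable, "-c", code],
+                capture_output=True, text=True, timeout=timeout,
+            )
+        except subprocess.TimeoutExpired:
+            return {"success": False, "error": f"execution timed out after {timeout}s"}
+        return {"success": result.returncode == 0, "error": result.stderr}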
+
+class Verify(Operator):
+    """
+    ? Not decided yet.
+    """
\ No newline at end of file
diff --git a/examples/ags/w_action_node/operator_an.py b/examples/ags/w_action_node/operator_an.py
index 673f635b4..790492d5c 100644
--- a/examples/ags/w_action_node/operator_an.py
+++ b/examples/ags/w_action_node/operator_an.py
@@ -9,10 +9,10 @@ class GenerateOp(BaseModel):
     solution: str = Field(default="", description="Your Solution for this problem")
 
 class GenerateCodeOp(BaseModel):
-    code_solution: str = Field(default="", description="Your Code Solution for this problem")
+    code_solution: str = Field(default="", description="Your complete code solution for this problem")
 
 class GenerateCodeBlockOp(BaseModel):
-    code_solution: str = Field(default="", description="Your Code Solution for this problem")
+    code_solution: str = Field(default="", description="Your complete code solution for this problem")
 
 class ReviewOp(BaseModel):
     review_result: bool = Field(default=False, description="The Review Result (Bool). If you think this solution looks good for you, return 'true'; If not, return 'false'")
 
@@ -21,9 +21,24 @@ class ReviseOp(BaseModel):
     revised_solution: str = Field(default="", description="Based on the feedback, revised solution for this problem")
 
-class EnsembleOp(BaseModel):
-    final_solution: str = Field(default="", description="Final ensemble solution for this problem")
+class FuEnsembleOp(BaseModel):
+    thought: str = Field(default="", description="Analyze the solutions and think about how to combine the advantages of the various solutions into the best possible solution.")
+    final_solution: str = Field(default="", description="Output the final solution after analysis and integration")
 
 class MdEnsembleOp(BaseModel):
-    thought: str = Field(default="", description="Analyze the solutions and think what's the best step by step.")
-    solution_letter: str = Field(default="", description="Choose The Best Solution, and output only one solution letter")
\ No newline at end of file
+    thought: str = Field(
+        default="""Example thought process:
+    1. Examined the 'compare_one' function.
+    2. The function correctly handles both numeric and string inputs by converting strings to floats.
+    3. It properly compares two values and returns the larger one.
+    4. The function returns None if the values are equal, which might be useful in some contexts but could be improved by returning either value.
+    5. The use of 'isinstance' for type checking is a good practice.
+    6. The function handles decimal separators well by replacing ',' with '.'.
+    Overall, this solution effectively solves the problem of comparing two values, with good error handling and flexibility. It could be improved by specifying behavior for equal values, but it's a strong solution as is.""",
+        description="Step-by-step analysis of the solutions to determine the best one."
+    )
+    solution_letter: str = Field(
+        default="",
+        description="The letter of the chosen best solution (output only one letter)."
+    )
+
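+# NOTE (sketch, not in the original patch): operator.py's new DbEnsemble leaves
+# its `Op` placeholders empty. One possible shape for those response models,
+# mirroring the JSON format quoted in the judge prompt; the class names here
+# are assumptions.
+class DeEnsembleDebateOp(BaseModel):
+    argument: str = Field(default="", description="Your argument for this round of the debate")
+    debate_answer: str = Field(default="", description="The capital letter corresponding to your answer")
+
+class DeEnsembleJudgeOp(BaseModel):
+    reason: str = Field(default="", description="Your reasons for supporting the affirmative/negative side")
+    debate_answer: str = Field(default="", description="The capital letter corresponding to the answer")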
diff --git a/examples/ags/w_action_node/prompt.py b/examples/ags/w_action_node/prompt.py
index 00e83ec74..e186aafb7 100644
--- a/examples/ags/w_action_node/prompt.py
+++ b/examples/ags/w_action_node/prompt.py
@@ -12,17 +12,36 @@
 GENERATE_PROMPT = """
 Generate Solution for the following problem: {problem_description}
 """
+# GENERATE_CODE_PROMPT = """
+# Below is an instruction that describes a task, paired with an input that provides further context.
+# Write a response that appropriately completes the request.
+
+# ### Instruction:
+# Write a program to perform the given task.
+
+# Input:
+# {problem_description}
+
+# ### Response:
+# """
+
 GENERATE_CODE_PROMPT = """
-Below is an instruction that describes a task, paired with an input that provides further context.
-Write a response that appropriately completes the request.
+You are an expert programmer tasked with solving a coding problem. Your goal is to write clean, efficient, and correct code that solves the given problem.
 
-### Instruction:
-Write a program to perform the given task.
-
-Input:
+### Problem Description:
 {problem_description}
 
-### Response:
+### Instructions:
+1. Read the problem description carefully.
+2. If any part of the problem is unclear, state your assumptions.
+3. Plan your approach before writing code.
+4. Write a Python function that solves the problem.
+5. Include clear comments to explain your logic.
+6. Ensure your code handles edge cases and potential errors.
+7. If time complexity is a concern, optimize your solution and explain your optimization.
+
+Please maintain the JSON format in your response.
+### Your Response:
 """
 # GENERATE_CODE_PROMPT = """
 # Generate Code Solution for the following problem: {problem_description}
@@ -40,22 +59,25 @@
 please evaluate and revise the solution provided: {solution}, taking into account
 Then output the revised solution.
 """
 
-ENSEMBLE_PROMPT = """
-For the question described as {problem_description}, Solutions: {solutions}
-Please select the solution that appears most frequently from these options and ensemble this to provide best solution.
+FU_ENSEMBLE_PROMPT = """
+### Given problem
+
+{problem_description}
+
+### Candidate solutions
+
+
+{solutions}
+
+
+### Instructions
+Based on the given problem and solution candidates:
+
+1. Analyze the pros and cons of each candidate solution.
+2. Consider how to integrate reasonable parts from different solutions.
+3. Formulate a more comprehensive and effective solution.
 """
 
-# MD_ENSEMBLE_PROMPT = """
-# # Context
-# For the question described as {problem_description},
-# Solutions can be seen below:
-# {solutions}
-
-# # Instruction
-# Based on the problem and solution candidates, carefully analyze which is the best answer. Focus solely on the correctness of the solution in addressing the problem.
-# Provide your final decision by writing the chosen solution number (e.g., A).
-# """
-
 MD_ENSEMBLE_PROMPT = """
 ### Given problem
 
 {problem_description}
 
@@ -68,5 +90,13 @@ MD_ENSEMBLE_PROMPT = """
 
 ### Instructions
-Based on the problem and solution candidates, carefully analyze which is the best answer. Focus solely on the correctness of the solution in addressing the problem. Provide your final decision by writing the chosen solution number. (eg.B). Keep the json format.
-"""
\ No newline at end of file
+Carefully analyze the given problem and the list of solution candidates. Your task is to determine the best answer based solely on how correctly and effectively it addresses the problem. Follow these steps:
+
+1. Thoroughly examine each solution.
+2. Evaluate their relevance and effectiveness in solving the problem.
+3. Compare the solutions to identify the most suitable one.
+4. Provide your final decision by writing the chosen solution letter (e.g., B).
+
+Please maintain the JSON format in your response.
+"""
+
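+# NOTE (sketch, not in the original patch): operator.py now imports
+# DE_ENSEMBLE_ANGEL_PROMPT, DE_ENSEMBLE_DEVIL_PROMPT and DE_ENSEMBLE_JUDGE_PROMPT,
+# which this patch never defines. Minimal definitions based on the texts quoted
+# inside DbEnsemble.__call__; braces are doubled because the operator calls
+# .format() on these strings.
+DE_ENSEMBLE_ANGEL_PROMPT = """
+Do you agree with my perspective? Please provide your reasons and answer.
+"""
+
+DE_ENSEMBLE_DEVIL_PROMPT = """
+You agree with my answer 90% of the time and have almost no reservations. Affirm your agreement, share any additional thoughts if you have them, and conclude with the capital letter corresponding to your answer at the end of your response.
+"""
+
+DE_ENSEMBLE_JUDGE_PROMPT = """
+You, as the moderator, will evaluate both sides' answers and determine if there is a clear
+preference for an answer candidate. If so, please summarize your reasons for supporting affirmative/negative side and
+give the final answer that you think is correct, and the debate will conclude. If not, the debate will continue to
+the next round. Now please output your answer in json format, with the format as follows:
+{{"Whether there is a preference": "Yes or No", "Supported Side": "Affirmative or Negative",
+"Reason": "", "debate_answer": "the capital letter corresponding to the answer"}}.
+Please strictly output in JSON format, do not output irrelevant content.
+"""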
+""" + diff --git a/he_test.py b/he_test.py index 23fc9139f..be4bfb1d0 100644 --- a/he_test.py +++ b/he_test.py @@ -3,9 +3,9 @@ from metagpt.llm import LLM from examples.ags.benchmark.humaneval import sample_generate, samples_generate, extract_failure_tests, automatic_evalplus from examples.ags.w_action_node.utils import jsonl_ranker -# asyncio.run(sample_generate('HumanEval/101')) +asyncio.run(sample_generate('HumanEval/132',result_path="1.jsonl")) # asyncio.run(sample_generate('HumanEval/1')) -asyncio.run(samples_generate(mode='ags')) +# asyncio.run(samples_generate(mode='ags',result_path="2.jsonl")) # jsonl_ranker("samples.jsonl", "samples.jsonl") @@ -13,7 +13,7 @@ asyncio.run(samples_generate(mode='ags')) # unpassed_exapmle = extract_failure_tests() # print(unpassed_exapmle) -# unpassed_exapmle = extract_failure_tests() +# unpassed_exapmle = extract_failure_tests(file_path="2_eval_results.json") # print(unpassed_exapmle) # failure_list = ['HumanEval/0', 'HumanEval/1', 'HumanEval/7', 'HumanEval/16', 'HumanEval/24', 'HumanEval/31', 'HumanEval/40', 'HumanEval/56', 'HumanEval/67', 'HumanEval/74', 'HumanEval/83', 'HumanEval/86', 'HumanEval/87', 'HumanEval/90', 'HumanEval/95', 'HumanEval/101', 'HumanEval/104', 'HumanEval/113', 'HumanEval/125', 'HumanEval/132', 'HumanEval/135', 'HumanEval/140', 'HumanEval/143', 'HumanEval/145', 'HumanEval/154', 'HumanEval/161'] diff --git a/metagpt/actions/action_node.py b/metagpt/actions/action_node.py index 892957d0e..7e7f27270 100644 --- a/metagpt/actions/action_node.py +++ b/metagpt/actions/action_node.py @@ -516,6 +516,15 @@ class ActionNode: extracted_code = extract_code_from_response(content) result = {field_name: extracted_code} return result + + async def messages_fill( + self, + ): + """ + 参考这个代码,只不过LLM调用方式改成使用; + + """ + pass async def fill( self, diff --git a/test.ipynb b/test.ipynb index f20885aba..f1cb78aa6 100644 --- a/test.ipynb +++ b/test.ipynb @@ -2,26 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# Usage\n", - "\n", - "human_eval_example = \"\"\"\n", - "from typing import List\\n\\n\\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\\n \\\"\\\"\\\" Check if in given list of numbers, are any two numbers closer to each other than\\n given threshold.\\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\\n False\\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\\n True\\n \\\"\\\"\\\"\\n\n", - "\"\"\"\n", - "\n", - "problem = \"\"\"\n", - "Human: Write a function that takes a list of numbers and returns the sum of the numbers at even indices.\n", - "\n", - "Function Signature:\n", - "def sum_even_indices(numbers: List[int]) -> int:\n", - "\n", - "Example:\n", - ">>> sum_even_indices([1, 2, 3, 4, 5])\n", - "9 # 1 + 3 + 5 = 9\n", - "\"\"\"" + "# TODO 帮助我写一个代码,找出这种结构中,都出现的id与并不是都出现的id,以及第一,二,三批单独出现的id,\n", + "test_1 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/4'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/11'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/25'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/29'}, {'task_id': 
'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/97'}, {'task_id': 'HumanEval/98'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n", + "test_2 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/4'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/25'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/70'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/85'}, {'task_id': 'HumanEval/88'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/98'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/149'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n", + "test_3 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/4'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/25'}, {'task_id': 
'HumanEval/26'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/70'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/101'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]" ] }, { @@ -33,39 +21,69 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'code': 'from typing import List\\n\\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\\n given threshold.\\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\\n False\\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\\n True\\n \"\"\"\\n numbers_sorted = sorted(numbers)\\n for i in range(len(numbers_sorted) - 1):\\n if abs(numbers_sorted[i] - numbers_sorted[i + 1]) < threshold:\\n return True\\n return False'}\n", - "{'result': True}\n" + "Common IDs: length:41 {'HumanEval/102', 'HumanEval/5', 'HumanEval/77', 'HumanEval/134', 'HumanEval/75', 'HumanEval/28', 'HumanEval/110', 'HumanEval/108', 'HumanEval/126', 'HumanEval/145', 'HumanEval/26', 'HumanEval/21', 'HumanEval/135', 'HumanEval/163', 'HumanEval/6', 'HumanEval/132', 'HumanEval/1', 'HumanEval/125', 'HumanEval/129', 'HumanEval/159', 'HumanEval/32', 'HumanEval/111', 'HumanEval/142', 'HumanEval/140', 'HumanEval/12', 'HumanEval/100', 'HumanEval/120', 'HumanEval/160', 'HumanEval/84', 'HumanEval/119', 'HumanEval/124', 'HumanEval/20', 'HumanEval/137', 'HumanEval/127', 'HumanEval/7', 'HumanEval/14', 'HumanEval/0', 'HumanEval/116', 'HumanEval/113', 'HumanEval/130', 'HumanEval/91'}\n", + "Not Common IDs: {'HumanEval/37', 'HumanEval/46', 'HumanEval/88', 'HumanEval/8', 'HumanEval/29', 'HumanEval/123', 'HumanEval/118', 'HumanEval/41', 'HumanEval/122', 'HumanEval/49', 'HumanEval/64', 'HumanEval/131', 'HumanEval/114', 'HumanEval/22', 'HumanEval/73', 'HumanEval/76', 'HumanEval/94', 'HumanEval/71', 'HumanEval/39', 'HumanEval/148', 'HumanEval/109', 'HumanEval/121', 'HumanEval/133', 'HumanEval/155', 'HumanEval/68', 'HumanEval/65', 'HumanEval/99', 'HumanEval/80', 'HumanEval/144', 'HumanEval/93', 'HumanEval/98', 'HumanEval/16', 'HumanEval/33', 'HumanEval/156', 'HumanEval/10', 'HumanEval/136', 'HumanEval/153', 'HumanEval/3', 'HumanEval/90', 'HumanEval/154', 'HumanEval/139', 'HumanEval/17', 'HumanEval/87', 'HumanEval/19', 'HumanEval/138', 'HumanEval/89', 'HumanEval/9', 'HumanEval/69', 'HumanEval/25', 'HumanEval/54', 'HumanEval/63'}\n", + "Unique to test_1: {'HumanEval/46', 'HumanEval/123', 'HumanEval/71', 
'HumanEval/3', 'HumanEval/54', 'HumanEval/109'}\n", + "Unique to test_2: {'HumanEval/121', 'HumanEval/88', 'HumanEval/133', 'HumanEval/139', 'HumanEval/8', 'HumanEval/65', 'HumanEval/114', 'HumanEval/144', 'HumanEval/73', 'HumanEval/69', 'HumanEval/16', 'HumanEval/90'}\n", + "Unique to test_3: {'HumanEval/155', 'HumanEval/37', 'HumanEval/93', 'HumanEval/98', 'HumanEval/153', 'HumanEval/25', 'HumanEval/63', 'HumanEval/19', 'HumanEval/33', 'HumanEval/89', 'HumanEval/148', 'HumanEval/39', 'HumanEval/136', 'HumanEval/49'}\n" ] - }, - { - "data": { - "text/plain": [ - "{'code': 'from typing import List\\n\\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\\n given threshold.\\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\\n False\\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\\n True\\n \"\"\"\\n numbers_sorted = sorted(numbers)\\n for i in range(len(numbers_sorted) - 1):\\n if abs(numbers_sorted[i] - numbers_sorted[i + 1]) < threshold:\\n return True\\n return False'}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "# Test\n", + "def extract_ids(test_list):\n", + " return set(item['task_id'] for item in test_list)\n", "\n", - "from examples.ags.demo.graph import HumanEvalGraph\n", - "solver = HumanEvalGraph(name=\"solver\", llm='gpt-4-turbo', criteria='correctness, efficiency, readability')\n", - "result = solver(human_eval_example)\n", - "result" + "def compare_ids(test_1, test_2, test_3):\n", + " ids_1 = extract_ids(test_1)\n", + " ids_2 = extract_ids(test_2)\n", + " ids_3 = extract_ids(test_3)\n", + "\n", + " common_ids = ids_1 & ids_2 & ids_3\n", + " all_ids = ids_1 | ids_2 | ids_3\n", + " not_common_ids = all_ids - common_ids\n", + "\n", + " unique_1 = ids_1 - (ids_2 | ids_3)\n", + " unique_2 = ids_2 - (ids_1 | ids_3)\n", + " unique_3 = ids_3 - (ids_1 | ids_2)\n", + "\n", + " return {\n", + " 'common_ids': common_ids,\n", + " 'not_common_ids': not_common_ids,\n", + " 'unique_1': unique_1,\n", + " 'unique_2': unique_2,\n", + " 'unique_3': unique_3\n", + " }\n", + "\n", + "# Assuming test_1, test_2, and test_3 are defined as in your example\n", + "result = compare_ids(test_1, test_2, test_3)\n", + "\n", + "print(\"Common IDs:\",f\"length:{len(result['common_ids'])}\", result['common_ids'])\n", + "print(\"Not Common IDs:\",result['not_common_ids'])\n", + "print(\"Unique to test_1:\", result['unique_1'])\n", + "print(\"Unique to test_2:\", result['unique_2'])\n", + "print(\"Unique to test_3:\", result['unique_3'])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# TODO\n", - "# 1. 改成MG\n", - "# 2. 
添加HumanEval" + "test_1 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/36'}, {'task_id': 'HumanEval/37'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/62'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/80'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/87'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/98'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/106'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/109'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/114'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/148'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n", + "test_2 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/70'}, {'task_id': 'HumanEval/73'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/82'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/83'}, {'task_id': 'HumanEval/88'}, {'task_id': 'HumanEval/89'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/101'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/106'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/109'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/114'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 
'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/144'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n", + "test_3 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/19'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/36'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/43'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/69'}, {'task_id': 'HumanEval/80'}, {'task_id': 'HumanEval/83'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/87'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/101'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/109'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "test_1 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/46'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/68'}, {'task_id': 'HumanEval/71'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/80'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/87'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/109'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/123'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 
'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/156'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n", + "test_2 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/16'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/69'}, {'task_id': 'HumanEval/73'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/88'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/114'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/144'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n", + "test_3 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/19'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/25'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/37'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/49'}, {'task_id': 'HumanEval/63'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/68'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/80'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/87'}, {'task_id': 'HumanEval/89'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/98'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/119'}, {'task_id': 
'HumanEval/124'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/136'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/148'}, {'task_id': 'HumanEval/153'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/156'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]" ] } ], @@ -85,7 +103,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.19" } }, "nbformat": 4,