diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py index 9f63406ef..b63b8889e 100644 --- a/examples/ags/benchmark/humaneval.py +++ b/examples/ags/benchmark/humaneval.py @@ -19,10 +19,14 @@ generate_code_block = GenerateCodeBlock(llm=LLM()) solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5) -async def sample_generate(id, result_path:str="samples.jsonl"): +async def sample_generate(id, result_path:str="samples.jsonl",mode:str="ags"): case = get_human_eval_plus()[f"{id}"] - solution_result = await solver(case['prompt'],ensemble_count=5) - sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution']) + if mode == "ags": + solution_result = await solver(case['prompt'],ensemble_count=5) + sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution']) + else: + solution_result = await generate_code_block(case['prompt']) + sample_dict = dict(task_id=case['task_id'], solution=solution_result['code_solution']) with open(result_path, mode='a') as f: f.write(json.dumps(sample_dict) + '\n') jsonl_ranker(result_path, result_path) @@ -62,11 +66,29 @@ async def samples_generate(mode:str, result_path:str="samples.jsonl"): # TODO 这个地方还是不够自动化 if failed_tasks: - for task_id in failed_tasks: - try: - await sample_generate(task_id) - except Exception as e: - print(f"failure {task_id}") + print(failed_tasks) + if mode == 'llm': + for task_id in failed_tasks: + case = get_human_eval_plus()[task_id] + for _ in range(3): + try: + solution_result = await generate_code_block(case['prompt']) + task_dict = { + 'task_id': case['task_id'], + 'solution': solution_result['code_solution'] + } + with open(result_path, mode='a') as f: + f.write(json.dumps(task_dict) + '\n') + failed_tasks.remove(task_id) + break + except Exception as e: + print(f"{e} \n failure {task_id}") + elif mode == "ags": + for task_id in failed_tasks: + try: + await sample_generate(task_id,result_path) + except Exception as e: + print(f"failure {task_id}") jsonl_ranker(result_path, result_path) if not failed_tasks: diff --git a/examples/ags/w_action_node/graph.py b/examples/ags/w_action_node/graph.py index 0766ad94a..7b029b5a9 100644 --- a/examples/ags/w_action_node/graph.py +++ b/examples/ags/w_action_node/graph.py @@ -32,10 +32,16 @@ class HumanEvalGraph(Graph): async def __call__(self, problem:str, ensemble_count:int = 3): solution_list = [] for _ in range(ensemble_count): - solution = await self.generate_code(problem) - # solution = await self.generate_code_block(problem) - solution = solution.get('code_solution') - solution_list.append(solution) + for retry_count in range(5): + try: + # solution = await self.generate_code(problem) + solution = await self.generate_code_block(problem) + solution = solution.get('code_solution') + solution_list.append(solution) + break + except Exception as e: + print(e) + # solution list 有5个 solution = await self.mdensemble("code", solution_list, problem) return solution diff --git a/examples/ags/w_action_node/operator.py b/examples/ags/w_action_node/operator.py index 5b5a48875..fe9ee5da7 100644 --- a/examples/ags/w_action_node/operator.py +++ b/examples/ags/w_action_node/operator.py @@ -127,8 +127,8 @@ class MdEnsemble(Operator): continue solutions = updated_solutions updated_length = len(solutions) - print(f"Original number of solutions: {original_length}") - print(f"Updated number of solutions: {updated_length}") + # print(f"Original number of solutions: {original_length}") + # print(f"Updated number of solutions: {updated_length}") if updated_length == 1: return {"final_solution": solutions[0]} for _ in range(self.vote_count): @@ -136,7 +136,7 @@ class MdEnsemble(Operator): solution_text = "" for index, solution in enumerate(shuffled_solutions): - solution_text += f"{chr(65 + index)}: {str(solution)}\n" + solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n" prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description) node = await ActionNode.from_pydantic(MdEnsembleOp).fill(context=prompt, llm=self.llm) diff --git a/examples/ags/w_action_node/operator_an.py b/examples/ags/w_action_node/operator_an.py index 7560614ea..928c0f67a 100644 --- a/examples/ags/w_action_node/operator_an.py +++ b/examples/ags/w_action_node/operator_an.py @@ -39,6 +39,6 @@ class MdEnsembleOp(BaseModel): ) solution_letter: str = Field( default="", - description="The letter of the chosen best solution (output only one letter)." + description="The letter of the chosen best solution (only one letter)." ) diff --git a/examples/ags/w_action_node/prompt.py b/examples/ags/w_action_node/prompt.py index dc92455aa..9822fab49 100644 --- a/examples/ags/w_action_node/prompt.py +++ b/examples/ags/w_action_node/prompt.py @@ -83,23 +83,15 @@ Based on the given problem and solution candidates: """ MD_ENSEMBLE_PROMPT = """ -### Given problem - +You are given a coding problem: {problem_description} -### We've got a list of solutions - - +Here is a list of possible solutions to the problem: {solutions} - -### Instructions -Carefully analyze the given problem and the list of solution candidates. Your task is to determine the best answer based solely on how correctly and effectively it addresses the problem. Follow these steps: - -1. Thoroughly examine each solution. -2. Evaluate their relevance and effectiveness in solving the problem. -3. Compare the solutions to identify the most suitable one. -4. Provide your final decision by writing the chosen solution letter (e.g., B). +Using the inputs above, your goal is to choose the best solution to the code contest problem. +Don't just pick the most efficient solution. The main consideration is that the solution can fully solve the problem in a correct and robust manner. +Provide your final decision by writing the chosen solution letter (e.g., B). Please maintain the JSON format in your response. """ diff --git a/he_test.py b/he_test.py index be4bfb1d0..a0cbed79f 100644 --- a/he_test.py +++ b/he_test.py @@ -3,20 +3,18 @@ from metagpt.llm import LLM from examples.ags.benchmark.humaneval import sample_generate, samples_generate, extract_failure_tests, automatic_evalplus from examples.ags.w_action_node.utils import jsonl_ranker -asyncio.run(sample_generate('HumanEval/132',result_path="1.jsonl")) -# asyncio.run(sample_generate('HumanEval/1')) -# asyncio.run(samples_generate(mode='ags',result_path="2.jsonl")) +# 132 141 136 80 73 +# asyncio.run(sample_generate('HumanEval/118',result_path="llm_based_4.jsonl",mode="llm")) +# asyncio.run(samples_generate(mode='ags',result_path="ags_based_1.jsonl")) # jsonl_ranker("samples.jsonl", "samples.jsonl") - -# if automatic_evalplus(): -# unpassed_exapmle = extract_failure_tests() -# print(unpassed_exapmle) +result_path = "ags_based_2.jsonl" +if automatic_evalplus(result_path): + unpassed_exapmle = extract_failure_tests(result_path[:-6]+"_eval_results.json") + print(unpassed_exapmle) # unpassed_exapmle = extract_failure_tests(file_path="2_eval_results.json") # print(unpassed_exapmle) -# failure_list = ['HumanEval/0', 'HumanEval/1', 'HumanEval/7', 'HumanEval/16', 'HumanEval/24', 'HumanEval/31', 'HumanEval/40', 'HumanEval/56', 'HumanEval/67', 'HumanEval/74', 'HumanEval/83', 'HumanEval/86', 'HumanEval/87', 'HumanEval/90', 'HumanEval/95', 'HumanEval/101', 'HumanEval/104', 'HumanEval/113', 'HumanEval/125', 'HumanEval/132', 'HumanEval/135', 'HumanEval/140', 'HumanEval/143', 'HumanEval/145', 'HumanEval/154', 'HumanEval/161'] - # for example in failure_list: # asyncio.run(sample_generate(example)) \ No newline at end of file diff --git a/test.ipynb b/test.ipynb index f1cb78aa6..c137a9a2e 100644 --- a/test.ipynb +++ b/test.ipynb @@ -7,25 +7,27 @@ "outputs": [], "source": [ "# TODO 帮助我写一个代码,找出这种结构中,都出现的id与并不是都出现的id,以及第一,二,三批单独出现的id,\n", - "test_1 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/4'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/11'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/25'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/97'}, {'task_id': 'HumanEval/98'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n", - "test_2 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/4'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/25'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/70'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/85'}, {'task_id': 'HumanEval/88'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/98'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/149'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n", - "test_3 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/4'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/25'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/70'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/101'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]" + "test_1 = [{'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/11'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/46'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/63'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/95'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n", + "test_2 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/153'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n", + "test_3 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/83'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/97'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/128'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/149'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n", + "test_4 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/36'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/43'}, {'task_id': 'HumanEval/46'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/73'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/97'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/128'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/148'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/161'}, {'task_id': 'HumanEval/163'}]" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Common IDs: length:41 {'HumanEval/102', 'HumanEval/5', 'HumanEval/77', 'HumanEval/134', 'HumanEval/75', 'HumanEval/28', 'HumanEval/110', 'HumanEval/108', 'HumanEval/126', 'HumanEval/145', 'HumanEval/26', 'HumanEval/21', 'HumanEval/135', 'HumanEval/163', 'HumanEval/6', 'HumanEval/132', 'HumanEval/1', 'HumanEval/125', 'HumanEval/129', 'HumanEval/159', 'HumanEval/32', 'HumanEval/111', 'HumanEval/142', 'HumanEval/140', 'HumanEval/12', 'HumanEval/100', 'HumanEval/120', 'HumanEval/160', 'HumanEval/84', 'HumanEval/119', 'HumanEval/124', 'HumanEval/20', 'HumanEval/137', 'HumanEval/127', 'HumanEval/7', 'HumanEval/14', 'HumanEval/0', 'HumanEval/116', 'HumanEval/113', 'HumanEval/130', 'HumanEval/91'}\n", - "Not Common IDs: {'HumanEval/37', 'HumanEval/46', 'HumanEval/88', 'HumanEval/8', 'HumanEval/29', 'HumanEval/123', 'HumanEval/118', 'HumanEval/41', 'HumanEval/122', 'HumanEval/49', 'HumanEval/64', 'HumanEval/131', 'HumanEval/114', 'HumanEval/22', 'HumanEval/73', 'HumanEval/76', 'HumanEval/94', 'HumanEval/71', 'HumanEval/39', 'HumanEval/148', 'HumanEval/109', 'HumanEval/121', 'HumanEval/133', 'HumanEval/155', 'HumanEval/68', 'HumanEval/65', 'HumanEval/99', 'HumanEval/80', 'HumanEval/144', 'HumanEval/93', 'HumanEval/98', 'HumanEval/16', 'HumanEval/33', 'HumanEval/156', 'HumanEval/10', 'HumanEval/136', 'HumanEval/153', 'HumanEval/3', 'HumanEval/90', 'HumanEval/154', 'HumanEval/139', 'HumanEval/17', 'HumanEval/87', 'HumanEval/19', 'HumanEval/138', 'HumanEval/89', 'HumanEval/9', 'HumanEval/69', 'HumanEval/25', 'HumanEval/54', 'HumanEval/63'}\n", - "Unique to test_1: {'HumanEval/46', 'HumanEval/123', 'HumanEval/71', 'HumanEval/3', 'HumanEval/54', 'HumanEval/109'}\n", - "Unique to test_2: {'HumanEval/121', 'HumanEval/88', 'HumanEval/133', 'HumanEval/139', 'HumanEval/8', 'HumanEval/65', 'HumanEval/114', 'HumanEval/144', 'HumanEval/73', 'HumanEval/69', 'HumanEval/16', 'HumanEval/90'}\n", - "Unique to test_3: {'HumanEval/155', 'HumanEval/37', 'HumanEval/93', 'HumanEval/98', 'HumanEval/153', 'HumanEval/25', 'HumanEval/63', 'HumanEval/19', 'HumanEval/33', 'HumanEval/89', 'HumanEval/148', 'HumanEval/39', 'HumanEval/136', 'HumanEval/49'}\n" + "Common IDs: length:25 {'HumanEval/32', 'HumanEval/130', 'HumanEval/127', 'HumanEval/100', 'HumanEval/132', 'HumanEval/115', 'HumanEval/129', 'HumanEval/75', 'HumanEval/76', 'HumanEval/108', 'HumanEval/140', 'HumanEval/119', 'HumanEval/41', 'HumanEval/155', 'HumanEval/145', 'HumanEval/93', 'HumanEval/120', 'HumanEval/111', 'HumanEval/126', 'HumanEval/163', 'HumanEval/134', 'HumanEval/142', 'HumanEval/125', 'HumanEval/137', 'HumanEval/133'}\n", + "Not Common IDs: {'HumanEval/122', 'HumanEval/6', 'HumanEval/74', 'HumanEval/81', 'HumanEval/83', 'HumanEval/90', 'HumanEval/160', 'HumanEval/36', 'HumanEval/73', 'HumanEval/94', 'HumanEval/95', 'HumanEval/14', 'HumanEval/148', 'HumanEval/29', 'HumanEval/39', 'HumanEval/64', 'HumanEval/46', 'HumanEval/102', 'HumanEval/26', 'HumanEval/153', 'HumanEval/10', 'HumanEval/161', 'HumanEval/139', 'HumanEval/11', 'HumanEval/159', 'HumanEval/54', 'HumanEval/110', 'HumanEval/131', 'HumanEval/149', 'HumanEval/1', 'HumanEval/43', 'HumanEval/128', 'HumanEval/97', 'HumanEval/118', 'HumanEval/135', 'HumanEval/77', 'HumanEval/121', 'HumanEval/154', 'HumanEval/113', 'HumanEval/63', 'HumanEval/138', 'HumanEval/91', 'HumanEval/84'}\n", + "Unique to test_1: {'HumanEval/11', 'HumanEval/154', 'HumanEval/63', 'HumanEval/90', 'HumanEval/94', 'HumanEval/95'}\n", + "Unique to test_2: {'HumanEval/139', 'HumanEval/102', 'HumanEval/160', 'HumanEval/153', 'HumanEval/138'}\n", + "Unique to test_3: {'HumanEval/149', 'HumanEval/29', 'HumanEval/83', 'HumanEval/26'}\n", + "Unique to test_4: {'HumanEval/14', 'HumanEval/148', 'HumanEval/73', 'HumanEval/36', 'HumanEval/161', 'HumanEval/43'}\n" ] } ], @@ -33,40 +35,44 @@ "def extract_ids(test_list):\n", " return set(item['task_id'] for item in test_list)\n", "\n", - "def compare_ids(test_1, test_2, test_3):\n", + "def compare_ids(test_1, test_2, test_3, test_4):\n", " ids_1 = extract_ids(test_1)\n", " ids_2 = extract_ids(test_2)\n", " ids_3 = extract_ids(test_3)\n", + " ids_4 = extract_ids(test_4)\n", "\n", - " common_ids = ids_1 & ids_2 & ids_3\n", - " all_ids = ids_1 | ids_2 | ids_3\n", + " common_ids = ids_1 & ids_2 & ids_3 & ids_4\n", + " all_ids = ids_1 | ids_2 | ids_3 | ids_4\n", " not_common_ids = all_ids - common_ids\n", "\n", - " unique_1 = ids_1 - (ids_2 | ids_3)\n", - " unique_2 = ids_2 - (ids_1 | ids_3)\n", - " unique_3 = ids_3 - (ids_1 | ids_2)\n", + " unique_1 = ids_1 - (ids_2 | ids_3 | ids_4)\n", + " unique_2 = ids_2 - (ids_1 | ids_3 | ids_4)\n", + " unique_3 = ids_3 - (ids_1 | ids_2 | ids_4)\n", + " unique_4 = ids_4 - (ids_1 | ids_2 | ids_3)\n", "\n", " return {\n", " 'common_ids': common_ids,\n", " 'not_common_ids': not_common_ids,\n", " 'unique_1': unique_1,\n", " 'unique_2': unique_2,\n", - " 'unique_3': unique_3\n", + " 'unique_3': unique_3,\n", + " 'unique_4': unique_4\n", " }\n", "\n", "# Assuming test_1, test_2, and test_3 are defined as in your example\n", - "result = compare_ids(test_1, test_2, test_3)\n", + "result = compare_ids(test_1, test_2, test_3, test_4)\n", "\n", "print(\"Common IDs:\",f\"length:{len(result['common_ids'])}\", result['common_ids'])\n", "print(\"Not Common IDs:\",result['not_common_ids'])\n", "print(\"Unique to test_1:\", result['unique_1'])\n", "print(\"Unique to test_2:\", result['unique_2'])\n", - "print(\"Unique to test_3:\", result['unique_3'])" + "print(\"Unique to test_3:\", result['unique_3'])\n", + "print(\"Unique to test_4:\", result['unique_4'])" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -77,7 +83,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -85,6 +91,132 @@ "test_2 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/16'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/69'}, {'task_id': 'HumanEval/73'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/88'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/114'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/144'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n", "test_3 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/19'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/25'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/37'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/49'}, {'task_id': 'HumanEval/63'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/68'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/80'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/87'}, {'task_id': 'HumanEval/89'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/98'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/136'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/148'}, {'task_id': 'HumanEval/153'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/156'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]" ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "from typing import List\n", + "\n", + "\n", + "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n", + " \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n", + " given threshold.\n", + " >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n", + " False\n", + " >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n", + " True\n", + " \"\"\"\n", + " for i in range(len(numbers)):\n", + " for j in range(i + 1, len(numbers)):\n", + " if abs(numbers[i] - numbers[j]) < threshold:\n", + " return True\n", + " return False\n", + "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n", + " \n", + " # Sort the numbers in ascending order\n", + " numbers.sort()\n", + " \n", + " # Iterate through the numbers and check the difference between adjacent numbers\n", + " for i in range(len(numbers) - 1):\n", + " if abs(numbers[i] - numbers[i+1]) < threshold:\n", + " return True\n", + " \n", + " # If no adjacent numbers are closer than the threshold, return False\n", + " return False\n", + "\n", + "\n", + " sorted_numbers = sorted(numbers)\n", + " for i in range(len(sorted_numbers) - 1):\n", + " if sorted_numbers[i + 1] - sorted_numbers[i] < threshold:\n", + " return True\n", + " return False\n", + "\n", + "\n", + "\n", + "\n", + "METADATA = {\n", + " 'author': 'jt',\n", + " 'dataset': 'test'\n", + "}\n", + "\n", + "\n", + "def check(candidate):\n", + " assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n", + " assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n", + " assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n", + " assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n", + " assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n", + " assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n", + " assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "'\\n assert isinstance(threshold, float) and threshold > 0, \"invalid inputs\" # $_CONTRACT_$\\n assert isinstance(numbers, list), \"invalid inputs\" # $_CONTRACT_$\\n assert all([isinstance(v, (int, float)) for v in numbers]), \"invalid inputs\" # $_CONTRACT_$\\n'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from evalplus.data import get_human_eval_plus, write_jsonl\n", + "\n", + "humaneval = get_human_eval_plus()\n", + "d_result = {\"task_id\": \"HumanEval/0\", \"solution\": \"from typing import List\\n\\n\\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\\n \\\"\\\"\\\" Check if in given list of numbers, are any two numbers closer to each other than\\n given threshold.\\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\\n False\\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\\n True\\n \\\"\\\"\\\"\\n for i in range(len(numbers)):\\n for j in range(i + 1, len(numbers)):\\n if abs(numbers[i] - numbers[j]) < threshold:\\n return True\\n return False\"}\n", + "result = {\"task_id\": \"HumanEval/0\", \"solution\": \"def has_close_elements(numbers: List[float], threshold: float) -> bool:\\n \\n # Sort the numbers in ascending order\\n numbers.sort()\\n \\n # Iterate through the numbers and check the difference between adjacent numbers\\n for i in range(len(numbers) - 1):\\n if abs(numbers[i] - numbers[i+1]) < threshold:\\n return True\\n \\n # If no adjacent numbers are closer than the threshold, return False\\n return False\"}\n", + "print(d_result[\"solution\"])\n", + "print(result[\"solution\"])\n", + "d_result = \n", + "result = \n", + "print(d_result[\"solution\"])\n", + "print(result[\"solution\"])\n", + "print(humaneval['HumanEval/0']['canonical_solution'])\n", + "print(humaneval['HumanEval/0']['test'])\n", + "humaneval['HumanEval/0']['contract']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List\n", + "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n", + " \n", + " # Sort the numbers in ascending order\n", + " numbers.sort()\n", + " \n", + " # Iterate through the numbers and check the difference between adjacent numbers\n", + " for i in range(len(numbers) - 1):\n", + " if abs(numbers[i] - numbers[i+1]) < threshold:\n", + " return True\n", + " \n", + " # If no adjacent numbers are closer than the threshold, return False\n", + " return False\n", + "\n", + "def check(candidate):\n", + " assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n", + " assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n", + " assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n", + " assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n", + " assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n", + " assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n", + " assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n", + "\n", + "check(has_close_elements)" + ] } ], "metadata": {