This commit is contained in:
didi 2024-07-17 23:08:41 +08:00
parent e0955c5bf9
commit 89b0c4ce30
7 changed files with 208 additions and 58 deletions

View file

@ -19,10 +19,14 @@ generate_code_block = GenerateCodeBlock(llm=LLM())
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5)
async def sample_generate(id, result_path:str="samples.jsonl"):
async def sample_generate(id, result_path:str="samples.jsonl",mode:str="ags"):
case = get_human_eval_plus()[f"{id}"]
solution_result = await solver(case['prompt'],ensemble_count=5)
sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
if mode == "ags":
solution_result = await solver(case['prompt'],ensemble_count=5)
sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
else:
solution_result = await generate_code_block(case['prompt'])
sample_dict = dict(task_id=case['task_id'], solution=solution_result['code_solution'])
with open(result_path, mode='a') as f:
f.write(json.dumps(sample_dict) + '\n')
jsonl_ranker(result_path, result_path)
@ -62,11 +66,29 @@ async def samples_generate(mode:str, result_path:str="samples.jsonl"):
# TODO 这个地方还是不够自动化
if failed_tasks:
for task_id in failed_tasks:
try:
await sample_generate(task_id)
except Exception as e:
print(f"failure {task_id}")
print(failed_tasks)
if mode == 'llm':
for task_id in failed_tasks:
case = get_human_eval_plus()[task_id]
for _ in range(3):
try:
solution_result = await generate_code_block(case['prompt'])
task_dict = {
'task_id': case['task_id'],
'solution': solution_result['code_solution']
}
with open(result_path, mode='a') as f:
f.write(json.dumps(task_dict) + '\n')
failed_tasks.remove(task_id)
break
except Exception as e:
print(f"{e} \n failure {task_id}")
elif mode == "ags":
for task_id in failed_tasks:
try:
await sample_generate(task_id,result_path)
except Exception as e:
print(f"failure {task_id}")
jsonl_ranker(result_path, result_path)
if not failed_tasks:

View file

@ -32,10 +32,16 @@ class HumanEvalGraph(Graph):
async def __call__(self, problem:str, ensemble_count:int = 3):
solution_list = []
for _ in range(ensemble_count):
solution = await self.generate_code(problem)
# solution = await self.generate_code_block(problem)
solution = solution.get('code_solution')
solution_list.append(solution)
for retry_count in range(5):
try:
# solution = await self.generate_code(problem)
solution = await self.generate_code_block(problem)
solution = solution.get('code_solution')
solution_list.append(solution)
break
except Exception as e:
print(e)
# solution list 有5个
solution = await self.mdensemble("code", solution_list, problem)
return solution

View file

@ -127,8 +127,8 @@ class MdEnsemble(Operator):
continue
solutions = updated_solutions
updated_length = len(solutions)
print(f"Original number of solutions: {original_length}")
print(f"Updated number of solutions: {updated_length}")
# print(f"Original number of solutions: {original_length}")
# print(f"Updated number of solutions: {updated_length}")
if updated_length == 1:
return {"final_solution": solutions[0]}
for _ in range(self.vote_count):
@ -136,7 +136,7 @@ class MdEnsemble(Operator):
solution_text = ""
for index, solution in enumerate(shuffled_solutions):
solution_text += f"{chr(65 + index)}: {str(solution)}\n"
solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"
prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
node = await ActionNode.from_pydantic(MdEnsembleOp).fill(context=prompt, llm=self.llm)

View file

@ -39,6 +39,6 @@ class MdEnsembleOp(BaseModel):
)
solution_letter: str = Field(
default="",
description="The letter of the chosen best solution (output only one letter)."
description="The letter of the chosen best solution (only one letter)."
)

View file

@ -83,23 +83,15 @@ Based on the given problem and solution candidates:
"""
MD_ENSEMBLE_PROMPT = """
### Given problem
You are given a coding problem:
{problem_description}
### We've got a list of solutions
<solutions>
Here is a list of possible solutions to the problem:
{solutions}
</solutions>
### Instructions
Carefully analyze the given problem and the list of solution candidates. Your task is to determine the best answer based solely on how correctly and effectively it addresses the problem. Follow these steps:
1. Thoroughly examine each solution.
2. Evaluate their relevance and effectiveness in solving the problem.
3. Compare the solutions to identify the most suitable one.
4. Provide your final decision by writing the chosen solution letter (e.g., B).
Using the inputs above, your goal is to choose the best solution to the code contest problem.
Don't just pick the most efficient solution. The main consideration is that the solution can fully solve the problem in a correct and robust manner.
Provide your final decision by writing the chosen solution letter (e.g., B).
Please maintain the JSON format in your response.
"""

View file

@ -3,20 +3,18 @@ from metagpt.llm import LLM
from examples.ags.benchmark.humaneval import sample_generate, samples_generate, extract_failure_tests, automatic_evalplus
from examples.ags.w_action_node.utils import jsonl_ranker
asyncio.run(sample_generate('HumanEval/132',result_path="1.jsonl"))
# asyncio.run(sample_generate('HumanEval/1'))
# asyncio.run(samples_generate(mode='ags',result_path="2.jsonl"))
# 132 141 136 80 73
# asyncio.run(sample_generate('HumanEval/118',result_path="llm_based_4.jsonl",mode="llm"))
# asyncio.run(samples_generate(mode='ags',result_path="ags_based_1.jsonl"))
# jsonl_ranker("samples.jsonl", "samples.jsonl")
# if automatic_evalplus():
# unpassed_exapmle = extract_failure_tests()
# print(unpassed_exapmle)
result_path = "ags_based_2.jsonl"
if automatic_evalplus(result_path):
unpassed_exapmle = extract_failure_tests(result_path[:-6]+"_eval_results.json")
print(unpassed_exapmle)
# unpassed_exapmle = extract_failure_tests(file_path="2_eval_results.json")
# print(unpassed_exapmle)
# failure_list = ['HumanEval/0', 'HumanEval/1', 'HumanEval/7', 'HumanEval/16', 'HumanEval/24', 'HumanEval/31', 'HumanEval/40', 'HumanEval/56', 'HumanEval/67', 'HumanEval/74', 'HumanEval/83', 'HumanEval/86', 'HumanEval/87', 'HumanEval/90', 'HumanEval/95', 'HumanEval/101', 'HumanEval/104', 'HumanEval/113', 'HumanEval/125', 'HumanEval/132', 'HumanEval/135', 'HumanEval/140', 'HumanEval/143', 'HumanEval/145', 'HumanEval/154', 'HumanEval/161']
# for example in failure_list:
# asyncio.run(sample_generate(example))

View file

@ -7,25 +7,27 @@
"outputs": [],
"source": [
"# TODO 帮助我写一个代码找出这种结构中都出现的id与并不是都出现的id以及第一三批单独出现的id\n",
"test_1 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/4'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/11'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/25'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/97'}, {'task_id': 'HumanEval/98'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n",
"test_2 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/4'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/25'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/70'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/85'}, {'task_id': 'HumanEval/88'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/98'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/149'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n",
"test_3 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/4'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/25'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/70'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/101'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]"
"test_1 = [{'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/11'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/46'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/63'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/95'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n",
"test_2 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/153'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"test_3 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/83'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/97'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/128'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/149'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n",
"test_4 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/36'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/43'}, {'task_id': 'HumanEval/46'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/73'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/97'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/128'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/148'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/161'}, {'task_id': 'HumanEval/163'}]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Common IDs: length:41 {'HumanEval/102', 'HumanEval/5', 'HumanEval/77', 'HumanEval/134', 'HumanEval/75', 'HumanEval/28', 'HumanEval/110', 'HumanEval/108', 'HumanEval/126', 'HumanEval/145', 'HumanEval/26', 'HumanEval/21', 'HumanEval/135', 'HumanEval/163', 'HumanEval/6', 'HumanEval/132', 'HumanEval/1', 'HumanEval/125', 'HumanEval/129', 'HumanEval/159', 'HumanEval/32', 'HumanEval/111', 'HumanEval/142', 'HumanEval/140', 'HumanEval/12', 'HumanEval/100', 'HumanEval/120', 'HumanEval/160', 'HumanEval/84', 'HumanEval/119', 'HumanEval/124', 'HumanEval/20', 'HumanEval/137', 'HumanEval/127', 'HumanEval/7', 'HumanEval/14', 'HumanEval/0', 'HumanEval/116', 'HumanEval/113', 'HumanEval/130', 'HumanEval/91'}\n",
"Not Common IDs: {'HumanEval/37', 'HumanEval/46', 'HumanEval/88', 'HumanEval/8', 'HumanEval/29', 'HumanEval/123', 'HumanEval/118', 'HumanEval/41', 'HumanEval/122', 'HumanEval/49', 'HumanEval/64', 'HumanEval/131', 'HumanEval/114', 'HumanEval/22', 'HumanEval/73', 'HumanEval/76', 'HumanEval/94', 'HumanEval/71', 'HumanEval/39', 'HumanEval/148', 'HumanEval/109', 'HumanEval/121', 'HumanEval/133', 'HumanEval/155', 'HumanEval/68', 'HumanEval/65', 'HumanEval/99', 'HumanEval/80', 'HumanEval/144', 'HumanEval/93', 'HumanEval/98', 'HumanEval/16', 'HumanEval/33', 'HumanEval/156', 'HumanEval/10', 'HumanEval/136', 'HumanEval/153', 'HumanEval/3', 'HumanEval/90', 'HumanEval/154', 'HumanEval/139', 'HumanEval/17', 'HumanEval/87', 'HumanEval/19', 'HumanEval/138', 'HumanEval/89', 'HumanEval/9', 'HumanEval/69', 'HumanEval/25', 'HumanEval/54', 'HumanEval/63'}\n",
"Unique to test_1: {'HumanEval/46', 'HumanEval/123', 'HumanEval/71', 'HumanEval/3', 'HumanEval/54', 'HumanEval/109'}\n",
"Unique to test_2: {'HumanEval/121', 'HumanEval/88', 'HumanEval/133', 'HumanEval/139', 'HumanEval/8', 'HumanEval/65', 'HumanEval/114', 'HumanEval/144', 'HumanEval/73', 'HumanEval/69', 'HumanEval/16', 'HumanEval/90'}\n",
"Unique to test_3: {'HumanEval/155', 'HumanEval/37', 'HumanEval/93', 'HumanEval/98', 'HumanEval/153', 'HumanEval/25', 'HumanEval/63', 'HumanEval/19', 'HumanEval/33', 'HumanEval/89', 'HumanEval/148', 'HumanEval/39', 'HumanEval/136', 'HumanEval/49'}\n"
"Common IDs: length:25 {'HumanEval/32', 'HumanEval/130', 'HumanEval/127', 'HumanEval/100', 'HumanEval/132', 'HumanEval/115', 'HumanEval/129', 'HumanEval/75', 'HumanEval/76', 'HumanEval/108', 'HumanEval/140', 'HumanEval/119', 'HumanEval/41', 'HumanEval/155', 'HumanEval/145', 'HumanEval/93', 'HumanEval/120', 'HumanEval/111', 'HumanEval/126', 'HumanEval/163', 'HumanEval/134', 'HumanEval/142', 'HumanEval/125', 'HumanEval/137', 'HumanEval/133'}\n",
"Not Common IDs: {'HumanEval/122', 'HumanEval/6', 'HumanEval/74', 'HumanEval/81', 'HumanEval/83', 'HumanEval/90', 'HumanEval/160', 'HumanEval/36', 'HumanEval/73', 'HumanEval/94', 'HumanEval/95', 'HumanEval/14', 'HumanEval/148', 'HumanEval/29', 'HumanEval/39', 'HumanEval/64', 'HumanEval/46', 'HumanEval/102', 'HumanEval/26', 'HumanEval/153', 'HumanEval/10', 'HumanEval/161', 'HumanEval/139', 'HumanEval/11', 'HumanEval/159', 'HumanEval/54', 'HumanEval/110', 'HumanEval/131', 'HumanEval/149', 'HumanEval/1', 'HumanEval/43', 'HumanEval/128', 'HumanEval/97', 'HumanEval/118', 'HumanEval/135', 'HumanEval/77', 'HumanEval/121', 'HumanEval/154', 'HumanEval/113', 'HumanEval/63', 'HumanEval/138', 'HumanEval/91', 'HumanEval/84'}\n",
"Unique to test_1: {'HumanEval/11', 'HumanEval/154', 'HumanEval/63', 'HumanEval/90', 'HumanEval/94', 'HumanEval/95'}\n",
"Unique to test_2: {'HumanEval/139', 'HumanEval/102', 'HumanEval/160', 'HumanEval/153', 'HumanEval/138'}\n",
"Unique to test_3: {'HumanEval/149', 'HumanEval/29', 'HumanEval/83', 'HumanEval/26'}\n",
"Unique to test_4: {'HumanEval/14', 'HumanEval/148', 'HumanEval/73', 'HumanEval/36', 'HumanEval/161', 'HumanEval/43'}\n"
]
}
],
@ -33,40 +35,44 @@
"def extract_ids(test_list):\n",
" return set(item['task_id'] for item in test_list)\n",
"\n",
"def compare_ids(test_1, test_2, test_3):\n",
"def compare_ids(test_1, test_2, test_3, test_4):\n",
" ids_1 = extract_ids(test_1)\n",
" ids_2 = extract_ids(test_2)\n",
" ids_3 = extract_ids(test_3)\n",
" ids_4 = extract_ids(test_4)\n",
"\n",
" common_ids = ids_1 & ids_2 & ids_3\n",
" all_ids = ids_1 | ids_2 | ids_3\n",
" common_ids = ids_1 & ids_2 & ids_3 & ids_4\n",
" all_ids = ids_1 | ids_2 | ids_3 | ids_4\n",
" not_common_ids = all_ids - common_ids\n",
"\n",
" unique_1 = ids_1 - (ids_2 | ids_3)\n",
" unique_2 = ids_2 - (ids_1 | ids_3)\n",
" unique_3 = ids_3 - (ids_1 | ids_2)\n",
" unique_1 = ids_1 - (ids_2 | ids_3 | ids_4)\n",
" unique_2 = ids_2 - (ids_1 | ids_3 | ids_4)\n",
" unique_3 = ids_3 - (ids_1 | ids_2 | ids_4)\n",
" unique_4 = ids_4 - (ids_1 | ids_2 | ids_3)\n",
"\n",
" return {\n",
" 'common_ids': common_ids,\n",
" 'not_common_ids': not_common_ids,\n",
" 'unique_1': unique_1,\n",
" 'unique_2': unique_2,\n",
" 'unique_3': unique_3\n",
" 'unique_3': unique_3,\n",
" 'unique_4': unique_4\n",
" }\n",
"\n",
"# Assuming test_1, test_2, and test_3 are defined as in your example\n",
"result = compare_ids(test_1, test_2, test_3)\n",
"result = compare_ids(test_1, test_2, test_3, test_4)\n",
"\n",
"print(\"Common IDs:\",f\"length:{len(result['common_ids'])}\", result['common_ids'])\n",
"print(\"Not Common IDs:\",result['not_common_ids'])\n",
"print(\"Unique to test_1:\", result['unique_1'])\n",
"print(\"Unique to test_2:\", result['unique_2'])\n",
"print(\"Unique to test_3:\", result['unique_3'])"
"print(\"Unique to test_3:\", result['unique_3'])\n",
"print(\"Unique to test_4:\", result['unique_4'])"
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@ -77,7 +83,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -85,6 +91,132 @@
"test_2 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/16'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/69'}, {'task_id': 'HumanEval/73'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/88'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/114'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/144'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"test_3 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/19'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/25'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/37'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/49'}, {'task_id': 'HumanEval/63'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/68'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/80'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/87'}, {'task_id': 'HumanEval/89'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/98'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/136'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/148'}, {'task_id': 'HumanEval/153'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/156'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"from typing import List\n",
"\n",
"\n",
"def has_close_elements(numbers: List[float], threshold: float) -> bool:\n",
" \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n",
" given threshold.\n",
" >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n",
" False\n",
" >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n",
" True\n",
" \"\"\"\n",
" for i in range(len(numbers)):\n",
" for j in range(i + 1, len(numbers)):\n",
" if abs(numbers[i] - numbers[j]) < threshold:\n",
" return True\n",
" return False\n",
"def has_close_elements(numbers: List[float], threshold: float) -> bool:\n",
" \n",
" # Sort the numbers in ascending order\n",
" numbers.sort()\n",
" \n",
" # Iterate through the numbers and check the difference between adjacent numbers\n",
" for i in range(len(numbers) - 1):\n",
" if abs(numbers[i] - numbers[i+1]) < threshold:\n",
" return True\n",
" \n",
" # If no adjacent numbers are closer than the threshold, return False\n",
" return False\n",
"\n",
"\n",
" sorted_numbers = sorted(numbers)\n",
" for i in range(len(sorted_numbers) - 1):\n",
" if sorted_numbers[i + 1] - sorted_numbers[i] < threshold:\n",
" return True\n",
" return False\n",
"\n",
"\n",
"\n",
"\n",
"METADATA = {\n",
" 'author': 'jt',\n",
" 'dataset': 'test'\n",
"}\n",
"\n",
"\n",
"def check(candidate):\n",
" assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n",
" assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n",
" assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n",
" assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n",
" assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n",
" assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n",
" assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n",
"\n",
"\n"
]
},
{
"data": {
"text/plain": [
"'\\n assert isinstance(threshold, float) and threshold > 0, \"invalid inputs\" # $_CONTRACT_$\\n assert isinstance(numbers, list), \"invalid inputs\" # $_CONTRACT_$\\n assert all([isinstance(v, (int, float)) for v in numbers]), \"invalid inputs\" # $_CONTRACT_$\\n'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from evalplus.data import get_human_eval_plus, write_jsonl\n",
"\n",
"humaneval = get_human_eval_plus()\n",
"d_result = {\"task_id\": \"HumanEval/0\", \"solution\": \"from typing import List\\n\\n\\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\\n \\\"\\\"\\\" Check if in given list of numbers, are any two numbers closer to each other than\\n given threshold.\\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\\n False\\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\\n True\\n \\\"\\\"\\\"\\n for i in range(len(numbers)):\\n for j in range(i + 1, len(numbers)):\\n if abs(numbers[i] - numbers[j]) < threshold:\\n return True\\n return False\"}\n",
"result = {\"task_id\": \"HumanEval/0\", \"solution\": \"def has_close_elements(numbers: List[float], threshold: float) -> bool:\\n \\n # Sort the numbers in ascending order\\n numbers.sort()\\n \\n # Iterate through the numbers and check the difference between adjacent numbers\\n for i in range(len(numbers) - 1):\\n if abs(numbers[i] - numbers[i+1]) < threshold:\\n return True\\n \\n # If no adjacent numbers are closer than the threshold, return False\\n return False\"}\n",
"print(d_result[\"solution\"])\n",
"print(result[\"solution\"])\n",
"d_result = \n",
"result = \n",
"print(d_result[\"solution\"])\n",
"print(result[\"solution\"])\n",
"print(humaneval['HumanEval/0']['canonical_solution'])\n",
"print(humaneval['HumanEval/0']['test'])\n",
"humaneval['HumanEval/0']['contract']\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from typing import List\n",
"def has_close_elements(numbers: List[float], threshold: float) -> bool:\n",
" \n",
" # Sort the numbers in ascending order\n",
" numbers.sort()\n",
" \n",
" # Iterate through the numbers and check the difference between adjacent numbers\n",
" for i in range(len(numbers) - 1):\n",
" if abs(numbers[i] - numbers[i+1]) < threshold:\n",
" return True\n",
" \n",
" # If no adjacent numbers are closer than the threshold, return False\n",
" return False\n",
"\n",
"def check(candidate):\n",
" assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n",
" assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n",
" assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n",
" assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n",
" assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n",
" assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n",
" assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n",
"\n",
"check(has_close_elements)"
]
}
],
"metadata": {