Update .gitignore

This commit is contained in:
didi 2024-07-27 01:57:06 +08:00
parent 772d2aea56
commit eac4b6c3e6
30 changed files with 939 additions and 9 deletions

1
.gitignore vendored
View file

@ -188,3 +188,4 @@ cov.xml
*-structure.json
*.dot
.python-version
*.jsonl

1
EVALPLUS-4omini/1.json Normal file

File diff suppressed because one or more lines are too long

1
EVALPLUS-4omini/2.json Normal file

File diff suppressed because one or more lines are too long

1
EVALPLUS-4omini/3.json Normal file

File diff suppressed because one or more lines are too long

1
EVALPLUS-4omini/4.json Normal file

File diff suppressed because one or more lines are too long

1
EVALPLUS-4omini/5.json Normal file

File diff suppressed because one or more lines are too long

View file

@ -28,8 +28,9 @@ async def sample_generate(id, result_path:str="samples.jsonl",mode:str="ags"):
solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=5)
sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
elif mode == "llm":
solution_result = await generate_code_block(case['prompt'])
solution_result = await generate_code_block(case['prompt'],case['entry_point'])
sample_dict = dict(task_id=case['task_id'], solution=solution_result['code_solution'])
print(sample_dict)
with open(result_path, mode='a') as f:
f.write(json.dumps(sample_dict) + '\n')
jsonl_ranker(result_path, result_path)

View file

@ -62,6 +62,7 @@ class HumanEvalGraph(Graph):
except Exception as e:
print(e)
solution = await self.mdensemble("code", solution_list, problem)
print("here",solution)
solution = await self.tester(problem, rephrase_problem, solution, test_cases)
return solution

View file

@ -332,6 +332,8 @@ class Test(Operator):
except AssertionError as e:
fail_case.append(self.test_cases_2_assert(test_case))
except Exception as e:
with open("tester.txt", "a") as f:
f.write(test_case[0] + "\n")
print(e)
return {"error":e}
if fail_case != []:

View file

@ -55,9 +55,22 @@ You are given a code contest problem, and a self-reflection on the problem:
The above is an incomplete Python code fragment and reflection on it. Return the complete and correct code with no additional text.
"""
GENERATE_CODEBLOCK_PROMPT = """
Please provide a self-contained Python script that solves the following problem in a markdown code block:
# GENERATE_CODEBLOCK_PROMPT = """
# Please provide a self-contained Python script that solves the following problem in a markdown code block:
# {problem_description}
# """
GENERATE_CODEBLOCK_PROMPT ="""
Please provide a self-contained Python script that solves the following problem in a markdown code block:
{problem_description}
When creating your solution:
1. Consider all edge cases and boundary conditions.
2. Consider the order of operations in your solution and how each step affects subsequent steps.
3. Avoid oversimplification - address all aspects of the problem.
4. Ensure your logic covers all stated requirements.
5. Avoid adding additional test cases beyond those provided in the problem description.
"""
REVIEW_PROMPT = """

View file

@ -159,6 +159,7 @@ async def llm_extract_test_case(id, problem_description: str, file_path:str="pub
import json
def test_cases_2_test_functions(solution: str, test_case: List):
print("here",solution)
function_name = test_case[0]
def format_param(param):
@ -181,6 +182,7 @@ def test_cases_2_test_functions(solution: str, test_case: List):
print(type(test_case[2]), test_case[2])
expected_output = format_param(test_case[2])
print(expected_output)
tester_function = f"""
{solution}

View file

@ -6,9 +6,14 @@ from examples.ags.benchmark.humaneval import sample_generate, samples_generate,
from examples.ags.w_action_node.utils import jsonl_ranker, llm_extract_test_case
from examples.ags.w_action_node.graph import HumanEvalGraph
# 132 141 136 80 73
# asyncio.run(sample_generate('HumanEval/118',result_path="llm_based_8.jsonl",mode="llm"))
asyncio.run(samples_generate(mode='llm',result_path="llm_based_100.jsonl"))
# jsonl_ranker("samples.jsonl", "samples.jsonl")
# asyncio.run(sample_generate('HumanEval/140',result_path="llm_based_1000.jsonl",mode="llm"))
# asyncio.run(sample_generate('HumanEval/140',result_path="llm_based_1000.jsonl",mode="llm"))
# asyncio.run(sample_generate('HumanEval/140',result_path="llm_based_1000.jsonl",mode="llm"))
# asyncio.run(sample_generate('HumanEval/67',result_path="llm_based_1000.jsonl",mode="llm"))
# asyncio.run(sample_generate('HumanEval/108',result_path="llm_based_1000.jsonl",mode="llm"))
# asyncio.run(sample_generate('HumanEval/110',result_path="llm_based_1000.jsonl",mode="llm"))
# asyncio.run(samples_generate(mode='alpha',result_path="alpha_based_100.jsonl"))
# jsonl_ranker("llm_based_137.jsonl", "llm_based_137.jsonl")
# result_path = "ags_based_6.jsonl"
# if automatic_evalplus(result_path):
@ -41,6 +46,7 @@ asyncio.run(samples_generate(mode='llm',result_path="llm_based_100.jsonl"))
# [72, 80, 82, 87, 90, 95, 107, 109, 112, 124, 126, 127, 128, 132, 134, 136, 137, 138, 148, 154, 155]
# case_prompt= get_human_eval_plus()["HumanEval/136"]['prompt']
# solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1)
# result = asyncio.run(solver.alpha_codium(problem_id="HumanEval/136", problem=case_prompt, ensemble_count=1))
# TODO: a code change somewhere caused the Solution to go missing
# Ad-hoc single-problem run: fetch one problem prompt from
# get_human_eval_plus(), build a HumanEvalGraph solver, and run
# solver.alpha_codium to completion with asyncio.run.
# NOTE(review): the prompt is loaded for "HumanEval/76" but problem_id below
# is "HumanEval/136" — confirm whether this mismatch is intentional.
case_prompt= get_human_eval_plus()["HumanEval/76"]['prompt']
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1)
# ensemble_count=1 is passed straight through to alpha_codium; the result is kept in `result`.
result = asyncio.run(solver.alpha_codium(problem_id="HumanEval/136", problem=case_prompt, ensemble_count=1))

715
humaneval_analysis.ipynb Normal file
View file

@ -0,0 +1,715 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-07-25 16:45:31.369 | INFO | metagpt.const:get_metagpt_package_root:29 - Package root set to /Users/trl/Github_project/MetaGPT-MathAI\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"22\n",
"[{'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/40'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"20\n",
"[{'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/40'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n",
"24\n",
"[{'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/40'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/89'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n",
"23\n",
"[{'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/40'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/109'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"22\n",
"[{'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/40'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n"
]
}
],
"source": [
"from examples.ags.benchmark.humaneval import extract_failure_tests\n",
"\n",
"file_path_list = [\"llm_based_120_eval_results.json\", \"llm_based_121_eval_results.json\", \"llm_based_122_eval_results.json\", \"llm_based_123_eval_results.json\", \"llm_based_124_eval_results.json\"]\n",
"\n",
"for file_path in file_path_list:\n",
" unpassed_exapmle = extract_failure_tests(file_path)\n",
" print(unpassed_exapmle)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"17\n",
"[{'task_id': 'HumanEval/163'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/129'}]\n",
"20\n",
"[{'task_id': 'HumanEval/163'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/129'}]\n",
"17\n",
"[{'task_id': 'HumanEval/163'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/83'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/129'}]\n",
"18\n",
"[{'task_id': 'HumanEval/163'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/83'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/129'}]\n",
"18\n",
"[{'task_id': 'HumanEval/163'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/129'}]\n"
]
}
],
"source": [
"\n",
"\n",
"from examples.ags.benchmark.humaneval import extract_failure_tests\n",
"\n",
"file_path_list = [\"EVALPLUS-4omini/1.json\", \"EVALPLUS-4omini/2.json\", \"EVALPLUS-4omini/3.json\", \"EVALPLUS-4omini/4.json\", \"EVALPLUS-4omini/5.json\"]\n",
"\n",
"for file_path in file_path_list:\n",
" unpassed_exapmle = extract_failure_tests(file_path)\n",
" print(unpassed_exapmle)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20\n",
"[{'task_id': 'HumanEval/16'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"23\n",
"[{'task_id': 'HumanEval/16'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"20\n",
"[{'task_id': 'HumanEval/16'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"22\n",
"[{'task_id': 'HumanEval/16'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"23\n",
"[{'task_id': 'HumanEval/16'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/67'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n"
]
}
],
"source": [
"from examples.ags.benchmark.humaneval import extract_failure_tests\n",
"\n",
"file_path_list = [\"llm_based_125_eval_results.json\", \"llm_based_126_eval_results.json\", \"llm_based_127_eval_results.json\", \"llm_based_128_eval_results.json\", \"llm_based_129_eval_results.json\"]\n",
"\n",
"for file_path in file_path_list:\n",
" unpassed_exapmle = extract_failure_tests(file_path)\n",
" print(unpassed_exapmle)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"21\n",
"[{'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/83'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"22\n",
"[{'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"21\n",
"[{'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"20\n",
"[{'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n",
"22\n",
"[{'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/83'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n"
]
}
],
"source": [
"from examples.ags.benchmark.humaneval import extract_failure_tests\n",
"\n",
"file_path_list = [\"llm_based_135_eval_results.json\", \"llm_based_136_eval_results.json\", \"llm_based_137_eval_results.json\", \"llm_based_138_eval_results.json\", \"llm_based_139_eval_results.json\"]\n",
"\n",
"for file_path in file_path_list:\n",
" unpassed_exapmle = extract_failure_tests(file_path)\n",
" print(unpassed_exapmle)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"def fruit_distribution(s,n):\n",
" \"\"\"\n",
" In this task, you will be given a string that represents a number of apples and oranges \n",
" that are distributed in a basket of fruit this basket contains \n",
" apples, oranges, and mango fruits. Given the string that represents the total number of \n",
" the oranges and apples and an integer that represent the total number of the fruits \n",
" in the basket return the number of the mango fruits in the basket.\n",
" for examble:\n",
" fruit_distribution(\"5 apples and 6 oranges\", 19) ->19 - 5 - 6 = 8\n",
" fruit_distribution(\"0 apples and 1 oranges\",3) -> 3 - 0 - 1 = 2\n",
" fruit_distribution(\"2 apples and 3 oranges\", 100) -> 100 - 2 - 3 = 95\n",
" fruit_distribution(\"100 apples and 1 oranges\",120) -> 120 - 100 - 1 = 19\n",
" \"\"\"\n",
"\n",
"def fruit_distribution(s, n):\n",
" \"\"\"\n",
" In this task, you will be given a string that represents a number of apples and oranges \n",
" that are distributed in a basket of fruit. This basket contains \n",
" apples, oranges, and mango fruits. Given the string that represents the total number of \n",
" the oranges and apples and an integer that represents the total number of the fruits \n",
" in the basket, return the number of the mango fruits in the basket.\n",
" \n",
" Parameters:\n",
" s (str): A string representing the number of apples and oranges.\n",
" n (int): An integer representing the total number of fruits in the basket.\n",
" \n",
" Returns:\n",
" int: The number of mango fruits in the basket.\n",
" \n",
" Examples:\n",
" fruit_distribution(\"5 apples and 6 oranges\", 19) -> 8\n",
" fruit_distribution(\"0 apples and 1 oranges\", 3) -> 2\n",
" fruit_distribution(\"2 apples and 3 oranges\", 100) -> 95\n",
" fruit_distribution(\"100 apples and 1 oranges\", 120) -> 19\n",
" \"\"\"\n",
" \n",
" # Extract the number of apples and oranges from the string\n",
" parts = s.split()\n",
" apples = int(parts[0]) # First part is the number of apples\n",
" oranges = int(parts[2]) # Third part is the number of oranges\n",
" \n",
" # Calculate the number of mangoes\n",
" mangoes = n - apples - oranges\n",
" \n",
" return mangoes\n",
"--------------------------\n",
"def fruit_distribution(s, n):\n",
" \"\"\"\n",
" In this task, you will be given a string that represents a number of apples and oranges \n",
" that are distributed in a basket of fruit. This basket contains \n",
" apples, oranges, and mango fruits. Given the string that represents the total number of \n",
" the oranges and apples and an integer that represents the total number of the fruits \n",
" in the basket, return the number of the mango fruits in the basket.\n",
" \n",
" Parameters:\n",
" s (str): A string representing the number of apples and oranges.\n",
" n (int): An integer representing the total number of fruits in the basket.\n",
" \n",
" Returns:\n",
" int: The number of mango fruits in the basket.\n",
" \n",
" Examples:\n",
" fruit_distribution(\"5 apples and 6 oranges\", 19) -> 8\n",
" fruit_distribution(\"0 apples and 1 oranges\", 3) -> 2\n",
" fruit_distribution(\"2 apples and 3 oranges\", 100) -> 95\n",
" fruit_distribution(\"100 apples and 1 oranges\", 120) -> 19\n",
" \"\"\"\n",
" \n",
" # Extract the number of apples and oranges from the string\n",
" parts = s.split()\n",
" apples = int(parts[0]) # First part is the number of apples\n",
" oranges = int(parts[3]) # Fourth part is the number of oranges\n",
" \n",
" # Calculate the number of mangoes\n",
" mangoes = n - apples - oranges\n",
" \n",
" return mangoes\n",
"--------------------------\n",
"def fruit_distribution(s, n):\n",
" \"\"\"\n",
" Calculate the number of mango fruits in a basket given the number of apples and oranges.\n",
"\n",
" Parameters:\n",
" s (str): A string representing the number of apples and oranges in the format \"X apples and Y oranges\".\n",
" n (int): An integer representing the total number of fruits in the basket.\n",
"\n",
" Returns:\n",
" int: The number of mango fruits in the basket.\n",
" \"\"\"\n",
" # Split the input string to extract the number of apples and oranges\n",
" parts = s.split(\" and \")\n",
" apples = int(parts[0].split()[0]) # Get the number of apples\n",
" oranges = int(parts[1].split()[0]) # Get the number of oranges\n",
"\n",
" # Calculate the number of mangoes\n",
" mangoes = n - apples - oranges\n",
"\n",
" return mangoes\n"
]
}
],
"source": [
"from evalplus.data import get_human_eval_plus\n",
"\n",
"humaneval = get_human_eval_plus()\n",
"print(humaneval['HumanEval/67'][\"prompt\"])\n",
"\n",
"result = {\"solution\": \"def fruit_distribution(s, n):\\n \\\"\\\"\\\"\\n In this task, you will be given a string that represents a number of apples and oranges \\n that are distributed in a basket of fruit. This basket contains \\n apples, oranges, and mango fruits. Given the string that represents the total number of \\n the oranges and apples and an integer that represents the total number of the fruits \\n in the basket, return the number of the mango fruits in the basket.\\n \\n Parameters:\\n s (str): A string representing the number of apples and oranges.\\n n (int): An integer representing the total number of fruits in the basket.\\n \\n Returns:\\n int: The number of mango fruits in the basket.\\n \\n Examples:\\n fruit_distribution(\\\"5 apples and 6 oranges\\\", 19) -> 8\\n fruit_distribution(\\\"0 apples and 1 oranges\\\", 3) -> 2\\n fruit_distribution(\\\"2 apples and 3 oranges\\\", 100) -> 95\\n fruit_distribution(\\\"100 apples and 1 oranges\\\", 120) -> 19\\n \\\"\\\"\\\"\\n \\n # Extract the number of apples and oranges from the string\\n parts = s.split()\\n apples = int(parts[0]) # First part is the number of apples\\n oranges = int(parts[2]) # Third part is the number of oranges\\n \\n # Calculate the number of mangoes\\n mangoes = n - apples - oranges\\n \\n return mangoes\"}\n",
"print(result[\"solution\"])\n",
"\n",
"print(\"--------------------------\")\n",
"\n",
"correct_result = {\"solution\":\"def fruit_distribution(s, n):\\n \\\"\\\"\\\"\\n In this task, you will be given a string that represents a number of apples and oranges \\n that are distributed in a basket of fruit. This basket contains \\n apples, oranges, and mango fruits. Given the string that represents the total number of \\n the oranges and apples and an integer that represents the total number of the fruits \\n in the basket, return the number of the mango fruits in the basket.\\n \\n Parameters:\\n s (str): A string representing the number of apples and oranges.\\n n (int): An integer representing the total number of fruits in the basket.\\n \\n Returns:\\n int: The number of mango fruits in the basket.\\n \\n Examples:\\n fruit_distribution(\\\"5 apples and 6 oranges\\\", 19) -> 8\\n fruit_distribution(\\\"0 apples and 1 oranges\\\", 3) -> 2\\n fruit_distribution(\\\"2 apples and 3 oranges\\\", 100) -> 95\\n fruit_distribution(\\\"100 apples and 1 oranges\\\", 120) -> 19\\n \\\"\\\"\\\"\\n \\n # Extract the number of apples and oranges from the string\\n parts = s.split()\\n apples = int(parts[0]) # First part is the number of apples\\n oranges = int(parts[3]) # Fourth part is the number of oranges\\n \\n # Calculate the number of mangoes\\n mangoes = n - apples - oranges\\n \\n return mangoes\"}\n",
"\n",
"print(correct_result[\"solution\"])\n",
"\n",
"print(\"--------------------------\")\n",
"\n",
"modify_result = {'task_id': 'HumanEval/67', 'solution': 'def fruit_distribution(s, n):\\n \"\"\"\\n Calculate the number of mango fruits in a basket given the number of apples and oranges.\\n\\n Parameters:\\n s (str): A string representing the number of apples and oranges in the format \"X apples and Y oranges\".\\n n (int): An integer representing the total number of fruits in the basket.\\n\\n Returns:\\n int: The number of mango fruits in the basket.\\n \"\"\"\\n # Split the input string to extract the number of apples and oranges\\n parts = s.split(\" and \")\\n apples = int(parts[0].split()[0]) # Get the number of apples\\n oranges = int(parts[1].split()[0]) # Get the number of oranges\\n\\n # Calculate the number of mangoes\\n mangoes = n - apples - oranges\\n\\n return mangoes'}\n",
"\n",
"print(modify_result[\"solution\"])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"def count_nums(arr):\n",
" \"\"\"\n",
" Write a function count_nums which takes an array of integers and returns\n",
" the number of elements which has a sum of digits > 0.\n",
" If a number is negative, then its first signed digit will be negative:\n",
" e.g. -123 has signed digits -1, 2, and 3.\n",
" >>> count_nums([]) == 0\n",
" >>> count_nums([-1, 11, -11]) == 1\n",
" >>> count_nums([1, 1, 2]) == 3\n",
" \"\"\"\n",
"\n",
"def count_nums(arr):\n",
" \"\"\"\n",
" Write a function count_nums which takes an array of integers and returns\n",
" the number of elements which has a sum of digits > 0.\n",
" If a number is negative, then its first signed digit will be negative:\n",
" e.g. -123 has signed digits -1, 2, and 3.\n",
" \n",
" >>> count_nums([]) == 0\n",
" >>> count_nums([-1, 11, -11]) == 1\n",
" >>> count_nums([1, 1, 2]) == 3\n",
" \"\"\"\n",
" def sum_of_digits(n):\n",
" # Convert the number to string and calculate the sum of its digits\n",
" return sum(int(digit) if digit != '-' else -int(digit[1]) for digit in str(n) if digit.isdigit() or digit == '-')\n",
"\n",
" count = 0\n",
" for number in arr:\n",
" if sum_of_digits(number) > 0:\n",
" count += 1\n",
" \n",
" return count\n",
"--------------------------\n",
"def count_nums(arr):\n",
" \"\"\"\n",
" Write a function count_nums which takes an array of integers and returns\n",
" the number of elements which has a sum of digits > 0.\n",
" If a number is negative, then its first signed digit will be negative:\n",
" e.g. -123 has signed digits -1, 2, and 3.\n",
" \n",
" >>> count_nums([]) == 0\n",
" >>> count_nums([-1, 11, -11]) == 1\n",
" >>> count_nums([1, 1, 2]) == 3\n",
" \"\"\"\n",
" def sum_of_digits(n):\n",
" # Convert the number to string and calculate the sum of its digits\n",
" # For negative numbers, we need to consider the first digit as negative\n",
" str_n = str(n)\n",
" if n < 0:\n",
" return -int(str_n[1]) + sum(int(digit) for digit in str_n[2:])\n",
" else:\n",
" return sum(int(digit) for digit in str_n)\n",
"\n",
" count = 0\n",
" for num in arr:\n",
" if sum_of_digits(num) > 0:\n",
" count += 1\n",
" \n",
" return count\n",
"--------------------------\n",
"def count_nums(arr):\n",
" \"\"\"\n",
" Write a function count_nums which takes an array of integers and returns\n",
" the number of elements which has a sum of digits > 0.\n",
" If a number is negative, then its first signed digit will be negative:\n",
" e.g. -123 has signed digits -1, 2, and 3.\n",
" \n",
" >>> count_nums([]) == 0\n",
" >>> count_nums([-1, 11, -11]) == 1\n",
" >>> count_nums([1, 1, 2]) == 3\n",
" \"\"\"\n",
" \n",
" def sum_of_digits(n):\n",
" \"\"\"Helper function to calculate the sum of digits of a number.\"\"\"\n",
" # Convert the number to string and iterate over each character\n",
" # If the number is negative, we need to consider the first digit as negative\n",
" str_n = str(n)\n",
" digit_sum = 0\n",
" \n",
" for i, char in enumerate(str_n):\n",
" if char.isdigit():\n",
" digit_sum += int(char)\n",
" elif i == 0 and char == '-':\n",
" digit_sum -= 1 # First signed digit is negative\n",
" \n",
" return digit_sum\n",
"\n",
" count = 0\n",
" for number in arr:\n",
" if sum_of_digits(number) > 0:\n",
" count += 1\n",
" \n",
" return count\n"
]
}
],
"source": [
"from evalplus.data import get_human_eval_plus\n",
"\n",
"humaneval = get_human_eval_plus()\n",
"print(humaneval['HumanEval/108'][\"prompt\"])\n",
"\n",
"result = {\"solution\": \"def count_nums(arr):\\n \\\"\\\"\\\"\\n Write a function count_nums which takes an array of integers and returns\\n the number of elements which has a sum of digits > 0.\\n If a number is negative, then its first signed digit will be negative:\\n e.g. -123 has signed digits -1, 2, and 3.\\n \\n >>> count_nums([]) == 0\\n >>> count_nums([-1, 11, -11]) == 1\\n >>> count_nums([1, 1, 2]) == 3\\n \\\"\\\"\\\"\\n def sum_of_digits(n):\\n # Convert the number to string and calculate the sum of its digits\\n return sum(int(digit) if digit != '-' else -int(digit[1]) for digit in str(n) if digit.isdigit() or digit == '-')\\n\\n count = 0\\n for number in arr:\\n if sum_of_digits(number) > 0:\\n count += 1\\n \\n return count\"}\n",
"print(result[\"solution\"])\n",
"\n",
"print(\"--------------------------\")\n",
"\n",
"correct_result = {\"solution\": \"def count_nums(arr):\\n \\\"\\\"\\\"\\n Write a function count_nums which takes an array of integers and returns\\n the number of elements which has a sum of digits > 0.\\n If a number is negative, then its first signed digit will be negative:\\n e.g. -123 has signed digits -1, 2, and 3.\\n \\n >>> count_nums([]) == 0\\n >>> count_nums([-1, 11, -11]) == 1\\n >>> count_nums([1, 1, 2]) == 3\\n \\\"\\\"\\\"\\n def sum_of_digits(n):\\n # Convert the number to string and calculate the sum of its digits\\n # For negative numbers, we need to consider the first digit as negative\\n str_n = str(n)\\n if n < 0:\\n return -int(str_n[1]) + sum(int(digit) for digit in str_n[2:])\\n else:\\n return sum(int(digit) for digit in str_n)\\n\\n count = 0\\n for num in arr:\\n if sum_of_digits(num) > 0:\\n count += 1\\n \\n return count\"}\n",
"print(correct_result[\"solution\"])\n",
"\n",
"print(\"--------------------------\")\n",
"\n",
"modify_result = {'task_id': 'HumanEval/108', 'solution': 'def count_nums(arr):\\n \"\"\"\\n Write a function count_nums which takes an array of integers and returns\\n the number of elements which has a sum of digits > 0.\\n If a number is negative, then its first signed digit will be negative:\\n e.g. -123 has signed digits -1, 2, and 3.\\n \\n >>> count_nums([]) == 0\\n >>> count_nums([-1, 11, -11]) == 1\\n >>> count_nums([1, 1, 2]) == 3\\n \"\"\"\\n \\n def sum_of_digits(n):\\n \"\"\"Helper function to calculate the sum of digits of a number.\"\"\"\\n # Convert the number to string and iterate over each character\\n # If the number is negative, we need to consider the first digit as negative\\n str_n = str(n)\\n digit_sum = 0\\n \\n for i, char in enumerate(str_n):\\n if char.isdigit():\\n digit_sum += int(char)\\n elif i == 0 and char == \\'-\\':\\n digit_sum -= 1 # First signed digit is negative\\n \\n return digit_sum\\n\\n count = 0\\n for number in arr:\\n if sum_of_digits(number) > 0:\\n count += 1\\n \\n return count'}\n",
"\n",
"print(modify_result[\"solution\"])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"def exchange(lst1, lst2):\n",
" \"\"\"In this problem, you will implement a function that takes two lists of numbers,\n",
" and determines whether it is possible to perform an exchange of elements\n",
" between them to make lst1 a list of only even numbers.\n",
" There is no limit on the number of exchanged elements between lst1 and lst2.\n",
" If it is possible to exchange elements between the lst1 and lst2 to make\n",
" all the elements of lst1 to be even, return \"YES\".\n",
" Otherwise, return \"NO\".\n",
" For example:\n",
" exchange([1, 2, 3, 4], [1, 2, 3, 4]) => \"YES\"\n",
" exchange([1, 2, 3, 4], [1, 5, 3, 4]) => \"NO\"\n",
" It is assumed that the input lists will be non-empty.\n",
" \"\"\"\n",
"\n",
"def exchange(lst1, lst2):\n",
" \"\"\"Determines whether it is possible to perform an exchange of elements\n",
" between two lists to make lst1 a list of only even numbers.\n",
" \n",
" Args:\n",
" lst1 (list): The first list of numbers.\n",
" lst2 (list): The second list of numbers.\n",
" \n",
" Returns:\n",
" str: \"YES\" if lst1 can be made all even, \"NO\" otherwise.\n",
" \"\"\"\n",
" # Check if lst1 already contains only even numbers\n",
" if all(x % 2 == 0 for x in lst1):\n",
" return \"YES\"\n",
" \n",
" # Count the number of odd numbers in lst1\n",
" odd_count_lst1 = sum(1 for x in lst1 if x % 2 != 0)\n",
" \n",
" # Check if lst2 has at least one even number to exchange\n",
" has_even_in_lst2 = any(x % 2 == 0 for x in lst2)\n",
" \n",
" # If lst1 has odd numbers and lst2 has at least one even number, we can exchange\n",
" if odd_count_lst1 > 0 and has_even_in_lst2:\n",
" return \"YES\"\n",
" \n",
" return \"NO\"\n",
"--------------------------\n",
"def exchange(lst1, lst2):\n",
" \"\"\"In this problem, you will implement a function that takes two lists of numbers,\n",
" and determines whether it is possible to perform an exchange of elements\n",
" between them to make lst1 a list of only even numbers.\n",
" There is no limit on the number of exchanged elements between lst1 and lst2.\n",
" If it is possible to exchange elements between the lst1 and lst2 to make\n",
" all the elements of lst1 to be even, return \"YES\".\n",
" Otherwise, return \"NO\".\n",
" For example:\n",
" exchange([1, 2, 3, 4], [1, 2, 3, 4]) => \"YES\"\n",
" exchange([1, 2, 3, 4], [1, 5, 3, 4]) => \"NO\"\n",
" It is assumed that the input lists will be non-empty.\n",
" \"\"\"\n",
" \n",
" # Count the number of odd numbers in lst1\n",
" odd_count_lst1 = sum(1 for x in lst1 if x % 2 != 0)\n",
" \n",
" # Count the number of even numbers in lst2\n",
" even_count_lst2 = sum(1 for x in lst2 if x % 2 == 0)\n",
" \n",
" # If there are no odd numbers in lst1, it's already valid\n",
" if odd_count_lst1 == 0:\n",
" return \"YES\"\n",
" \n",
" # If there are enough even numbers in lst2 to replace all odd numbers in lst1\n",
" if even_count_lst2 >= odd_count_lst1:\n",
" return \"YES\"\n",
" \n",
" return \"NO\"\n"
]
}
],
"source": [
"from evalplus.data import get_human_eval_plus\n",
"\n",
"humaneval = get_human_eval_plus()\n",
"print(humaneval['HumanEval/110'][\"prompt\"])\n",
"\n",
"result = {\"solution\": \"def exchange(lst1, lst2):\\n \\\"\\\"\\\"Determines whether it is possible to perform an exchange of elements\\n between two lists to make lst1 a list of only even numbers.\\n \\n Args:\\n lst1 (list): The first list of numbers.\\n lst2 (list): The second list of numbers.\\n \\n Returns:\\n str: \\\"YES\\\" if lst1 can be made all even, \\\"NO\\\" otherwise.\\n \\\"\\\"\\\"\\n # Check if lst1 already contains only even numbers\\n if all(x % 2 == 0 for x in lst1):\\n return \\\"YES\\\"\\n \\n # Count the number of odd numbers in lst1\\n odd_count_lst1 = sum(1 for x in lst1 if x % 2 != 0)\\n \\n # Check if lst2 has at least one even number to exchange\\n has_even_in_lst2 = any(x % 2 == 0 for x in lst2)\\n \\n # If lst1 has odd numbers and lst2 has at least one even number, we can exchange\\n if odd_count_lst1 > 0 and has_even_in_lst2:\\n return \\\"YES\\\"\\n \\n return \\\"NO\\\"\"}\n",
"print(result[\"solution\"])\n",
"\n",
"print(\"--------------------------\")\n",
"\n",
"correct_result = {\"solution\": \"def exchange(lst1, lst2):\\n \\\"\\\"\\\"In this problem, you will implement a function that takes two lists of numbers,\\n and determines whether it is possible to perform an exchange of elements\\n between them to make lst1 a list of only even numbers.\\n There is no limit on the number of exchanged elements between lst1 and lst2.\\n If it is possible to exchange elements between the lst1 and lst2 to make\\n all the elements of lst1 to be even, return \\\"YES\\\".\\n Otherwise, return \\\"NO\\\".\\n For example:\\n exchange([1, 2, 3, 4], [1, 2, 3, 4]) => \\\"YES\\\"\\n exchange([1, 2, 3, 4], [1, 5, 3, 4]) => \\\"NO\\\"\\n It is assumed that the input lists will be non-empty.\\n \\\"\\\"\\\"\\n \\n # Count the number of odd numbers in lst1\\n odd_count_lst1 = sum(1 for x in lst1 if x % 2 != 0)\\n \\n # Count the number of even numbers in lst2\\n even_count_lst2 = sum(1 for x in lst2 if x % 2 == 0)\\n \\n # If there are no odd numbers in lst1, it's already valid\\n if odd_count_lst1 == 0:\\n return \\\"YES\\\"\\n \\n # If there are enough even numbers in lst2 to replace all odd numbers in lst1\\n if even_count_lst2 >= odd_count_lst1:\\n return \\\"YES\\\"\\n \\n return \\\"NO\\\"\"}\n",
"print(correct_result[\"solution\"])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"def count_distinct_characters(string: str) -> int:\n",
" \"\"\" Given a string, find out how many distinct characters (regardless of case) does it consist of\n",
" >>> count_distinct_characters('xyzXYZ')\n",
" 3\n",
" >>> count_distinct_characters('Jerry')\n",
" 4\n",
" \"\"\"\n",
"\n",
"def count_distinct_characters(string: str) -> int:\n",
" \"\"\" Given a string, find out how many distinct characters (regardless of case) does it consist of\n",
" >>> count_distinct_characters('xyzXYZ')\n",
" 3\n",
" >>> count_distinct_characters('Jerry')\n",
" 4\n",
" \"\"\"\n",
" # Convert the string to lowercase to ensure case insensitivity\n",
" lower_string = string.lower()\n",
" # Use a set to find distinct characters\n",
" distinct_characters = set(lower_string)\n",
" # Return the number of distinct characters\n",
" return len(distinct_characters)\n",
"--------------------------\n",
"def count_distinct_characters(string: str) -> int:\n",
" \"\"\" Given a string, find out how many distinct characters (regardless of case) does it consist of\n",
" >>> count_distinct_characters('xyzXYZ')\n",
" 3\n",
" >>> count_distinct_characters('Jerry')\n",
" 4\n",
" \"\"\"\n",
" # Convert the string to lowercase to ensure case insensitivity\n",
" lower_string = string.lower()\n",
" # Use a set to find distinct characters\n",
" distinct_characters = set(lower_string)\n",
" # Return the number of distinct characters\n",
" return len(distinct_characters)\n",
"--------------------------\n",
"def count_distinct_characters(string: str) -> int:\n",
" \"\"\" \n",
" Given a string, find out how many distinct characters (regardless of case) does it consist of.\n",
" \n",
" >>> count_distinct_characters('xyzXYZ')\n",
" 3\n",
" >>> count_distinct_characters('Jerry')\n",
" 4\n",
" >>> count_distinct_characters('')\n",
" 0\n",
" >>> count_distinct_characters('aA')\n",
" 1\n",
" >>> count_distinct_characters('123abcABC!@#')\n",
" 9\n",
" >>> count_distinct_characters(' ')\n",
" 0\n",
" >>> count_distinct_characters('!@#$%^&*()')\n",
" 10\n",
" \"\"\"\n",
" # Convert the string to lowercase to ensure case insensitivity\n",
" lower_string = string.lower()\n",
" \n",
" # Use a set to store distinct characters\n",
" distinct_characters = set()\n",
" \n",
" # Iterate through each character in the string\n",
" for char in lower_string:\n",
" # Add only non-space characters to the set\n",
" if char != ' ':\n",
" distinct_characters.add(char)\n",
" \n",
" # Return the number of distinct characters\n",
" return len(distinct_characters)\n"
]
}
],
"source": [
"from evalplus.data import get_human_eval_plus\n",
"\n",
"humaneval = get_human_eval_plus()\n",
"print(humaneval['HumanEval/16'][\"prompt\"])\n",
"\n",
"result = {\"task_id\": \"HumanEval/16\", \"solution\": \"def count_distinct_characters(string: str) -> int:\\n \\\"\\\"\\\" Given a string, find out how many distinct characters (regardless of case) does it consist of\\n >>> count_distinct_characters('xyzXYZ')\\n 3\\n >>> count_distinct_characters('Jerry')\\n 4\\n \\\"\\\"\\\"\\n # Convert the string to lowercase to ensure case insensitivity\\n lower_string = string.lower()\\n # Use a set to find distinct characters\\n distinct_characters = set(lower_string)\\n # Return the number of distinct characters\\n return len(distinct_characters)\"}\n",
"print(result[\"solution\"])\n",
"\n",
"print(\"--------------------------\")\n",
"\n",
"correct_result = {\"task_id\": \"HumanEval/16\", \"solution\": \"def count_distinct_characters(string: str) -> int:\\n \\\"\\\"\\\" Given a string, find out how many distinct characters (regardless of case) does it consist of\\n >>> count_distinct_characters('xyzXYZ')\\n 3\\n >>> count_distinct_characters('Jerry')\\n 4\\n \\\"\\\"\\\"\\n # Convert the string to lowercase to ensure case insensitivity\\n lower_string = string.lower()\\n # Use a set to find distinct characters\\n distinct_characters = set(lower_string)\\n # Return the number of distinct characters\\n return len(distinct_characters)\"}\n",
"\n",
"print(correct_result[\"solution\"])\n",
"\n",
"print(\"--------------------------\")\n",
"\n",
"modify_result = {\"task_id\": \"HumanEval/16\", \"solution\": \"def count_distinct_characters(string: str) -> int:\\n \\\"\\\"\\\" \\n Given a string, find out how many distinct characters (regardless of case) does it consist of.\\n \\n >>> count_distinct_characters('xyzXYZ')\\n 3\\n >>> count_distinct_characters('Jerry')\\n 4\\n >>> count_distinct_characters('')\\n 0\\n >>> count_distinct_characters('aA')\\n 1\\n >>> count_distinct_characters('123abcABC!@#')\\n 9\\n >>> count_distinct_characters(' ')\\n 0\\n >>> count_distinct_characters('!@#$%^&*()')\\n 10\\n \\\"\\\"\\\"\\n # Convert the string to lowercase to ensure case insensitivity\\n lower_string = string.lower()\\n \\n # Use a set to store distinct characters\\n distinct_characters = set()\\n \\n # Iterate through each character in the string\\n for char in lower_string:\\n # Add only non-space characters to the set\\n if char != ' ':\\n distinct_characters.add(char)\\n \\n # Return the number of distinct characters\\n return len(distinct_characters)\"}\n",
"\n",
"print(modify_result[\"solution\"])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"def fix_spaces(text):\n",
" \"\"\"\n",
" Given a string text, replace all spaces in it with underscores, \n",
" and if a string has more than 2 consecutive spaces, \n",
" then replace all consecutive spaces with - \n",
" \n",
" fix_spaces(\"Example\") == \"Example\"\n",
" fix_spaces(\"Example 1\") == \"Example_1\"\n",
" fix_spaces(\" Example 2\") == \"_Example_2\"\n",
" fix_spaces(\" Example 3\") == \"_Example-3\"\n",
" \"\"\"\n",
"\n",
"def fix_spaces(text):\n",
" \"\"\"\n",
" Given a string text, replace all spaces in it with underscores, \n",
" and if a string has more than 2 consecutive spaces, \n",
" then replace all consecutive spaces with - \n",
" \n",
" fix_spaces(\"Example\") == \"Example\"\n",
" fix_spaces(\"Example 1\") == \"Example_1\"\n",
" fix_spaces(\" Example 2\") == \"_Example_2\"\n",
" fix_spaces(\" Example 3\") == \"_Example-3\"\n",
" \"\"\"\n",
" # Replace multiple spaces with a placeholder\n",
" # This will help us identify sequences of spaces\n",
" placeholder = '##'\n",
" text = text.replace(' ' * 3, placeholder) # Replace 3 spaces with a placeholder\n",
" \n",
" # Replace remaining spaces with underscores\n",
" text = text.replace(' ', '_')\n",
" \n",
" # Replace the placeholder back to '-'\n",
" text = text.replace(placeholder, '-')\n",
" \n",
" return text\n",
"--------------------------\n",
"def fix_spaces(text):\n",
" \"\"\"\n",
" Given a string text, replace all spaces in it with underscores, \n",
" and if a string has more than 2 consecutive spaces, \n",
" then replace all consecutive spaces with - \n",
" \n",
" fix_spaces(\"Example\") == \"Example\"\n",
" fix_spaces(\"Example 1\") == \"Example_1\"\n",
" fix_spaces(\" Example 2\") == \"_Example_2\"\n",
" fix_spaces(\" Example 3\") == \"_Example-3\"\n",
" \"\"\"\n",
" # Replace multiple spaces with a placeholder\n",
" # This will help us identify sequences of spaces\n",
" placeholder = ' ' * 3 # Three spaces\n",
" text = text.replace(placeholder, ' - ')\n",
" \n",
" # Replace all remaining spaces with underscores\n",
" text = text.replace(' ', '_')\n",
" \n",
" # Replace the placeholder with a single dash\n",
" text = text.replace(' - ', '-')\n",
" \n",
" return text\n"
]
}
],
"source": [
"\n",
"from evalplus.data import get_human_eval_plus\n",
"\n",
"humaneval = get_human_eval_plus()\n",
"print(humaneval['HumanEval/140'][\"prompt\"])\n",
"\n",
"correct_result = {\"task_id\": \"HumanEval/140\", \"solution\": \"def fix_spaces(text):\\n \\\"\\\"\\\"\\n Given a string text, replace all spaces in it with underscores, \\n and if a string has more than 2 consecutive spaces, \\n then replace all consecutive spaces with - \\n \\n fix_spaces(\\\"Example\\\") == \\\"Example\\\"\\n fix_spaces(\\\"Example 1\\\") == \\\"Example_1\\\"\\n fix_spaces(\\\" Example 2\\\") == \\\"_Example_2\\\"\\n fix_spaces(\\\" Example 3\\\") == \\\"_Example-3\\\"\\n \\\"\\\"\\\"\\n # Replace multiple spaces with a placeholder\\n # This will help us identify sequences of spaces\\n placeholder = '##'\\n text = text.replace(' ' * 3, placeholder) # Replace 3 spaces with a placeholder\\n \\n # Replace remaining spaces with underscores\\n text = text.replace(' ', '_')\\n \\n # Replace the placeholder back to '-'\\n text = text.replace(placeholder, '-')\\n \\n return text\"}\n",
"\n",
"print(correct_result[\"solution\"])\n",
"\n",
"print(\"--------------------------\")\n",
"\n",
"modify_result = {\"task_id\": \"HumanEval/140\", \"solution\": \"def fix_spaces(text):\\n \\\"\\\"\\\"\\n Given a string text, replace all spaces in it with underscores, \\n and if a string has more than 2 consecutive spaces, \\n then replace all consecutive spaces with - \\n \\n fix_spaces(\\\"Example\\\") == \\\"Example\\\"\\n fix_spaces(\\\"Example 1\\\") == \\\"Example_1\\\"\\n fix_spaces(\\\" Example 2\\\") == \\\"_Example_2\\\"\\n fix_spaces(\\\" Example 3\\\") == \\\"_Example-3\\\"\\n \\\"\\\"\\\"\\n # Replace multiple spaces with a placeholder\\n # This will help us identify sequences of spaces\\n placeholder = ' ' * 3 # Three spaces\\n text = text.replace(placeholder, ' - ')\\n \\n # Replace all remaining spaces with underscores\\n text = text.replace(' ', '_')\\n \\n # Replace the placeholder with a single dash\\n text = text.replace(' - ', '-')\\n \\n return text\"}\n",
"\n",
"print(modify_result[\"solution\"])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ags_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -512,6 +512,7 @@ class ActionNode:
import re
field_name = self.get_field_name()
prompt = context
print(f"prompt: \n{prompt}")
content = await self.llm.aask(prompt, timeout=timeout)
        # TODO: the entrypoint extraction only needs to be done in the preceding (upstream) logic
extracted_code = sanitize(code=content, entrypoint=function_name)

168
tester.txt Normal file
View file

@ -0,0 +1,168 @@
filter_integers
incr_list
string_sequence
greatest_common_divisor
generate_integers
anti_shuffle
derivative
monotonic
eat
solution
sort_numbers
make_palindrome
fib
order_by_points
numerical_letter_grade
is_simple_power
rounded_avg
is_nested
multiply
x_or_y
count_distinct_characters
prime_length
solve
below_zero
minSubArraySum
count_upper
find_closest_elements
count_up_to
below_threshold
triangle_area
choose_num
sum_to_n
common
unique_digits
intersection
search
factorize
add_elements
mean_absolute_deviation
get_closest_vowel
get_max_triples
tri
longest
even_odd_palindrome
get_row
maximum
move_one_ball
cycpattern_check
solve
check_if_last_char_is_a_letter
get_odd_collatz
circular_shift
exchange
flip_case
get_positive
parse_music
is_prime
add
unique
is_equal_to_sum_even
fibfib
is_multiply_prime
truncate_number
fix_spaces
vowels_count
add
words_string
correct_bracketing
any_int
filter_by_substring
change_base
f
special_factorial
string_xor
hex_key
Strongest_Extension
strange_sort_list
prime_fib
encrypt
simplify
encode_shift
has_close_elements
decimal_to_binary
count_nums
rescale_to_unit
median
triangle_area
fizz_buzz
how_many_times
remove_vowels
find_max
right_angle_triangle
int_to_mini_roman
sum_squares
match_parens
smallest_change
largest_divisor
sort_array
triples_sum_to_zero
is_bored
sorted_list_sum
can_arrange
encode_cyclic
by_length
largest_smallest_integers
bf
modp
car_race_collision
sort_third
histogram
compare_one
words_in_sentence
pluck
is_sorted
filter_by_prefix
same_chars
parse_nested_parens
remove_duplicates
sum_squares
encode
sort_even
make_a_pile
digitSum
prod_signs
largest_prime_factor
sum_product
double_the_difference
split_words
odd_count
minPath
total_match
skjkasdkd
is_palindrome
find_zero
check_dict_case
string_to_md5
next_smallest
is_happy
all_prefixes
separate_paren_groups
iscube
select_words
closest_integer
fruit_distribution
do_algebra
max_fill
sort_array
digits
even_odd_count
correct_bracketing
largest_smallest_integers
reverse_delete
strlen
pairs_sum_to_zero
intersperse
rolling_max
concatenate
valid_date
compare
starts_one_ends
will_it_fly
max_element
specialFilter
file_name_check
fib4
largest_smallest_integers
largest_smallest_integers
largest_smallest_integers