diff --git a/.gitignore b/.gitignore index 6443b07bd..87f3f5de6 100644 --- a/.gitignore +++ b/.gitignore @@ -189,3 +189,6 @@ cov.xml *.dot .python-version *.csv +/examples/ags/data/baselines/general +/examples/ags/scripts/optimized/HumanEval/graphs +/examples/ags/scripts/optimized/HumanEval/graphs_test diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py index f860b133e..8d250dc3e 100644 --- a/examples/ags/benchmark/humaneval.py +++ b/examples/ags/benchmark/humaneval.py @@ -7,11 +7,13 @@ import threading from datetime import datetime from typing import List, Tuple, Callable, Dict, Any, Optional +import re import pandas as pd from tqdm.asyncio import tqdm_asyncio from examples.ags.benchmark.utils import generate_random_indices from examples.ags.benchmark.utils import log_mismatch +from metagpt.actions.code_sanitize import sanitize async def load_data(file_path: str, samples=1, test=False) -> List[dict]: @@ -38,58 +40,6 @@ async def load_file_data(file_path: str, specific_indices: List[int] = None) -> return data -# async def check_solution(solution, test, entry_point): - -# print(f"solution: {solution}") - -# try: -# # 定义一个包含所有必要模块的全局字典 -# global_dict = { -# 'math': __import__('math'), -# 'hashlib': __import__('hashlib'), -# 're': __import__('re'), -# 'List': List, -# 'Dict': Dict, -# 'Tuple': Tuple, -# 'Optional': Optional, -# 'Any': Any -# } -# if entry_point == "decode_cyclic": -# solution = "\n\ndef encode_cyclic(s: str):\n \"\"\"\n returns encoded string by cycling groups of three characters.\n \"\"\"\n # split string to groups. Each of length 3.\n groups = [s[(3 * i):min((3 * i + 3), len(s))] for i in range((len(s) + 2) // 3)]\n # cycle elements in each group. Unless group has fewer elements than 3.\n groups = [(group[1:] + group[0]) if len(group) == 3 else group for group in groups]\n return \"\".join(groups)" + "\n\n" + solution -# elif entry_point == "decode_shift": -# solution = "\n\ndef encode_shift(s: str):\n \"\"\"\n returns encoded string by shifting every character by 5 in the alphabet.\n \"\"\"\n return \"\".join([chr(((ord(ch) + 5 - ord(\"a\")) % 26) + ord(\"a\")) for ch in s])\n\n\n" + solution -# elif entry_point == "find_zero": -# solution = "\n\ndef poly(xs: list, x: float):\n return sum(coeff * (x ** i) for i, coeff in enumerate(xs))\n\n" + solution -# # 执行解决方案 -# exec(solution, global_dict) - -# # 确保入口点函数已定义 -# if entry_point not in global_dict: -# raise ValueError(f"函数 {entry_point} 在解决方案中未定义。") - -# # 执行测试用例 -# exec(test, global_dict) - -# # 获取检查函数 -# check = global_dict["check"] - -# # 运行检查函数 -# result = check(global_dict[entry_point]) - -# if result is None: -# result = (PASS, "解决方案通过了所有测试用例。") - -# except Exception as e: -# # 记录详细的错误信息 -# error_message = f"错误: {str(e)}.\n 解决方案: {solution}.\n 测试: {test}" -# result = (FAIL, error_message) - -# # 将错误信息写入error.log文件 -# with open('error.log', 'a', encoding='utf-8') as log_file: -# log_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {error_message}\n") - -# return result - PASS = "PASS" FAIL = "FAIL" @@ -98,24 +48,33 @@ class TimeoutError(Exception): def run_with_timeout(func, args, timeout): result = [] + stop_event = threading.Event() + def target(): try: result.append(func(*args)) except Exception as e: result.append(e) + finally: + stop_event.set() thread = threading.Thread(target=target) thread.start() - thread.join(timeout) - if thread.is_alive(): + is_timeout = not stop_event.wait(timeout) + + if is_timeout: + # 线程仍在运行,我们无法强制终止它,但至少可以标记超时 raise TimeoutError("Function execution timed out") + + if not result: + return None if isinstance(result[0], Exception): raise result[0] return result[0] def check_solution(solution, test, entry_point): - print(f"solution: {solution}") + solution = sanitize(code=solution, entrypoint=entry_point) try: # 定义一个包含所有必要模块的全局字典 global_dict = { @@ -147,8 +106,8 @@ def check_solution(solution, test, entry_point): # 获取检查函数 check = global_dict["check"] - # 运行检查函数,设置超时时间为5秒 - result = run_with_timeout(check, (global_dict[entry_point],), 120) + # 运行检查函数,设置超时时间为120秒 + result = run_with_timeout(check, (global_dict[entry_point],), 15) if result is None: result = (PASS, "解决方案通过了所有测试用例。") @@ -171,13 +130,7 @@ async def evaluate_problem(data: dict, graph: Callable, path) -> Tuple[str, str, max_retries = 5 retries = 0 - # prediction = await graph(data["prompt"], data["entry_point"]) if graph else "None" - # cost = prediction[1] - # solution = prediction[0] - # ret = check_solution(solution, data["test"], data["entry_point"]) - # test_case_details = ret[1] - # expected_output = test_case_details + "\nCorrect Solution:\ndef " + data["entry_point"] + "(params you should put here):" + "\n\n" + data["canonical_solution"] - # score = 1 if ret[0] == PASS else 0 + expected_output = "\nCorrect Solution:\ndef " + data["entry_point"] + "(params you should put here):" + "\n\n" + data["canonical_solution"] while retries < max_retries: try: @@ -186,7 +139,7 @@ async def evaluate_problem(data: dict, graph: Callable, path) -> Tuple[str, str, solution = prediction[0] ret = check_solution(solution, data["test"], data["entry_point"]) test_case_details = ret[1] - expected_output = test_case_details + "\nCorrect Solution:\ndef " + data["entry_point"] + "(params you should put here):" + "\n\n" + data["canonical_solution"] + expected_output = test_case_details + "\nCorrect Solution:\ndef " + data["entry_point"] + "(params you should put here):" + "\n\n" + data["canonical_solution"] score = 1 if ret[0] == PASS else 0 if score == 0: @@ -258,8 +211,3 @@ async def optimize_humaneval_evaluation(graph: Callable, file_path: str, path: s print(f"Total Cost: {total_cost:.5f}") print(f"Average cost on HumanEval dataset: {average_cost:.5f}") return average_score, average_cost, total_cost - -# TODO HumanEval 主实验后续任务 - -# 1. 修改optimized中的内容,让优化代码能够跑起来 -# 2. 启动主实验 \ No newline at end of file diff --git a/examples/ags/benchmark/mbpp.py b/examples/ags/benchmark/mbpp.py index f2720a5f7..0fc99396d 100644 --- a/examples/ags/benchmark/mbpp.py +++ b/examples/ags/benchmark/mbpp.py @@ -1,11 +1,16 @@ +import os import json import time import asyncio import aiofiles +import threading import pandas as pd from typing import List, Tuple, Callable, Any, Optional, Dict -from tqdm.asyncio import tqdm_asyncio +from datetime import datetime +from tqdm.asyncio import tqdm_asyncio +from examples.ags.benchmark.utils import log_mismatch +from metagpt.actions.code_sanitize import sanitize from examples.ags.benchmark.utils import generate_random_indices PASS = "pass" @@ -21,7 +26,41 @@ async def load_data(file_path: str, samples=1, test=False) -> List[dict]: return data -async def check_solution(solution, test, entry_point): +PASS = "PASS" +FAIL = "FAIL" + +class TimeoutError(Exception): + pass + +def run_with_timeout(func, args, timeout): + result = [] + stop_event = threading.Event() + + def target(): + try: + result.append(func(*args)) + except Exception as e: + result.append(e) + finally: + stop_event.set() + + thread = threading.Thread(target=target) + thread.start() + is_timeout = not stop_event.wait(timeout) + + if is_timeout: + # 线程仍在运行,我们无法强制终止它,但至少可以标记超时 + raise TimeoutError("Function execution timed out") + + if not result: + return None + if isinstance(result[0], Exception): + raise result[0] + return result[0] + +def check_solution(solution, test, entry_point): + + solution = sanitize(code=solution, entrypoint=entry_point) try: # 定义一个包含所有必要模块的全局字典 global_dict = { @@ -47,38 +86,43 @@ async def check_solution(solution, test, entry_point): # 获取检查函数 check = global_dict["check"] - # 运行检查函数 - result = check() + # 运行检查函数,设置超时时间为120秒 + result = run_with_timeout(check, (global_dict[entry_point],), 15) if result is None: result = (PASS, "解决方案通过了所有测试用例。") - # except ValueError as ve: - # if "函数" in str(ve) and "在解决方案中未定义" in str(ve): - # raise + except TimeoutError: + result = (FAIL, "执行超时。请检查您的解决方案是否包含无限循环或过于耗时的操作。") except Exception as e: # 记录详细的错误信息 error_message = f"错误: {str(e)}.\n 解决方案: {solution}.\n 测试: {test}" result = (FAIL, error_message) # 将错误信息写入error.log文件 - with open('error_mbpp.log', 'a', encoding='utf-8') as log_file: + with open('error.log', 'a', encoding='utf-8') as log_file: log_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {error_message}\n") return result -async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, str]: +async def evaluate_problem(data: dict, graph: Callable, path) -> Tuple[str, str, str, int, str]: max_retries = 5 retries = 0 + expected_output = "\nCorrect Solution:\ndef " + data["code"] + while retries < max_retries: try: prediction = await graph(data["prompt"], data["entry_point"]) if graph else "None" cost = prediction[1] solution = prediction[0] ret = await check_solution(solution, data["test"], data["entry_point"]) - + test_case_details = ret[1] score = 1 if ret[0] == PASS else 0 + expected_output = test_case_details + "\nCorrect Solution:" + data["code"] + + if score == 0: + log_mismatch(data["prompt"], expected_output, solution, score, path) break except Exception as e: @@ -92,28 +136,55 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, score = 0 break - return data["prompt"], solution, ret[1], score, cost + return data["prompt"], solution, expected_output, score, cost -async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int, str]]: +async def evaluate_all_problems(data: List[dict], graph: Callable, path:str="", max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int, str]]: semaphore = asyncio.Semaphore(max_concurrent_tasks) async def sem_evaluate(problem): async with semaphore: - return await evaluate_problem(problem, graph) + return await evaluate_problem(problem, graph, path) tasks = [sem_evaluate(problem) for problem in data] return await tqdm_asyncio.gather(*tasks, desc="Evaluating MBPP problems", total=len(data)) -def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]: - df = pd.DataFrame(results, columns=["question", "prediction", "test_case_details", "score", "cost"]) - average_score = df["score"].mean() - total_cost = df["cost"].max() +def save_results_to_csv(results: List[Tuple[str, str, str, int]], path): + # 创建 DataFrame + df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"]) - output_file = f"{path}/{average_score:.5f}.csv" + # 计算统计数据 + avg_score = df["score"].mean() + t_cost = df["cost"].max() + a_cost = t_cost / len(df) if len(df) > 0 else 0 + + # 获取当前时间,格式为 YYYYMMDD_HHMMSS + current_time = datetime.now().strftime("%Y%m%d_%H%M%S") + + # 生成文件名,包含平均分和当前时间,保留五位小数 + filename = f"{avg_score:.5f}_{current_time}.csv" + output_file = os.path.join(path, filename) + + # 保存到 CSV df.to_csv(output_file, index=False) print(f"Results saved to {output_file}") - return average_score, total_cost + + return avg_score, a_cost, t_cost + + +async def load_file_data(file_path: str, specific_indices: List[int] = None) -> List[dict]: + data = [] + # 异步读取文件内容 + async with aiofiles.open(file_path, mode="r", encoding='utf-8') as file: + async for line in file: + data.append(json.loads(line)) + + # 然后在随机选择的样本中基于特定索引列表进行进一步筛选 + if specific_indices is not None: + filtered_data = [data[i] for i in specific_indices if i < len(data)] + return filtered_data + + return data async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]: data = await load_data(file_path, samples, test) @@ -124,17 +195,11 @@ async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: s return average_score, total_cost -async def load_file_data(file_path: str) -> List[dict]: - data = [] - async with aiofiles.open(file_path, mode="r") as file: - async for line in file: - data.append(json.loads(line)) - return data - -async def optimize_mbpp_evaluation(graph: Callable, file_path: str, path: str) -> Tuple[float, float]: - data = await load_file_data(file_path) - results = await evaluate_all_problems(data, graph, max_concurrent_tasks=50) - average_score, total_cost = save_results_to_csv(results, path=path) +async def optimize_mbpp_evaluation(graph: Callable, file_path: str, path: str, va_list: List[int]) -> Tuple[float, float]: + data = await load_file_data(file_path, va_list) + results = await evaluate_all_problems(data, graph, path, max_concurrent_tasks=25) + average_score, average_cost, total_cost = save_results_to_csv(results, path=path) print(f"Average score on MBPP dataset: {average_score:.5f}") print(f"Total Cost: {total_cost:.5f}") - return average_score, total_cost \ No newline at end of file + print(f"Average cost on MBPP dataset: {average_cost:.5f}") + return average_score, average_cost, total_cost \ No newline at end of file diff --git a/examples/ags/data/humaneval_public_test.jsonl b/examples/ags/data/humaneval_public_test.jsonl new file mode 100644 index 000000000..38a4287f9 --- /dev/null +++ b/examples/ags/data/humaneval_public_test.jsonl @@ -0,0 +1,159 @@ +{"problem_id": "HumanEval/0", "test": ["assert candidate([1.0, 2.0, 3.0], 0.5) == False", "assert candidate([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3) == True"], "entry_point": "has_close_elements"} +{"problem_id": "HumanEval/1", "test": ["assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']"], "entry_point": "separate_paren_groups"} +{"problem_id": "HumanEval/2", "test": ["assert candidate(3.5) == 0.5"], "entry_point": "truncate_number"} +{"problem_id": "HumanEval/3", "test": ["assert candidate([1, 2, 3]) == False", "assert candidate([1, 2, -4, 5]) == True"], "entry_point": "below_zero"} +{"problem_id": "HumanEval/4", "test": ["assert candidate([1.0, 2.0, 3.0, 4.0]) == 1.0"], "entry_point": "mean_absolute_deviation"} +{"problem_id": "HumanEval/5", "test": ["assert candidate([], 4) == []", "assert candidate([1, 2, 3], 4) == [1, 4, 2, 4, 3]"], "entry_point": "intersperse"} +{"problem_id": "HumanEval/6", "test": ["assert candidate('(()()) ((())) () ((())()())') == [2, 3, 1, 3]"], "entry_point": "parse_nested_parens"} +{"problem_id": "HumanEval/7", "test": ["assert candidate([], 'a') == []", "assert candidate(['abc', 'bacd', 'cde', 'array'], 'a') == ['abc', 'bacd', 'array']"], "entry_point": "filter_by_substring"} +{"problem_id": "HumanEval/8", "test": ["assert candidate([]) == (0, 1)", "assert candidate([1, 2, 3, 4]) == (10, 24)"], "entry_point": "sum_product"} +{"problem_id": "HumanEval/9", "test": ["assert candidate([1, 2, 3, 2, 3, 4, 2]) == [1, 2, 3, 3, 3, 4, 4]"], "entry_point": "rolling_max"} +{"problem_id": "HumanEval/10", "test": ["assert candidate('cat') == 'catac'", "assert candidate('cata') == 'catac'"], "entry_point": "make_palindrome"} +{"problem_id": "HumanEval/11", "test": ["assert candidate('010', '110') == '100'"], "entry_point": "string_xor"} +{"problem_id": "HumanEval/12", "test": ["assert candidate([]) == None", "assert candidate(['a', 'b', 'c']) == 'a'", "assert candidate(['a', 'bb', 'ccc']) == 'ccc'"], "entry_point": "longest"} +{"problem_id": "HumanEval/13", "test": ["assert candidate(3, 5) == 1", "assert candidate(25, 15) == 5"], "entry_point": "greatest_common_divisor"} +{"problem_id": "HumanEval/14", "test": ["assert candidate('abc') == ['a', 'ab', 'abc']"], "entry_point": "all_prefixes"} +{"problem_id": "HumanEval/15", "test": ["assert candidate(0) == '0'", "assert candidate(5) == '0 1 2 3 4 5'"], "entry_point": "string_sequence"} +{"problem_id": "HumanEval/16", "test": ["assert candidate('xyzXYZ') == 3", "assert candidate('Jerry') == 4"], "entry_point": "count_distinct_characters"} +{"problem_id": "HumanEval/17", "test": ["assert candidate('o o| .| o| o| .| .| .| .| o o') == [4, 2, 1, 2, 2, 1, 1, 1, 1, 4, 4]"], "entry_point": "parse_music"} +{"problem_id": "HumanEval/18", "test": ["assert candidate('', 'a') == 0", "assert candidate('aaa', 'a') == 3", "assert candidate('aaaa', 'aa') == 3"], "entry_point": "how_many_times"} +{"problem_id": "HumanEval/19", "test": ["assert candidate('three one five') == 'one three five'"], "entry_point": "sort_numbers"} +{"problem_id": "HumanEval/20", "test": ["assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.2]) == (2.0, 2.2)", "assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0]) == (2.0, 2.0)"], "entry_point": "find_closest_elements"} +{"problem_id": "HumanEval/21", "test": ["assert candidate([1.0, 2.0, 3.0, 4.0, 5.0]) == [0.0, 0.25, 0.5, 0.75, 1.0]"], "entry_point": "rescale_to_unit"} +{"problem_id": "HumanEval/22", "test": ["assert candidate(['a', 3.14, 5]) == [5]", "assert candidate([1, 2, 3, 'abc', {}, []]) == [1, 2, 3]"], "entry_point": "filter_integers"} +{"problem_id": "HumanEval/23", "test": ["assert candidate('',) == 0", "assert candidate('abc',) == 3"], "entry_point": "strlen"} +{"problem_id": "HumanEval/24", "test": ["assert candidate(15) == 5"], "entry_point": "largest_divisor"} +{"problem_id": "HumanEval/25", "test": ["assert candidate(8) == [2, 2, 2]", "assert candidate(25) == [5, 5]", "assert candidate(70) == [2, 5, 7]"], "entry_point": "factorize"} +{"problem_id": "HumanEval/26", "test": ["assert candidate([1, 2, 3, 2, 4]) == [1, 3, 4]"], "entry_point": "remove_duplicates"} +{"problem_id": "HumanEval/27", "test": ["assert candidate('Hello') == 'hELLO'"], "entry_point": "flip_case"} +{"problem_id": "HumanEval/28", "test": ["assert candidate([]) == ''", "assert candidate(['a', 'b', 'c']) == 'abc'"], "entry_point": "concatenate"} +{"problem_id": "HumanEval/29", "test": ["assert candidate([], 'a') == []", "assert candidate(['abc', 'bcd', 'cde', 'array'], 'a') == ['abc', 'array']"], "entry_point": "filter_by_prefix"} +{"problem_id": "HumanEval/30", "test": ["assert candidate([-1, 2, -4, 5, 6]) == [2, 5, 6]", "assert candidate([5, 3, -5, 2, -3, 3, 9, 0, 123, 1, -10]) == [5, 3, 2, 3, 9, 123, 1]"], "entry_point": "get_positive"} +{"problem_id": "HumanEval/31", "test": ["assert candidate(6) == False", "assert candidate(101) == True", "assert candidate(11) == True", "assert candidate(13441) == True", "assert candidate(61) == True", "assert candidate(4) == False", "assert candidate(1) == False"], "entry_point": "is_prime"} +{"problem_id": "HumanEval/33", "test": ["assert candidate([1, 2, 3]) == [1, 2, 3]", "assert candidate([5, 6, 3, 4, 8, 9, 2]) == [2, 6, 3, 4, 8, 9, 5"], "entry_point": "sort_third"} +{"problem_id": "HumanEval/34", "test": ["assert candidate([5, 3, 5, 2, 3, 3, 9, 0, 123]) == [0, 2, 3, 5, 9, 123]"], "entry_point": "unique"} +{"problem_id": "HumanEval/35", "test": ["assert candidate([1, 2, 3]) == 3", "assert candidate([5, 3, -5, 2, -3, 3, 9, 0, 123, 1, -10]) == 123"], "entry_point": "max_element"} +{"problem_id": "HumanEval/36", "test": ["assert candidate(50) == 0", "assert candidate(78) == 2", "assert candidate(79) == 3"], "entry_point": "fizz_buzz"} +{"problem_id": "HumanEval/37", "test": ["assert candidate([1, 2, 3]) == [1, 2, 3]", "assert candidate([5, 6, 3, 4]) == [3, 6, 5, 4]"], "entry_point": "sort_even"} +{"problem_id": "HumanEval/39", "test": ["assert candidate(1) == 2", "assert candidate(2) == 3", "assert candidate(3) == 5", "assert candidate(4) == 13", "assert candidate(5) == 89"], "entry_point": "prime_fib"} +{"problem_id": "HumanEval/40", "test": ["assert candidate([1, 3, 5, 0]) == False", "assert candidate([1, 3, -2, 1]) == True", "assert candidate([1, 2, 3, 7]) == False", "assert candidate([2, 4, -5, 3, 9, 7]) == True", "assert candidate([1]) == False"], "entry_point": "triples_sum_to_zero"} +{"problem_id": "HumanEval/41", "test": ["assert candidate(1) == 0", "assert candidate(2) == 0", "assert candidate(3) == 0", "assert candidate(10) == 0"], "entry_point": "car_race_collision"} +{"problem_id": "HumanEval/42", "test": ["assert candidate([1, 2, 3]) == [2, 3, 4]", "assert candidate([5, 3, 5, 2, 3, 3, 9, 0, 123]) == [6, 4, 6, 3, 4, 4, 10, 1, 124]"], "entry_point": "incr_list"} +{"problem_id": "HumanEval/43", "test": ["assert candidate([1, 3, 5, 0]) == False", "assert candidate([1, 3, -2, 1]) == False", "assert candidate([1, 2, 3, 7]) == False", "assert candidate([2, 4, -5, 3, 5, 7]) == True", "assert candidate([1]) == False"], "entry_point": "pairs_sum_to_zero"} +{"problem_id": "HumanEval/44", "test": ["assert candidate(8, 3) == '22'", "assert candidate(8, 2) == '1000'", "assert candidate(7, 2) == '111'"], "entry_point": "change_base"} +{"problem_id": "HumanEval/45", "test": ["assert candidate(5, 3) == 7.5"], "entry_point": "triangle_area"} +{"problem_id": "HumanEval/46", "test": ["assert candidate(0) == 0", "assert candidate(1) == 0", "assert candidate(2) == 2", "assert candidate(3) == 0", "assert candidate(5) == 4", "assert candidate(6) == 8", "assert candidate(7) == 14"], "entry_point": "fib4"} +{"problem_id": "HumanEval/47", "test": ["assert candidate([3, 1, 2, 4, 5]) == 3", "assert candidate([-10, 4, 6, 1000, 10, 20]) == 15.0"], "entry_point": "median"} +{"problem_id": "HumanEval/48", "test": ["assert candidate('') == True", "assert candidate('aba') == True", "assert candidate('aaaaa') == True", "assert candidate('zbcd') == False"], "entry_point": "is_palindrome"} +{"problem_id": "HumanEval/49", "test": ["assert candidate(3, 5) == 3", "assert candidate(1101, 101) == 2", "assert candidate(0, 101) == 1", "assert candidate(3, 11) == 8", "assert candidate(100, 101) == 1"], "entry_point": "modp"} +{"problem_id": "HumanEval/51", "test": ["assert candidate('abcdef\\nghijklm') == 'bcdf\\nghjklm'", "assert candidate('abcdef') == 'bcdf'", "assert candidate('aaaaa') == ''", "assert candidate('aaBAA') == 'B'", "assert candidate('zbcd') == 'zbcd'"], "entry_point": "remove_vowels"} +{"problem_id": "HumanEval/52", "test": ["assert candidate([1, 2, 4, 10], 100) == True", "assert candidate([1, 20, 4, 10], 5) == False"], "entry_point": "below_threshold"} +{"problem_id": "HumanEval/53", "test": ["assert candidate([2, 3]) == 5", "assert candidate([5, 7]) == 12"], "entry_point": "add"} +{"problem_id": "HumanEval/54", "test": ["assert candidate('eabcdzzzz', 'dddzzzzzzzddeddabc') == True", "assert candidate('abcd', 'dddddddabc') == True", "assert candidate('dddddddabc', 'abcd') == True", "assert candidate('eabcd', 'dddddddabc') == False", "assert candidate('abcd', 'dddddddabce') == False", "assert candidate('eabcdzzzz', 'dddzzzzzzzddddabc') == False"], "entry_point": "same_chars"} +{"problem_id": "HumanEval/55", "test": ["assert candidate(10) == 55", "assert candidate(1) == 1", "assert candidate(8) == 21"], "entry_point": "fib"} +{"problem_id": "HumanEval/56", "test": ["assert candidate(\"<\") == False", "assert candidate(\"<>\") == True", "assert candidate(\"<<><>>\") == True", "assert candidate(\"><<>\") == False"], "entry_point": "correct_bracketing"} +{"problem_id": "HumanEval/57", "test": ["assert candidate([1, 2, 4, 20]) == True", "assert candidate([1, 20, 4, 10]) == False", "assert candidate([4, 1, 0, -10]) == True"], "entry_point": "monotonic"} +{"problem_id": "HumanEval/58", "test": ["assert candidate([1, 4, 3, 34, 653, 2, 5], [5, 7, 1, 5, 9, 653, 121]) == [1, 5, 653]", "assert candidate([5, 3, 2, 8], [3, 2]) == [2, 3]"], "entry_point": "common"} +{"problem_id": "HumanEval/59", "test": ["assert candidate(13195) == 29", "assert candidate(2048) == 2"], "entry_point": "largest_prime_factor"} +{"problem_id": "HumanEval/60", "test": ["assert candidate(30) == 465", "assert candidate(100) == 5050", "assert candidate(5) == 15", "assert candidate(10) == 55", "assert candidate(1) == 1"], "entry_point": "sum_to_n"} +{"problem_id": "HumanEval/61", "test": ["assert candidate(\"(\") == False", "assert candidate(\"()\") == True", "assert candidate(\"(()())\") == True", "assert candidate(\")(()\") == False"], "entry_point": "correct_bracketing"} +{"problem_id": "HumanEval/62", "test": ["assert candidate([3, 1, 2, 4, 5]) == [1, 4, 12, 20]", "assert candidate([1, 2, 3]) == [2, 6]"], "entry_point": "derivative"} +{"problem_id": "HumanEval/63", "test": ["assert candidate(1) == 0", "assert candidate(5) == 4", "assert candidate(8) == 24"], "entry_point": "fibfib"} +{"problem_id": "HumanEval/64", "test": ["assert candidate('abcde') == 2", "assert candidate('ACEDY') == 3"], "entry_point": "vowels_count"} +{"problem_id": "HumanEval/65", "test": ["assert candidate(12, 1) == '21'", "assert candidate(12, 2) == '12'"], "entry_point": "circular_shift"} +{"problem_id": "HumanEval/66", "test": ["assert candidate(\"\") == 0", "assert candidate(\"abAB\") == 131", "assert candidate(\"abcCd\") == 67", "assert candidate(\"helloE\") == 69", "assert candidate(\"woArBld\") == 131", "assert candidate(\"aAaaaXa\") == 153"], "entry_point": "digitSum"} +{"problem_id": "HumanEval/67", "test": ["assert candidate('5 apples and 6 oranges', 19) == 8", "assert candidate('0 apples and 1 oranges', 3) == 2", "assert candidate('2 apples and 3 oranges', 100) == 95", "assert candidate('100 apples and 1 oranges', 120) == 19"], "entry_point": "fruit_distribution"} +{"problem_id": "HumanEval/68", "test": ["assert candidate([4, 2, 3]) == [2, 1]", "assert candidate([1, 2, 3]) == [2, 1]", "assert candidate([]) == []", "assert candidate([5, 0, 3, 0, 4, 2]) == [0, 1]"], "entry_point": "pluck"} +{"problem_id": "HumanEval/69", "test": ["assert candidate([4, 1, 2, 2, 3, 1]) == 2", "assert candidate([1, 2, 2, 3, 3, 3, 4, 4, 4]) == 3", "assert candidate([5, 5, 4, 4, 4]) == -1"], "entry_point": "search"} +{"problem_id": "HumanEval/70", "test": ["assert candidate([1, 2, 3, 4]) == [1, 4, 2, 3]", "assert candidate([5, 5, 5, 5]) == [5, 5, 5, 5]", "assert candidate([]) == []"], "entry_point": "strange_sort_list"} +{"problem_id": "HumanEval/71", "test": ["assert candidate(3, 4, 5) == 6.00", "assert candidate(1, 2, 10) == -1"], "entry_point": "triangle_area"} +{"problem_id": "HumanEval/72", "test": ["assert candidate([1, 2], 5) == False", "assert candidate([3, 2, 3], 1) == False", "assert candidate([3, 2, 3], 9) == True", "assert candidate([3], 5) == True"], "entry_point": "will_it_fly"} +{"problem_id": "HumanEval/73", "test": ["assert candidate([1, 2, 3, 5, 4, 7, 9, 6]) == 4", "assert candidate([1, 2, 3, 4, 3, 2, 2]) == 1", "assert candidate([1, 2, 3, 2, 1]) == 0"], "entry_point": "smallest_change"} +{"problem_id": "HumanEval/74", "test": ["assert candidate([], []) == []", "assert candidate(['hi', 'admin'], ['hI', 'Hi']) == ['hI', 'Hi']", "assert candidate(['hi', 'admin'], ['hi', 'hi', 'admin', 'project']) == ['hi', 'admin']", "assert candidate(['hi', 'admin'], ['hI', 'hi', 'hi']) == ['hI', 'hi', 'hi']", "assert candidate(['4'], ['1', '2', '3', '4', '5']) == ['4']"], "entry_point": "total_match"} +{"problem_id": "HumanEval/75", "test": ["assert candidate(30) == True"], "entry_point": "is_multiply_prime"} +{"problem_id": "HumanEval/76", "test": ["assert candidate(1, 4) == True", "assert candidate(2, 2) == True", "assert candidate(8, 2) == True", "assert candidate(3, 2) == False", "assert candidate(3, 1) == False", "assert candidate(5, 3) == False"], "entry_point": "is_simple_power"} +{"problem_id": "HumanEval/77", "test": ["assert candidate(1) == True", "assert candidate(2) == False", "assert candidate(-1) == True", "assert candidate(64) == True", "assert candidate(0) == True", "assert candidate(180) == False"], "entry_point": "iscube"} +{"problem_id": "HumanEval/78", "test": ["assert candidate('AB') == 1", "assert candidate('1077E') == 2", "assert candidate('ABED1A33') == 4", "assert candidate('123456789ABCDEF0') == 6", "assert candidate('2020') == 2"], "entry_point": "hex_key"} +{"problem_id": "HumanEval/79", "test": ["assert candidate(15) == 'db1111db'", "assert candidate(32) == 'db100000db'"], "entry_point": "decimal_to_binary"} +{"problem_id": "HumanEval/80", "test": ["assert candidate('a') == False", "assert candidate('aa') == False", "assert candidate('abcd') == True", "assert candidate('aabb') == False", "assert candidate('adb') == True", "assert candidate('xyy') == False"], "entry_point": "is_happy"} +{"problem_id": "HumanEval/81", "test": ["assert candidate([4.0, 3, 1.7, 2, 3.5]) == ['A+', 'B', 'C-', 'C', 'A-']"], "entry_point": "numerical_letter_grade"} +{"problem_id": "HumanEval/82", "test": ["assert candidate('Hello') == True", "assert candidate('abcdcba') == True", "assert candidate('kittens') == True", "assert candidate('orange') == False"], "entry_point": "prime_length"} +{"problem_id": "HumanEval/84", "test": ["assert candidate(1000) == '1'", "assert candidate(150) == '110'", "assert candidate(147) == '1100'"], "entry_point": "solve"} +{"problem_id": "HumanEval/85", "test": ["assert candidate([4, 2, 6, 7]) == 2"], "entry_point": "add"} +{"problem_id": "HumanEval/86", "test": ["assert candidate('Hi') == 'Hi'", "assert candidate('hello') == 'ehllo'", "assert candidate('Hello World!!!') == 'Hello !!!Wdlor'"], "entry_point": "anti_shuffle"} +{"problem_id": "HumanEval/87", "test": ["assert candidate([[1,2,3,4,5,6], [1,2,3,4,1,6], [1,2,3,4,5,1]], 1) == [(0, 0), (1, 4), (1, 0), (2, 5), (2, 0)]", "assert candidate([], 1) == []", "assert candidate([[], [1], [1, 2, 3]], 3) == [(2, 2)]"], "entry_point": "get_row"} +{"problem_id": "HumanEval/88", "test": ["assert candidate([]) == []", "assert candidate([5]) == [5]", "assert candidate([2, 4, 3, 0, 1, 5]) == [0, 1, 2, 3, 4, 5]", "assert candidate([2, 4, 3, 0, 1, 5, 6]) == [6, 5, 4, 3, 2, 1, 0]"], "entry_point": "sort_array"} +{"problem_id": "HumanEval/89", "test": ["assert candidate('hi') == 'lm'", "assert candidate('asdfghjkl') == 'ewhjklnop'", "assert candidate('gf') == 'kj'", "assert candidate('et') == 'ix'"], "entry_point": "encrypt"} +{"problem_id": "HumanEval/90", "test": ["assert candidate([1, 2, 3, 4, 5]) == 2", "assert candidate([5, 1, 4, 3, 2]) == 2", "assert candidate([]) == None", "assert candidate([1, 1]) == None"], "entry_point": "next_smallest"} +{"problem_id": "HumanEval/91", "test": ["assert candidate('Hello world') == 0", "assert candidate('The sky is blue. The sun is shining. I love this weather') == 1"], "entry_point": "is_bored"} +{"problem_id": "HumanEval/92", "test": ["assert candidate(5, 2, 7) == True", "assert candidate(3, 2, 2) == False", "assert candidate(3, -2, 1) == True", "assert candidate(3.6, -2.2, 2) == False"], "entry_point": "any_int"} +{"problem_id": "HumanEval/93", "test": ["assert candidate('test') == 'TGST'", "assert candidate('This is a message') == 'tHKS KS C MGSSCGG'"], "entry_point": "encode"} +{"problem_id": "HumanEval/94", "test": ["assert candidate([0, 3, 2, 1, 3, 5, 7, 4, 5, 5, 5, 2, 181, 32, 4, 32, 3, 2, 32, 324, 4, 3]) == 10", "assert candidate([1, 0, 1, 8, 2, 4597, 2, 1, 3, 40, 1, 2, 1, 2, 4, 2, 5, 1]) == 25", "assert candidate([1, 3, 1, 32, 5107, 34, 83278, 109, 163, 23, 2323, 32, 30, 1, 9, 3]) == 13", "assert candidate([0, 724, 32, 71, 99, 32, 6, 0, 5, 91, 83, 0, 5, 6]) == 11", "assert candidate([0, 81, 12, 3, 1, 21]) == 3", "assert candidate([0, 8, 1, 2, 1, 7]) == 7"], "entry_point": "skjkasdkd"} +{"problem_id": "HumanEval/95", "test": ["assert candidate({'a': 'apple', 'b': 'banana'}) == True", "assert candidate({'a': 'apple', 'A': 'banana', 'B': 'banana'}) == False", "assert candidate({'a': 'apple', 8: 'banana', 'a': 'apple'}) == False", "assert candidate({'Name': 'John', 'Age': '36', 'City': 'Houston'}) == False", "assert candidate({'STATE': 'NC', 'ZIP': '12345'}) == True"], "entry_point": "check_dict_case"} +{"problem_id": "HumanEval/96", "test": ["assert candidate(5) == [2, 3]", "assert candidate(11) == [2, 3, 5, 7]", "assert candidate(0) == []", "assert candidate(20) == [2, 3, 5, 7, 11, 13, 17, 19]", "assert candidate(1) == []", "assert candidate(18) == [2, 3, 5, 7, 11, 13, 17]"], "entry_point": "count_up_to"} +{"problem_id": "HumanEval/97", "test": ["assert candidate(148, 412) == 16", "assert candidate(19, 28) == 72", "assert candidate(2020, 1851) == 0", "assert candidate(14, -15) == 20"], "entry_point": "multiply"} +{"problem_id": "HumanEval/98", "test": ["assert candidate('aBCdEf') == 1", "assert candidate('abcdefg') == 0", "assert candidate('dBBE') == 0"], "entry_point": "count_upper"} +{"problem_id": "HumanEval/99", "test": ["assert candidate('10') == 10", "assert candidate('15.3') == 15", "assert candidate('14.5') == 15", "assert candidate('-14.5') == -15"], "entry_point": "closest_integer"} +{"problem_id": "HumanEval/100", "test": ["assert candidate(3) == [3, 5, 7]"], "entry_point": "make_a_pile"} +{"problem_id": "HumanEval/101", "test": ["assert candidate('Hi, my name is John') == ['Hi', 'my', 'name', 'is', 'John']", "assert candidate('One, two, three, four, five, six') == ['One', 'two', 'three', 'four', 'five', 'six']"], "entry_point": "words_string"} +{"problem_id": "HumanEval/102", "test": ["assert candidate(12, 15) == 14", "assert candidate(13, 12) == -1"], "entry_point": "choose_num"} +{"problem_id": "HumanEval/103", "test": ["assert candidate(1, 5) == '0b11'", "assert candidate(7, 5) == -1", "assert candidate(10, 20) == '0b1111'", "assert candidate(20, 33) == '0b11010'"], "entry_point": "rounded_avg"} +{"problem_id": "HumanEval/104", "test": ["assert candidate([15, 33, 1422, 1]) == [1, 15, 33]", "assert candidate([152, 323, 1422, 10]) == []"], "entry_point": "unique_digits"} +{"problem_id": "HumanEval/106", "test": ["assert candidate(5) == [1, 2, 6, 24, 15]"], "entry_point": "f"} +{"problem_id": "HumanEval/107", "test": ["assert candidate(3) == (1, 2)", "assert candidate(12) == (4, 6)"], "entry_point": "even_odd_palindrome"} +{"problem_id": "HumanEval/108", "test": ["assert candidate([]) == 0", "assert candidate([-1, 11, -11]) == 1", "assert candidate([1, 1, 2]) == 3"], "entry_point": "count_nums"} +{"problem_id": "HumanEval/109", "test": ["assert candidate([3, 4, 5, 1, 2]) == True", "assert candidate([3, 5, 4, 1, 2]) == False", "assert candidate([]) == True"], "entry_point": "move_one_ball"} +{"problem_id": "HumanEval/110", "test": ["assert candidate([1, 2, 3, 4], [1, 2, 3, 4]) == 'YES'", "assert candidate([1, 2, 3, 4], [1, 5, 3, 4]) == 'NO'"], "entry_point": "exchange"} +{"problem_id": "HumanEval/111", "test": ["assert candidate('a b c') == {'a': 1, 'b': 1, 'c': 1}", "assert candidate('a b b a') == {'a': 2, 'b': 2}", "assert candidate('a b c a b') == {'a': 2, 'b': 2}", "assert candidate('b b b b a') == {'b': 4}", "assert candidate('') == {}"], "entry_point": "histogram"} +{"problem_id": "HumanEval/112", "test": ["assert candidate('abcde', 'ae') == ('bcd', False)", "assert candidate('abcdef', 'b') == ('acdef', False)", "assert candidate('abcdedcba', 'ab') == ('cdedc', True)"], "entry_point": "reverse_delete"} +{"problem_id": "HumanEval/113", "test": ["assert candidate(['1234567']) == [\"the number of odd elements 4n the str4ng 4 of the 4nput.\"]", "assert candidate(['3', '11111111']) == [\"the number of odd elements 1n the str1ng 1 of the 1nput.\", \"the number of odd elements 8n the str8ng 8 of the 8nput.\"]"], "entry_point": "odd_count"} +{"problem_id": "HumanEval/114", "test": ["assert candidate([2, 3, 4, 1, 2, 4]) == 1", "assert candidate([-1, -2, -3]) == -6"], "entry_point": "minSubArraySum"} +{"problem_id": "HumanEval/115", "test": ["assert candidate([[0,0,1,0], [0,1,0,0], [1,1,1,1]], 1) == 6", "assert candidate([[0,0,1,1], [0,0,0,0], [1,1,1,1], [0,1,1,1]], 2) == 5", "assert candidate([[0,0,0], [0,0,0]], 5) == 0"], "entry_point": "max_fill"} +{"problem_id": "HumanEval/116", "test": ["assert candidate([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5]", "assert candidate([-2, -3, -4, -5, -6]) == [-6, -5, -4, -3, -2]", "assert candidate([1, 0, 2, 3, 4]) == [0, 1, 2, 3, 4]"], "entry_point": "sort_array"} +{"problem_id": "HumanEval/117", "test": ["assert candidate(\"Mary had a little lamb\", 4) == [\"little\"]", "assert candidate(\"Mary had a little lamb\", 3) == [\"Mary\", \"lamb\"]", "assert candidate(\"simple white space\", 2) == []", "assert candidate(\"Hello world\", 4) == [\"world\"]", "assert candidate(\"Uncle sam\", 3) == [\"Uncle\"]"], "entry_point": "select_words"} +{"problem_id": "HumanEval/118", "test": ["assert candidate('yogurt') == 'u'", "assert candidate('FULL') == 'U'", "assert candidate('quick') == ''", "assert candidate('ab') == ''"], "entry_point": "get_closest_vowel"} +{"problem_id": "HumanEval/119", "test": ["assert candidate(['()(', ')']) == 'Yes'", "assert candidate([')', ')']) == 'No'"], "entry_point": "match_parens"} +{"problem_id": "HumanEval/120", "test": ["assert candidate([-3, -4, 5], 3) == [-4, -3, 5]", "assert candidate([4, -4, 4], 2) == [4, 4]", "assert candidate([-3, 2, 1, 2, -1, -2, 1], 1) == [2]"], "entry_point": "maximum"} +{"problem_id": "HumanEval/121", "test": ["assert candidate([5, 8, 7, 1]) == 12", "assert candidate([3, 3, 3, 3, 3]) == 9", "assert candidate([30, 13, 24, 321]) == 0"], "entry_point": "solution"} +{"problem_id": "HumanEval/122", "test": ["assert candidate([111, 21, 3, 4000, 5, 6, 7, 8, 9], 4) == 24"], "entry_point": "add_elements"} +{"problem_id": "HumanEval/123", "test": ["assert candidate(5) == [1, 5]"], "entry_point": "get_odd_collatz"} +{"problem_id": "HumanEval/124", "test": ["assert candidate('03-11-2000') == True", "assert candidate('15-01-2012') == False", "assert candidate('04-0-2040') == False", "assert candidate('06-04-2020') == True", "assert candidate('06/04/2020') == False"], "entry_point": "valid_date"} +{"problem_id": "HumanEval/125", "test": ["assert candidate('Hello world!') == ['Hello', 'world!']", "assert candidate('Hello,world!') == ['Hello', 'world!']", "assert candidate('abcdef') == 3"], "entry_point": "split_words"} +{"problem_id": "HumanEval/126", "test": ["assert candidate([5]) == True", "assert candidate([1, 2, 3, 4, 5]) == True", "assert candidate([1, 3, 2, 4, 5]) == False", "assert candidate([1, 2, 3, 4, 5, 6]) == True", "assert candidate([1, 2, 3, 4, 5, 6, 7]) == True", "assert candidate([1, 3, 2, 4, 5, 6, 7]) == False", "assert candidate([1, 2, 2, 3, 3, 4]) == True", "assert candidate([1, 2, 2, 2, 3, 4]) == False"], "entry_point": "is_sorted"} +{"problem_id": "HumanEval/127", "test": ["assert candidate((1, 2), (2, 3)) == 'NO'", "assert candidate((-1, 1), (0, 4)) == 'NO'", "assert candidate((-3, -1), (-5, 5)) == 'YES'"], "entry_point": "intersection"} +{"problem_id": "HumanEval/128", "test": ["assert candidate([1, 2, 2, -4]) == -9", "assert candidate([0, 1]) == 0", "assert candidate([]) == None"], "entry_point": "prod_signs"} +{"problem_id": "HumanEval/129", "test": ["assert candidate([[1,2,3], [4,5,6], [7,8,9]], 3) == [1, 2, 1]", "assert candidate([[5,9,3], [4,1,6], [7,8,2]], 1) == [1]"], "entry_point": "minPath"} +{"problem_id": "HumanEval/130", "test": ["assert candidate(3) == [1, 3, 2, 8]", "assert candidate(2) == [1, 3, 2]", "assert candidate(4) == [1, 3, 2, 8, 3]"], "entry_point": "tri"} +{"problem_id": "HumanEval/131", "test": ["assert candidate(1) == 1", "assert candidate(4) == 0", "assert candidate(235) == 15"], "entry_point": "digits"} +{"problem_id": "HumanEval/132", "test": ["assert candidate('[[]]') == True", "assert candidate('[]]]]]]][[[[[]') == False", "assert candidate('[][]') == False", "assert candidate('[]') == False", "assert candidate('[[][]]') == True", "assert candidate('[[]][[') == True"], "entry_point": "is_nested"} +{"problem_id": "HumanEval/133", "test": ["assert candidate([1, 2, 3]) == 14", "assert candidate([1, 4, 9]) == 98", "assert candidate([1, 3, 5, 7]) == 84", "assert candidate([1.4, 4.2, 0]) == 29", "assert candidate([-2.4, 1, 1]) == 6"], "entry_point": "sum_squares"} +{"problem_id": "HumanEval/134", "test": ["assert candidate(\"apple pie\") == False", "assert candidate(\"apple pi e\") == True", "assert candidate(\"apple pi e \") == False", "assert candidate(\"\") == False"], "entry_point": "check_if_last_char_is_a_letter"} +{"problem_id": "HumanEval/135", "test": ["assert candidate([1,2,4,3,5]) == 3", "assert candidate([1,2,3]) == -1"], "entry_point": "can_arrange"} +{"problem_id": "HumanEval/136", "test": ["assert candidate([2, 4, 1, 3, 5, 7]) == (None, 1)", "assert candidate([]) == (None, None)", "assert candidate([0]) == (None, None)"], "entry_point": "largest_smallest_integers"} +{"problem_id": "HumanEval/137", "test": ["assert candidate(1, 2.5) == 2.5", "assert candidate(1, '2,3') == '2,3'", "assert candidate('5,1', '6') == '6'", "assert candidate('1', 1) == None"], "entry_point": "compare_one"} +{"problem_id": "HumanEval/138", "test": ["assert candidate(4) == False", "assert candidate(6) == False", "assert candidate(8) == True"], "entry_point": "is_equal_to_sum_even"} +{"problem_id": "HumanEval/139", "test": ["assert candidate(4) == 288"], "entry_point": "special_factorial"} +{"problem_id": "HumanEval/140", "test": ["assert candidate('Example') == 'Example'", "assert candidate('Example 1') == 'Example_1'", "assert candidate(' Example 2') == '_Example_2'", "assert candidate(' Example 3') == '_Example-3'"], "entry_point": "fix_spaces"} +{"problem_id": "HumanEval/141", "test": ["assert candidate('example.txt') == 'Yes'", "assert candidate('1example.dll') == 'No'"], "entry_point": "file_name_check"} +{"problem_id": "HumanEval/142", "test": ["assert candidate([1, 2, 3]) == 6", "assert candidate([]) == 0", "assert candidate([-1, -5, 2, -1, -5]) == -126"], "entry_point": "sum_squares"} +{"problem_id": "HumanEval/143", "test": ["assert candidate('This is a test') == 'is'", "assert candidate('lets go for swimming') == 'go for'"], "entry_point": "words_in_sentence"} +{"problem_id": "HumanEval/144", "test": ["assert candidate('1/5', '5/1') == True", "assert candidate('1/6', '2/1') == False", "assert candidate('7/10', '10/2') == False"], "entry_point": "simplify"} +{"problem_id": "HumanEval/145", "test": ["assert candidate([1, 11, -1, -11, -12]) == [-1, -11, 1, -12, 11]", "assert candidate([]) == []"], "entry_point": "order_by_points"} +{"problem_id": "HumanEval/146", "test": ["assert candidate([15, -73, 14, -15]) == 1", "assert candidate([33, -2, -3, 45, 21, 109]) == 2"], "entry_point": "specialFilter"} +{"problem_id": "HumanEval/147", "test": ["assert candidate(5) == 1"], "entry_point": "get_max_triples"} +{"problem_id": "HumanEval/148", "test": ["assert candidate('Jupiter', 'Neptune') == ('Saturn', 'Uranus')", "assert candidate('Earth', 'Mercury') == ('Venus')", "assert candidate('Mercury', 'Uranus') == ('Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn')", ""], "entry_point": "bf"} +{"problem_id": "HumanEval/149", "test": ["assert candidate(['aa', 'a', 'aaa']) == ['aa']", "assert candidate(['ab', 'a', 'aaa', 'cd']) == ['ab', 'cd']"], "entry_point": "sorted_list_sum"} +{"problem_id": "HumanEval/150", "test": ["assert candidate(7, 34, 12) == 34", "assert candidate(15, 8, 5) == 5"], "entry_point": "x_or_y"} +{"problem_id": "HumanEval/151", "test": ["assert candidate([1, 3, 2, 0]) == 10", "assert candidate([-1, -2, 0]) == 0", "assert candidate([9, -2]) == 81", "assert candidate([0]) == 0", "assert candidate([]) == 0"], "entry_point": "double_the_difference"} +{"problem_id": "HumanEval/152", "test": ["assert candidate([1,2,3,4,5,1],[1,2,3,4,2,-2]) == [0,0,0,0,3,3]", "assert candidate([0,5,0,0,0,4],[4,1,1,0,0,-2]) == [4,4,1,0,0,6]"], "entry_point": "compare"} +{"problem_id": "HumanEval/153", "test": ["assert candidate('my_class', ['AA', 'Be', 'CC']) == 'my_class.AA'", "assert candidate('Slices', ['SErviNGSliCes', 'Cheese', 'StuFfed']) == 'Slices.SErviNGSliCes'"], "entry_point": "Strongest_Extension"} +{"problem_id": "HumanEval/154", "test": ["assert candidate('abcd', 'abd') == False", "assert candidate('hello', 'ell') == True", "assert candidate('whassup', 'psus') == False", "assert candidate('abab', 'baa') == True", "assert candidate('efef', 'eeff') == False", "assert candidate('himenss', 'simen') == True"], "entry_point": "cycpattern_check"} +{"problem_id": "HumanEval/155", "test": ["assert candidate(-12) == (1, 1)", "assert candidate(123) == (1, 2)"], "entry_point": "even_odd_count"} +{"problem_id": "HumanEval/156", "test": ["assert candidate(19) == 'xix'", "assert candidate(152) == 'clii'", "assert candidate(426) == 'cdxxvi'"], "entry_point": "int_to_mini_roman"} +{"problem_id": "HumanEval/157", "test": ["assert candidate(3, 4, 5) == True", "assert candidate(1, 2, 3) == False"], "entry_point": "right_angle_triangle"} +{"problem_id": "HumanEval/158", "test": ["assert candidate([\"name\", \"of\", \"string\"]) == \"string\"", "assert candidate([\"name\", \"enam\", \"game\"]) == \"enam\"", "assert candidate([\"aaaaaaa\", \"bb\", \"cc\"]) == \"aaaaaaa\""], "entry_point": "find_max"} +{"problem_id": "HumanEval/159", "test": ["assert candidate(5, 6, 10) == [11, 4]", "assert candidate(4, 8, 9) == [12, 1]", "assert candidate(1, 10, 10) == [11, 0]", "assert candidate(2, 11, 5) == [7, 0]"], "entry_point": "eat"} +{"problem_id": "HumanEval/160", "test": ["assert candidate(['+', '*', '-'], [2, 3, 4, 5]) == 9"], "entry_point": "do_algebra"} +{"problem_id": "HumanEval/161", "test": ["assert candidate('1234') == '4321'", "assert candidate('ab') == 'AB'", "assert candidate('#a@C') == '#A@c'"], "entry_point": "solve"} +{"problem_id": "HumanEval/162", "test": ["assert candidate('Hello world') == '3e25960a79dbc69b674cd4ec67a72c62'", "assert candidate('') == None"], "entry_point": "string_to_md5"} +{"problem_id": "HumanEval/163", "test": ["assert candidate(2, 8) == [2, 4, 6, 8]", "assert candidate(8, 2) == [2, 4, 6, 8]", "assert candidate(10, 14) == []"], "entry_point": "generate_integers"} diff --git a/examples/ags/data/mbpp_public_test.jsonl b/examples/ags/data/mbpp_public_test.jsonl new file mode 100644 index 000000000..3ba5feee2 --- /dev/null +++ b/examples/ags/data/mbpp_public_test.jsonl @@ -0,0 +1,427 @@ +{"entry_point": "tuple_to_int", "test": ["assert candidate((1,2,3))==123"]} +{"entry_point": "swap_numbers", "test": ["assert candidate(10,20)==(20,10)"]} +{"entry_point": "last_Digit", "test": ["assert candidate(123) == 3"]} +{"entry_point": "is_samepatterns", "test": ["assert candidate([\"red\",\"green\",\"green\"], [\"a\", \"b\", \"b\"])==True"]} +{"entry_point": "is_Sum_Of_Powers_Of_Two", "test": ["assert candidate(10) == True"]} +{"entry_point": "sum_Of_Subarray_Prod", "test": ["assert candidate([1,2,3]) == 20"]} +{"entry_point": "max_aggregate", "test": ["assert candidate([('Juan Whelan',90),('Sabah Colley',88),('Peter Nichols',7),('Juan Whelan',122),('Sabah Colley',84)])==('Juan Whelan', 212)"]} +{"entry_point": "parabola_directrix", "test": ["assert candidate(5,3,2)==-198"]} +{"entry_point": "return_sum", "test": ["assert candidate({'a': 100, 'b':200, 'c':300}) == 600"]} +{"entry_point": "sum_Of_product", "test": ["assert candidate(3) == 15"]} +{"entry_point": "heap_sort", "test": ["assert candidate([1, 3, 5, 7, 9, 2, 4, 6, 8, 0])==[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"]} +{"entry_point": "move_num", "test": ["assert candidate('I1love143you55three3000thousand') == 'Iloveyouthreethousand1143553000'"]} +{"entry_point": "square_nums", "test": ["assert candidate([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])==[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]"]} +{"entry_point": "find_substring", "test": ["assert candidate([\"red\", \"black\", \"white\", \"green\", \"orange\"],\"ack\")==True"]} +{"entry_point": "removezero_ip", "test": ["assert candidate(\"216.08.094.196\")==('216.8.94.196')"]} +{"entry_point": "replace_spaces", "test": ["assert candidate('Jumanji The Jungle') == 'Jumanji_The_Jungle'"]} +{"entry_point": "sector_area", "test": ["assert candidate(4,45)==6.283185307179586"]} +{"entry_point": "digit_distance_nums", "test": ["assert candidate(1,2) == 1"]} +{"entry_point": "find_Max_Num", "test": ["assert candidate([1,2,3]) == 321"]} +{"entry_point": "count_vowels", "test": ["assert candidate('bestinstareels') == 7"]} +{"entry_point": "count_Primes_nums", "test": ["assert candidate(5) == 2"]} +{"entry_point": "sum_average", "test": ["assert candidate(10)==(55, 5.5)"]} +{"entry_point": "differ_At_One_Bit_Pos", "test": ["assert candidate(13,9) == True"]} +{"entry_point": "is_octagonal", "test": ["assert candidate(5) == 65"]} +{"entry_point": "find_First_Missing", "test": ["assert candidate([0,1,2,3]) == 4"]} +{"entry_point": "add_lists", "test": ["assert candidate([5, 6, 7], (9, 10)) == (9, 10, 5, 6, 7)"]} +{"entry_point": "area_tetrahedron", "test": ["assert candidate(3)==15.588457268119894"]} +{"entry_point": "combinations_list", "test": ["assert candidate(['orange', 'red', 'green', 'blue'])==[[], ['orange'], ['red'], ['red', 'orange'], ['green'], ['green', 'orange'], ['green', 'red'], ['green', 'red', 'orange'], ['blue'], ['blue', 'orange'], ['blue', 'red'], ['blue', 'red', 'orange'], ['blue', 'green'], ['blue', 'green', 'orange'], ['blue', 'green', 'red'], ['blue', 'green', 'red', 'orange']]"]} +{"entry_point": "common_in_nested_lists", "test": ["assert candidate(common_in_nested_lists([[12, 18, 23, 25, 45], [7, 12, 18, 24, 28], [1, 5, 8, 12, 15, 16, 18]]))==set([18, 12])"]} +{"entry_point": "replace_blank", "test": ["assert candidate(\"hello people\",'@')==(\"hello@people\")"]} +{"entry_point": "check_K", "test": ["assert candidate((10, 4, 5, 6, 8), 6) == True"]} +{"entry_point": "min_product_tuple", "test": ["assert candidate([(2, 7), (2, 6), (1, 8), (4, 9)] )==8"]} +{"entry_point": "comb_sort", "test": ["assert candidate([5, 15, 37, 25, 79]) == [5, 15, 25, 37, 79]"]} +{"entry_point": "min_Jumps", "test": ["assert candidate((3,4),11)==3.5"]} +{"entry_point": "extract_nth_element", "test": ["assert candidate([('Greyson Fulton', 98, 99), ('Brady Kent', 97, 96), ('Wyatt Knott', 91, 94), ('Beau Turnbull', 94, 98)] ,0)==['Greyson Fulton', 'Brady Kent', 'Wyatt Knott', 'Beau Turnbull']"]} +{"entry_point": "cube_Sum", "test": ["assert candidate(2) == 72"]} +{"entry_point": "is_Monotonic", "test": ["assert candidate([6, 5, 4, 4]) == True"]} +{"entry_point": "remove_Occ", "test": ["assert candidate(\"hello\",\"l\") == \"heo\""]} +{"entry_point": "mul_even_odd", "test": ["assert candidate([1,3,5,7,4,1,6,8])==4"]} +{"entry_point": "find_remainder", "test": ["assert candidate([ 100, 10, 5, 25, 35, 14 ],11) ==9"]} +{"entry_point": "all_Bits_Set_In_The_Given_Range", "test": ["assert candidate(4,1,2) == True"]} +{"entry_point": "perimeter_pentagon", "test": ["assert candidate(5) == 25"]} +{"entry_point": "text_match_wordz_middle", "test": ["assert candidate(\"pythonzabc.\")==True"]} +{"entry_point": "find_combinations", "test": ["assert candidate([(2, 4), (6, 7), (5, 1), (6, 10)]) == [(8, 11), (7, 5), (8, 14), (11, 8), (12, 17), (11, 11)]"]} +{"entry_point": "replace_list", "test": ["assert candidate([1, 3, 5, 7, 9, 10],[2, 4, 6, 8])==[1, 3, 5, 7, 9, 2, 4, 6, 8]"]} +{"entry_point": "even_bit_set_number", "test": ["assert candidate(10) == 10"]} +{"entry_point": "dog_age", "test": ["assert candidate(12)==61"]} +{"entry_point": "merge", "test": ["assert candidate([['x', 'y'], ['a', 'b'], ['m', 'n']]) == [['x', 'a', 'm'], ['y', 'b', 'n']]"]} +{"entry_point": "interleave_lists", "test": ["assert candidate([1,2,3,4,5,6,7],[10,20,30,40,50,60,70],[100,200,300,400,500,600,700])==[1, 10, 100, 2, 20, 200, 3, 30, 300, 4, 40, 400, 5, 50, 500, 6, 60, 600, 7, 70, 700]"]} +{"entry_point": "text_match_two_three", "test": ["assert candidate(\"ac\")==(False)"]} +{"entry_point": "unique_sublists", "test": ["assert candidate([[1, 3], [5, 7], [1, 3], [13, 15, 17], [5, 7], [9, 11]] )=={(1, 3): 2, (5, 7): 2, (13, 15, 17): 1, (9, 11): 1}"]} +{"entry_point": "count_divisors", "test": ["assert candidate(10)"]} +{"entry_point": "unique_Element", "test": ["assert candidate([1,1,1]) == True"]} +{"entry_point": "all_unique", "test": ["assert candidate([1,2,3]) == True"]} +{"entry_point": "highest_Power_of_2", "test": ["assert candidate(10) == 8"]} +{"entry_point": "re_arrange_array", "test": ["assert candidate([-1, 2, -3, 4, 5, 6, -7, 8, 9], 9) == [-1, -3, -7, 4, 5, 6, 2, 8, 9]"]} +{"entry_point": "text_match_three", "test": ["assert not text_match_three(\"ac\")"]} +{"entry_point": "check_tuplex", "test": ["assert candidate((\"w\", 3, \"r\", \"e\", \"s\", \"o\", \"u\", \"r\", \"c\", \"e\"),'r')==True"]} +{"entry_point": "consecutive_duplicates", "test": ["assert candidate([0, 0, 1, 2, 3, 4, 4, 5, 6, 6, 6, 7, 8, 9, 4, 4 ])==[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4]"]} +{"entry_point": "split", "test": ["assert candidate('python') == ['p','y','t','h','o','n']"]} +{"entry_point": "zero_count", "test": ["assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)"]} +{"entry_point": "Find_Min", "test": ["assert candidate([[1],[1,2],[1,2,3]]) == [1]"]} +{"entry_point": "concatenate_tuple", "test": ["assert candidate((\"ID\", \"is\", 4, \"UTS\") ) == 'ID-is-4-UTS'"]} +{"entry_point": "sum", "test": ["assert candidate(10,15) == 6"]} +{"entry_point": "overlapping", "test": ["assert candidate([1,2,3,4,5],[6,7,8,9]) == False"]} +{"entry_point": "sub_list", "test": ["assert candidate([1, 2, 3],[4,5,6])==[-3,-3,-3]"]} +{"entry_point": "count_charac", "test": ["assert candidate(\"python programming\")==18"]} +{"entry_point": "min_Swaps", "test": ["assert candidate(\"1101\",\"1110\") == 1"]} +{"entry_point": "unique_sublists", "test": ["assert candidate([[1, 3], [5, 7], [1, 3], [13, 15, 17], [5, 7], [9, 11]])=={(1, 3): 2, (5, 7): 2, (13, 15, 17): 1, (9, 11): 1}"]} +{"entry_point": "pair_xor_Sum", "test": ["assert candidate([5,9,7,6],4) == 47"]} +{"entry_point": "pack_consecutive_duplicates", "test": ["assert candidate([0, 0, 1, 2, 3, 4, 4, 5, 6, 6, 6, 7, 8, 9, 4, 4])==[[0, 0], [1], [2], [3], [4, 4], [5], [6, 6, 6], [7], [8], [9], [4, 4]]"]} +{"entry_point": "new_tuple", "test": ["assert candidate([\"WEB\", \"is\"], \"best\") == ('WEB', 'is', 'best')"]} +{"entry_point": "is_product_even", "test": ["assert candidate([1,2,3])"]} +{"entry_point": "max_sum_increasing_subseq", "test": ["assert candidate([1, 101, 2, 3, 100, 4, 5 ], 7, 4, 6) == 11"]} +{"entry_point": "count_reverse_pairs", "test": ["assert candidate([\"julia\", \"best\", \"tseb\", \"for\", \"ailuj\"])== 2"]} +{"entry_point": "count_same_pair", "test": ["assert candidate([1, 2, 3, 4, 5, 6, 7, 8],[2, 2, 3, 1, 2, 6, 7, 9])==4"]} +{"entry_point": "volume_sphere", "test": ["assert math.isclose(volume_sphere(10), 4188.790204786391, rel_tol=0.001)"]} +{"entry_point": "lateralsurface_cube", "test": ["assert candidate(5)==100"]} +{"entry_point": "find_literals", "test": ["assert candidate('The quick brown fox jumps over the lazy dog.', 'fox') == ('fox', 16, 19)"]} +{"entry_point": "right_insertion", "test": ["assert candidate([1,2,4,5],6)==4"]} +{"entry_point": "multiple_to_single", "test": ["assert candidate([11, 33, 50])==113350"]} +{"entry_point": "find_adverb_position", "test": ["assert candidate(\"clearly!! we can see the sky\")==(0, 7, 'clearly')"]} +{"entry_point": "count", "test": ["assert candidate([True,False,True]) == 2"]} +{"entry_point": "larg_nnum", "test": ["assert candidate(larg_nnum([10, 20, 50, 70, 90, 20, 50, 40, 60, 80, 100],2))==set([100,90])"]} +{"entry_point": "difference", "test": ["assert candidate(3) == 30"]} +{"entry_point": "min_k", "test": ["assert candidate([('Manjeet', 10), ('Akshat', 4), ('Akash', 2), ('Nikhil', 8)], 2) == [('Akash', 2), ('Akshat', 4)]"]} +{"entry_point": "find_Rotations", "test": ["assert candidate(\"aaaa\") == 1"]} +{"entry_point": "even_position", "test": ["assert candidate([3,2,1]) == False"]} +{"entry_point": "big_diff", "test": ["assert candidate([1,2,3,4]) == 3"]} +{"entry_point": "max_sub_array_sum_repeated", "test": ["assert candidate([10, 20, -30, -1], 4, 3) == 30"]} +{"entry_point": "count_first_elements", "test": ["assert candidate((1, 5, 7, (4, 6), 10) ) == 3"]} +{"entry_point": "text_lowercase_underscore", "test": ["assert candidate(\"aab_cbbbc\")==(True)"]} +{"entry_point": "text_match_one", "test": ["assert candidate(\"ac\")==False"]} +{"entry_point": "check_type", "test": ["assert candidate((5, 6, 7, 3, 5, 6) ) == True"]} +{"entry_point": "sum_negativenum", "test": ["assert candidate([2, 4, -6, -9, 11, -12, 14, -5, 17])==-32"]} +{"entry_point": "extract_string", "test": ["assert candidate(['Python', 'list', 'exercises', 'practice', 'solution'] ,8)==['practice', 'solution']"]} +{"entry_point": "remove_kth_element", "test": ["assert candidate([1,1,2,3,4,4,5,1],3)==[1, 1, 3, 4, 4, 5, 1]"]} +{"entry_point": "flatten_list", "test": ["assert candidate([0, 10, [20, 30], 40, 50, [60, 70, 80], [90, 100, 110, 120]])==[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]"]} +{"entry_point": "odd_length_sum", "test": ["assert candidate([1,2,4]) == 14"]} +{"entry_point": "loss_amount", "test": ["assert candidate(1500,1200)==0"]} +{"entry_point": "Extract", "test": ["assert candidate([[1, 2], [3, 4, 5], [6, 7, 8, 9]]) == [1, 3, 6]"]} +{"entry_point": "add_nested_tuples", "test": ["assert candidate(((1, 3), (4, 5), (2, 9), (1, 10)), ((6, 7), (3, 9), (1, 1), (7, 3))) == ((7, 10), (7, 14), (3, 10), (8, 13))"]} +{"entry_point": "find_first_occurrence", "test": ["assert candidate([2, 5, 5, 5, 6, 6, 8, 9, 9, 9], 5) == 1"]} +{"entry_point": "find_lists", "test": ["assert candidate(([1, 2, 3, 4], [5, 6, 7, 8])) == 2"]} +{"entry_point": "surface_Area", "test": ["assert candidate(3,4) == 33"]} +{"entry_point": "otherside_rightangle", "test": ["assert candidate(7,8)==10.63014581273465"]} +{"entry_point": "find_Average_Of_Cube", "test": ["assert candidate(2) == 4.5"]} +{"entry_point": "even_binomial_Coeff_Sum", "test": ["assert candidate(4) == 8"]} +{"entry_point": "heap_queue_largest", "test": ["assert candidate( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]"]} +{"entry_point": "remove_elements", "test": ["assert candidate([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [2, 4, 6, 8]) == [1, 3, 5, 7, 9, 10]"]} +{"entry_point": "search", "test": ["assert candidate([1,1,2,2,3]) == 3"]} +{"entry_point": "surfacearea_cube", "test": ["assert candidate(5)==150"]} +{"entry_point": "lps", "test": ["assert candidate(\"TENS FOR TENS\") == 5"]} +{"entry_point": "count_char_position", "test": ["assert candidate(\"xbcefg\") == 2"]} +{"entry_point": "sum_even_and_even_index", "test": ["assert candidate([5, 6, 12, 1, 18, 8]) == 30"]} +{"entry_point": "get_median", "test": ["assert candidate([1, 12, 15, 26, 38], [2, 13, 17, 30, 45], 5) == 16.0"]} +{"entry_point": "remove_uppercase", "test": ["assert candidate('cAstyoUrFavoRitETVshoWs') == 'cstyoravoitshos'"]} +{"entry_point": "find_even_pair", "test": ["assert candidate([5, 4, 7, 2, 1]) == 4"]} +{"entry_point": "remove_lowercase", "test": ["assert candidate(\"PYTHon\")==('PYTH')"]} +{"entry_point": "string_to_list", "test": ["assert candidate(\"python programming\")==['python','programming']"]} +{"entry_point": "recursive_list_sum", "test": ["assert candidate(([1, 2, [3,4],[5,6]]))==21"]} +{"entry_point": "get_Char", "test": ["assert candidate(\"abc\") == \"f\""]} +{"entry_point": "even_Power_Sum", "test": ["assert candidate(2) == 1056"]} +{"entry_point": "babylonian_squareroot", "test": ["assert math.isclose(babylonian_squareroot(10), 3.162277660168379, rel_tol=0.001)"]} +{"entry_point": "largest_neg", "test": ["assert candidate([1,2,3,-4,-6]) == -6"]} +{"entry_point": "remove_odd", "test": ["assert candidate([1,2,3]) == [2]"]} +{"entry_point": "add_string", "test": ["assert candidate([1,2,3,4],'temp{0}')==['temp1', 'temp2', 'temp3', 'temp4']"]} +{"entry_point": "list_to_float", "test": ["assert candidate( [(\"3\", \"4\"), (\"1\", \"26.45\"), (\"7.32\", \"8\"), (\"4\", \"8\")] ) == [(3.0, 4.0), (1.0, 26.45), (7.32, 8.0), (4.0, 8.0)]"]} +{"entry_point": "remove_parenthesis", "test": ["assert candidate([\"python (chrome)\"])==(\"python\")"]} +{"entry_point": "toggle_middle_bits", "test": ["assert candidate(9) == 15"]} +{"entry_point": "upper_ctr", "test": ["assert candidate('PYthon') == 1"]} +{"entry_point": "max_product", "test": ["assert candidate([3, 100, 4, 5, 150, 6]) == 3000"]} +{"entry_point": "lcs_of_three", "test": ["assert candidate('AGGT12', '12TXAYB', '12XBA') == 2"]} +{"entry_point": "armstrong_number", "test": ["assert candidate(153)==True"]} +{"entry_point": "intersection_array", "test": ["assert candidate([1, 2, 3, 5, 7, 8, 9, 10],[1, 2, 4, 8, 9])==[1, 2, 8, 9]"]} +{"entry_point": "substract_elements", "test": ["assert candidate((10, 4, 5), (2, 5, 18)) == (8, -1, -13)"]} +{"entry_point": "swap_List", "test": ["assert candidate([1,2,3]) == [3,2,1]"]} +{"entry_point": "extract_singly", "test": ["assert candidate(extract_singly([(3, 4, 5), (4, 5, 7), (1, 4)])) == set([3, 4, 5, 7, 1])"]} +{"entry_point": "find_Index", "test": ["assert candidate(2) == 4"]} +{"entry_point": "common_element", "test": ["assert candidate([1,2,3,4,5], [5,6,7,8,9])==True"]} +{"entry_point": "find_dissimilar", "test": ["assert candidate((3, 4, 5, 6), (5, 7, 4, 10)) == (3, 6, 7, 10)"]} +{"entry_point": "geometric_sum", "test": ["assert candidate(7) == 1.9921875"]} +{"entry_point": "rectangle_area", "test": ["assert candidate(10,20)==200"]} +{"entry_point": "sort_sublists", "test": ["assert candidate([['green', 'orange'], ['black', 'white'], ['white', 'black', 'orange']])==[['green', 'orange'], ['black', 'white'], ['black', 'orange', 'white']]"]} +{"entry_point": "largest_subset", "test": ["assert candidate([ 1, 3, 6, 13, 17, 18 ]) == 4"]} +{"entry_point": "convert_list_dictionary", "test": ["assert candidate([\"S001\", \"S002\", \"S003\", \"S004\"],[\"Adina Park\", \"Leyton Marsh\", \"Duncan Boyle\", \"Saim Richards\"] ,[85, 98, 89, 92])==[{'S001': {'Adina Park': 85}}, {'S002': {'Leyton Marsh': 98}}, {'S003': {'Duncan Boyle': 89}}, {'S004': {'Saim Richards': 92}}]"]} +{"entry_point": "count_Occurrence", "test": ["assert candidate(('a', 'a', 'c', 'b', 'd'),['a', 'b'] ) == 3"]} +{"entry_point": "extract_quotation", "test": ["assert candidate('Cortex \"A53\" Based \"multi\" tasking \"Processor\"') == ['A53', 'multi', 'Processor']"]} +{"entry_point": "tuple_str_int", "test": ["assert candidate(\"(7, 8, 9)\") == (7, 8, 9)"]} +{"entry_point": "tuple_size", "test": ["assert candidate((\"A\", 1, \"B\", 2, \"C\", 3) ) == sys.getsizeof((\"A\", 1, \"B\", 2, \"C\", 3))"]} +{"entry_point": "count_integer", "test": ["assert candidate([1,2,'abc',1.2]) == 2"]} +{"entry_point": "sequence", "test": ["assert candidate(10) == 6"]} +{"entry_point": "square_perimeter", "test": ["assert candidate(10)==40"]} +{"entry_point": "catalan_number", "test": ["assert candidate(10)==16796"]} +{"entry_point": "max_length_list", "test": ["assert candidate([[0], [1, 3], [5, 7], [9, 11], [13, 15, 17]])==(3, [13, 15, 17])"]} +{"entry_point": "find_adverbs", "test": ["assert candidate(\"Clearly, he has no excuse for such behavior.\") == '0-7: Clearly'"]} +{"entry_point": "extract_values", "test": ["assert candidate('\"Python\", \"PHP\", \"Java\"')==['Python', 'PHP', 'Java']"]} +{"entry_point": "lateralsuface_cylinder", "test": ["assert math.isclose(lateralsuface_cylinder(10,5), 314.15000000000003, rel_tol=0.001)"]} +{"entry_point": "volume_cone", "test": ["assert math.isclose(volume_cone(5,12), 314.15926535897927, rel_tol=0.001)"]} +{"entry_point": "is_lower", "test": ["assert candidate(\"InValid\") == \"invalid\""]} +{"entry_point": "prime_num", "test": ["assert candidate(13)==True"]} +{"entry_point": "div_list", "test": ["assert candidate([4,5,6],[1, 2, 3])==[4.0,2.5,2.0]"]} +{"entry_point": "wind_chill", "test": ["assert candidate(120,35)==40"]} +{"entry_point": "get_total_number_of_sequences", "test": ["assert candidate(10, 4) == 4"]} +{"entry_point": "odd_position", "test": ["assert candidate([2,1,4,3,6,7,6,3]) == True"]} +{"entry_point": "polar_rect", "test": ["assert candidate(3,4)==((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))"]} +{"entry_point": "last", "test": ["assert candidate([1,2,3],1) == 0"]} +{"entry_point": "first_repeated_char", "test": ["assert candidate(\"abcabc\") == \"a\""]} +{"entry_point": "index_multiplication", "test": ["assert candidate(((1, 3), (4, 5), (2, 9), (1, 10)),((6, 7), (3, 9), (1, 1), (7, 3)) ) == ((6, 21), (12, 45), (2, 9), (7, 30))"]} +{"entry_point": "get_pairs_count", "test": ["assert candidate([1,1,1,1],2) == 6"]} +{"entry_point": "sum_in_range", "test": ["assert candidate(2,5) == 8"]} +{"entry_point": "max_val", "test": ["assert candidate(['Python', 3, 2, 4, 5, 'version'])==5"]} +{"entry_point": "split_two_parts", "test": ["assert candidate([1,1,2,3,4,4,5,1],3)==([1, 1, 2], [3, 4, 4, 5, 1])"]} +{"entry_point": "rearrange_bigger", "test": ["assert candidate(12)==21"]} +{"entry_point": "replace_char", "test": ["assert candidate(\"polygon\",'y','l')==(\"pollgon\")"]} +{"entry_point": "trim_tuple", "test": ["assert candidate([(5, 3, 2, 1, 4), (3, 4, 9, 2, 1),(9, 1, 2, 3, 5), (4, 8, 2, 1, 7)], 2) == '[(2,), (9,), (2,), (2,)]'"]} +{"entry_point": "check_occurences", "test": ["assert candidate([(3, 1), (1, 3), (2, 5), (5, 2), (6, 3)] ) == {(1, 3): 2, (2, 5): 2, (3, 6): 1}"]} +{"entry_point": "next_Perfect_Square", "test": ["assert candidate(35) == 36"]} +{"entry_point": "neg_nos", "test": ["assert candidate([-1,4,5,-6]) == [-1,-6]"]} +{"entry_point": "sort_numeric_strings", "test": ["assert candidate( ['4','12','45','7','0','100','200','-12','-500'])==[-500, -12, 0, 4, 7, 12, 45, 100, 200]"]} +{"entry_point": "extract_even", "test": ["assert candidate((4, 5, (7, 6, (2, 4)), 6, 8)) == (4, (6, (2, 4)), 6, 8)"]} +{"entry_point": "insert_element", "test": ["assert candidate(['Red', 'Green', 'Black'] ,'c')==['c', 'Red', 'c', 'Green', 'c', 'Black']"]} +{"entry_point": "lateralsurface_cone", "test": ["assert candidate(5,12)==204.20352248333654"]} +{"entry_point": "min_of_three", "test": ["assert candidate(10,20,0)==0"]} +{"entry_point": "cummulative_sum", "test": ["assert candidate([(1, 3), (5, 6, 7), (2, 6)]) == 30"]} +{"entry_point": "maximize_elements", "test": ["assert candidate(((1, 3), (4, 5), (2, 9), (1, 10)), ((6, 7), (3, 9), (1, 1), (7, 3))) == ((6, 7), (4, 9), (2, 9), (7, 10))"]} +{"entry_point": "radian_degree", "test": ["assert candidate(90)==1.5707963267948966"]} +{"entry_point": "len_log", "test": ["assert candidate([\"python\",\"PHP\",\"bigdata\"]) == 7"]} +{"entry_point": "occurance_substring", "test": ["assert candidate('python programming, python language','python')==('python', 0, 6)"]} +{"entry_point": "empty_dit", "test": ["assert candidate([{},{},{}])==True"]} +{"entry_point": "circle_circumference", "test": ["assert math.isclose(circle_circumference(10), 62.830000000000005, rel_tol=0.001)"]} +{"entry_point": "check_monthnumb_number", "test": ["assert candidate(5)==True"]} +{"entry_point": "check_distinct", "test": ["assert candidate((1, 4, 5, 6, 1, 4)) == False"]} +{"entry_point": "square_Sum", "test": ["assert candidate(2) == 20"]} +{"entry_point": "remove_dirty_chars", "test": ["assert candidate(\"probasscurve\", \"pros\") == 'bacuve'"]} +{"entry_point": "tetrahedral_number", "test": ["assert candidate(5) == 35"]} +{"entry_point": "get_equal", "test": ["assert candidate([(11, 22, 33), (44, 55, 66)]) == True"]} +{"entry_point": "dict_depth", "test": ["assert candidate({'a':1, 'b': {'c': {'d': {}}}})==4"]} +{"entry_point": "ascii_value", "test": ["assert candidate('A')==65"]} +{"entry_point": "is_nonagonal", "test": ["assert candidate(10) == 325"]} +{"entry_point": "expensive_items", "test": ["assert candidate([{'name': 'Item-1', 'price': 101.1},{'name': 'Item-2', 'price': 555.22}],1)==[{'name': 'Item-2', 'price': 555.22}]"]} +{"entry_point": "check_greater", "test": ["assert candidate([1, 2, 3, 4, 5], 4) == False"]} +{"entry_point": "are_equivalent", "test": ["assert candidate(36, 57) == False"]} +{"entry_point": "frequency_lists", "test": ["assert candidate([[1, 2, 3, 2], [4, 5, 6, 2], [7, 8, 9, 5]])=={1: 1, 2: 3, 3: 1, 4: 1, 5: 2, 6: 1, 7: 1, 8: 1, 9: 1}"]} +{"entry_point": "sumofFactors", "test": ["assert candidate(18) == 26"]} +{"entry_point": "find_min_diff", "test": ["assert candidate((1,5,3,19,18,25),6) == 1"]} +{"entry_point": "nth_nums", "test": ["assert candidate([1, 2, 3, 4, 5, 6, 7, 8, 9, 10],2)==[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]"]} +{"entry_point": "set_left_most_unset_bit", "test": ["assert candidate(10) == 14"]} +{"entry_point": "merge_dictionaries_three", "test": ["assert candidate({ \"R\": \"Red\", \"B\": \"Black\", \"P\": \"Pink\" }, { \"G\": \"Green\", \"W\": \"White\" },{ \"O\": \"Orange\", \"W\": \"White\", \"B\": \"Black\" })=={'B': 'Black', 'R': 'Red', 'P': 'Pink', 'G': 'Green', 'W': 'White', 'O': 'Orange'}"]} +{"entry_point": "get_coordinates", "test": ["assert candidate((3, 4)) == [[2, 3], [2, 4], [2, 5], [3, 3], [3, 4], [3, 5], [4, 3], [4, 4], [4, 5]]"]} +{"entry_point": "group_tuples", "test": ["assert candidate([('x', 'y'), ('x', 'z'), ('w', 't')]) == [('x', 'y', 'z'), ('w', 't')]"]} +{"entry_point": "is_Sub_Array", "test": ["assert candidate([1,4,3,5],[1,2]) == False"]} +{"entry_point": "add_pairwise", "test": ["assert candidate((1, 5, 7, 8, 10)) == (6, 12, 15, 18)"]} +{"entry_point": "replace_specialchar", "test": ["assert candidate('Python language, Programming language.')==('Python:language::Programming:language:')"]} +{"entry_point": "find_char_long", "test": ["assert candidate(find_char_long('Please move back to stream')) == set(['Please', 'move', 'back', 'stream'])"]} +{"entry_point": "check_Consecutive", "test": ["assert candidate([1,2,3,4,5]) == True"]} +{"entry_point": "check_monthnumber_number", "test": ["assert candidate(6)==True"]} +{"entry_point": "sum_range_list", "test": ["assert candidate([2,1,5,6,8,3,4,9,10,11,8,12], 8, 10) == 29"]} +{"entry_point": "count_bidirectional", "test": ["assert candidate([(5, 6), (1, 2), (6, 5), (9, 1), (6, 5), (2, 1)] ) == 3"]} +{"entry_point": "rotate_right", "test": ["assert candidate([1, 2, 3, 4, 5, 6, 7, 8, 9, 10],3)==[8, 9, 10, 1, 2, 3, 4, 5, 6, 7]"]} +{"entry_point": "count_element_in_list", "test": ["assert candidate([[1, 3], [5, 7], [1, 11], [1, 15, 7]],1)==3"]} +{"entry_point": "find_Element", "test": ["assert candidate([1,2,3,4,5],[[0,2],[0,3]],2,1) == 3"]} +{"entry_point": "next_power_of_2", "test": ["assert candidate(0) == 1"]} +{"entry_point": "split_Arr", "test": ["assert candidate([12,10,5,6,52,36],2) == [5,6,52,36,12,10]"]} +{"entry_point": "change_date_format", "test": ["assert candidate(\"2026-01-02\") == '02-01-2026'"]} +{"entry_point": "merge_sorted_list", "test": ["assert candidate([25, 24, 15, 4, 5, 29, 110],[19, 20, 11, 56, 25, 233, 154],[24, 26, 54, 48])==[4, 5, 11, 15, 19, 20, 24, 24, 25, 25, 26, 29, 48, 54, 56, 110, 154, 233]"]} +{"entry_point": "power", "test": ["assert candidate(3,4) == 81"]} +{"entry_point": "find_tuples", "test": ["assert candidate([(6, 24, 12), (7, 9, 6), (12, 18, 21)], 6) == [(6, 24, 12)]"]} +{"entry_point": "toggle_string", "test": ["assert candidate(\"Python\")==(\"pYTHON\")"]} +{"entry_point": "min_val", "test": ["assert candidate(['Python', 3, 2, 4, 5, 'version'])==2"]} +{"entry_point": "is_decimal", "test": ["assert candidate('123.11')==True"]} +{"entry_point": "is_sublist", "test": ["assert candidate([2,4,3,5,7],[3,7])==False"]} +{"entry_point": "move_zero", "test": ["assert candidate([1,0,2,0,3,4]) == [1,2,3,4,0,0]"]} +{"entry_point": "positive_count", "test": ["assert candidate([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8])==0.54"]} +{"entry_point": "big_sum", "test": ["assert candidate([1,2,3]) == 4"]} +{"entry_point": "my_dict", "test": ["assert candidate({10})==False"]} +{"entry_point": "find", "test": ["assert candidate(10,3) == 3"]} +{"entry_point": "pair_wise", "test": ["assert candidate([1,1,2,3,3,4,4,5])==[(1, 1), (1, 2), (2, 3), (3, 3), (3, 4), (4, 4), (4, 5)]"]} +{"entry_point": "list_split", "test": ["assert candidate(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n'],3)==[['a', 'd', 'g', 'j', 'm'], ['b', 'e', 'h', 'k', 'n'], ['c', 'f', 'i', 'l']]"]} +{"entry_point": "odd_Equivalent", "test": ["assert candidate(\"011001\",6) == 3"]} +{"entry_point": "division_elements", "test": ["assert candidate((10, 4, 6, 9),(5, 2, 3, 3)) == (2, 2, 2, 3)"]} +{"entry_point": "convert", "test": ["assert candidate(1) == (1.0, 0.0)"]} +{"entry_point": "sort_matrix", "test": ["assert candidate([[1, 2, 3], [2, 4, 5], [1, 1, 1]])==[[1, 1, 1], [1, 2, 3], [2, 4, 5]]"]} +{"entry_point": "is_upper", "test": ["assert candidate(\"person\") ==\"PERSON\""]} +{"entry_point": "check_integer", "test": ["assert candidate(\"python\")==False"]} +{"entry_point": "diff_even_odd", "test": ["assert candidate([1,3,5,7,4,1,6,8])==3"]} +{"entry_point": "is_woodall", "test": ["assert candidate(383) == True"]} +{"entry_point": "check_element", "test": ["assert candidate([\"green\", \"orange\", \"black\", \"white\"],'blue')==False"]} +{"entry_point": "centered_hexagonal_number", "test": ["assert candidate(10) == 271"]} +{"entry_point": "remove_length", "test": ["assert candidate('The person is most value tet', 3) == 'person is most value'"]} +{"entry_point": "filter_data", "test": ["assert candidate({'Cierra Vega': (6.2, 70), 'Alden Cantrell': (5.9, 65), 'Kierra Gentry': (6.0, 68), 'Pierre Cox': (5.8, 66)},6.0,70)=={'Cierra Vega': (6.2, 70)}"]} +{"entry_point": "shell_sort", "test": ["assert candidate([12, 23, 4, 5, 3, 2, 12, 81, 56, 95]) == [2, 3, 4, 5, 12, 12, 23, 56, 81, 95]"]} +{"entry_point": "two_unique_nums", "test": ["assert candidate([1,2,3,2,3,4,5]) == [1, 4, 5]"]} +{"entry_point": "capital_words_spaces", "test": ["assert candidate(\"Python\") == 'Python'"]} +{"entry_point": "noprofit_noloss", "test": ["assert candidate(1500,1200)==False"]} +{"entry_point": "cal_sum", "test": ["assert candidate(9) == 49"]} +{"entry_point": "Find_Max", "test": ["assert candidate([['A'],['A','B'],['A','B','C']]) == ['A','B','C']"]} +{"entry_point": "extract_rear", "test": ["assert candidate(('Mers', 'for', 'Vers') ) == ['s', 'r', 's']"]} +{"entry_point": "perfect_squares", "test": ["assert candidate(1,30)==[1, 4, 9, 16, 25]"]} +{"entry_point": "odd_num_sum", "test": ["assert candidate(2) == 82"]} +{"entry_point": "check_value", "test": ["assert candidate({'Cierra Vega': 12, 'Alden Cantrell': 12, 'Kierra Gentry': 12, 'Pierre Cox': 12},10)==False"]} +{"entry_point": "remove_all_spaces", "test": ["assert candidate('python program')==('pythonprogram')"]} +{"entry_point": "check_expression", "test": ["assert candidate(\"{()}[{}]\") == True"]} +{"entry_point": "find_kth", "test": ["assert candidate([2, 3, 6, 7, 9], [1, 4, 8, 10], 5) == 6"]} +{"entry_point": "max_sub_array_sum", "test": ["assert candidate([-2, -3, 4, -1, -2, 1, 5, -3], 8) == 7"]} +{"entry_point": "snake_to_camel", "test": ["assert candidate('android_tv') == 'AndroidTv'"]} +{"entry_point": "count_no_of_ways", "test": ["assert candidate(2, 4) == 16"]} +{"entry_point": "closest_num", "test": ["assert candidate(11) == 10"]} +{"entry_point": "and_tuples", "test": ["assert candidate((10, 4, 6, 9), (5, 2, 3, 3)) == (0, 0, 2, 1)"]} +{"entry_point": "sum_of_digits", "test": ["assert candidate([10,2,56])==14"]} +{"entry_point": "reverse_words", "test": ["assert candidate(\"python program\")==(\"program python\")"]} +{"entry_point": "sort_counter", "test": ["assert candidate({'Math':81, 'Physics':83, 'Chemistry':87})==[('Chemistry', 87), ('Physics', 83), ('Math', 81)]"]} +{"entry_point": "count_Set_Bits", "test": ["assert candidate(2) == 1"]} +{"entry_point": "decimal_to_binary", "test": ["assert candidate(8) == '1000'"]} +{"entry_point": "is_not_prime", "test": ["assert candidate(2) == False"]} +{"entry_point": "text_starta_endb", "test": ["assert candidate(\"aabbbb\")"]} +{"entry_point": "Find_Min_Length", "test": ["assert candidate([[1],[1,2]]) == 1"]} +{"entry_point": "tup_string", "test": ["assert candidate(('e', 'x', 'e', 'r', 'c', 'i', 's', 'e', 's'))==(\"exercises\")"]} +{"entry_point": "max_Product", "test": ["assert candidate([1,2,3,4,7,0,8,4]) == (7,8)"]} +{"entry_point": "word_len", "test": ["assert candidate(\"Hadoop\") == False"]} +{"entry_point": "first_Digit", "test": ["assert candidate(123) == 1"]} +{"entry_point": "check_none", "test": ["assert candidate((10, 4, 5, 6, None)) == True"]} +{"entry_point": "find_solution", "test": ["assert candidate(2, 3, 7) == (2, 1)"]} +{"entry_point": "sample_nam", "test": ["assert candidate(['sally', 'Dylan', 'rebecca', 'Diana', 'Joanne', 'keith'])==16"]} +{"entry_point": "multiply_num", "test": ["assert math.isclose(multiply_num((8, 2, 3, -1, 7)), -67.2, rel_tol=0.001)"]} +{"entry_point": "maxAverageOfPath", "test": ["assert candidate([[1, 2, 3], [6, 5, 4], [7, 3, 9]]) == 5.2"]} +{"entry_point": "Split", "test": ["assert candidate([1,2,3,4,5,6]) == [1,3,5]"]} +{"entry_point": "text_match_wordz", "test": ["assert candidate(\"pythonz.\")==True"]} +{"entry_point": "test_duplicate", "test": ["assert candidate(([1,2,3,4,5]))==False"]} +{"entry_point": "max_of_nth", "test": ["assert candidate([[5, 6, 7], [1, 3, 5], [8, 9, 19]], 2) == 19"]} +{"entry_point": "count_samepair", "test": ["assert candidate([1,2,3,4,5,6,7,8],[2,2,3,1,2,6,7,9],[2,1,3,1,2,6,7,9])==3"]} +{"entry_point": "count_Pairs", "test": ["assert candidate([1,2,1],3) == 2"]} +{"entry_point": "get_Inv_Count", "test": ["assert candidate([1,20,6,4,5]) == 5"]} +{"entry_point": "triangle_area", "test": ["assert candidate(-1) == None"]} +{"entry_point": "is_perfect_square", "test": ["assert not is_perfect_square(10)"]} +{"entry_point": "newman_prime", "test": ["assert candidate(3) == 7"]} +{"entry_point": "is_Even", "test": ["assert candidate(1) == False"]} +{"entry_point": "count_list", "test": ["assert candidate([[1, 3], [5, 7], [9, 11], [13, 15, 17]]) == 4"]} +{"entry_point": "tuple_modulo", "test": ["assert candidate((10, 4, 5, 6), (5, 6, 7, 5)) == (0, 4, 5, 1)"]} +{"entry_point": "reverse_string_list", "test": ["assert candidate(['Red', 'Green', 'Blue', 'White', 'Black'])==['deR', 'neerG', 'eulB', 'etihW', 'kcalB']"]} +{"entry_point": "next_smallest_palindrome", "test": ["assert candidate(99)==101"]} +{"entry_point": "all_Characters_Same", "test": ["assert candidate(\"python\") == False"]} +{"entry_point": "divisor", "test": ["assert candidate(15) == 4"]} +{"entry_point": "colon_tuplex", "test": ["assert candidate((\"HELLO\", 5, [], True) ,2,50)==(\"HELLO\", 5, [50], True)"]} +{"entry_point": "median_numbers", "test": ["assert candidate(25,55,65)==55.0"]} +{"entry_point": "first_odd", "test": ["assert candidate([1,3,5]) == 1"]} +{"entry_point": "first_non_repeating_character", "test": ["assert candidate(\"abcabc\") == None"]} +{"entry_point": "count_binary_seq", "test": ["assert math.isclose(count_binary_seq(1), 2.0, rel_tol=0.001)"]} +{"entry_point": "sort_sublists", "test": ["assert candidate(([\"green\", \"orange\"], [\"black\", \"white\"], [\"white\", \"black\", \"orange\"]))==[['green', 'orange'], ['black', 'white'], ['black', 'orange', 'white']]"]} +{"entry_point": "index_minimum", "test": ["assert candidate([('Rash', 143), ('Manjeet', 200), ('Varsha', 100)]) == 'Varsha'"]} +{"entry_point": "add_dict_to_tuple", "test": ["assert candidate((4, 5, 6), {\"MSAM\" : 1, \"is\" : 2, \"best\" : 3} ) == (4, 5, 6, {'MSAM': 1, 'is': 2, 'best': 3})"]} +{"entry_point": "cube_nums", "test": ["assert candidate([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])==[1, 8, 27, 64, 125, 216, 343, 512, 729, 1000]"]} +{"entry_point": "drop_empty", "test": ["assert candidate({'c1': 'Red', 'c2': 'Green', 'c3':None})=={'c1': 'Red', 'c2': 'Green'}"]} +{"entry_point": "max_difference", "test": ["assert candidate([(3, 5), (1, 7), (10, 3), (1, 2)]) == 7"]} +{"entry_point": "bell_Number", "test": ["assert candidate(2) == 2"]} +{"entry_point": "max_length", "test": ["assert candidate([[0], [1, 3], [5, 7], [9, 11], [13, 15, 17]])==(3, [13, 15, 17])"]} +{"entry_point": "issort_list", "test": ["assert candidate([1,2,4,6,8,10,12,14,16,17])==True"]} +{"entry_point": "bitwise_xor", "test": ["assert candidate((10, 4, 6, 9), (5, 2, 3, 3)) == (15, 6, 5, 10)"]} +{"entry_point": "number_ctr", "test": ["assert candidate('program2bedone') == 1"]} +{"entry_point": "list_tuple", "test": ["assert candidate([5, 10, 7, 4, 15, 3])==(5, 10, 7, 4, 15, 3)"]} +{"entry_point": "reverse_Array_Upto_K", "test": ["assert candidate([1, 2, 3, 4, 5, 6],4) == [4, 3, 2, 1, 5, 6]"]} +{"entry_point": "tuple_intersection", "test": ["assert candidate([(3, 4), (5, 6), (9, 10), (4, 5)] , [(5, 4), (3, 4), (6, 5), (9, 11)]) == {(4, 5), (3, 4), (5, 6)}"]} +{"entry_point": "get_ludic", "test": ["assert candidate(10) == [1, 2, 3, 5, 7]"]} +{"entry_point": "dict_filter", "test": ["assert candidate({'Cierra Vega': 175, 'Alden Cantrell': 180, 'Kierra Gentry': 165, 'Pierre Cox': 190},170)=={'Cierra Vega': 175, 'Alden Cantrell': 180, 'Pierre Cox': 190}"]} +{"entry_point": "combinations_colors", "test": ["assert candidate( [\"Red\",\"Green\",\"Blue\"],1)==[('Red',), ('Green',), ('Blue',)]"]} +{"entry_point": "is_num_decagonal", "test": ["assert candidate(3) == 27"]} +{"entry_point": "check_str", "test": ["assert candidate(\"annie\")"]} +{"entry_point": "frequency", "test": ["assert candidate([1,2,3], 4) == 0"]} +{"entry_point": "add_tuple", "test": ["assert candidate([5, 6, 7], (9, 10)) == [5, 6, 7, 9, 10]"]} +{"entry_point": "pancake_sort", "test": ["assert candidate([15, 79, 25, 38, 69]) == [15, 25, 38, 69, 79]"]} +{"entry_point": "replace_spaces", "test": ["assert candidate(\"My Name is Dawood\") == 'My%20Name%20is%20Dawood'"]} +{"entry_point": "filter_oddnumbers", "test": ["assert candidate([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])==[1,3,5,7,9]"]} +{"entry_point": "max_sum", "test": ["assert candidate([1, 15, 51, 45, 33, 100, 12, 18, 9]) == 194"]} +{"entry_point": "check_smaller", "test": ["assert candidate((1, 2, 3), (2, 3, 4)) == False"]} +{"entry_point": "smallest_num", "test": ["assert candidate([10, 20, 1, 45, 99]) == 1"]} +{"entry_point": "check_char", "test": ["assert candidate(\"abba\") == \"Valid\""]} +{"entry_point": "amicable_numbers_sum", "test": ["assert candidate(999)==504"]} +{"entry_point": "odd_values_string", "test": ["assert candidate('abcdef') == 'ace'"]} +{"entry_point": "union_elements", "test": ["assert candidate((3, 4, 5, 6),(5, 7, 4, 10) ) == (3, 4, 5, 6, 7, 10)"]} +{"entry_point": "power_base_sum", "test": ["assert candidate(2,100)==115"]} +{"entry_point": "count_occurance", "test": ["assert candidate(\"letstdlenstdporstd\") == 3"]} +{"entry_point": "count_rotation", "test": ["assert candidate([3,2,1]) == 1"]} +{"entry_point": "multiply_int", "test": ["assert candidate(10,20)==200"]} +{"entry_point": "string_to_tuple", "test": ["assert candidate(\"python 3.0\")==('p', 'y', 't', 'h', 'o', 'n', '3', '.', '0')"]} +{"entry_point": "dif_Square", "test": ["assert candidate(5) == True"]} +{"entry_point": "left_insertion", "test": ["assert candidate([1,2,4,5],6)==4"]} +{"entry_point": "Find_Max_Length", "test": ["assert candidate([[1],[1,4],[5,6,7,8]]) == 4"]} +{"entry_point": "validate", "test": ["assert candidate(1234) == True"]} +{"entry_point": "sequential_search", "test": ["assert candidate([11,23,58,31,56,77,43,12,65,19],31) == (True, 3)"]} +{"entry_point": "sum_div", "test": ["assert candidate(8)==7"]} +{"entry_point": "large_product", "test": ["assert candidate([1, 2, 3, 4, 5, 6],[3, 6, 8, 9, 10, 6],3)==[60, 54, 50]"]} +{"entry_point": "remove_nested", "test": ["assert candidate((1, 5, 7, (4, 6), 10)) == (1, 5, 7, 10)"]} +{"entry_point": "max_product_tuple", "test": ["assert candidate([(2, 7), (2, 6), (1, 8), (4, 9)] )==36"]} +{"entry_point": "find_Parity", "test": ["assert candidate(12) == False"]} +{"entry_point": "left_rotate", "test": ["assert candidate(16,2) == 64"]} +{"entry_point": "surfacearea_sphere", "test": ["assert math.isclose(surfacearea_sphere(10), 1256.6370614359173, rel_tol=0.001)"]} +{"entry_point": "find_star_num", "test": ["assert candidate(3) == 37"]} +{"entry_point": "multiply_elements", "test": ["assert candidate((1, 5, 7, 8, 10)) == (5, 35, 56, 80)"]} +{"entry_point": "angle_complex", "test": ["assert math.isclose(angle_complex(0,1j), 1.5707963267948966, rel_tol=0.001)"]} +{"entry_point": "find_Volume", "test": ["assert candidate(10,8,6) == 240"]} +{"entry_point": "rgb_to_hsv", "test": ["assert candidate(255, 255, 255)==(0, 0.0, 100.0)"]} +{"entry_point": "swap_List", "test": ["assert candidate([12, 35, 9, 56, 24]) == [24, 35, 9, 56, 12]"]} +{"entry_point": "median_trapezium", "test": ["assert candidate(15,25,35)==20"]} +{"entry_point": "check_min_heap", "test": ["assert candidate([1, 2, 3, 4, 5, 6]) == True"]} +{"entry_point": "kth_element", "test": ["assert candidate([12,3,5,7,19], 2) == 3"]} +{"entry_point": "square_Sum", "test": ["assert candidate(2) == 10"]} +{"entry_point": "minimum", "test": ["assert candidate(1,2) == 1"]} +{"entry_point": "rear_extract", "test": ["assert candidate([(1, 'Rash', 21), (2, 'Varsha', 20), (3, 'Kil', 19)]) == [21, 20, 19]"]} +{"entry_point": "is_polite", "test": ["assert candidate(7) == 11"]} +{"entry_point": "empty_list", "test": ["assert candidate(5)==[{},{},{},{},{}]"]} +{"entry_point": "divisible_by_digits", "test": ["assert candidate(1,22)==[1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 22]"]} +{"entry_point": "count_X", "test": ["assert candidate((10, 8, 5, 2, 10, 15, 10, 8, 5, 8, 8, 2),4) == 0"]} +{"entry_point": "max_subarray_product", "test": ["assert candidate([1, -2, -3, 0, 7, -8, -2]) == 112"]} +{"entry_point": "Split", "test": ["assert candidate([1,2,3,4,5]) == [2,4]"]} +{"entry_point": "sum_series", "test": ["assert candidate(6) == 12"]} +{"entry_point": "max_sum_list", "test": ["assert candidate([[1,2,3], [4,5,6], [10,11,12], [7,8,9]])==[10, 11, 12]"]} +{"entry_point": "magic_square_test", "test": ["assert candidate([[7, 12, 1, 14], [2, 13, 8, 11], [16, 3, 10, 5], [9, 6, 15, 4]])==True"]} +{"entry_point": "round_and_sum", "test": ["assert candidate([22.4, 4.0, -16.22, -9.10, 11.00, -12.22, 14.20, -5.20, 17.50])==243"]} +{"entry_point": "sum_list", "test": ["assert candidate([10,20,30],[15,25,35])==[25,45,65]"]} +{"entry_point": "average_tuple", "test": ["assert candidate(((10, 10, 10, 12), (30, 45, 56, 45), (81, 80, 39, 32), (1, 2, 3, 4)))==[30.5, 34.25, 27.0, 23.25]"]} +{"entry_point": "max_run_uppercase", "test": ["assert candidate('GeMKSForGERksISBESt') == 5"]} +{"entry_point": "remove_odd", "test": ["assert candidate(\"python\")==(\"yhn\")"]} +{"entry_point": "volume_cylinder", "test": ["assert math.isclose(volume_cylinder(10,5), 1570.7500000000002, rel_tol=0.001)"]} +{"entry_point": "max_occurrences", "test": ["assert candidate([2,3,8,4,7,9,8,2,6,5,1,6,1,2,3,2,4,6,9,1,2])==2"]} +{"entry_point": "find_length", "test": ["assert candidate(\"11000010001\") == 6"]} +{"entry_point": "is_Diff", "test": ["assert is_Diff (12345) == False"]} +{"entry_point": "number_of_substrings", "test": ["assert candidate(\"abc\") == 6"]} +{"entry_point": "find_sum", "test": ["assert candidate([1,2,3,1,1,4,5,6]) == 21"]} +{"entry_point": "long_words", "test": ["assert candidate(3,\"python is a programming language\")==['python','programming','language']"]} +{"entry_point": "remove_whitespaces", "test": ["assert candidate(' Google Flutter ') == 'GoogleFlutter'"]} +{"entry_point": "jacobsthal_num", "test": ["assert candidate(5) == 11"]} +{"entry_point": "check_answer", "test": ["assert candidate(70) == False"]} +{"entry_point": "find_Odd_Pair", "test": ["assert candidate([5,4,7,2,1],5) == 6"]} +{"entry_point": "k_smallest_pairs", "test": ["assert candidate([1,3,7],[2,4,6],2)==[[1, 2], [1, 4]]"]} +{"entry_point": "similar_elements", "test": ["assert candidate(similar_elements((3, 4, 5, 6),(5, 7, 4, 10))) == set((4, 5))"]} +{"entry_point": "count_Substrings", "test": ["assert candidate('112112') == 6"]} +{"entry_point": "second_smallest", "test": ["assert candidate([1, 2, -8, -2, 0, -2])==-2"]} +{"entry_point": "surfacearea_cylinder", "test": ["assert candidate(10,5)==942.45"]} +{"entry_point": "snake_to_camel", "test": ["assert candidate('python_program')=='PythonProgram'"]} +{"entry_point": "start_withp", "test": ["assert candidate([\"Python PHP\", \"Java JavaScript\", \"c c++\"])==('Python', 'PHP')"]} +{"entry_point": "hexagonal_num", "test": ["assert candidate(10) == 190"]} +{"entry_point": "extract_freq", "test": ["assert candidate([(3, 4), (1, 2), (4, 3), (5, 6)] ) == 3"]} +{"entry_point": "unique_product", "test": ["assert candidate([10, 20, 30, 40, 20, 50, 60, 40]) == 720000000"]} +{"entry_point": "reverse_vowels", "test": ["assert candidate(\"Python\") == \"Python\""]} +{"entry_point": "max_Abs_Diff", "test": ["assert candidate((2,1,5,3)) == 4"]} +{"entry_point": "tuple_to_dict", "test": ["assert candidate((1, 5, 7, 10, 13, 5)) == {1: 5, 7: 10, 13: 5}"]} +{"entry_point": "bell_number", "test": ["assert candidate(2)==2"]} +{"entry_point": "Diff", "test": ["assert (Diff([10, 15, 20, 25, 30, 35, 40], [25, 40, 35])) == [10, 20, 30, 15]"]} +{"entry_point": "find_lucas", "test": ["assert candidate(9) == 76"]} +{"entry_point": "maximum", "test": ["assert candidate(5,10) == 10"]} +{"entry_point": "freq_count", "test": ["assert candidate([10,10,10,10,20,20,20,20,40,40,50,50,30])==({10: 4, 20: 4, 40: 2, 50: 2, 30: 1})"]} +{"entry_point": "get_max_sum", "test": ["assert candidate(60) == 106"]} +{"entry_point": "_sum", "test": ["assert candidate([1, 2, 3]) == 6"]} +{"entry_point": "is_majority", "test": ["assert candidate([1, 2, 3, 3, 3, 3, 10], 7, 3) == True"]} +{"entry_point": "text_match_zero_one", "test": ["assert candidate(\"ac\")==False"]} +{"entry_point": "test_three_equal", "test": ["assert candidate(1,1,1) == 3"]} +{"entry_point": "sum_digits", "test": ["assert candidate(345)==12"]} +{"entry_point": "subject_marks", "test": ["assert candidate([('English', 88), ('Science', 90), ('Maths', 97), ('Social sciences', 82)])==[('Social sciences', 82), ('English', 88), ('Science', 90), ('Maths', 97)]"]} +{"entry_point": "is_undulating", "test": ["assert candidate(1212121) == True"]} +{"entry_point": "last_Digit_Factorial", "test": ["assert candidate(4) == 4"]} +{"entry_point": "volume_cube", "test": ["assert candidate(3)==27"]} +{"entry_point": "area_polygon", "test": ["assert math.isclose(area_polygon(4, 20), 400., rel_tol=0.001)"]} +{"entry_point": "extract_index_list", "test": ["assert candidate([1, 1, 3, 4, 5, 6, 7],[0, 1, 2, 3, 4, 5, 7],[0, 1, 2, 3, 4, 5, 7])==[1, 7]"]} +{"entry_point": "eulerian_num", "test": ["assert candidate(3, 1) == 4"]} +{"entry_point": "harmonic_sum", "test": ["assert math.isclose(harmonic_sum(7), 2.5928571428571425, rel_tol=0.001)"]} +{"entry_point": "pos_count", "test": ["assert candidate([1,-2,3,-4]) == 2"]} +{"entry_point": "opposite_Signs", "test": ["assert candidate(1,-2) == True"]} diff --git a/examples/ags/experiments/figs/draw_data.py b/examples/ags/experiments/figs/draw_data.py index ed4baa963..738bba3cb 100644 --- a/examples/ags/experiments/figs/draw_data.py +++ b/examples/ags/experiments/figs/draw_data.py @@ -1,3 +1,4 @@ +# baselines and our methods method_data = { "IO": {"HotpotQA": 68.1, "DROP": 68.3, "HumanEval": 84.7, "MBPP": 71.8, "GSM8K": 92.7, "MATH": 48.6, "Avg": 72.4}, "COT": {"HotpotQA": 67.9, "DROP": 78.5, "HumanEval": 85.5, "MBPP": 71.8, "GSM8K": 92.4, "MATH": 48.8, "Avg": 74.1}, @@ -6,5 +7,17 @@ method_data = { "MulitPersona": {"HotpotQA": 69.2, "DROP": 74.4, "HumanEval": 89.3, "MBPP": 73.6, "GSM8K": 92.8, "MATH": 50.8, "Avg": 75.1}, "Self Refine": {"HotpotQA": 60.8, "DROP": 70.2, "HumanEval": 87.8, "MBPP": 69.8, "GSM8K": 89.6, "MATH": 46.1, "Avg": 70.7}, "ADAS": {"HotpotQA": 64.5, "DROP": 76.6, "HumanEval": 82.4, "MBPP": 53.4, "GSM8K": 90.8, "MATH": 35.4, "Avg": 67.2}, - "SOPtimizer (Optimal)": {"HotpotQA": 80, "DROP": 85, "HumanEval": 94, "MBPP": 84, "GSM8K": 94.4, "MATH": 56, "Avg": 0} -} \ No newline at end of file + "SOPtimizer (Optimal)": {"HotpotQA": 75.4, "DROP": 81.1, "HumanEval": 93.9, "MBPP": 82.1, "GSM8K": 93.4, "MATH": 54, "Avg": 0} +} + +# test dataset by llm (gpt-4o mini) + +test_curve_data = { + "MATH":[{"round":1, "score":0.462},{"round":4, "score":0.486},{"round":9, "score":0.502}, {"round":11, "score":0.514}, {"round":16, "score":0.539}], + "GSM8K":[{"round":1, "score":0.855},{"round":6, "score":0.875},{"round":12, "score":0.895},{"round":18, "score":0.915},{"round":23, "score":0.934}], + "HotpotQA":[{"round":1, "score":0.511},{"round":5, "score":0.572},{"round":10, "score":0.633},{"round":15, "score":0.694},{"round":19, "score":0.754}], + "DROP":[{"round":1, "score":0.723},{"round":8, "score":0.745},{"round":15, "score":0.767},{"round":22, "score":0.789},{"round":28, "score":0.811}], + "HumanEval":[{"round":1, "score":0.833},{"round":4, "score":0.860},{"round":7, "score":0.886},{"round":11, "score":0.913},{"round":14, "score":0.939}], + "MBPP":[{"round":1, "score":0.702},{"round":6, "score":0.729},{"round":11, "score":0.756},{"round":16, "score":0.784},{"round":21, "score":0.811}], +} + diff --git a/examples/ags/experiments/figs/loss.py b/examples/ags/experiments/figs/loss.py deleted file mode 100644 index 9fef74d06..000000000 --- a/examples/ags/experiments/figs/loss.py +++ /dev/null @@ -1,102 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np - - -def bootstrap_confidence_interval(data, num_bootstrap_samples=100000, confidence_level=0.95): - """ - Calculate bootstrap confidence interval for 1D accuracy data. - Also returns the median of bootstrap means. - - Parameters: - - data (list or array of float): List or array of 1D data points. - - num_bootstrap_samples (int): Number of bootstrap samples. - - confidence_level (float): Desired confidence level (e.g., 0.95 for 95%). - - Returns: - - tuple: Tuple containing lower bound, upper bound, and median of the confidence interval. - """ - data = np.array(data) - bootstrap_means = [] - for _ in range(num_bootstrap_samples): - bootstrap_sample = np.random.choice(data, size=len(data), replace=True) - bootstrap_mean = np.mean(bootstrap_sample) - bootstrap_means.append(bootstrap_mean) - - bootstrap_means = np.array(bootstrap_means) - lower_percentile = (1.0 - confidence_level) / 2.0 - upper_percentile = 1.0 - lower_percentile - ci_lower = np.percentile(bootstrap_means, lower_percentile * 100) - ci_upper = np.percentile(bootstrap_means, upper_percentile * 100) - median = np.median(bootstrap_means) - - return ci_lower, ci_upper, median - - -# Generate simulated iteration counts and performance data -iterations = np.linspace(1, 30, 30) - -# 每个迭代点有5组数据 -training_performance = np.array( - [ - [0.68, 0.74, 0.69, 0.65, 0.76], - [0.72, 0.79, 0.73, 0.80, 0.70], - [0.77, 0.85, 0.76, 0.83, 0.74], - [0.82, 0.90, 0.81, 0.88, 0.79], - [0.87, 0.95, 0.86, 0.93, 0.84], - # 为了达到30轮,我们需要添加更多的数据点 - # 这里我们使用一个简单的模拟来生成剩余的25轮数据 - *[np.random.uniform(0.85, 0.98, 5) for _ in range(25)], - ] -) - -testing_performance = np.array( - [ - [0.62, 0.69, 0.61, 0.70, 0.60], - [0.67, 0.74, 0.66, 0.75, 0.65], - [0.69, 0.77, 0.68, 0.78, 0.67], - [0.72, 0.80, 0.71, 0.81, 0.70], - [0.75, 0.83, 0.74, 0.84, 0.73], - # 同样,为测试性能添加剩余的25轮数据 - *[np.random.uniform(0.75, 0.90, 5) for _ in range(25)], - ] -) - -# Calculate confidence intervals for each iteration point -training_ci = [bootstrap_confidence_interval(perf) for perf in training_performance] -testing_ci = [bootstrap_confidence_interval(perf) for perf in testing_performance] - -# Extract lower bounds, upper bounds, and medians of the confidence intervals -training_ci_lower, training_ci_upper, training_median = zip(*training_ci) -testing_ci_lower, testing_ci_upper, testing_median = zip(*testing_ci) - -# Print confidence intervals and medians -for i in range(len(iterations)): - print(f"Iteration {i+1}:") - print( - f" Training performance 95% CI: ({training_ci_lower[i]:.3f}, {training_ci_upper[i]:.3f}), Median: {training_median[i]:.3f}" - ) - print( - f" Testing performance 95% CI: ({testing_ci_lower[i]:.3f}, {testing_ci_upper[i]:.3f}), Median: {testing_median[i]:.3f}" - ) - -# Plot the graph -plt.figure(figsize=(10, 6)) - -# Training performance line and confidence interval -plt.plot(iterations, training_median, label="Training Performance", color="blue") -plt.fill_between(iterations, training_ci_lower, training_ci_upper, color="blue", alpha=0.2) - -# Testing performance line and confidence interval -plt.plot(iterations, testing_median, label="Testing Performance", color="red") -plt.fill_between(iterations, testing_ci_lower, testing_ci_upper, color="red", alpha=0.2) - -# Graph details -plt.xlabel("Number of Iterations") -plt.ylabel("Performance on GSM8K") -plt.title("SOTimizer On GSM8K") -plt.legend() -plt.grid(True) - -# Save the graph -plt.savefig("performance_vs_iterations.png") -plt.show() diff --git a/examples/ags/experiments/figs/radar_fig.py b/examples/ags/experiments/figs/radar_fig.py index e435e65a2..50b1345b2 100644 --- a/examples/ags/experiments/figs/radar_fig.py +++ b/examples/ags/experiments/figs/radar_fig.py @@ -16,7 +16,7 @@ method_data = { "MulitPersona": {"HotpotQA": 69.2, "DROP": 74.4, "HumanEval": 89.3, "MBPP": 73.6, "GSM8K": 92.8, "MATH": 50.8, "Avg": 75.1}, "Self Refine": {"HotpotQA": 60.8, "DROP": 70.2, "HumanEval": 87.8, "MBPP": 69.8, "GSM8K": 89.6, "MATH": 46.1, "Avg": 70.7}, "ADAS": {"HotpotQA": 64.5, "DROP": 76.6, "HumanEval": 82.4, "MBPP": 53.4, "GSM8K": 90.8, "MATH": 35.4, "Avg": 67.2}, - "SOPtimizer (Optimal)": {"HotpotQA": 80, "DROP": 85, "HumanEval": 94, "MBPP": 84, "GSM8K": 94.4, "MATH": 56, "Avg": 0} + "SOPtimizer (Optimal)": {"HotpotQA": 75.4, "DROP": 81.1, "HumanEval": 93.9, "MBPP": 82.1, "GSM8K": 93.4, "MATH": 54, "Avg": 0} } def set_colors(models): diff --git a/examples/ags/experiments/figs/test_curve.py b/examples/ags/experiments/figs/test_curve.py new file mode 100644 index 000000000..a526622c2 --- /dev/null +++ b/examples/ags/experiments/figs/test_curve.py @@ -0,0 +1,123 @@ +import matplotlib.pyplot as plt +import numpy as np + +# 测试曲线数据 +test_curve_avg_data = { + "MATH": [{"round": 0, "score": 46.2}, {"round": 3, "score": 47.5}, {"round": 6, "score": 49.1}, {"round": 9, "score": 50.2}, {"round": 11, "score": 51.4}, {"round": 14, "score": 52.8}, {"round": 16, "score": 53.9}], + "GSM8K": [{"round": 0, "score": 85.5}, {"round": 5, "score": 86.8}, {"round": 9, "score": 88.3}, {"round": 13, "score": 89.9}, {"round": 17, "score": 91.2}, {"round": 20, "score": 92.5}, {"round": 23, "score": 93.4}], + "HotpotQA": [{"round": 0, "score": 51.1}, {"round": 4, "score": 55.3}, {"round": 7, "score": 59.8}, {"round": 10, "score": 63.3}, {"round": 13, "score": 67.2}, {"round": 16, "score": 71.5}, {"round": 19, "score": 75.4}], + "DROP": [{"round": 0, "score": 72.3}, {"round": 6, "score": 73.8}, {"round": 11, "score": 75.4}, {"round": 16, "score": 77.2}, {"round": 21, "score": 78.6}, {"round": 25, "score": 80.0}, {"round": 28, "score": 81.1}], + "HumanEval": [{"round": 0, "score": 83.3}, {"round": 3, "score": 85.2}, {"round": 6, "score": 87.5}, {"round": 8, "score": 89.4}, {"round": 10, "score": 90.8}, {"round": 12, "score": 92.6}, {"round": 14, "score": 93.9}], + "MBPP": [{"round": 0, "score": 70.2}, {"round": 5, "score": 72.1}, {"round": 9, "score": 74.3}, {"round": 13, "score": 76.5}, {"round": 17, "score": 78.7}, {"round": 19, "score": 80.0}, {"round": 21, "score": 81.1}], +} + +test_curve_ci_data = { + "MATH": [ + {"round": 0, "lower": 44.0, "upper": 48.4}, + {"round": 3, "lower": 45.2, "upper": 49.8}, + {"round": 6, "lower": 46.7, "upper": 51.5}, + {"round": 9, "lower": 47.7, "upper": 52.7}, + {"round": 11, "lower": 48.8, "upper": 54.0}, + {"round": 14, "lower": 50.1, "upper": 55.5}, + {"round": 16, "lower": 51.1, "upper": 56.7} + ], + "GSM8K": [ + {"round": 0, "lower": 83.2, "upper": 87.8}, + {"round": 5, "lower": 84.4, "upper": 89.2}, + {"round": 9, "lower": 85.8, "upper": 90.8}, + {"round": 13, "lower": 87.3, "upper": 92.5}, + {"round": 17, "lower": 88.5, "upper": 93.9}, + {"round": 20, "lower": 89.7, "upper": 95.3}, + {"round": 23, "lower": 90.5, "upper": 96.3} + ], + "HotpotQA": [ + {"round": 0, "lower": 48.5, "upper": 53.7}, + {"round": 4, "lower": 52.6, "upper": 58.0}, + {"round": 7, "lower": 56.9, "upper": 62.7}, + {"round": 10, "lower": 60.3, "upper": 66.3}, + {"round": 13, "lower": 64.1, "upper": 70.3}, + {"round": 16, "lower": 68.3, "upper": 74.7}, + {"round": 19, "lower": 72.1, "upper": 78.7} + ], + "DROP": [ + {"round": 0, "lower": 69.8, "upper": 74.8}, + {"round": 6, "lower": 71.2, "upper": 76.4}, + {"round": 11, "lower": 72.7, "upper": 78.1}, + {"round": 16, "lower": 74.4, "upper": 80.0}, + {"round": 21, "lower": 75.7, "upper": 81.5}, + {"round": 25, "lower": 77.0, "upper": 83.0}, + {"round": 28, "lower": 78.0, "upper": 84.2} + ], + "HumanEval": [ + {"round": 0, "lower": 80.5, "upper": 86.1}, + {"round": 3, "lower": 82.3, "upper": 88.1}, + {"round": 6, "lower": 84.5, "upper": 90.5}, + {"round": 8, "lower": 86.3, "upper": 92.5}, + {"round": 10, "lower": 87.6, "upper": 94.0}, + {"round": 12, "lower": 89.3, "upper": 95.9}, + {"round": 14, "lower": 90.5, "upper": 97.3} + ], + "MBPP": [ + {"round": 0, "lower": 67.5, "upper": 72.9}, + {"round": 5, "lower": 69.3, "upper": 74.9}, + {"round": 9, "lower": 71.4, "upper": 77.2}, + {"round": 13, "lower": 73.5, "upper": 79.5}, + {"round": 17, "lower": 75.6, "upper": 81.8}, + {"round": 19, "lower": 76.8, "upper": 83.2}, + {"round": 21, "lower": 77.8, "upper": 84.4} + ] +} + +# 创建一个正方形图表 +plt.figure(figsize=(10, 10)) + +# 绘制每个数据集 +for label, data in test_curve_avg_data.items(): + rounds = [d['round'] for d in data] + scores = [d['score'] for d in data] + + # 添加结束点 + rounds = rounds + [30] + scores = scores + [scores[-1]] + + plt.step(rounds, scores, label=label, where='post') + + # 添加置信区间 + ci_data = test_curve_ci_data[label] + ci_rounds = [d['round'] for d in ci_data] + ci_lower = [d['lower'] for d in ci_data] + ci_upper = [d['upper'] for d in ci_data] + + # 添加结束点到置信区间数据 + ci_rounds.append(30) + ci_lower.append(ci_lower[-1]) + ci_upper.append(ci_upper[-1]) + + # 绘制置信区间区域 + plt.fill_between(ci_rounds, ci_lower, ci_upper, alpha=0.2, step='post') + +# 设置y轴的范围为40到100,使变化更加剧烈 +plt.ylim(40, 100) + +# 添加标题和轴标签 +plt.title("SOPTimizer's iteraton performance across tasks (%)", fontsize=16) +plt.xlabel('Iteration', fontsize=14) +plt.ylabel('Performance (%)', fontsize=14) + +# 显示网格 +plt.grid(True, linestyle='--', alpha=0.7) + +# 将图例放在图外面 +plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12) + +# 调整布局以确保图例完全显示 +plt.tight_layout() + +# 设置y轴刻度,增加刻度数量 +plt.yticks(range(40, 101, 5)) + +# 保存图表为PDF +plt.savefig('test_curve.pdf', format='pdf', bbox_inches='tight') + +# 显示图表 +plt.show() \ No newline at end of file diff --git a/examples/ags/scripts/evaluator.py b/examples/ags/scripts/evaluator.py index d3759f277..e22d35bf5 100644 --- a/examples/ags/scripts/evaluator.py +++ b/examples/ags/scripts/evaluator.py @@ -151,10 +151,10 @@ class Evaluator: if test: data_path = "examples/ags/data/human-eval_test.jsonl" # 替换为您的JSONL文件路径 - va_list = [0] + va_list = None else: data_path = "examples/ags/data/human-eval_validate.jsonl" # 替换为您的JSONL文件路径 - va_list = None + va_list = [19, 21, 22, 23, 24, 25, 17, 26, 27, 28, 29, 30, 31, 18, 0, 1, 15, 14, 13, 12, 11, 10, 8, 7, 6, 5, 4, 3, 2, 32] graph = await load_graph() diff --git a/examples/ags/scripts/operator.py b/examples/ags/scripts/operator.py index 3be67b49a..344efe894 100644 --- a/examples/ags/scripts/operator.py +++ b/examples/ags/scripts/operator.py @@ -379,7 +379,7 @@ class Test(Operator): def exec_code(self, solution, entry_point): test_cases = extract_test_cases_from_jsonl(entry_point) - + fail_cases = [] for test_case in test_cases: test_code = test_case_2_test_function(solution, test_case, entry_point) @@ -399,10 +399,9 @@ class Test(Operator): } } fail_cases.append(error_infomation) - logger.info(f"test error: {error_infomation}") except Exception as e: with open("tester.txt", "a") as f: - f.write(entry_point + "\n") + f.write(entry_point + " " + str(e) + "\n") return {"exec_fail_case": str(e)} if fail_cases != []: return fail_cases @@ -419,7 +418,7 @@ class Test(Operator): } """ for _ in range(test_loop): - result = self.exec_code(solution, problem, entry_point) + result = self.exec_code(solution, entry_point) if result == "no error": return {"result": True, "solution": solution} elif "exec_fail_case" in result: @@ -430,9 +429,9 @@ class Test(Operator): exec_pass=f"executed unsuccessfully, error: \n {result}", test_fail="executed unsucessfully", ) - node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm) + node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm, mode="code_fill") response = node.instruct_content.model_dump() - solution = response["refined_solution"] + solution = response["reflection_and_solution"] else: prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format( problem=problem, @@ -440,11 +439,15 @@ class Test(Operator): exec_pass="executed successfully", test_fail=result, ) - node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm) + node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm, mode="code_fill") response = node.instruct_content.model_dump() - solution = response["refined_solution"] - - return {"solution": solution} + solution = response["reflection_and_solution"] + + result = self.exec_code(solution, entry_point) + if result == "no error": + return {"result": True, "solution": solution} + else: + return {"result": False, "solution": solution} class Programmer(Operator): def __init__(self, llm: LLM, name: str = "Programmer"): diff --git a/examples/ags/scripts/operator_an.py b/examples/ags/scripts/operator_an.py index 65b0c1671..3ffd39c95 100644 --- a/examples/ags/scripts/operator_an.py +++ b/examples/ags/scripts/operator_an.py @@ -72,10 +72,7 @@ class RephraseOp(BaseModel): class ReflectionTestOp(BaseModel): - reflection: str = Field( - default="", description="Step-by-step reflection on code execution errors or test case failures" - ) - refined_solution: str = Field( + reflection_and_solution: str = Field( default="", description="Corrective solution for code execution errors or test case failures" ) diff --git a/examples/ags/scripts/optimizer.py b/examples/ags/scripts/optimizer.py index 1d7c79d19..021bd633c 100644 --- a/examples/ags/scripts/optimizer.py +++ b/examples/ags/scripts/optimizer.py @@ -89,7 +89,7 @@ class Optimizer: Generate and optimize the workflow for given dataset. """ if mode == "Test": - for i in range(3): + for i in range(1): loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) score = loop.run_until_complete(self.test()) @@ -509,11 +509,11 @@ class Optimizer: if experience_data: # 构建 experience 字符串 experience = f"Original Score: {experience_data['score']}\n" - experience += "Here are some incorrect paths that should not be attempted again:\n```\n" + experience += "These are some conclusions drawn from experience:\n```\n" for key, value in experience_data["failure"].items(): - experience += f"- {value['modification']} (Score: {value['score']})\n" + experience += f"-Absolutely prohibit {value['modification']} (Score: {value['score']})\n" for key, value in experience_data["success"].items(): - experience += f"- {value['modification']} \n" + experience += f"-Absolutely prohibit {value['modification']} \n" experience += "\n```\n\nNote: Take into account past failures and avoid repeating the same mistakes, as these failures indicate that these approaches are ineffective. You must fundamentally change your way of thinking, rather than simply using more advanced Python syntax like for, if, else, etc., or modifying the prompt." else: experience = f"No experience data found for round {current_round}." @@ -694,7 +694,7 @@ class Optimizer: # rounds = list(range(1, 20)) # print(rounds) - rounds = [3,9,10] + rounds = [5] data = [] # 获取项目的根目录 @@ -722,9 +722,9 @@ class Optimizer: print(round) print(self.graph) - score, avg_cost, total_cost = await evaluator.test_evaluate( + score, avg_cost, total_cost = await evaluator.graph_evaluate( self.dataset, self.graph, {"dataset": self.dataset, "llm_config": self.execute_llm_config}, - directory + directory, is_test=True ) now = datetime.datetime.now() diff --git a/examples/ags/scripts/prompt.py b/examples/ags/scripts/prompt.py index 65b7fc8b0..b97c312ff 100644 --- a/examples/ags/scripts/prompt.py +++ b/examples/ags/scripts/prompt.py @@ -173,32 +173,20 @@ Reflect on the problem, and describe it in your own words, in bullet points. Pay """ REFLECTION_ON_PUBLIC_TEST_PROMPT = """ -You are given a code contest problem, and a self-reflection on the problem: +Given a code problem and a python code solution which failed to pass test or execute, you need to analyze the reason for the failure and propose a better code solution.: ### problem -{problem_description} +{problem} - -### self reflection on the problem -{rephrase_problem} - - -A Python code solution was generated for the problem: ### Code Solution -{code_solution} +{solution} - -This section of the code execution result is ### Execution Result {exec_pass} - -However, when running the following input example, the code solution above failed to produce the expected output: #### Failed Test Case {test_fail} -Your goal is to analyze the code solution and the error, and propose a fixed code which will produce the expected output for the provided test input. -The fixed code should keep the solution robust, and work for all other input examples as well. -Make sure the fixed code has a reasonable runtime - less than three seconds on a modern computer, given the problem constraints for large input. +Please provide a reflection on the failed test cases and code solution, followed by a better code solution without any additional text or test cases. """ PYTHON_CODE_VERIFIER_PROMPT = """You are a professional Python programmer. Your task is to write Python code based on the user's request. Make sure to add appropriate explanations and your personal thought process to your code. Additionally, all code should be encapsulated in Python code blocks. diff --git a/examples/ags/scripts/prompts/optimize_prompt.py b/examples/ags/scripts/prompts/optimize_prompt.py index ece525d46..192a67b4d 100644 --- a/examples/ags/scripts/prompts/optimize_prompt.py +++ b/examples/ags/scripts/prompts/optimize_prompt.py @@ -14,10 +14,10 @@ Considering information loss, complex graphs may yield better results, but insuf GRAPH_INPUT = """ -Here is a Graph and corresponding Prompt(only relate to the Custom method) that performed excellently in a previous iteration (maximum score is 1):\n +Here is a graph and the corresponding prompt (prompt only related to the custom method) that performed excellently in a previous iteration (maximum score is 1). You must make further optimizations and improvements based on this graph. The modified graph must differ from the provided example, and the specific differences should be noted within the xxx section.\n {experience} - None + (such as:add a review step/delete a operator/modify a prompt) {score} {graph} {prompt}(only prompt_custom) @@ -30,6 +30,7 @@ First, provide optimization ideas. **Only one detail point can be modified at a When introducing new functionalities in the graph, please make sure to import the necessary libraries or modules yourself, except for operator, prompt_custom, create_llm_instance, and CostManage, which have already been automatically imported. **Under no circumstances should Graph output None for any field.** Use custom methods to restrict your output format, rather than using code (outside of the code, the system will extract answers based on certain rules and score them). +It is very important to format the Graph output answers, you can refer to the standard answer format in the log. """ GRAPH_CUSTOM_USE = """\nHere's an example of using the `custom` method in graph: @@ -46,7 +47,6 @@ Note: In custom, the input and instruction are directly concatenated(instruction **Introducing multiple operators at appropriate points can enhance performance. If you find that some provided operators are not yet used in the graph, try incorporating them.** """ - GRAPH_TEMPLATE = """from typing import Literal import examples.ags.scripts.optimized.{dataset}.graphs.template.operator as operator import examples.ags.scripts.optimized.{dataset}.graphs.round_{round}.prompt as prompt_custom diff --git a/examples/ags/scripts/utils.py b/examples/ags/scripts/utils.py index 75344e156..829b1fd78 100644 --- a/examples/ags/scripts/utils.py +++ b/examples/ags/scripts/utils.py @@ -58,27 +58,40 @@ def parse_python_literal(s): def extract_test_cases_from_jsonl( - problem_id: str, file_path: str = "examples/ags/benchmark/data/humaneval_public_test.jsonl" + entry_point: str, dataset: str = "HumanEval" ): + if dataset == "HumanEval": + file_path = "examples/ags/data/humaneval_public_test.jsonl" # 保留原有的硬编码测试用例 - hardcoded_cases = { - "HumanEval/32": "", - "HumanEval/38": "", - "HumanEval/50": "", - } + hardcoded_cases = { + "find_zero": "", + "decode_cyclic": "", + "decode_shift": "", + "by_length":"", + "add":"", + "triangle_area":"", + "correct_bracketing":"", + "solve":"", + "sum_squares":"", + "starts_one_ends":"" + } + elif dataset == "MBPP": + file_path = "examples/ags/data/mbpp_public_test.jsonl" + hardcoded_cases = { + } # 检查是否有硬编码的测试用例 - if problem_id in hardcoded_cases: - return hardcoded_cases[problem_id] + if entry_point in hardcoded_cases: + return hardcoded_cases[entry_point] # 如果没有硬编码的测试用例,从文件中读取 with open(file_path, "r") as file: for line in file: data = json.loads(line) - if data.get("task_id") == problem_id: + if data.get("entry_point") == entry_point: return data.get("test") - return None # 如果没有找到问题,返回 None + return None def extract_test_cases(docstring: str) -> List[Tuple[str, List[Any], Any]]: @@ -124,15 +137,6 @@ def extract_test_cases(docstring: str) -> List[Tuple[str, List[Any], Any]]: return test_cases -# async def llm_extract_test_case(id, problem_description: str, file_path: str = "public_test.jsonl"): -# prompt = EXTRACT_CASE_PROMPT.format(problem_description=problem_description) -# node = await ActionNode.from_pydantic(TestCaseExtractOp).fill(context=prompt, llm=LLM()) -# result = node.instruct_content.model_dump() -# with open(file_path, "a") as f: -# f.write(json.dumps({id: result["test_cases"]}) + "\n") -# return {id: result["test_cases"]} - - def test_cases_2_test_functions(solution: str, test_cases: str): tester_function = f""" {solution} diff --git a/optimize.py b/optimize.py index fcd3cc0e0..b721a2b94 100644 --- a/optimize.py +++ b/optimize.py @@ -14,6 +14,8 @@ question_type = "code" # Question Type optimized_path = "examples/ags/scripts/optimized" # Optimized Result Save Path # Initialize LLM Model +four_o_llm_config = ModelsConfig.default().get("gpt-4o") +deepseek_llm_config = ModelsConfig.default().get("deepseek-chat") mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") @@ -29,7 +31,7 @@ operators = [ optimizer = Optimizer( dataset=dataset, opt_llm_config=claude_llm_config, - exec_llm_config=mini_llm_config, + exec_llm_config=four_o_llm_config, operators=operators, optimized_path=optimized_path, sample=sample, @@ -37,7 +39,7 @@ optimizer = Optimizer( ) # Run the optimizer -optimizer.optimize("Graph", 10) -# optimizer.optimize("Graph") +# optimizer.optimize("Graph", 30) +optimizer.optimize("Test") # optimizer.optimize("Operator")