Update AFlow

This commit is contained in:
didi 2024-10-16 11:49:18 +08:00
parent eae351466f
commit bb229f2319
20 changed files with 3 additions and 4619 deletions

6
.gitignore vendored
View file

@ -189,6 +189,6 @@ cov.xml
*.dot
.python-version
*.csv
/examples/ags/data/baselines/general
/examples/ags/scripts/optimized/HumanEval/graphs
/examples/ags/scripts/optimized/HumanEval/graphs_test
/examples/aflow/data/baselines/general
/examples/aflow/scripts/optimized/HumanEval/graphs
/examples/aflow/scripts/optimized/HumanEval/graphs_test

52
10.txt
View file

@ -1,52 +0,0 @@
2024-07-01 15:30:33.806 | DEBUG | metagpt.provider.base_llm:aask:151 - [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': '\n## context\n\nGenerate Code Solution for the following problem: \n\ndef is_palindrome(string: str) -> bool:\n """ Test if given string is a palindrome """\n return string == string[::-1]\n\n\ndef make_palindrome(string: str) -> str:\n """ Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n >>> make_palindrome(\'\')\n \'\'\n >>> make_palindrome(\'cat\')\n \'catac\'\n >>> make_palindrome(\'cata\')\n \'catac\'\n """\n\n\n\n-----\n\n## format example\n[CONTENT]\n{\n "solution": ""\n}\n[/CONTENT]\n\n## nodes: "<node>: <type> # <instruction>"\n- solution: <class \'str\'> # Your Code Solution for this problem\n\n\n## constraint\nLanguage: Please use the same language as Human INPUT.\nFormat: output wrapped inside [CONTENT][/CONTENT] like format example, nothing else.\n\n## action\nFollow instructions of nodes, generate output and make sure it follows the format example.\n'}]
2024-07-01 15:30:42.412 | INFO | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.000 | Max budget: $10.000 | Current cost: $0.000, prompt_tokens: 318, completion_tokens: 175
2024-07-01 15:30:42.413 | DEBUG | metagpt.actions.action_node:_aask_v1:421 - llm raw output:
[CONTENT]
{
"solution": "def make_palindrome(string: str) -> str:\n \"\"\" Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n \"\"\"\n if not string:\n return ''\n\n for i in range(len(string)):\n if string[i:] == string[i:][::-1]:\n return string + string[:i][::-1]\n\n return string + string[:-1][::-1]\n"
}
[/CONTENT]
2024-07-01 15:30:42.418 | DEBUG | metagpt.actions.action_node:_aask_v1:431 - parsed_data:
{'solution': 'def make_palindrome(string: str) -> str:\n """ Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n """\n if not string:\n return \'\'\n\n for i in range(len(string)):\n if string[i:] == string[i:][::-1]:\n return string + string[:i][::-1]\n\n return string + string[:-1][::-1]\n'}
2024-07-01 15:30:42.419 | DEBUG | metagpt.provider.base_llm:aask:151 - [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': '\n## context\n\nFor the question described as \n\ndef is_palindrome(string: str) -> bool:\n """ Test if given string is a palindrome """\n return string == string[::-1]\n\n\ndef make_palindrome(string: str) -> str:\n """ Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n >>> make_palindrome(\'\')\n \'\'\n >>> make_palindrome(\'cat\')\n \'catac\'\n >>> make_palindrome(\'cata\')\n \'catac\'\n """\n,\nplease review the following solution: {\'solution\': \'def make_palindrome(string: str) -> str:\\n """ Find the shortest palindrome that begins with a supplied string.\\n Algorithm idea is simple:\\n - Find the longest postfix of supplied string that is a palindrome.\\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\\n """\\n if not string:\\n return \\\'\\\'\\n\\n for i in range(len(string)):\\n if string[i:] == string[i:][::-1]:\\n return string + string[:i][::-1]\\n\\n return string + string[:-1][::-1]\\n\'}, and provide a review result in boolean format.\nIf you believe the solution is capable of resolving the issue, return True; otherwise, return False, and include your comments\n\n\n-----\n\n## format example\n[CONTENT]\n{\n "review_result": false,\n "feedback": ""\n}\n[/CONTENT]\n\n## nodes: "<node>: <type> # <instruction>"\n- review_result: <class \'bool\'> # The Review Result (Bool). If you think this solution looks good for you, return \'true\'; If not, return \'false\'\n- feedback: <class \'str\'> # Your FeedBack for this problem based on the criteria. 
If the review result is true, you can put it \'nothing here\'.\n\n\n## constraint\nLanguage: Please use the same language as Human INPUT.\nFormat: output wrapped inside [CONTENT][/CONTENT] like format example, nothing else.\n\n## action\nFollow instructions of nodes, generate output and make sure it follows the format example.\n'}]
2024-07-01 15:30:44.222 | INFO | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.000 | Max budget: $10.000 | Current cost: $0.000, prompt_tokens: 585, completion_tokens: 29
2024-07-01 15:30:44.222 | DEBUG | metagpt.actions.action_node:_aask_v1:421 - llm raw output:
[CONTENT]
{
"review_result": true,
"feedback": "nothing here"
}
[/CONTENT]
2024-07-01 15:30:44.224 | DEBUG | metagpt.actions.action_node:_aask_v1:431 - parsed_data:
{'review_result': True, 'feedback': 'nothing here'}
2024-07-01 15:30:44.224 | DEBUG | metagpt.provider.base_llm:aask:151 - [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': '\n## context\n\nGenerate Code Solution for the following problem: \n\ndef is_palindrome(string: str) -> bool:\n """ Test if given string is a palindrome """\n return string == string[::-1]\n\n\ndef make_palindrome(string: str) -> str:\n """ Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n >>> make_palindrome(\'\')\n \'\'\n >>> make_palindrome(\'cat\')\n \'catac\'\n >>> make_palindrome(\'cata\')\n \'catac\'\n """\n\n\n\n-----\n\n## format example\n[CONTENT]\n{\n "solution": ""\n}\n[/CONTENT]\n\n## nodes: "<node>: <type> # <instruction>"\n- solution: <class \'str\'> # Your Code Solution for this problem\n\n\n## constraint\nLanguage: Please use the same language as Human INPUT.\nFormat: output wrapped inside [CONTENT][/CONTENT] like format example, nothing else.\n\n## action\nFollow instructions of nodes, generate output and make sure it follows the format example.\n'}]
2024-07-01 15:30:53.135 | INFO | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.000 | Max budget: $10.000 | Current cost: $0.000, prompt_tokens: 318, completion_tokens: 175
2024-07-01 15:30:53.136 | DEBUG | metagpt.actions.action_node:_aask_v1:421 - llm raw output:
[CONTENT]
{
"solution": "def make_palindrome(string: str) -> str:\n \"\"\" Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n \"\"\"\n if not string:\n return ''\n\n for i in range(len(string)):\n if string[i:] == string[i:][::-1]:\n return string + string[:i][::-1]\n\n return string + string[:-1][::-1]\n"
}
[/CONTENT]
2024-07-01 15:30:53.137 | DEBUG | metagpt.actions.action_node:_aask_v1:431 - parsed_data:
{'solution': 'def make_palindrome(string: str) -> str:\n """ Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n """\n if not string:\n return \'\'\n\n for i in range(len(string)):\n if string[i:] == string[i:][::-1]:\n return string + string[:i][::-1]\n\n return string + string[:-1][::-1]\n'}
2024-07-01 15:30:53.138 | DEBUG | metagpt.provider.base_llm:aask:151 - [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': '\n## context\n\nFor the question described as \n\ndef is_palindrome(string: str) -> bool:\n """ Test if given string is a palindrome """\n return string == string[::-1]\n\n\ndef make_palindrome(string: str) -> str:\n """ Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n >>> make_palindrome(\'\')\n \'\'\n >>> make_palindrome(\'cat\')\n \'catac\'\n >>> make_palindrome(\'cata\')\n \'catac\'\n """\n,\nplease review the following solution: {\'solution\': \'def make_palindrome(string: str) -> str:\\n """ Find the shortest palindrome that begins with a supplied string.\\n Algorithm idea is simple:\\n - Find the longest postfix of supplied string that is a palindrome.\\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\\n """\\n if not string:\\n return \\\'\\\'\\n\\n for i in range(len(string)):\\n if string[i:] == string[i:][::-1]:\\n return string + string[:i][::-1]\\n\\n return string + string[:-1][::-1]\\n\'}, and provide a review result in boolean format.\nIf you believe the solution is capable of resolving the issue, return True; otherwise, return False, and include your comments\n\n\n-----\n\n## format example\n[CONTENT]\n{\n "review_result": false,\n "feedback": ""\n}\n[/CONTENT]\n\n## nodes: "<node>: <type> # <instruction>"\n- review_result: <class \'bool\'> # The Review Result (Bool). If you think this solution looks good for you, return \'true\'; If not, return \'false\'\n- feedback: <class \'str\'> # Your FeedBack for this problem based on the criteria. 
If the review result is true, you can put it \'nothing here\'.\n\n\n## constraint\nLanguage: Please use the same language as Human INPUT.\nFormat: output wrapped inside [CONTENT][/CONTENT] like format example, nothing else.\n\n## action\nFollow instructions of nodes, generate output and make sure it follows the format example.\n'}]
2024-07-01 15:30:55.232 | INFO | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.000 | Max budget: $10.000 | Current cost: $0.000, prompt_tokens: 585, completion_tokens: 29
2024-07-01 15:30:55.233 | DEBUG | metagpt.actions.action_node:_aask_v1:421 - llm raw output:
[CONTENT]
{
"review_result": true,
"feedback": "nothing here"
}
[/CONTENT]
2024-07-01 15:30:55.234 | DEBUG | metagpt.actions.action_node:_aask_v1:431 - parsed_data:
{'review_result': True, 'feedback': 'nothing here'}
2024-07-01 15:30:55.234 | DEBUG | metagpt.provider.base_llm:aask:151 - [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': '\n## context\n\nFor the question described as \n\ndef is_palindrome(string: str) -> bool:\n """ Test if given string is a palindrome """\n return string == string[::-1]\n\n\ndef make_palindrome(string: str) -> str:\n """ Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n >>> make_palindrome(\'\')\n \'\'\n >>> make_palindrome(\'cat\')\n \'catac\'\n >>> make_palindrome(\'cata\')\n \'catac\'\n """\n, Solutions: def make_palindrome(string: str) -> str:\n """ Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n """\n if not string:\n return \'\'\n\n for i in range(len(string)):\n if string[i:] == string[i:][::-1]:\n return string + string[:i][::-1]\n\n return string + string[:-1][::-1]\n\ndef make_palindrome(string: str) -> str:\n """ Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n """\n if not string:\n return \'\'\n\n for i in range(len(string)):\n if string[i:] == string[i:][::-1]:\n return string + string[:i][::-1]\n\n return string + string[:-1][::-1]\n\n\nPlease select the solution that appears most frequently from these options and provide the best solution based on that.\n\n\n-----\n\n## format example\n[CONTENT]\n{\n "solution": ""\n}\n[/CONTENT]\n\n## 
nodes: "<node>: <type> # <instruction>"\n- solution: <class \'str\'> # Final ensemble solution for this problem\n\n\n## constraint\nLanguage: Please use the same language as Human INPUT.\nFormat: output wrapped inside [CONTENT][/CONTENT] like format example, nothing else.\n\n## action\nFollow instructions of nodes, generate output and make sure it follows the format example.\n'}]
2024-07-01 15:31:03.826 | INFO | metagpt.utils.cost_manager:update_cost:57 - Total running cost: $0.001 | Max budget: $10.000 | Current cost: $0.000, prompt_tokens: 635, completion_tokens: 173
2024-07-01 15:31:03.827 | DEBUG | metagpt.actions.action_node:_aask_v1:421 - llm raw output:
[CONTENT]
{
"solution": "def make_palindrome(string: str) -> str:\n \"\"\" Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n \"\"\"\n if not string:\n return ''\n\n for i in range(len(string)):\n if string[i:] == string[i:][::-1]:\n return string + string[:i][::-1]\n\n return string + string[:-1][::-1]"
}
[/CONTENT]
2024-07-01 15:31:03.830 | DEBUG | metagpt.actions.action_node:_aask_v1:431 - parsed_data:
{'solution': 'def make_palindrome(string: str) -> str:\n """ Find the shortest palindrome that begins with a supplied string.\n Algorithm idea is simple:\n - Find the longest postfix of supplied string that is a palindrome.\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n """\n if not string:\n return \'\'\n\n for i in range(len(string)):\n if string[i:] == string[i:][::-1]:\n return string + string[:i][::-1]\n\n return string + string[:-1][::-1]'}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1,53 +0,0 @@
from examples.ags.scripts.operator import Operator
from examples.ags.scripts.graph import SolveGraph
from examples.ags.benchmark.humaneval import humaneval_evaluation
from examples.ags.scripts.operator_an import GenerateOp
from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.llm import LLM
from pydantic import BaseModel, Field
# Prompt template used by CoTGenerate: `{question}` is replaced with the problem text,
# and the model is asked for a step-by-step explanation followed by the function.
HUMANEVAL_PROMPT_GPT = """
{question}\nPlease provide a step-by-step explanation in text, followed by your Python function without any additional text or test cases.
"""
# Output schema handed to ActionNode.from_pydantic: a single text field that
# carries the generated solution. (Kept as `#` comments rather than a class
# docstring so the model's schema description stays untouched.)
class GenerateOp(BaseModel):
    solution: str = Field(
        description="Python Solution For This Question.",
        default="",
    )
class CoTGenerate(Operator):
    """Operator that renders HUMANEVAL_PROMPT_GPT for a problem and fills a GenerateOp node."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        super().__init__(name, llm)

    async def __call__(self, problem, function_name, mode: str = None):
        """Ask the LLM for a solution to `problem`.

        `mode` is forwarded to `ActionNode.fill` only when it is truthy.
        Returns whatever `instruct_content.model_dump()` produces
        (presumably a dict with a "solution" key -- confirm against ActionNode).
        """
        request = dict(
            context=HUMANEVAL_PROMPT_GPT.format(question=problem),
            llm=self.llm,
            function_name=function_name,
        )
        if mode:
            request.update(mode=mode)
        filled = await ActionNode.from_pydantic(GenerateOp).fill(**request)
        return filled.instruct_content.model_dump()
class CoTSolveGraph(SolveGraph):
    """Single-step graph: one CoTGenerate call per problem."""

    def __init__(self, name: str, llm_config, dataset: str):
        super().__init__(name, llm_config, dataset)
        self.cot_generate = CoTGenerate(self.llm)

    async def __call__(self, problem, function_name):
        """Return `(solution, total_cost)` for one problem."""
        generated = await self.cot_generate(problem, function_name, mode="code_fill")
        code = generated["solution"]
        return code, self.llm.cost_manager.total_cost
if __name__ == "__main__":
    import asyncio

    async def main():
        """Evaluate the CoT baseline on HumanEval and return the score."""
        # Other model keys previously used here: "gpt-4o-mini", "gpt-35-turbo-1106", "gpt-4o".
        model_cfg = ModelsConfig.default().get("deepseek-chat")
        solver = CoTSolveGraph(name="CoT", llm_config=model_cfg, dataset="HumanEval")
        return await humaneval_evaluation(
            solver,
            "examples/ags/data/baseline_data/human-eval.jsonl",
            33,  # 33/131
            "examples/ags/data/baselines/general/humaneval",
            test=True,
        )

    asyncio.run(main())

View file

@ -1,52 +0,0 @@
import asyncio
from examples.ags.benchmark.humaneval import sample_generate, samples_generate
# Run one sample for problem "HumanEval/0" in "llm" mode
# (result_path is presumably the output file -- confirm in sample_generate).
asyncio.run(sample_generate("HumanEval/0", result_path="llm_based_1000.jsonl", mode="llm"))
# Run the batch generator in "alpha_codium" mode; each asyncio.run call uses its own event loop.
asyncio.run(samples_generate(mode="alpha_codium", result_path="alpha_based_1000.jsonl"))
# asyncio.run(sample_generate('HumanEval/140',result_path="llm_based_1000.jsonl",mode="llm"))
# asyncio.run(sample_generate('HumanEval/140',result_path="llm_based_1000.jsonl",mode="llm"))
# asyncio.run(sample_generate('HumanEval/67',result_path="llm_based_1000.jsonl",mode="llm"))
# asyncio.run(sample_generate('HumanEval/108',result_path="llm_based_1000.jsonl",mode="llm"))
# asyncio.run(sample_generate('HumanEval/110',result_path="llm_based_1000.jsonl",mode="llm"))
# asyncio.run(samples_generate(mode='alpha',result_path="alpha_based_108.jsonl"))
# sort_json_by_key("alpha_based_108.jsonl", "alpha_based_108.jsonl")
# 64 84 160 148 109
# result_path = "ags_based_6.jsonl"
# if automatic_evalplus(result_path):
# unpassed_exapmle = extract_failure_tests(result_path[:-6]+"_eval_results.json")
# print(unpassed_exapmle)
# unpassed_exapmle = extract_failure_tests(file_path="2_eval_results.json")
# print(unpassed_exapmle)
# for example in failure_list:
# asyncio.run(sample_generate(example))
# id_list = [87, 95, 107, 112, 127, 136, 148, 155]
# id_list = [155]
# cases_id = [f"HumanEval/{case_id}" for case_id in id_list]
# cases = {case_id: get_human_eval_plus()[case_id]['prompt'] for case_id in cases_id}
# async def main(cases):
# try:
# tasks = [llm_extract_test_case(case_id, case) for case_id, case in cases.items()]
# results = await asyncio.gather(*tasks)
# except:
# failed_tasks = [task_id for task_id in results if task_id is not None]
# print(failed_tasks)
# return results
# asyncio.run(main(cases))
# [72, 80, 82, 87, 90, 95, 107, 109, 112, 124, 126, 127, 128, 132, 134, 136, 137, 138, 148, 154, 155]
# TODO: a code change in one place caused the Solution to go missing
# case_prompt= get_human_eval_plus()["HumanEval/140"]['prompt']
# solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=1)
# result = asyncio.run(solver.alpha_codium(problem_id="HumanEval/140", problem=case_prompt, ensemble_count=1))
# 1. The Public Test dataset is wrong
# 2. Revise the specific content of the two prompts
# 3. Try adding the ability to revise the solution after a test failure

File diff suppressed because it is too large Load diff

View file

@ -1,53 +0,0 @@
from examples.ags.scripts.operator import Operator
from examples.ags.scripts.graph import SolveGraph
from examples.ags.benchmark.humaneval import humaneval_evaluation
from examples.ags.scripts.operator_an import GenerateOp
from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.llm import LLM
from pydantic import BaseModel, Field
# Prompt template used by Generate: `{question}` is replaced with the problem text,
# and the model is asked for a direct answer with no extra test cases.
HUMANEVAL_PROMPT_IO = """
{question}\nGenerate an answer to this question, without any additional test cases.
"""
# Output schema handed to ActionNode.from_pydantic: a single text field that
# carries the generated solution. (Kept as `#` comments rather than a class
# docstring so the model's schema description stays untouched.)
class GenerateOp(BaseModel):
    solution: str = Field(
        description="Python Solution For This Question.",
        default="",
    )
class Generate(Operator):
    """Operator that renders HUMANEVAL_PROMPT_IO for a problem and fills a GenerateOp node."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        super().__init__(name, llm)

    async def __call__(self, problem, function_name, mode: str = None):
        """Ask the LLM for a solution to `problem`.

        `mode` is forwarded to `ActionNode.fill` only when it is truthy.
        Returns whatever `instruct_content.model_dump()` produces
        (presumably a dict with a "solution" key -- confirm against ActionNode).
        """
        request = dict(
            context=HUMANEVAL_PROMPT_IO.format(question=problem),
            llm=self.llm,
            function_name=function_name,
        )
        if mode:
            request.update(mode=mode)
        filled = await ActionNode.from_pydantic(GenerateOp).fill(**request)
        return filled.instruct_content.model_dump()
class IOSolveGraph(SolveGraph):
    """Single-step graph: one Generate call per problem."""

    def __init__(self, name: str, llm_config, dataset: str):
        super().__init__(name, llm_config, dataset)
        # The attribute keeps its original name even though it holds the IO generator.
        self.cot_generate = Generate(self.llm)

    async def __call__(self, problem, function_name):
        """Return `(solution, total_cost)` for one problem."""
        generated = await self.cot_generate(problem, function_name, mode="code_fill")
        code = generated["solution"]
        return code, self.llm.cost_manager.total_cost
if __name__ == "__main__":
    import asyncio

    async def main():
        """Evaluate the IO baseline on HumanEval and return the score."""
        # Other model keys previously used here: "gpt-4o-mini", "gpt-4o", "deepseek-chat".
        model_cfg = ModelsConfig.default().get("claude-3-5-sonnet-20240620")
        solver = IOSolveGraph(name="Io", llm_config=model_cfg, dataset="HumanEval")
        return await humaneval_evaluation(
            solver,
            "examples/ags/data/baseline_data/human-eval.jsonl",
            33,  # 33/131
            "examples/ags/data/baselines/general/humaneval",
            test=True,
        )

    asyncio.run(main())

View file

@ -1,127 +0,0 @@
from examples.ags.scripts.operator import Operator
from examples.ags.scripts.graph import SolveGraph
from examples.ags.benchmark.humaneval import humaneval_evaluation
from examples.ags.scripts.operator_an import GenerateOp
from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.llm import LLM
from pydantic import BaseModel, Field
from typing import Dict, Any, List, Tuple
from collections import Counter
import random
# Prompt template used by CoTGenerate: `{question}` is replaced with the problem text.
HUMANEVAL_PROMPT_GPT = """
{question}\nPlease provide a step-by-step explanation in text, followed by your Python function without any additional text or test cases.
"""
# Prompt template used by MdEnsemble: `{question}` is the problem text and `{solutions}`
# is the lettered list of candidate solutions the model must choose from.
MD_ENSEMBLE_PROMPT = """
Given the question described as follows: {question}
Several solutions have been generated to address the given question. They are as follows:
{solutions}
Carefully evaluate these solutions and identify the solution that is more capable of solving the problem compared to other solutions, as this is crucial for problem-solving.
In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the solution. Do not include any additional text or explanation in the "solution_letter" field.
"""
# Output schema handed to ActionNode.from_pydantic: a single text field that
# carries the generated solution. (Kept as `#` comments rather than a class
# docstring so the model's schema description stays untouched.)
class GenerateOp(BaseModel):
    solution: str = Field(
        description="Python Solution For This Question.",
        default="",
    )
class CoTGenerate(Operator):
    """Operator that renders HUMANEVAL_PROMPT_GPT for a problem and fills a GenerateOp node."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        super().__init__(name, llm)

    async def __call__(self, problem, function_name, mode: str = None):
        """Ask the LLM for a solution to `problem`.

        `mode` is forwarded to `ActionNode.fill` only when it is truthy.
        Returns whatever `instruct_content.model_dump()` produces
        (presumably a dict with a "solution" key -- confirm against ActionNode).
        """
        request = dict(
            context=HUMANEVAL_PROMPT_GPT.format(question=problem),
            llm=self.llm,
            function_name=function_name,
        )
        if mode:
            request.update(mode=mode)
        filled = await ActionNode.from_pydantic(GenerateOp).fill(**request)
        return filled.instruct_content.model_dump()
# Output schema for one ensemble vote: the model's reasoning plus the letter
# of the candidate it picked. (Documented with `#` comments rather than a
# class docstring so the model's schema description stays untouched.)
class MdEnsembleOp(BaseModel):
    thought: str = Field(
        description="Step-by-step analysis of the solutions to determine the best one.",
        default="",
    )
    solution_letter: str = Field(
        description="The letter of the chosen best solution (only one letter).",
        default="",
    )
class MdEnsemble(Operator):
    """
    Paper: Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine
    Link: https://arxiv.org/abs/2311.16452

    Asks the LLM `vote_count` times to pick the best of several candidate
    solutions, shuffling the candidate order before every vote, and returns
    the candidate that collected the most votes.
    """

    def __init__(self, llm: LLM, name: str = "MdEnsemble", vote_count: int = 5):
        super().__init__(name, llm)
        self.vote_count = vote_count

    @staticmethod
    def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, int]]:
        """Shuffle a copy of `solutions` and map each display letter to an original index.

        Returns:
            (shuffled_solutions, answer_mapping) where `answer_mapping["A"]` is the
            index in `solutions` of the candidate shown as "A", and so on.
        """
        shuffled_solutions = solutions.copy()
        random.shuffle(shuffled_solutions)
        # list.index returns the first match, so identical candidates share one
        # index and their votes are pooled instead of being split.
        answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)}
        return shuffled_solutions, answer_mapping

    async def __call__(self, solutions: List[str], problem: str, mode: str = None):
        """Vote `vote_count` times over `solutions` and return the winner.

        Args:
            solutions: candidate solutions; must be non-empty (an empty list
                raises IndexError, as before).
            problem: the problem text inserted into MD_ENSEMBLE_PROMPT.
            mode: forwarded to `ActionNode.fill` only when truthy.

        Returns:
            {"solution": <winning candidate>}. When no vote yields a valid
            letter, the first candidate is returned instead of crashing.
        """
        print(f"solution count: {len(solutions)}")
        all_responses = []

        for _ in range(self.vote_count):
            shuffled_solutions, answer_mapping = self.shuffle_answers(solutions)
            solution_text = "".join(
                f"{chr(65 + index)}: \n{str(solution)}\n\n\n"
                for index, solution in enumerate(shuffled_solutions)
            )

            prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem)
            fill_kwargs = {"context": prompt, "llm": self.llm}
            if mode:
                fill_kwargs["mode"] = mode
            node = await ActionNode.from_pydantic(MdEnsembleOp).fill(**fill_kwargs)
            response = node.instruct_content.model_dump()

            answer = response.get("solution_letter", "A")
            answer = answer.strip().upper()

            # Votes that are not one of the offered letters are discarded.
            if answer in answer_mapping:
                all_responses.append(answer_mapping[answer])

        if all_responses:
            most_frequent_index = Counter(all_responses).most_common(1)[0][0]
        else:
            # Every vote was invalid (or vote_count == 0): most_common(1) would be
            # empty and indexing it raised IndexError. Fall back to the first candidate.
            most_frequent_index = 0
        final_answer = solutions[most_frequent_index]
        return {"solution": final_answer}
class MedPromptGraph(SolveGraph):
    """Graph that samples several CoT solutions and lets MdEnsemble vote for the best one."""

    def __init__(self, name: str, llm_config, dataset: str, vote_count: int = 5, sample_count: int = 3):
        """
        Args:
            name, llm_config, dataset: forwarded unchanged to SolveGraph.
            vote_count: number of voting rounds MdEnsemble runs.
            sample_count: number of candidate solutions generated per problem
                (defaults to 3, the value that was previously hard-coded).
        """
        super().__init__(name, llm_config, dataset)
        self.sample_count = sample_count
        self.cot_generate = CoTGenerate(self.llm)
        self.md_ensemble = MdEnsemble(self.llm, vote_count=vote_count)

    async def __call__(self, problem, function_name):
        """Return `(solution, total_cost)` for one problem."""
        solutions = []
        # Candidates are generated sequentially, one LLM call at a time.
        for _ in range(self.sample_count):
            solution = await self.cot_generate(problem, function_name, mode="code_fill")
            solutions.append(solution["solution"])
        solution = await self.md_ensemble(solutions, problem, mode="context_fill")
        return solution["solution"], self.llm.cost_manager.total_cost
if __name__ == "__main__":
    import asyncio

    async def main():
        """Evaluate the MedPrompt baseline on HumanEval and return `(score, cost)`."""
        # Other model keys previously used here: "gpt-4o-mini", "gpt-4o", "deepseek-chat".
        model_cfg = ModelsConfig.default().get("claude-3-5-sonnet-20240620")
        solver = MedPromptGraph(name="MedPrompt", llm_config=model_cfg, dataset="HumanEval", vote_count=5)
        accuracy, spend = await humaneval_evaluation(
            solver,
            "examples/ags/data/baseline_data/human-eval.jsonl",
            33,
            "examples/ags/data/baselines/general/humaneval",
            test=True,
        )
        return accuracy, spend

    asyncio.run(main())

View file

@ -1,120 +0,0 @@
from examples.ags.scripts.operator import Operator
from examples.ags.scripts.graph import SolveGraph
from examples.ags.benchmark.humaneval import humaneval_evaluation
from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.llm import LLM
from pydantic import BaseModel, Field
from typing import List
# First-round prompt for a DebateAgent (used when no context from other agents is given).
DEBATE_INITIAL_PROMPT = """
{question}\nPlease provide a step-by-step explanation in text, followed by your Python function without any additional text or test cases.
"""
# Later-round prompt for a DebateAgent; the other agents' thinking is appended after it.
DEBATE_PROMPT = """
{question}
Considering the solutions provided by other agents as additional suggestions. Please think carefully and provide an updated python function without any additional text or test cases.
"""
# Prompt for FinalDecisionAgent: `{all_thinking}` and `{all_answers}` are the newline-joined
# thinking and answers collected from the debate agents.
FINAL_DECISION_PROMPT = """
{question}
Considering all the thinking processes and answers:
{all_thinking}
{all_answers}
Please reason carefully and provide the final answer. Make sure the code output is wrapped with ```python``` without any additional text or test cases.
"""
# Output schema for one debate turn: the agent's reasoning and its answer.
# (Documented with `#` comments rather than a class docstring so the model's
# schema description stays untouched.)
class DebateOp(BaseModel):
    thinking: str = Field(
        description="think",
        default="",
    )
    answer: str = Field(
        description="answer",
        default="",
    )
# Output schema for the final decision step: a single text field with the chosen solution.
# (Documented with `#` comments rather than a class docstring so the model's
# schema description stays untouched.)
class FinalDecisionOp(BaseModel):
    solution: str = Field(
        description="final answer",
        default="",
    )
class DebateAgent(Operator):
    """Operator that answers a problem while speaking as a fixed persona (`role`)."""

    def __init__(self, llm: LLM, name: str, role: str):
        super().__init__(name, llm)
        self.role = role

    async def __call__(self, problem: str, function_name: str, context: List[str] = None, mode: str = None):
        """Produce one debate turn.

        With `context=None` the initial prompt is used; otherwise the debate
        prompt is used and the newline-joined `context` entries are appended.
        `mode` is forwarded to `ActionNode.fill` only when truthy.
        Returns the `model_dump()` of the filled DebateOp node.
        """
        persona = f"You are a {self.role}. Based on your professional knowledge and thinking style,"
        if context is not None:
            body = DEBATE_PROMPT.format(question=problem) + "\n".join(context)
        else:
            body = DEBATE_INITIAL_PROMPT.format(question=problem)

        request = dict(context=persona + body, llm=self.llm, function_name=function_name)
        if mode:
            request.update(mode=mode)
        filled = await ActionNode.from_pydantic(DebateOp).fill(**request)
        return filled.instruct_content.model_dump()
class FinalDecisionAgent(Operator):
    """Operator that merges all debate output into one final solution."""

    def __init__(self, llm: LLM, name: str = "FinalDecision"):
        super().__init__(name, llm)

    async def __call__(self, problem: str, function_name, all_thinking: List[str], all_answers: List[str], mode: str = None):
        """Fill a FinalDecisionOp node from the collected thinking and answers.

        `all_thinking` and `all_answers` are each joined with newlines before
        being inserted into FINAL_DECISION_PROMPT. `mode` is forwarded to
        `ActionNode.fill` only when truthy.
        """
        thinking_block = "\n".join(all_thinking)
        answers_block = "\n".join(all_answers)
        request = dict(
            context=FINAL_DECISION_PROMPT.format(
                question=problem,
                all_thinking=thinking_block,
                all_answers=answers_block,
            ),
            llm=self.llm,
            function_name=function_name,
        )
        if mode:
            request.update(mode=mode)
        filled = await ActionNode.from_pydantic(FinalDecisionOp).fill(**request)
        return filled.instruct_content.model_dump()
class MultiPersonaGraph(SolveGraph):
    """Graph that runs a two-round debate between three personas, then a final decision step."""

    def __init__(self, name: str, llm_config, dataset: str):
        super().__init__(name, llm_config, dataset)
        self.debate_agents = [
            DebateAgent(self.llm, f"Debate Agent {i}", role)
            for i, role in enumerate([
                'Innovative CS Thinker - ICPC Competitor',
                'Critical Reasoning Expert - Math Professor',
                'Computational Thinking Specialist - Computer Science Researcher'
            ])
        ]
        self.final_decision_agent = FinalDecisionAgent(self.llm)

    async def __call__(self, problem, function_name):
        """Return `(solution, total_cost)` for one problem.

        Round 0 asks every agent independently. In each later round an agent
        sees its own previous thinking followed by the other agents' previous
        thinking. The last round's thinking and answers go to the final
        decision agent.
        """
        max_round = 2
        all_thinking = [[] for _ in range(max_round)]
        all_answers = [[] for _ in range(max_round)]

        for r in range(max_round):
            for i, agent in enumerate(self.debate_agents):
                if r == 0:
                    result = await agent(problem, function_name, mode="context_fill")
                else:
                    context = [f"{agent.role}'s previous round thinking: {all_thinking[r-1][i]}"] + \
                              [f"{self.debate_agents[j].role}'s thinking: {all_thinking[r-1][j]}" for j in range(len(self.debate_agents)) if j != i]
                    # Fix: `context` used to be passed positionally in the
                    # `function_name` slot, so the agent got a list as the function
                    # name and no debate context (it silently reused the initial prompt).
                    result = await agent(problem, function_name, context=context, mode="context_fill")
                all_thinking[r].append(result["thinking"])
                all_answers[r].append(result["answer"])

        final_result = await self.final_decision_agent(
            problem,
            function_name,
            [f"{agent.role}'s final thinking: {thinking}" for agent, thinking in zip(self.debate_agents, all_thinking[-1])],
            [f"{agent.role}'s final answer: {answer}" for agent, answer in zip(self.debate_agents, all_answers[-1])],
            mode="code_fill"
        )
        return final_result['solution'], self.llm.cost_manager.total_cost
if __name__ == "__main__":
    import asyncio

    async def main():
        """Evaluate the multi-persona baseline on HumanEval and return `(score, cost)`."""
        # Other model keys previously used here: "gpt-4o-mini", "gpt-4o", "deepseek-chat".
        model_cfg = ModelsConfig.default().get("claude-3-5-sonnet-20240620")
        solver = MultiPersonaGraph(name="multi-persona", llm_config=model_cfg, dataset="HumanEval")
        accuracy, spend = await humaneval_evaluation(
            solver,
            "examples/ags/data/baseline_data/human-eval.jsonl",
            33,
            "examples/ags/data/baselines/general/humaneval",
            test=True,
        )
        return accuracy, spend

    asyncio.run(main())

File diff suppressed because one or more lines are too long

View file

@ -1,105 +0,0 @@
from examples.ags.scripts.operator import Operator
from examples.ags.scripts.graph import SolveGraph
from examples.ags.benchmark.humaneval import humaneval_evaluation
from examples.ags.scripts.operator_an import GenerateOp
from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.llm import LLM
from pydantic import BaseModel, Field
from typing import List
# Prompt template for code generation. `{question}` is filled with the problem
# text by CoTGenerate.__call__.
HUMANEVAL_PROMPT_GPT = """
{question}\nPlease provide a step-by-step explanation in text, followed by your Python function without any additional text or test cases.
"""
# Prompt template for the self-consistency vote. ScEnsemble.__call__ fills
# `{question}` with the problem and `{solutions}` with the letter-labelled
# candidate solutions.
SC_ENSEMBLE_PROMPT = """
Given the question described as follows: {question}
Several solutions have been generated to address the given question. They are as follows:
{solutions}
Carefully evaluate these solutions and identify the answer that appears most frequently across them. This consistency in answers is crucial for determining the most reliable solution.
In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the most consistent solution. Do not include any additional text or explanation in the "solution_letter" field.
"""
class GenerateOp(BaseModel):
    # Output schema used by CoTGenerate: a single text field with the generated code.
    # NOTE(review): a `GenerateOp` is also imported from
    # `examples.ags.scripts.operator_an` above; this definition rebinds that name.
    solution: str = Field(
        default="",
        description="Python Solution For This Question.",
    )
class CoTGenerate(Operator):
    """Operator that asks the LLM for an explanation followed by a Python function."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        super().__init__(name, llm)

    async def __call__(self, problem, function_name, mode: str = None):
        """Fill a `GenerateOp` node for `problem` and return its fields as a dict.

        `mode` is forwarded to `ActionNode.fill` only when it is truthy.
        """
        request = {
            "context": HUMANEVAL_PROMPT_GPT.format(question=problem),
            "llm": self.llm,
            "function_name": function_name,
        }
        if mode:
            request["mode"] = mode
        filled = await ActionNode.from_pydantic(GenerateOp).fill(**request)
        return filled.instruct_content.model_dump()
class ScEnsembleOp(BaseModel):
    # Output schema for the self-consistency vote performed by ScEnsemble.

    # Free-text reasoning behind the vote.
    thought: str = Field(
        default="",
        description="The thought of the most consistent solution.",
    )
    # Letter label (A, B, C, ...) of the chosen candidate.
    solution_letter: str = Field(
        default="",
        description="The letter of most consistent solution.",
    )
class ScEnsemble(Operator):
    """
    Ask the LLM to pick the most consistent solution among several candidates.

    Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
    Link: https://arxiv.org/abs/2203.11171
    Paper: Universal Self-Consistency for Large Language Model Generation
    Link: https://arxiv.org/abs/2311.17311
    """

    def __init__(self, llm, name: str = "ScEnsemble"):
        super().__init__(name, llm)

    async def __call__(self, solutions: List[str], problem: str, mode: str = None):
        """Label the candidates A, B, C, ... and return the one the LLM selects.

        Args:
            solutions: candidate solutions; each is rendered with ``str()``.
            problem: the question text inserted into the prompt.
            mode: forwarded to ``ActionNode.fill`` only when truthy.

        Returns:
            ``{"solution": <selected element of solutions>}``. If the model's
            ``solution_letter`` is missing, empty, or not one of the generated
            labels, the first candidate ("A") is returned instead of raising
            ``KeyError``.
        """
        answer_mapping = {}
        solution_text = ""
        for index, solution in enumerate(solutions):
            label = chr(65 + index)
            answer_mapping[label] = index
            solution_text += f"{label}: \n{str(solution)}\n\n\n"

        prompt = SC_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem)
        fill_kwargs = {"context": prompt, "llm": self.llm}
        if mode:
            fill_kwargs["mode"] = mode
        node = await ActionNode.from_pydantic(ScEnsembleOp).fill(**fill_kwargs)
        response = node.instruct_content.model_dump()

        answer = (response.get("solution_letter") or "").strip().upper()
        if answer not in answer_mapping:
            # The schema default is "" and the model may return text that is not a
            # bare label; fall back to the first candidate as the original
            # `.get(..., "A")` default intended.
            answer = "A"
        return {"solution": solutions[answer_mapping[answer]]}
class SelfConsistencyGraph(SolveGraph):
    """Generate five candidate solutions and let ScEnsemble choose among them."""

    def __init__(self, name: str, llm_config, dataset: str):
        super().__init__(name, llm_config, dataset)
        self.cot_generate = CoTGenerate(llm=self.llm)
        self.sc_ensemble = ScEnsemble(llm=self.llm)

    async def __call__(self, problem, function_name):
        """Return ``(selected_solution, total_cost)`` for one problem."""
        # Candidates are generated one after another, not concurrently.
        candidates = [
            (await self.cot_generate(problem, function_name, mode="code_fill"))["solution"]
            for _ in range(5)
        ]
        chosen = await self.sc_ensemble(candidates, problem, mode="context_fill")
        return chosen["solution"], self.llm.cost_manager.total_cost
if __name__ == "__main__":
    import asyncio

    async def main():
        """Evaluate the self-consistency graph on the HumanEval baseline data."""
        # Alternative model configurations:
        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
        llm_config = ModelsConfig.default().get("deepseek-chat")
        # llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620")
        graph = SelfConsistencyGraph(name="SelfConsistency", llm_config=llm_config, dataset="HumanEval")

        dataset_file = "examples/ags/data/baseline_data/human-eval.jsonl"
        sample_count = 33  # 33/131
        output_dir = "examples/ags/data/baselines/general/humaneval"

        # NOTE(review): the sibling baseline scripts unpack `score, cost` from this
        # call; here the whole return value is bound to `score` -- confirm intent.
        score = await humaneval_evaluation(graph, dataset_file, sample_count, output_dir, test=True)
        return score

    asyncio.run(main())

View file

@ -1,122 +0,0 @@
from examples.ags.scripts.operator import Operator
from examples.ags.scripts.graph import SolveGraph
from examples.ags.benchmark.humaneval import humaneval_evaluation
from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.llm import LLM
from pydantic import BaseModel, Field
from typing import Dict, Any
# Prompt template for the initial generation step. `{question}` is filled with
# the problem text by CoTGenerate.__call__.
HUMANEVAL_PROMPT_GPT = """
{question}\nPlease provide a step-by-step explanation in text, followed by your Python function without any additional text or test cases.
"""
# Prompt template for the review step. Review.__call__ fills `{problem}` and
# `{solution}`.
REVIEW_PROMPT = """
Given a problem and a thoughtful solution, your task is to using critical thinking (questioning) to review the solution's correctness and provide a review result in boolean format.
problem: {problem}
solution: {solution}
If you are more than 95 percent confident that the final answer is incorrect, please return False and give a feedback for the error. Otherwise, please return True and give a explanation for the correctness.
"""
# Prompt template for the revision step. Revise.__call__ fills `{problem}`,
# `{solution}` and `{feedback}`.
REVISE_PROMPT = """
Given a problem and a thoughtful solution which is just reviewed as incorrect, your task is to revise the solution to solve the question and ensure the final code solution is wrapped with ```python```.
problem: {problem}
solution: {solution}
feedback: {feedback}
Ensure the output code is self-contained, and without any additional text or test cases.
"""
class GenerateOp(BaseModel):
    # Output schema for the initial generation step (used by CoTGenerate).
    solution: str = Field(
        default="",
        description="solution for the problem",
    )
class ReviewOp(BaseModel):
    # Output schema for the review step (used by Review).

    # Verdict of the review; defaults to False when the model supplies nothing.
    review_result: bool = Field(
        description="The Review Result (Bool). If you think this solution looks good for you, return 'true'; If not, return 'false'",
        default=False,
    )
    # Free-text feedback accompanying the verdict.
    feedback: str = Field(
        description="Your FeedBack for this problem based on the criteria. If the review result is true, you can put it 'nothing here'.",
        default="",
    )
class ReviseOp(BaseModel):
    # Output schema for the revision step (used by Revise).
    solution: str = Field(
        default="",
        description="Based on the feedback, revised solution for this problem",
    )
class CoTGenerate(Operator):
    """Operator producing the first draft: an explanation followed by a Python function."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        super().__init__(name, llm)

    async def __call__(self, problem, function_name, mode: str = None):
        """Fill a `GenerateOp` node for `problem` and return its fields as a dict.

        `mode` is forwarded to `ActionNode.fill` only when it is truthy.
        """
        request = {
            "context": HUMANEVAL_PROMPT_GPT.format(question=problem),
            "llm": self.llm,
            "function_name": function_name,
        }
        if mode:
            request["mode"] = mode
        filled = await ActionNode.from_pydantic(GenerateOp).fill(**request)
        return filled.instruct_content.model_dump()
class Review(Operator):
    """Operator that asks the LLM to judge a solution and give feedback."""

    def __init__(self, llm: LLM, name: str = "Review"):
        super().__init__(name, llm)

    async def __call__(self, problem, solution, mode: str = None):
        """Fill a `ReviewOp` node and return its fields as a dict.

        `mode` is forwarded to `ActionNode.fill` only when it is truthy.
        """
        request = {
            "context": REVIEW_PROMPT.format(problem=problem, solution=solution),
            "llm": self.llm,
        }
        if mode:
            request["mode"] = mode
        filled = await ActionNode.from_pydantic(ReviewOp).fill(**request)
        return filled.instruct_content.model_dump()
class Revise(Operator):
    """Operator that asks the LLM to rewrite a solution using review feedback."""

    def __init__(self, llm: LLM, name: str = "Revise"):
        super().__init__(name, llm)

    async def __call__(self, problem, solution, feedback, mode: str = None):
        """Fill a `ReviseOp` node and return its fields as a dict.

        `mode` is forwarded to `ActionNode.fill` only when it is truthy.
        """
        request = {
            "context": REVISE_PROMPT.format(problem=problem, solution=solution, feedback=feedback),
            "llm": self.llm,
        }
        if mode:
            request["mode"] = mode
        filled = await ActionNode.from_pydantic(ReviseOp).fill(**request)
        return filled.instruct_content.model_dump()
class SelfRefineGraph(SolveGraph):
    """Generate a solution, then review and revise it for up to three rounds."""

    def __init__(self, name: str, llm_config, dataset: str):
        # NOTE(review): this sets the temperature on the caller's config object in
        # place, so other users of the same `llm_config` see the change as well.
        llm_config.temperature = 0.0
        super().__init__(name, llm_config, dataset)
        self.cot_generate = CoTGenerate(self.llm)
        self.review = Review(self.llm)
        self.revise = Revise(self.llm)

    async def __call__(self, problem, function_name):
        """Return ``(solution, total_cost)`` for one problem.

        The operators return dicts, so the solution text is extracted before it is
        passed on. Formatting the whole dict into the review/revise prompts would
        insert its repr (``{'solution': '...'}`` with escaped newlines) instead of
        the code itself.
        """
        draft = await self.cot_generate(problem, function_name, mode="code_fill")
        code = draft["solution"]
        for _ in range(3):
            review = await self.review(problem, code, mode="context_fill")
            if review["review_result"]:
                # The reviewer accepted the current solution; stop refining.
                break
            revised = await self.revise(problem, code, review["feedback"], mode="code_fill")
            code = revised["solution"]
        return code, self.llm.cost_manager.total_cost
if __name__ == "__main__":
    import asyncio

    async def main():
        """Evaluate the self-refine graph on the HumanEval baseline data."""
        # Alternative model configurations:
        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
        # llm_config = ModelsConfig.default().get("gpt-4o")
        # llm_config = ModelsConfig.default().get("deepseek-chat")
        llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620")
        graph = SelfRefineGraph(name="self-refine", llm_config=llm_config, dataset="HumanEval")

        dataset_file = "examples/ags/data/baseline_data/human-eval.jsonl"
        sample_count = 33
        output_dir = "examples/ags/data/baselines/general/humaneval"

        score, cost = await humaneval_evaluation(graph, dataset_file, sample_count, output_dir, test=True)
        return score, cost

    asyncio.run(main())

View file

@ -1,243 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
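"# TODO: help me write code that finds the ids appearing in every one of these lists, the ids that do not appear in all of them, and the ids that appear in only a single batch (e.g. only the first or only the third)\n",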
"test_1 = [{'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/11'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/46'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/63'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/95'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n",
"test_2 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/153'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"test_3 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/83'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/97'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/128'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/149'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/163'}]\n",
"test_4 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/36'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/43'}, {'task_id': 'HumanEval/46'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/73'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/97'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/128'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/148'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/161'}, {'task_id': 'HumanEval/163'}]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Common IDs: length:25 {'HumanEval/32', 'HumanEval/130', 'HumanEval/127', 'HumanEval/100', 'HumanEval/132', 'HumanEval/115', 'HumanEval/129', 'HumanEval/75', 'HumanEval/76', 'HumanEval/108', 'HumanEval/140', 'HumanEval/119', 'HumanEval/41', 'HumanEval/155', 'HumanEval/145', 'HumanEval/93', 'HumanEval/120', 'HumanEval/111', 'HumanEval/126', 'HumanEval/163', 'HumanEval/134', 'HumanEval/142', 'HumanEval/125', 'HumanEval/137', 'HumanEval/133'}\n",
"Not Common IDs: {'HumanEval/122', 'HumanEval/6', 'HumanEval/74', 'HumanEval/81', 'HumanEval/83', 'HumanEval/90', 'HumanEval/160', 'HumanEval/36', 'HumanEval/73', 'HumanEval/94', 'HumanEval/95', 'HumanEval/14', 'HumanEval/148', 'HumanEval/29', 'HumanEval/39', 'HumanEval/64', 'HumanEval/46', 'HumanEval/102', 'HumanEval/26', 'HumanEval/153', 'HumanEval/10', 'HumanEval/161', 'HumanEval/139', 'HumanEval/11', 'HumanEval/159', 'HumanEval/54', 'HumanEval/110', 'HumanEval/131', 'HumanEval/149', 'HumanEval/1', 'HumanEval/43', 'HumanEval/128', 'HumanEval/97', 'HumanEval/118', 'HumanEval/135', 'HumanEval/77', 'HumanEval/121', 'HumanEval/154', 'HumanEval/113', 'HumanEval/63', 'HumanEval/138', 'HumanEval/91', 'HumanEval/84'}\n",
"Unique to test_1: {'HumanEval/11', 'HumanEval/154', 'HumanEval/63', 'HumanEval/90', 'HumanEval/94', 'HumanEval/95'}\n",
"Unique to test_2: {'HumanEval/139', 'HumanEval/102', 'HumanEval/160', 'HumanEval/153', 'HumanEval/138'}\n",
"Unique to test_3: {'HumanEval/149', 'HumanEval/29', 'HumanEval/83', 'HumanEval/26'}\n",
"Unique to test_4: {'HumanEval/14', 'HumanEval/148', 'HumanEval/73', 'HumanEval/36', 'HumanEval/161', 'HumanEval/43'}\n"
]
}
],
"source": [
"def extract_ids(test_list):\n",
" return set(item['task_id'] for item in test_list)\n",
"\n",
"def compare_ids(test_1, test_2, test_3, test_4):\n",
" ids_1 = extract_ids(test_1)\n",
" ids_2 = extract_ids(test_2)\n",
" ids_3 = extract_ids(test_3)\n",
" ids_4 = extract_ids(test_4)\n",
"\n",
" common_ids = ids_1 & ids_2 & ids_3 & ids_4\n",
" all_ids = ids_1 | ids_2 | ids_3 | ids_4\n",
" not_common_ids = all_ids - common_ids\n",
"\n",
" unique_1 = ids_1 - (ids_2 | ids_3 | ids_4)\n",
" unique_2 = ids_2 - (ids_1 | ids_3 | ids_4)\n",
" unique_3 = ids_3 - (ids_1 | ids_2 | ids_4)\n",
" unique_4 = ids_4 - (ids_1 | ids_2 | ids_3)\n",
"\n",
" return {\n",
" 'common_ids': common_ids,\n",
" 'not_common_ids': not_common_ids,\n",
" 'unique_1': unique_1,\n",
" 'unique_2': unique_2,\n",
" 'unique_3': unique_3,\n",
" 'unique_4': unique_4\n",
" }\n",
"\n",
"# Assuming test_1, test_2, test_3, and test_4 are defined as in the cell above\n",
"result = compare_ids(test_1, test_2, test_3, test_4)\n",
"\n",
"print(\"Common IDs:\",f\"length:{len(result['common_ids'])}\", result['common_ids'])\n",
"print(\"Not Common IDs:\",result['not_common_ids'])\n",
"print(\"Unique to test_1:\", result['unique_1'])\n",
"print(\"Unique to test_2:\", result['unique_2'])\n",
"print(\"Unique to test_3:\", result['unique_3'])\n",
"print(\"Unique to test_4:\", result['unique_4'])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"test_1 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/36'}, {'task_id': 'HumanEval/37'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/62'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/80'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/87'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/98'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/106'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/109'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/114'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/148'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"test_2 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/70'}, {'task_id': 'HumanEval/73'}, {'task_id': 'HumanEval/74'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/82'}, {'task_id': 'HumanEval/81'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/83'}, {'task_id': 'HumanEval/88'}, {'task_id': 'HumanEval/89'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/101'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/106'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/109'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/114'}, {'task_id': 'HumanEval/115'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/144'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"test_3 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/19'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/36'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/43'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/69'}, {'task_id': 'HumanEval/80'}, {'task_id': 'HumanEval/83'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/87'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/101'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/109'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"test_1 = [{'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/3'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/46'}, {'task_id': 'HumanEval/54'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/68'}, {'task_id': 'HumanEval/71'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/80'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/87'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/109'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/123'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/156'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"test_2 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/8'}, {'task_id': 'HumanEval/9'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/16'}, {'task_id': 'HumanEval/17'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/65'}, {'task_id': 'HumanEval/69'}, {'task_id': 'HumanEval/73'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/88'}, {'task_id': 'HumanEval/90'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/114'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/118'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/121'}, {'task_id': 'HumanEval/122'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/133'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/138'}, {'task_id': 'HumanEval/139'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/144'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/159'}, {'task_id': 'HumanEval/160'}, {'task_id': 'HumanEval/163'}]\n",
"test_3 = [{'task_id': 'HumanEval/0'}, {'task_id': 'HumanEval/1'}, {'task_id': 'HumanEval/5'}, {'task_id': 'HumanEval/6'}, {'task_id': 'HumanEval/7'}, {'task_id': 'HumanEval/10'}, {'task_id': 'HumanEval/12'}, {'task_id': 'HumanEval/14'}, {'task_id': 'HumanEval/20'}, {'task_id': 'HumanEval/19'}, {'task_id': 'HumanEval/22'}, {'task_id': 'HumanEval/21'}, {'task_id': 'HumanEval/25'}, {'task_id': 'HumanEval/26'}, {'task_id': 'HumanEval/28'}, {'task_id': 'HumanEval/29'}, {'task_id': 'HumanEval/32'}, {'task_id': 'HumanEval/33'}, {'task_id': 'HumanEval/37'}, {'task_id': 'HumanEval/39'}, {'task_id': 'HumanEval/41'}, {'task_id': 'HumanEval/49'}, {'task_id': 'HumanEval/63'}, {'task_id': 'HumanEval/64'}, {'task_id': 'HumanEval/68'}, {'task_id': 'HumanEval/75'}, {'task_id': 'HumanEval/76'}, {'task_id': 'HumanEval/77'}, {'task_id': 'HumanEval/80'}, {'task_id': 'HumanEval/84'}, {'task_id': 'HumanEval/87'}, {'task_id': 'HumanEval/89'}, {'task_id': 'HumanEval/91'}, {'task_id': 'HumanEval/93'}, {'task_id': 'HumanEval/94'}, {'task_id': 'HumanEval/98'}, {'task_id': 'HumanEval/100'}, {'task_id': 'HumanEval/99'}, {'task_id': 'HumanEval/102'}, {'task_id': 'HumanEval/108'}, {'task_id': 'HumanEval/110'}, {'task_id': 'HumanEval/111'}, {'task_id': 'HumanEval/113'}, {'task_id': 'HumanEval/116'}, {'task_id': 'HumanEval/120'}, {'task_id': 'HumanEval/119'}, {'task_id': 'HumanEval/124'}, {'task_id': 'HumanEval/126'}, {'task_id': 'HumanEval/125'}, {'task_id': 'HumanEval/127'}, {'task_id': 'HumanEval/129'}, {'task_id': 'HumanEval/131'}, {'task_id': 'HumanEval/132'}, {'task_id': 'HumanEval/130'}, {'task_id': 'HumanEval/134'}, {'task_id': 'HumanEval/135'}, {'task_id': 'HumanEval/137'}, {'task_id': 'HumanEval/136'}, {'task_id': 'HumanEval/140'}, {'task_id': 'HumanEval/142'}, {'task_id': 'HumanEval/145'}, {'task_id': 'HumanEval/148'}, {'task_id': 'HumanEval/153'}, {'task_id': 'HumanEval/154'}, {'task_id': 'HumanEval/155'}, {'task_id': 'HumanEval/156'}, {'task_id': 'HumanEval/159'}, {'task_id': 
'HumanEval/160'}, {'task_id': 'HumanEval/163'}]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"from typing import List\n",
"\n",
"\n",
"def has_close_elements(numbers: List[float], threshold: float) -> bool:\n",
" \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n",
" given threshold.\n",
" >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n",
" False\n",
" >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n",
" True\n",
" \"\"\"\n",
" for i in range(len(numbers)):\n",
" for j in range(i + 1, len(numbers)):\n",
" if abs(numbers[i] - numbers[j]) < threshold:\n",
" return True\n",
" return False\n",
"def has_close_elements(numbers: List[float], threshold: float) -> bool:\n",
" \n",
" # Sort the numbers in ascending order\n",
" numbers.sort()\n",
" \n",
" # Iterate through the numbers and check the difference between adjacent numbers\n",
" for i in range(len(numbers) - 1):\n",
" if abs(numbers[i] - numbers[i+1]) < threshold:\n",
" return True\n",
" \n",
" # If no adjacent numbers are closer than the threshold, return False\n",
" return False\n",
"\n",
"\n",
" sorted_numbers = sorted(numbers)\n",
" for i in range(len(sorted_numbers) - 1):\n",
" if sorted_numbers[i + 1] - sorted_numbers[i] < threshold:\n",
" return True\n",
" return False\n",
"\n",
"\n",
"\n",
"\n",
"METADATA = {\n",
" 'author': 'jt',\n",
" 'dataset': 'test'\n",
"}\n",
"\n",
"\n",
"def check(candidate):\n",
" assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n",
" assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n",
" assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n",
" assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n",
" assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n",
" assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n",
" assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n",
"\n",
"\n"
]
},
{
"data": {
"text/plain": [
"'\\n assert isinstance(threshold, float) and threshold > 0, \"invalid inputs\" # $_CONTRACT_$\\n assert isinstance(numbers, list), \"invalid inputs\" # $_CONTRACT_$\\n assert all([isinstance(v, (int, float)) for v in numbers]), \"invalid inputs\" # $_CONTRACT_$\\n'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from evalplus.data import get_human_eval_plus, write_jsonl\n",
"\n",
"humaneval = get_human_eval_plus()\n",
"d_result = {\"task_id\": \"HumanEval/0\", \"solution\": \"from typing import List\\n\\n\\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\\n \\\"\\\"\\\" Check if in given list of numbers, are any two numbers closer to each other than\\n given threshold.\\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\\n False\\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\\n True\\n \\\"\\\"\\\"\\n for i in range(len(numbers)):\\n for j in range(i + 1, len(numbers)):\\n if abs(numbers[i] - numbers[j]) < threshold:\\n return True\\n return False\"}\n",
"result = {\"task_id\": \"HumanEval/0\", \"solution\": \"def has_close_elements(numbers: List[float], threshold: float) -> bool:\\n \\n # Sort the numbers in ascending order\\n numbers.sort()\\n \\n # Iterate through the numbers and check the difference between adjacent numbers\\n for i in range(len(numbers) - 1):\\n if abs(numbers[i] - numbers[i+1]) < threshold:\\n return True\\n \\n # If no adjacent numbers are closer than the threshold, return False\\n return False\"}\n",
"print(d_result[\"solution\"])\n",
"print(result[\"solution\"])\n",
"d_result = \n",
"result = \n",
"print(d_result[\"solution\"])\n",
"print(result[\"solution\"])\n",
"print(humaneval['HumanEval/0']['canonical_solution'])\n",
"print(humaneval['HumanEval/0']['test'])\n",
"humaneval['HumanEval/0']['contract']\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from typing import List\n",
"def has_close_elements(numbers: List[float], threshold: float) -> bool:\n",
" \n",
" # Sort the numbers in ascending order\n",
" numbers.sort()\n",
" \n",
" # Iterate through the numbers and check the difference between adjacent numbers\n",
" for i in range(len(numbers) - 1):\n",
" if abs(numbers[i] - numbers[i+1]) < threshold:\n",
" return True\n",
" \n",
" # If no adjacent numbers are closer than the threshold, return False\n",
" return False\n",
"\n",
"def check(candidate):\n",
" assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n",
" assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n",
" assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n",
" assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n",
" assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n",
" assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n",
" assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n",
"\n",
"check(has_close_elements)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

51
test.py
View file

@ -1,51 +0,0 @@
import asyncio
from pydantic import BaseModel, Field
from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.provider.llm_provider_registry import create_llm_instance
from metagpt.utils.cost_manager import CostManager
# Module-level LLM setup: look up each model's entry in the default
# ModelsConfig and build an LLM instance from it.
# NOTE(review): assumes both model names are present in the models config —
# confirm what `.get()` returns for a missing name before relying on this.
deepseek_llm_config = ModelsConfig.default().get("deepseek-coder")
deepseek_llm = create_llm_instance(deepseek_llm_config)
# Give the DeepSeek instance its own CostManager; main() prints its total_cost.
deepseek_llm.cost_manager = CostManager()
# The Claude instance is built the same way, but no CostManager is assigned to
# it here and nothing in the visible part of this script uses it.
claude_llm_config = ModelsConfig.default().get("claude-3.5-sonnet")
claude_llm = create_llm_instance(claude_llm_config)
# TODO: figure out how to create a fresh instance every time, so that the cost
# counting stays consistent from run to run.
# llm.cost_manager = data.llm.cost_manager
# Schema passed to ActionNode.from_pydantic() in main(). Both fields are
# strings that default to the empty string.
class GenerateCodeSolution(BaseModel):
    # The solution text itself.
    solution: str = Field(
        description="A description of the solution",
        default="",
    )

    # Short justification of why the solution is correct.
    thought: str = Field(
        description="Shortly explain why this solution correctly solves the problem. Be specific and detailed regarding the problem rules and goals.",
        default="",
    )
# Prompt template used by main(); the `{problem_description}` placeholder is
# filled in with str.format().
GENERATE_ON_CONTEXT_PROMPT = """
Please generate a solution for the following problem based on the provided context:
### Problem Description
{problem_description}
"""
async def main():
    """Run one fixed word problem through the DeepSeek LLM.

    Formats GENERATE_ON_CONTEXT_PROMPT with a hard-coded problem statement,
    fills an ActionNode built from GenerateCodeSolution using ``deepseek_llm``
    in ``"context_fill"`` mode, prints the ``total_cost`` recorded by that
    LLM's cost manager, and returns ``instruct_content.model_dump()`` of the
    filled node.
    """
    question = "Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"
    filled_prompt = GENERATE_ON_CONTEXT_PROMPT.format(problem_description=question)

    # Build the node from the pydantic schema, then let the LLM fill it.
    solution_node = ActionNode.from_pydantic(GenerateCodeSolution)
    filled_node = await solution_node.fill(
        context=filled_prompt,
        llm=deepseek_llm,
        mode="context_fill",
    )

    # Dump the result before reporting cost, matching the original ordering.
    payload = filled_node.instruct_content.model_dump()
    print(deepseek_llm.cost_manager.total_cost)
    return payload
if __name__ == "__main__":
    # Script entry point: run the coroutine to completion and echo its result.
    outcome = asyncio.run(main())
    print(outcome)

View file

@ -1,168 +0,0 @@
filter_integers
incr_list
string_sequence
greatest_common_divisor
generate_integers
anti_shuffle
derivative
monotonic
eat
solution
sort_numbers
make_palindrome
fib
order_by_points
numerical_letter_grade
is_simple_power
rounded_avg
is_nested
multiply
x_or_y
count_distinct_characters
prime_length
solve
below_zero
minSubArraySum
count_upper
find_closest_elements
count_up_to
below_threshold
triangle_area
choose_num
sum_to_n
common
unique_digits
intersection
search
factorize
add_elements
mean_absolute_deviation
get_closest_vowel
get_max_triples
tri
longest
even_odd_palindrome
get_row
maximum
move_one_ball
cycpattern_check
solve
check_if_last_char_is_a_letter
get_odd_collatz
circular_shift
exchange
flip_case
get_positive
parse_music
is_prime
add
unique
is_equal_to_sum_even
fibfib
is_multiply_prime
truncate_number
fix_spaces
vowels_count
add
words_string
correct_bracketing
any_int
filter_by_substring
change_base
f
special_factorial
string_xor
hex_key
Strongest_Extension
strange_sort_list
prime_fib
encrypt
simplify
encode_shift
has_close_elements
decimal_to_binary
count_nums
rescale_to_unit
median
triangle_area
fizz_buzz
how_many_times
remove_vowels
find_max
right_angle_triangle
int_to_mini_roman
sum_squares
match_parens
smallest_change
largest_divisor
sort_array
triples_sum_to_zero
is_bored
sorted_list_sum
can_arrange
encode_cyclic
by_length
largest_smallest_integers
bf
modp
car_race_collision
sort_third
histogram
compare_one
words_in_sentence
pluck
is_sorted
filter_by_prefix
same_chars
parse_nested_parens
remove_duplicates
sum_squares
encode
sort_even
make_a_pile
digitSum
prod_signs
largest_prime_factor
sum_product
double_the_difference
split_words
odd_count
minPath
total_match
skjkasdkd
is_palindrome
find_zero
check_dict_case
string_to_md5
next_smallest
is_happy
all_prefixes
separate_paren_groups
iscube
select_words
closest_integer
fruit_distribution
do_algebra
max_fill
sort_array
digits
even_odd_count
correct_bracketing
largest_smallest_integers
reverse_delete
strlen
pairs_sum_to_zero
intersperse
rolling_max
concatenate
valid_date
compare
starts_one_ends
will_it_fly
max_element
specialFilter
file_name_check
fib4
largest_smallest_integers
largest_smallest_integers
largest_smallest_integers