Update Sota baseline

This commit is contained in:
didi 2024-07-16 10:15:35 +08:00
parent 8a241054c7
commit e0955c5bf9
7 changed files with 442 additions and 110 deletions

View file

View file

@ -34,8 +34,8 @@ async def samples_generate(mode:str, result_path:str="samples.jsonl"):
async def solve_and_write(case, mode):
try:
if mode == 'llm':
# solution_result = await generate_code_block(case['prompt'])
solution_result = await generate_code(case['prompt'])
solution_result = await generate_code_block(case['prompt'])
# solution_result = await generate_code(case['prompt'])
sample_dict = {
'task_id': case['task_id'],
'solution': solution_result['code_solution']
@ -70,10 +70,12 @@ async def samples_generate(mode:str, result_path:str="samples.jsonl"):
jsonl_ranker(result_path, result_path)
if not failed_tasks:
if automatic_evalplus(result_path):
eval_path = result_path[:-6]+"_eval_results.json"
unpassed_exapmle = extract_failure_tests(eval_path)
print(unpassed_exapmle)
else:
print(failed_tasks)
@ -112,6 +114,9 @@ async def samples_generate_llm():
write_jsonl("samples.jsonl", sample_list)
def hello():
    """No-op placeholder: takes no arguments, does nothing and returns None."""
    pass
def automatic_evalplus(result_path:str ="samples.jsonl"):
"""
在命令行中自动执行 evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only

View file

@ -0,0 +1,239 @@
# Import necessary libraries and modules
import gzip
import itertools
import json
import os
import subprocess
import sys
from typing import Dict, Iterable, List, Union

import numpy as np
import tqdm
from loguru import logger
# Absolute path of the directory that contains this script.
ROOT = os.path.dirname(os.path.abspath(__file__))
# Default evaluation-set file: a gzipped JSONL file of problems, used as the
# default argument of read_problems(). It is a bare relative name, so it is
# resolved against the current working directory when opened.
# NOTE(review): ROOT is not joined into this path — confirm whether the file
# is expected next to the script rather than in the working directory.
HUMAN_EVAL = r"HumanEval.jsonl.gz"
def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
    """
    Load an evaluation set and index its records by task ID.

    Args:
        evalset_file (str): Path to the JSONL (optionally ``.gz``) problem file.

    Returns:
        Dict[str, Dict]: Mapping from each record's ``"task_id"`` to the full
        record. If a task ID occurs more than once, the last record wins.
    """
    problems: Dict[str, Dict] = {}
    for record in stream_jsonl(evalset_file):
        problems[record["task_id"]] = record
    return problems
def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Lazily parse a JSONL file, yielding one decoded object per non-blank line.

    Files whose name ends in ``.gz`` are transparently decompressed.

    Args:
        filename (str): Path to the JSONL file (plain or gzip-compressed).

    Yields:
        Dict: The JSON value decoded from each line that is not empty or
        whitespace-only.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Both openers accept the same text-mode arguments, so one read loop
    # serves the compressed and the plain case.
    opener = gzip.open if filename.endswith(".gz") else open
    # Decode explicitly as UTF-8 (what write_jsonl emits) instead of relying
    # on the platform's default encoding.
    with opener(filename, "rt", encoding="utf-8") as fp:
        for line in fp:
            if line.strip():
                yield json.loads(line)
def _generate_examples(filepath, split, name="sanitized"):
if name == "full":
def _read_lines(fn, start, end):
data = []
with open(fn, encoding="utf-8") as f:
for line in f:
sample = json.loads(line)
if start <= sample["task_id"] <= end:
data.append(sample)
elif sample["task_id"] > end:
break
return data
if split == "test":
data = _read_lines(filepath, 11, 510)
elif split == "train":
data = _read_lines(filepath, 601, 974)
elif split == "validation":
data = _read_lines(filepath, 511, 600)
elif split == "prompt":
data = _read_lines(filepath, 1, 10)
elif name == "sanitized":
with open(filepath, encoding="utf-8") as f:
data = json.load(f)
if split == "test":
data = [sample for sample in data if 11 <= sample["task_id"] <= 510]
elif split == "train":
data = [sample for sample in data if 601 <= sample["task_id"] <= 974]
elif split == "validation":
data = [sample for sample in data if 511 <= sample["task_id"] <= 600]
elif split == "prompt":
data = [sample for sample in data if 1 <= sample["task_id"] <= 10]
id_ = 0
for sample in data:
yield id_, sample
id_ += 1
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Serialize records to a JSONL file, one JSON document per line (UTF-8).

    Args:
        filename (str): Destination path; ``~`` is expanded. A name ending in
            ``.gz`` is written gzip-compressed.
        data (Iterable[Dict]): Records to serialize, consumed once in order.
        append (bool): When True, add to the end of an existing file (for
            ``.gz`` this appends a new gzip stream); otherwise overwrite.
    """
    target = os.path.expanduser(filename)
    file_mode = "ab" if append else "wb"

    def _encode(record):
        # One record -> one newline-terminated UTF-8 JSON line.
        return (json.dumps(record) + "\n").encode("utf-8")

    with open(target, file_mode) as raw:
        if target.endswith(".gz"):
            # Compress on top of the already-open binary handle.
            with gzip.GzipFile(fileobj=raw, mode="wb") as packed:
                packed.writelines(map(_encode, data))
        else:
            raw.writelines(map(_encode, data))
def execution(task_id, check_program):
    """
    Run a Python program in a child interpreter and report whether it passed.

    The program is considered passed when the child writes nothing to stderr,
    or when its stderr contains ``OK``. The child's exit status is not
    inspected.

    Args:
        task_id: Identifier used only in log messages.
        check_program: Source code passed to the interpreter via ``-c``.

    Returns:
        bool: True if the program passed, False on any other stderr output or
        if it did not finish within 30 seconds.
    """
    # Use the interpreter running this script rather than whatever "python"
    # resolves to on PATH (it may be missing or a different installation).
    interpreter = sys.executable or "python"
    try:
        # subprocess.run kills and reaps the child itself when the timeout
        # expires, so no process or pipe is left behind.
        result = subprocess.run(
            [interpreter, "-c", str(check_program)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        logger.info("The command did not complete within the given timeout.")
        logger.info(f"{task_id}: error")
        return False

    stderr = result.stderr
    if not stderr or b"OK" in stderr:
        return True
    logger.info(f"{task_id}: error: {stderr}")
    return False
def estimate_pass_at_k(
    num_samples: Union[int, List[int], np.ndarray], num_correct: Union[List[int], np.ndarray], k: int
) -> np.ndarray:
    """
    Compute the pass@k estimate for every problem.

    Args:
        num_samples: Samples drawn per problem; a single int applies the same
            count to all problems, otherwise one entry per problem.
        num_correct: Number of passing samples per problem.
        k (int): The ``k`` in pass@k.

    Returns:
        np.ndarray: One pass@k estimate per problem, in input order.
    """

    def _pass_rate(n: int, c: int) -> float:
        # 1 - C(n - c, k) / C(n, k), evaluated as a running product.
        if n - c >= k:
            return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
        # Fewer than k failing samples: every k-subset contains a pass.
        return 1.0

    if isinstance(num_samples, int):
        per_problem_n = [num_samples] * len(num_correct)
    else:
        assert len(num_samples) == len(num_correct)
        per_problem_n = num_samples

    rates = []
    for n, c in zip(per_problem_n, num_correct):
        rates.append(_pass_rate(int(n), int(c)))
    return np.array(rates)
def evaluate(total: List, correct: List, ks: List = [1, 10]):
    """
    Compute and log the mean pass@k over all problems for each requested k.

    Args:
        total (List): Number of samples per problem.
        correct (List): Number of passing samples per problem.
        ks (List): The ``k`` values to report. The default list is only read,
            never mutated.

    Returns:
        dict: ``{"pass@<k>": mean estimate}`` for every k that is not larger
        than the smallest per-problem sample count; other k values are omitted.
    """
    totals = np.array(total)
    corrects = np.array(correct)

    pass_at_k = {}
    for k in ks:
        # pass@k is only defined when every problem has at least k samples.
        if not (totals >= k).all():
            continue
        pass_at_k[f"pass@{k}"] = estimate_pass_at_k(totals, corrects, k).mean()

    logger.info(pass_at_k)
    return pass_at_k
if __name__ == "__main__":
    # Script entry point: run every sample in example_samples.jsonl against
    # its problem's tests and report pass@k.
    logger.info("Reading samples...")
    problems = read_problems(HUMAN_EVAL)

    # pass@k needs, for each problem, how many samples were drawn (n) and how
    # many of them passed (c), so tally both per task_id. The earlier code
    # appended running lengths/sums of a single shared list, which produced
    # cumulative counts rather than per-problem ones.
    samples_per_task = {}
    passed_per_task = {}

    for sample in tqdm.tqdm(stream_jsonl("example_samples.jsonl")):
        task_id = sample["task_id"]
        completion = sample["completion"]
        problem = problems[task_id]

        # Construct a check program: the completion, the problem's test code,
        # then a call to check() on the entry point.
        check_program = completion + "\n" + problem["test"] + "\n" + f"check({problem['entry_point']})"

        # Execute the check program and capture the result
        passed_flg = execution(task_id, check_program)
        if not passed_flg:
            logger.debug("error")
        else:
            logger.debug("passed")

        samples_per_task[task_id] = samples_per_task.get(task_id, 0) + 1
        passed_per_task[task_id] = passed_per_task.get(task_id, 0) + int(passed_flg)

    # Keep both lists aligned on the same task order.
    total = [samples_per_task[task_id] for task_id in samples_per_task]
    correct = [passed_per_task[task_id] for task_id in samples_per_task]

    # Evaluate pass rates at various 'k' values
    evaluate(total, correct, ks=[1, 5, 10])

View file

@ -4,8 +4,8 @@
# @Desc : graph & an instance - humanevalgraph
from metagpt.llm import LLM
from examples.ags.w_action_node.operator import Generate, GenerateCode, GenerateCodeBlock, Review, Revise, FuEnsemble, MdEnsemble
from typing import List
from examples.ags.w_action_node.operator import Generate, GenerateCode, GenerateCodeBlock, Review, Revise, FuEnsemble, MdEnsemble, DbEnsemble
class Graph:
def __init__(self, name:str, llm:LLM) -> None:
@ -15,6 +15,9 @@ class Graph:
def __call__():
NotImplementedError("Subclasses must implement __call__ method")
def optimize(dataset:List):
pass
class HumanEvalGraph(Graph):
def __init__(self, name:str, llm: LLM, criteria:str, vote_count:int =5) -> None:
super().__init__(name, llm)
@ -26,15 +29,15 @@ class HumanEvalGraph(Graph):
self.fuensemble = FuEnsemble(llm=llm)
self.mdensemble = MdEnsemble(llm=llm, vote_count=vote_count)
# async def __call__(self, problem:str, ensemble_count:int = 3):
# solution_list = []
# for _ in range(ensemble_count):
# solution = await self.generate_code(problem)
# # solution = await self.generate_code_block(problem)
# solution = solution.get('code_solution')
# solution_list.append(solution)
# solution = await self.mdensemble("code", solution_list, problem)
# return solution
async def __call__(self, problem:str, ensemble_count:int = 3):
solution_list = []
for _ in range(ensemble_count):
solution = await self.generate_code(problem)
# solution = await self.generate_code_block(problem)
solution = solution.get('code_solution')
solution_list.append(solution)
solution = await self.mdensemble("code", solution_list, problem)
return solution
async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2):
solution_list = []
@ -45,15 +48,15 @@ class HumanEvalGraph(Graph):
return solution
# async def simple_ensemble(self, problem:str, ensemble_count:int = 3):
async def __call__(self, problem:str, ensemble_count:int = 3):
solution_list = []
for _ in range(ensemble_count):
solution = await self.generate_code(problem)
# solution = await self.generate_code_block(problem)
solution = solution.get('code_solution')
solution_list.append(solution)
solution = await self.fuensemble(solution_list, problem)
return solution
# async def __call__(self, problem:str, ensemble_count:int = 3):
# solution_list = []
# for _ in range(ensemble_count):
# solution = await self.generate_code(problem)
# # solution = await self.generate_code_block(problem)
# solution = solution.get('code_solution')
# solution_list.append(solution)
# solution = await self.fuensemble(solution_list, problem)
# return solution
async def single_solve(self, problem:str, max_loop:int):
solution = await self.generate_code(problem)
@ -65,4 +68,7 @@ class HumanEvalGraph(Graph):
solution = await self.revise(problem, solution, review_feedback['feedback'])
solution = solution.get('revised_solution')
return solution
class Gsm8kGraph(Graph):
    """
    Empty subclass of Graph: it adds no attributes or methods of its own yet.

    NOTE(review): presumably reserved for a GSM8K workflow — confirm.
    """
    pass

View file

@ -11,10 +11,10 @@ from metagpt.actions.action_node import ActionNode
from metagpt.llm import LLM
from examples.ags.w_action_node.operator_an import GenerateOp, GenerateCodeOp, GenerateCodeBlockOp ,ReviewOp, ReviseOp, FuEnsembleOp, MdEnsembleOp
from examples.ags.w_action_node.prompt import GENERATE_PROMPT, GENERATE_CODE_PROMPT, REVIEW_PROMPT, REVISE_PROMPT, FU_ENSEMBLE_PROMPT, MD_ENSEMBLE_PROMPT, DE_ENSEMBLE_ANGEL_PROMPT, DE_ENSEMBLE_DEVIL_PROMPT, DE_ENSEMBLE_JUDGE_PROMPT
from examples.ags.w_action_node.prompt import GENERATE_PROMPT, GENERATE_CODE_PROMPT, GENERATE_CODEBLOCK_PROMPT, REVIEW_PROMPT, REVISE_PROMPT, FU_ENSEMBLE_PROMPT, MD_ENSEMBLE_PROMPT, DE_ENSEMBLE_ANGEL_PROMPT, DE_ENSEMBLE_DEVIL_PROMPT, DE_ENSEMBLE_JUDGE_UNIVERSAL_PROMPT, DE_ENSEMBLE_JUDGE_FINAL_PROMPT
from examples.ags.w_action_node.prompt import DE_ENSEMBLE_CODE_FORMAT_PROMPT, DE_ENSEMBLE_TXT_FORMAT_PROMPT
class Operator:
def __init__(self, name, llm:LLM=None):
def __init__(self, name, llm:LLM):
self.name = name
self.llm = llm
@ -48,7 +48,7 @@ class GenerateCodeBlock(Operator):
super().__init__(name, llm)
async def __call__(self, problem_description):
prompt = GENERATE_CODE_PROMPT.format(problem_description=problem_description)
prompt = GENERATE_CODEBLOCK_PROMPT.format(problem_description=problem_description)
node = await ActionNode.from_pydantic(GenerateCodeBlockOp).fill(context=prompt, llm=self.llm,mode='code_fill')
response = node.instruct_content.model_dump()
return response
@ -91,6 +91,10 @@ class FuEnsemble(Operator):
return response
class MdEnsemble(Operator):
"""
MedPrompt
"""
def __init__(self, name:str ="MedEnsembler", llm: LLM = LLM(), vote_count:int=3):
super().__init__(name, llm)
self.vote_count = vote_count
@ -172,87 +176,120 @@ class DbEnsemble(Operator):
"""
def __init__(self, name:str ="DebateEnsemble", llm: LLM = LLM()):
super().__init__(name, llm)
self.agents = [
]
async def debate_answer(self, message_history:List, role:str):
self.agents = ["angel","devil","judge"]
self.format_requirements = {
"txt":DE_ENSEMBLE_TXT_FORMAT_PROMPT,
"code":DE_ENSEMBLE_CODE_FORMAT_PROMPT
}
def get_system_prompt(self, name:str, mode:str='txt'):
    """
    Pick the system prompt for one debate participant.

    Args:
        name: Participant: "angel", "devil" or "judge".
        mode: For debaters, "code" selects the code output format and any
            other value the text format. For the judge, "final" selects the
            final-verdict prompt and any other value the per-round prompt.

    Returns:
        The prompt string, or None when ``name`` is not a known participant.
    """
    if name == "judge":
        if mode == "final":
            return DE_ENSEMBLE_JUDGE_FINAL_PROMPT
        return DE_ENSEMBLE_JUDGE_UNIVERSAL_PROMPT

    if name == "angel":
        role_prompt = DE_ENSEMBLE_ANGEL_PROMPT
    elif name == "devil":
        role_prompt = DE_ENSEMBLE_DEVIL_PROMPT
    else:
        # Unknown participant: no prompt.
        return None

    # Debaters get their role prompt followed by the output-format instructions.
    format_prompt = DE_ENSEMBLE_CODE_FORMAT_PROMPT if mode == "code" else DE_ENSEMBLE_TXT_FORMAT_PROMPT
    return role_prompt + "\n" + format_prompt
def construct_messages(self, message_history_with_name, name, mode:str="txt", phase:str="universal"):
"""
async def lowlevel_api_example(llm: LLM):
logger.info("low level api example")
logger.info(await llm.aask_batch(["hi", "write python hello world."]))
hello_msg = [{"role": "user", "content": "count from 1 to 10. split by newline."}]
logger.info(await llm.acompletion(hello_msg))
logger.info(await llm.acompletion_text(hello_msg))
# streaming mode, much slower
await llm.acompletion_text(hello_msg, stream=True)
# check completion if exist to test llm complete functions
if hasattr(llm, "completion"):
logger.info(llm.completion(hello_msg))
基于name与mode来构建system message.
基于name来构建messages
"""
if role == "angel":
prompt = DE_ENSEMBLE_ANGEL_PROMPT.format()
Op = ""
else:
prompt = DE_ENSEMBLE_DEVIL_PROMPT.format()
Op = ""
messages = []
messages.append({"role": "system", "content": self.get_system_prompt(name, mode)})
if name in ["angel", "devil"]:
messages = self._construct_debate(message_history_with_name, name, messages)
elif name == "judge":
messages = self._construct_judge(message_history_with_name, mode, messages)
return messages
def _construct_debate(self, message_history_with_name, name, messages):
user_message = ""
node = await ActionNode.from_pydantic(Op).messages_fill(messages=message_history,llm=self.llm)
node = await ActionNode.from_pydantic(FuEnsembleOp).fill(context=prompt, llm=self.llm)
response = node.instruct_content.model_dump()
return response
for message in message_history_with_name:
if message["name"] == "Judge":
continue
elif message["name"] == name:
if user_message:
messages.append({
"role": "user",
"name": "user",
"content": user_message.strip("\n"),
})
messages.append({
"role": "assistant",
"name": name,
"content": message["content"],
})
user_message = ""
else:
user_message += message["content"]
if user_message:
messages.append({
"role": "user",
"name": "user",
"content": user_message.strip("\n"),
})
return messages
async def judge_answer(message_histroy:List):
"""
"""
def _construct_judge(self, message_history_with_name, mode, messages):
    """
    Placeholder for building the judge's message list; not implemented yet.

    The body is empty, so this returns None and ignores all three arguments.
    """
    pass
async def __call__(self, origin_solution:str, problem_description:str, max_round:int = 3):
async def debate_answer(self, message_history:List, role:str="angel"):
    """
    Ask the LLM for one debater's next turn and record it in the history.

    Args:
        message_history: Name-tagged debate messages; it is appended to in
            place.
        role: The debater taking the turn, passed to construct_messages as
            the participant name.

    Returns:
        A ``(message_history, response)`` tuple, where ``response`` is the
        value returned by ``self.llm.acompletion_text``.
    """
    # No mode is forwarded, so construct_messages falls back to its default.
    messages = self.construct_messages(message_history, role)
    response = await self.llm.acompletion_text(messages=messages)
    # Every turn is stored with role "user"; the speaker is carried in "name".
    message_history.append({
        "role":"user",
        "name":role,
        "content":response}
    )
    return message_history, response
async def judge_answer(self, message_history:List, phase:str="universal"):
    """
    Ask the LLM for the judge's verdict and record it in the history.

    Args:
        message_history: Name-tagged debate messages; it is appended to in
            place.
        phase: Forwarded to construct_messages as ``phase``.

    Returns:
        A ``(message_history, response)`` tuple, where ``response`` is the
        value returned by ``self.llm.acompletion_text``.
    """
    # NOTE(review): in the construct_messages shown in this commit, `phase` is
    # accepted but the system prompt is chosen from `mode`, so phase="final"
    # does not appear to select the final judge prompt — confirm.
    # NOTE(review): _construct_judge is still a stub returning None, so
    # `messages` looks to be None for the judge — confirm before relying on it.
    messages = self.construct_messages(message_history, "judge", phase=phase)
    response = await self.llm.acompletion_text(messages=messages)
    # Stored with role "user" and name "judge", like the debaters' turns.
    message_history.append({
        "role": "user",
        "name": "judge",
        "content": response}
    )
    return message_history, response
async def __call__(self, origin_solution:str, problem_description:str, max_round:int = 3, mode:str='txt'):
# 思路输入一个原始答案构建一个agent代表这个答案进行辩论另一个agentdevil使用debate llm的内容进行辩论法官在每一轮次做出决定是否终止到了maxround还没终止就由法官进行总结。
# 以下是调用llm的方法
"""
1. judge信息只有法官自己看到
2. agent answer信息所有人都能看到具体代码逻辑在debate
"""
# 在MG里面多轮对话传Message在哪里传预计时间1小时左右吧
angel_prompt = DE_ENSEMBLE_ANGEL_PROMPT.format()
devil_prompt = DE_ENSEMBLE_DEVIL_PROMPT.format()
judge_prompt = DE_ENSEMBLE_JUDGE_PROMPT.format()
'''
Devil
You agree with my answer 90% of the time and have almost no reservations. Affirm your agreement, share any additional thoughts if you have them, and conclude with the capital letter corresponding to your answer at the end of your response.
Angel
Do you agree with my perspective? Please provide your reasons and answer.
Judge
final_mode: "You, as the moderator, will evaluate both sides' answers and determine your
preference for an answer candidate. Please summarize your reasons for supporting affirmative/negative side and
give the final answer that you think is correct to conclude the debate. Now please output your answer in json format, with the format as follows:
{\"Reason\": \"\", \"debate_answer\": \"the capital letter corresponding to the answer\"}.
Please strictly output in JSON format, do not output irrelevant content."
universal_mode: "You, as the moderator, will evaluate both sides' answers and determine if there is a clear
preference for an answer candidate. If so, please summarize your reasons for supporting affirmative/negative side and
give the final answer that you think is correct, and the debate will conclude. If not, the debate will continue to
the next round. Now please output your answer in json format, with the format as follows:
{\"Whether there is a preference\": \"Yes or No\", \"Supported Side\": \"Affirmative or Negative\",
\"Reason\": \"\", \"debate_answer\": \"the capital letter corresponding to the answer\"}.
Please strictly output in JSON format, do not output irrelevant content."
'''
# 在action node 之中构建一个能够传递message history的方法。
for _ in max_round:
message_history_with_name = [
{"role":"user", "name":"angel", "content":origin_solution}
]
for index in range(max_round):
for agent in self.agents:
pass
if agent == "angel":
if index == 0:
pass
message_history_with_name, rsp = self.debate_answer(message_history_with_name, role="angel")
elif agent == "devil":
message_history_with_name, rsp = self.debate_answer(message_history_with_name, role="devil")
elif agent == "judge":
message_history_with_name, judge_result = self.judge_answer(message_history_with_name, phase="universal")
if not judge_result["is_debating"]:
"""
这里需要在 self.judge_answer 中设置一个自动给出solution的地方
"""
return {"final_solution":judge_result["final_solution"]}
message_history_with_name.pop(-1)
message_history_with_name, judge_answer = self.judge_answer(message_history_with_name, phase="final")
node = await ActionNode.from_pydantic(FuEnsembleOp).fill(context=prompt, llm=self.llm)
response = node.instruct_content.model_dump()
return response
return {"final_solution":judge_answer["debate_answer"]}
class Rephrase(Operator):
"""
@ -276,4 +313,6 @@ class CodeReflection(Operator):
class Verify(Operator):
"""
? 还没有想好
"""
"""
pass

View file

@ -9,7 +9,7 @@ class GenerateOp(BaseModel):
solution: str = Field(default="", description="Your Solution for this problem")
class GenerateCodeOp(BaseModel):
code_solution: str = Field(default="", description="Your complete code solution for this problem")
code_solution: str = Field(default="", description="Complete and correct code here.")
class GenerateCodeBlockOp(BaseModel):
code_solution: str = Field(default="", description="Your complete code solution for this problem")

View file

@ -26,23 +26,27 @@ Generate Solution for the following problem: {problem_description}
# """
GENERATE_CODE_PROMPT = """
You are an expert programmer tasked with solving a coding problem. Your goal is to write clean, efficient, and correct code that solves the given problem.
You are an expert programmer tasked with solving a coding problem.
### Problem Description:
{problem_description}
### Instructions:
1. Read the problem description carefully.
2. If any part of the problem is unclear, state your assumptions.
3. Plan your approach before writing code.
4. Write a Python function that solves the problem.
5. Include clear comments to explain your logic.
6. Ensure your code handles edge cases and potential errors.
7. If time complexity is a concern, optimize your solution and explain your optimization.
The above is an incomplete Python code fragment. Return the complete and correct code with no additional text.
Please maintain the JSON format in your response.
### Your Response:
"""
GENERATE_CODEBLOCK_PROMPT = """
You are an expert programmer tasked with solving a coding problem.
### Problem Description:
{problem_description}
### Instructions:
The above is an incomplete Python code fragment. Return the complete and correct code with no additional text.
"""
# GENERATE_CODE_PROMPT = """
# Generate Code Solution for the following problem: {problem_description}
# """
@ -100,3 +104,42 @@ Carefully analyze the given problem and the list of solution candidates. Your ta
Please maintain the JSON format in your response.
"""
# Output-format instructions for text answers: asks for a JSON object with
# "Reason" and "debate_answer" keys. The braces are single, so this string
# would raise if passed through str.format(); use it verbatim.
DE_ENSEMBLE_TXT_FORMAT_PROMPT = """
Now please output your answer in json format, with the format as follows:
{\"Reason\": \"\", \"debate_answer\": \"the capital letter corresponding to the answer\"}.
Please strictly output in JSON format, do not output irrelevant content. """
# Output-format instructions for code answers: asks for a JSON object with
# "reason" and "code_solution" keys.
# NOTE(review): the braces are doubled here, as if the string were meant for
# str.format(), unlike the other prompts in this group. DbEnsemble.get_system_prompt
# in this commit concatenates it without formatting, in which case the model
# receives literal "{{" / "}}" — confirm which is intended.
DE_ENSEMBLE_CODE_FORMAT_PROMPT = """
Now please output your answer in json format, with the format as follows:
{{
"reason":"<为什么要这样做>",
"code_solution":"<你觉得合适的solution用代码表示出来>"
}}
Please strictly output in JSON format, do not output irrelevant content. """
# Role prompt for the "angel" debater.
DE_ENSEMBLE_ANGEL_PROMPT = """
Do you agree with my perspective? Please provide your reasons and answer.
"""
# Role prompt for the "devil" debater.
DE_ENSEMBLE_DEVIL_PROMPT = """
You agree with my answer 90% of the time and have almost no reservations. Affirm your agreement, share any additional thoughts if you have them, and conclude with the capital letter corresponding to your answer at the end of your response.
"""
# Judge prompt for the closing verdict: the judge must pick a final answer and
# reply as JSON with "Reason" and "debate_answer". Single braces: not
# str.format()-safe.
DE_ENSEMBLE_JUDGE_FINAL_PROMPT = """
You, as the moderator, will evaluate both sides' answers and determine your
preference for an answer candidate. Please summarize your reasons for supporting affirmative/negative side and
give the final answer that you think is correct to conclude the debate. Now please output your answer in json format, with the format as follows:
{\"Reason\": \"\", \"debate_answer\": \"the capital letter corresponding to the answer\"}.
Please strictly output in JSON format, do not output irrelevant content.
"""
# Judge prompt for intermediate rounds: the judge states whether there is a
# clear preference (ending the debate) or not (continuing), replying as JSON
# with "Whether there is a preference", "Supported Side", "Reason" and
# "debate_answer". Single braces: not str.format()-safe.
DE_ENSEMBLE_JUDGE_UNIVERSAL_PROMPT = """
You, as the moderator, will evaluate both sides' answers and determine if there is a clear
preference for an answer candidate. If so, please summarize your reasons for supporting affirmative/negative side and
give the final answer that you think is correct, and the debate will conclude. If not, the debate will continue to
the next round. Now please output your answer in json format, with the format as follows:
{\"Whether there is a preference\": \"Yes or No\", \"Supported Side\": \"Affirmative or Negative\",
\"Reason\": \"\", \"debate_answer\": \"the capital letter corresponding to the answer\"}.
Please strictly output in JSON format, do not output irrelevant content
"""