Update Code Review

This commit is contained in:
didi 2024-08-01 14:20:40 +08:00
parent 686b1cd130
commit 6d4c72cdf0
11 changed files with 549 additions and 662 deletions

View file

@ -1,70 +1,4 @@
# -*- coding: utf-8 -*-
# @Date :
# @Date :
# @Author : issac
# @Desc : test on gsm8k
import json
import re
import os
# 读取原始数据集
def read_jsonl(path: str):
    """Read a JSONL file and return a list of parsed records.

    Fix: the original `if line` filter was dead code — `readlines()` never
    yields empty strings (blank lines come back as "\n", which is truthy),
    so a whitespace-only line crashed `json.loads`. Filtering with
    `line.strip()` actually skips blank lines. Also opens with an explicit
    UTF-8 encoding instead of the platform default.
    """
    with open(path, encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]
# 和图/和基础模型直接交互得到答案
def LLM(question):
    """Placeholder for the LLM / graph call: takes a question, returns the answer.

    Wire the real model invocation in here; until then the answer is empty.
    """
    # answer = <reply generated from `question`>
    return ""
def gsm_extract_answer(completion):
    """Extract the final GSM8K answer (the token after '#### ') from a completion.

    Returns the matched number with thousands separators stripped, or the
    sentinel string "[invalid]" when no '#### <number>' marker is found.
    """
    found = re.search(r"#### (\-?[0-9\.\,]+)", completion)
    if found is None:
        return "[invalid]"
    return found.group(1).strip().replace(",", "")
def gsm_is_correct(data):
    """Grade one GSM8K record: does the LLM answer match the ground truth?

    Args:
        data: record with keys "answer" (gold completion containing a
            '#### <number>' marker) and "answer_llm" (model output).

    Returns:
        True when both extracted answers are identical strings.

    Raises:
        ValueError: when the gold answer has no parseable '####' marker.
            (Replaces the original `assert`, which is silently stripped
            when Python runs with -O.)
    """
    INVALID_ANS = "[invalid]"
    gt_answer = gsm_extract_answer(data["answer"])
    if gt_answer == INVALID_ANS:
        raise ValueError(f"ground-truth answer is not parseable: {data['answer']!r}")
    return gsm_extract_answer(data["answer_llm"]) == gt_answer
# 提取数据集并得到测试答案
def get_examples(split):
    """Run the LLM over the `<split>.jsonl` dataset, grade every answer, and
    dump the augmented records to gsm8k_generate.jsonl.

    Each record gains two fields: "answer_llm" (model output) and
    "is_correct" (comparison against the gold '####' answer). Returns the
    list of augmented records.
    """
    path = os.path.join("", f"{split}.jsonl")
    output_path = "gsm8k_generate.jsonl"
    examples = read_jsonl(path)

    for ex in examples:
        # Query the model, then grade its reply against the gold answer.
        ex['answer_llm'] = LLM(ex['question'])
        ex['is_correct'] = gsm_is_correct(ex)

    # Persist the augmented records, one JSON object per line.
    with open(output_path, 'w', encoding='utf-8') as f:
        for example in examples:
            f.write(json.dumps(example) + '\n')

    print(f"{len(examples)} {split} examples")
    return examples
if __name__ == "__main__":
    # Run the pipeline end-to-end on the "gsm" split and preview results.
    example = get_examples("gsm")
    print(example[:5])

View file

@ -1,155 +1,4 @@
# -*- coding: utf-8 -*-
# @Date :
# @Date :
# @Author : issac
# @Desc : test on hotpotqa
import sys
import json
import re
import string
from collections import Counter
import pickle
def normalize_answer(s):
    """Normalize an answer string for HotpotQA scoring.

    Lowercases, strips all punctuation, removes the articles a/an/the, and
    collapses runs of whitespace — same pipeline (and order) as the SQuAD
    evaluation script.
    """
    text = s.lower()
    # Drop every punctuation character outright.
    punct = set(string.punctuation)
    text = "".join(ch for ch in text if ch not in punct)
    # Remove word-boundary-matched articles, leaving a space behind.
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    # Collapse whitespace runs and trim the ends.
    return ' '.join(text.split())
def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and the ground truth.

    Returns an (f1, precision, recall) tuple. Predictions or golds that
    normalize to yes/no/noanswer only score when they match exactly;
    otherwise the score is (0, 0, 0).
    """
    pred_norm = normalize_answer(prediction)
    gold_norm = normalize_answer(ground_truth)

    zero = (0, 0, 0)
    special = ('yes', 'no', 'noanswer')
    # A yes/no/noanswer mismatch on either side scores zero outright.
    if pred_norm != gold_norm and (pred_norm in special or gold_norm in special):
        return zero

    pred_tokens = pred_norm.split()
    gold_tokens = gold_norm.split()
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return zero

    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1, precision, recall
def exact_match_score(prediction, ground_truth):
    """True when prediction and ground truth agree after normalization."""
    return normalize_answer(prediction) == normalize_answer(ground_truth)
def update_answer(metrics, prediction, gold):
    """Accumulate answer EM/F1/precision/recall for one example.

    Mutates `metrics` in place and returns (em, precision, recall) so the
    caller can reuse them (e.g. for joint metrics).
    """
    em = exact_match_score(prediction, gold)
    f1, prec, recall = f1_score(prediction, gold)
    for key, value in (('em', float(em)), ('f1', f1), ('prec', prec), ('recall', recall)):
        metrics[key] += value
    return em, prec, recall
def update_sp(metrics, prediction, gold):
    """Accumulate supporting-fact EM/F1/precision/recall for one example.

    `prediction` and `gold` are lists of (title, sentence_idx) pairs,
    compared as sets. Mutates `metrics` in place and returns
    (em, precision, recall) for this example.
    """
    pred_set = set(map(tuple, prediction))
    gold_set = set(map(tuple, gold))

    # Set algebra replaces the original membership loops.
    tp = len(pred_set & gold_set)
    fp = len(pred_set - gold_set)
    fn = len(gold_set - pred_set)

    prec = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0
    # Exact match only when the predicted set equals the gold set.
    em = 1.0 if fp + fn == 0 else 0.0

    metrics['sp_em'] += em
    metrics['sp_f1'] += f1
    metrics['sp_prec'] += prec
    metrics['sp_recall'] += recall
    return em, prec, recall
def LLM(question):
    """Placeholder LLM hook: takes a question, returns the model's answer.

    Replace with the actual model / graph call; currently yields an empty
    answer.
    """
    # answer = <reply generated from `question`>
    return ""
def eval(prediction_file, gold_file):
    """Score a HotpotQA prediction file against the gold file and print metrics.

    NOTE(review): the name shadows the builtin `eval`; kept unchanged for
    caller compatibility. Only the answer metrics (em/f1/prec/recall) are
    populated — the sp_* and joint_* entries stay 0 because supporting
    facts are never evaluated in this trimmed-down version.

    Fix: removed the dead `can_eval_joint` flag and the unused
    `em, prec, recall` unpacking, leftovers from the full HotpotQA
    evaluator that were set but never read here.
    """
    with open(prediction_file) as f:
        prediction = json.load(f)
    with open(gold_file) as f:
        gold = json.load(f)

    metrics = {'em': 0, 'f1': 0, 'prec': 0, 'recall': 0,
               'sp_em': 0, 'sp_f1': 0, 'sp_prec': 0, 'sp_recall': 0,
               'joint_em': 0, 'joint_f1': 0, 'joint_prec': 0, 'joint_recall': 0}
    for dp in gold:
        cur_id = dp['_id']
        if cur_id not in prediction['answer']:
            # A missing prediction simply contributes 0 to every metric.
            print('missing answer {}'.format(cur_id))
        else:
            update_answer(metrics, prediction['answer'][cur_id], dp['answer'])

    # Average every metric over the number of gold examples.
    N = len(gold)
    for k in metrics.keys():
        metrics[k] /= N
    print(metrics)
def LLM(question):
    """Placeholder LLM that simply echoes the question back as the answer.

    NOTE(review): this redefinition shadows the earlier `LLM` stub in the
    same file — confirm which one is intended. Replace with the real model
    call.
    """
    # answer = <reply generated from `question`>
    return question
def answer(prediction_file, gold_file):
    """Generate an LLM answer for every gold example and write the result to
    `prediction_file` as {"answer": {example_id: response}}.
    """
    with open(gold_file) as f:
        gold = json.load(f)

    # Prediction dict keyed by example id under the "answer" entry.
    prediction = {'answer': {}}
    for dp in gold:
        # Keep only list-typed context entries (the sentence lists).
        paragraphs = [item[1] for item in dp['context'] if isinstance(item[1], list)]
        context_str = "\n".join(" ".join(paragraph) for paragraph in paragraphs)
        # Assemble the prompt from the question plus the flattened context.
        input_llm = f"question{dp['question']}\n\ncontext{context_str}"
        prediction['answer'][dp['_id']] = LLM(input_llm)

    # Persist the predictions for the eval step.
    with open(prediction_file, 'w') as f:
        json.dump(prediction, f)
if __name__ == '__main__':
    # Generate predictions with the LLM, then score them against the gold file.
    answer('hotpot_pre.json', 'your path here')
    eval('hotpot_pre.json', 'your path here')

View file

@ -3,134 +3,132 @@
# @Author : didi
# @Desc : test on human eval graph
import os
import asyncio
import json
import os
import subprocess
import sys
import asyncio
from typing import Literal, Optional
import aiofiles
from metagpt.llm import LLM
from evalplus.data import get_human_eval_plus
from examples.ags.w_action_node.utils import jsonl_ranker
from examples.ags.w_action_node.graph import HumanEvalGraph
from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock
from examples.ags.w_action_node.utils import sort_json_by_task_id
from metagpt.llm import LLM
from metagpt.logs import logger
from metagpt.utils.common import add_jsonl_file, read_json_file
from metagpt.utils.exceptions import handle_exception
generate_code = GenerateCode(llm=LLM())
generate_code_block = GenerateCodeBlock(llm=LLM())
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria='correctness, efficiency, readability', vote_count=5)
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria="correctness, efficiency, readability", vote_count=5)
async def sample_generate(id, result_path:str="samples.jsonl",mode:str="ags"):
ModeType = Literal["ags", "alpha_codium", "llm"]
async def llm_generate(id):
case = get_human_eval_plus()[f"{id}"]
solution_result = await generate_code_block(case["prompt"], case["entry_point"])
sample_dict = dict(task_id=case["task_id"], solution=solution_result["code_solution"])
return sample_dict
async def ags_generate(id, ensemble_count: int = 5):
case = get_human_eval_plus()[f"{id}"]
solution_result = await solver(case["prompt"], ensemble_count=ensemble_count)
sample_dict = dict(task_id=case["task_id"], solution=solution_result["final_solution"])
return sample_dict
async def alpha_codium_generate(id):
case = get_human_eval_plus()[f"{id}"]
solution_result = await solver.alpha_codium(case["task_id"], case["prompt"], ensemble_count=5)
sample_dict = dict(task_id=case["task_id"], solution=solution_result["final_solution"])
return sample_dict
async def route_generate(mode: ModeType, id: str):
if mode == "ags":
solution_result = await solver(case['prompt'],ensemble_count=5)
sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
elif mode == "alpha":
solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=5)
sample_dict = dict(task_id=case['task_id'], solution=solution_result['final_solution'])
sample_dict = await ags_generate(id)
elif mode == "alpha_codium":
sample_dict = await alpha_codium_generate(id)
elif mode == "llm":
solution_result = await generate_code_block(case['prompt'],case['entry_point'])
sample_dict = dict(task_id=case['task_id'], solution=solution_result['code_solution'])
print(sample_dict)
with open(result_path, mode='a') as f:
f.write(json.dumps(sample_dict) + '\n')
jsonl_ranker(result_path, result_path)
sample_dict = await llm_generate(id)
else:
raise ValueError(f"Invalid mode: {mode}")
return sample_dict
async def samples_generate(mode:str, result_path:str="samples.jsonl"):
cases = list(get_human_eval_plus().values())
async def sample_generate(id, result_path: str = "samples.jsonl", mode: ModeType = "ags"):
sample_dict = await route_generate(mode, id)
add_jsonl_file(result_path, [sample_dict])
sort_json_by_task_id(result_path, result_path)
async def samples_generate(mode: ModeType, result_path: str = "samples.jsonl"):
ids = list(get_human_eval_plus().keys())
file_lock = asyncio.Lock()
async def solve_and_write(case, mode):
try:
if mode == 'llm':
solution_result = await generate_code_block(problem_description=case['prompt'], function_name=case['entry_point'])
# solution_result = await generate_code(case['prompt'])
sample_dict = {
'task_id': case['task_id'],
'solution': solution_result['code_solution']
}
elif mode == "ags":
solution_result = await solver(case['prompt'], ensemble_count=5)
sample_dict = {
'task_id': case['task_id'],
'solution': solution_result['final_solution']
}
elif mode == "alpha":
solution_result = await solver.alpha_codium(case['task_id'], case['prompt'], ensemble_count=5)
sample_dict = {
'task_id': case['task_id'],
'solution': solution_result['final_solution']
}
# TODO 解决 final_solution 问题之后就可以开始正式测评了
async with file_lock:
async with aiofiles.open(result_path, mode='a') as f:
await f.write(json.dumps(sample_dict) + '\n')
return None
except Exception as e:
print(e)
return case['task_id']
@handle_exception(
exception_type=Exception,
exception_msg="Error in solve_and_write function",
default_return=lambda id, *args, **kwargs: id,
)
async def solve_and_write(id: str, mode: ModeType) -> Optional[str]:
sample_dict = await route_generate(mode, id)
async with file_lock:
async with aiofiles.open(result_path, mode="a") as f:
await f.write(json.dumps(sample_dict) + "\n")
return None
tasks = [solve_and_write(case, mode) for case in cases]
tasks = [solve_and_write(id, mode) for id in ids]
results = await asyncio.gather(*tasks)
failed_tasks = [task_id for task_id in results if task_id is not None]
if failed_tasks:
print(failed_tasks)
if mode == 'llm':
for task_id in failed_tasks:
case = get_human_eval_plus()[task_id]
for _ in range(3):
try:
solution_result = await generate_code_block(case['prompt'],function_name=case['entry_point'])
task_dict = {
'task_id': case['task_id'],
'solution': solution_result['code_solution']
}
with open(result_path, mode='a') as f:
f.write(json.dumps(task_dict) + '\n')
failed_tasks.remove(task_id)
break
except Exception as e:
print(f"{e} \n failure {task_id}")
elif mode == "ags" or mode == "alpha":
for task_id in failed_tasks:
try:
await sample_generate(task_id,result_path,mode)
except Exception as e:
print(f"failure {task_id}")
jsonl_ranker(result_path, result_path)
if not failed_tasks:
# 自动 sanitize
# result_path = automatic_sanitize(result_path)
if automatic_evalplus(result_path):
eval_path = result_path[:-6]+"_eval_results.json"
unpassed_exapmle = extract_failure_tests(eval_path)
print(unpassed_exapmle)
else:
print(failed_tasks)
logger.info(failed_tasks)
for task_id in failed_tasks:
try:
await sample_generate(task_id, result_path, mode)
failed_tasks.remove(task_id)
except Exception:
logger.error(f"{task_id} fail")
def automatic_sanitize(result_path: str = "samples.jsonl"):
sort_json_by_task_id(result_path, result_path)
if not failed_tasks:
if automatic_evalplus(result_path):
eval_path = result_path[:-6] + "_eval_results.json"
unpassed_exapmle = extract_failure_tests(eval_path)
logger.info(unpassed_exapmle)
else:
logger.info(failed_tasks)
@handle_exception(exception_type=subprocess.CalledProcessError, exception_msg="sanitize error", default_return=None)
def automatic_sanitize(result_path: str = "samples.jsonl") -> Optional[str]:
"""
在命令行中自动执行 evalplus.sanitize --samples result_path
返回result_path前缀加上"-sanitized.jsonl"
"""
command = ["evalplus.sanitize", "--samples", result_path]
try:
subprocess.run(command, check=True)
except subprocess.CalledProcessError as e:
print(f"执行命令时出错: {e}")
return None
# 构建sanitized文件路径
subprocess.run(command, check=True)
base_name = os.path.splitext(result_path)[0]
sanitized_path = f"{base_name}-sanitized.jsonl"
return sanitized_path
def automatic_evalplus(result_path:str ="samples.jsonl"):
@handle_exception(
exception_type=subprocess.CalledProcessError,
exception_msg="Error in automatic_evalplus function",
default_return=False,
)
def automatic_evalplus(result_path: str = "samples.jsonl") -> bool:
"""
在命令行中自动执行 evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only
"""
@ -138,34 +136,30 @@ def automatic_evalplus(result_path:str ="samples.jsonl"):
sys.executable, # 使用当前 Python 解释器
"-m",
"evalplus.evaluate",
"--dataset", "humaneval",
"--samples", result_path,
"--parallel", "2",
"--base-only"
"--dataset",
"humaneval",
"--samples",
result_path,
"--parallel",
"2",
"--base-only",
]
try:
result = subprocess.run(command, check=True, capture_output=True, text=True)
print("输出:", result.stdout)
return True
except subprocess.CalledProcessError as e:
print("错误输出:", e.stderr)
return False
def extract_failure_tests(file_path:str = "samples_eval_results.json"):
with open(file_path, 'r') as f:
task_results = json.load(f)
result = subprocess.run(command, check=True, capture_output=True, text=True)
logger.info(f"ouptput: \n {result.stdout}")
return True
def extract_failure_tests(file_path: str = "samples_eval_results.json"):
task_results = read_json_file(file_path)
failed_tests = []
for task in task_results['eval'].values():
for task in task_results["eval"].values():
if task[0]["base_status"] == "fail":
failed_test = {
"task_id": task[0]["task_id"],
# "solution": task["solution"],
# "fail_tests": task["base_fail_tests"]
}
failed_tests.append(failed_test)
print(len(failed_tests))
logger.info(f"length of failed tests: {len(failed_tests)}")
return failed_tests

View file

@ -3,26 +3,41 @@
# @Author : didi
# @Desc : graph & an instance - humanevalgraph
from metagpt.llm import LLM
from typing import List
from examples.ags.w_action_node.operator import Generate, GenerateCode, GenerateCodeBlock, Review, Revise, FuEnsemble, MdEnsemble, DbEnsemble, Rephrase, Test
from examples.ags.w_action_node.utils import extract_test_cases_from_jsonl
from evalplus.data import get_human_eval_plus
from examples.ags.w_action_node.operator import (
FuEnsemble,
Generate,
GenerateCode,
GenerateCodeBlock,
MdEnsemble,
Rephrase,
Review,
Revise,
Test,
)
from examples.ags.w_action_node.utils import extract_test_cases_from_jsonl
from metagpt.llm import LLM
class Graph:
def __init__(self, name:str, llm:LLM) -> None:
def __init__(self, name: str, llm: LLM) -> None:
self.name = name
self.model = llm
self.model = llm
def __call__():
NotImplementedError("Subclasses must implement __call__ method")
def optimize(dataset:List):
def optimize(dataset: List):
pass
class HumanEvalGraph(Graph):
def __init__(self, name:str, llm: LLM, criteria:str, vote_count:int =5) -> None:
def __init__(self, name: str, llm: LLM, criteria: str, vote_count: int = 5) -> None:
super().__init__(name, llm)
self.criteria = criteria # TODO 自动构建图时,图的初始参数与图所使用的算子要求的外部参数相匹配
self.criteria = criteria # TODO 自动构建图时,图的初始参数与图所使用的算子要求的外部参数相匹配
self.generate_code = GenerateCode(llm=llm)
self.generate_code_block = GenerateCodeBlock(llm=llm)
self.review = Review(llm=llm, criteria=criteria)
@ -32,84 +47,82 @@ class HumanEvalGraph(Graph):
self.fuensemble = FuEnsemble(llm=llm)
self.mdensemble = MdEnsemble(llm=llm, vote_count=vote_count)
async def __call__(self, problem:str, ensemble_count:int = 3):
async def __call__(self, problem: str, ensemble_count: int = 3):
solution_list = []
for _ in range(ensemble_count):
for retry_count in range(5):
try:
# solution = await self.generate_code(problem)
solution = await self.generate_code_block(problem)
solution = solution.get('code_solution')
solution_list.append(solution)
break
except Exception as e:
print(e)
solution = await self.generate_code_block(problem)
solution = solution.get("code_solution")
solution_list.append(solution)
solution = await self.mdensemble("code", solution_list, problem)
return solution
async def alpha_codium(self, problem_id:str, problem:str, ensemble_count:int = 3):
async def alpha_codium(self, problem_id: str, problem: str, ensemble_count: int = 3):
"""
Paper: Code Generation with AlphaCodium: From Prompt Engineering to Flow Engineering
Link: https://arxiv.org/abs/2404.14963
Flow: An incomplete version of alpha codium, implementing the basic process of rephrase -> code ensemble -> tes
"""
test_cases = extract_test_cases_from_jsonl(problem_id)
entry_point = get_human_eval_plus()[problem_id]['entry_point']
rephrase_problem = await self.rephrase(problem) # 在rephrase 中拼接原始的问题描述
entry_point = get_human_eval_plus()[problem_id]["entry_point"]
rephrase_problem = await self.rephrase(problem) # 在rephrase 中拼接原始的问题描述
solution_list = []
for _ in range(ensemble_count):
for retry_count in range(5):
try:
solution = await self.generate_code_block.rephrase_generate(problem, rephrase_problem, function_name=entry_point)
solution = solution.get('code_solution')
solution_list.append(solution)
break
except Exception as e:
print(e)
solution = await self.generate_code_block.rephrase_generate(
problem, rephrase_problem, function_name=entry_point
)
solution = solution.get("code_solution")
solution_list.append(solution)
solution = await self.mdensemble("code", solution_list, problem)
solution = await self.tester(problem_id, problem, rephrase_problem, solution, test_cases)
return solution
async def review_revise_ensemble(self, problem:str, ensemble_count:int = 2):
async def review_revise_ensemble(self, problem: str, ensemble_count: int = 2, revise_round: int = 3):
solution_list = []
for _ in range(ensemble_count):
solution = await self.single_solve(problem, 3)
solution = await self.single_solve(problem, revise_round)
solution_list.append(solution)
solution = await self.ensemble(solution_list, problem)
return solution
async def simple_ensemble(self, problem:str, ensemble_count:int = 3):
async def simple_ensemble(self, problem: str, ensemble_count: int = 3):
solution_list = []
for _ in range(ensemble_count):
solution = await self.generate_code(problem)
# solution = await self.generate_code_block(problem)
solution = solution.get('code_solution')
solution = solution.get("code_solution")
solution_list.append(solution)
solution = await self.fuensemble(solution_list, problem)
return solution
async def single_solve(self, problem:str, max_loop:int):
async def single_solve(self, problem: str, max_loop: int):
solution = await self.generate_code(problem)
solution = solution.get('code_solution')
solution = solution.get("code_solution")
for _ in range(max_loop):
review_feedback = await self.review(problem, solution)
if review_feedback['review_result']:
if review_feedback["review_result"]:
break
solution = await self.revise(problem, solution, review_feedback['feedback'])
solution = solution.get('revised_solution')
solution = await self.revise(problem, solution, review_feedback["feedback"])
solution = solution.get("revised_solution")
return solution
class Gsm8kGraph(Graph):
def __init__(self, name:str, llm: LLM) -> None:
def __init__(self, name: str, llm: LLM) -> None:
super().__init__(name, llm)
self.generate = Generate(llm=llm)
self.rephrase = Rephrase(llm=llm)
async def __call__(self, problem:str):
async def __call__(self, problem: str):
solution = self.generate(problem)
return solution
class HotpotQAGraph(Graph):
def __init__(self, name:str, llm: LLM) -> None:
def __init__(self, name: str, llm: LLM) -> None:
super().__init__(name, llm)
self.generate = Generate(llm=llm)
self.rephrase = Rephrase(llm=llm)
async def __call__(self, problem:str):
async def __call__(self, problem: str):
solution = self.generate(problem)
return solution
return solution

View file

@ -3,30 +3,60 @@
# @Author : didi
# @Desc : operator demo of ags
import ast
import random
import sys
import traceback
import random
from typing import List, Tuple, Any, Dict
from collections import Counter
from typing import Dict, List, Tuple
from metagpt.actions.action_node import ActionNode
from metagpt.llm import LLM
from tenacity import retry, stop_after_attempt
from examples.ags.w_action_node.operator_an import GenerateOp, GenerateCodeOp, GenerateCodeBlockOp ,ReviewOp, ReviseOp, FuEnsembleOp, MdEnsembleOp, ReflectionTestOp, RephraseOp
from examples.ags.w_action_node.prompt import GENERATE_PROMPT, GENERATE_CODE_PROMPT, GENERATE_CODEBLOCK_PROMPT, REVIEW_PROMPT, REVISE_PROMPT, FU_ENSEMBLE_PROMPT, MD_ENSEMBLE_PROMPT, REFLECTION_ON_PUBILIC_TEST_PROMPT, REPHRASE_ON_PROBLEM_PROMPT, GENERATE_CODEBLOCK_REPHRASE_PROMPT
from examples.ags.w_action_node.prompt import DE_ENSEMBLE_CODE_FORMAT_PROMPT, DE_ENSEMBLE_TXT_FORMAT_PROMPT, DE_ENSEMBLE_ANGEL_PROMPT, DE_ENSEMBLE_DEVIL_PROMPT, DE_ENSEMBLE_JUDGE_UNIVERSAL_PROMPT, DE_ENSEMBLE_JUDGE_FINAL_PROMPT
from examples.ags.w_action_node.operator_an import (
FuEnsembleOp,
GenerateCodeBlockOp,
GenerateCodeOp,
GenerateOp,
MdEnsembleOp,
ReflectionTestOp,
RephraseOp,
ReviewOp,
ReviseOp,
)
from examples.ags.w_action_node.prompt import (
DE_ENSEMBLE_ANGEL_PROMPT,
DE_ENSEMBLE_CODE_FORMAT_PROMPT,
DE_ENSEMBLE_DEVIL_PROMPT,
DE_ENSEMBLE_JUDGE_FINAL_PROMPT,
DE_ENSEMBLE_JUDGE_UNIVERSAL_PROMPT,
DE_ENSEMBLE_TXT_FORMAT_PROMPT,
FU_ENSEMBLE_PROMPT,
GENERATE_CODE_PROMPT,
GENERATE_CODEBLOCK_PROMPT,
GENERATE_CODEBLOCK_REPHRASE_PROMPT,
GENERATE_PROMPT,
MD_ENSEMBLE_PROMPT,
REFLECTION_ON_PUBLIC_TEST_PROMPT,
REPHRASE_ON_PROBLEM_PROMPT,
REVIEW_PROMPT,
REVISE_PROMPT,
)
from examples.ags.w_action_node.utils import test_cases_2_test_functions
from metagpt.actions.action_node import ActionNode
from metagpt.llm import LLM
from metagpt.logs import logger
class Operator:
def __init__(self, name, llm:LLM):
def __init__(self, name, llm: LLM):
self.name = name
self.llm = llm
def __call__(self, *args, **kwargs):
raise NotImplementedError
class Generate(Operator):
def __init__(self, name:str ="Generator", llm: LLM = LLM()):
def __init__(self, name: str = "Generate", llm: LLM = LLM()):
super().__init__(name, llm)
async def __call__(self, problem_description):
@ -34,10 +64,10 @@ class Generate(Operator):
node = await ActionNode.from_pydantic(GenerateOp).fill(context=prompt, llm=self.llm)
response = node.instruct_content.model_dump()
return response
class GenerateCode(Operator):
def __init__(self, name:str ="Coder", llm: LLM = LLM()):
class GenerateCode(Operator):
def __init__(self, name: str = "GenerateCode", llm: LLM = LLM()):
super().__init__(name, llm)
async def __call__(self, problem_description):
@ -45,39 +75,49 @@ class GenerateCode(Operator):
node = await ActionNode.from_pydantic(GenerateCodeOp).fill(context=prompt, llm=self.llm)
response = node.instruct_content.model_dump()
return response
class GenerateCodeBlock(Operator):
def __init__(self, name:str ="Coder", llm: LLM = LLM()):
class GenerateCodeBlock(Operator):
def __init__(self, name: str = "GenerateCodeBlock", llm: LLM = LLM()):
super().__init__(name, llm)
@retry(stop=stop_after_attempt(3))
async def __call__(self, problem_description, function_name):
prompt = GENERATE_CODEBLOCK_PROMPT.format(problem_description=problem_description)
node = await ActionNode.from_pydantic(GenerateCodeBlockOp).fill(context=prompt, llm=self.llm, mode='code_fill',function_name=function_name)
node = await ActionNode.from_pydantic(GenerateCodeBlockOp).fill(
context=prompt, llm=self.llm, mode="code_fill", function_name=function_name
)
response = node.instruct_content.model_dump()
return response
@retry(stop=stop_after_attempt(3))
async def rephrase_generate(self, problem_description, rephrase_problem, function_name):
prompt = GENERATE_CODEBLOCK_REPHRASE_PROMPT.format(problem_description=problem_description,rephrase_problem=rephrase_problem)
node = await ActionNode.from_pydantic(GenerateCodeBlockOp).fill(context=prompt, llm=self.llm, mode='code_fill', function_name=function_name)
prompt = GENERATE_CODEBLOCK_REPHRASE_PROMPT.format(
problem_description=problem_description, rephrase_problem=rephrase_problem
)
node = await ActionNode.from_pydantic(GenerateCodeBlockOp).fill(
context=prompt, llm=self.llm, mode="code_fill", function_name=function_name
)
response = node.instruct_content.model_dump()
return response
class Review(Operator):
def __init__(self, criteria, name:str ="Reviewer", llm: LLM = LLM()):
def __init__(self, criteria, name: str = "Review", llm: LLM = LLM()):
self.criteria = criteria
super().__init__(name, llm)
async def __call__(self, problem_description, solution):
prompt = REVIEW_PROMPT.format(problem_description=problem_description, solution=solution, criteria=self.criteria)
prompt = REVIEW_PROMPT.format(
problem_description=problem_description, solution=solution, criteria=self.criteria
)
node = await ActionNode.from_pydantic(ReviewOp).fill(context=prompt, llm=self.llm)
response = node.instruct_content.model_dump()
return response
class Revise(Operator):
def __init__(self, name:str ="Reviser", llm: LLM = LLM()):
class Revise(Operator):
def __init__(self, name: str = "Revise", llm: LLM = LLM()):
super().__init__(name, llm)
async def __call__(self, problem_description, solution, feedback):
@ -86,12 +126,16 @@ class Revise(Operator):
response = node.instruct_content.model_dump()
return response
class FuEnsemble(Operator):
def __init__(self, name:str ="FuseEnsembler", llm: LLM = LLM()):
class FuEnsemble(Operator):
"""
Function: Critically evaluating multiple solution candidates, synthesizing their strengths, and developing an enhanced, integrated solution.
"""
def __init__(self, name: str = "FuEnsemble", llm: LLM = LLM()):
super().__init__(name, llm)
async def __call__(self, solutions:List, problem_description):
async def __call__(self, solutions: List, problem_description):
solution_text = ""
for solution in solutions:
solution_text += str(solution) + "\n"
@ -99,13 +143,18 @@ class FuEnsemble(Operator):
node = await ActionNode.from_pydantic(FuEnsembleOp).fill(context=prompt, llm=self.llm)
response = node.instruct_content.model_dump()
return response
class MdEnsemble(Operator):
def __init__(self, name:str ="MedEnsembler", llm: LLM = LLM(), vote_count:int=3):
class MdEnsemble(Operator):
"""
Paper: Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine
Link: https://arxiv.org/abs/2311.16452
"""
def __init__(self, name: str = "MdEnsemble", llm: LLM = LLM(), vote_count: int = 3):
super().__init__(name, llm)
self.vote_count = vote_count
@staticmethod
def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, str]]:
shuffled_solutions = solutions.copy()
@ -113,8 +162,7 @@ class MdEnsemble(Operator):
answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)}
return shuffled_solutions, answer_mapping
async def __call__(self, solution_type:str ,solutions:List[str], problem_description:str):
print(solutions)
async def __call__(self, solution_type: str, solutions: List[str], problem_description: str):
all_responses = []
# 当Ensmeble方案是Code类型时我们使用AST进行去重
if solution_type == "code":
@ -125,7 +173,7 @@ class MdEnsemble(Operator):
try:
tree = ast.parse(solution)
structure_key = ast.dump(tree, annotate_fields=False, include_attributes=False)
if structure_key not in unique_structures:
unique_structures[structure_key] = solution
updated_solutions.append(solution)
@ -136,54 +184,52 @@ class MdEnsemble(Operator):
updated_length = len(solutions)
if updated_length == 1:
return {"final_solution": solutions[0]}
for _ in range(self.vote_count):
shuffled_solutions, answer_mapping = self.shuffle_answers(solutions)
solution_text = ""
for index, solution in enumerate(shuffled_solutions):
solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"
prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
node = await ActionNode.from_pydantic(MdEnsembleOp).fill(context=prompt, llm=self.llm)
response = node.instruct_content.model_dump()
answer = response.get('solution_letter', '')
answer = response.get("solution_letter", "")
answer = answer.strip().upper()
if answer in answer_mapping:
original_index = answer_mapping[answer]
# print(f"original index: {original_index}")
all_responses.append(original_index)
most_frequent_index = Counter(all_responses).most_common(1)[0][0]
final_answer = solutions[most_frequent_index]
return {"final_solution": final_answer}
class ScEnsemble(Operator):
"""
self consistency ensemble
Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
Link: https://arxiv.org/abs/2203.11171
"""
pass
class DbEnsemble(Operator):
class MADEnsemble(Operator):
"""
(Should we be going MAD? A Look at Multi-Agent Debate Strategies for LLMs)
The system is a multi-round debate system where each agent is given the
question and responses generated by all agents. For each round, a judge
analyzes the responses provided determines whether to terminate the
debate or keep going. At the end of the debate the judge is also responsible
for determining the final answer.
Paper: Should we be going MAD? A Look at Multi-Agent Debate Strategies for LLMs
Link: https://arxiv.org/abs/2311.17371
"""
def __init__(self, name:str ="DebateEnsemble", llm: LLM = LLM()):
def __init__(self, name: str = "DebateEnsemble", llm: LLM = LLM()):
super().__init__(name, llm)
self.agents = ["angel","devil","judge"]
self.format_requirements = {
"txt":DE_ENSEMBLE_TXT_FORMAT_PROMPT,
"code":DE_ENSEMBLE_CODE_FORMAT_PROMPT
}
def get_system_prompt(self, name:str, mode:str='txt'):
self.agents = ["angel", "devil", "judge"]
self.format_requirements = {"txt": DE_ENSEMBLE_TXT_FORMAT_PROMPT, "code": DE_ENSEMBLE_CODE_FORMAT_PROMPT}
def get_system_prompt(self, name: str, mode: str = "txt"):
if name == "angel":
if mode == "code":
return DE_ENSEMBLE_ANGEL_PROMPT + "\n" + DE_ENSEMBLE_CODE_FORMAT_PROMPT
@ -194,10 +240,10 @@ class DbEnsemble(Operator):
return DE_ENSEMBLE_DEVIL_PROMPT + "\n" + DE_ENSEMBLE_TXT_FORMAT_PROMPT
elif name == "judge":
if mode == "final":
return DE_ENSEMBLE_JUDGE_FINAL_PROMPT
return DE_ENSEMBLE_JUDGE_FINAL_PROMPT
return DE_ENSEMBLE_JUDGE_UNIVERSAL_PROMPT
def construct_messages(self, message_history_with_name, name, mode:str="txt", phase:str="universal"):
def construct_messages(self, message_history_with_name, name, mode: str = "txt", phase: str = "universal"):
"""
基于name与mode来构建system message.
基于name来构建messages
@ -210,67 +256,63 @@ class DbEnsemble(Operator):
elif name == "judge":
messages = self._construct_judge(message_history_with_name, mode, messages)
return messages
def _construct_debate(self, message_history_with_name, name, messages):
user_message = ""
for message in message_history_with_name:
if message["name"] == "Judge":
continue
elif message["name"] == name:
if user_message:
messages.append({
"role": "user",
"name": "user",
"content": user_message.strip("\n"),
})
messages.append({
"role": "assistant",
"name": name,
"content": message["content"],
})
messages.append(
{
"role": "user",
"name": "user",
"content": user_message.strip("\n"),
}
)
messages.append(
{
"role": "assistant",
"name": name,
"content": message["content"],
}
)
user_message = ""
else:
user_message += message["content"]
if user_message:
messages.append({
"role": "user",
"name": "user",
"content": user_message.strip("\n"),
})
messages.append(
{
"role": "user",
"name": "user",
"content": user_message.strip("\n"),
}
)
return messages
def _construct_judge(self, message_history_with_name, mode, messages):
pass
async def debate_answer(self, message_history:List, role:str="angel"):
async def debate_answer(self, message_history: List, role: str = "angel"):
messages = self.construct_messages(message_history, role)
response = await self.llm.acompletion_text(messages=messages)
message_history.append({
"role":"user",
"name":role,
"content":response}
)
message_history.append({"role": "user", "name": role, "content": response})
return message_history, response
async def judge_answer(self, message_history:List, phase:str="universal"):
async def judge_answer(self, message_history: List, phase: str = "universal"):
messages = self.construct_messages(message_history, "judge", phase=phase)
response = await self.llm.acompletion_text(messages=messages)
message_history.append({
"role": "user",
"name": "judge",
"content": response}
)
message_history.append({"role": "user", "name": "judge", "content": response})
return message_history, response
async def __call__(self, origin_solution:str, problem_description:str, max_round:int = 3, mode:str='txt'):
async def __call__(self, origin_solution: str, problem_description: str, max_round: int = 3, mode: str = "txt"):
# 思路输入一个原始答案构建一个agent代表这个答案进行辩论另一个agentdevil使用debate llm的内容进行辩论法官在每一轮次做出决定是否终止到了maxround还没终止就由法官进行总结。
message_history_with_name = [
{"role":"user", "name":"angel", "content":origin_solution}
]
message_history_with_name = [{"role": "user", "name": "angel", "content": origin_solution}]
for index in range(max_round):
for agent in self.agents:
if agent == "angel":
@ -280,91 +322,108 @@ class DbEnsemble(Operator):
elif agent == "devil":
message_history_with_name, rsp = self.debate_answer(message_history_with_name, role="devil")
elif agent == "judge":
message_history_with_name, judge_result = self.judge_answer(message_history_with_name, phase="universal")
message_history_with_name, judge_result = self.judge_answer(
message_history_with_name, phase="universal"
)
if not judge_result["is_debating"]:
"""
这里需要在 self.judge_answer 中设置一个自动给出solution的地方
"""
return {"final_solution":judge_result["final_solution"]}
message_history_with_name.pop(-1)
message_history_with_name, judge_answer = self.judge_answer(message_history_with_name, phase="final")
return {"final_solution": judge_result["final_solution"]}
message_history_with_name.pop(-1)
message_history_with_name, judge_answer = self.judge_answer(message_history_with_name, phase="final")
return {"final_solution": judge_answer["debate_answer"]}
return {"final_solution":judge_answer["debate_answer"]}
class Rephrase(Operator):
"""
1. AlphaCodium
2. https://arxiv.org/abs/2404.14963
Paper: Code Generation with AlphaCodium: From Prompt Engineering to Flow Engineering
Link: https://arxiv.org/abs/2404.14963
Paper: Achieving >97% on GSM8K: Deeply Understanding the Problems Makes LLMs Better Solvers for Math Word Problems
Link: https://arxiv.org/abs/2404.14963
"""
def __init__(self, name:str ="Rephraser", llm: LLM = LLM()):
def __init__(self, name: str = "Rephrase", llm: LLM = LLM()):
super().__init__(name, llm)
async def __call__(self, problem_description:str)->str:
async def __call__(self, problem_description: str) -> str:
prompt = REPHRASE_ON_PROBLEM_PROMPT.format(problem_description=problem_description)
node = await ActionNode.from_pydantic(RephraseOp).fill(context=prompt, llm=self.llm)
response = node.instruct_content.model_dump()
return response["rephrased_problem"]
class Test(Operator):
def __init__(self, name:str ="Tester", llm: LLM = LLM()):
def __init__(self, name: str = "Test", llm: LLM = LLM()):
super().__init__(name, llm)
def exec_code(self, solution, test_cases, problem_id):
# TODO
# TODO
# 1. 获取更加详细的Test error信息
# 2. 更换Public Test数据集当前使用的数据存在Label Leak(使用的Reflexion的数据集)
# 3. 实现单独测试每一个test case -> 1
# 2. 更换Public Test数据集当前使用的数据存在Label Leak(使用的Reflexion的数据集) -> 这个问题使用LLM抽取解决直接生成为assert代码串
# 3. 实现单独测试每一个test case -> 1
solution = solution["final_solution"]
test_code = test_cases_2_test_functions(solution, test_cases)
print("test_code", test_code)
try:
exec(test_code, globals())
except AssertionError as e:
exc_type, exc_value, exc_traceback = sys.exc_info()
tb_str = traceback.format_exception(exc_type, exc_value, exc_traceback)
with open("tester.txt", "a") as f:
f.write("test_error" +problem_id + "\n")
error_infomation = {"test_fail_case": {
"error_type": "AssertionError",
"error_message": str(e),
"traceback": tb_str
}}
print("error here", error_infomation)
f.write("test_error" + problem_id + "\n")
error_infomation = {
"test_fail_case": {"error_type": "AssertionError", "error_message": str(e), "traceback": tb_str}
}
logger.info(f"test error: {error_infomation}")
return error_infomation
except Exception as e:
with open("tester.txt", "a") as f:
f.write(problem_id + "\n")
return {"exec_fail_case":str(e)}
return {"exec_fail_case": str(e)}
return []
async def __call__(self, problem_id, problem, rephrase_problem, solution, test_cases):
result = self.exec_code(solution, test_cases, problem_id)
print("result here", result)
if result == []:
return solution
elif "exec_fail_case" in result:
result = result["exec_fail_case"]
prompt = REFLECTION_ON_PUBILIC_TEST_PROMPT.format(problem_description=problem, rephrase_problem=rephrase_problem, code_solution=solution, exec_pass=f"executed unsuccessfully, error: \n {result}", test_fail="executed unsucessfully")
prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format(
problem_description=problem,
rephrase_problem=rephrase_problem,
code_solution=solution,
exec_pass=f"executed unsuccessfully, error: \n {result}",
test_fail="executed unsucessfully",
)
node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm)
response = node.instruct_content.model_dump()
return {"final_solution":response["refined_solution"]}
return {"final_solution": response["refined_solution"]}
else:
result = result["test_fail_case"]
prompt = REFLECTION_ON_PUBILIC_TEST_PROMPT.format(problem_description=problem, rephrase_problem=rephrase_problem, code_solution=solution, exec_pass="executed successfully", test_fail=result)
prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format(
problem_description=problem,
rephrase_problem=rephrase_problem,
code_solution=solution,
exec_pass="executed successfully",
test_fail=result,
)
node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm)
response = node.instruct_content.model_dump()
return {"final_solution":response["refined_solution"]}
return {"final_solution": response["refined_solution"]}
class FindFact(Operator):
pass
def __init__(self, name: str = "FindFact", llm: LLM = LLM()):
super().__init__(name, llm)
class SelfAsk(Operator):
pass
def __init__(self, name: str = "SelfAsk", llm: LLM = LLM()):
super().__init__(name, llm)
class Verify(Operator):
"""
? 还没有想好
"""
pass
def __init__(self, name: str = "Verify", llm: LLM = LLM()):
super().__init__(name, llm)

View file

@ -5,26 +5,42 @@
from pydantic import BaseModel, Field
class GenerateOp(BaseModel):
solution: str = Field(default="", description="Your Solution for this problem")
class GenerateCodeOp(BaseModel):
code_solution: str = Field(default="", description="Complete and correct code here.")
class GenerateCodeBlockOp(BaseModel):
code_solution: str = Field(default="", description="Your complete code solution for this problem")
class ReviewOp(BaseModel):
review_result: bool = Field(default=False, description="The Review Result (Bool). If you think this solution looks good for you, return 'true'; If not, return 'false'")
feedback: str = Field(default="", description="Your FeedBack for this problem based on the criteria. If the review result is true, you can put it 'nothing here'.")
review_result: bool = Field(
default=False,
description="The Review Result (Bool). If you think this solution looks good for you, return 'true'; If not, return 'false'",
)
feedback: str = Field(
default="",
description="Your FeedBack for this problem based on the criteria. If the review result is true, you can put it 'nothing here'.",
)
class ReviseOp(BaseModel):
revised_solution: str = Field(default="", description="Based on the feedback, revised solution for this problem")
class FuEnsembleOp(BaseModel):
thought: str = Field(default="", description="Analyze the solutions and think how to combine the advantages of various solutions to form the best possible solution.")
thought: str = Field(
default="",
description="Analyze the solutions and think how to combine the advantages of various solutions to form the best possible solution.",
)
final_solution: str = Field(default="", description="Output the final solution after analysis and integration")
class MdEnsembleOp(BaseModel):
thought: str = Field(
default="""Example thought process:
@ -35,22 +51,30 @@ class MdEnsembleOp(BaseModel):
5. The use of 'isinstance' for type checking is a good practice.
6. The function handles decimal separators well by replacing ',' with '.'.
Overall, this solution effectively solves the problem of comparing two values, with good error handling and flexibility. It could be improved by specifying behavior for equal values, but it's a strong solution as is.""",
description="Step-by-step analysis of the solutions to determine the best one."
)
solution_letter: str = Field(
default="",
description="The letter of the chosen best solution (only one letter)."
description="Step-by-step analysis of the solutions to determine the best one.",
)
solution_letter: str = Field(default="", description="The letter of the chosen best solution (only one letter).")
class TestCaseExtractOp(BaseModel):
test_cases: list = Field(default=[('<function name>', [5, 8, 7, 1], 12), ('<function name>', [3, 3, 3, 3, 3], 9)],
description="Extracted test cases from the problem description")
test_cases: list = Field(
default=[
"assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True",
"assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False",
"",
],
description="Extracted test cases from the problem description",
)
class RephraseOp(BaseModel):
rephrased_problem: str = Field(default="", description="Rephrased problem description for this problem")
class ReflectionTestOp(BaseModel):
reflection: str = Field(default="", description="对关于代码执行错误或者测试用例失败step by step的思考")
refined_solution: str = Field(default="", description="对于代码执行错误或者测试用例失败的修正方案")
reflection: str = Field(
default="", description="Step-by-step reflection on code execution errors or test case failures"
)
refined_solution: str = Field(
default="", description="Corrective solution for code execution errors or test case failures"
)

View file

@ -10,19 +10,19 @@ Generate Solution for the following problem: {problem_description}
GENERATE_CODE_PROMPT = """
You are an expert programmer tasked with solving a coding problem.
### Problem Description:
### Problem Description
{problem_description}
### Instructions:
### Instructions
The above is an incomplete Python code fragment. Return the complete and correct code with no additional text.
Please maintain the JSON format in your response.
### Your Response:
### Your Response
"""
GENERATE_CODEBLOCK_REPHRASE_PROMPT = """
Please provide a self-contained Python script that solves the following problem in a markdown code block:
### Problem Description:
### Problem Description
{problem_description}
### self reflection on the problem
@ -35,7 +35,7 @@ When creating your solution:
4. Avoid adding additional test cases beyond those provided in the problem description.
"""
GENERATE_CODEBLOCK_PROMPT ="""
GENERATE_CODEBLOCK_PROMPT = """
Please provide a self-contained Python script that solves the following problem in a markdown code block:
{problem_description}
@ -99,10 +99,10 @@ Please strictly output in JSON format, do not output irrelevant content. """
DE_ENSEMBLE_CODE_FORMAT_PROMPT = """
Now please output your answer in json format, with the format as follows:
{{
"reason":"<为什么要这样做>",
"code_solution":"<你觉得合适的solution用代码表示出来>"
}}
{
"reason":"<why do it this way>",
"code_solution":"<the solution you think is appropriate, expressed in code>"
}
Please strictly output in JSON format, do not output irrelevant content. """
DE_ENSEMBLE_ANGEL_PROMPT = """
@ -131,18 +131,6 @@ You, as the moderator, will evaluate both sides' answers and determine if there
Please strictly output in JSON format, do not output irrelevant content
"""
EXTRACT_CASE_PROMPT = """
You are given a coding problem, and you need to extract the test cases from the problem description.
{problem_description}
一个problem中会有多个测试用例每个测试用例包含三个部分
1. 函数名
2. 输入
3. 期望输出
每个测试用例包裹在一个三元组之中三元组之间用逗号分隔整体用列表包裹
由于结果需要被解析到JSON中True与False请表示为true, false;
"""
REPHRASE_ON_PROBLEM_PROMPT = """
You are given a code contest problem:
@ -155,26 +143,26 @@ Reflect on the problem, and describe it in your own words, in bullet points. Pay
"""
REFLECTION_ON_PUBILIC_TEST_PROMPT = """
REFLECTION_ON_PUBLIC_TEST_PROMPT = """
You are given a code contest problem, and a self-reflection on the problem:
### problem
{problem_description}
### self reflection on the problem
{rephrase_problem}
=======================
A Python code solution was generated for the problem:
### Code Solution
{code_solution}
=======================
This section of the code execution result is
### Execution Result
{exec_pass}
=======================
However, when running the following input example, the code solution above failed to produce the expected output:
#### Failed Test Case
{test_fail}
@ -182,4 +170,31 @@ However, when running the following input example, the code solution above faile
Your goal is to analyze the code solution and the error, and propose a fixed code which will produce the expected output for the provided test input.
The fixed code should keep the solution robust, and work for all other input examples as well.
Make sure the fixed code has a reasonable runtime - less than three seconds on a modern computer, given the problem constraints for large input.
"""
"""
EXTRACT_CASE_PROMPT = """
You are given a coding problem, and you need to extract the test cases from the problem description.
## Problem Description
{problem_description}
Your task is to extract test cases from the above description and convert them into Python assert statements (as strings). These statements should be returned in a list for testing purposes.
Example:
Input:
>>> has_close_elements([1.0, 2.0, 3.0], 0.5)
False
>>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
True
Output:
[
"assert candidate([1.0, 2.0, 3.0], 0.5) == False",
"assert candidate([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3) == True"
]
Please ensure that:
1. Each test case is converted to a separate assert statement.
2. The function name in the original example (e.g., 'has_close_elements') is replaced with 'candidate'.
3. The assert statements are returned as strings in a list.
"""

View file

@ -3,38 +3,42 @@
# @Author : didi
# @Desc : utils for experiment
import ast
import json
import re
import ast
from typing import List, Dict, Any, Tuple
from metagpt.llm import LLM
from metagpt.actions.action_node import ActionNode
from typing import Any, List, Tuple
from examples.ags.w_action_node.operator_an import TestCaseExtractOp
from examples.ags.w_action_node.prompt import EXTRACT_CASE_PROMPT
from metagpt.actions.action_node import ActionNode
from metagpt.llm import LLM
def extract_task_id(task_id: str) -> int:
"""Extract the numeric part of the task_id."""
match = re.search(r'/(\d+)', task_id)
match = re.search(r"/(\d+)", task_id)
return int(match.group(1)) if match else 0
def jsonl_ranker(input_file: str, output_file: str):
def sort_json_by_task_id(input_file: str, output_file: str):
"""
Read a JSONL file, sort the entries based on task_id, and write to a new JSONL file.
:param input_file: Path to the input JSONL file
:param output_file: Path to the output JSONL file
"""
# Read and parse the JSONL file
with open(input_file, 'r') as f:
with open(input_file, "r") as f:
data = [json.loads(line) for line in f]
# Sort the data based on the numeric part of task_id
sorted_data = sorted(data, key=lambda x: extract_task_id(x['task_id']))
sorted_data = sorted(data, key=lambda x: extract_task_id(x["task_id"]))
# Write the sorted data to a new JSONL file
with open(output_file, 'w') as f:
with open(output_file, "w") as f:
for item in sorted_data:
f.write(json.dumps(item) + '\n')
f.write(json.dumps(item) + "\n")
def parse_python_literal(s):
try:
@ -42,7 +46,8 @@ def parse_python_literal(s):
except (ValueError, SyntaxError):
return s
def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test_reflexion.jsonl"):
def extract_test_cases_from_jsonl(problem_id: str, file_path: str = "public_test_reflexion.jsonl"):
# 保留原有的硬编码测试用例
hardcoded_cases = {
"HumanEval/32": "",
@ -55,7 +60,7 @@ def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test_ref
return hardcoded_cases[problem_id]
# 如果没有硬编码的测试用例,从文件中读取
with open(file_path, 'r') as file:
with open(file_path, "r") as file:
for line in file:
data = json.loads(line)
if data.get("id") == problem_id:
@ -63,60 +68,63 @@ def extract_test_cases_from_jsonl(problem_id:str, file_path:str="public_test_ref
return None # 如果没有找到问题,返回 None
def extract_test_cases(docstring: str) -> List[Tuple[str, List[Any], Any]]:
# 使用正则表达式匹配测试用例,现在捕获函数名和任意输出
pattern = r'>>> (\w+)\((.*?)\)\n\s*(.*?)(?=\n|$)'
pattern = r">>> (\w+)\((.*?)\)\n\s*(.*?)(?=\n|$)"
matches = re.findall(pattern, docstring, re.DOTALL)
test_cases = []
for match in matches:
func_name, input_str, expected_output = match
# 处理输入
input_list = []
for item in input_str.split(','):
for item in input_str.split(","):
item = item.strip()
try:
# 尝试将输入转换为数值类型
if '.' in item:
if "." in item:
input_list.append(float(item))
else:
input_list.append(int(item))
except ValueError:
# 如果无法转换为数值,则保留为字符串
input_list.append(item.strip("'\""))
# 处理输出
try:
# 尝试将输出转换为数值或布尔值
if expected_output.lower() == 'true':
if expected_output.lower() == "true":
expected_output = True
elif expected_output.lower() == 'false':
elif expected_output.lower() == "false":
expected_output = False
elif '.' in expected_output:
elif "." in expected_output:
expected_output = float(expected_output)
else:
expected_output = int(expected_output)
except ValueError:
# 如果无法转换,则保留为字符串
expected_output = expected_output.strip("'\"")
test_cases.append([func_name, input_list, expected_output])
return test_cases
async def llm_extract_test_case(id, problem_description: str, file_path:str="public_test.jsonl"):
async def llm_extract_test_case(id, problem_description: str, file_path: str = "public_test.jsonl"):
prompt = EXTRACT_CASE_PROMPT.format(problem_description=problem_description)
node = await ActionNode.from_pydantic(TestCaseExtractOp).fill(context=prompt, llm=LLM())
result = node.instruct_content.model_dump()
with open(file_path,"a") as f:
f.write(json.dumps({id:result["test_cases"]}) + '\n')
return {id:result["test_cases"]}
with open(file_path, "a") as f:
f.write(json.dumps({id: result["test_cases"]}) + "\n")
return {id: result["test_cases"]}
def test_cases_2_test_functions(solution: str, test_cases: str):
tester_function = f"""
{solution}
{test_cases}
"""
return tester_function
"""
return tester_function