From 2937d9a13a625047cd003267ef53292422f9a1c0 Mon Sep 17 00:00:00 2001 From: didi <84363704+didiforgithub@users.noreply.github.com> Date: Fri, 23 Aug 2024 13:57:35 +0800 Subject: [PATCH] =?UTF-8?q?Update=20=E7=B2=97=E7=B3=99=E7=89=88=E6=9C=AC?= =?UTF-8?q?=20=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/ags/benchmark/hotpotqa.py | 173 ++++++ examples/ags/benchmark/humaneval.py | 48 +- examples/ags/w_action_node/evaluator.py | 30 + examples/ags/w_action_node/graph.py | 132 +---- .../w_action_node/graphs/gsm8k/basic/graph.py | 0 .../graphs/gsm8k/basic/operator.py | 0 .../graphs/gsm8k/basic/prompt.py | 0 .../graphs/gsm8k/handcraft/graph.py | 81 +++ .../graphs/gsm8k/optimized/round_1/graph.py | 0 .../gsm8k/optimized/round_1/operator.py | 0 .../gsm8k/optimized/round_1/operator_an.py | 0 .../graphs/gsm8k/optimized/round_1/prompt.py | 0 .../graphs/hotpotqa/basic/graph.py | 0 .../graphs/hotpotqa/basic/operator.py | 0 .../graphs/hotpotqa/basic/prompt.py | 0 .../w_action_node/graphs/hotpotqa/graph.py | 28 + .../graphs/humaneval/basic/graph.py | 0 .../graphs/humaneval/basic/operator.py | 0 .../graphs/humaneval/basic/prompt.py | 0 .../w_action_node/graphs/humaneval/graph.py | 82 +++ examples/ags/w_action_node/operator.py | 246 +++++--- examples/ags/w_action_node/operator_an.py | 18 +- examples/ags/w_action_node/operator_old.py | 550 ++++++++++++++++++ examples/ags/w_action_node/optimizer.py | 246 ++++++++ examples/ags/w_action_node/prompt.py | 130 ++++- .../w_action_node/prompts/optimize_prompt.py | 35 ++ examples/ags/w_action_node/utils.py | 11 + metagpt/actions/action_node.py | 20 +- metagpt/llm.py | 1 + 29 files changed, 1580 insertions(+), 251 deletions(-) create mode 100644 examples/ags/benchmark/hotpotqa.py create mode 100644 examples/ags/w_action_node/evaluator.py create mode 100644 examples/ags/w_action_node/graphs/gsm8k/basic/graph.py create mode 100644 
examples/ags/w_action_node/graphs/gsm8k/basic/operator.py create mode 100644 examples/ags/w_action_node/graphs/gsm8k/basic/prompt.py create mode 100644 examples/ags/w_action_node/graphs/gsm8k/handcraft/graph.py create mode 100644 examples/ags/w_action_node/graphs/gsm8k/optimized/round_1/graph.py create mode 100644 examples/ags/w_action_node/graphs/gsm8k/optimized/round_1/operator.py create mode 100644 examples/ags/w_action_node/graphs/gsm8k/optimized/round_1/operator_an.py create mode 100644 examples/ags/w_action_node/graphs/gsm8k/optimized/round_1/prompt.py create mode 100644 examples/ags/w_action_node/graphs/hotpotqa/basic/graph.py create mode 100644 examples/ags/w_action_node/graphs/hotpotqa/basic/operator.py create mode 100644 examples/ags/w_action_node/graphs/hotpotqa/basic/prompt.py create mode 100644 examples/ags/w_action_node/graphs/hotpotqa/graph.py create mode 100644 examples/ags/w_action_node/graphs/humaneval/basic/graph.py create mode 100644 examples/ags/w_action_node/graphs/humaneval/basic/operator.py create mode 100644 examples/ags/w_action_node/graphs/humaneval/basic/prompt.py create mode 100644 examples/ags/w_action_node/graphs/humaneval/graph.py create mode 100644 examples/ags/w_action_node/operator_old.py create mode 100644 examples/ags/w_action_node/optimizer.py create mode 100644 examples/ags/w_action_node/prompts/optimize_prompt.py diff --git a/examples/ags/benchmark/hotpotqa.py b/examples/ags/benchmark/hotpotqa.py new file mode 100644 index 000000000..38449e6ea --- /dev/null +++ b/examples/ags/benchmark/hotpotqa.py @@ -0,0 +1,173 @@ +import asyncio +import json +import os +import re +import string +from typing import Literal, Optional + +import aiofiles + +from examples.ags.w_action_node.graph import HotpotQAGraph +from examples.ags.w_action_node.operator import Format, GenerateOnContext +from examples.ags.w_action_node.utils import get_hotpotqa +from metagpt.llm import LLM +from metagpt.logs import logger + +HOTPOTQA_PATH = 
"hotpotqa_1000.jsonl" + + +def sort_json_by_key(input_path, output_path): + with open(input_path) as f: + data = [json.loads(line) for line in f] + data.sort(key=lambda x: x["task_id"]) + with open(output_path, "w") as f: + for line in data: + f.write(json.dumps(line) + "\n") + + +extract_supporting_sentences = GenerateOnContext( + llm=LLM(), requirement="supporting sentences to get the final answers (split by newline)" +) +generate_on_context = GenerateOnContext(llm=LLM(), requirement="a concise answer without additional context") +format = Format(llm=LLM()) +solver = HotpotQAGraph( + name="solver", + llm=LLM(), + criteria="correctness, only concise answer, without additional context", + HOTPOTQA_PATH=HOTPOTQA_PATH, +) + +ModeType = Literal["ags", "alpha_codium", "llm"] + + +async def llm_generate(id): + dp = get_hotpotqa(HOTPOTQA_PATH)[id] + paragraphs = [item[1] for item in dp["context"] if isinstance(item[1], list)] + context_str = "\n".join(" ".join(paragraph) for paragraph in paragraphs) + + supporting_sentences = await extract_supporting_sentences(dp["question"], context_str) + supporting_sentences_str = "\n".join(supporting_sentences.get("solution")) + + answer_result = await generate_on_context(dp["question"], supporting_sentences_str) + answer_result = answer_result.get("solution") + + answer_formated = await format(dp["question"], answer_result) + sample_dict = dict( + task_id=id, + answer=answer_formated.get("solution"), + supporting_sentences=supporting_sentences.get("solution").split("\n"), + ) + return sample_dict + + +async def route_generate(mode: ModeType, id): + if mode == "ags": + sample_dict = await solver(id) + elif mode == "llm": + sample_dict = await llm_generate(id) + else: + raise ValueError(f"Invalid mode: {mode}") + + return sample_dict + + +async def sample_generate(id, result_path: str = "samples.jsonl", mode: ModeType = "llm"): + sample_dict = await route_generate(mode, id) + async with aiofiles.open(result_path, mode="a") as f: + 
await f.write(json.dumps(sample_dict) + "\n") + # sort_json_by_key(result_path, result_path) + + +async def samples_generate( + mode: ModeType, data_path: str = HOTPOTQA_PATH, result_path: str = "samples.jsonl", max_concurrency: int = 50 +): + ids = list(get_hotpotqa(HOTPOTQA_PATH).keys()) + + file_lock = asyncio.Lock() + semaphore = asyncio.Semaphore(max_concurrency) + + async def answer_and_write(mode: ModeType, id) -> Optional[str]: + async with semaphore: + try: + sample_dict = await route_generate(mode, id) + except Exception: + return id + async with file_lock: + async with aiofiles.open(result_path, mode="a") as f: + await f.write(json.dumps(sample_dict) + "\n") + return None + + tasks = [answer_and_write(mode, id) for id in ids] + results = await asyncio.gather(*tasks) + failed_ids = [id for id in results if id is not None] + + if failed_ids: + logger.info(failed_ids) + for id in failed_ids: + try: + await sample_generate(id, result_path, mode) + failed_ids.remove(id) + except Exception: + logger.error(f"Failed to generate sample for id: {id}") + + sort_json_by_key(result_path, result_path) + + if not failed_ids: + eval_path = result_path[:-6] + "_eval.json" + logger.info(eval(result_path, data_path, eval_path)) + + +def normalize_answer(s): + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def exact_match_score(prediction, ground_truth): + return normalize_answer(prediction) == normalize_answer(ground_truth) + + +def eval(prediction_file, gold_file, eval_file): + # if existing eval file + if os.path.exists(eval_file): + # read the result + with open(eval_file) as f: + eval_results = [json.loads(line) for line in f] + em = sum([result["em"] 
for result in eval_results]) + logger.info(f"EM: {em/len(eval_results)}") + return + + sort_json_by_key(prediction_file, prediction_file) + with open(prediction_file) as f: + predictions = [json.loads(line) for line in f] + + with open(gold_file) as f: + golds = [json.loads(line) for line in f] + + eval_results = [] + em = 0 + for prediction, gold in zip(predictions, golds): + if prediction["task_id"] != gold["_id"]: + raise ValueError(f"Task ID {gold['_id']} do not match") + result = exact_match_score(prediction["answer"], gold["answer"]) + em += result + eval_results.append( + {"task_id": prediction["task_id"], "solution": prediction["answer"], "answer": gold["answer"], "em": result} + ) + + with open(eval_file, "w") as f: + for line in eval_results: + f.write(json.dumps(line) + "\n") + + logger.info(f"EM: {em/len(predictions)}") diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py index ec1a3402d..40d611b7b 100644 --- a/examples/ags/benchmark/humaneval.py +++ b/examples/ags/benchmark/humaneval.py @@ -14,16 +14,15 @@ import aiofiles from evalplus.data import get_human_eval_plus from examples.ags.w_action_node.graph import HumanEvalGraph -from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock +from examples.ags.w_action_node.operator import GenerateCodeBlock from examples.ags.w_action_node.utils import sort_json_by_key from metagpt.llm import LLM from metagpt.logs import logger from metagpt.utils.common import add_jsonl_file, read_json_file from metagpt.utils.exceptions import handle_exception -generate_code = GenerateCode(llm=LLM()) generate_code_block = GenerateCodeBlock(llm=LLM()) -solver = HumanEvalGraph(name="solver", llm=LLM(), criteria="correctness, efficiency, readability", vote_count=5) +solver = HumanEvalGraph(name="solver", llm=LLM(), criteria="correctness, efficiency, readability", vote_count=1) ModeType = Literal["ags", "alpha_codium", "llm"] @@ -37,49 +36,53 @@ async def llm_generate(id): 
async def ags_generate(id, ensemble_count: int = 5): case = get_human_eval_plus()[f"{id}"] - solution_result = await solver(case["prompt"], ensemble_count=ensemble_count) + solution_result = await solver(case["prompt"], case["entry_point"], ensemble_count=ensemble_count) sample_dict = dict(task_id=case["task_id"], solution=solution_result["final_solution"]) return sample_dict -async def alpha_codium_generate(id): +async def alpha_codium_generate(id, ensemble_count: int = 1): case = get_human_eval_plus()[f"{id}"] - solution_result = await solver.alpha_codium(case["task_id"], case["prompt"], ensemble_count=5) + solution_result = await solver.alpha_codium(case["task_id"], case["prompt"], ensemble_count=ensemble_count) sample_dict = dict(task_id=case["task_id"], solution=solution_result["final_solution"]) return sample_dict async def route_generate(mode: ModeType, id: str): + token_usage = 0 + money_usage = 0 if mode == "ags": sample_dict = await ags_generate(id) elif mode == "alpha_codium": - sample_dict = await alpha_codium_generate(id) + sample_dict = await alpha_codium_generate(id, 5) elif mode == "llm": sample_dict = await llm_generate(id) else: raise ValueError(f"Invalid mode: {mode}") - return sample_dict + return sample_dict, token_usage, money_usage async def sample_generate(id, result_path: str = "samples.jsonl", mode: ModeType = "ags"): - sample_dict = await route_generate(mode, id) + sample_dict, token_usage, money_usage = await route_generate(mode, id) add_jsonl_file(result_path, [sample_dict]) sort_json_by_key(result_path, result_path) -async def samples_generate(mode: ModeType, result_path: str = "samples.jsonl"): +async def samples_generate(mode: ModeType, result_path: str = "samples.jsonl", max_concurrency: int = 50): ids = list(get_human_eval_plus().keys()) file_lock = asyncio.Lock() + semaphore = asyncio.Semaphore(max_concurrency) async def solve_and_write(id: str, mode: ModeType) -> Optional[str]: - try: - sample_dict = await route_generate(mode, 
id) - except Exception: - return id - async with file_lock: - async with aiofiles.open(result_path, mode="a") as f: - await f.write(json.dumps(sample_dict) + "\n") - return None + async with semaphore: + try: + sample_dict, token_usage, money_usage = await route_generate(mode, id) + except Exception: + return id + async with file_lock: + async with aiofiles.open(result_path, mode="a") as f: + await f.write(json.dumps(sample_dict) + "\n") + return None tasks = [solve_and_write(id, mode) for id in ids] results = await asyncio.gather(*tasks) @@ -87,12 +90,17 @@ async def samples_generate(mode: ModeType, result_path: str = "samples.jsonl"): if failed_tasks: logger.info(failed_tasks) - for task_id in failed_tasks: + + async def retry_failed_task(task_id): try: await sample_generate(task_id, result_path, mode) - failed_tasks.remove(task_id) + return None except Exception: logger.error(f"{task_id} fail") + return task_id + + retry_results = await asyncio.gather(*[retry_failed_task(task_id) for task_id in failed_tasks]) + failed_tasks = [task_id for task_id in retry_results if task_id is not None] sort_json_by_key(result_path, result_path) diff --git a/examples/ags/w_action_node/evaluator.py b/examples/ags/w_action_node/evaluator.py new file mode 100644 index 000000000..75875c5b5 --- /dev/null +++ b/examples/ags/w_action_node/evaluator.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# @Date : 8/23/2024 10:00 AM +# @Author : all +# @Desc : evaluate for different dataset +from typing import Literal + +# TODO 完成实验数据集的手动划分 + +DatasetType = Literal["humaneval", "gsm8k", "hotpotqa", "drop", "mmlu"] + + +class Evaluator: + """ + 在这里完成对不同数据集的评估 + """ + + def __init__(self, eval_path: str): + pass + + def validation_evaluate(self, dataset: DatasetType): + """ + Evaluates on validation dataset. + """ + pass + + def test_evaluate(self, dataset: DatasetType): + """ + Evaluates on test dataset. 
+ """ + pass diff --git a/examples/ags/w_action_node/graph.py b/examples/ags/w_action_node/graph.py index 314f16af2..4b6286192 100644 --- a/examples/ags/w_action_node/graph.py +++ b/examples/ags/w_action_node/graph.py @@ -1,128 +1,32 @@ # -*- coding: utf-8 -*- # @Date : 6/27/2024 22:07 PM # @Author : didi -# @Desc : graph & an instance - humanevalgraph +# @Desc : Basic Graph Class -from typing import List +from typing import Literal -from evalplus.data import get_human_eval_plus - -from examples.ags.w_action_node.operator import ( - FuEnsemble, - Generate, - GenerateCode, - GenerateCodeBlock, - MdEnsemble, - Rephrase, - Review, - Revise, - Test, -) -from examples.ags.w_action_node.utils import extract_test_cases_from_jsonl from metagpt.llm import LLM +from metagpt.utils.cost_manager import CostManager + +DatasetType = Literal["humaneval", "gsm8k", "hotpotqa", "drop", "mmlu"] + +cost_manager = CostManager() class Graph: - def __init__(self, name: str, llm: LLM) -> None: + def __init__( + self, + name: str, + llm: LLM, + dataset: DatasetType, + ) -> None: self.name = name self.model = llm + self.dataset = dataset + self.cost = cost_manager # TODO def __call__(): + """ + Implementation of the graph + """ NotImplementedError("Subclasses must implement __call__ method") - - def optimize(dataset: List): - pass - - -class HumanEvalGraph(Graph): - def __init__(self, name: str, llm: LLM, criteria: str, vote_count: int = 5) -> None: - super().__init__(name, llm) - self.criteria = criteria # TODO 自动构建图时,图的初始参数与图所使用的算子要求的外部参数相匹配 - self.generate_code = GenerateCode(llm=llm) - self.generate_code_block = GenerateCodeBlock(llm=llm) - self.review = Review(llm=llm, criteria=criteria) - self.revise = Revise(llm=llm) - self.rephrase = Rephrase(llm=llm) - self.tester = Test(llm=llm) - self.fuensemble = FuEnsemble(llm=llm) - self.mdensemble = MdEnsemble(llm=llm, vote_count=vote_count) - - async def __call__(self, problem: str, function_name: str, ensemble_count: int = 3): - 
solution_list = [] - for _ in range(ensemble_count): - solution = await self.generate_code_block(problem, function_name) - solution = solution.get("code_solution") - solution_list.append(solution) - solution = await self.mdensemble("code", solution_list, problem) - return solution - - async def alpha_codium(self, problem_id: str, problem: str, ensemble_count: int = 3): - """ - Paper: Code Generation with AlphaCodium: From Prompt Engineering to Flow Engineering - Link: https://arxiv.org/abs/2404.14963 - Flow: An incomplete version of alpha codium, implementing the basic process of rephrase -> code ensemble -> tes - """ - test_cases = extract_test_cases_from_jsonl(problem_id) - entry_point = get_human_eval_plus()[problem_id]["entry_point"] - rephrase_problem = await self.rephrase(problem) # 在rephrase 中拼接原始的问题描述 - solution_list = [] - for _ in range(ensemble_count): - solution = await self.generate_code_block.rephrase_generate( - problem, rephrase_problem, function_name=entry_point - ) - solution = solution.get("code_solution") - solution_list.append(solution) - solution = await self.mdensemble("code", solution_list, problem) - solution = await self.tester(problem_id, problem, rephrase_problem, solution, test_cases, entry_point) - return solution - - async def review_revise_ensemble(self, problem: str, ensemble_count: int = 2, revise_round: int = 3): - solution_list = [] - for _ in range(ensemble_count): - solution = await self.single_solve(problem, revise_round) - solution_list.append(solution) - solution = await self.ensemble(solution_list, problem) - return solution - - async def simple_ensemble(self, problem: str, ensemble_count: int = 3): - solution_list = [] - for _ in range(ensemble_count): - solution = await self.generate_code(problem) - # solution = await self.generate_code_block(problem) - solution = solution.get("code_solution") - solution_list.append(solution) - solution = await self.fuensemble(solution_list, problem) - return solution - - async def 
single_solve(self, problem: str, max_loop: int): - solution = await self.generate_code(problem) - solution = solution.get("code_solution") - for _ in range(max_loop): - review_feedback = await self.review(problem, solution) - if review_feedback["review_result"]: - break - solution = await self.revise(problem, solution, review_feedback["feedback"]) - solution = solution.get("revised_solution") - return solution - - -class Gsm8kGraph(Graph): - def __init__(self, name: str, llm: LLM) -> None: - super().__init__(name, llm) - self.generate = Generate(llm=llm) - self.rephrase = Rephrase(llm=llm) - - async def __call__(self, problem: str): - solution = self.generate(problem) - return solution - - -class HotpotQAGraph(Graph): - def __init__(self, name: str, llm: LLM) -> None: - super().__init__(name, llm) - self.generate = Generate(llm=llm) - self.rephrase = Rephrase(llm=llm) - - async def __call__(self, problem: str): - solution = self.generate(problem) - return solution diff --git a/examples/ags/w_action_node/graphs/gsm8k/basic/graph.py b/examples/ags/w_action_node/graphs/gsm8k/basic/graph.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/gsm8k/basic/operator.py b/examples/ags/w_action_node/graphs/gsm8k/basic/operator.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/gsm8k/basic/prompt.py b/examples/ags/w_action_node/graphs/gsm8k/basic/prompt.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/gsm8k/handcraft/graph.py b/examples/ags/w_action_node/graphs/gsm8k/handcraft/graph.py new file mode 100644 index 000000000..97beebf40 --- /dev/null +++ b/examples/ags/w_action_node/graphs/gsm8k/handcraft/graph.py @@ -0,0 +1,81 @@ +class Gsm8kGraph(Graph): + def __init__(self, name: str, llm: LLM, criteria: str, vote_count: int = 5) -> None: + super().__init__(name, llm) + self.criteria = criteria + self.generate = Generate(llm=llm) + 
self.rephrase = Rephrase(llm=llm) + self.fuensemble = FuEnsemble(llm=llm) + self.mdensemble = MdEnsemble(llm=llm, vote_count=vote_count) + self.review = Review(llm=llm, criteria=criteria) + self.revise = Revise(llm=llm) + self.format = Format(llm=llm) + + async def __call__(self, problem: str): + rephrased_problem = await self.rephrase.math_rephrase(problem) + solution = await self.generate.math_generate(rephrased_problem) + formatted_solution = await self.format.math_answer_format(solution["solution"]) + return formatted_solution + + async def baseline(self, problem: str): + solution = await self.generate(problem) + formatted_solution = await self.format.math_answer_format(solution["solution"]) + return formatted_solution + + async def simple_ensemble(self, problem: str, ensemble_count: int = 3): + rephrased_problem = await self.rephrase.math_rephrase(problem) + solution_list = [] + answer_list = [] + + for _ in range(ensemble_count): + solution = await self.generate.math_generate(rephrased_problem) + solution = solution.get("solution") + answer = await self.format.math_answer_format(solution) + solution_list.append(solution) + answer_list.append(answer) + + if len(set(answer.get("solution") for answer in answer_list)) == 1: + formatted_solution = answer_list[0] + else: + # TODO 我个人感觉针对数学这种情景,使用self consistency 的ensemble方法可能会更好 + solution = await self.mdensemble("math", solution_list, problem) + formatted_solution = await self.format.math_answer_format(solution["final_solution"]) + + return formatted_solution + + async def single_solve(self, problem: str, max_loop: int = 3): + rephrased_problem = await self.rephrase.math_rephrase(problem) + solution = await self.generate.math_generate(rephrased_problem) + for _ in range(max_loop): + review_feedback = await self.review(rephrased_problem, solution["solution"]) + if review_feedback["review_result"]: + break + solution = await self.revise(rephrased_problem, solution["solution"], review_feedback["feedback"]) + solution 
= solution.get("revised_solution") + formatted_solution = await self.format.math_answer_format(solution) + return formatted_solution + + async def cot_ensemble(self, problem: str, ensemble_count: int = 1): + solution_list = [] + for _ in range(ensemble_count): + core = await self.rephrase.math_core(problem) + extract = await self.rephrase.math_extract(problem) + formatted_problem = ( + f"### Problem\n{problem}\n### Problem-Solving Info\n{extract}\n### Core Question\n{core}\n" + ) + solution = await self.generate.math_generate(formatted_problem) # 等待 generate 方法完成 + solution0 = solution.get("solution") + solution_list.append(solution0) + solution = await self.fuensemble(solution_list, problem) + solution0 = solution["solution"] + formatted_solution = await self.format.math_answer_format(solution) + return formatted_solution + + async def cot(self, problem: str): + core = await self.rephrase.math_core(problem) + extract = await self.rephrase.math_extract(problem) + formatted_problem = f"### Problem\n{problem}\n### Problem-Solving Info\n{extract}\n### Core Question\n{core}\n" + solution = await self.generate.math_generate(formatted_problem) # 等待 generate 方法完成 + solution.get("solution") + formatted_solution = await self.format.math_answer_format(solution) + + return formatted_solution diff --git a/examples/ags/w_action_node/graphs/gsm8k/optimized/round_1/graph.py b/examples/ags/w_action_node/graphs/gsm8k/optimized/round_1/graph.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/gsm8k/optimized/round_1/operator.py b/examples/ags/w_action_node/graphs/gsm8k/optimized/round_1/operator.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/gsm8k/optimized/round_1/operator_an.py b/examples/ags/w_action_node/graphs/gsm8k/optimized/round_1/operator_an.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/gsm8k/optimized/round_1/prompt.py 
b/examples/ags/w_action_node/graphs/gsm8k/optimized/round_1/prompt.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/hotpotqa/basic/graph.py b/examples/ags/w_action_node/graphs/hotpotqa/basic/graph.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/hotpotqa/basic/operator.py b/examples/ags/w_action_node/graphs/hotpotqa/basic/operator.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/hotpotqa/basic/prompt.py b/examples/ags/w_action_node/graphs/hotpotqa/basic/prompt.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/hotpotqa/graph.py b/examples/ags/w_action_node/graphs/hotpotqa/graph.py new file mode 100644 index 000000000..111db9c3c --- /dev/null +++ b/examples/ags/w_action_node/graphs/hotpotqa/graph.py @@ -0,0 +1,28 @@ +class HotpotQAGraph(Graph): + def __init__(self, name: str, llm: LLM, criteria: str, HOTPOTQA_PATH: str) -> None: + super().__init__(name, llm) + self.generate = Generate(llm=llm) + self.format = Format(llm=llm) + self.review = Review(llm=llm, criteria=criteria) + self.revise = Revise(llm=llm) + self.hotpotqa_path = HOTPOTQA_PATH + + async def __call__(self, id: str, max_loop: int = 1): + dp = get_hotpotqa(self.hotpotqa_path)[id] + paragraphs = [item[1] for item in dp["context"] if isinstance(item[1], list)] + context_str = "\n".join(" ".join(paragraph) for paragraph in paragraphs) + + answer_result = await self.generate.context_solution_generate(dp["question"], context_str) + answer_result = answer_result.get("solution") + + for _ in range(max_loop): + review_result = await self.review(dp["question"], answer_result) + if review_result["review_result"]: + break + answer_result = await self.revise(dp["question"], answer_result, review_result["feedback"]) + answer_result = answer_result.get("revised_solution") + + answer_formated = await self.format(dp["question"], 
answer_result) + + sample_dict = dict(task_id=id, answer=answer_formated.get("solution")) + return sample_dict diff --git a/examples/ags/w_action_node/graphs/humaneval/basic/graph.py b/examples/ags/w_action_node/graphs/humaneval/basic/graph.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/humaneval/basic/operator.py b/examples/ags/w_action_node/graphs/humaneval/basic/operator.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/humaneval/basic/prompt.py b/examples/ags/w_action_node/graphs/humaneval/basic/prompt.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/ags/w_action_node/graphs/humaneval/graph.py b/examples/ags/w_action_node/graphs/humaneval/graph.py new file mode 100644 index 000000000..4c667d318 --- /dev/null +++ b/examples/ags/w_action_node/graphs/humaneval/graph.py @@ -0,0 +1,82 @@ +class HumanEvalGraph(Graph): + def __init__(self, name: str, llm: LLM, criteria: str, vote_count: int = 5) -> None: + super().__init__(name, llm) + self.generate = Generate(llm=llm) + self.criteria = criteria # TODO 自动构建图时,图的初始参数与图所使用的算子要求的外部参数相匹配 + self.generate_code_block = GenerateCodeBlock(llm=llm) + self.review = Review(llm=llm, criteria=criteria) + self.revise = Revise(llm=llm) + self.rephrase = Rephrase(llm=llm) + self.tester = Test(llm=llm) + self.fuensemble = FuEnsemble(llm=llm) + self.mdensemble = MdEnsemble(llm=llm, vote_count=vote_count) + self.codeensemble = CodeEnsmble(llm=llm, vote_count=vote_count) + + async def __call__(self, problem: str, function_name: str, ensemble_count: int = 3): + solution_list = [] + for _ in range(ensemble_count): + solution = await self.generate_code_block(problem, function_name) + solution = solution.get("code_solution") + solution_list.append(solution) + solution = await self.mdensemble("code", solution_list, problem) + return solution + + async def alpha_codium(self, problem_id: str, problem: str, ensemble_count: 
int = 3, test_loop: int = 3): + """ + Paper: Code Generation with AlphaCodium: From Prompt Engineering to Flow Engineering + Link: https://arxiv.org/abs/2401.08500 + Flow: An incomplete version of alpha codium, implementing the basic process of rephrase -> code ensemble -> test + """ + test_cases = extract_test_cases_from_jsonl(problem_id) + entry_point = get_human_eval_plus()[problem_id]["entry_point"] + rephrase_problem = await self.rephrase(problem) # 在rephrase 中拼接原始的问题描述 + code_solution_list = [] + solution_list = [] + + for _ in range(3): + """对文字版本的Solution进行ensemble""" + code_solution = await self.generate.code_solution_generate(problem, rephrase_problem) + code_solution = code_solution.get("content") + code_solution_list.append(code_solution) + final_code_solution = await self.mdensemble(code_solution_list, problem) + final_code_solution = final_code_solution.get("final_solution") + thought = f"""Reflection on the problem:\n{rephrase_problem} \n\nPossible solution:\n{final_code_solution}""" + + for _ in range(ensemble_count): + """对代码版本的Solution进行ensemble""" + solution = await self.generate_code_block.rephrase_generate(problem, thought, function_name=entry_point) + solution = solution.get("code_solution") + solution_list.append(solution) + solution = await self.codeensemble(solution_list, problem) + solution = await self.tester( + problem_id, problem, rephrase_problem, solution, test_cases, entry_point, test_loop + ) + return solution + + async def review_revise_ensemble(self, problem: str, ensemble_count: int = 2, revise_round: int = 3): + solution_list = [] + for _ in range(ensemble_count): + solution = await self.single_solve(problem, revise_round) + solution_list.append(solution) + solution = await self.ensemble(solution_list, problem) + return solution + + async def simple_ensemble(self, problem: str, ensemble_count: int = 3): + solution_list = [] + for _ in range(ensemble_count): + solution = await self.generate_code_block(problem) + solution = 
solution.get("code_solution") + solution_list.append(solution) + solution = await self.fuensemble(solution_list, problem) + return solution + + async def single_solve(self, problem: str, max_loop: int): + solution = await self.generate_code_block(problem) + solution = solution.get("code_solution") + for _ in range(max_loop): + review_feedback = await self.review(problem, solution) + if review_feedback["review_result"]: + break + solution = await self.revise(problem, solution, review_feedback["feedback"]) + solution = solution.get("revised_solution") + return solution diff --git a/examples/ags/w_action_node/operator.py b/examples/ags/w_action_node/operator.py index b7a1ad384..30b493814 100644 --- a/examples/ags/w_action_node/operator.py +++ b/examples/ags/w_action_node/operator.py @@ -12,9 +12,9 @@ from typing import Dict, List, Tuple from tenacity import retry, stop_after_attempt from examples.ags.w_action_node.operator_an import ( + FormatOp, FuEnsembleOp, GenerateCodeBlockOp, - GenerateCodeOp, GenerateOp, MdEnsembleOp, ReflectionTestOp, @@ -29,13 +29,17 @@ from examples.ags.w_action_node.prompt import ( DE_ENSEMBLE_JUDGE_FINAL_PROMPT, DE_ENSEMBLE_JUDGE_UNIVERSAL_PROMPT, DE_ENSEMBLE_TXT_FORMAT_PROMPT, + FORMAT_PROMPT, FU_ENSEMBLE_PROMPT, - GENERATE_CODE_PROMPT, GENERATE_CODEBLOCK_PROMPT, GENERATE_CODEBLOCK_REPHRASE_PROMPT, GENERATE_PROMPT, + MATH_CORE_PROMPT, + MATH_EXTRACT_PROMPT, + MATH_REPHRASE_ON_PROBLEM_PROMPT, MD_ENSEMBLE_PROMPT, REFLECTION_ON_PUBLIC_TEST_PROMPT, + REPHRASE_ON_CODE_PROMPT, REPHRASE_ON_PROBLEM_PROMPT, REVIEW_PROMPT, REVISE_PROMPT, @@ -56,6 +60,10 @@ class Operator: class Generate(Operator): + """ + 基于Action Node Fill Function的 Generate 算子 + """ + def __init__(self, name: str = "Generate", llm: LLM = LLM()): super().__init__(name, llm) @@ -66,17 +74,6 @@ class Generate(Operator): return response -class GenerateCode(Operator): - def __init__(self, name: str = "GenerateCode", llm: LLM = LLM()): - super().__init__(name, llm) - - async def 
__call__(self, problem_description): - prompt = GENERATE_CODE_PROMPT.format(problem_description=problem_description) - node = await ActionNode.from_pydantic(GenerateCodeOp).fill(context=prompt, llm=self.llm) - response = node.instruct_content.model_dump() - return response - - class GenerateCodeBlock(Operator): def __init__(self, name: str = "GenerateCodeBlock", llm: LLM = LLM()): super().__init__(name, llm) @@ -91,10 +88,8 @@ class GenerateCodeBlock(Operator): return response @retry(stop=stop_after_attempt(3)) - async def rephrase_generate(self, problem_description, rephrase_problem, function_name): - prompt = GENERATE_CODEBLOCK_REPHRASE_PROMPT.format( - problem_description=problem_description, rephrase_problem=rephrase_problem - ) + async def rephrase_generate(self, problem_description, thought, function_name): + prompt = GENERATE_CODEBLOCK_REPHRASE_PROMPT.format(problem_description=problem_description, thought=thought) node = await ActionNode.from_pydantic(GenerateCodeBlockOp).fill( context=prompt, llm=self.llm, mode="code_fill", function_name=function_name ) @@ -102,6 +97,17 @@ class GenerateCodeBlock(Operator): return response +class Format(Generate): + def __init__(self, name: str = "Format", llm: LLM = LLM()): + super().__init__(name, llm) + + async def __call__(self, problem_description, solution): + prompt = FORMAT_PROMPT.format(problem_description=problem_description, solution=solution) + node = await ActionNode.from_pydantic(FormatOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response + + class Review(Operator): def __init__(self, criteria, name: str = "Review", llm: LLM = LLM()): self.criteria = criteria @@ -162,29 +168,9 @@ class MdEnsemble(Operator): answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)} return shuffled_solutions, answer_mapping - async def __call__(self, solution_type: str, solutions: List[str], problem_description: str): + async def 
__call__(self, solutions: List[str], problem_description: str): + print(f"solution count: {len(solutions)}") all_responses = [] - # 当Ensmeble方案是Code类型时,我们使用AST进行去重 - # TODO AgentLess + 尝试权重 - if solution_type == "code": - unique_structures = {} - updated_solutions = [] - - for solution in solutions: - try: - tree = ast.parse(solution) - structure_key = ast.dump(tree, annotate_fields=False, include_attributes=False) - - if structure_key not in unique_structures: - unique_structures[structure_key] = solution - updated_solutions.append(solution) - except SyntaxError: - # If the solution has a syntax error, we'll skip it - continue - solutions = updated_solutions - updated_length = len(solutions) - if updated_length == 1: - return {"final_solution": solutions[0]} for _ in range(self.vote_count): shuffled_solutions, answer_mapping = self.shuffle_answers(solutions) @@ -202,7 +188,6 @@ class MdEnsemble(Operator): if answer in answer_mapping: original_index = answer_mapping[answer] - # print(f"original index: {original_index}") all_responses.append(original_index) most_frequent_index = Counter(all_responses).most_common(1)[0][0] @@ -210,8 +195,77 @@ class MdEnsemble(Operator): return {"final_solution": final_answer} -class Md_Ensmble: - pass +class CodeEnsmble(Operator): + def __init__(self, name: str = "CodeEnsemble", llm: LLM = LLM(), vote_count: int = 3): + super().__init__(name, llm) + self.vote_count = vote_count + + @staticmethod + def shuffle_answers(solutions: List[dict]) -> Tuple[List[str], Dict[str, str]]: + shuffled_solutions = solutions.copy() + random.shuffle(shuffled_solutions) + answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)} + return shuffled_solutions, answer_mapping + + async def __call__(self, solutions: List[str], problem_description: str): + all_responses = [] + + unique_structures = {} + unique_structures_count = {} + + valid_solutions_count = 0 # 添加计数器来跟踪有效的解决方案数量 + + for solution in 
solutions: + try: + tree = ast.parse(solution) + structure_key = ast.dump(tree, annotate_fields=False, include_attributes=False) + + if structure_key not in unique_structures: + unique_structures[structure_key] = solution + unique_structures_count[structure_key] = 1 + else: + unique_structures_count[structure_key] += 1 + + valid_solutions_count += 1 # 增加有效解决方案的计数 + except SyntaxError: + # 剔除语法错误的代码 + continue + + solutions = [ + {"code": unique_structures[structure_key], "weight": count / valid_solutions_count} # 使用有效解决方案的数量来计算权重 + for structure_key, count in unique_structures_count.items() + ] + + updated_length = len(solutions) + if updated_length == 1: + return {"final_solution": solutions[0]["code"]} + + for _ in range(self.vote_count): + shuffled_solutions, answer_mapping = self.shuffle_answers(solutions) + + solution_text = "" + for index, solution in enumerate(shuffled_solutions): + weight = str(solution["weight"]) + code = solution["code"] + solution_text += ( + f"{chr(65 + index)}: \n weight(proportion of occurrences in all solutions):{weight} \n{code}\n\n\n" + ) + + prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description) + node = await ActionNode.from_pydantic(MdEnsembleOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + + answer = response.get("solution_letter", "") + answer = answer.strip().upper() + + if answer in answer_mapping: + original_index = answer_mapping[answer] + # print(f"original index: {original_index}") + all_responses.append(original_index) + + most_frequent_index = Counter(all_responses).most_common(1)[0][0] + final_answer = solutions[most_frequent_index]["code"] + return {"final_solution": final_answer} class ScEnsemble(Operator): @@ -359,39 +413,36 @@ class Rephrase(Operator): response = node.instruct_content.model_dump() return response["rephrased_problem"] + async def code_rephrase(self, problem_description: str) -> str: + prompt = 
REPHRASE_ON_CODE_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(RephraseOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response["rephrased_problem"] + + async def math_rephrase(self, problem_description: str) -> str: + prompt = MATH_REPHRASE_ON_PROBLEM_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(RephraseOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response["rephrased_problem"] + + async def math_core(self, problem_description: str) -> str: + prompt = MATH_CORE_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(RephraseOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response["rephrased_problem"] + + async def math_extract(self, problem_description: str) -> str: + prompt = MATH_EXTRACT_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(RephraseOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response["rephrased_problem"] + class Test(Operator): def __init__(self, name: str = "Test", llm: LLM = LLM()): super().__init__(name, llm) - # def exec_code(self, solution, test_cases, problem_id): - # # TODO - # # 1. 获取更加详细的Test error信息 - # # 2. 更换Public Test数据集,当前使用的数据存在Label Leak(使用的Reflexion的数据集) -> 这个问题使用LLM抽取解决,直接生成为assert代码串 - # # 3. 
实现单独测试每一个test case -> 1 - # solution = solution["final_solution"] - # test_code = test_cases_2_test_functions(solution, test_cases) - # fail_case = [] - # try: - # exec(test_code, globals()) - # except AssertionError as e: - # exc_type, exc_value, exc_traceback = sys.exc_info() - # tb_str = traceback.format_exception(exc_type, exc_value, exc_traceback) - # with open("tester.txt", "a") as f: - # f.write("test_error" + problem_id + "\n") - # error_infomation = { - # "test_fail_case": {"error_type": "AssertionError", "error_message": str(e), "traceback": tb_str} - # } - # logger.info(f"test error: {error_infomation}") - # return error_infomation - # except Exception as e: - # with open("tester.txt", "a") as f: - # f.write(problem_id + "\n") - # return {"exec_fail_case": str(e)} - # return [] - def exec_code(self, solution, test_cases, problem_id, entry_point): - solution = solution["final_solution"] fail_cases = [] for test_case in test_cases: test_code = test_case_2_test_function(solution, test_case, entry_point) @@ -421,33 +472,36 @@ class Test(Operator): else: return "no error" - async def __call__(self, problem_id, problem, rephrase_problem, solution, test_cases, entry_point): - result = self.exec_code(solution, test_cases, problem_id, entry_point) - if result == "no error": - return solution - elif "exec_fail_case" in result: - result = result["exec_fail_case"] - prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format( - problem_description=problem, - rephrase_problem=rephrase_problem, - code_solution=solution, - exec_pass=f"executed unsuccessfully, error: \n {result}", - test_fail="executed unsucessfully", - ) - node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm) - response = node.instruct_content.model_dump() - return {"final_solution": response["refined_solution"]} - else: - prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format( - problem_description=problem, - rephrase_problem=rephrase_problem, - code_solution=solution, - 
exec_pass="executed successfully", - test_fail=result, - ) - node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm) - response = node.instruct_content.model_dump() - return {"final_solution": response["refined_solution"]} + async def __call__(self, problem_id, problem, rephrase_problem, solution, test_cases, entry_point, test_loop): + solution = solution["final_solution"] + for _ in range(test_loop): + result = self.exec_code(solution, test_cases, problem_id, entry_point) + if result == "no error": + return {"final_solution": solution} + elif "exec_fail_case" in result: + result = result["exec_fail_case"] + prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format( + problem_description=problem, + rephrase_problem=rephrase_problem, + code_solution=solution, + exec_pass=f"executed unsuccessfully, error: \n {result}", + test_fail="executed unsucessfully", + ) + node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + solution = response["refined_solution"] + else: + prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format( + problem_description=problem, + rephrase_problem=rephrase_problem, + code_solution=solution, + exec_pass="executed successfully", + test_fail=result, + ) + node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + solution = response["refined_solution"] + return {"final_solution": solution} class FindFact(Operator): diff --git a/examples/ags/w_action_node/operator_an.py b/examples/ags/w_action_node/operator_an.py index 9008742fa..30ef27986 100644 --- a/examples/ags/w_action_node/operator_an.py +++ b/examples/ags/w_action_node/operator_an.py @@ -7,17 +7,25 @@ from pydantic import BaseModel, Field class GenerateOp(BaseModel): - solution: str = Field(default="", description="Your Solution for this problem") - - -class GenerateCodeOp(BaseModel): - code_solution: str = 
Field(default="", description="Complete and correct code here.") + solution: str = Field(default="", description="Your solution for this problem") class GenerateCodeBlockOp(BaseModel): code_solution: str = Field(default="", description="Your complete code solution for this problem") +class GenerateCodeSolution(BaseModel): + content: str = Field(default="", description="A description of the solution") + thought: str = Field( + default="", + description="Shortly explain why this solution correctly solves the problem. Be specific and detailed regarding the problem rules and goals.", + ) + + +class FormatOp(BaseModel): + solution: str = Field(default="", description="Your formatted answer for this problem") + + class ReviewOp(BaseModel): review_result: bool = Field( default=False, diff --git a/examples/ags/w_action_node/operator_old.py b/examples/ags/w_action_node/operator_old.py new file mode 100644 index 000000000..c72126e97 --- /dev/null +++ b/examples/ags/w_action_node/operator_old.py @@ -0,0 +1,550 @@ +# -*- coding: utf-8 -*- +# @Date : 6/27/2024 17:36 PM +# @Author : didi +# @Desc : operator demo of ags +import ast +import random +import sys +import traceback +from collections import Counter +from typing import Dict, List, Tuple + +from tenacity import retry, stop_after_attempt + +from examples.ags.w_action_node.operator_an import ( + FormatOp, + FuEnsembleOp, + GenerateCodeBlockOp, + GenerateCodeSolution, + GenerateOp, + MdEnsembleOp, + ReflectionTestOp, + RephraseOp, + ReviewOp, + ReviseOp, +) +from examples.ags.w_action_node.prompt import ( + DE_ENSEMBLE_ANGEL_PROMPT, + DE_ENSEMBLE_CODE_FORMAT_PROMPT, + DE_ENSEMBLE_DEVIL_PROMPT, + DE_ENSEMBLE_JUDGE_FINAL_PROMPT, + DE_ENSEMBLE_JUDGE_UNIVERSAL_PROMPT, + DE_ENSEMBLE_TXT_FORMAT_PROMPT, + FORMAT_PROMPT, + FU_ENSEMBLE_PROMPT, + GENERATE_CODE_SOLUTION_PROMPT, + GENERATE_CODEBLOCK_PROMPT, + GENERATE_CODEBLOCK_REPHRASE_PROMPT, + GENERATE_ON_CONTEXT_PROMPT, + GENERATE_PROMPT, + MATH_ANSWER_FORMAT_PROMPT, + 
MATH_CORE_PROMPT, + MATH_EXTRACT_PROMPT, + MATH_GENERATE_PROMPT, + MATH_REPHRASE_ON_PROBLEM_PROMPT, + MD_ENSEMBLE_PROMPT, + REFLECTION_ON_PUBLIC_TEST_PROMPT, + REPHRASE_ON_CODE_PROMPT, + REPHRASE_ON_PROBLEM_PROMPT, + REVIEW_PROMPT, + REVISE_PROMPT, +) +from examples.ags.w_action_node.utils import test_case_2_test_function +from metagpt.actions.action_node import ActionNode +from metagpt.llm import LLM +from metagpt.logs import logger + + +class Operator: + def __init__(self, name, llm: LLM): + self.name = name + self.llm = llm + + def __call__(self, *args, **kwargs): + raise NotImplementedError + + +class Generate(Operator): + """ + 基于Action Node Fill Function的 Generate 算子 + """ + + def __init__(self, name: str = "Generate", llm: LLM = LLM()): + super().__init__(name, llm) + + async def __call__(self, problem_description): + prompt = GENERATE_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(GenerateOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response + + async def math_generate(self, problem_description): + prompt = MATH_GENERATE_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(GenerateOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response + + async def code_solution_generate(self, problem_description: str, rephrase_problem: str): + prompt = GENERATE_CODE_SOLUTION_PROMPT.format( + problem_description=problem_description, rephrase_problem=rephrase_problem + ) + node = await ActionNode.from_pydantic(GenerateCodeSolution).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response + + async def context_solution_generate(self, question, context): + prompt = GENERATE_ON_CONTEXT_PROMPT.format(problem_description=question, context=context) + node = await ActionNode.from_pydantic(GenerateOp).fill(context=prompt, llm=self.llm) + response = 
node.instruct_content.model_dump() + return response + + +class GenerateCodeBlock(Operator): + def __init__(self, name: str = "GenerateCodeBlock", llm: LLM = LLM()): + super().__init__(name, llm) + + @retry(stop=stop_after_attempt(3)) + async def __call__(self, problem_description, function_name): + prompt = GENERATE_CODEBLOCK_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(GenerateCodeBlockOp).fill( + context=prompt, llm=self.llm, mode="code_fill", function_name=function_name + ) + response = node.instruct_content.model_dump() + return response + + @retry(stop=stop_after_attempt(3)) + async def rephrase_generate(self, problem_description, thought, function_name): + prompt = GENERATE_CODEBLOCK_REPHRASE_PROMPT.format(problem_description=problem_description, thought=thought) + node = await ActionNode.from_pydantic(GenerateCodeBlockOp).fill( + context=prompt, llm=self.llm, mode="code_fill", function_name=function_name + ) + response = node.instruct_content.model_dump() + return response + + +class Format(Operator): + def __init__(self, name: str = "Format", llm: LLM = LLM()): + super().__init__(name, llm) + + async def __call__(self, problem_description, solution): + prompt = FORMAT_PROMPT.format(problem_description=problem_description, solution=solution) + node = await ActionNode.from_pydantic(FormatOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response + + async def math_answer_format(self, problem_description: str) -> dict: + prompt = MATH_ANSWER_FORMAT_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(FormatOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response + + +class Review(Operator): + def __init__(self, criteria, name: str = "Review", llm: LLM = LLM()): + self.criteria = criteria + super().__init__(name, llm) + + async def __call__(self, problem_description, solution): 
+ prompt = REVIEW_PROMPT.format( + problem_description=problem_description, solution=solution, criteria=self.criteria + ) + node = await ActionNode.from_pydantic(ReviewOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response + + +class Revise(Operator): + def __init__(self, name: str = "Revise", llm: LLM = LLM()): + super().__init__(name, llm) + + async def __call__(self, problem_description, solution, feedback): + prompt = REVISE_PROMPT.format(problem_description=problem_description, solution=solution, feedback=feedback) + node = await ActionNode.from_pydantic(ReviseOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response + + +class FuEnsemble(Operator): + """ + Function: Critically evaluating multiple solution candidates, synthesizing their strengths, and developing an enhanced, integrated solution. + """ + + def __init__(self, name: str = "FuEnsemble", llm: LLM = LLM()): + super().__init__(name, llm) + + async def __call__(self, solutions: List, problem_description): + solution_text = "" + for solution in solutions: + solution_text += str(solution) + "\n" + prompt = FU_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description) + node = await ActionNode.from_pydantic(FuEnsembleOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response + + +class MdEnsemble(Operator): + """ + Paper: Can Generalist Foundation Models Outcompete Special-Purpose Tuning? 
Case Study in Medicine + Link: https://arxiv.org/abs/2311.16452 + """ + + def __init__(self, name: str = "MdEnsemble", llm: LLM = LLM(), vote_count: int = 3): + super().__init__(name, llm) + self.vote_count = vote_count + + @staticmethod + def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, str]]: + shuffled_solutions = solutions.copy() + random.shuffle(shuffled_solutions) + answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)} + return shuffled_solutions, answer_mapping + + async def __call__(self, solutions: List[str], problem_description: str): + print(f"solution count: {len(solutions)}") + all_responses = [] + + for _ in range(self.vote_count): + shuffled_solutions, answer_mapping = self.shuffle_answers(solutions) + + solution_text = "" + for index, solution in enumerate(shuffled_solutions): + solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n" + + prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description) + node = await ActionNode.from_pydantic(MdEnsembleOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + + answer = response.get("solution_letter", "") + answer = answer.strip().upper() + + if answer in answer_mapping: + original_index = answer_mapping[answer] + all_responses.append(original_index) + + most_frequent_index = Counter(all_responses).most_common(1)[0][0] + final_answer = solutions[most_frequent_index] + return {"final_solution": final_answer} + + +class CodeEnsmble(Operator): + def __init__(self, name: str = "CodeEnsemble", llm: LLM = LLM(), vote_count: int = 3): + super().__init__(name, llm) + self.vote_count = vote_count + + @staticmethod + def shuffle_answers(solutions: List[dict]) -> Tuple[List[str], Dict[str, str]]: + shuffled_solutions = solutions.copy() + random.shuffle(shuffled_solutions) + answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in 
enumerate(shuffled_solutions)} + return shuffled_solutions, answer_mapping + + async def __call__(self, solutions: List[str], problem_description: str): + all_responses = [] + + unique_structures = {} + unique_structures_count = {} + + valid_solutions_count = 0 # 添加计数器来跟踪有效的解决方案数量 + + for solution in solutions: + try: + tree = ast.parse(solution) + structure_key = ast.dump(tree, annotate_fields=False, include_attributes=False) + + if structure_key not in unique_structures: + unique_structures[structure_key] = solution + unique_structures_count[structure_key] = 1 + else: + unique_structures_count[structure_key] += 1 + + valid_solutions_count += 1 # 增加有效解决方案的计数 + except SyntaxError: + # 剔除语法错误的代码 + continue + + solutions = [ + {"code": unique_structures[structure_key], "weight": count / valid_solutions_count} # 使用有效解决方案的数量来计算权重 + for structure_key, count in unique_structures_count.items() + ] + + updated_length = len(solutions) + if updated_length == 1: + return {"final_solution": solutions[0]["code"]} + + for _ in range(self.vote_count): + shuffled_solutions, answer_mapping = self.shuffle_answers(solutions) + + solution_text = "" + for index, solution in enumerate(shuffled_solutions): + weight = str(solution["weight"]) + code = solution["code"] + solution_text += ( + f"{chr(65 + index)}: \n weight(proportion of occurrences in all solutions):{weight} \n{code}\n\n\n" + ) + + prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description) + node = await ActionNode.from_pydantic(MdEnsembleOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + + answer = response.get("solution_letter", "") + answer = answer.strip().upper() + + if answer in answer_mapping: + original_index = answer_mapping[answer] + # print(f"original index: {original_index}") + all_responses.append(original_index) + + most_frequent_index = Counter(all_responses).most_common(1)[0][0] + final_answer = 
solutions[most_frequent_index]["code"] + return {"final_solution": final_answer} + + +class ScEnsemble(Operator): + """ + Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models + Link: https://arxiv.org/abs/2203.11171 + """ + + pass + + +class MADEnsemble(Operator): + """ + Paper: Should we be going MAD? A Look at Multi-Agent Debate Strategies for LLMs + Link: https://arxiv.org/abs/2311.17371 + """ + + def __init__(self, name: str = "DebateEnsemble", llm: LLM = LLM()): + super().__init__(name, llm) + self.agents = ["angel", "devil", "judge"] + self.format_requirements = {"txt": DE_ENSEMBLE_TXT_FORMAT_PROMPT, "code": DE_ENSEMBLE_CODE_FORMAT_PROMPT} + + def get_system_prompt(self, name: str, mode: str = "txt"): + if name == "angel": + if mode == "code": + return DE_ENSEMBLE_ANGEL_PROMPT + "\n" + DE_ENSEMBLE_CODE_FORMAT_PROMPT + return DE_ENSEMBLE_ANGEL_PROMPT + "\n" + DE_ENSEMBLE_TXT_FORMAT_PROMPT + elif name == "devil": + if mode == "code": + return DE_ENSEMBLE_DEVIL_PROMPT + "\n" + DE_ENSEMBLE_CODE_FORMAT_PROMPT + return DE_ENSEMBLE_DEVIL_PROMPT + "\n" + DE_ENSEMBLE_TXT_FORMAT_PROMPT + elif name == "judge": + if mode == "final": + return DE_ENSEMBLE_JUDGE_FINAL_PROMPT + return DE_ENSEMBLE_JUDGE_UNIVERSAL_PROMPT + + def construct_messages(self, message_history_with_name, name, mode: str = "txt", phase: str = "universal"): + """ + 基于name与mode来构建system message. 
+ 基于name来构建messages + """ + messages = [] + messages.append({"role": "system", "content": self.get_system_prompt(name, mode)}) + + if name in ["angel", "devil"]: + messages = self._construct_debate(message_history_with_name, name, messages) + elif name == "judge": + messages = self._construct_judge(message_history_with_name, mode, messages) + return messages + + def _construct_debate(self, message_history_with_name, name, messages): + user_message = "" + + for message in message_history_with_name: + if message["name"] == "Judge": + continue + elif message["name"] == name: + if user_message: + messages.append( + { + "role": "user", + "name": "user", + "content": user_message.strip("\n"), + } + ) + messages.append( + { + "role": "assistant", + "name": name, + "content": message["content"], + } + ) + user_message = "" + else: + user_message += message["content"] + + if user_message: + messages.append( + { + "role": "user", + "name": "user", + "content": user_message.strip("\n"), + } + ) + + return messages + + def _construct_judge(self, message_history_with_name, mode, messages): + pass + + async def debate_answer(self, message_history: List, role: str = "angel"): + messages = self.construct_messages(message_history, role) + response = await self.llm.acompletion_text(messages=messages) + message_history.append({"role": "user", "name": role, "content": response}) + return message_history, response + + async def judge_answer(self, message_history: List, phase: str = "universal"): + messages = self.construct_messages(message_history, "judge", phase=phase) + response = await self.llm.acompletion_text(messages=messages) + message_history.append({"role": "user", "name": "judge", "content": response}) + return message_history, response + + async def __call__(self, origin_solution: str, problem_description: str, max_round: int = 3, mode: str = "txt"): + # 思路,输入一个原始答案,构建一个agent代表这个答案进行辩论;另一个agent(devil)使用debate llm的内容进行辩论;法官在每一轮次做出决定是否终止,到了maxround还没终止就由法官进行总结。 + 
message_history_with_name = [{"role": "user", "name": "angel", "content": origin_solution}] + + for index in range(max_round): + for agent in self.agents: + if agent == "angel": + if index == 0: + pass + message_history_with_name, rsp = self.debate_answer(message_history_with_name, role="angel") + elif agent == "devil": + message_history_with_name, rsp = self.debate_answer(message_history_with_name, role="devil") + elif agent == "judge": + message_history_with_name, judge_result = self.judge_answer( + message_history_with_name, phase="universal" + ) + if not judge_result["is_debating"]: + """ + 这里需要在 self.judge_answer 中设置一个自动给出solution的地方 + """ + return {"final_solution": judge_result["final_solution"]} + + message_history_with_name.pop(-1) + message_history_with_name, judge_answer = self.judge_answer(message_history_with_name, phase="final") + + return {"final_solution": judge_answer["debate_answer"]} + + +class Rephrase(Operator): + """ + Paper: Code Generation with AlphaCodium: From Prompt Engineering to Flow Engineering + Link: https://arxiv.org/abs/2401.08500 + Paper: Achieving >97% on GSM8K: Deeply Understanding the Problems Makes LLMs Better Solvers for Math Word Problems + Link: https://arxiv.org/abs/2404.14963 + """ + + def __init__(self, name: str = "Rephrase", llm: LLM = LLM()): + super().__init__(name, llm) + + async def __call__(self, problem_description: str) -> str: + prompt = REPHRASE_ON_PROBLEM_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(RephraseOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response["rephrased_problem"] + + async def code_rephrase(self, problem_description: str) -> str: + prompt = REPHRASE_ON_CODE_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(RephraseOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response["rephrased_problem"] + + async 
def math_rephrase(self, problem_description: str) -> str: + prompt = MATH_REPHRASE_ON_PROBLEM_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(RephraseOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response["rephrased_problem"] + + async def math_core(self, problem_description: str) -> str: + prompt = MATH_CORE_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(RephraseOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response["rephrased_problem"] + + async def math_extract(self, problem_description: str) -> str: + prompt = MATH_EXTRACT_PROMPT.format(problem_description=problem_description) + node = await ActionNode.from_pydantic(RephraseOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + return response["rephrased_problem"] + + +class Test(Operator): + def __init__(self, name: str = "Test", llm: LLM = LLM()): + super().__init__(name, llm) + + def exec_code(self, solution, test_cases, problem_id, entry_point): + fail_cases = [] + for test_case in test_cases: + test_code = test_case_2_test_function(solution, test_case, entry_point) + try: + exec(test_code, globals()) + except AssertionError as e: + exc_type, exc_value, exc_traceback = sys.exc_info() + tb_str = traceback.format_exception(exc_type, exc_value, exc_traceback) + with open("tester.txt", "a") as f: + f.write("test_error" + problem_id + "\n") + error_infomation = { + "test_fail_case": { + "test_case": test_case, + "error_type": "AssertionError", + "error_message": str(e), + "traceback": tb_str, + } + } + fail_cases.append(error_infomation) + logger.info(f"test error: {error_infomation}") + except Exception as e: + with open("tester.txt", "a") as f: + f.write(problem_id + "\n") + return {"exec_fail_case": str(e)} + if fail_cases != []: + return fail_cases + else: + return "no error" + + 
async def __call__(self, problem_id, problem, rephrase_problem, solution, test_cases, entry_point, test_loop): + solution = solution["final_solution"] + for _ in range(test_loop): + result = self.exec_code(solution, test_cases, problem_id, entry_point) + if result == "no error": + return {"final_solution": solution} + elif "exec_fail_case" in result: + result = result["exec_fail_case"] + prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format( + problem_description=problem, + rephrase_problem=rephrase_problem, + code_solution=solution, + exec_pass=f"executed unsuccessfully, error: \n {result}", + test_fail="executed unsucessfully", + ) + node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + solution = response["refined_solution"] + else: + prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format( + problem_description=problem, + rephrase_problem=rephrase_problem, + code_solution=solution, + exec_pass="executed successfully", + test_fail=result, + ) + node = await ActionNode.from_pydantic(ReflectionTestOp).fill(context=prompt, llm=self.llm) + response = node.instruct_content.model_dump() + solution = response["refined_solution"] + return {"final_solution": solution} + + +class FindFact(Operator): + def __init__(self, name: str = "FindFact", llm: LLM = LLM()): + super().__init__(name, llm) + + +class SelfAsk(Operator): + def __init__(self, name: str = "SelfAsk", llm: LLM = LLM()): + super().__init__(name, llm) + + +class Verify(Operator): + def __init__(self, name: str = "Verify", llm: LLM = LLM()): + super().__init__(name, llm) diff --git a/examples/ags/w_action_node/optimizer.py b/examples/ags/w_action_node/optimizer.py new file mode 100644 index 000000000..a88f6baa3 --- /dev/null +++ b/examples/ags/w_action_node/optimizer.py @@ -0,0 +1,246 @@ +# -*- coding: utf-8 -*- +# @Date : 8/12/2024 22:00 PM +# @Author : issac +# @Desc : optimizer for graph + +import os +from typing import List, Literal + 
+import numpy as np + +from examples.ags.w_action_node.evaluator import Evaluator +from examples.ags.w_action_node.prompts.optimize_prompt import ( + INITIALIZE_OPERATOR_PROMPT, +) +from metagpt.llm import LLM +from metagpt.logs import logger + +config_iterate_path = "iterate" + +DatasetType = Literal["HumanEval", "MMBP", "Gsm8K", "MATH", "HotpotQa", "MMLU"] + +evaluator = Evaluator(eval_path="eval") + + +class Optimizer: + def __init__(self, dataset: DatasetType, llm: LLM, operators: List, optimized_path: str = None) -> None: + self.llm = llm + self.dataset = dataset + self.graph = None # 初始化为 None,稍后加载 + self.operators = operators + self._optimized_path = optimized_path + self.root_path = f"{self._optimized_path}/{self.dataset}" + self.sample = 6 # sample 含义是什么? + self.score = "None" + self.top_scores = [] + self.round = 1 # 起始轮次 + + def _initialize(self): + """ + 基于数据集、操作符初始化 operator 跟 graph + """ + basic_path = f"{config_iterate_path}/{self.dataset}/basic" + required_files = ["operator.py", "graph.py", "prompt.py"] + + def check_files_exist(basic_path, required_files): + missing_files = [] + + for file in required_files: + if not os.path.exists(os.path.join(basic_path, file)): + missing_files.append(file) + + if not missing_files: + return True, [] + else: + return False, missing_files + + if check_files_exist(basic_path, required_files): + logger.info(f"{self.dataset} has been initialized") + return True + else: + logger.info(f"{self.dataset} has not been initialized") + + # 瞎几把写的,需要改 + INITIALIZE_OPERATOR_PROMPT.format( + dataset_name=self.dataset, + dataset_description="...", + input_features="...", + output_features="...", + operator_name="...", + ) + + # 这里加一个迭代 Operator 的操作 + + # 这里生成一个初始的Graph就可以,比如一个基础的review revise 循环啥的 + + # TODO Graph __INIT__ 的时候,self.generate ... 
与 optimizer 的 operators 对应 + + # TODO 所有的生成要放到对应的dataset的文件夹下面 + + pass + + def optimize(self): + """ + Optimize the graph + """ + self._initialize() + self._optimize() + + def _load_graph(self, round_number, graphs_path): + """ + 动态加载指定轮次的 Graph 类。 + """ + graph_module_name = f"{graphs_path}.round_{round_number}.graph" + try: + graph_module = __import__(graph_module_name, fromlist=[""]) + graph_class = getattr(graph_module, f"{self.dataset}Graph") + self.graph = graph_class + except ImportError as e: + print(f"Error loading graph for round {round_number}: {e}") + raise + + def _read_files(self, round_number, graphs_path): + """ + 动态读取指定轮次的 Prompt和Graph。 + """ + # 构建 prompt.py 文件的相对路径 + prompt_file_path = os.path.join(graphs_path, "prompt.py") + graph_file_path = os.path.join(graphs_path, "graph.py") + + try: + with open(prompt_file_path, "r", encoding="utf-8") as file: + prompt_content = file.read() + with open(graph_file_path, "r", encoding="utf-8") as file: + graph_content = file.read() + except FileNotFoundError as e: + print(f"Error: File not found for round {round_number}: {e}") + raise + except Exception as e: + print(f"Error loading prompt for round {round_number}: {e}") + raise + return prompt_content, graph_content + + def _load_scores(self): + """ + 重写这个函数,写一个新的结构存储分数 + """ + # 对所有轮次的分数进行排序 + self.top_scores.sort(key=lambda x: x["score"], reverse=True) + + def _exponential_decay(self, ranks, alpha=0.3): + # 根据ranks计算权重 + weights = np.exp(-alpha * ranks) + # 归一化权重使其总和为1 + prob = weights / np.sum(weights) + return prob + + def _select_round(self, items): + # 首先根据'score'字段对items列表进行降序排序 + sorted_items = sorted(items, key=lambda x: x["score"], reverse=True) + + # 提取排序后的位次(从1开始) + ranks = np.array([i for i in range(1, len(sorted_items) + 1)]) + + # 计算概率分布 + probabilities = self._exponential_decay(ranks) + + # 选择一个索引 + selected_index = np.random.choice(len(sorted_items), p=probabilities) + + # 返回选定的条目 + return sorted_items[selected_index] + + def 
_get_top_rounds(self): + """ + 返回分数最高的 top_x 个轮次,并确保返回的轮次不重复。 + """ + self._load_scores() + # 创建一个集合来跟踪已包含的轮次 + unique_rounds = set() + unique_top_scores = [] + + # 首先,添加第一轮(轮次 1),如果它存在的话 + first_round = next((item for item in self.top_scores if item["round"] == 1), None) + if first_round: + unique_top_scores.append(first_round) + unique_rounds.add(1) + + # 遍历 top_scores 列表 + for item in self.top_scores: + if item["round"] not in unique_rounds: + unique_top_scores.append(item) + unique_rounds.add(item["round"]) + + # 如果已经收集到了足够的唯一轮次,则提前终止循环 + if len(unique_top_scores) == self.sample: + break + + return unique_top_scores + + def _load_experience(self): + root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + rounds_dir = os.path.join(root_dir, "graphs", "gsm8k") + experience_data = defaultdict(lambda: {"score": None, "success": [], "failure": []}) + + # 遍历所有轮次的文件夹 + for round_dir in os.listdir(rounds_dir): + if os.path.isdir(os.path.join(rounds_dir, round_dir)) and round_dir.startswith("round_"): + round_path = os.path.join(rounds_dir, round_dir) + try: + # 查找 experience.json 文件 + json_file_path = os.path.join(round_path, "experience.json") + if os.path.exists(json_file_path): + with open(json_file_path, "r", encoding="utf-8") as json_file: # 指定 UTF-8 编码 + data = json.load(json_file) + father_node = data["father node"] + + # 如果这是该父节点的第一条记录,设置其分数 + if experience_data[father_node]["score"] is None: + experience_data[father_node]["score"] = data["before"] + + # 创建子节点数据 + child_data = {"modification": data["modification"], "score": data["after"]} + + # 根据成功与否添加到相应列表 + if data["succeed"]: + experience_data[father_node]["success"].append(child_data) + else: + experience_data[father_node]["failure"].append(child_data) + else: + print(f"experience.json not found for round {round_dir}") + except Exception as e: + print(f"Error processing {round_dir}: {str(e)}") + + # 将defaultdict转换为普通dict + experience_data = dict(experience_data) + + # 保存为JSON文件 + 
output_path = os.path.join(root_dir, "graphs", "gsm8k", "processed_experience.json") + with open(output_path, "w", encoding="utf-8") as outfile: # 指定 UTF-8 编码 + json.dump(experience_data, outfile, indent=4, ensure_ascii=False) # ensure_ascii=False 以正确保存中文字符 + + print(f"Processed experience data saved to {output_path}") + return experience_data + + def _optimize(self): + """ + 这里替代原有的Iterate与Evaluate部分,其中Evaluate部分的具体实现 @ yzy 来完成 + """ + # TODO 读取basic模版(从对应的dataset文件夹 {dataset}/basic/operator.py, graph.py, prompt.py ),Operator几乎不用动 + # TODO 动Prompt内容;动Graph连接 + graph_path = f"{self.root_path}/graphs" + f"{graph_path}/round_{self.round + 1}" + + # TODO 填充Optimize 逻辑 + + experience = {} + + score = evaluator.validation_evaluate(self.dataset, self.graph) + experience["after"] = score + experience["succeed"] = bool(score > experience["before"]) + + def test(self, graph_path: str): + """ + 在测试集上验证最佳效果,收集Performance, Pareto Front 等指标, + """ + pass diff --git a/examples/ags/w_action_node/prompt.py b/examples/ags/w_action_node/prompt.py index f7c68e3ed..dfaeb4f52 100644 --- a/examples/ags/w_action_node/prompt.py +++ b/examples/ags/w_action_node/prompt.py @@ -7,16 +7,25 @@ GENERATE_PROMPT = """ Generate Solution for the following problem: {problem_description} """ -GENERATE_CODE_PROMPT = """ -You are an expert programmer tasked with solving a coding problem. +GENERATE_SOLUTION_PROMPT = """ +Generate a text solution for the following problemL: {problem_description} +""" + +GENERATE_CODE_SOLUTION_PROMPT = """ +You are given a code contest problem, and a self-reflection on the problem: ### Problem Description {problem_description} -### Instructions -The above is an incomplete Python code fragment. Return the complete and correct code with no additional text. -Please maintain the JSON format in your response. -### Your Response +### self reflection on the problem +{rephrase_problem} + +Your goal is to come up with a possible text solution to the code contest problem. 
+ +Guidelines: +- Make sure solution fully addresses the problem goals, constraints, examples, and notes. +- Each solution must have reasonable runtime and memory complexity - less than three seconds on a modern computer, given the problem constraints for large inputs. +- Double-check the solutions. Each possible solution must be able to generalize to additional test cases, not just the ones provided in the problem description. """ GENERATE_CODEBLOCK_REPHRASE_PROMPT = """ @@ -25,10 +34,10 @@ Please provide a self-contained Python script that solves the following problem ### Problem Description {problem_description} -### self reflection on the problem -{rephrase_problem} +### reflection and possible solution on the problem +{thought} -When creating your solution: +When writing python script: 1. Consider all edge cases and boundary conditions. 2. Avoid oversimplification - address all aspects of the problem. 3. Ensure your logic covers all stated requirements. @@ -47,9 +56,42 @@ When creating your solution: 4. Avoid adding additional test cases beyond those provided in the problem description. """ +GENERATE_ON_CONTEXT_PROMPT = """ +Please generate a solution for the following problem based on the provided context: + +### Problem Description +{problem_description} + +### Context +{context} +""" + +FORMAT_PROMPT = """ +For the question described as {problem_description}, +please extract a short and concise answer contains only one word/few words from the following solution: {solution}. +Make sure there are no additional comments or explanations in your response. +""" + REVIEW_PROMPT = """ For the question described as {problem_description}, please review the following solution: {solution}, and provide a review result in boolean format. +``` +You will be reviewing the problem-solving process of another AI assistant that has answered a mathematical question. Your task is to evaluate the solution and provide a detailed review for refinement. 
Follow these steps: + +Carefully read through the original question and entire solution, paying close attention to the relevant concepts, thinking process, calculations, and final result. Assess whether the solution is clear, logical, and well-organized. Write your initial review in tags. + + +Evaluate the reasoning and logic behind the solution. Ensure that the thinking process is clear, coherent, and mathematically sound. If you find any areas that need clarification or improvement, provide your suggestions inside tags. + + +Re-do the calculations presented in the section **carefully and step-by-step** to verify the accuracy. Break down the calculations into the simplest possible steps and check each step for errors. You must not be careless and treat every part with rigor. Don't neglect checking any calculation part of the solution process. If you find any mistakes, note them down inside tags. + + +Provide an overall assessment of the solution's thoroughness, accuracy, and clarity inside tags. Highlight the strengths and weaknesses of the solution and offer suggestions for improvement, if any. + +use XML tags to present your complete evaluation, including initial review, calculation errors, reasoning feedback, and overall assessment, in a well-organized and easy-to-follow format. +Remember to be thorough, constructive, and professional in your review. Your goal is to help improve the quality and accuracy of the mathematical problem-solving process. +``` If you believe the solution is capable of resolving the issue, return True; otherwise, return False, and include your comments """ @@ -87,7 +129,7 @@ Here is a list of possible solutions to the problem: Using the inputs above, your goal is to choose the best solution to the code contest problem. Don't just pick the most efficient solution. The main consideration is that the solution can fully solve the problem in a correct and robust manner. 
-Provide your final decision by writing the chosen solution letter (e.g., B). +Provide your final decision by writing the chosen solution letter. Please maintain the JSON format in your response. """ @@ -137,10 +179,20 @@ You are given a code contest problem: ### problem {problem_description} +### instrcutions +Given the problem, Your Goal is: +Reflect on the problem, and describe it in your own words, in bullet points. Pay attention to small details, nuances, notes and examples in the problem description. +""" + +REPHRASE_ON_CODE_PROMPT = """ +You are given a code contest problem: + +### problem +{problem_description} + ### instrcutions Given the code contest problem, Your Goal is: Reflect on the problem, and describe it in your own words, in bullet points. Pay attention to small details, nuances, notes and examples in the problem description. - """ REFLECTION_ON_PUBLIC_TEST_PROMPT = """ @@ -198,3 +250,59 @@ Please ensure that: 2. The function name in the original example (e.g., 'has_close_elements') is replaced with 'candidate'. 3. The assert statements are returned as strings in a list. """ + +MATH_GENERATE_PROMPT = """ +{problem_description} + +### Instructions +Please extract the core question, only the most comprehensive and detailed one! +extract the problem-solving information related to the core question , Only extract the most useful information, list them one by one! +Please understand the Hint and question information, then solve the question step by step and show the answer. + +Please provide the solution without using any backslash (\) characters (such as LaTeX formatting) or other special symbols that may cause formatting issues in JSON. +""" + +MATH_GENERATE_PROMPT = """ +{problem_description} +### Instructions +Please extract the core question, only the most comprehensive and detailed one! +extract the problem-solving information related to the core question , Only extract the most useful information, list them one by one! 
+Understand the Hint and question information, then solve the question step by step and show the answer. +""" + +MATH_CORE_PROMPT = """ +### Problem +{problem_description} + +Please extract the core question, only the most comprehensive and detailed one! +""" + +MATH_EXTRACT_PROMPT = """ +### Problem +{problem_description} + +Note: Please extract the problem-solving information related to the core question [Core Question info], Only extract the most useful information, list them one by one! +""" + +MATH_REPHRASE_ON_PROBLEM_PROMPT = """ +You are presented with a math contest question: + +### Problem +{problem_description} + +### Instructions +When faced with this math problem, your goal is to: +1. Read the problem carefully and understand the basic requirements and conditions. +2. Restate the problem in your own words, capturing the nuances, details, notes, and examples provided in the problem description. +3. List the key points for solving the problem, including known conditions, unknowns, and mathematical concepts or formulas that need to be applied. +4. Consider possible strategies and methods for solving the problem, thinking about how to break it down into smaller parts or steps. +5. Attempt to represent the problem with mathematical expressions or equations to prepare for solving it. +""" + +MATH_ANSWER_FORMAT_PROMPT = """ +### Answer +{problem_description} + +### Instructions +Provide the answer as a numerical value only, without units or any additional text. +""" diff --git a/examples/ags/w_action_node/prompts/optimize_prompt.py b/examples/ags/w_action_node/prompts/optimize_prompt.py new file mode 100644 index 000000000..08ce446c8 --- /dev/null +++ b/examples/ags/w_action_node/prompts/optimize_prompt.py @@ -0,0 +1,35 @@ +INITIALIZE_OPERATOR_PROMPT = """ +您正在处理一个名为{dataset_name}的数据集。该数据集{dataset_description}。 + +输入特征包括: +{input_features} + +输出特征为: +{output_features} + +请根据以上信息,优化用途为{operator_name}的prompt以便更好地处理这个数据集: + +{initial_prompt} + +您的任务是: +1. 
分析数据集的特点和结构 +2. 考虑输入和输出特征之间的关系 +3. 调整initial_prompt以更好地利用数据集信息 +4. 提供一个经过优化的prompt版本 + +请提供您优化后的prompt,并简要解释您所做的更改及其原因。 +""" + +# TODO 这里也需要自适应的完成针对不同数据集的GRAPH OPTIMIZE PROMPT + +GRAPH_OPTIMIZE_PROMPT = """You are building a Graph and corresponding Prompt to jointly solve mathematical problems. +Referring to the given combination of graph and prompt, which forms a basic example of a mathematical solution approach, please reconstruct and optimize the Prompt and Graph. You can add, modify, or delete nodes and parameters in the graph, as well as modify, delete, or add new Prompts. +Put your modification (only make one point of change, i.e., one sentence), and the modified Prompt and Graph in XML tags in your reply. They will be used as new Prompt and Graph for calculation and iteration. Please ensure they are complete and correct, otherwise it may lead to runtime failures. +Only modify the parts in Prompt and Graph within /async def __call__(self, problem: str):/, otherwise it will cause parsing failure. +Reply format (must be strictly followed) (do not include any other formats except for the given XML format): + +You should fill in the details of your modifications here, to facilitate future review. +graph +prompt + +When optimizing, you can refer to critical thinking, and can incorporate methods such as Review, Revise, Ensemble, selfAsk, etc. Don't be limited to the previous format.You can consider Python's built-in loops (like for, while, and list comprehensions) or conditional statements (such as if-elif-else and ternary operators), or even machine learning methods ranging from basic supervised learning techniques (e.g., linear regression, decision trees) to more advanced approaches like neural networks and clustering algorithms. 
However, you must ensure that each call to the Graph internally involves at most 10 interactions, i.e., the complexity of the graph does not exceed 15.""" diff --git a/examples/ags/w_action_node/utils.py b/examples/ags/w_action_node/utils.py index 13de2a27d..7d692c3c4 100644 --- a/examples/ags/w_action_node/utils.py +++ b/examples/ags/w_action_node/utils.py @@ -20,6 +20,17 @@ def extract_task_id(task_id: str) -> int: return int(match.group(1)) if match else 0 +def get_hotpotqa(path: str): + # Parses each jsonl line and yields it as a dictionary + def parse_jsonl(path): + with open(path) as f: + for line in f: + yield json.loads(line) + + datas = list(parse_jsonl(path)) + return {data["_id"]: data for data in datas} + + def sort_json_by_key(input_file: str, output_file: str, key: str = "task_id"): """ Read a JSONL file, sort the entries based on task_id, and write to a new JSONL file. diff --git a/metagpt/actions/action_node.py b/metagpt/actions/action_node.py index a09d4a1da..7c91e2558 100644 --- a/metagpt/actions/action_node.py +++ b/metagpt/actions/action_node.py @@ -39,6 +39,7 @@ class ReviseMode(Enum): TAG = "CONTENT" MODE_CODE_FILL = "code_fill" +CONTEXT_FILL = "context_fill" LANGUAGE_CONSTRAINT = "Language: Please use the same language as Human INPUT." FORMAT_CONSTRAINT = f"Format: output wrapped inside [{TAG}][/{TAG}] like format example, nothing else." 
@@ -481,6 +482,9 @@ class ActionNode: # If there are multiple fields, we might want to use self.key to find the right one return self.key + def xml_compile(self, context): + pass + async def code_fill(self, context, function_name=None, timeout=USE_CONFIG_TIMEOUT): """ fill CodeBlock Node @@ -493,12 +497,9 @@ class ActionNode: result = {field_name: extracted_code} return result - async def messages_fill( - self, - ): + async def context_fill(self, context): """ - 参考这个代码,只不过LLM调用方式改成使用; - 参考 + 这个地方的代码实现的目的是 """ pass @@ -544,6 +545,15 @@ class ActionNode: self.instruct_content = self.create_class()(**result) return self + elif mode == CONTEXT_FILL: + """ + 使用xml_compile,但是这个版本没有办法实现system message 跟 temperature + """ + context = self.xml_compile(context=self.context) + result = await self.context_fill(context, timeout) + self.instruct_content = self.create_class()(**result) + return self + if strgy == "simple": return await self.simple_fill(schema=schema, mode=mode, images=images, timeout=timeout, exclude=exclude) elif strgy == "complex": diff --git a/metagpt/llm.py b/metagpt/llm.py index 88fe8bd44..a918edd2a 100644 --- a/metagpt/llm.py +++ b/metagpt/llm.py @@ -17,6 +17,7 @@ global cost_manager if not globals().get("cost_manager"): cost_manager = CostManager() + def LLM(llm_config: Optional[LLMConfig] = None, context: Context = None) -> BaseLLM: """get the default llm provider if name is None""" ctx = context or Context()