Mirror of https://github.com/FoundationAgents/MetaGPT.git, synced 2026-05-01 20:03:28 +02:00

Commit 7c2501e08b ("Update"), parent 6a01a679ce
12 changed files with 593 additions and 59 deletions
@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
# @Date :
# @Author : issac
# @Desc : test on gsm8k
import asyncio

from deepeval.models.base_model import DeepEvalBaseLLM


# This is the base model interface mandated by DeepEval; nothing here needs
# to be changed, it only needs to be called.
class GraphModel(DeepEvalBaseLLM):
    def __init__(self, graph):
        self.solver = graph

    def load_model(self):
        pass

    async def a_generate(self, prompt: str) -> str:
        # TODO: cost tracking still needs to be integrated here
        solution_result, total_cost = await self.solver(prompt)
        return solution_result

    def generate(self, prompt: str) -> str:
        loop = asyncio.get_event_loop()
        solution_result = loop.run_until_complete(self.a_generate(prompt))  # wait for a_generate to finish
        return solution_result

    def get_model_name(self):
        return "Custom Azure OpenAI Model"
@@ -3,12 +3,16 @@
# @Author : all
# @Desc : evaluate for different dataset
import datetime
+import inspect
+import os
from typing import Literal

import pandas as pd
+from deepeval.benchmarks import GSM8K

+from examples.ags.benchmark.gsm8k import GraphModel
+from examples.ags.w_action_node.graph import SolveGraph

+# TODO: finish the manual split of the experiment dataset

DatasetType = Literal["HumanEval", "MMBP", "Gsm8K", "MATH", "HotpotQa", "MMLU"]

@@ -20,14 +24,14 @@ class Evaluator:
    """

    def __init__(self, eval_path: str):
-        pass
+        self.eval_path = eval_path

-    def validation_evaluate(self, dataset: DatasetType, result_path: str):
+    def validation_evaluate(self, dataset: DatasetType, graph, params: dict):
        """
        Evaluates on the validation dataset.
        """
        if dataset == "Gsm8K":
-            return self._gsm8k_eval(result_path)
+            return self._gsm8k_eval(graph, params)
        pass

    def test_evaluate(self, dataset: DatasetType):

@@ -36,16 +40,12 @@ class Evaluator:
        """
        pass

-    def _gsm8k_eval(self, model, result_path, samples: int = 1000):
+    def _gsm8k_eval(self, graph_class, params, samples: int = 1000):
        """
        Evaluate on the GSM8K dataset.
        """
-        if model is None:
-            raise ValueError("Model is required for evaluation.")
-
-        benchmark = GSM8K(n_problems=samples, n_shots=0, enable_cot=False)
-        goldens = benchmark.load_benchmark_dataset()[: benchmark.n_problems]

+        # TODO: split into validation and test sets
        def _evaluate_problem(model, golden, benchmark):
            prompt = golden.input

@@ -70,12 +70,6 @@ class Evaluator:

            return golden.input, str(prediction), golden.expected_output, score

-        results = [_evaluate_problem(model, golden, benchmark) for golden in goldens]
-
-        overall_correct_predictions = sum(score for _, _, _, score in results)
-        overall_total_predictions = benchmark.n_problems
-        overall_accuracy = overall_correct_predictions / overall_total_predictions

        def process_gsm8k_csv(file_path, tolerance=1e-6):
            # Read the CSV file
            df = pd.read_csv(file_path, dtype=str)  # default comma separator; read every column as a string
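The rest of process_gsm8k_csv is elided in this diff; given the `tolerance` parameter and the string-typed columns, the per-row scoring presumably parses both answers as floats and compares them within that tolerance, along these lines (a sketch; the helper name is hypothetical):

    def numeric_match(prediction: str, expected: str, tolerance: float = 1e-6) -> int:
        # Hypothetical helper: score 1 when both answers parse as floats that
        # differ by at most `tolerance`, otherwise 0.
        try:
            return int(abs(float(prediction) - float(expected)) <= tolerance)
        except (TypeError, ValueError):
            return 0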
@@ -129,6 +123,24 @@ class Evaluator:

            return average_score

+        dataset = params["dataset"]
+        llm_config = params["llm_config"]
+
+        # TODO: the Graph we get here has been loaded from file; how should it be instantiated?
+        graph = SolveGraph(name="Gsm8K", llm_config=llm_config, dataset=dataset)
+        model = GraphModel(graph)
+        benchmark = GSM8K(n_problems=samples, n_shots=0, enable_cot=False)
+
+        graph_module = inspect.getmodule(graph_class)
+        os.path.dirname(graph_module.__file__)
+        goldens = benchmark.load_benchmark_dataset()[: benchmark.n_problems]
+
+        results = [_evaluate_problem(model, golden, benchmark) for golden in goldens]
+
+        overall_correct_predictions = sum(score for _, _, _, score in results)
+        overall_total_predictions = benchmark.n_problems
+        overall_accuracy = overall_correct_predictions / overall_total_predictions

        predictions_row = [
            (input, prediction, expected_output, score) for input, prediction, expected_output, score in results
        ]

@@ -137,11 +149,11 @@ class Evaluator:
        )
        benchmark.overall_score = overall_accuracy
        now = datetime.datetime.now()
-        now.strftime("%Y-%m-%d_%H-%M-%S").replace(":", "_")
+        now_time = now.strftime("%Y-%m-%d_%H-%M-%S").replace(":", "_")

        # file_path = f'gsm8k_{overall_accuracy}_{now_time}.csv'
+        file_path = f"{self.eval_path}/gsm8k_{overall_accuracy}_{now_time}.csv"

-        benchmark.predictions.to_csv(result_path, index=False)
+        benchmark.predictions.to_csv(file_path, index=False)

-        score = process_gsm8k_csv(file_path=result_path)
+        score = process_gsm8k_csv(file_path=file_path)
        return {"score": score}
@@ -12,27 +12,30 @@ from typing import Dict, List, Tuple
from tenacity import retry, stop_after_attempt

from examples.ags.w_action_node.operator_an import (
+    CodeGenerateOp,
    FormatOp,
    FuEnsembleOp,
-    GenerateCodeBlockOp,
    GenerateOp,
    MdEnsembleOp,
    ReflectionTestOp,
    RephraseOp,
    ReviewOp,
    ReviseOp,
+    ScEnsembleOp,
)
from examples.ags.w_action_node.prompt import (
+    CODE_CONTEXTUAL_GENERATE_PROMPT,
+    CONTEXTUAL_GENERATE_PROMPT,
    FORMAT_PROMPT,
    FU_ENSEMBLE_PROMPT,
    GENERATE_CODEBLOCK_PROMPT,
-    GENERATE_CODEBLOCK_REPHRASE_PROMPT,
    GENERATE_PROMPT,
    MD_ENSEMBLE_PROMPT,
    REFLECTION_ON_PUBLIC_TEST_PROMPT,
    REPHRASE_ON_PROBLEM_PROMPT,
    REVIEW_PROMPT,
    REVISE_PROMPT,
+    SC_ENSEMBLE_PROMPT,
)
from examples.ags.w_action_node.utils import test_case_2_test_function
from metagpt.actions.action_node import ActionNode

@@ -54,7 +57,7 @@ class Generate(Operator):
    Generate operator based on the ActionNode fill function
    """

-    def __init__(self, name: str = "Generate", llm: LLM = LLM()):
+    def __init__(self, llm: LLM, name: str = "Generate"):
        super().__init__(name, llm)

    async def __call__(self, problem_description):

@@ -64,23 +67,42 @@ class Generate(Operator):
        return response


-class GenerateCodeBlock(Operator):
-    def __init__(self, name: str = "GenerateCodeBlock", llm: LLM = LLM()):
+class ContextualGenerate(Operator):
+    def __init__(self, llm: LLM, name: str = "ContextualGenerate"):
        super().__init__(name, llm)

    @retry(stop=stop_after_attempt(3))
+    async def __call__(self, problem_description, thought, function_name):
+        prompt = CONTEXTUAL_GENERATE_PROMPT.format(problem_description=problem_description, thought=thought)
+        node = await ActionNode.from_pydantic(GenerateOp).fill(
+            context=prompt, llm=self.llm, function_name=function_name
+        )
+        response = node.instruct_content.model_dump()
+        return response
+
+
+class CodeGenerate(Operator):
+    def __init__(self, name: str = "CodeGenerate", llm: LLM = LLM()):
+        super().__init__(name, llm)
+
+    @retry(stop=stop_after_attempt(3))
    async def __call__(self, problem_description, function_name):
        prompt = GENERATE_CODEBLOCK_PROMPT.format(problem_description=problem_description)
-        node = await ActionNode.from_pydantic(GenerateCodeBlockOp).fill(
+        node = await ActionNode.from_pydantic(CodeGenerateOp).fill(
            context=prompt, llm=self.llm, mode="code_fill", function_name=function_name
        )
        response = node.instruct_content.model_dump()
        return response


+class CodeContextualGenerate(Operator):
+    def __init__(self, llm: LLM, name: str = "CodeContextualGenerate"):
+        super().__init__(name, llm)
+
    @retry(stop=stop_after_attempt(3))
-    async def rephrase_generate(self, problem_description, thought, function_name):
-        prompt = GENERATE_CODEBLOCK_REPHRASE_PROMPT.format(problem_description=problem_description, thought=thought)
-        node = await ActionNode.from_pydantic(GenerateCodeBlockOp).fill(
+    async def __call__(self, problem_description, thought, function_name):
+        prompt = CODE_CONTEXTUAL_GENERATE_PROMPT.format(problem_description=problem_description, thought=thought)
+        node = await ActionNode.from_pydantic(CodeGenerateOp).fill(
            context=prompt, llm=self.llm, mode="code_fill", function_name=function_name
        )
        response = node.instruct_content.model_dump()
@@ -262,9 +284,28 @@ class ScEnsemble(Operator):
    """
    Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
    Link: https://arxiv.org/abs/2203.11171
+    Paper: Universal Self-Consistency for Large Language Model Generation
+    Link: https://arxiv.org/abs/2311.17311
    """

-    pass
+    def __init__(self, name: str = "ScEnsemble", llm: LLM = LLM()):
+        super().__init__(name, llm)
+
+    async def __call__(self, solutions: List[str], problem_description: str):
+        answer_mapping = {}
+        solution_text = ""
+        for index, solution in enumerate(solutions):
+            answer_mapping[chr(65 + index)] = index
+            solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"
+
+        prompt = SC_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
+        node = await ActionNode.from_pydantic(ScEnsembleOp).fill(context=prompt, llm=self.llm)
+        response = node.instruct_content.model_dump()
+
+        answer = response.get("solution_letter", "")
+        answer = answer.strip().upper()
+
+        return {"final_solution": solutions[answer_mapping[answer]]}


class Rephrase(Operator):
@@ -351,18 +392,3 @@ class Test(Operator):
        response = node.instruct_content.model_dump()
        solution = response["refined_solution"]
        return {"final_solution": solution}
-
-
-class FindFact(Operator):
-    def __init__(self, name: str = "FindFact", llm: LLM = LLM()):
-        super().__init__(name, llm)
-
-
-class SelfAsk(Operator):
-    def __init__(self, name: str = "SelfAsk", llm: LLM = LLM()):
-        super().__init__(name, llm)
-
-
-class Verify(Operator):
-    def __init__(self, name: str = "Verify", llm: LLM = LLM()):
-        super().__init__(name, llm)
@@ -10,7 +10,7 @@ class GenerateOp(BaseModel):
    solution: str = Field(default="", description="Your solution for this problem")


-class GenerateCodeBlockOp(BaseModel):
+class CodeGenerateOp(BaseModel):
    code_solution: str = Field(default="", description="Your complete code solution for this problem")


@@ -88,5 +88,5 @@ class ReflectionTestOp(BaseModel):
    )


-class Optimize(BaseModel):
-    graph: str = Field(default="", description="graph")
+class ScEnsembleOp(BaseModel):
+    solution_letter: str = Field(default="", description="The letter of the most consistent solution.")
@@ -0,0 +1,211 @@
# -*- coding: utf-8 -*-
# @Date : 6/27/2024 17:36 PM
# @Author : didi
# @Desc : operator demo of ags
import random
from collections import Counter
from typing import Dict, List, Tuple

from tenacity import retry, stop_after_attempt

from examples.ags.w_action_node.optimized.Gsm8K.graphs.round_1.operator_an import (
    FormatOp,
    FuEnsembleOp,
    GenerateOp,
    MdEnsembleOp,
    RephraseOp,
    ReviewOp,
    ReviseOp,
    ScEnsembleOp,
)
from examples.ags.w_action_node.prompt import (
    CONTEXTUAL_GENERATE_PROMPT,
    FORMAT_PROMPT,
    FU_ENSEMBLE_PROMPT,
    GENERATE_PROMPT,
    MD_ENSEMBLE_PROMPT,
    REPHRASE_ON_PROBLEM_PROMPT,
    REVIEW_PROMPT,
    REVISE_PROMPT,
    SC_ENSEMBLE_PROMPT,
)
from metagpt.actions.action_node import ActionNode
from metagpt.llm import LLM


class Operator:
    def __init__(self, name, llm: LLM):
        self.name = name
        self.llm = llm

    def __call__(self, *args, **kwargs):
        raise NotImplementedError


class Generate(Operator):
    def __init__(self, llm: LLM, name: str = "Generate"):
        super().__init__(name, llm)

    async def __call__(self, problem_description):
        prompt = GENERATE_PROMPT.format(problem_description=problem_description)
        node = await ActionNode.from_pydantic(GenerateOp).fill(context=prompt, llm=self.llm)
        response = node.instruct_content.model_dump()
        return response


class ContextualGenerate(Operator):
    def __init__(self, llm: LLM, name: str = "ContextualGenerate"):
        super().__init__(name, llm)

    @retry(stop=stop_after_attempt(3))
    async def __call__(self, problem_description, thought, function_name):
        prompt = CONTEXTUAL_GENERATE_PROMPT.format(problem_description=problem_description, thought=thought)
        node = await ActionNode.from_pydantic(GenerateOp).fill(
            context=prompt, llm=self.llm, function_name=function_name
        )
        response = node.instruct_content.model_dump()
        return response


class Format(Generate):
    def __init__(self, name: str = "Format", llm: LLM = LLM()):
        super().__init__(name, llm)

    async def __call__(self, problem_description, solution):
        prompt = FORMAT_PROMPT.format(problem_description=problem_description, solution=solution)
        node = await ActionNode.from_pydantic(FormatOp).fill(context=prompt, llm=self.llm)
        response = node.instruct_content.model_dump()
        return response


class Review(Operator):
    def __init__(self, criteria, name: str = "Review", llm: LLM = LLM()):
        self.criteria = criteria
        super().__init__(name, llm)

    async def __call__(self, problem_description, solution):
        prompt = REVIEW_PROMPT.format(
            problem_description=problem_description, solution=solution, criteria=self.criteria
        )
        node = await ActionNode.from_pydantic(ReviewOp).fill(context=prompt, llm=self.llm)
        response = node.instruct_content.model_dump()
        return response


class Revise(Operator):
    def __init__(self, name: str = "Revise", llm: LLM = LLM()):
        super().__init__(name, llm)

    async def __call__(self, problem_description, solution, feedback):
        prompt = REVISE_PROMPT.format(problem_description=problem_description, solution=solution, feedback=feedback)
        node = await ActionNode.from_pydantic(ReviseOp).fill(context=prompt, llm=self.llm)
        response = node.instruct_content.model_dump()
        return response


class FuEnsemble(Operator):
    """
    Function: Critically evaluating multiple solution candidates, synthesizing their strengths, and developing an enhanced, integrated solution.
    """

    def __init__(self, name: str = "FuEnsemble", llm: LLM = LLM()):
        super().__init__(name, llm)

    async def __call__(self, solutions: List, problem_description):
        solution_text = ""
        for solution in solutions:
            solution_text += str(solution) + "\n"
        prompt = FU_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
        node = await ActionNode.from_pydantic(FuEnsembleOp).fill(context=prompt, llm=self.llm)
        response = node.instruct_content.model_dump()
        return response


class MdEnsemble(Operator):
    """
    Paper: Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine
    Link: https://arxiv.org/abs/2311.16452
    """

    def __init__(self, name: str = "MdEnsemble", llm: LLM = LLM(), vote_count: int = 3):
        super().__init__(name, llm)
        self.vote_count = vote_count

    @staticmethod
    def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, str]]:
        shuffled_solutions = solutions.copy()
        random.shuffle(shuffled_solutions)
        answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)}
        return shuffled_solutions, answer_mapping

    async def __call__(self, solutions: List[str], problem_description: str):
        print(f"solution count: {len(solutions)}")
        all_responses = []

        for _ in range(self.vote_count):
            shuffled_solutions, answer_mapping = self.shuffle_answers(solutions)

            solution_text = ""
            for index, solution in enumerate(shuffled_solutions):
                solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"

            prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
            node = await ActionNode.from_pydantic(MdEnsembleOp).fill(context=prompt, llm=self.llm)
            response = node.instruct_content.model_dump()

            answer = response.get("solution_letter", "")
            answer = answer.strip().upper()

            if answer in answer_mapping:
                original_index = answer_mapping[answer]
                all_responses.append(original_index)

        most_frequent_index = Counter(all_responses).most_common(1)[0][0]
        final_answer = solutions[most_frequent_index]
        return {"final_solution": final_answer}


class ScEnsemble(Operator):
    """
    Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
    Link: https://arxiv.org/abs/2203.11171
    Paper: Universal Self-Consistency for Large Language Model Generation
    Link: https://arxiv.org/abs/2311.17311
    """

    def __init__(self, name: str = "ScEnsemble", llm: LLM = LLM()):
        super().__init__(name, llm)

    async def __call__(self, solutions: List[str], problem_description: str):
        answer_mapping = {}
        solution_text = ""
        for index, solution in enumerate(solutions):
            answer_mapping[chr(65 + index)] = index
            solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"

        prompt = SC_ENSEMBLE_PROMPT.format(solutions=solution_text, problem_description=problem_description)
        node = await ActionNode.from_pydantic(ScEnsembleOp).fill(context=prompt, llm=self.llm)
        response = node.instruct_content.model_dump()

        answer = response.get("solution_letter", "")
        answer = answer.strip().upper()

        return {"final_solution": solutions[answer_mapping[answer]]}


class Rephrase(Operator):
    """
    Paper: Code Generation with AlphaCodium: From Prompt Engineering to Flow Engineering
    Link: https://arxiv.org/abs/2401.08500
    Paper: Achieving >97% on GSM8K: Deeply Understanding the Problems Makes LLMs Better Solvers for Math Word Problems
    Link: https://arxiv.org/abs/2404.14963
    """

    def __init__(self, name: str = "Rephrase", llm: LLM = LLM()):
        super().__init__(name, llm)

    async def __call__(self, problem_description: str) -> str:
        prompt = REPHRASE_ON_PROBLEM_PROMPT.format(problem_description=problem_description)
        node = await ActionNode.from_pydantic(RephraseOp).fill(context=prompt, llm=self.llm)
        response = node.instruct_content.model_dump()
        return response["rephrased_problem"]
@@ -0,0 +1,55 @@
from pydantic import BaseModel, Field


class GenerateOp(BaseModel):
    solution: str = Field(default="", description="Your solution for this problem")


class FormatOp(BaseModel):
    solution: str = Field(default="", description="Your formatted answer for this problem")


class ReviewOp(BaseModel):
    review_result: bool = Field(
        default=False,
        description="The review result (bool). If you think this solution looks good, return 'true'; if not, return 'false'.",
    )
    feedback: str = Field(
        default="",
        description="Your feedback on this problem based on the criteria. If the review result is true, you can put 'nothing here'.",
    )


class ReviseOp(BaseModel):
    revised_solution: str = Field(default="", description="Based on the feedback, the revised solution for this problem")


class FuEnsembleOp(BaseModel):
    thought: str = Field(
        default="",
        description="Analyze the solutions and think about how to combine the advantages of the various solutions to form the best possible solution.",
    )
    final_solution: str = Field(default="", description="Output the final solution after analysis and integration")


class MdEnsembleOp(BaseModel):
    thought: str = Field(
        default="""Example thought process:
1. Examined the 'compare_one' function.
2. The function correctly handles both numeric and string inputs by converting strings to floats.
3. It properly compares two values and returns the larger one.
4. The function returns None if the values are equal, which might be useful in some contexts but could be improved by returning either value.
5. The use of 'isinstance' for type checking is a good practice.
6. The function handles decimal separators well by replacing ',' with '.'.
Overall, this solution effectively solves the problem of comparing two values, with good error handling and flexibility. It could be improved by specifying behavior for equal values, but it's a strong solution as is.""",
        description="Step-by-step analysis of the solutions to determine the best one.",
    )
    solution_letter: str = Field(default="", description="The letter of the chosen best solution (only one letter).")


class RephraseOp(BaseModel):
    rephrased_problem: str = Field(default="", description="Rephrased problem description for this problem")


class ScEnsembleOp(BaseModel):
    solution_letter: str = Field(default="", description="The letter of the most consistent solution.")
@@ -3,6 +3,7 @@
# @Author : issac
# @Desc : optimizer for graph

+import asyncio
import json
import os
import re

@@ -18,6 +19,9 @@ from examples.ags.w_action_node.prompts.optimize_prompt import (
    GRAPH_INPUT,
    GRAPH_OPTIMIZE_PROMPT,
    GRAPH_TEMPLATE,
+    OPERATOR_INPUT,
+    OPERATOR_OPTIMIZE_PROMPT,
+    OPERATOR_TEMPLATE,
)
from metagpt.actions.action_node import ActionNode
from metagpt.llm import LLM

@@ -26,6 +30,7 @@ from metagpt.logs import logger
config_iterate_path = "iterate"

DatasetType = Literal["HumanEval", "MMBP", "Gsm8K", "MATH", "HotpotQa", "MMLU"]
+OptimizerType = Literal["Complete", "Graph", "Operator"]

evaluator = Evaluator(eval_path="eval")


@@ -50,12 +55,14 @@ class Optimizer:
        optimized_path: str = None,
        sample: int = 6,
        q_type: str = "math",  # math, code, quiz
+        op: str = "Generator",  # the Operator to be optimized
    ) -> None:
        self.optimize_llm = opt_llm
        self.execute_llm = exec_llm
        self.dataset = dataset
+        self.graph = None  # initialized as None, loaded later
        self.operators = operators
+        self.op = op
        self.optimize_prompt = ""
        self._optimized_path = optimized_path
        self.root_path = f"{self._optimized_path}/{self.dataset}"

@@ -104,12 +111,27 @@ class Optimizer:

    # Initialize the Graph by taking it directly out of the template by hand (CoT)

-    def optimize(self):
+    def optimize(self, mode: OptimizerType = "Complete", max_rounds: int = 100):
        """
        Optimize the graph and operator for the dataset.
        """
-        self._initialize()  # Operator's Optimization
-        self._optimize()  # Graph's Optimization
+        if mode == "Complete":
+            self._initialize()  # Operator's Optimization
+
+        for opt_round in range(max_rounds):
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+            try:
+                score = loop.run_until_complete(self._optimize_graph())
+            finally:
+                loop.close()
+
+            time.sleep(5)
+
+            self.round += 1
+
+            print(f"Score for round {self.round}: {score}")
    def _load_graph(self, round_number, graphs_path):
        """
@@ -281,7 +303,7 @@ class Optimizer:
        print(f"Processed experience data saved to {output_path}")
        return experience_data

-    async def _optimize(self):
+    async def _optimize_graph(self):
        """
        Optimize Graph's Structure and Prompt
        """

@@ -329,10 +351,11 @@ class Optimizer:
        )
        graph_system = GRAPH_OPTIMIZE_PROMPT.format(type=self.type)

-        node_prompt = graph_system + graph_input  # TODO: check which of the two should come first
+        graph_optimize_prompt = graph_system + graph_input  # TODO: check which of the two should come first

-        node = await ActionNode.from_pydantic(GraphOptimize).fill(
-            context=node_prompt, mode="context_fill", llm=self.llm
+        # TODO: from here on, GraphOptimize could be moved into operator.py as an Operator
+        graph_optimize_node = await ActionNode.from_pydantic(GraphOptimize).fill(
+            context=graph_optimize_prompt, mode="context_fill", llm=self.llm
        )

        max_retries = 5

@@ -341,7 +364,7 @@ class Optimizer:
        while retries < max_retries:
            try:
                # TODO: this needs to be decoupled from the evaluation model (pass a model in, or use another mechanism); being able to adjust the temperature would be even better
-                response = node.instruct_content.model_dump()
+                response = graph_optimize_node.instruct_content.model_dump()
                break

            except Exception as e:

@@ -353,7 +376,6 @@ class Optimizer:
                break
            time.sleep(5)

        # TODO: this step can actually be skipped
        graph_match = response["graph"]
        prompt_match = response["prompt"]
        modification_match = response["modification"]

@@ -388,6 +410,118 @@ class Optimizer:
        score = evaluator.validation_evaluate(self.dataset, self.graph)
        experience["after"] = score
        experience["succeed"] = bool(score > experience["before"])
        return score

+    async def _optimize_operator(self):
+        """
+        Optimize the Operator's structure and prompt
+        """
+        # Get the project root directory
+        graph_path = f"{self.root_path}/operators"
+
+        # Create the directory if it does not exist
+        directory = os.path.join(graph_path, f"round_{self.round + 1}")
+        os.makedirs(directory, exist_ok=True)
+
+        top_rounds = self._get_top_rounds()
+
+        sample = self._select_round(top_rounds)
+
+        print(top_rounds)
+
+        prompt, graph_load, operator_load = self._read_files(sample["round"])
+        score = sample["score"]
+
+        # Regexes matching the target Operator class and the SolveGraph class onward
+        # (the parentheses must be escaped, or "(Operator)" is parsed as a regex group)
+        operator_pattern = rf"class {self.op}\(Operator\):.+"
+
+        graph_pattern = r"class SolveGraph:.+"
+
+        # Use re.findall to collect all matches
+        operator = re.findall(operator_pattern, operator_load, re.DOTALL)
+        graph = re.findall(graph_pattern, graph_load, re.DOTALL)
+
+        # Load the processed experience data
+        processed_experience = self._load_experience()
+
+        # Get the experience data for the current round
+        current_round = int(sample["round"])  # make sure this is an int
+        experience_data = processed_experience.get(current_round)
+
+        if experience_data:
+            # Build the experience string
+            experience = f"Original Score: {experience_data['score']}\n"
+            experience += "Failed modifications:\n"
+            for mod in experience_data["failure"]:
+                experience += f"- {mod['modification']} (Score: {mod['score']})\n"
+            experience += "\n\nNote: Reference failed experiences, avoid trying failed approaches again, attempt to change your thinking, not limited to using more advanced Python syntax like for, if, else, etc., or modifying the Prompt part"
+        else:
+            experience = f"No experience data found for round {current_round}."
+
+        operator_input = OPERATOR_INPUT.format(
+            experience=experience, score=score, operator=operator[0], prompt=prompt, type=self.type, graph=graph[0]
+        )
+        operator_system = OPERATOR_OPTIMIZE_PROMPT.format(type=self.type)
+
+        node_prompt = operator_system + operator_input  # TODO: check which of the two should come first
+
+        node = await ActionNode.from_pydantic(GraphOptimize).fill(
+            context=node_prompt, mode="context_fill", llm=self.llm
+        )
+
+        max_retries = 5
+        retries = 0
+
+        while retries < max_retries:
+            try:
+                # TODO: this needs to be decoupled from the evaluation model (pass a model in, or use another mechanism); being able to adjust the temperature would be even better
+                response = node.instruct_content.model_dump()
+                break
+
+            except Exception as e:
+                retries += 1
+                print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
+
+                if retries == max_retries:
+                    print("Maximum retries reached. Skipping this sample.")
+                    break
+                time.sleep(5)
+
+        # TODO: this step can actually be skipped
+        # (these are plain strings from the response, so no regex .group() call is needed)
+        operator_match = response["operator"]
+        prompt_match = response["prompt"]
+        modification_match = response["modification"]
+
+        modification = modification_match
+        prompt = prompt_match
+        operator = OPERATOR_TEMPLATE.format(operator=operator_match, round=self.round + 1)
+
+        # Write operator.py into the directory
+        with open(os.path.join(directory, "operator.py"), "w", encoding="utf-8") as file:
+            file.write(operator)
+
+        # Write prompt.py into the directory
+        with open(os.path.join(directory, "prompt.py"), "w", encoding="utf-8") as file:
+            file.write(prompt)
+
+        # Write an empty __init__.py into the directory
+        with open(os.path.join(directory, "__init__.py"), "w", encoding="utf-8") as file:
+            file.write("")
+
+        experience = {
+            "father node": sample["round"],
+            "modification": modification,
+            "before": sample["score"],
+            "after": None,
+            "succeed": None,
+        }
+
+        with open(os.path.join(directory, "experience.json"), "w", encoding="utf-8") as file:
+            json.dump(experience, file, ensure_ascii=False, indent=4)
+
+        score = evaluator.validation_evaluate(self.dataset, self.graph)
+        experience["after"] = score
+        experience["succeed"] = bool(score > experience["before"])

    def test(self, graph_path: str):
        """
@@ -7,8 +7,14 @@ GENERATE_PROMPT = """
Generate Solution for the following problem: {problem_description}
"""

-GENERATE_SOLUTION_PROMPT = """
-Generate a text solution for the following problem: {problem_description}
+CONTEXTUAL_GENERATE_PROMPT = """
+Generate Solution for the following problem:
+
+## Problem Description
+{problem_description}
+
+## Thought
+{thought}
"""

GENERATE_CODE_SOLUTION_PROMPT = """

@@ -28,7 +34,7 @@ Guidelines:
- Double-check the solutions. Each possible solution must be able to generalize to additional test cases, not just the ones provided in the problem description.
"""

-GENERATE_CODEBLOCK_REPHRASE_PROMPT = """
+CODE_CONTEXTUAL_GENERATE_PROMPT = """
Please provide a self-contained Python script that solves the following problem in a markdown code block:

### Problem Description

@@ -134,6 +140,16 @@ Provide your final decision by writing the chosen solution letter.
Please maintain the JSON format in your response.
"""

+SC_ENSEMBLE_PROMPT = """
+I have generated the following solutions to the question: {problem_description}
+
+{solutions}
+
+Evaluate these solutions.
+Select the most consistent solution based on majority consensus.
+Give your answer as the single letter id of the chosen solution, with nothing else.
+"""

DE_ENSEMBLE_TXT_FORMAT_PROMPT = """
Now please output your answer in json format, with the format as follows:
{\"Reason\": \"\", \"debate_answer\": \"the capital letter corresponding to the answer\"}.
@@ -69,3 +69,54 @@ GRAPH_TEMPLATE = """import os

{graph}
"""

+OPERATOR_OPTIMIZE_PROMPT = """You are building an Operator and a corresponding Prompt to jointly solve {type} problems.
+Referring to the given combination of Operator and Prompt, which forms a basic example of a {type} solution approach, please reconstruct and optimize the Prompt and Operator. You can add, modify, or delete nodes and parameters in the Operator, as well as modify, delete, or add new Prompts.
+Put your modification (make only one point of change, i.e., one sentence), and the modified Prompt and Operator in XML tags in your reply. They will be used as the new Prompt and Operator for calculation and iteration. Please ensure they are complete and correct, otherwise it may lead to runtime failures.
+Only modify the parts in the Prompt and Operator.
+
+Don't be limited to the previous format. You can consider Python's built-in loops (like for, while, and list comprehensions) or conditional statements (such as if-elif-else and ternary operators), or even machine learning methods ranging from basic supervised learning techniques (e.g., linear regression, decision trees) to more advanced approaches like neural networks and clustering algorithms. However, you must ensure that each call to the Operator internally involves at most 10 interactions, i.e., the complexity of the Operator does not exceed 15."""
+
+
+OPERATOR_INPUT = """
+Here is an Operator and its corresponding Prompt that performed excellently in a previous iteration (the maximum score is 1); the Graph calls the Operator:\n
+<sample>
+<experience>{experience}</experience>
+<modification>None</modification>
+<score>{score}</score>
+<operator>{operator}</operator>
+<prompt>{prompt}</prompt>
+<graph>{graph}</graph>
+</sample>
+First provide optimization ideas. Note that ANSWER_FORMAT_PROMPT must exist and cannot be modified. Only add/modify/delete one detail point; extensive modifications are prohibited.\n\n
+"""
+
+
+OPERATOR_TEMPLATE = """
+import ast
+import random
+import sys
+import traceback
+from collections import Counter
+from typing import Dict, List, Tuple
+
+from tenacity import retry, stop_after_attempt
+from examples.ags.w_action_node.optimized.gsm8k.operators.round_{round}.prompt import *
+from examples.ags.w_action_node.operator_an import (
+    GenerateOp,
+)
+from metagpt.actions.action_node import ActionNode
+from metagpt.llm import LLM
+from metagpt.logs import logger
+
+
+class Operator:
+    def __init__(self, name, llm: LLM):
+        self.name = name
+        self.llm = llm
+
+    def __call__(self, *args, **kwargs):
+        raise NotImplementedError
+
+{operator}
+"""