mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-08 15:05:17 +02:00
更新了eval BUG,同时更新了新的baseline
This commit is contained in:
parent
b9a2d94da2
commit
b805da0bbe
20 changed files with 274 additions and 40 deletions
|
|
@ -39,12 +39,12 @@ def loose_match_score(expected_output: str, prediction: str, tolerance: float =
|
|||
return 0
|
||||
|
||||
|
||||
async def load_data(file_path: str, samples=1) -> List[dict]:
|
||||
async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
|
||||
data = []
|
||||
async with aiofiles.open(file_path, mode="r") as file:
|
||||
async for line in file:
|
||||
data.append(json.loads(line))
|
||||
random_indices = generate_random_indices(len(data), samples)
|
||||
random_indices = generate_random_indices(len(data), samples, test=test)
|
||||
data = [data[i] for i in random_indices]
|
||||
return data
|
||||
|
||||
|
|
@ -64,26 +64,33 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
|
|||
prompt = input
|
||||
max_retries = 5
|
||||
retries = 0
|
||||
|
||||
while retries < max_retries:
|
||||
try:
|
||||
prediction = await graph(prompt)
|
||||
cost = prediction[1]
|
||||
output = prediction[0]["solution"]
|
||||
prediction = await graph(prompt)
|
||||
cost = prediction[1]
|
||||
output = prediction[0]["solution"]
|
||||
|
||||
score = loose_match_score(expected_output, output)
|
||||
break
|
||||
print(output)
|
||||
|
||||
except Exception as e:
|
||||
retries += 1
|
||||
print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
|
||||
score = loose_match_score(expected_output, output)
|
||||
# break
|
||||
# while retries < max_retries:
|
||||
# try:
|
||||
# prediction = await graph(prompt)
|
||||
# cost = prediction[1]
|
||||
# output = prediction[0]["solution"]
|
||||
|
||||
if retries == max_retries:
|
||||
print("Maximum retries reached. Skipping this sample.")
|
||||
output = None
|
||||
cost = None
|
||||
score = 0
|
||||
break
|
||||
# score = loose_match_score(expected_output, output)
|
||||
# break
|
||||
|
||||
# except Exception as e:
|
||||
# retries += 1
|
||||
# print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
|
||||
|
||||
# if retries == max_retries:
|
||||
# print("Maximum retries reached. Skipping this sample.")
|
||||
# output = None
|
||||
# cost = None
|
||||
# score = 0
|
||||
# break
|
||||
|
||||
return input, output, expected_output, score, cost
|
||||
|
||||
|
|
@ -101,9 +108,9 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
|
|||
|
||||
return await tqdm_asyncio.gather(*tasks, desc="Evaluating problems", total=len(data))
|
||||
|
||||
async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
|
||||
async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
|
||||
"""GSM8K evaluation main function"""
|
||||
data = await load_data(file_path, samples)
|
||||
data = await load_data(file_path, samples, test=test)
|
||||
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5)
|
||||
average_score, total_cost = save_results_to_csv(results, path=path)
|
||||
print(f"Average score: {average_score:.5f}")
|
||||
|
|
|
|||
133
examples/ags/experiments/baselines/medprompt_gsm8k.py
Normal file
133
examples/ags/experiments/baselines/medprompt_gsm8k.py
Normal file
|
|
@ -0,0 +1,133 @@
|
|||
from examples.ags.scripts.operator import Operator
|
||||
from examples.ags.scripts.graph import SolveGraph
|
||||
from examples.ags.benchmark.gsm8k import gsm8k_evaluation
|
||||
from examples.ags.scripts.operator_an import GenerateOp
|
||||
from metagpt.actions.action_node import ActionNode
|
||||
from metagpt.configs.models_config import ModelsConfig
|
||||
from metagpt.llm import LLM
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Dict, Any, List, Tuple
|
||||
from collections import Counter
|
||||
|
||||
import random
|
||||
|
||||
# Chain-of-thought prompt template; `{question}` is filled in with str.format
# by CoTGenerate.__call__ in this file.
GSM8K_PROMPT_GPT = """
|
||||
{question}\nPlease reason step by step, and to ensure accuracy, provide the correct answer in the final, without any additional text.
|
||||
"""
|
||||
|
||||
# Variant that asks for the final answer inside \boxed{}; the doubled braces
# survive str.format as a literal `{}`. Not referenced by the code visible in
# this file.
GSM8K_PROMPT_DS = """
|
||||
{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
|
||||
"""
|
||||
|
||||
# Output schema handed to ActionNode.from_pydantic by CoTGenerate: a single
# free-text `solution` field that defaults to the empty string.
# NOTE(review): this definition rebinds the `GenerateOp` name imported from
# examples.ags.scripts.operator_an at the top of the file - confirm which one
# is intended.
class GenerateOp(BaseModel):
|
||||
solution: str = Field(default="", description="solution for the problem")
|
||||
|
||||
class CoTGenerate(Operator):
    """Operator that asks the LLM for one step-by-step solution to a problem."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        # The base Operator takes (name, llm) in that order.
        super().__init__(name, llm)

    async def __call__(self, problem, mode: str = None):
        """Fill a ``GenerateOp`` node for ``problem`` and return it as a dict.

        ``mode`` is forwarded to ``ActionNode.fill`` only when it is truthy.
        """
        question_prompt = GSM8K_PROMPT_GPT.format(question=problem)
        optional_args = {"mode": mode} if mode else {}
        filled_node = await ActionNode.from_pydantic(GenerateOp).fill(
            context=question_prompt, llm=self.llm, **optional_args
        )
        return filled_node.instruct_content.model_dump()
|
||||
|
||||
# Voting prompt used by MdEnsemble.__call__: `{question}` receives the problem
# text and `{solutions}` the letter-labelled candidate list; the model is asked
# to answer with the letter of the best candidate.
MD_ENSEMBLE_PROMPT = """
|
||||
You are given a problem:
|
||||
{question}
|
||||
|
||||
Here is a list of possible solutions to the problem:
|
||||
{solutions}
|
||||
|
||||
Using the inputs above, your goal is to choose the best solution to the problem.
|
||||
The main consideration is that the solution can fully solve the problem in a correct and robust manner.
|
||||
Provide your final decision by writing the chosen solution letter.
|
||||
|
||||
Please follow the required format in your response.
|
||||
"""
|
||||
|
||||
# Output schema for one MdEnsemble voting round: a free-text `thought` plus the
# `solution_letter` that MdEnsemble.__call__ maps back to a candidate index.
# Both fields default to the empty string.
class MdEnsembleOp(BaseModel):
|
||||
thought: str = Field(
|
||||
default="",
|
||||
description="Step-by-step analysis of the solutions to determine the best one.",
|
||||
)
|
||||
solution_letter: str = Field(default="", description="The letter of the chosen best solution (only one letter).")
|
||||
|
||||
|
||||
class MdEnsemble(Operator):
    """
    Paper: Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine
    Link: https://arxiv.org/abs/2311.16452

    Runs ``vote_count`` voting rounds. In each round the candidate solutions
    are shuffled, labelled A, B, C, ... and shown to the LLM, which answers
    with one letter. The candidate that collects the most votes is returned.
    """

    def __init__(self, name: str = "MdEnsemble", llm: LLM = None, vote_count: int = 3):
        """Create the operator.

        Args:
            name: Operator name passed to the base ``Operator``.
            llm: LLM used for the voting rounds. When omitted, a default
                ``LLM()`` is created for this instance.
            vote_count: Number of voting rounds per call.
        """
        # Writing ``llm: LLM = LLM()`` in the signature would build the default
        # once, when the class body is executed, and share it between all
        # instances; building it here keeps the fallback per instance.
        super().__init__(name, LLM() if llm is None else llm)
        self.vote_count = vote_count

    @staticmethod
    def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, int]]:
        """Return a shuffled copy of ``solutions`` and a letter -> index map.

        The map sends each display letter ("A" for the first shuffled entry,
        "B" for the second, ...) to the index of that text in the original
        ``solutions`` list. Identical texts resolve to their first occurrence,
        so votes for duplicated solutions are pooled.
        """
        shuffled_solutions = solutions.copy()
        random.shuffle(shuffled_solutions)
        answer_mapping = {
            chr(65 + position): solutions.index(solution)
            for position, solution in enumerate(shuffled_solutions)
        }
        return shuffled_solutions, answer_mapping

    async def __call__(self, solutions: List[str], problem: str, mode: str = None):
        """Vote among ``solutions`` for ``problem`` and return the winner.

        Args:
            solutions: Candidate solution texts (at least one).
            problem: Problem statement inserted into the prompt.
            mode: Optional fill mode forwarded to ``ActionNode.fill``.

        Returns:
            ``{"solution": <winning candidate text>}``. If no round produces a
            usable letter, the first candidate is returned.

        Raises:
            ValueError: If ``solutions`` is empty.
        """
        print(f"solution count: {len(solutions)}")
        if not solutions:
            # Fail before spending LLM calls on a prompt with nothing to choose from.
            raise ValueError("MdEnsemble needs at least one candidate solution")

        votes = []
        for _ in range(self.vote_count):
            shuffled_solutions, answer_mapping = self.shuffle_answers(solutions)
            solution_text = "".join(
                f"{chr(65 + index)}: \n{str(solution)}\n\n\n"
                for index, solution in enumerate(shuffled_solutions)
            )

            prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem)
            fill_kwargs = {"context": prompt, "llm": self.llm}
            if mode:
                fill_kwargs["mode"] = mode
            node = await ActionNode.from_pydantic(MdEnsembleOp).fill(**fill_kwargs)
            response = node.instruct_content.model_dump()

            answer = response.get("solution_letter", "").strip().upper()
            # Replies that are not one of the offered letters are ignored.
            if answer in answer_mapping:
                votes.append(answer_mapping[answer])

        if not votes:
            # Without this guard, most_common(1)[0] raises IndexError when
            # every round returned an unusable letter (or vote_count is 0).
            print("MdEnsemble: no valid vote received, falling back to the first solution.")
            return {"solution": solutions[0]}

        most_frequent_index = Counter(votes).most_common(1)[0][0]
        return {"solution": solutions[most_frequent_index]}
|
||||
|
||||
class MedPromptGraph(SolveGraph):
    """Baseline graph: sample two chain-of-thought solutions, then let MdEnsemble vote."""

    def __init__(self, name: str, llm_config, dataset: str, vote_count: int = 3):
        """Build the generate and ensemble operators.

        Args:
            name: Graph name forwarded to ``SolveGraph``.
            llm_config: Model configuration forwarded to ``SolveGraph``.
            dataset: Dataset name forwarded to ``SolveGraph``.
            vote_count: Number of voting rounds used by ``MdEnsemble``.
        """
        super().__init__(name, llm_config, dataset)
        self.cot_generate = CoTGenerate(self.llm)
        # MdEnsemble's first positional parameter is `name`, so the LLM has to
        # be passed by keyword. Passed positionally it would become the
        # operator name and the ensemble would run on the default LLM(),
        # whose usage is not reflected in self.llm.cost_manager.
        self.md_ensemble = MdEnsemble(llm=self.llm, vote_count=vote_count)

    async def __call__(self, problem):
        """Solve ``problem``; return ``(ensemble result dict, total LLM cost so far)``."""
        solutions = []
        for _ in range(2):
            solution = await self.cot_generate(problem, mode="context_fill")
            solutions.append(solution["solution"])
        solution = await self.md_ensemble(solutions, problem, mode="context_fill")
        return solution, self.llm.cost_manager.total_cost
|
||||
|
||||
if __name__ == "__main__":
    import asyncio

    async def main():
        """Evaluate the MedPrompt baseline on GSM8K and return ``(score, cost)``."""
        # Alternative model configurations:
        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
        model_config = ModelsConfig.default().get("deepseek-coder")
        baseline = MedPromptGraph(name="MedPrompt", llm_config=model_config, dataset="Gsm8K", vote_count=2)
        score, cost = await gsm8k_evaluation(
            baseline,
            "examples/ags/data/gsm8k.jsonl",
            264,
            "examples/ags/data/baselines/general",
            test=True,
        )
        return score, cost

    asyncio.run(main())
|
||||
109
examples/ags/experiments/baselines/self_consistency_gsm8k.py
Normal file
109
examples/ags/experiments/baselines/self_consistency_gsm8k.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
from examples.ags.scripts.operator import Operator
|
||||
from examples.ags.scripts.graph import SolveGraph
|
||||
from examples.ags.benchmark.gsm8k import gsm8k_evaluation
|
||||
from examples.ags.scripts.operator_an import GenerateOp
|
||||
from metagpt.actions.action_node import ActionNode
|
||||
from metagpt.configs.models_config import ModelsConfig
|
||||
from metagpt.llm import LLM
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Dict, Any, List, Tuple
|
||||
from collections import Counter
|
||||
|
||||
import random
|
||||
|
||||
# Chain-of-thought prompt template; `{question}` is filled in with str.format
# by CoTGenerate.__call__ in this file.
GSM8K_PROMPT_GPT = """
|
||||
{question}\nPlease reason step by step, and to ensure accuracy, provide the correct answer in the final, without any additional text.
|
||||
"""
|
||||
|
||||
# Variant that asks for the final answer inside \boxed{}; the doubled braces
# survive str.format as a literal `{}`. Not referenced by the code visible in
# this file.
GSM8K_PROMPT_DS = """
|
||||
{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
|
||||
"""
|
||||
|
||||
# Output schema handed to ActionNode.from_pydantic by CoTGenerate: a single
# free-text `solution` field that defaults to the empty string.
# NOTE(review): this definition rebinds the `GenerateOp` name imported from
# examples.ags.scripts.operator_an at the top of the file - confirm which one
# is intended.
class GenerateOp(BaseModel):
|
||||
solution: str = Field(default="", description="solution for the problem")
|
||||
|
||||
class CoTGenerate(Operator):
    """Operator that asks the LLM for one step-by-step solution to a problem."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        # The base Operator takes (name, llm) in that order.
        super().__init__(name, llm)

    async def __call__(self, problem, mode: str = None):
        """Fill a ``GenerateOp`` node for ``problem`` and return it as a dict.

        ``mode`` is forwarded to ``ActionNode.fill`` only when it is truthy.
        """
        question_prompt = GSM8K_PROMPT_GPT.format(question=problem)
        optional_args = {"mode": mode} if mode else {}
        filled_node = await ActionNode.from_pydantic(GenerateOp).fill(
            context=question_prompt, llm=self.llm, **optional_args
        )
        return filled_node.instruct_content.model_dump()
|
||||
|
||||
# Selection prompt used by ScEnsemble.__call__: `{question}` receives the
# problem text and `{solutions}` the letter-labelled candidate list; the model
# is asked to reply with the id of the most consistent candidate only.
SC_ENSEMBLE_PROMPT = """
|
||||
Given the question descripted as follows: {question}
|
||||
some solutions are generated to solve the question as follows:
|
||||
{solutions}
|
||||
|
||||
Evaluate these solutions and select the most consistent solution based on majority consensus.
|
||||
Give your answer with a single id of solution (without anything else).
|
||||
"""
|
||||
|
||||
# Output schema for ScEnsemble: the single `solution_letter` that
# ScEnsemble.__call__ maps back to a candidate index; defaults to "".
class ScEnsembleOp(BaseModel):
|
||||
solution_letter: str = Field(default="", description="The letter of most consistent solution.")
|
||||
|
||||
|
||||
class ScEnsemble(Operator):
    """
    Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
    Link: https://arxiv.org/abs/2203.11171
    Paper: Universal Self-Consistency for Large Language Model Generation
    Link: https://arxiv.org/abs/2311.17311

    Shows the LLM all candidate solutions, labelled A, B, C, ..., and returns
    the candidate whose letter it answers with.
    """

    def __init__(self, name: str = "ScEnsemble", llm: LLM = None):
        """Create the operator.

        Args:
            name: Operator name passed to the base ``Operator``.
            llm: LLM used for the selection. When omitted, a default ``LLM()``
                is created for this instance.
        """
        # Writing ``llm: LLM = LLM()`` in the signature would build the default
        # once, when the class body is executed, and share it between all
        # instances; building it here keeps the fallback per instance.
        super().__init__(name, LLM() if llm is None else llm)

    async def __call__(self, solutions: List[str], problem: str, mode: str = None):
        """Ask the LLM which of ``solutions`` is the most consistent one.

        Args:
            solutions: Candidate solution texts (at least one).
            problem: Problem statement inserted into the prompt.
            mode: Optional fill mode forwarded to ``ActionNode.fill``.

        Returns:
            ``{"solution": <chosen candidate text>}``. If the reply is not one
            of the offered letters, the first candidate is returned.

        Raises:
            ValueError: If ``solutions`` is empty.
        """
        if not solutions:
            # Fail before spending an LLM call on a prompt with nothing to choose from.
            raise ValueError("ScEnsemble needs at least one candidate solution")

        answer_mapping = {chr(65 + index): index for index in range(len(solutions))}
        solution_text = "".join(
            f"{chr(65 + index)}: \n{str(solution)}\n\n\n"
            for index, solution in enumerate(solutions)
        )

        prompt = SC_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem)
        fill_kwargs = {"context": prompt, "llm": self.llm}
        if mode:
            fill_kwargs["mode"] = mode
        node = await ActionNode.from_pydantic(ScEnsembleOp).fill(**fill_kwargs)
        response = node.instruct_content.model_dump()

        answer = response.get("solution_letter", "").strip().upper()
        if answer not in answer_mapping:
            # A direct answer_mapping[answer] lookup raises KeyError for any
            # reply that is not exactly one of the offered letters.
            print(f"ScEnsemble: unexpected answer {answer!r}, falling back to the first solution.")
            return {"solution": solutions[0]}

        return {"solution": solutions[answer_mapping[answer]]}
|
||||
|
||||
|
||||
class SelfConsistencyGraph(SolveGraph):
    """Baseline graph: sample two chain-of-thought solutions, then let ScEnsemble pick one."""

    def __init__(self, name: str, llm_config, dataset: str):
        """Build the generate and ensemble operators.

        Args:
            name: Graph name forwarded to ``SolveGraph``.
            llm_config: Model configuration forwarded to ``SolveGraph``.
            dataset: Dataset name forwarded to ``SolveGraph``.
        """
        super().__init__(name, llm_config, dataset)
        self.cot_generate = CoTGenerate(self.llm)
        # ScEnsemble's first positional parameter is `name`, so the LLM has to
        # be passed by keyword. Passed positionally it would become the
        # operator name and the ensemble would run on the default LLM(),
        # whose usage is not reflected in self.llm.cost_manager.
        self.sc_ensemble = ScEnsemble(llm=self.llm)

    async def __call__(self, problem):
        """Solve ``problem``; return ``(ensemble result dict, total LLM cost so far)``."""
        solutions = []
        for _ in range(2):
            solution = await self.cot_generate(problem, mode="context_fill")
            solutions.append(solution["solution"])
        solution = await self.sc_ensemble(solutions, problem, mode="context_fill")
        return solution, self.llm.cost_manager.total_cost
|
||||
|
||||
if __name__ == "__main__":
    import asyncio

    async def main():
        """Evaluate the self-consistency baseline on GSM8K and return ``(score, cost)``."""
        # Alternative model configurations:
        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
        model_config = ModelsConfig.default().get("deepseek-coder")
        baseline = SelfConsistencyGraph(name="SelfConsistency", llm_config=model_config, dataset="Gsm8K")
        score, cost = await gsm8k_evaluation(
            baseline,
            "examples/ags/data/gsm8k.jsonl",
            1,
            "examples/ags/data/baselines/general",
            test=False,
        )
        return score, cost

    asyncio.run(main())
|
||||
|
|
@ -127,17 +127,17 @@ Based on the given problem and solution candidates:
|
|||
"""
|
||||
|
||||
MD_ENSEMBLE_PROMPT = """
|
||||
You are given a coding problem:
|
||||
You are given a problem:
|
||||
{problem_description}
|
||||
|
||||
Here is a list of possible solutions to the problem:
|
||||
{solutions}
|
||||
|
||||
Using the inputs above, your goal is to choose the best solution to the code contest problem.
|
||||
Don't just pick the most efficient solution. The main consideration is that the solution can fully solve the problem in a correct and robust manner.
|
||||
Using the inputs above, your goal is to choose the best solution to the problem.
|
||||
The main consideration is that the solution can fully solve the problem in a correct and robust manner.
|
||||
Provide your final decision by writing the chosen solution letter.
|
||||
|
||||
Please maintain the JSON format in your response.
|
||||
Please follow the required format in your response.
|
||||
"""
|
||||
|
||||
SC_ENSEMBLE_PROMPT = """
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue