mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-08 15:05:17 +02:00
更新了eval BUG,同时更新了新的baseline
This commit is contained in:
parent
b9a2d94da2
commit
b805da0bbe
20 changed files with 274 additions and 40 deletions
|
|
@ -39,12 +39,12 @@ def loose_match_score(expected_output: str, prediction: str, tolerance: float =
|
|||
return 0
|
||||
|
||||
|
||||
async def load_data(file_path: str, samples=1) -> List[dict]:
|
||||
async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
|
||||
data = []
|
||||
async with aiofiles.open(file_path, mode="r") as file:
|
||||
async for line in file:
|
||||
data.append(json.loads(line))
|
||||
random_indices = generate_random_indices(len(data), samples)
|
||||
random_indices = generate_random_indices(len(data), samples, test=test)
|
||||
data = [data[i] for i in random_indices]
|
||||
return data
|
||||
|
||||
|
|
@ -64,26 +64,33 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
|
|||
prompt = input
|
||||
max_retries = 5
|
||||
retries = 0
|
||||
|
||||
while retries < max_retries:
|
||||
try:
|
||||
prediction = await graph(prompt)
|
||||
cost = prediction[1]
|
||||
output = prediction[0]["solution"]
|
||||
prediction = await graph(prompt)
|
||||
cost = prediction[1]
|
||||
output = prediction[0]["solution"]
|
||||
|
||||
score = loose_match_score(expected_output, output)
|
||||
break
|
||||
print(output)
|
||||
|
||||
except Exception as e:
|
||||
retries += 1
|
||||
print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
|
||||
score = loose_match_score(expected_output, output)
|
||||
# break
|
||||
# while retries < max_retries:
|
||||
# try:
|
||||
# prediction = await graph(prompt)
|
||||
# cost = prediction[1]
|
||||
# output = prediction[0]["solution"]
|
||||
|
||||
if retries == max_retries:
|
||||
print("Maximum retries reached. Skipping this sample.")
|
||||
output = None
|
||||
cost = None
|
||||
score = 0
|
||||
break
|
||||
# score = loose_match_score(expected_output, output)
|
||||
# break
|
||||
|
||||
# except Exception as e:
|
||||
# retries += 1
|
||||
# print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
|
||||
|
||||
# if retries == max_retries:
|
||||
# print("Maximum retries reached. Skipping this sample.")
|
||||
# output = None
|
||||
# cost = None
|
||||
# score = 0
|
||||
# break
|
||||
|
||||
return input, output, expected_output, score, cost
|
||||
|
||||
|
|
@ -101,9 +108,9 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
|
|||
|
||||
return await tqdm_asyncio.gather(*tasks, desc="Evaluating problems", total=len(data))
|
||||
|
||||
async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
|
||||
async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
|
||||
"""GSM8K evaluation main function"""
|
||||
data = await load_data(file_path, samples)
|
||||
data = await load_data(file_path, samples, test=test)
|
||||
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5)
|
||||
average_score, total_cost = save_results_to_csv(results, path=path)
|
||||
print(f"Average score: {average_score:.5f}")
|
||||
|
|
|
|||
133
examples/ags/experiments/baselines/medprompt_gsm8k.py
Normal file
133
examples/ags/experiments/baselines/medprompt_gsm8k.py
Normal file
|
|
@ -0,0 +1,133 @@
|
|||
from examples.ags.scripts.operator import Operator
|
||||
from examples.ags.scripts.graph import SolveGraph
|
||||
from examples.ags.benchmark.gsm8k import gsm8k_evaluation
|
||||
from examples.ags.scripts.operator_an import GenerateOp
|
||||
from metagpt.actions.action_node import ActionNode
|
||||
from metagpt.configs.models_config import ModelsConfig
|
||||
from metagpt.llm import LLM
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Dict, Any, List, Tuple
|
||||
from collections import Counter
|
||||
|
||||
import random
|
||||
|
||||
# Chain-of-thought prompt template; ``{question}`` is filled in by
# ``CoTGenerate.__call__`` via ``str.format``.
GSM8K_PROMPT_GPT = """
{question}\nPlease reason step by step, and to ensure accuracy, provide the correct answer in the final, without any additional text.
"""

# Alternative template asking for the final answer inside ``\boxed{}`` (the
# doubled braces survive ``str.format`` as a literal ``{}``). Not referenced
# elsewhere in this file. NOTE(review): "DS" presumably stands for DeepSeek — confirm.
GSM8K_PROMPT_DS = """
{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
"""
|
||||
|
||||
# Output schema handed to ``ActionNode.from_pydantic`` by ``CoTGenerate``.
# NOTE: this local definition shadows the ``GenerateOp`` imported from
# ``examples.ags.scripts.operator_an`` at the top of the file.
class GenerateOp(BaseModel):
    # Free-text solution produced by the model; empty string when nothing was filled.
    solution: str = Field(default="", description="solution for the problem")
|
||||
|
||||
class CoTGenerate(Operator):
    """Operator that asks the LLM for one step-by-step solution to a problem."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        # The base ``Operator`` takes ``(name, llm)`` — the reverse of this signature.
        super().__init__(name, llm)

    async def __call__(self, problem, mode: str = None):
        """
        Format ``problem`` into ``GSM8K_PROMPT_GPT`` and fill a ``GenerateOp`` node.

        Args:
            problem: Question text substituted for ``{question}`` in the prompt.
            mode: Optional fill mode forwarded to ``ActionNode.fill``; it is
                omitted from the call when falsy.

        Returns:
            The filled node's ``instruct_content`` dumped to a dict.
        """
        fill_kwargs = {
            "context": GSM8K_PROMPT_GPT.format(question=problem),
            "llm": self.llm,
        }
        if mode:
            fill_kwargs["mode"] = mode
        filled_node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
        return filled_node.instruct_content.model_dump()
|
||||
|
||||
MD_ENSEMBLE_PROMPT = """
|
||||
You are given a problem:
|
||||
{question}
|
||||
|
||||
Here is a list of possible solutions to the problem:
|
||||
{solutions}
|
||||
|
||||
Using the inputs above, your goal is to choose the best solution to the problem.
|
||||
The main consideration is that the solution can fully solve the problem in a correct and robust manner.
|
||||
Provide your final decision by writing the chosen solution letter.
|
||||
|
||||
Please follow the required format in your response.
|
||||
"""
|
||||
|
||||
# Output schema for a single ``MdEnsemble`` vote; passed to
# ``ActionNode.from_pydantic`` in ``MdEnsemble.__call__``.
class MdEnsembleOp(BaseModel):
    # Model's reasoning about the candidates; not read back by ``MdEnsemble``.
    thought: str = Field(
        default="",
        description="Step-by-step analysis of the solutions to determine the best one.",
    )
    # Letter of the chosen candidate; ``MdEnsemble`` strips and upper-cases it
    # before mapping it back to an index.
    solution_letter: str = Field(default="", description="The letter of the chosen best solution (only one letter).")
|
||||
|
||||
|
||||
class MdEnsemble(Operator):
    """
    Choose the best candidate solution by repeated LLM votes over shuffled orderings.

    Paper: Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine
    Link: https://arxiv.org/abs/2311.16452
    """

    def __init__(self, name: str = "MdEnsemble", llm: LLM = None, vote_count: int = 3):
        """
        Args:
            name: Operator name forwarded to ``Operator.__init__``.
            llm: LLM used for voting. When omitted, a fresh ``LLM()`` is created.
            vote_count: Number of voting rounds run per call.
        """
        # A ``LLM()`` default in the signature would be evaluated once, when the
        # class body is executed (i.e. on import); build the fallback lazily instead.
        super().__init__(name, llm if llm is not None else LLM())
        self.vote_count = vote_count

    @staticmethod
    def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, int]]:
        """
        Return a shuffled copy of ``solutions`` plus a letter -> original-index map.

        Letters start at ``"A"`` and follow the shuffled order. Identical
        solution strings all map to the index of their first occurrence in
        ``solutions`` (``list.index``), so votes for duplicates are pooled.
        """
        shuffled_solutions = solutions.copy()
        random.shuffle(shuffled_solutions)
        answer_mapping = {chr(65 + i): solutions.index(solution) for i, solution in enumerate(shuffled_solutions)}
        return shuffled_solutions, answer_mapping

    async def __call__(self, solutions: List[str], problem: str, mode: str = None):
        """
        Run ``vote_count`` voting rounds and return the most-voted solution.

        Args:
            solutions: Candidate solution texts.
            problem: Problem statement inserted into ``MD_ENSEMBLE_PROMPT``.
            mode: Optional fill mode forwarded to ``ActionNode.fill``.

        Returns:
            ``{"solution": <chosen solution text>}``. When no round produces a
            letter that maps to a candidate, the first candidate is returned.

        Raises:
            IndexError: If ``solutions`` is empty.
        """
        print(f"solution count: {len(solutions)}")
        votes = Counter()

        for _ in range(self.vote_count):
            # Re-shuffle on every round so the vote is not biased by position.
            shuffled_solutions, answer_mapping = self.shuffle_answers(solutions)

            solution_text = "".join(
                f"{chr(65 + index)}: \n{str(solution)}\n\n\n" for index, solution in enumerate(shuffled_solutions)
            )

            prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem)
            fill_kwargs = {"context": prompt, "llm": self.llm}
            if mode:
                fill_kwargs["mode"] = mode
            node = await ActionNode.from_pydantic(MdEnsembleOp).fill(**fill_kwargs)
            response = node.instruct_content.model_dump()

            answer = (response.get("solution_letter") or "").strip().upper()

            # Replies that are not one of the offered letters are ignored.
            if answer in answer_mapping:
                votes[answer_mapping[answer]] += 1

        if not votes:
            # ``most_common(1)`` is empty here; indexing it would raise IndexError.
            print("No valid vote received. Falling back to the first solution.")
            return {"solution": solutions[0]}

        most_frequent_index = votes.most_common(1)[0][0]
        return {"solution": solutions[most_frequent_index]}
|
||||
|
||||
class MedPromptGraph(SolveGraph):
    """Baseline graph: generate two chain-of-thought solutions, then pick one with ``MdEnsemble``."""

    def __init__(self, name: str, llm_config, dataset: str, vote_count: int = 3):
        """
        Args:
            name: Graph name forwarded to ``SolveGraph.__init__``.
            llm_config: LLM configuration forwarded to ``SolveGraph.__init__``.
            dataset: Dataset label forwarded to ``SolveGraph.__init__``.
            vote_count: Number of voting rounds used by the ensemble.
        """
        super().__init__(name, llm_config, dataset)
        self.cot_generate = CoTGenerate(self.llm)
        # ``MdEnsemble``'s first positional parameter is ``name``, not ``llm``.
        # Passing the LLM by keyword makes the ensemble use this graph's LLM
        # instead of silently falling back to its own default one (whose usage
        # would presumably not show up in ``self.llm.cost_manager``).
        self.md_ensemble = MdEnsemble(llm=self.llm, vote_count=vote_count)

    async def __call__(self, problem):
        """
        Solve ``problem`` and report the accumulated LLM cost.

        Returns:
            A tuple ``(solution, total_cost)`` where ``solution`` is the dict
            returned by the ensemble (key ``"solution"``) and ``total_cost`` is
            read from ``self.llm.cost_manager.total_cost``.
        """
        solutions = []
        for _ in range(2):
            solution = await self.cot_generate(problem, mode="context_fill")
            solutions.append(solution["solution"])
        solution = await self.md_ensemble(solutions, problem, mode="context_fill")
        return solution, self.llm.cost_manager.total_cost
|
||||
|
||||
if __name__ == "__main__":
    import asyncio

    async def main():
        """Evaluate the MedPrompt baseline on GSM8K and return ``(score, cost)``."""
        # Alternative model configurations:
        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
        llm_config = ModelsConfig.default().get("deepseek-coder")
        graph = MedPromptGraph(name="MedPrompt", llm_config=llm_config, dataset="Gsm8K", vote_count=2)
        score, cost = await gsm8k_evaluation(
            graph,
            "examples/ags/data/gsm8k.jsonl",
            264,
            "examples/ags/data/baselines/general",
            test=True,
        )
        return score, cost

    asyncio.run(main())
|
||||
109
examples/ags/experiments/baselines/self_consistency_gsm8k.py
Normal file
109
examples/ags/experiments/baselines/self_consistency_gsm8k.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
from examples.ags.scripts.operator import Operator
|
||||
from examples.ags.scripts.graph import SolveGraph
|
||||
from examples.ags.benchmark.gsm8k import gsm8k_evaluation
|
||||
from examples.ags.scripts.operator_an import GenerateOp
|
||||
from metagpt.actions.action_node import ActionNode
|
||||
from metagpt.configs.models_config import ModelsConfig
|
||||
from metagpt.llm import LLM
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Dict, Any, List, Tuple
|
||||
from collections import Counter
|
||||
|
||||
import random
|
||||
|
||||
# Chain-of-thought prompt template; ``{question}`` is filled in by
# ``CoTGenerate.__call__`` via ``str.format``.
GSM8K_PROMPT_GPT = """
{question}\nPlease reason step by step, and to ensure accuracy, provide the correct answer in the final, without any additional text.
"""

# Alternative template asking for the final answer inside ``\boxed{}`` (the
# doubled braces survive ``str.format`` as a literal ``{}``). Not referenced
# elsewhere in this file. NOTE(review): "DS" presumably stands for DeepSeek — confirm.
GSM8K_PROMPT_DS = """
{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
"""
|
||||
|
||||
# Output schema handed to ``ActionNode.from_pydantic`` by ``CoTGenerate``.
# NOTE: this local definition shadows the ``GenerateOp`` imported from
# ``examples.ags.scripts.operator_an`` at the top of the file.
class GenerateOp(BaseModel):
    # Free-text solution produced by the model; empty string when nothing was filled.
    solution: str = Field(default="", description="solution for the problem")
|
||||
|
||||
class CoTGenerate(Operator):
    """Operator that asks the LLM for one step-by-step solution to a problem."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        # The base ``Operator`` takes ``(name, llm)`` — the reverse of this signature.
        super().__init__(name, llm)

    async def __call__(self, problem, mode: str = None):
        """
        Format ``problem`` into ``GSM8K_PROMPT_GPT`` and fill a ``GenerateOp`` node.

        Args:
            problem: Question text substituted for ``{question}`` in the prompt.
            mode: Optional fill mode forwarded to ``ActionNode.fill``; it is
                omitted from the call when falsy.

        Returns:
            The filled node's ``instruct_content`` dumped to a dict.
        """
        fill_kwargs = {
            "context": GSM8K_PROMPT_GPT.format(question=problem),
            "llm": self.llm,
        }
        if mode:
            fill_kwargs["mode"] = mode
        filled_node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
        return filled_node.instruct_content.model_dump()
|
||||
|
||||
SC_ENSEMBLE_PROMPT = """
|
||||
Given the question descripted as follows: {question}
|
||||
some solutions are generated to solve the question as follows:
|
||||
{solutions}
|
||||
|
||||
Evaluate these solutions and select the most consistent solution based on majority consensus.
|
||||
Give your answer with a single id of solution (without anything else).
|
||||
"""
|
||||
|
||||
# Output schema for the ``ScEnsemble`` selection; passed to
# ``ActionNode.from_pydantic`` in ``ScEnsemble.__call__``.
class ScEnsembleOp(BaseModel):
    # Letter of the chosen candidate; ``ScEnsemble`` strips and upper-cases it
    # before mapping it back to an index.
    solution_letter: str = Field(default="", description="The letter of most consistent solution.")
|
||||
|
||||
|
||||
class ScEnsemble(Operator):
    """
    Ask the LLM to pick the most consistent solution among the candidates.

    Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
    Link: https://arxiv.org/abs/2203.11171
    Paper: Universal Self-Consistency for Large Language Model Generation
    Link: https://arxiv.org/abs/2311.17311
    """

    def __init__(self, name: str = "ScEnsemble", llm: LLM = None):
        """
        Args:
            name: Operator name forwarded to ``Operator.__init__``.
            llm: LLM used for the selection. When omitted, a fresh ``LLM()`` is created.
        """
        # A ``LLM()`` default in the signature would be evaluated once, when the
        # class body is executed (i.e. on import); build the fallback lazily instead.
        super().__init__(name, llm if llm is not None else LLM())

    async def __call__(self, solutions: List[str], problem: str, mode: str = None):
        """
        Present the lettered candidates to the LLM and return the one it selects.

        Args:
            solutions: Candidate solution texts, labelled ``A``, ``B``, ... in order.
            problem: Problem statement inserted into ``SC_ENSEMBLE_PROMPT``.
            mode: Optional fill mode forwarded to ``ActionNode.fill``.

        Returns:
            ``{"solution": <chosen solution text>}``. When the reply is not one
            of the offered letters, the first candidate is returned.

        Raises:
            IndexError: If ``solutions`` is empty.
        """
        answer_mapping = {chr(65 + index): index for index in range(len(solutions))}
        solution_text = "".join(
            f"{chr(65 + index)}: \n{str(solution)}\n\n\n" for index, solution in enumerate(solutions)
        )

        prompt = SC_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem)
        fill_kwargs = {"context": prompt, "llm": self.llm}
        if mode:
            fill_kwargs["mode"] = mode
        node = await ActionNode.from_pydantic(ScEnsembleOp).fill(**fill_kwargs)
        response = node.instruct_content.model_dump()

        answer = (response.get("solution_letter") or "").strip().upper()

        chosen_index = answer_mapping.get(answer)
        if chosen_index is None:
            # A direct ``answer_mapping[answer]`` lookup would raise KeyError on
            # any reply that is not exactly one of the offered letters.
            print(f"Invalid solution letter {answer!r}. Falling back to the first solution.")
            chosen_index = 0

        return {"solution": solutions[chosen_index]}
|
||||
|
||||
|
||||
class SelfConsistencyGraph(SolveGraph):
    """Baseline graph: generate two chain-of-thought solutions, then pick one with ``ScEnsemble``."""

    def __init__(self, name: str, llm_config, dataset: str):
        """
        Args:
            name: Graph name forwarded to ``SolveGraph.__init__``.
            llm_config: LLM configuration forwarded to ``SolveGraph.__init__``.
            dataset: Dataset label forwarded to ``SolveGraph.__init__``.
        """
        super().__init__(name, llm_config, dataset)
        self.cot_generate = CoTGenerate(self.llm)
        # ``ScEnsemble``'s first positional parameter is ``name``, not ``llm``.
        # Passing the LLM by keyword makes the ensemble use this graph's LLM
        # instead of silently falling back to its own default one (whose usage
        # would presumably not show up in ``self.llm.cost_manager``).
        self.sc_ensemble = ScEnsemble(llm=self.llm)

    async def __call__(self, problem):
        """
        Solve ``problem`` and report the accumulated LLM cost.

        Returns:
            A tuple ``(solution, total_cost)`` where ``solution`` is the dict
            returned by the ensemble (key ``"solution"``) and ``total_cost`` is
            read from ``self.llm.cost_manager.total_cost``.
        """
        solutions = []
        for _ in range(2):
            solution = await self.cot_generate(problem, mode="context_fill")
            solutions.append(solution["solution"])
        solution = await self.sc_ensemble(solutions, problem, mode="context_fill")
        return solution, self.llm.cost_manager.total_cost
|
||||
|
||||
if __name__ == "__main__":
    import asyncio

    async def main():
        """Evaluate the self-consistency baseline on GSM8K and return ``(score, cost)``."""
        # Alternative model configurations:
        # llm_config = ModelsConfig.default().get("gpt-4o-mini")
        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
        llm_config = ModelsConfig.default().get("deepseek-coder")
        graph = SelfConsistencyGraph(name="SelfConsistency", llm_config=llm_config, dataset="Gsm8K")
        score, cost = await gsm8k_evaluation(
            graph,
            "examples/ags/data/gsm8k.jsonl",
            1,
            "examples/ags/data/baselines/general",
            test=False,
        )
        return score, cost

    asyncio.run(main())
|
||||
|
|
@ -127,17 +127,17 @@ Based on the given problem and solution candidates:
|
|||
"""
|
||||
|
||||
MD_ENSEMBLE_PROMPT = """
|
||||
You are given a coding problem:
|
||||
You are given a problem:
|
||||
{problem_description}
|
||||
|
||||
Here is a list of possible solutions to the problem:
|
||||
{solutions}
|
||||
|
||||
Using the inputs above, your goal is to choose the best solution to the code contest problem.
|
||||
Don't just pick the most efficient solution. The main consideration is that the solution can fully solve the problem in a correct and robust manner.
|
||||
Using the inputs above, your goal is to choose the best solution to the problem.
|
||||
The main consideration is that the solution can fully solve the problem in a correct and robust manner.
|
||||
Provide your final decision by writing the chosen solution letter.
|
||||
|
||||
Please maintain the JSON format in your response.
|
||||
Please follow the required format in your response.
|
||||
"""
|
||||
|
||||
SC_ENSEMBLE_PROMPT = """
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Add table
Add a link
Reference in a new issue