更新了eval BUG,同时更新了新的baseline

This commit is contained in:
didi 2024-09-11 17:00:14 +08:00
parent b9a2d94da2
commit b805da0bbe
20 changed files with 274 additions and 40 deletions

View file

@ -39,12 +39,12 @@ def loose_match_score(expected_output: str, prediction: str, tolerance: float =
return 0
async def load_data(file_path: str, samples=1) -> List[dict]:
async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
data = []
async with aiofiles.open(file_path, mode="r") as file:
async for line in file:
data.append(json.loads(line))
random_indices = generate_random_indices(len(data), samples)
random_indices = generate_random_indices(len(data), samples, test=test)
data = [data[i] for i in random_indices]
return data
@ -64,26 +64,33 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
prompt = input
max_retries = 5
retries = 0
while retries < max_retries:
try:
prediction = await graph(prompt)
cost = prediction[1]
output = prediction[0]["solution"]
prediction = await graph(prompt)
cost = prediction[1]
output = prediction[0]["solution"]
score = loose_match_score(expected_output, output)
break
print(output)
except Exception as e:
retries += 1
print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
score = loose_match_score(expected_output, output)
# break
# while retries < max_retries:
# try:
# prediction = await graph(prompt)
# cost = prediction[1]
# output = prediction[0]["solution"]
if retries == max_retries:
print("Maximum retries reached. Skipping this sample.")
output = None
cost = None
score = 0
break
# score = loose_match_score(expected_output, output)
# break
# except Exception as e:
# retries += 1
# print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
# if retries == max_retries:
# print("Maximum retries reached. Skipping this sample.")
# output = None
# cost = None
# score = 0
# break
return input, output, expected_output, score, cost
@ -101,9 +108,9 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
return await tqdm_asyncio.gather(*tasks, desc="Evaluating problems", total=len(data))
async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
"""GSM8K evaluation main function"""
data = await load_data(file_path, samples)
data = await load_data(file_path, samples, test=test)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5)
average_score, total_cost = save_results_to_csv(results, path=path)
print(f"Average score: {average_score:.5f}")

View file

@ -0,0 +1,133 @@
from examples.ags.scripts.operator import Operator
from examples.ags.scripts.graph import SolveGraph
from examples.ags.benchmark.gsm8k import gsm8k_evaluation
from examples.ags.scripts.operator_an import GenerateOp
from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.llm import LLM
from pydantic import BaseModel, Field
from typing import Dict, Any, List, Tuple
from collections import Counter
import random
# Chain-of-thought prompt template used by CoTGenerate below; the `{question}`
# placeholder is filled in with str.format.
GSM8K_PROMPT_GPT = """
{question}\nPlease reason step by step, and to ensure accuracy, provide the correct answer in the final, without any additional text.
"""
# Variant that asks for the final answer inside \boxed{}. The doubled braces
# render as a literal "{}" after str.format. Not referenced by the code visible
# in this file.
GSM8K_PROMPT_DS = """
{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
"""
# Structured-output schema handed to ActionNode.from_pydantic by CoTGenerate.
# NOTE(review): this local class shadows the GenerateOp imported from
# examples.ags.scripts.operator_an at the top of the file.
class GenerateOp(BaseModel):
    # Text of the generated solution; empty string when nothing was filled in.
    solution: str = Field(
        default="",
        description="solution for the problem",
    )
class CoTGenerate(Operator):
    """Operator that asks the LLM for one step-by-step solution to a problem."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        super().__init__(name, llm)

    async def __call__(self, problem, mode: str = None):
        """Fill a GenerateOp node for `problem` and return it as a plain dict.

        `mode` is forwarded to ActionNode.fill only when it is truthy.
        """
        # Only forward `mode` when the caller supplied one, so fill() keeps its own default.
        optional_kwargs = {"mode": mode} if mode else {}
        filled_node = await ActionNode.from_pydantic(GenerateOp).fill(
            context=GSM8K_PROMPT_GPT.format(question=problem),
            llm=self.llm,
            **optional_kwargs,
        )
        return filled_node.instruct_content.model_dump()
# Voting prompt used by MdEnsemble.__call__: `{question}` receives the problem
# text and `{solutions}` the letter-labelled candidate list (both via str.format).
MD_ENSEMBLE_PROMPT = """
You are given a problem:
{question}
Here is a list of possible solutions to the problem:
{solutions}
Using the inputs above, your goal is to choose the best solution to the problem.
The main consideration is that the solution can fully solve the problem in a correct and robust manner.
Provide your final decision by writing the chosen solution letter.
Please follow the required format in your response.
"""
# Structured-output schema for one MdEnsemble voting round.
class MdEnsembleOp(BaseModel):
    # Free-form reasoning the model produces before committing to a choice.
    thought: str = Field(
        default="",
        description="Step-by-step analysis of the solutions to determine the best one.",
    )
    # Letter label (A, B, ...) of the candidate the model selected.
    solution_letter: str = Field(
        default="",
        description="The letter of the chosen best solution (only one letter).",
    )
class MdEnsemble(Operator):
    """
    Choose the best candidate solution by repeated LLM votes over shuffled options.

    Paper: Can Generalist Foundation Models Outcompete Special-Purpose Tuning? Case Study in Medicine
    Link: https://arxiv.org/abs/2311.16452
    """

    def __init__(self, name: str = "MdEnsemble", llm: LLM = None, vote_count: int = 3):
        """Create the operator.

        Args:
            name: Operator name forwarded to the Operator base class.
            llm: LLM used for voting; a default ``LLM()`` is built when omitted.
            vote_count: Number of voting rounds performed per call.
        """
        # Build the default lazily: a `LLM()` default in the signature would be
        # constructed once at import time, even when a caller supplies its own LLM.
        if llm is None:
            llm = LLM()
        super().__init__(name, llm)
        self.vote_count = vote_count

    @staticmethod
    def shuffle_answers(solutions: List[str]) -> Tuple[List[str], Dict[str, int]]:
        """Return the solutions in random order plus a letter -> original-index map.

        Indices (not texts) are shuffled so that duplicate solution strings still
        map back to their own original positions.
        """
        order = list(range(len(solutions)))
        random.shuffle(order)
        shuffled_solutions = [solutions[original] for original in order]
        answer_mapping = {chr(65 + position): original for position, original in enumerate(order)}
        return shuffled_solutions, answer_mapping

    async def __call__(self, solutions: List[str], problem: str, mode: str = None):
        """Run ``vote_count`` voting rounds and return the most-voted solution.

        Args:
            solutions: Candidate solution texts; expected to be non-empty.
            problem: The problem statement shown to the model.
            mode: Optional fill mode forwarded to ActionNode.fill when truthy.

        Returns:
            ``{"solution": <chosen solution text>}``. When no round yields a
            recognisable letter, the first candidate is returned.
        """
        print(f"solution count: {len(solutions)}")
        all_responses = []

        for _ in range(self.vote_count):
            # Re-shuffle every round so the vote is not biased by option position.
            shuffled_solutions, answer_mapping = self.shuffle_answers(solutions)

            solution_text = "".join(
                f"{chr(65 + index)}: \n{str(solution)}\n\n\n"
                for index, solution in enumerate(shuffled_solutions)
            )
            prompt = MD_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem)

            fill_kwargs = {"context": prompt, "llm": self.llm}
            if mode:
                fill_kwargs["mode"] = mode
            node = await ActionNode.from_pydantic(MdEnsembleOp).fill(**fill_kwargs)
            response = node.instruct_content.model_dump()

            answer = response.get("solution_letter", "").strip().upper()
            # Replies that are not one of the offered letters are ignored.
            if answer in answer_mapping:
                all_responses.append(answer_mapping[answer])

        if all_responses:
            most_frequent_index = Counter(all_responses).most_common(1)[0][0]
        else:
            # No usable vote at all: fall back to the first candidate rather than
            # failing on an empty Counter.
            most_frequent_index = 0

        return {"solution": solutions[most_frequent_index]}
class MedPromptGraph(SolveGraph):
    """MedPrompt-style baseline: sample chain-of-thought solutions, then ensemble-vote."""

    def __init__(self, name: str, llm_config, dataset: str, vote_count: int = 3):
        """Build the graph and its two operators.

        Args:
            name: Graph name forwarded to SolveGraph.
            llm_config: LLM configuration forwarded to SolveGraph.
            dataset: Dataset identifier forwarded to SolveGraph.
            vote_count: Number of voting rounds used by the ensemble operator.
        """
        super().__init__(name, llm_config, dataset)
        # self.llm is presumably set up by SolveGraph.__init__ -- confirm against the base class.
        self.cot_generate = CoTGenerate(self.llm)
        # MdEnsemble's positional order is (name, llm, vote_count). Passing the LLM
        # positionally would bind it to `name` and leave the operator on its default
        # LLM, so it is passed by keyword here.
        self.md_ensemble = MdEnsemble(llm=self.llm, vote_count=vote_count)

    async def __call__(self, problem):
        """Solve `problem`; returns ``(solution_dict, total_cost)``."""
        solutions = []
        # Two candidates are generated sequentially and then voted on.
        for _ in range(2):
            candidate = await self.cot_generate(problem, mode="context_fill")
            solutions.append(candidate["solution"])
        solution = await self.md_ensemble(solutions, problem, mode="context_fill")
        return solution, self.llm.cost_manager.total_cost
if __name__ == "__main__":
    import asyncio

    async def main():
        """Run the MedPrompt baseline on the GSM8K data file and return (score, cost)."""
        # Alternative model configurations:
        #   ModelsConfig.default().get("gpt-4o-mini")
        #   ModelsConfig.default().get("gpt-35-turbo-1106")
        model_config = ModelsConfig.default().get("deepseek-coder")
        graph = MedPromptGraph(
            name="MedPrompt",
            llm_config=model_config,
            dataset="Gsm8K",
            vote_count=2,
        )
        score, cost = await gsm8k_evaluation(
            graph,
            "examples/ags/data/gsm8k.jsonl",
            264,
            "examples/ags/data/baselines/general",
            test=True,
        )
        return score, cost

    asyncio.run(main())

View file

@ -0,0 +1,109 @@
from examples.ags.scripts.operator import Operator
from examples.ags.scripts.graph import SolveGraph
from examples.ags.benchmark.gsm8k import gsm8k_evaluation
from examples.ags.scripts.operator_an import GenerateOp
from metagpt.actions.action_node import ActionNode
from metagpt.configs.models_config import ModelsConfig
from metagpt.llm import LLM
from pydantic import BaseModel, Field
from typing import Dict, Any, List, Tuple
from collections import Counter
import random
# Chain-of-thought prompt template used by CoTGenerate below; the `{question}`
# placeholder is filled in with str.format.
GSM8K_PROMPT_GPT = """
{question}\nPlease reason step by step, and to ensure accuracy, provide the correct answer in the final, without any additional text.
"""
# Variant that asks for the final answer inside \boxed{}. The doubled braces
# render as a literal "{}" after str.format. Not referenced by the code visible
# in this file.
GSM8K_PROMPT_DS = """
{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
"""
# Structured-output schema handed to ActionNode.from_pydantic by CoTGenerate.
# NOTE(review): this local class shadows the GenerateOp imported from
# examples.ags.scripts.operator_an at the top of the file.
class GenerateOp(BaseModel):
    # Text of the generated solution; empty string when nothing was filled in.
    solution: str = Field(
        default="",
        description="solution for the problem",
    )
class CoTGenerate(Operator):
    """Operator that asks the LLM for one step-by-step solution to a problem."""

    def __init__(self, llm: LLM, name: str = "Generate"):
        super().__init__(name, llm)

    async def __call__(self, problem, mode: str = None):
        """Fill a GenerateOp node for `problem` and return it as a plain dict.

        `mode` is forwarded to ActionNode.fill only when it is truthy.
        """
        # Only forward `mode` when the caller supplied one, so fill() keeps its own default.
        optional_kwargs = {"mode": mode} if mode else {}
        filled_node = await ActionNode.from_pydantic(GenerateOp).fill(
            context=GSM8K_PROMPT_GPT.format(question=problem),
            llm=self.llm,
            **optional_kwargs,
        )
        return filled_node.instruct_content.model_dump()
# Selection prompt used by ScEnsemble.__call__: `{question}` receives the problem
# text and `{solutions}` the letter-labelled candidate list (both via str.format).
SC_ENSEMBLE_PROMPT = """
Given the question descripted as follows: {question}
some solutions are generated to solve the question as follows:
{solutions}
Evaluate these solutions and select the most consistent solution based on majority consensus.
Give your answer with a single id of solution (without anything else).
"""
# Structured-output schema for the ScEnsemble selection step.
class ScEnsembleOp(BaseModel):
    # Letter label (A, B, ...) of the candidate the model selected.
    solution_letter: str = Field(
        default="",
        description="The letter of most consistent solution.",
    )
class ScEnsemble(Operator):
    """
    Ask the LLM to pick the most consistent solution among the candidates.

    Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
    Link: https://arxiv.org/abs/2203.11171
    Paper: Universal Self-Consistency for Large Language Model Generation
    Link: https://arxiv.org/abs/2311.17311
    """

    def __init__(self, name: str = "ScEnsemble", llm: LLM = None):
        """Create the operator.

        Args:
            name: Operator name forwarded to the Operator base class.
            llm: LLM used for selection; a default ``LLM()`` is built when omitted.
        """
        # Build the default lazily: a `LLM()` default in the signature would be
        # constructed once at import time, even when a caller supplies its own LLM.
        if llm is None:
            llm = LLM()
        super().__init__(name, llm)

    async def __call__(self, solutions: List[str], problem: str, mode: str = None):
        """Return ``{"solution": <text>}`` for the candidate the model selects.

        Args:
            solutions: Candidate solution texts; expected to be non-empty.
            problem: The problem statement shown to the model.
            mode: Optional fill mode forwarded to ActionNode.fill when truthy.

        When the model's reply is not one of the offered letters, the first
        candidate is returned instead of raising KeyError.
        """
        # Letter label (A, B, ...) -> index into `solutions`.
        answer_mapping = {chr(65 + index): index for index in range(len(solutions))}
        solution_text = "".join(
            f"{chr(65 + index)}: \n{str(solution)}\n\n\n"
            for index, solution in enumerate(solutions)
        )
        prompt = SC_ENSEMBLE_PROMPT.format(solutions=solution_text, question=problem)

        fill_kwargs = {"context": prompt, "llm": self.llm}
        if mode:
            fill_kwargs["mode"] = mode
        node = await ActionNode.from_pydantic(ScEnsembleOp).fill(**fill_kwargs)
        response = node.instruct_content.model_dump()

        answer = response.get("solution_letter", "").strip().upper()
        # Unrecognised reply: fall back to the first candidate.
        chosen_index = answer_mapping.get(answer, 0)
        return {"solution": solutions[chosen_index]}
class SelfConsistencyGraph(SolveGraph):
    """Self-consistency baseline: sample chain-of-thought solutions, then pick one."""

    def __init__(self, name: str, llm_config, dataset: str):
        """Build the graph and its two operators.

        Args:
            name: Graph name forwarded to SolveGraph.
            llm_config: LLM configuration forwarded to SolveGraph.
            dataset: Dataset identifier forwarded to SolveGraph.
        """
        super().__init__(name, llm_config, dataset)
        # self.llm is presumably set up by SolveGraph.__init__ -- confirm against the base class.
        self.cot_generate = CoTGenerate(self.llm)
        # ScEnsemble's positional order is (name, llm). Passing the LLM positionally
        # would bind it to `name` and leave the operator on its default LLM, so it
        # is passed by keyword here.
        self.sc_ensemble = ScEnsemble(llm=self.llm)

    async def __call__(self, problem):
        """Solve `problem`; returns ``(solution_dict, total_cost)``."""
        solutions = []
        # Two candidates are generated sequentially and then compared.
        for _ in range(2):
            candidate = await self.cot_generate(problem, mode="context_fill")
            solutions.append(candidate["solution"])
        solution = await self.sc_ensemble(solutions, problem, mode="context_fill")
        return solution, self.llm.cost_manager.total_cost
if __name__ == "__main__":
    import asyncio

    async def main():
        """Run the self-consistency baseline on the GSM8K data file and return (score, cost)."""
        # Alternative model configurations:
        #   ModelsConfig.default().get("gpt-4o-mini")
        #   ModelsConfig.default().get("gpt-35-turbo-1106")
        model_config = ModelsConfig.default().get("deepseek-coder")
        graph = SelfConsistencyGraph(
            name="SelfConsistency",
            llm_config=model_config,
            dataset="Gsm8K",
        )
        score, cost = await gsm8k_evaluation(
            graph,
            "examples/ags/data/gsm8k.jsonl",
            1,
            "examples/ags/data/baselines/general",
            test=False,
        )
        return score, cost

    asyncio.run(main())

View file

@ -127,17 +127,17 @@ Based on the given problem and solution candidates:
"""
MD_ENSEMBLE_PROMPT = """
You are given a coding problem:
You are given a problem:
{problem_description}
Here is a list of possible solutions to the problem:
{solutions}
Using the inputs above, your goal is to choose the best solution to the code contest problem.
Don't just pick the most efficient solution. The main consideration is that the solution can fully solve the problem in a correct and robust manner.
Using the inputs above, your goal is to choose the best solution to the problem.
The main consideration is that the solution can fully solve the problem in a correct and robust manner.
Provide your final decision by writing the chosen solution letter.
Please maintain the JSON format in your response.
Please follow the required format in your response.
"""
SC_ENSEMBLE_PROMPT = """