mirror of https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-03 21:02:38 +02:00
Update baselines

parent 0b0a49d772
commit 4ce18d7f48

4 changed files with 192 additions and 4 deletions
examples/ags/benchmark/drop.py

@@ -6,6 +6,8 @@ import numpy as np
 from scipy.optimize import linear_sum_assignment
 from tqdm.asyncio import tqdm_asyncio
 
+from examples.ags.benchmark.utils import generate_random_indices
+
 def is_number(text: str) -> bool:
     try:
         float(text)
@@ -101,10 +103,14 @@ def f1_score(predicted_bag: Set[str], gold_bag: Set[str]) -> float:
     f1 = (2 * precision * recall) / (precision + recall) if not (precision == 0.0 and recall == 0.0) else 0.0
     return f1
 
-def load_data(file_path: str) -> List[Tuple[str, Dict[str, Any]]]:
+def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]:
     with open(file_path, mode="r") as file:
         data = json.load(file)
-    return list(data.items())
+    data = list(data.items())
+
+    random_indices = generate_random_indices(len(data), samples)
+    data = [data[i] for i in random_indices]
+    return data
 
 async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float]:
     def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str]:
@@ -178,8 +184,8 @@ def save_results_to_csv(results: List[List[Any]], path: str) -> float:
 
     return average_score
 
-async def drop_evaluation(graph: Callable, file_path: str, path: str) -> float:
-    data = load_data(file_path)
+async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
+    data = load_data(file_path, samples)
     results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20)
     average_score = save_results_to_csv(results, path=path)
     print(f"Average score on DROP dataset: {average_score:.5f}")
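
The new sampling path depends on `generate_random_indices` from examples/ags/benchmark/utils, whose body is not part of this diff. A minimal sketch of what such a helper plausibly looks like; the numpy-based draw and the fixed default seed are assumptions, not the repository's actual code:

    import numpy as np

    # Hypothetical stand-in for examples.ags.benchmark.utils.generate_random_indices:
    # draw `samples` distinct indices from range(n), capped at n. A fixed default
    # seed is assumed so repeated runs evaluate the same subset.
    def generate_random_indices(n: int, samples: int, seed: int = 0):
        rng = np.random.default_rng(seed)
        return rng.choice(n, size=min(samples, n), replace=False)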

examples/ags/experiments/baselines/cot_drop.py (new file, 57 lines)

@@ -0,0 +1,57 @@
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.drop import drop_evaluation
+from examples.ags.scripts.operator_an import GenerateOp
+from metagpt.actions.action_node import ActionNode
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+from typing import Tuple
+
+DROP_PROMPT = """
+Question: {question}
+
+Context:
+{context}
+
+Please think step by step, and give your answer at the end. Wrap the content in XML tags.
+"""
+
+class GenerateOp(BaseModel):
+    answer: str = Field(default="", description="the answer to the question")
+
+class CoTGenerate(Operator):
+    def __init__(self, llm: LLM, name: str = "Generate"):
+        super().__init__(name, llm)
+
+    async def __call__(self, question: str, context: str, mode: str = None) -> Tuple[str, str]:
+        prompt = DROP_PROMPT.format(question=question, context=context)
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
+        response = node.instruct_content.model_dump()
+        return response["answer"]
+
+class CoTSolveGraph(SolveGraph):
+    def __init__(self, name: str, llm_config, dataset: str):
+        super().__init__(name, llm_config, dataset)
+        self.cot_generate = CoTGenerate(self.llm)
+
+    async def __call__(self, question: str, context: str) -> Tuple[str, str]:
+        answer = await self.cot_generate(question, context, mode="context_fill")
+        return answer
+
+if __name__ == "__main__":
+    async def main():
+        llm_config = ModelsConfig.default().get("gpt-4o-mini")
+        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="DROP")
+        file_path = "examples/ags/data/drop_dataset_dev.json"
+        samples = 3
+        path = "examples/ags/data/baselines/general/drop"
+        score = await drop_evaluation(graph, file_path, samples, path)
+        return score
+
+    import asyncio
+    asyncio.run(main())
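
With samples = 3 above, drop_evaluation now scores three randomly chosen passages instead of the full dev set. A quick illustration of the sampling path this commit adds to load_data, using fake data and the hypothetical generate_random_indices sketch shown earlier:

    # Illustrative only: three fake (passage_id, payload) pairs, sampled down to two.
    data = list({"p1": {}, "p2": {}, "p3": {}}.items())
    indices = generate_random_indices(len(data), 2)
    subset = [data[i] for i in indices]  # same two pairs on every run under a fixed seed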

examples/ags/experiments/baselines/cot_math.py (new file, 73 lines)

@@ -0,0 +1,73 @@
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.math import math_evaluation
+from examples.ags.scripts.operator_an import GenerateOp
+from metagpt.actions.action_node import ActionNode
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+from typing import Dict, Any
+
+MATH_PROMPT_GPT = """
+{question}\nPlease reason step by step, and put your final answer in the end. Wrap content using xml tags.
+"""
+
+MATH_PROMPT_DS = """
+{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.
+"""
+
+class GenerateOp(BaseModel):
+    solution: str = Field(default="", description="solution for the problem")
+
+class CoTGenerate(Operator):
+    def __init__(self, llm: LLM, name: str = "Generate"):
+        super().__init__(name, llm)
+
+    async def __call__(self, problem, mode: str = None):
+        prompt = MATH_PROMPT_GPT.format(question=problem)
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
+        response = node.instruct_content.model_dump()
+        return response
+
+class CoTSolveGraph(SolveGraph):
+    def __init__(self, name: str, llm_config, dataset: str):
+        super().__init__(name, llm_config, dataset)
+        self.cot_generate = CoTGenerate(self.llm)
+
+    async def __call__(self, problem):
+        solution = await self.cot_generate(problem, mode="context_fill")
+        return solution, self.llm.cost_manager.total_cost
+
+if __name__ == "__main__":
+    async def main():
+        # llm_config = ModelsConfig.default().get("deepseek-coder")
+        llm_config = ModelsConfig.default().get("gpt-4o-mini")
+        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K")
+        file_path = "examples/ags/data/math.jsonl"
+        samples = 100
+        # samples = 100
+        path = "examples/ags/data/baselines/general/math"
+        score = await math_evaluation(graph, file_path, samples, path)
+        return score
+
+    import asyncio
+    asyncio.run(main())
+
+
+# self consistency operator; universal self consistency;
+
+# "IO" means no tricks at all: it measures the LLM's own performance, using the prompt the model publisher used on the corresponding dataset.
+
+# deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106
+
+
+
+GENERATE_PROMPT = """
+Generate Solution for the following problem: {problem_description}
+"""
+
+# med ensemble
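
The trailing notes above flag a self-consistency operator as future work. A minimal sketch of majority-vote self-consistency over this file's CoTGenerate; the helper name and the vote over raw solution strings are assumptions, not code from this commit (a real implementation would extract the final answer before voting):

    from collections import Counter

    # Hypothetical: sample k chain-of-thought solutions and return the most
    # common solution string among them.
    async def self_consistency(generate: CoTGenerate, problem: str, k: int = 5) -> str:
        solutions = [(await generate(problem, mode="context_fill"))["solution"] for _ in range(k)]
        return Counter(solutions).most_common(1)[0][0]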

examples/ags/experiments/baselines/cot_mbpp.py (new file, 52 lines)

@@ -0,0 +1,52 @@
+from examples.ags.scripts.operator import Operator
+from examples.ags.scripts.graph import SolveGraph
+from examples.ags.benchmark.mbpp import mbpp_evaluation
+from examples.ags.scripts.operator_an import GenerateOp
+from metagpt.actions.action_node import ActionNode
+from metagpt.configs.models_config import ModelsConfig
+from metagpt.llm import LLM
+from pydantic import BaseModel, Field
+from typing import Tuple
+
+MBPP_PROMPT = """
+{question}\nPlease reason step by step, and put your python function in the end.
+"""
+
+class GenerateOp(BaseModel):
+    solution: str = Field(default="", description="the Python function implementing the solution")
+
+class CoTGenerate(Operator):
+    def __init__(self, llm: LLM, name: str = "Generate"):
+        super().__init__(name, llm)
+
+    async def __call__(self, question: str, mode: str = None) -> Tuple[str, str]:
+        prompt = MBPP_PROMPT.format(question=question)
+        fill_kwargs = {"context": prompt, "llm": self.llm}
+        if mode:
+            fill_kwargs["mode"] = mode
+        node = await ActionNode.from_pydantic(GenerateOp).fill(**fill_kwargs)
+        response = node.instruct_content.model_dump()
+        return response
+
+class CoTSolveGraph(SolveGraph):
+    def __init__(self, name: str, llm_config, dataset: str):
+        super().__init__(name, llm_config, dataset)
+        self.cot_generate = CoTGenerate(self.llm)
+
+    async def __call__(self, question: str) -> Tuple[str, str]:
+        response = await self.cot_generate(question, mode="context_fill")
+        return response["solution"]
+
+if __name__ == "__main__":
+    async def main():
+        llm_config = ModelsConfig.default().get("gpt-4o-mini")
+        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="MBPP")
+        file_path = "examples/ags/data/mbpp-new.jsonl"
+        samples = 30
+        path = "examples/ags/data/baselines/general/mbpp"
+        score = await mbpp_evaluation(graph, file_path, samples, path)
+        return score
+
+    import asyncio
+    asyncio.run(main())