mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-08 15:05:17 +02:00
Refactor Evaluator
This commit is contained in:
parent
5d6fa7a68f
commit
17f3cd4955
1 changed file with 27 additions and 130 deletions
|
|
@@ -3,7 +3,8 @@
|
|||
# @Author : all
|
||||
# @Desc : Evaluation for different datasets
|
||||
|
||||
from typing import Literal
|
||||
from typing import Literal, Tuple, Optional
|
||||
import asyncio
|
||||
|
||||
from examples.aflow.benchmark.gsm8k import optimize_gsm8k_evaluation
|
||||
from examples.aflow.benchmark.math import optimize_math_evaluation
|
||||
|
|
@@ -22,155 +23,51 @@ class Evaluator:
|
|||
|
||||
def __init__(self, eval_path: str):
|
||||
self.eval_path = eval_path
|
||||
self.dataset_configs = {
|
||||
"GSM8K": {"name": "GSM8K", "eval_func": optimize_gsm8k_evaluation},
|
||||
"MATH": {"name": "MATH", "eval_func": optimize_math_evaluation},
|
||||
"HumanEval": {"name": "HumanEval", "eval_func": optimize_humaneval_evaluation},
|
||||
"HotpotQA": {"name": "HotpotQA", "eval_func": optimize_hotpotqa_evaluation},
|
||||
"MBPP": {"name": "MBPP", "eval_func": optimize_mbpp_evaluation},
|
||||
"DROP": {"name": "DROP", "eval_func": optimize_drop_evaluation},
|
||||
}
|
||||
|
||||
def graph_evaluate(self, dataset: DatasetType, graph, params: dict, path, is_test=False):
|
||||
"""
|
||||
Evaluates on validation dataset.
|
||||
"""
|
||||
if dataset == "GSM8K":
|
||||
return self._gsm8k_eval(graph, params, path, is_test)
|
||||
elif dataset == "MATH":
|
||||
return self._math_eval(graph, params, path, is_test)
|
||||
elif dataset == "HumanEval":
|
||||
return self._humaneval_eval(graph, params, path, is_test)
|
||||
elif dataset == "HotpotQA":
|
||||
return self._hotpotqa_eval(graph, params, path, is_test)
|
||||
elif dataset == "MBPP":
|
||||
return self._mbpp_eval(graph, params, path, is_test)
|
||||
elif dataset == "DROP":
|
||||
return self._drop_eval(graph, params, path, is_test)
|
||||
if dataset in self.dataset_configs:
|
||||
return self._generic_eval(dataset, graph, params, path, is_test)
|
||||
else:
|
||||
return None
|
||||
|
||||
async def _gsm8k_eval(self, graph_class, params, path, test=False):
|
||||
async def _generic_eval(self, dataset: DatasetType, graph_class, params: dict, path: str, test: bool = False) -> Tuple[float, float, float]:
|
||||
"""
|
||||
Evaluate GSM8K dataset.
|
||||
Generic evaluation function for all datasets.
|
||||
"""
|
||||
async def load_graph():
|
||||
dataset = params["dataset"]
|
||||
dataset_config = params["dataset"]
|
||||
llm_config = params["llm_config"]
|
||||
return graph_class(name="GSM8K", llm_config=llm_config, dataset=dataset)
|
||||
|
||||
if test:
|
||||
data_path = "examples/aflow/data/gsm8k_test.jsonl" # Replace with your JSONL file path
|
||||
va_list = None
|
||||
else:
|
||||
data_path = "examples/aflow/data/gsm8k_validate.jsonl" # Replace with your JSONL file path
|
||||
va_list = [1,2,3] # Replace with the filtered index list
|
||||
return graph_class(name=self.dataset_configs[dataset]["name"], llm_config=llm_config, dataset=dataset_config)
|
||||
|
||||
data_path, va_list = self._get_data_path_and_va_list(dataset, test)
|
||||
graph = await load_graph()
|
||||
|
||||
avg_score, avg_cost, total_cost = await optimize_gsm8k_evaluation(graph, data_path, path, va_list)
|
||||
eval_func = self.dataset_configs[dataset]["eval_func"]
|
||||
avg_score, avg_cost, total_cost = await eval_func(graph, data_path, path, va_list)
|
||||
|
||||
return avg_score, avg_cost, total_cost
|
||||
|
||||
async def _math_eval(self, graph_class, params, path, test=False):
|
||||
def _get_data_path_and_va_list(self, dataset: DatasetType, test: bool) -> Tuple[str, Optional[list]]:
|
||||
"""
|
||||
Evaluate MATH dataset.
|
||||
Get data path and validation list based on dataset and test flag.
|
||||
"""
|
||||
async def load_graph():
|
||||
dataset = params["dataset"]
|
||||
llm_config = params["llm_config"]
|
||||
return graph_class(name="MATH", llm_config=llm_config, dataset=dataset)
|
||||
|
||||
base_path = f"examples/aflow/data/{dataset.lower()}"
|
||||
if test:
|
||||
data_path = "examples/aflow/data/math_test.jsonl"
|
||||
va_list = None
|
||||
return f"{base_path}_test.jsonl", None
|
||||
else:
|
||||
data_path = "examples/aflow/data/math_validate.jsonl"
|
||||
va_list = [1,2,3] # Replace with the filtered index list
|
||||
return f"{base_path}_validate.jsonl", [1, 2, 3] # Replace with the actual filtered index list
|
||||
|
||||
graph = await load_graph()
|
||||
|
||||
avg_score, avg_cost, total_cost = await optimize_math_evaluation(graph, data_path, path, va_list)
|
||||
|
||||
return avg_score, avg_cost, total_cost
|
||||
|
||||
async def _humaneval_eval(self, graph_class, params, path, test=False):
|
||||
"""
|
||||
Evaluate HumanEval dataset.
|
||||
"""
|
||||
async def load_graph():
|
||||
dataset = params["dataset"]
|
||||
llm_config = params["llm_config"]
|
||||
return graph_class(name="HumanEval", llm_config=llm_config, dataset=dataset)
|
||||
|
||||
if test:
|
||||
data_path = "examples/aflow/data/human-eval_test.jsonl" # Replace with your JSONL file path
|
||||
va_list = None
|
||||
else:
|
||||
data_path = "examples/aflow/data/human-eval_validate.jsonl" # Replace with your JSONL file path
|
||||
va_list = [1,2,3] # Replace with the filtered index list
|
||||
|
||||
graph = await load_graph()
|
||||
|
||||
avg_score, avg_cost, total_cost = await optimize_humaneval_evaluation(graph, data_path, path, va_list)
|
||||
|
||||
return avg_score, avg_cost, total_cost
|
||||
|
||||
async def _mbpp_eval(self, graph_class, params, path, test=False):
|
||||
"""
|
||||
Evaluate MBPP dataset.
|
||||
"""
|
||||
async def load_graph():
|
||||
dataset = params["dataset"]
|
||||
llm_config = params["llm_config"]
|
||||
return graph_class(name="MBPP", llm_config=llm_config, dataset=dataset)
|
||||
|
||||
if test:
|
||||
data_path = "examples/aflow/data/mbpp_test.jsonl"
|
||||
va_list = None
|
||||
else:
|
||||
data_path = "examples/aflow/data/mbpp_validate.jsonl"
|
||||
va_list = [1,2,3] # Replace with the filtered index list
|
||||
|
||||
graph = await load_graph()
|
||||
|
||||
avg_score, avg_cost, total_cost = await optimize_mbpp_evaluation(graph, data_path, path, va_list)
|
||||
|
||||
return avg_score, avg_cost, total_cost
|
||||
|
||||
async def _hotpotqa_eval(self, graph_class, params, path, test=False):
|
||||
"""
|
||||
Evaluate HotpotQA dataset.
|
||||
"""
|
||||
async def load_graph():
|
||||
dataset = params["dataset"]
|
||||
llm_config = params["llm_config"]
|
||||
return graph_class(name="HotpotQA", llm_config=llm_config, dataset=dataset)
|
||||
|
||||
if test:
|
||||
data_path = "examples/aflow/data/hotpotqa_test.jsonl"
|
||||
va_list = None
|
||||
else:
|
||||
data_path = "examples/aflow/data/hotpotqa_validate.jsonl"
|
||||
va_list = [1,2,3] # Replace with the filtered index list
|
||||
|
||||
graph = await load_graph()
|
||||
|
||||
avg_score, avg_cost, total_cost = await optimize_hotpotqa_evaluation(graph, data_path, path, va_list)
|
||||
|
||||
return avg_score, avg_cost, total_cost
|
||||
|
||||
|
||||
async def _drop_eval(self, graph_class, params, path, test=False):
|
||||
"""
|
||||
Evaluate DROP dataset.
|
||||
"""
|
||||
async def load_graph():
|
||||
dataset = params["dataset"]
|
||||
llm_config = params["llm_config"]
|
||||
return graph_class(name="DROP", llm_config=llm_config, dataset=dataset)
|
||||
|
||||
if test:
|
||||
data_path = "examples/aflow/data/drop_test.jsonl"
|
||||
va_list = None
|
||||
else:
|
||||
data_path = "examples/aflow/data/drop_validate.jsonl"
|
||||
va_list = [1,2,3] # Replace with the filtered index list
|
||||
|
||||
graph = await load_graph()
|
||||
|
||||
avg_score, avg_cost, total_cost = await optimize_drop_evaluation(graph, data_path, path, va_list)
|
||||
|
||||
return avg_score, avg_cost, total_cost
|
||||
# Alias methods for backward compatibility.
# Each legacy ``_<name>_eval`` method forwards to ``Evaluator._generic_eval``
# with the dataset key pre-bound. The keys are spelled exactly as they appear
# in ``Evaluator.dataset_configs`` ("HumanEval", "HotpotQA"): deriving them
# with ``str.upper()`` would yield "HUMANEVAL" / "HOTPOTQA", which are not
# valid keys and would make the lookup in ``_generic_eval`` raise ``KeyError``.
_LEGACY_EVAL_ALIASES = {
    "gsm8k": "GSM8K",
    "math": "MATH",
    "humaneval": "HumanEval",
    "mbpp": "MBPP",
    "hotpotqa": "HotpotQA",
    "drop": "DROP",
}

for _alias, _dataset_key in _LEGACY_EVAL_ALIASES.items():
    # Bind the key as a default argument so each lambda keeps its own value
    # rather than the loop variable's final one (late-binding closures).
    setattr(
        Evaluator,
        f"_{_alias}_eval",
        lambda self, *args, dataset=_dataset_key, **kwargs: self._generic_eval(dataset, *args, **kwargs),
    )
|
||||
Loading…
Add table
Add a link
Reference in a new issue