Refactor Evaluator

This commit is contained in:
didi 2024-10-19 07:41:59 +08:00
parent 5d6fa7a68f
commit 17f3cd4955

View file

@@ -3,7 +3,8 @@
# @Author : all
# @Desc : Evaluation for different datasets
from typing import Literal
from typing import Literal, Tuple, Optional
import asyncio
from examples.aflow.benchmark.gsm8k import optimize_gsm8k_evaluation
from examples.aflow.benchmark.math import optimize_math_evaluation
@@ -22,155 +23,51 @@ class Evaluator:
def __init__(self, eval_path: str):
    """
    Store the evaluation path and build the per-dataset dispatch table.

    Args:
        eval_path: Kept unchanged on ``self.eval_path``.
    """
    self.eval_path = eval_path
    # (dataset key, evaluation function) pairs; the key doubles as the
    # ``name`` stored in each config entry.
    evaluators = (
        ("GSM8K", optimize_gsm8k_evaluation),
        ("MATH", optimize_math_evaluation),
        ("HumanEval", optimize_humaneval_evaluation),
        ("HotpotQA", optimize_hotpotqa_evaluation),
        ("MBPP", optimize_mbpp_evaluation),
        ("DROP", optimize_drop_evaluation),
    )
    self.dataset_configs = {
        key: {"name": key, "eval_func": func} for key, func in evaluators
    }
def graph_evaluate(self, dataset: "DatasetType", graph, params: dict, path, is_test=False):
    """
    Dispatch an evaluation run for ``dataset``.

    Args:
        dataset: Key into ``self.dataset_configs``.
        graph: Forwarded to ``_generic_eval`` as the graph class.
        params: Forwarded unchanged to ``_generic_eval``.
        path: Forwarded unchanged to ``_generic_eval``.
        is_test: Forwarded as the ``test`` flag of ``_generic_eval``.

    Returns:
        The result of calling ``self._generic_eval`` (an ``async def``, so
        the caller receives a coroutine it has to await), or ``None`` when
        ``dataset`` has no entry in ``self.dataset_configs``.
    """
    # Unknown datasets keep the historical "return None" contract.
    if dataset not in self.dataset_configs:
        return None
    return self._generic_eval(dataset, graph, params, path, is_test)
async def _generic_eval(self, dataset: "DatasetType", graph_class, params: dict, path: str, test: bool = False) -> Tuple[float, float, float]:
    """
    Generic evaluation function for all datasets.

    Args:
        dataset: Key into ``self.dataset_configs``; a missing key raises
            ``KeyError``.
        graph_class: Callable invoked with ``name``, ``llm_config`` and
            ``dataset`` keyword arguments to build the graph.
        params: Mapping providing the ``"dataset"`` and ``"llm_config"``
            entries passed to ``graph_class``.
        path: Passed through to the dataset's ``eval_func``.
        test: Passed to ``_get_data_path_and_va_list`` to pick the data file.

    Returns:
        ``(avg_score, avg_cost, total_cost)`` as produced by the dataset's
        ``eval_func``.
    """
    data_path, va_list = self._get_data_path_and_va_list(dataset, test)

    config = self.dataset_configs[dataset]
    dataset_config, llm_config = params["dataset"], params["llm_config"]
    # Building the graph is a plain synchronous call; no nested coroutine
    # is needed for it.
    graph = graph_class(name=config["name"], llm_config=llm_config, dataset=dataset_config)

    avg_score, avg_cost, total_cost = await config["eval_func"](graph, data_path, path, va_list)
    return avg_score, avg_cost, total_cost
def _get_data_path_and_va_list(self, dataset: "DatasetType", test: bool) -> Tuple[str, Optional[list]]:
    """
    Get data path and validation list based on dataset and test flag.

    Args:
        dataset: Dataset name; its lower-cased form is the file-name stem.
        test: ``True`` selects ``<stem>_test.jsonl`` with no index list,
            ``False`` selects ``<stem>_validate.jsonl`` with an index list.

    Returns:
        ``(data_path, va_list)`` where ``va_list`` is ``None`` for the test
        split.

    NOTE(review): the per-dataset HumanEval evaluator read
    ``human-eval_*.jsonl``; this helper resolves ``humaneval_*.jsonl``.
    Confirm the data files were renamed accordingly.
    """
    split = "test" if test else "validate"
    data_path = f"examples/aflow/data/{dataset.lower()}_{split}.jsonl"
    # Placeholder indices: replace with the actual filtered index list.
    va_list = None if test else [1, 2, 3]
    return data_path, va_list
async def _humaneval_eval(self, graph_class, params, path, test=False):
    """
    Run the HumanEval evaluation on a graph built from ``graph_class``.

    Returns the ``(avg_score, avg_cost, total_cost)`` triple produced by
    ``optimize_humaneval_evaluation``.
    """
    if test:
        # Replace with your JSONL file path
        data_path, va_list = "examples/aflow/data/human-eval_test.jsonl", None
    else:
        # Replace with your JSONL file path and the filtered index list
        data_path, va_list = "examples/aflow/data/human-eval_validate.jsonl", [1, 2, 3]

    dataset_config, llm_config = params["dataset"], params["llm_config"]
    graph = graph_class(name="HumanEval", llm_config=llm_config, dataset=dataset_config)

    scores = await optimize_humaneval_evaluation(graph, data_path, path, va_list)
    avg_score, avg_cost, total_cost = scores
    return avg_score, avg_cost, total_cost
async def _mbpp_eval(self, graph_class, params, path, test=False):
    """
    Run the MBPP evaluation on a graph built from ``graph_class``.

    Returns the ``(avg_score, avg_cost, total_cost)`` triple produced by
    ``optimize_mbpp_evaluation``.
    """
    if test:
        data_path, va_list = "examples/aflow/data/mbpp_test.jsonl", None
    else:
        # Replace with the filtered index list
        data_path, va_list = "examples/aflow/data/mbpp_validate.jsonl", [1, 2, 3]

    dataset_config, llm_config = params["dataset"], params["llm_config"]
    graph = graph_class(name="MBPP", llm_config=llm_config, dataset=dataset_config)

    scores = await optimize_mbpp_evaluation(graph, data_path, path, va_list)
    avg_score, avg_cost, total_cost = scores
    return avg_score, avg_cost, total_cost
async def _hotpotqa_eval(self, graph_class, params, path, test=False):
    """
    Run the HotpotQA evaluation on a graph built from ``graph_class``.

    Returns the ``(avg_score, avg_cost, total_cost)`` triple produced by
    ``optimize_hotpotqa_evaluation``.
    """
    if test:
        data_path, va_list = "examples/aflow/data/hotpotqa_test.jsonl", None
    else:
        # Replace with the filtered index list
        data_path, va_list = "examples/aflow/data/hotpotqa_validate.jsonl", [1, 2, 3]

    dataset_config, llm_config = params["dataset"], params["llm_config"]
    graph = graph_class(name="HotpotQA", llm_config=llm_config, dataset=dataset_config)

    scores = await optimize_hotpotqa_evaluation(graph, data_path, path, va_list)
    avg_score, avg_cost, total_cost = scores
    return avg_score, avg_cost, total_cost
async def _drop_eval(self, graph_class, params, path, test=False):
    """
    Run the DROP evaluation on a graph built from ``graph_class``.

    Returns the ``(avg_score, avg_cost, total_cost)`` triple produced by
    ``optimize_drop_evaluation``.
    """
    if test:
        data_path, va_list = "examples/aflow/data/drop_test.jsonl", None
    else:
        # Replace with the filtered index list
        data_path, va_list = "examples/aflow/data/drop_validate.jsonl", [1, 2, 3]

    dataset_config, llm_config = params["dataset"], params["llm_config"]
    graph = graph_class(name="DROP", llm_config=llm_config, dataset=dataset_config)

    scores = await optimize_drop_evaluation(graph, data_path, path, va_list)
    avg_score, avg_cost, total_cost = scores
    return avg_score, avg_cost, total_cost
# Alias methods for backward compatibility: expose the pre-refactor
# ``_<dataset>_eval`` names as thin wrappers around ``_generic_eval``.
# Iterate the canonical dataset keys rather than upper-casing lowercase names:
# "humaneval".upper() / "hotpotqa".upper() give "HUMANEVAL" / "HOTPOTQA", which
# do not match the "HumanEval" / "HotpotQA" keys of ``dataset_configs`` and
# would raise KeyError inside ``_generic_eval``.
for _dataset_key in ("GSM8K", "MATH", "HumanEval", "MBPP", "HotpotQA", "DROP"):
    setattr(
        Evaluator,
        f"_{_dataset_key.lower()}_eval",
        # The default argument binds the current key at definition time,
        # avoiding the late-binding closure pitfall.
        lambda self, *args, dataset=_dataset_key, **kwargs: self._generic_eval(dataset, *args, **kwargs),
    )