# @Author : all
# @Desc : Evaluation for different datasets

from typing import Literal, Tuple, Optional
import asyncio

from examples.aflow.benchmark.gsm8k import optimize_gsm8k_evaluation
from examples.aflow.benchmark.math import optimize_math_evaluation

# NOTE(review): the remaining ``optimize_*_evaluation`` imports and the
# ``DatasetType`` alias used below are not visible in this hunk; they are
# assumed to be defined in the part of the module the diff does not show.


class Evaluator:
    """Dispatch evaluation of a graph to the benchmark matching a dataset name."""

    # Datasets whose data files do not use the lower-cased dataset name as
    # their file stem. HumanEval files are named ``human-eval_*.jsonl``.
    _DATA_FILE_STEMS = {"HumanEval": "human-eval"}

    def __init__(self, eval_path: str):
        """Store the evaluation path and build the per-dataset dispatch table.

        Args:
            eval_path: Kept on the instance as ``self.eval_path``.
        """
        self.eval_path = eval_path
        # Maps the dataset name to the graph name and the evaluation coroutine.
        self.dataset_configs = {
            "GSM8K": {"name": "GSM8K", "eval_func": optimize_gsm8k_evaluation},
            "MATH": {"name": "MATH", "eval_func": optimize_math_evaluation},
            "HumanEval": {"name": "HumanEval", "eval_func": optimize_humaneval_evaluation},
            "HotpotQA": {"name": "HotpotQA", "eval_func": optimize_hotpotqa_evaluation},
            "MBPP": {"name": "MBPP", "eval_func": optimize_mbpp_evaluation},
            "DROP": {"name": "DROP", "eval_func": optimize_drop_evaluation},
        }

    def graph_evaluate(self, dataset: "DatasetType", graph, params: dict, path, is_test=False):
        """Evaluate ``graph`` on the validation (or test) split of ``dataset``.

        This method is not a coroutine itself: for a known dataset it returns
        the un-awaited coroutine produced by ``_generic_eval``, which the
        caller must await.

        Args:
            dataset: Dataset name; must be a key of ``self.dataset_configs``.
            graph: Graph class, instantiated inside ``_generic_eval``.
            params: Must contain the keys ``"dataset"`` and ``"llm_config"``.
            path: Forwarded unchanged to the dataset's evaluation function.
            is_test: Use the test split instead of the validation split.

        Returns:
            A coroutine resolving to ``(avg_score, avg_cost, total_cost)``, or
            ``None`` when ``dataset`` is not a configured dataset.
        """
        if dataset not in self.dataset_configs:
            return None
        return self._generic_eval(dataset, graph, params, path, is_test)

    async def _generic_eval(
        self,
        dataset: "DatasetType",
        graph_class,
        params: dict,
        path: str,
        test: bool = False,
    ) -> Tuple[float, float, float]:
        """Instantiate the graph and run the evaluation function for ``dataset``.

        Args:
            dataset: Key of ``self.dataset_configs``.
            graph_class: Callable invoked with ``name``, ``llm_config`` and
                ``dataset`` keyword arguments to build the graph.
            params: Must contain the keys ``"dataset"`` and ``"llm_config"``.
            path: Forwarded unchanged to the evaluation function.
            test: Use the test split instead of the validation split.

        Returns:
            The ``(avg_score, avg_cost, total_cost)`` triple produced by the
            evaluation function.

        Raises:
            KeyError: If ``dataset`` is not configured or ``params`` lacks a
                required key.
        """
        config = self.dataset_configs[dataset]
        data_path, va_list = self._get_data_path_and_va_list(dataset, test)

        graph = graph_class(
            name=config["name"],
            llm_config=params["llm_config"],
            dataset=params["dataset"],
        )

        avg_score, avg_cost, total_cost = await config["eval_func"](graph, data_path, path, va_list)
        return avg_score, avg_cost, total_cost

    def _get_data_path_and_va_list(self, dataset: "DatasetType", test: bool) -> Tuple[str, Optional[list]]:
        """Return the JSONL data path and validation index list for ``dataset``.

        The file stem is the lower-cased dataset name unless overridden in
        ``_DATA_FILE_STEMS``. The override keeps HumanEval pointing at the
        ``human-eval_*.jsonl`` files it used before the per-dataset methods
        were merged.

        Args:
            dataset: Dataset name.
            test: Select the ``_test`` file when true, else ``_validate``.

        Returns:
            ``(data_path, va_list)``; ``va_list`` is ``None`` for the test split.
        """
        stem = self._DATA_FILE_STEMS.get(dataset, dataset.lower())
        base_path = f"examples/aflow/data/{stem}"
        if test:
            return f"{base_path}_test.jsonl", None
        return f"{base_path}_validate.jsonl", [1, 2, 3]  # Replace with the actual filtered index list


def _make_dataset_alias(legacy_name, canonical_name):
    """Build the legacy ``_<dataset>_eval`` method that targets ``canonical_name``."""

    def _alias(self, graph_class, params, path, test=False):
        # Return the coroutine un-awaited, exactly like the generic method does.
        return self._generic_eval(canonical_name, graph_class, params, path, test)

    _alias.__name__ = f"_{legacy_name}_eval"
    return _alias


# Alias methods for backward compatibility.
# The canonical names are spelled out because ``str.upper()`` does not
# reproduce the mixed-case keys "HumanEval" and "HotpotQA".
for _legacy_name, _canonical_name in (
    ("gsm8k", "GSM8K"),
    ("math", "MATH"),
    ("humaneval", "HumanEval"),
    ("mbpp", "MBPP"),
    ("hotpotqa", "HotpotQA"),
    ("drop", "DROP"),
):
    setattr(Evaluator, f"_{_legacy_name}_eval", _make_dataset_alias(_legacy_name, _canonical_name))
del _legacy_name, _canonical_name