Update Multi LLM Config & Basic Evaluator

This commit is contained in:
didi 2024-08-25 22:16:39 +08:00
parent a3ff25430e
commit 1593e98c45
7 changed files with 302 additions and 29 deletions

View file

@ -2,8 +2,13 @@
# @Date : 8/23/2024 10:00 AM
# @Author : all
# @Desc : evaluation for different datasets
import datetime
import os
from typing import Literal
import pandas as pd
from deepeval.benchmarks import GSM8K
# TODO 完成实验数据集的手动划分
DatasetType = Literal["HumanEval", "MMBP", "Gsm8K", "MATH", "HotpotQa", "MMLU"]
@ -17,10 +22,12 @@ class Evaluator:
def __init__(self, eval_path: str):
    """
    Create an evaluator.

    ``eval_path`` is accepted but not stored or used by this placeholder.
    """
    return None
def validation_evaluate(self, dataset: DatasetType, result_path: str, model=None):
    """
    Evaluate on the validation split of ``dataset``.

    Only ``"Gsm8K"`` is dispatched at the moment; every other dataset name
    falls through and returns ``None``.

    Args:
        dataset: Name of the dataset to evaluate on.
        result_path: CSV path the evaluation results are written to.
        model: Model handed to the dataset-specific evaluator. Optional so
            existing two-argument callers keep working; the GSM8K evaluator
            rejects ``None`` with ``ValueError``.

    Returns:
        Whatever the dataset-specific evaluator returns, or ``None`` when the
        dataset has no evaluator yet.
    """
    if dataset == "Gsm8K":
        # _gsm8k_eval expects the model first and the output path second;
        # passing only the path would bind it to `model` and raise TypeError.
        return self._gsm8k_eval(model, result_path)
    return None
def test_evaluate(self, dataset: DatasetType):
@ -28,3 +35,113 @@ class Evaluator:
Evaluates on test dataset.
"""
pass
def _gsm8k_eval(self, model, result_path, samples: int = 1000):
    """
    Evaluate a model on the GSM8K benchmark and persist the results as CSV.

    Each loaded problem is sent to ``model.a_generate`` with the raw problem
    text as the prompt. The per-problem rows are written to ``result_path``;
    that file is then re-read and re-scored numerically, and a second CSV
    named after the average score (``"<score>.csv"``, four decimals) is
    written into the same directory.

    Args:
        model: Object exposing ``a_generate(prompt)``.
        result_path: Destination CSV path for the raw predictions.
        samples: Number of GSM8K problems to request from the benchmark.

    Returns:
        ``{"score": average_score}`` where ``average_score`` is the mean of
        the numeric-match scores computed from the written CSV.

    Raises:
        ValueError: If ``model`` is ``None``.
    """
    if model is None:
        raise ValueError("Model is required for evaluation.")

    benchmark = GSM8K(n_problems=samples, n_shots=0, enable_cot=False)
    goldens = benchmark.load_benchmark_dataset()[: benchmark.n_problems]

    def _evaluate_problem(model, golden, benchmark):
        """Query the model for one problem, retrying on any error."""
        prompt = golden.input
        max_retries = 50
        for attempt in range(1, max_retries + 1):
            try:
                # NOTE(review): a_generate is called synchronously; if it is a
                # coroutine function the prediction is an un-awaited
                # coroutine — confirm against the model class.
                prediction = model.a_generate(prompt)
                score = benchmark.scorer.exact_match_score(golden.expected_output, prediction)
                break
            except Exception as e:
                # Retry boundary: any failure while generating or scoring
                # counts as one attempt.
                print(f"Error generating prediction: {e}. Retrying... ({attempt}/{max_retries})")
        else:
            # Every attempt failed: record an empty, zero-scored row.
            print("Maximum retries reached. Skipping this sample.")
            prediction = None
            score = 0
        return golden.input, str(prediction), golden.expected_output, score

    results = [_evaluate_problem(model, golden, benchmark) for golden in goldens]

    overall_correct_predictions = sum(score for _, _, _, score in results)
    # Divide by the number of problems actually evaluated, not the number
    # requested, and avoid ZeroDivisionError when nothing was evaluated.
    overall_total_predictions = len(results)
    overall_accuracy = (
        overall_correct_predictions / overall_total_predictions if overall_total_predictions else 0.0
    )

    def process_gsm8k_csv(file_path, tolerance=1e-6):
        """Re-score a predictions CSV numerically and write a score-named copy."""
        # Read every column as text (default comma separator) so the cleanup
        # below sees the raw values.
        df = pd.read_csv(file_path, dtype=str)

        # Strip whitespace and thousands separators, then convert to numbers;
        # anything unparsable becomes NaN.
        for column in ("prediction", "expected output"):
            cleaned = df[column].str.strip().str.replace(",", "", regex=False)
            df[column] = pd.to_numeric(cleaned, errors="coerce")

        # Approximate equality for floats; comparisons involving NaN are
        # False, so they score 0 after the integer cast.
        df["score"] = ((df["prediction"] - df["expected output"]).abs() <= tolerance).astype(int)
        average_score = df["score"].mean()

        # The output file sits next to the input and is named after the score.
        output_file_path = os.path.join(os.path.dirname(file_path), f"{average_score:.4f}.csv")
        df.to_csv(output_file_path, index=False)
        print(f"Data written to {output_file_path}")
        print(f"Average score: {average_score:.4f}")

        # Count empty predictions, then report correct/incorrect counts over
        # the rows that do have a prediction.
        num_empty_predictions = df["prediction"].isna().sum()
        df = df.dropna(subset=["prediction"])
        num_correct = (df["score"] == 1).sum()
        num_incorrect = (df["score"] == 0).sum()
        print(f"Number of empty predictions: {num_empty_predictions}")
        print(f"Number of correct predictions after removing empty ones: {num_correct}")
        print(f"Number of incorrect predictions after removing empty ones: {num_incorrect}")
        return average_score

    benchmark.predictions = pd.DataFrame(
        results, columns=["input", "prediction", "expected output", "score"]
    )
    benchmark.overall_score = overall_accuracy

    benchmark.predictions.to_csv(result_path, index=False)
    score = process_gsm8k_csv(file_path=result_path)
    return {"score": score}

View file

@ -5,7 +5,7 @@
from typing import Literal
from metagpt.llm import LLM
from metagpt.provider.llm_provider_registry import create_llm_instance
from metagpt.utils.cost_manager import CostManager
DatasetType = Literal["HumanEval", "MMBP", "Gsm8K", "MATH", "HotpotQa", "MMLU"]
@ -17,16 +17,16 @@ class Graph:
# Record the graph's name and dataset and build the LLM client it uses.
#
# NOTE(review): this span comes from a commit diff whose +/- markers were
# lost, so lines from before and after the change appear together.
# `llm: LLM,`, `self.model = llm` and `self.cost = cost_manager # TODO` look
# like the pre-change lines (`cost_manager` is not a parameter of this
# signature) — confirm against the committed file before relying on them.
def __init__(
self,
name: str,
llm: LLM,
llm_config,
dataset: DatasetType,
) -> None:
self.name = name
self.model = llm
self.dataset = dataset
self.cost = cost_manager # TODO
# Build the LLM from the supplied config via create_llm_instance, then attach
# a freshly constructed CostManager to it.
self.llm = create_llm_instance(llm_config)
self.llm.cost_manager = CostManager()
def __call__(self):
    """
    Implementation of the graph.

    The base class has no implementation, so calling it is an error.
    The previous placeholder built the exception without raising it and then
    returned ``self.llm.cost_manager.total_cost``; subclass implementations
    are presumably expected to return that cost — confirm with callers.

    Raises:
        NotImplementedError: Always, until a subclass overrides this method.
    """
    raise NotImplementedError("Subclasses must implement __call__ method")

View file

@ -49,6 +49,7 @@ class Optimizer:
operators: List,
optimized_path: str = None,
sample: int = 6,
q_type: str = "math", # math,code,quiz
) -> None:
self.optimize_llm = opt_llm
self.execute_llm = exec_llm
@ -61,6 +62,7 @@ class Optimizer:
self.sample = sample
self.score = "None"
self.top_scores = []
self.type = q_type
self.round = 1 # 起始轮次
def _initialize_oprimizer(self):
@ -130,33 +132,57 @@ class Optimizer:
# examples/ags/w_action_node/optimized/gsm8k/graphs/round_1
prompt_file_path = os.path.join(graphs_path, "prompt.py")
graph_file_path = os.path.join(graphs_path, "graph.py")
operator_file_path = os.path.join(graphs_path, "operator.py")
try:
with open(prompt_file_path, "r", encoding="utf-8") as file:
prompt_content = file.read()
with open(graph_file_path, "r", encoding="utf-8") as file:
graph_content = file.read()
with open(operator_file_path, "r", encoding="utf-8") as file:
operator_content = file.read()
except FileNotFoundError as e:
print(f"Error: File not found for round {round_number}: {e}")
raise
except Exception as e:
print(f"Error loading prompt for round {round_number}: {e}")
raise
return prompt_content, graph_content
return prompt_content, graph_content, operator_content
def _load_scores(self):
"""
Rebuild ``self.top_scores`` from the CSV files found in the round folders.

TODO: rewrite this function with a new structure for storing the scores.

NOTE(review): this span comes from a commit diff whose +/- markers and
indentation were lost. The hard-coded ``round_number = 1`` / ``score = 1``
entry looks like pre-change code shown next to the new directory scan, and
the function may continue past the last visible line — confirm against the
committed file.
"""
round_number = 1
score = 1
rounds_dir = os.path.join(self.root_path, "graphs")
self.top_scores = []
self.top_scores.append(
{
"round": round_number,
"score": score,
}
)
# Walk every round folder under <root_path>/graphs.
for round_dir in os.listdir(rounds_dir):
if os.path.isdir(os.path.join(rounds_dir, round_dir)) and round_dir.startswith("round_"):
round_number = int(round_dir.replace("round_", ""))
csv_file_path = os.path.join(rounds_dir, round_dir)
try:
# Scan the files in the folder, looking for CSV files.
# NOTE(review): the try starts before this loop, so an error on one file
# reaches the handlers below and `continue` moves on to the next round
# folder. Nesting of the append relative to the `.csv` check cannot be
# seen in this view.
for filename in os.listdir(csv_file_path):
score = 0
if filename.endswith(".csv"):
# The file name (without its extension) is the score.
score = float(filename[:-4]) # strip the ".csv" suffix
self.top_scores.append(
{
"round": round_number,
"score": score,
}
)
except FileNotFoundError as e:
print(f"Error: File not found for round {round_number}: {e}")
continue
except ValueError as e:
print(f"Error parsing score from filename for round {round_number}: {e}")
continue
except Exception as e:
print(f"Error processing round {round_number}: {e}")
continue
# Sort the collected entries by score, highest first.
self.top_scores.sort(key=lambda x: x["score"], reverse=True)
@ -272,7 +298,7 @@ class Optimizer:
print(top_rounds)
prompt, graph_load = self._read_files(sample["round"])
prompt, graph_load, operator = self._read_files(sample["round"])
score = sample["score"]
# 正则表达式匹配 SolveGraph 开始的内容
@ -298,9 +324,12 @@ class Optimizer:
else:
experience = f"No experience data found for round {current_round}."
graph_input = GRAPH_INPUT.format(experinece=experience, score=score, graph=graph[0], prompt=prompt)
graph_input = GRAPH_INPUT.format(
experinece=experience, score=score, graph=graph[0], prompt=prompt, type=self.type
)
graph_system = GRAPH_OPTIMIZE_PROMPT.format(type=self.type)
node_prompt = GRAPH_OPTIMIZE_PROMPT + graph_input # TODO 看一眼谁先谁后这个地方
node_prompt = graph_system + graph_input # TODO 看一眼谁先谁后这个地方
node = await ActionNode.from_pydantic(GraphOptimize).fill(
context=node_prompt, mode="context_fill", llm=self.llm