mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-08 15:05:17 +02:00
Update Multi LLM Config & Basic Evaluator
This commit is contained in:
parent
a3ff25430e
commit
1593e98c45
7 changed files with 302 additions and 29 deletions
|
|
@ -2,8 +2,13 @@
|
|||
# @Date : 8/23/2024 10:00 AM
|
||||
# @Author : all
|
||||
# @Desc : evaluate for different dataset
|
||||
import datetime
|
||||
import os
|
||||
from typing import Literal
|
||||
|
||||
import pandas as pd
|
||||
from deepeval.benchmarks import GSM8K
|
||||
|
||||
# TODO: finish manually splitting the experiment datasets
|
||||
|
||||
DatasetType = Literal["HumanEval", "MMBP", "Gsm8K", "MATH", "HotpotQa", "MMLU"]
|
||||
|
|
@ -17,10 +22,12 @@ class Evaluator:
|
|||
def __init__(self, eval_path: str):
|
||||
pass
|
||||
|
||||
def validation_evaluate(self, dataset: "DatasetType", result_path: str, model=None):
    """
    Evaluates on validation dataset.

    Args:
        dataset: Name of the dataset to evaluate on. Only "Gsm8K" is dispatched here.
        result_path: Path of the CSV file the per-sample predictions are written to.
        model: Model forwarded to the dataset-specific evaluator. Optional so that
            existing two-argument callers keep working; `_gsm8k_eval` raises
            `ValueError` when it is `None`.

    Returns:
        The result of `_gsm8k_eval` for "Gsm8K"; `None` for every other dataset.
    """
    if dataset == "Gsm8K":
        # `_gsm8k_eval` expects (model, result_path). Passing `result_path` alone
        # bound it to `model` and failed with a missing-argument TypeError.
        return self._gsm8k_eval(model, result_path)
    return None
|
||||
|
||||
def test_evaluate(self, dataset: DatasetType):
|
||||
|
|
@ -28,3 +35,113 @@ class Evaluator:
|
|||
Evaluates on test dataset.
|
||||
"""
|
||||
pass
|
||||
|
||||
def _gsm8k_eval(self, model, result_path, samples: int = 1000):
|
||||
"""
|
||||
Evaluate on GSM8K dataset.
|
||||
"""
|
||||
if model is None:
|
||||
raise ValueError("Model is required for evaluation.")
|
||||
|
||||
benchmark = GSM8K(n_problems=samples, n_shots=0, enable_cot=False)
|
||||
goldens = benchmark.load_benchmark_dataset()[: benchmark.n_problems]
|
||||
|
||||
def _evaluate_problem(model, golden, benchmark):
|
||||
prompt = golden.input
|
||||
|
||||
max_retries = 50
|
||||
retries = 0
|
||||
|
||||
while retries < max_retries:
|
||||
try:
|
||||
prediction = model.a_generate(prompt)
|
||||
score = benchmark.scorer.exact_match_score(golden.expected_output, prediction)
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
retries += 1
|
||||
print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
|
||||
|
||||
if retries == max_retries:
|
||||
print("Maximum retries reached. Skipping this sample.")
|
||||
prediction = None
|
||||
score = 0
|
||||
break
|
||||
|
||||
return golden.input, str(prediction), golden.expected_output, score
|
||||
|
||||
results = [_evaluate_problem(model, golden, benchmark) for golden in goldens]
|
||||
|
||||
overall_correct_predictions = sum(score for _, _, _, score in results)
|
||||
overall_total_predictions = benchmark.n_problems
|
||||
overall_accuracy = overall_correct_predictions / overall_total_predictions
|
||||
|
||||
def process_gsm8k_csv(file_path, tolerance=1e-6):
|
||||
# 读取 CSV 文件
|
||||
df = pd.read_csv(file_path, dtype=str) # 使用默认逗号分隔符,并指定所有列为字符串类型
|
||||
|
||||
# 清理预测和期望输出列
|
||||
df["prediction"] = df["prediction"].str.strip()
|
||||
df["prediction"] = df["prediction"].str.replace(",", "", regex=True)
|
||||
df["expected output"] = df["expected output"].str.strip()
|
||||
df["expected output"] = df["expected output"].str.replace(",", "", regex=True)
|
||||
|
||||
# 将列转换为数值类型
|
||||
df["prediction"] = pd.to_numeric(df["prediction"], errors="coerce")
|
||||
df["expected output"] = pd.to_numeric(df["expected output"], errors="coerce")
|
||||
|
||||
# 计算 score 列
|
||||
# 对于浮点数,使用近似相等的逻辑
|
||||
df["score"] = (df["prediction"] - df["expected output"]).abs() <= tolerance
|
||||
|
||||
# 将布尔值转换为整数
|
||||
df["score"] = df["score"].astype(int)
|
||||
|
||||
# 计算 score 列的平均值
|
||||
average_score = df["score"].mean()
|
||||
|
||||
# 获取输入文件的目录
|
||||
input_dir = os.path.dirname(file_path)
|
||||
|
||||
# 创建输出文件路径
|
||||
output_file_name = f"{average_score:.4f}.csv"
|
||||
output_file_path = os.path.join(input_dir, output_file_name)
|
||||
|
||||
# 写入新的 CSV 文件
|
||||
df.to_csv(output_file_path, index=False)
|
||||
|
||||
print(f"Data written to {output_file_path}")
|
||||
print(f"Average score: {average_score:.4f}")
|
||||
|
||||
# 统计空值数量
|
||||
num_empty_predictions = df["prediction"].isna().sum()
|
||||
|
||||
# 删除包含空 prediction 的行
|
||||
df = df.dropna(subset=["prediction"])
|
||||
|
||||
# 重新计算正确的、错误的以及空的个数
|
||||
num_correct = (df["score"] == 1).sum()
|
||||
num_incorrect = (df["score"] == 0).sum()
|
||||
|
||||
print(f"Number of empty predictions: {num_empty_predictions}")
|
||||
print(f"Number of correct predictions after removing empty ones: {num_correct}")
|
||||
print(f"Number of incorrect predictions after removing empty ones: {num_incorrect}")
|
||||
|
||||
return average_score
|
||||
|
||||
predictions_row = [
|
||||
(input, prediction, expected_output, score) for input, prediction, expected_output, score in results
|
||||
]
|
||||
benchmark.predictions = pd.DataFrame(
|
||||
predictions_row, columns=["input", "prediction", "expected output", "score"]
|
||||
)
|
||||
benchmark.overall_score = overall_accuracy
|
||||
now = datetime.datetime.now()
|
||||
now.strftime("%Y-%m-%d_%H-%M-%S").replace(":", "_")
|
||||
|
||||
# file_path = f'gsm8k_{overall_accuracy}_{now_time}.csv'
|
||||
|
||||
benchmark.predictions.to_csv(result_path, index=False)
|
||||
|
||||
score = process_gsm8k_csv(file_path=result_path)
|
||||
return {"score": score}
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
from typing import Literal
|
||||
|
||||
from metagpt.llm import LLM
|
||||
from metagpt.provider.llm_provider_registry import create_llm_instance
|
||||
from metagpt.utils.cost_manager import CostManager
|
||||
|
||||
DatasetType = Literal["HumanEval", "MMBP", "Gsm8K", "MATH", "HotpotQa", "MMLU"]
|
||||
|
|
@ -17,16 +17,16 @@ class Graph:
|
|||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
llm: LLM,
|
||||
llm_config,
|
||||
dataset: DatasetType,
|
||||
) -> None:
|
||||
self.name = name
|
||||
self.model = llm
|
||||
self.dataset = dataset
|
||||
self.cost = cost_manager # TODO
|
||||
self.llm = create_llm_instance(llm_config)
|
||||
self.llm.cost_manager = CostManager()
|
||||
|
||||
def __call__():
|
||||
def __call__(self):
|
||||
"""
|
||||
Implementation of the graph
|
||||
"""
|
||||
NotImplementedError("Subclasses must implement __call__ method")
|
||||
return self.llm.cost_manager.total_cost
|
||||
|
|
|
|||
|
|
@ -49,6 +49,7 @@ class Optimizer:
|
|||
operators: List,
|
||||
optimized_path: str = None,
|
||||
sample: int = 6,
|
||||
q_type: str = "math", # math,code,quiz
|
||||
) -> None:
|
||||
self.optimize_llm = opt_llm
|
||||
self.execute_llm = exec_llm
|
||||
|
|
@ -61,6 +62,7 @@ class Optimizer:
|
|||
self.sample = sample
|
||||
self.score = "None"
|
||||
self.top_scores = []
|
||||
self.type = q_type
|
||||
self.round = 1 # 起始轮次
|
||||
|
||||
def _initialize_oprimizer(self):
|
||||
|
|
@ -130,33 +132,57 @@ class Optimizer:
|
|||
# examples/ags/w_action_node/optimized/gsm8k/graphs/round_1
|
||||
prompt_file_path = os.path.join(graphs_path, "prompt.py")
|
||||
graph_file_path = os.path.join(graphs_path, "graph.py")
|
||||
operator_file_path = os.path.join(graphs_path, "operator.py")
|
||||
|
||||
try:
|
||||
with open(prompt_file_path, "r", encoding="utf-8") as file:
|
||||
prompt_content = file.read()
|
||||
with open(graph_file_path, "r", encoding="utf-8") as file:
|
||||
graph_content = file.read()
|
||||
with open(operator_file_path, "r", encoding="utf-8") as file:
|
||||
operator_content = file.read()
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: File not found for round {round_number}: {e}")
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"Error loading prompt for round {round_number}: {e}")
|
||||
raise
|
||||
return prompt_content, graph_content
|
||||
return prompt_content, graph_content, operator_content
|
||||
|
||||
def _load_scores(self):
|
||||
"""
|
||||
# TODO 重写这个函数,写一个新的结构存储分数
|
||||
"""
|
||||
round_number = 1
|
||||
score = 1
|
||||
rounds_dir = os.path.join(self.root_path, "graphs")
|
||||
self.top_scores = []
|
||||
|
||||
self.top_scores.append(
|
||||
{
|
||||
"round": round_number,
|
||||
"score": score,
|
||||
}
|
||||
)
|
||||
# 遍历所有轮次的文件夹
|
||||
for round_dir in os.listdir(rounds_dir):
|
||||
if os.path.isdir(os.path.join(rounds_dir, round_dir)) and round_dir.startswith("round_"):
|
||||
round_number = int(round_dir.replace("round_", ""))
|
||||
csv_file_path = os.path.join(rounds_dir, round_dir)
|
||||
try:
|
||||
# 遍历文件夹中的文件,查找 CSV 文件
|
||||
for filename in os.listdir(csv_file_path):
|
||||
score = 0
|
||||
|
||||
if filename.endswith(".csv"):
|
||||
# 文件名就是分数
|
||||
score = float(filename[:-4]) # 去除.csv
|
||||
|
||||
self.top_scores.append(
|
||||
{
|
||||
"round": round_number,
|
||||
"score": score,
|
||||
}
|
||||
)
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: File not found for round {round_number}: {e}")
|
||||
continue
|
||||
except ValueError as e:
|
||||
print(f"Error parsing score from filename for round {round_number}: {e}")
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"Error processing round {round_number}: {e}")
|
||||
continue
|
||||
|
||||
# 对所有轮次的分数进行排序
|
||||
self.top_scores.sort(key=lambda x: x["score"], reverse=True)
|
||||
|
|
@ -272,7 +298,7 @@ class Optimizer:
|
|||
|
||||
print(top_rounds)
|
||||
|
||||
prompt, graph_load = self._read_files(sample["round"])
|
||||
prompt, graph_load, operator = self._read_files(sample["round"])
|
||||
score = sample["score"]
|
||||
|
||||
# 正则表达式匹配 SolveGraph 开始的内容
|
||||
|
|
@ -298,9 +324,12 @@ class Optimizer:
|
|||
else:
|
||||
experience = f"No experience data found for round {current_round}."
|
||||
|
||||
graph_input = GRAPH_INPUT.format(experinece=experience, score=score, graph=graph[0], prompt=prompt)
|
||||
graph_input = GRAPH_INPUT.format(
|
||||
experinece=experience, score=score, graph=graph[0], prompt=prompt, type=self.type
|
||||
)
|
||||
graph_system = GRAPH_OPTIMIZE_PROMPT.format(type=self.type)
|
||||
|
||||
node_prompt = GRAPH_OPTIMIZE_PROMPT + graph_input # TODO 看一眼谁先谁后这个地方
|
||||
node_prompt = graph_system + graph_input # TODO 看一眼谁先谁后这个地方
|
||||
|
||||
node = await ActionNode.from_pydantic(GraphOptimize).fill(
|
||||
context=node_prompt, mode="context_fill", llm=self.llm
|
||||
|
|
|
|||
|
|
@ -8,12 +8,14 @@
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional, Union
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||
|
||||
from metagpt.actions.action_node import ActionNode
|
||||
from metagpt.configs.models_config import ModelsConfig
|
||||
from metagpt.context_mixin import ContextMixin
|
||||
from metagpt.provider.llm_provider_registry import create_llm_instance
|
||||
from metagpt.schema import (
|
||||
CodePlanAndChangeContext,
|
||||
CodeSummarizeContext,
|
||||
|
|
@ -35,6 +37,19 @@ class Action(SerializationMixin, ContextMixin, BaseModel):
|
|||
prefix: str = "" # aask*时会加上prefix,作为system_message
|
||||
desc: str = "" # for skill manager
|
||||
node: ActionNode = Field(default=None, exclude=True)
|
||||
# The model name or API type of LLM of the `models` in the `config2.yaml`;
|
||||
# Using `None` to use the `llm` configuration in the `config2.yaml`.
|
||||
llm_name_or_type: Optional[str] = None
|
||||
|
||||
@model_validator(mode="after")
@classmethod
def _update_private_llm(cls, data: Any) -> Any:
    """
    Replace the action's LLM with the one selected by `llm_name_or_type`.

    Args:
        data: The validated model instance.

    Returns:
        The same instance. Its `llm` is replaced only when `llm_name_or_type`
        resolves to a configuration in `ModelsConfig`; the replacement keeps the
        previous LLM's `cost_manager`.
    """
    # Nothing requested: keep the existing `llm`. `ModelsConfig.get` returns None
    # for an empty name anyway, so loading and merging the YAML files via
    # `ModelsConfig.default()` would be wasted work on every instantiation.
    if not data.llm_name_or_type:
        return data

    config = ModelsConfig.default().get(data.llm_name_or_type)
    if config:
        llm = create_llm_instance(config)
        # Share the existing cost manager with the replacement LLM.
        llm.cost_manager = data.llm.cost_manager
        data.llm = llm
    return data
|
||||
|
||||
@property
|
||||
def repo(self) -> ProjectRepo:
|
||||
|
|
|
|||
|
|
@ -504,7 +504,6 @@ class ActionNode:
|
|||
### format example (must be strictly followed) (do not include any other formats except for the given XML format)
|
||||
{example_str}
|
||||
"""
|
||||
print(context)
|
||||
return context
|
||||
|
||||
async def code_fill(self, context, function_name=None, timeout=USE_CONFIG_TIMEOUT):
|
||||
|
|
|
|||
|
|
@ -10,9 +10,9 @@ from typing import Optional
|
|||
|
||||
from pydantic import field_validator
|
||||
|
||||
from metagpt.const import LLM_API_TIMEOUT
|
||||
from metagpt.const import CONFIG_ROOT, LLM_API_TIMEOUT, METAGPT_ROOT
|
||||
from metagpt.utils.yaml_model import YamlModel
|
||||
from metagpt.const import METAGPT_ROOT, CONFIG_ROOT
|
||||
|
||||
|
||||
class LLMType(Enum):
|
||||
OPENAI = "openai"
|
||||
|
|
@ -97,12 +97,13 @@ class LLMConfig(YamlModel):
|
|||
repo_config_path = METAGPT_ROOT / "config/config2.yaml"
|
||||
root_config_path = CONFIG_ROOT / "config2.yaml"
|
||||
if root_config_path.exists():
|
||||
raise ValueError(
|
||||
f"Please set your API key in {root_config_path}. If you also set your config in {repo_config_path}, \nthe former will overwrite the latter. This may cause unexpected result.\n")
|
||||
raise ValueError(
|
||||
f"Please set your API key in {root_config_path}. If you also set your config in {repo_config_path}, \nthe former will overwrite the latter. This may cause unexpected result.\n"
|
||||
)
|
||||
elif repo_config_path.exists():
|
||||
raise ValueError(f"Please set your API key in {repo_config_path}")
|
||||
else:
|
||||
raise ValueError(f"Please set your API key in config2.yaml")
|
||||
raise ValueError("Please set your API key in config2.yaml")
|
||||
return v
|
||||
|
||||
@field_validator("timeout")
|
||||
|
|
|
|||
112
metagpt/configs/models_config.py
Normal file
112
metagpt/configs/models_config.py
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
models_config.py
|
||||
|
||||
This module defines the ModelsConfig class for handling configuration of LLM models.
|
||||
|
||||
Attributes:
|
||||
CONFIG_ROOT (Path): Root path for configuration files.
|
||||
METAGPT_ROOT (Path): Root path for MetaGPT files.
|
||||
|
||||
Classes:
|
||||
ModelsConfig (YamlModel): Configuration class for LLM models.
|
||||
"""
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from pydantic import Field, field_validator
|
||||
|
||||
from metagpt.config2 import merge_dict
|
||||
from metagpt.configs.llm_config import LLMConfig
|
||||
from metagpt.const import CONFIG_ROOT, METAGPT_ROOT
|
||||
from metagpt.utils.yaml_model import YamlModel
|
||||
|
||||
|
||||
class ModelsConfig(YamlModel):
    """
    Configuration class for `models` in `config2.yaml`.

    Attributes:
        models (Dict[str, LLMConfig]): Dictionary mapping model names or types to LLMConfig objects.

    Methods:
        update_llm_model(cls, value): Validates and updates LLM model configurations.
        from_home(cls, path): Loads configuration from a path relative to CONFIG_ROOT.
        default(cls): Loads default configuration from predefined paths.
        get(self, name_or_type: str) -> Optional[LLMConfig]: Retrieves LLMConfig by name or API type.
    """

    models: Dict[str, LLMConfig] = Field(default_factory=dict)

    @field_validator("models", mode="before")
    @classmethod
    def update_llm_model(cls, value):
        """
        Fill in a missing `model` on each entry with the entry's key.

        Args:
            value (Dict[str, Union[LLMConfig, dict]]): Dictionary of LLM configurations.

        Returns:
            Dict[str, Union[LLMConfig, dict]]: The same dictionary, updated in place.
            A non-dict input (e.g. `models: null` in the YAML) is returned untouched
            so the regular field validation reports it, instead of `.items()`
            failing with AttributeError here.
        """
        if not isinstance(value, dict):
            return value
        for key, config in value.items():
            if isinstance(config, LLMConfig):
                config.model = config.model or key
            elif isinstance(config, dict):
                config["model"] = config.get("model") or key
        return value

    @classmethod
    def from_home(cls, path):
        """
        Loads configuration from a file under CONFIG_ROOT.

        Args:
            path (str): Path to the configuration file, relative to CONFIG_ROOT.

        Returns:
            Optional[ModelsConfig]: Loaded configuration, or None if the file doesn't exist.
        """
        pathname = CONFIG_ROOT / path
        if not pathname.exists():
            return None
        # `cls` rather than the hard-coded class name so subclasses get their own type.
        return cls.from_yaml_file(pathname)

    @classmethod
    def default(cls):
        """
        Loads default configuration from predefined paths.

        The repository-level `config/config2.yaml` and the `config2.yaml` under
        CONFIG_ROOT are read in that order and combined with `merge_dict`.

        Returns:
            ModelsConfig: Configuration built from the merged dictionaries.
        """
        default_config_paths: List[Path] = [
            METAGPT_ROOT / "config/config2.yaml",
            CONFIG_ROOT / "config2.yaml",
        ]

        dicts = [cls.read_yaml(path) for path in default_config_paths]
        final = merge_dict(dicts)
        return cls(**final)

    def get(self, name_or_type: str) -> Optional[LLMConfig]:
        """
        Retrieves LLMConfig object by name or API type.

        Args:
            name_or_type (str): Name or API type of the LLM model.

        Returns:
            Optional[LLMConfig]: The entry stored under `name_or_type` if present,
            otherwise the first entry whose `api_type` equals it, otherwise None.
            An empty `name_or_type` always yields None.
        """
        if not name_or_type:
            return None
        model = self.models.get(name_or_type)
        if model:
            return model
        # No entry under that key: fall back to matching on the API type.
        for m in self.models.values():
            if m.api_type == name_or_type:
                return m
        return None
|
||||
Loading…
Add table
Add a link
Reference in a new issue