diff --git a/examples/ags/w_action_node/evaluator.py b/examples/ags/w_action_node/evaluator.py index 42e2a7d96..9fbffeea1 100644 --- a/examples/ags/w_action_node/evaluator.py +++ b/examples/ags/w_action_node/evaluator.py @@ -2,8 +2,13 @@ # @Date : 8/23/2024 10:00 AM # @Author : all # @Desc : evaluate for different dataset +import datetime +import os from typing import Literal +import pandas as pd +from deepeval.benchmarks import GSM8K + # TODO 完成实验数据集的手动划分 DatasetType = Literal["HumanEval", "MMBP", "Gsm8K", "MATH", "HotpotQa", "MMLU"] @@ -17,10 +22,12 @@ class Evaluator: def __init__(self, eval_path: str): pass - def validation_evaluate(self, dataset: DatasetType): + def validation_evaluate(self, dataset: DatasetType, result_path: str): """ Evaluates on validation dataset. """ + if dataset == "Gsm8K": + return self._gsm8k_eval(result_path) pass def test_evaluate(self, dataset: DatasetType): @@ -28,3 +35,113 @@ class Evaluator: Evaluates on test dataset. """ pass + + def _gsm8k_eval(self, model, result_path, samples: int = 1000): + """ + Evaluate on GSM8K dataset. + """ + if model is None: + raise ValueError("Model is required for evaluation.") + + benchmark = GSM8K(n_problems=samples, n_shots=0, enable_cot=False) + goldens = benchmark.load_benchmark_dataset()[: benchmark.n_problems] + + def _evaluate_problem(model, golden, benchmark): + prompt = golden.input + + max_retries = 50 + retries = 0 + + while retries < max_retries: + try: + prediction = model.a_generate(prompt) + score = benchmark.scorer.exact_match_score(golden.expected_output, prediction) + break + + except Exception as e: + retries += 1 + print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})") + + if retries == max_retries: + print("Maximum retries reached. Skipping this sample.") + prediction = None + score = 0 + break + + return golden.input, str(prediction), golden.expected_output, score + + results = [_evaluate_problem(model, golden, benchmark) for golden in goldens] + + overall_correct_predictions = sum(score for _, _, _, score in results) + overall_total_predictions = benchmark.n_problems + overall_accuracy = overall_correct_predictions / overall_total_predictions + + def process_gsm8k_csv(file_path, tolerance=1e-6): + # 读取 CSV 文件 + df = pd.read_csv(file_path, dtype=str) # 使用默认逗号分隔符,并指定所有列为字符串类型 + + # 清理预测和期望输出列 + df["prediction"] = df["prediction"].str.strip() + df["prediction"] = df["prediction"].str.replace(",", "", regex=True) + df["expected output"] = df["expected output"].str.strip() + df["expected output"] = df["expected output"].str.replace(",", "", regex=True) + + # 将列转换为数值类型 + df["prediction"] = pd.to_numeric(df["prediction"], errors="coerce") + df["expected output"] = pd.to_numeric(df["expected output"], errors="coerce") + + # 计算 score 列 + # 对于浮点数,使用近似相等的逻辑 + df["score"] = (df["prediction"] - df["expected output"]).abs() <= tolerance + + # 将布尔值转换为整数 + df["score"] = df["score"].astype(int) + + # 计算 score 列的平均值 + average_score = df["score"].mean() + + # 获取输入文件的目录 + input_dir = os.path.dirname(file_path) + + # 创建输出文件路径 + output_file_name = f"{average_score:.4f}.csv" + output_file_path = os.path.join(input_dir, output_file_name) + + # 写入新的 CSV 文件 + df.to_csv(output_file_path, index=False) + + print(f"Data written to {output_file_path}") + print(f"Average score: {average_score:.4f}") + + # 统计空值数量 + num_empty_predictions = df["prediction"].isna().sum() + + # 删除包含空 prediction 的行 + df = df.dropna(subset=["prediction"]) + + # 重新计算正确的、错误的以及空的个数 + num_correct = (df["score"] == 1).sum() + num_incorrect = (df["score"] == 0).sum() + + print(f"Number of empty predictions: {num_empty_predictions}") + print(f"Number of correct predictions after removing empty ones: {num_correct}") + print(f"Number of incorrect predictions after removing empty ones: {num_incorrect}") + + return average_score + + predictions_row = [ + (input, prediction, expected_output, score) for input, prediction, expected_output, score in results + ] + benchmark.predictions = pd.DataFrame( + predictions_row, columns=["input", "prediction", "expected output", "score"] + ) + benchmark.overall_score = overall_accuracy + now = datetime.datetime.now() + now.strftime("%Y-%m-%d_%H-%M-%S").replace(":", "_") + + # file_path = f'gsm8k_{overall_accuracy}_{now_time}.csv' + + benchmark.predictions.to_csv(result_path, index=False) + + score = process_gsm8k_csv(file_path=result_path) + return {"score": score} diff --git a/examples/ags/w_action_node/graph.py b/examples/ags/w_action_node/graph.py index 24865a2a5..5f80c9364 100644 --- a/examples/ags/w_action_node/graph.py +++ b/examples/ags/w_action_node/graph.py @@ -5,7 +5,7 @@ from typing import Literal -from metagpt.llm import LLM +from metagpt.provider.llm_provider_registry import create_llm_instance from metagpt.utils.cost_manager import CostManager DatasetType = Literal["HumanEval", "MMBP", "Gsm8K", "MATH", "HotpotQa", "MMLU"] @@ -17,16 +17,16 @@ class Graph: def __init__( self, name: str, - llm: LLM, + llm_config, dataset: DatasetType, ) -> None: self.name = name - self.model = llm self.dataset = dataset - self.cost = cost_manager # TODO + self.llm = create_llm_instance(llm_config) + self.llm.cost_manager = CostManager() - def __call__(): + def __call__(self): """ Implementation of the graph """ - NotImplementedError("Subclasses must implement __call__ method") + return self.llm.cost_manager.total_cost diff --git a/examples/ags/w_action_node/optimizer.py b/examples/ags/w_action_node/optimizer.py index 5a05969f1..b60f8f3ec 100644 --- a/examples/ags/w_action_node/optimizer.py +++ b/examples/ags/w_action_node/optimizer.py @@ -49,6 +49,7 @@ class Optimizer: operators: List, optimized_path: str = None, sample: int = 6, + q_type: str = "math", # math,code,quiz ) -> None: self.optimize_llm = opt_llm self.execute_llm = exec_llm @@ -61,6 +62,7 @@ class Optimizer: self.sample = sample self.score = "None" self.top_scores = [] + self.type = q_type self.round = 1 # 起始轮次 def _initialize_oprimizer(self): @@ -130,33 +132,57 @@ class Optimizer: # examples/ags/w_action_node/optimized/gsm8k/graphs/round_1 prompt_file_path = os.path.join(graphs_path, "prompt.py") graph_file_path = os.path.join(graphs_path, "graph.py") + operator_file_path = os.path.join(graphs_path, "operator.py") try: with open(prompt_file_path, "r", encoding="utf-8") as file: prompt_content = file.read() with open(graph_file_path, "r", encoding="utf-8") as file: graph_content = file.read() + with open(operator_file_path, "r", encoding="utf-8") as file: + operator_content = file.read() except FileNotFoundError as e: print(f"Error: File not found for round {round_number}: {e}") raise except Exception as e: print(f"Error loading prompt for round {round_number}: {e}") raise - return prompt_content, graph_content + return prompt_content, graph_content, operator_content def _load_scores(self): - """ - # TODO 重写这个函数,写一个新的结构存储分数 - """ - round_number = 1 - score = 1 + rounds_dir = os.path.join(self.root_path, "graphs") + self.top_scores = [] - self.top_scores.append( - { - "round": round_number, - "score": score, - } - ) + # 遍历所有轮次的文件夹 + for round_dir in os.listdir(rounds_dir): + if os.path.isdir(os.path.join(rounds_dir, round_dir)) and round_dir.startswith("round_"): + round_number = int(round_dir.replace("round_", "")) + csv_file_path = os.path.join(rounds_dir, round_dir) + try: + # 遍历文件夹中的文件,查找 CSV 文件 + for filename in os.listdir(csv_file_path): + score = 0 + + if filename.endswith(".csv"): + # 文件名就是分数 + score = float(filename[:-4]) # 去除.csv + + self.top_scores.append( + { + "round": round_number, + "score": score, + } + ) + + except FileNotFoundError as e: + print(f"Error: File not found for round {round_number}: {e}") + continue + except ValueError as e: + print(f"Error parsing score from filename for round {round_number}: {e}") + continue + except Exception as e: + print(f"Error processing round {round_number}: {e}") + continue # 对所有轮次的分数进行排序 self.top_scores.sort(key=lambda x: x["score"], reverse=True) @@ -272,7 +298,7 @@ class Optimizer: print(top_rounds) - prompt, graph_load = self._read_files(sample["round"]) + prompt, graph_load, operator = self._read_files(sample["round"]) score = sample["score"] # 正则表达式匹配 SolveGraph 开始的内容 @@ -298,9 +324,12 @@ class Optimizer: else: experience = f"No experience data found for round {current_round}." - graph_input = GRAPH_INPUT.format(experinece=experience, score=score, graph=graph[0], prompt=prompt) + graph_input = GRAPH_INPUT.format( + experinece=experience, score=score, graph=graph[0], prompt=prompt, type=self.type + ) + graph_system = GRAPH_OPTIMIZE_PROMPT.format(type=self.type) - node_prompt = GRAPH_OPTIMIZE_PROMPT + graph_input # TODO 看一眼谁先谁后这个地方 + node_prompt = graph_system + graph_input # TODO 看一眼谁先谁后这个地方 node = await ActionNode.from_pydantic(GraphOptimize).fill( context=node_prompt, mode="context_fill", llm=self.llm diff --git a/metagpt/actions/action.py b/metagpt/actions/action.py index 1b93213f7..20c052aa9 100644 --- a/metagpt/actions/action.py +++ b/metagpt/actions/action.py @@ -8,12 +8,14 @@ from __future__ import annotations -from typing import Optional, Union +from typing import Any, Optional, Union from pydantic import BaseModel, ConfigDict, Field, model_validator from metagpt.actions.action_node import ActionNode +from metagpt.configs.models_config import ModelsConfig from metagpt.context_mixin import ContextMixin +from metagpt.provider.llm_provider_registry import create_llm_instance from metagpt.schema import ( CodePlanAndChangeContext, CodeSummarizeContext, @@ -35,6 +37,19 @@ class Action(SerializationMixin, ContextMixin, BaseModel): prefix: str = "" # aask*时会加上prefix,作为system_message desc: str = "" # for skill manager node: ActionNode = Field(default=None, exclude=True) + # The model name or API type of LLM of the `models` in the `config2.yaml`; + # Using `None` to use the `llm` configuration in the `config2.yaml`. + llm_name_or_type: Optional[str] = None + + @model_validator(mode="after") + @classmethod + def _update_private_llm(cls, data: Any) -> Any: + config = ModelsConfig.default().get(data.llm_name_or_type) + if config: + llm = create_llm_instance(config) + llm.cost_manager = data.llm.cost_manager + data.llm = llm + return data @property def repo(self) -> ProjectRepo: diff --git a/metagpt/actions/action_node.py b/metagpt/actions/action_node.py index 343a4f9ac..3b93b47df 100644 --- a/metagpt/actions/action_node.py +++ b/metagpt/actions/action_node.py @@ -504,7 +504,6 @@ class ActionNode: ### format example (must be strictly followed) (do not include any other formats except for the given XML format) {example_str} """ - print(context) return context async def code_fill(self, context, function_name=None, timeout=USE_CONFIG_TIMEOUT): diff --git a/metagpt/configs/llm_config.py b/metagpt/configs/llm_config.py index 0284c8993..67fb6afdb 100644 --- a/metagpt/configs/llm_config.py +++ b/metagpt/configs/llm_config.py @@ -10,9 +10,9 @@ from typing import Optional from pydantic import field_validator -from metagpt.const import LLM_API_TIMEOUT +from metagpt.const import CONFIG_ROOT, LLM_API_TIMEOUT, METAGPT_ROOT from metagpt.utils.yaml_model import YamlModel -from metagpt.const import METAGPT_ROOT, CONFIG_ROOT + class LLMType(Enum): OPENAI = "openai" @@ -97,12 +97,13 @@ class LLMConfig(YamlModel): repo_config_path = METAGPT_ROOT / "config/config2.yaml" root_config_path = CONFIG_ROOT / "config2.yaml" if root_config_path.exists(): - raise ValueError( - f"Please set your API key in {root_config_path}. If you also set your config in {repo_config_path}, \nthe former will overwrite the latter. This may cause unexpected result.\n") + raise ValueError( + f"Please set your API key in {root_config_path}. If you also set your config in {repo_config_path}, \nthe former will overwrite the latter. This may cause unexpected result.\n" + ) elif repo_config_path.exists(): raise ValueError(f"Please set your API key in {repo_config_path}") else: - raise ValueError(f"Please set your API key in config2.yaml") + raise ValueError("Please set your API key in config2.yaml") return v @field_validator("timeout") diff --git a/metagpt/configs/models_config.py b/metagpt/configs/models_config.py new file mode 100644 index 000000000..bc4897fec --- /dev/null +++ b/metagpt/configs/models_config.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +models_config.py + +This module defines the ModelsConfig class for handling configuration of LLM models. + +Attributes: + CONFIG_ROOT (Path): Root path for configuration files. + METAGPT_ROOT (Path): Root path for MetaGPT files. + +Classes: + ModelsConfig (YamlModel): Configuration class for LLM models. +""" +from pathlib import Path +from typing import Dict, List, Optional + +from pydantic import Field, field_validator + +from metagpt.config2 import merge_dict +from metagpt.configs.llm_config import LLMConfig +from metagpt.const import CONFIG_ROOT, METAGPT_ROOT +from metagpt.utils.yaml_model import YamlModel + + +class ModelsConfig(YamlModel): + """ + Configuration class for `models` in `config2.yaml`. + + Attributes: + models (Dict[str, LLMConfig]): Dictionary mapping model names or types to LLMConfig objects. + + Methods: + update_llm_model(cls, value): Validates and updates LLM model configurations. + from_home(cls, path): Loads configuration from ~/.metagpt/config2.yaml. + default(cls): Loads default configuration from predefined paths. + get(self, name_or_type: str) -> Optional[LLMConfig]: Retrieves LLMConfig by name or API type. + """ + + models: Dict[str, LLMConfig] = Field(default_factory=dict) + + @field_validator("models", mode="before") + @classmethod + def update_llm_model(cls, value): + """ + Validates and updates LLM model configurations. + + Args: + value (Dict[str, Union[LLMConfig, dict]]): Dictionary of LLM configurations. + + Returns: + Dict[str, Union[LLMConfig, dict]]: Updated dictionary of LLM configurations. + """ + for key, config in value.items(): + if isinstance(config, LLMConfig): + config.model = config.model or key + elif isinstance(config, dict): + config["model"] = config.get("model") or key + return value + + @classmethod + def from_home(cls, path): + """ + Loads configuration from ~/.metagpt/config2.yaml. + + Args: + path (str): Relative path to configuration file. + + Returns: + Optional[ModelsConfig]: Loaded ModelsConfig object or None if file doesn't exist. + """ + pathname = CONFIG_ROOT / path + if not pathname.exists(): + return None + return ModelsConfig.from_yaml_file(pathname) + + @classmethod + def default(cls): + """ + Loads default configuration from predefined paths. + + Returns: + ModelsConfig: Default ModelsConfig object. + """ + default_config_paths: List[Path] = [ + METAGPT_ROOT / "config/config2.yaml", + CONFIG_ROOT / "config2.yaml", + ] + + dicts = [ModelsConfig.read_yaml(path) for path in default_config_paths] + final = merge_dict(dicts) + return ModelsConfig(**final) + + def get(self, name_or_type: str) -> Optional[LLMConfig]: + """ + Retrieves LLMConfig object by name or API type. + + Args: + name_or_type (str): Name or API type of the LLM model. + + Returns: + Optional[LLMConfig]: LLMConfig object if found, otherwise None. + """ + if not name_or_type: + return None + model = self.models.get(name_or_type) + if model: + return model + for m in self.models.values(): + if m.api_type == name_or_type: + return m + return None