Update Multi LLM Config & Basic Evaluator

2026-07-26 17:11:07 +02:00 · 2024-08-25 22:16:39 +08:00 · 2024-08-25 22:16:39 +08:00 · 1593e98c45
commit 1593e98c45
parent a3ff25430e
7 changed files with 302 additions and 29 deletions
--- a/examples/ags/w_action_node/evaluator.py
+++ b/examples/ags/w_action_node/evaluator.py
@ -2,8 +2,13 @@
 # @Date    : 8/23/2024 10:00 AM
 # @Author  : all
 # @Desc    : evaluate for different dataset
+import datetime
+import os
 from typing import Literal

+import pandas as pd
+from deepeval.benchmarks import GSM8K
+
 # TODO 完成实验数据集的手动划分

 DatasetType = Literal["HumanEval", "MMBP", "Gsm8K", "MATH", "HotpotQa", "MMLU"]
@ -17,10 +22,12 @@ class Evaluator:
    def __init__(self, eval_path: str):
        pass

-    def validation_evaluate(self, dataset: DatasetType):
+    def validation_evaluate(self, dataset: DatasetType, result_path: str):
        """
        Evaluates on validation dataset.
        """
+        if dataset == "Gsm8K":
+            return self._gsm8k_eval(result_path)
        pass

    def test_evaluate(self, dataset: DatasetType):
@ -28,3 +35,113 @@ class Evaluator:
        Evaluates on test dataset.
        """
        pass
+
+    def _gsm8k_eval(self, model, result_path, samples: int = 1000, max_retries: int = 50):
+        """
+        Evaluate on GSM8K dataset.
+
+        Each problem's input is passed to ``model.a_generate`` and scored with the
+        benchmark's exact-match scorer. The raw rows are written to ``result_path``;
+        that CSV is then re-read and re-scored numerically by ``process_gsm8k_csv``,
+        which also writes a copy named ``<average_score>.csv`` into the same directory.
+
+        Args:
+            model: Object exposing ``a_generate(prompt)``.
+            result_path: Destination CSV path for the raw predictions.
+            samples: Number of GSM8K problems requested from the benchmark.
+            max_retries: Attempts per problem before it is skipped with score 0.
+
+        Returns:
+            dict: ``{"score": average_score}``, the tolerance-based average computed
+            from the saved CSV.
+
+        Raises:
+            ValueError: If ``model`` is None.
+        """
+        if model is None:
+            raise ValueError("Model is required for evaluation.")
+
+        benchmark = GSM8K(n_problems=samples, n_shots=0, enable_cot=False)
+        goldens = benchmark.load_benchmark_dataset()[: benchmark.n_problems]
+
+        def _evaluate_problem(golden):
+            """Return (input, prediction, expected output, score) for one problem."""
+            prompt = golden.input
+
+            for attempt in range(1, max_retries + 1):
+                try:
+                    # NOTE(review): the `a_` prefix suggests an async API; if `a_generate`
+                    # returns a coroutine it is never awaited here - confirm against the model class.
+                    prediction = model.a_generate(prompt)
+                    score = benchmark.scorer.exact_match_score(golden.expected_output, prediction)
+                    break
+                except Exception as e:
+                    print(f"Error generating prediction: {e}. Retrying... ({attempt}/{max_retries})")
+            else:
+                # Every attempt failed: record an empty prediction with score 0.
+                print("Maximum retries reached. Skipping this sample.")
+                prediction = None
+                score = 0
+
+            return golden.input, str(prediction), golden.expected_output, score
+
+        results = [_evaluate_problem(golden) for golden in goldens]
+
+        # Divide by the number of problems actually evaluated; the dataset slice may be
+        # shorter than `benchmark.n_problems`, and may be empty.
+        overall_correct_predictions = sum(score for _, _, _, score in results)
+        overall_total_predictions = len(results)
+        overall_accuracy = (
+            overall_correct_predictions / overall_total_predictions if overall_total_predictions else 0.0
+        )
+
+        def process_gsm8k_csv(file_path, tolerance=1e-6):
+            """Re-score a predictions CSV numerically and return the average score."""
+            # Read every column as a string so the cleaning below is uniform.
+            df = pd.read_csv(file_path, dtype=str)
+
+            # Strip whitespace and thousands separators, then convert to numbers;
+            # anything that is not numeric becomes NaN.
+            for column in ("prediction", "expected output"):
+                df[column] = df[column].str.strip()
+                df[column] = df[column].str.replace(",", "", regex=True)
+                df[column] = pd.to_numeric(df[column], errors="coerce")
+
+            # Approximate equality for floats; comparisons involving NaN yield False.
+            df["score"] = (df["prediction"] - df["expected output"]).abs() <= tolerance
+            df["score"] = df["score"].astype(int)
+
+            average_score = df["score"].mean()
+
+            # Write the re-scored table next to the input, named after the score.
+            input_dir = os.path.dirname(file_path)
+            output_file_name = f"{average_score:.4f}.csv"
+            output_file_path = os.path.join(input_dir, output_file_name)
+            df.to_csv(output_file_path, index=False)
+
+            print(f"Data written to {output_file_path}")
+            print(f"Average score: {average_score:.4f}")
+
+            # Count empty predictions, then report correct/incorrect over the rest.
+            num_empty_predictions = df["prediction"].isna().sum()
+            df = df.dropna(subset=["prediction"])
+            num_correct = (df["score"] == 1).sum()
+            num_incorrect = (df["score"] == 0).sum()
+
+            print(f"Number of empty predictions: {num_empty_predictions}")
+            print(f"Number of correct predictions after removing empty ones: {num_correct}")
+            print(f"Number of incorrect predictions after removing empty ones: {num_incorrect}")
+
+            return average_score
+
+        benchmark.predictions = pd.DataFrame(
+            results, columns=["input", "prediction", "expected output", "score"]
+        )
+        benchmark.overall_score = overall_accuracy
+
+        benchmark.predictions.to_csv(result_path, index=False)
+
+        score = process_gsm8k_csv(file_path=result_path)
+        return {"score": score}
--- a/examples/ags/w_action_node/graph.py
+++ b/examples/ags/w_action_node/graph.py
@ -5,7 +5,7 @@

 from typing import Literal

-from metagpt.llm import LLM
+from metagpt.provider.llm_provider_registry import create_llm_instance
 from metagpt.utils.cost_manager import CostManager

 DatasetType = Literal["HumanEval", "MMBP", "Gsm8K", "MATH", "HotpotQa", "MMLU"]
@ -17,16 +17,16 @@ class Graph:
    def __init__(
        self,
        name: str,
-        llm: LLM,
+        llm_config,
        dataset: DatasetType,
    ) -> None:
        self.name = name
-        self.model = llm
        self.dataset = dataset
-        self.cost = cost_manager  # TODO
+        self.llm = create_llm_instance(llm_config)
+        self.llm.cost_manager = CostManager()

-    def __call__():
+    def __call__(self):
        """
        Implementation of the graph
        """
-        NotImplementedError("Subclasses must implement __call__ method")
+        return self.llm.cost_manager.total_cost
--- a/examples/ags/w_action_node/optimizer.py
+++ b/examples/ags/w_action_node/optimizer.py
@ -49,6 +49,7 @@ class Optimizer:
        operators: List,
        optimized_path: str = None,
        sample: int = 6,
+        q_type: str = "math",  # math,code,quiz
    ) -> None:
        self.optimize_llm = opt_llm
        self.execute_llm = exec_llm
@ -61,6 +62,7 @@ class Optimizer:
        self.sample = sample
        self.score = "None"
        self.top_scores = []
+        self.type = q_type
        self.round = 1  # 起始轮次

    def _initialize_oprimizer(self):
@ -130,33 +132,57 @@ class Optimizer:
        # examples/ags/w_action_node/optimized/gsm8k/graphs/round_1
        prompt_file_path = os.path.join(graphs_path, "prompt.py")
        graph_file_path = os.path.join(graphs_path, "graph.py")
+        operator_file_path = os.path.join(graphs_path, "operator.py")

        try:
            with open(prompt_file_path, "r", encoding="utf-8") as file:
                prompt_content = file.read()
            with open(graph_file_path, "r", encoding="utf-8") as file:
                graph_content = file.read()
+            with open(operator_file_path, "r", encoding="utf-8") as file:
+                operator_content = file.read()
        except FileNotFoundError as e:
            print(f"Error: File not found for round {round_number}: {e}")
            raise
        except Exception as e:
            print(f"Error loading prompt for round {round_number}: {e}")
            raise
-        return prompt_content, graph_content
+        return prompt_content, graph_content, operator_content

    def _load_scores(self):
-        """
-        # TODO 重写这个函数，写一个新的结构存储分数
-        """
-        round_number = 1
-        score = 1
+        rounds_dir = os.path.join(self.root_path, "graphs")
+        self.top_scores = []

-        self.top_scores.append(
-            {
-                "round": round_number,
-                "score": score,
-            }
-        )
+        # 遍历所有轮次的文件夹
+        for round_dir in os.listdir(rounds_dir):
+            if os.path.isdir(os.path.join(rounds_dir, round_dir)) and round_dir.startswith("round_"):
+                round_number = int(round_dir.replace("round_", ""))
+                csv_file_path = os.path.join(rounds_dir, round_dir)
+                try:
+                    # 遍历文件夹中的文件，查找 CSV 文件
+                    for filename in os.listdir(csv_file_path):
+                        score = 0
+
+                        if filename.endswith(".csv"):
+                            # 文件名就是分数
+                            score = float(filename[:-4])  # 去除.csv
+
+                        self.top_scores.append(
+                            {
+                                "round": round_number,
+                                "score": score,
+                            }
+                        )
+
+                except FileNotFoundError as e:
+                    print(f"Error: File not found for round {round_number}: {e}")
+                    continue
+                except ValueError as e:
+                    print(f"Error parsing score from filename for round {round_number}: {e}")
+                    continue
+                except Exception as e:
+                    print(f"Error processing round {round_number}: {e}")
+                    continue

        # 对所有轮次的分数进行排序
        self.top_scores.sort(key=lambda x: x["score"], reverse=True)
@ -272,7 +298,7 @@ class Optimizer:

        print(top_rounds)

-        prompt, graph_load = self._read_files(sample["round"])
+        prompt, graph_load, operator = self._read_files(sample["round"])
        score = sample["score"]

        # 正则表达式匹配 SolveGraph 开始的内容
@ -298,9 +324,12 @@ class Optimizer:
        else:
            experience = f"No experience data found for round {current_round}."

-        graph_input = GRAPH_INPUT.format(experinece=experience, score=score, graph=graph[0], prompt=prompt)
+        graph_input = GRAPH_INPUT.format(
+            experinece=experience, score=score, graph=graph[0], prompt=prompt, type=self.type
+        )
+        graph_system = GRAPH_OPTIMIZE_PROMPT.format(type=self.type)

-        node_prompt = GRAPH_OPTIMIZE_PROMPT + graph_input  # TODO 看一眼谁先谁后这个地方
+        node_prompt = graph_system + graph_input  # TODO 看一眼谁先谁后这个地方

        node = await ActionNode.from_pydantic(GraphOptimize).fill(
            context=node_prompt, mode="context_fill", llm=self.llm
--- a/metagpt/actions/action.py
+++ b/metagpt/actions/action.py
@ -8,12 +8,14 @@

 from __future__ import annotations

-from typing import Optional, Union
+from typing import Any, Optional, Union

 from pydantic import BaseModel, ConfigDict, Field, model_validator

 from metagpt.actions.action_node import ActionNode
+from metagpt.configs.models_config import ModelsConfig
 from metagpt.context_mixin import ContextMixin
+from metagpt.provider.llm_provider_registry import create_llm_instance
 from metagpt.schema import (
    CodePlanAndChangeContext,
    CodeSummarizeContext,
@ -35,6 +37,19 @@ class Action(SerializationMixin, ContextMixin, BaseModel):
    prefix: str = ""  # aask*时会加上prefix，作为system_message
    desc: str = ""  # for skill manager
    node: ActionNode = Field(default=None, exclude=True)
+    # The model name or API type of LLM of the `models` in the `config2.yaml`;
+    #   Using `None` to use the `llm` configuration in the `config2.yaml`.
+    llm_name_or_type: Optional[str] = None
+
+    @model_validator(mode="after")
+    def _update_private_llm(self) -> "Action":
+        """Swap in a dedicated LLM when `llm_name_or_type` matches a configured model.
+
+        Looks up `llm_name_or_type` in the default `ModelsConfig`. When a config is
+        found, a new LLM instance is created from it and assigned to this action;
+        otherwise the action's existing `llm` is left untouched.
+
+        Written as an instance method, which is the form pydantic documents for
+        `mode="after"` validators.
+
+        Returns:
+            The validated action itself.
+        """
+        config = ModelsConfig.default().get(self.llm_name_or_type)
+        if config:
+            llm = create_llm_instance(config)
+            # Carry over the cost manager of the LLM being replaced.
+            llm.cost_manager = self.llm.cost_manager
+            self.llm = llm
+        return self

    @property
    def repo(self) -> ProjectRepo:
--- a/metagpt/actions/action_node.py
+++ b/metagpt/actions/action_node.py
@ -504,7 +504,6 @@ class ActionNode:
 ### format example (must be strictly followed) (do not include any other formats except for the given XML format)
 {example_str}
 """
-        print(context)
        return context

    async def code_fill(self, context, function_name=None, timeout=USE_CONFIG_TIMEOUT):
--- a/metagpt/configs/llm_config.py
+++ b/metagpt/configs/llm_config.py
@ -10,9 +10,9 @@ from typing import Optional

 from pydantic import field_validator

-from metagpt.const import LLM_API_TIMEOUT
+from metagpt.const import CONFIG_ROOT, LLM_API_TIMEOUT, METAGPT_ROOT
 from metagpt.utils.yaml_model import YamlModel
-from metagpt.const import METAGPT_ROOT, CONFIG_ROOT
+

 class LLMType(Enum):
    OPENAI = "openai"
@ -97,12 +97,13 @@ class LLMConfig(YamlModel):
            repo_config_path = METAGPT_ROOT / "config/config2.yaml"
            root_config_path = CONFIG_ROOT / "config2.yaml"
            if root_config_path.exists():
-                 raise ValueError(
-                    f"Please set your API key in {root_config_path}. If you also set your config in {repo_config_path}, \nthe former will overwrite the latter. This may cause unexpected result.\n")
+                raise ValueError(
+                    f"Please set your API key in {root_config_path}. If you also set your config in {repo_config_path}, \nthe former will overwrite the latter. This may cause unexpected result.\n"
+                )
            elif repo_config_path.exists():
                raise ValueError(f"Please set your API key in {repo_config_path}")
            else:
-                raise ValueError(f"Please set your API key in config2.yaml")
+                raise ValueError("Please set your API key in config2.yaml")
        return v

    @field_validator("timeout")
--- a/metagpt/configs/models_config.py
+++ b/metagpt/configs/models_config.py
@ -0,0 +1,112 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+models_config.py
+
+This module defines the ModelsConfig class for handling configuration of LLM models.
+
+Attributes:
+    CONFIG_ROOT (Path): Root path for configuration files.
+    METAGPT_ROOT (Path): Root path for MetaGPT files.
+
+Classes:
+    ModelsConfig (YamlModel): Configuration class for LLM models.
+"""
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from pydantic import Field, field_validator
+
+from metagpt.config2 import merge_dict
+from metagpt.configs.llm_config import LLMConfig
+from metagpt.const import CONFIG_ROOT, METAGPT_ROOT
+from metagpt.utils.yaml_model import YamlModel
+
+
+class ModelsConfig(YamlModel):
+    """
+    Configuration class for `models` in `config2.yaml`.
+
+    Attributes:
+        models (Dict[str, LLMConfig]): Dictionary mapping model names or types to LLMConfig objects.
+
+    Methods:
+        update_llm_model(cls, value): Fills in a missing `model` field from the dictionary key.
+        from_home(cls, path): Loads configuration from `CONFIG_ROOT / path`.
+        default(cls): Loads and merges configuration from the predefined paths.
+        get(self, name_or_type: str) -> Optional[LLMConfig]: Retrieves LLMConfig by name or API type.
+    """
+
+    models: Dict[str, LLMConfig] = Field(default_factory=dict)
+
+    @field_validator("models", mode="before")
+    @classmethod
+    def update_llm_model(cls, value):
+        """
+        Default each entry's `model` field to its dictionary key.
+
+        Args:
+            value (Dict[str, Union[LLMConfig, dict]]): Dictionary of LLM configurations.
+
+        Returns:
+            Dict[str, Union[LLMConfig, dict]]: The same dictionary with `model` filled in
+            where it was empty. `None` (an empty `models:` key) becomes an empty dict.
+        """
+        if value is None:
+            return {}
+        if not isinstance(value, dict):
+            # Leave it to field validation to report the wrong type.
+            return value
+        for key, config in value.items():
+            if isinstance(config, LLMConfig):
+                config.model = config.model or key
+            elif isinstance(config, dict):
+                config["model"] = config.get("model") or key
+        return value
+
+    @classmethod
+    def from_home(cls, path):
+        """
+        Loads configuration from `CONFIG_ROOT / path`.
+
+        Args:
+            path (str): Path of the configuration file, relative to `CONFIG_ROOT`.
+
+        Returns:
+            Optional[ModelsConfig]: Loaded configuration, or None if the file doesn't exist.
+        """
+        pathname = CONFIG_ROOT / path
+        if not pathname.exists():
+            return None
+        # Use `cls` so subclasses load as their own type.
+        return cls.from_yaml_file(pathname)
+
+    @classmethod
+    def default(cls):
+        """
+        Loads default configuration from predefined paths.
+
+        The YAML files under `METAGPT_ROOT / "config/config2.yaml"` and
+        `CONFIG_ROOT / "config2.yaml"` are read in that order and combined with `merge_dict`.
+
+        Returns:
+            ModelsConfig: Configuration built from the merged dictionaries.
+        """
+        default_config_paths: List[Path] = [
+            METAGPT_ROOT / "config/config2.yaml",
+            CONFIG_ROOT / "config2.yaml",
+        ]
+
+        dicts = [cls.read_yaml(path) for path in default_config_paths]
+        final = merge_dict(dicts)
+        return cls(**final)
+
+    def get(self, name_or_type: str) -> Optional[LLMConfig]:
+        """
+        Retrieves LLMConfig object by name or API type.
+
+        The `models` key is tried first; if there is no such key, the first entry
+        whose `api_type` equals `name_or_type` is returned.
+
+        Args:
+            name_or_type (str): Name or API type of the LLM model.
+
+        Returns:
+            Optional[LLMConfig]: LLMConfig object if found, otherwise None.
+        """
+        if not name_or_type:
+            return None
+        model = self.models.get(name_or_type)
+        if model:
+            return model
+        for m in self.models.values():
+            if m.api_type == name_or_type:
+                return m
+        return None