From 1ead3e4d8083c258d0d418eb3cfab3564504a188 Mon Sep 17 00:00:00 2001 From: seehi <6580@pm.me> Date: Mon, 8 Jul 2024 20:55:32 +0800 Subject: [PATCH] update simple_scorer --- examples/exp_pool/scorer.py | 19 ++++--- metagpt/exp_pool/decorator.py | 2 +- metagpt/exp_pool/scorers/base.py | 16 +----- metagpt/exp_pool/scorers/simple.py | 53 ++++++------------- tests/metagpt/exp_pool/test_decorator.py | 7 ++- .../test_scorers/test_simple_scorer.py | 49 +++++++++++------ 6 files changed, 71 insertions(+), 75 deletions(-) diff --git a/examples/exp_pool/scorer.py b/examples/exp_pool/scorer.py index 1efe07bdf..c412feaf3 100644 --- a/examples/exp_pool/scorer.py +++ b/examples/exp_pool/scorer.py @@ -1,20 +1,27 @@ import asyncio from metagpt.exp_pool.scorers import SimpleScorer -from metagpt.logs import logger +REQ = "Write a program to implement quicksort in python." -def echo(req: str): - """Echo from req.""" +RESP1 = """ +def quicksort(arr): + return quicksort([x for x in arr[1:] if x <= arr[0]]) + [arr[0]] + quicksort([x for x in arr[1:] if x > arr[0]]) +""" - return req +RESP2 = """ +def quicksort(arr): + if len(arr) <= 1: + return arr + return quicksort([x for x in arr[1:] if x <= arr[0]]) + [arr[0]] + quicksort([x for x in arr[1:] if x > arr[0]]) +""" async def simple(): scorer = SimpleScorer() - score = await scorer.evaluate(echo, "data", ("data",)) - logger.info(f"The score is: {score}") + await scorer.evaluate(req=REQ, resp=RESP1) + await scorer.evaluate(req=REQ, resp=RESP2) async def main(): diff --git a/metagpt/exp_pool/decorator.py b/metagpt/exp_pool/decorator.py index 10f3355f9..4e7213dfe 100644 --- a/metagpt/exp_pool/decorator.py +++ b/metagpt/exp_pool/decorator.py @@ -159,7 +159,7 @@ class ExpCacheHandler(BaseModel): async def evaluate_experience(self): """Evaluate the experience, and save the score.""" - self._score = await self.exp_scorer.evaluate(self.func, self._resp, self.args, self.kwargs) + self._score = await self.exp_scorer.evaluate(self._req, self._resp) def save_experience(self): """Save the new experience.""" diff --git a/metagpt/exp_pool/scorers/base.py b/metagpt/exp_pool/scorers/base.py index 94623c30f..97cac4992 100644 --- a/metagpt/exp_pool/scorers/base.py +++ b/metagpt/exp_pool/scorers/base.py @@ -1,7 +1,6 @@ """Base scorer.""" from abc import ABC, abstractmethod -from typing import Any, Callable from pydantic import BaseModel, ConfigDict @@ -12,16 +11,5 @@ class BaseScorer(BaseModel, ABC): model_config = ConfigDict(arbitrary_types_allowed=True) @abstractmethod - async def evaluate(self, func: Callable, result: Any, args: tuple = None, kwargs: dict = None) -> Score: - """Evaluate the quality of the result produced by the function and parameters. - - Args: - func (Callable): The function whose result is to be evaluated. - result (Any): The result produced by the function. - args (Tuple[Any, ...]): The tuple of arguments that were passed to the function. - kwargs (Dict[str, Any]): The dictionary of keyword arguments that were passed to the function. - - Example: - result = await sample(5, name="foo") - score = await scorer.evaluate(sample, result, args=(5), kwargs={"name": "foo"}) - """ + async def evaluate(self, req: str, resp: str) -> Score: + """Evaluates the quality of a response relative to a given request.""" diff --git a/metagpt/exp_pool/scorers/simple.py b/metagpt/exp_pool/scorers/simple.py index 1fda189d1..fd7b6537b 100644 --- a/metagpt/exp_pool/scorers/simple.py +++ b/metagpt/exp_pool/scorers/simple.py @@ -1,8 +1,6 @@ """Simple scorer.""" -import inspect import json -from typing import Any, Callable from pydantic import Field @@ -13,24 +11,16 @@ from metagpt.provider.base_llm import BaseLLM from metagpt.utils.common import CodeParser SIMPLE_SCORER_TEMPLATE = """ -Role: You're an expert score evaluator. You specialize in assessing the output of the given function, based on its intended requirement and produced result. +Role: You are a highly efficient assistant, tasked with evaluating a response to a given request. The response is generated by a large language model (LLM). + +I will provide you with a request and a corresponding response. Your task is to assess this response and provide a score from a human perspective. ## Context -### Function Name -{func_name} +### Request +{req} -### Function Document -{func_doc} - -### Function Signature -{func_signature} - -### Function Parameters -args: {func_args} -kwargs: {func_kwargs} - -### Produced Result By Function and Parameters -{func_result} +### Response +{resp} ## Format Example ```json @@ -41,10 +31,10 @@ kwargs: {func_kwargs} ``` ## Instructions -- Understand the function and requirements given by the user. -- Analyze the results produced by the function. -- Grade the results based on level of alignment with the requirements. -- Provide a score on a scale defined by user or a default scale (1 to 10). +- Understand the request and response given by the user. +- Evaluate the response based on its quality relative to the given request. +- Provide a score from 1 to 10, where 10 is the best. +- Provide a reason supporting your score. ## Constraint Format: Just print the result in json format like **Format Example**. @@ -57,26 +47,17 @@ Follow instructions, generate output and make sure it follows the **Constraint** class SimpleScorer(BaseScorer): llm: BaseLLM = Field(default_factory=LLM) - async def evaluate(self, func: Callable, result: Any, args: tuple = None, kwargs: dict = None) -> Score: - """Evaluates the quality of content by LLM. + async def evaluate(self, req: str, resp: str) -> Score: + """Evaluates the quality of a response relative to a given request, as scored by an LLM. Args: - func: The function to evaluate. - result: The result produced by the function. - args: The positional arguments used when calling the function, if any. - kwargs: The keyword arguments used when calling the function, if any. + req (str): The request. + resp (str): The response. Returns: - A Score object containing the evaluation results. + Score: An object containing the score (1-10) and the reasoning. """ - prompt = SIMPLE_SCORER_TEMPLATE.format( - func_name=func.__name__, - func_doc=func.__doc__, - func_signature=inspect.signature(func), - func_args=args, - func_kwargs=kwargs, - func_result=result, - ) + prompt = SIMPLE_SCORER_TEMPLATE.format(req=req, resp=resp) resp = await self.llm.aask(prompt) resp_json = json.loads(CodeParser.parse_code(resp, lang="json")) diff --git a/tests/metagpt/exp_pool/test_decorator.py b/tests/metagpt/exp_pool/test_decorator.py index c0b3fe36d..0c02dcdfc 100644 --- a/tests/metagpt/exp_pool/test_decorator.py +++ b/tests/metagpt/exp_pool/test_decorator.py @@ -2,6 +2,8 @@ import asyncio import pytest +from metagpt.config2 import Config +from metagpt.configs.exp_pool_config import ExperiencePoolConfig from metagpt.exp_pool.context_builders import SimpleContextBuilder from metagpt.exp_pool.decorator import ExpCacheHandler, exp_cache from metagpt.exp_pool.manager import ExperienceManager @@ -20,6 +22,8 @@ class TestExpCacheHandler: def mock_exp_manager(self, mocker): manager = mocker.MagicMock(spec=ExperienceManager) manager.storage = mocker.MagicMock(spec=SimpleEngine) + manager.config = mocker.MagicMock(spec=Config) + manager.config.exp_pool = ExperiencePoolConfig() manager.query_exps = mocker.AsyncMock() manager.create_exp = mocker.MagicMock() return manager @@ -131,9 +135,10 @@ class TestExpCacheHandler: class TestExpCache: @pytest.fixture - def mock_exp_manager(self, mocker): + def mock_exp_manager(self, mocker, mock_config): manager = mocker.MagicMock(spec=ExperienceManager) manager.storage = mocker.MagicMock(spec=SimpleEngine) + manager.config = mock_config manager.query_exps = mocker.AsyncMock() manager.create_exp = mocker.MagicMock() return manager diff --git a/tests/metagpt/exp_pool/test_scorers/test_simple_scorer.py b/tests/metagpt/exp_pool/test_scorers/test_simple_scorer.py index 043f105d0..e17edfca8 100644 --- a/tests/metagpt/exp_pool/test_scorers/test_simple_scorer.py +++ b/tests/metagpt/exp_pool/test_scorers/test_simple_scorer.py @@ -1,3 +1,5 @@ +import json + import pytest from metagpt.exp_pool.schema import Score @@ -20,30 +22,43 @@ class TestSimpleScorer: assert isinstance(scorer.llm, BaseLLM) @pytest.mark.asyncio - async def test_evaluate(self, simple_scorer, mock_llm): - # Mock function to evaluate - def mock_func(a, b): - """This is a mock function.""" - return a + b + async def test_evaluate(self, simple_scorer, mock_llm, mocker): + # Mock request and response + req = "What is the capital of France?" + resp = "The capital of France is Paris." # Mock LLM response - mock_llm.aask.return_value = '```json\n{"val": 8, "reason": "Good performance"}\n```' + mock_llm_response = '{"val": 9, "reason": "Accurate and concise answer"}' + mock_llm.aask.return_value = f"```json\n{mock_llm_response}\n```" + + # Mock CodeParser.parse_code + mocker.patch("metagpt.utils.common.CodeParser.parse_code", return_value=mock_llm_response) # Test evaluate method - result = await simple_scorer.evaluate(mock_func, 5, args=(2, 3), kwargs={}) + result = await simple_scorer.evaluate(req, resp) # Assert LLM was called with correct prompt - expected_prompt = SIMPLE_SCORER_TEMPLATE.format( - func_name=mock_func.__name__, - func_doc=mock_func.__doc__, - func_signature="(a, b)", - func_args=(2, 3), - func_kwargs={}, - func_result=5, - ) + expected_prompt = SIMPLE_SCORER_TEMPLATE.format(req=req, resp=resp) mock_llm.aask.assert_called_once_with(expected_prompt) # Assert the result is correct assert isinstance(result, Score) - assert result.val == 8 - assert result.reason == "Good performance" + assert result.val == 9 + assert result.reason == "Accurate and concise answer" + + @pytest.mark.asyncio + async def test_evaluate_invalid_response(self, simple_scorer, mock_llm, mocker): + # Mock request and response + req = "What is the capital of France?" + resp = "The capital of France is Paris." + + # Mock LLM response with invalid JSON + mock_llm_response = "Invalid JSON" + mock_llm.aask.return_value = f"```json\n{mock_llm_response}\n```" + + # Mock CodeParser.parse_code + mocker.patch("metagpt.utils.common.CodeParser.parse_code", return_value=mock_llm_response) + + # Test evaluate method with invalid response + with pytest.raises(json.JSONDecodeError): + await simple_scorer.evaluate(req, resp)