From 1ead3e4d8083c258d0d418eb3cfab3564504a188 Mon Sep 17 00:00:00 2001
From: seehi <6580@pm.me>
Date: Mon, 8 Jul 2024 20:55:32 +0800
Subject: [PATCH] update simple_scorer: evaluate a request/response pair instead of a function call

---
 examples/exp_pool/scorer.py                   | 19 ++++---
 metagpt/exp_pool/decorator.py                 |  2 +-
 metagpt/exp_pool/scorers/base.py              | 16 +-----
 metagpt/exp_pool/scorers/simple.py            | 53 ++++++-------------
 tests/metagpt/exp_pool/test_decorator.py      |  7 ++-
 .../test_scorers/test_simple_scorer.py        | 49 +++++++++++------
 6 files changed, 71 insertions(+), 75 deletions(-)

diff --git a/examples/exp_pool/scorer.py b/examples/exp_pool/scorer.py
index 1efe07bdf..c412feaf3 100644
--- a/examples/exp_pool/scorer.py
+++ b/examples/exp_pool/scorer.py
@@ -1,20 +1,27 @@
 import asyncio
 
 from metagpt.exp_pool.scorers import SimpleScorer
-from metagpt.logs import logger
 
+REQ = "Write a program to implement quicksort in python."
 
-def echo(req: str):
-    """Echo from req."""
+RESP1 = """
+def quicksort(arr):
+    return quicksort([x for x in arr[1:] if x <= arr[0]]) + [arr[0]] + quicksort([x for x in arr[1:] if x > arr[0]])
+"""
 
-    return req
+RESP2 = """
+def quicksort(arr):
+    if len(arr) <= 1:
+        return arr
+    return quicksort([x for x in arr[1:] if x <= arr[0]]) + [arr[0]] + quicksort([x for x in arr[1:] if x > arr[0]])
+"""
 
 
 async def simple():
     scorer = SimpleScorer()
 
-    score = await scorer.evaluate(echo, "data", ("data",))
-    logger.info(f"The score is: {score}")
+    await scorer.evaluate(req=REQ, resp=RESP1)
+    await scorer.evaluate(req=REQ, resp=RESP2)
 
 
 async def main():
diff --git a/metagpt/exp_pool/decorator.py b/metagpt/exp_pool/decorator.py
index 10f3355f9..4e7213dfe 100644
--- a/metagpt/exp_pool/decorator.py
+++ b/metagpt/exp_pool/decorator.py
@@ -159,7 +159,7 @@ class ExpCacheHandler(BaseModel):
     async def evaluate_experience(self):
         """Evaluate the experience, and save the score."""
 
-        self._score = await self.exp_scorer.evaluate(self.func, self._resp, self.args, self.kwargs)
+        self._score = await self.exp_scorer.evaluate(self._req, self._resp)
 
     def save_experience(self):
         """Save the new experience."""
diff --git a/metagpt/exp_pool/scorers/base.py b/metagpt/exp_pool/scorers/base.py
index 94623c30f..97cac4992 100644
--- a/metagpt/exp_pool/scorers/base.py
+++ b/metagpt/exp_pool/scorers/base.py
@@ -1,7 +1,6 @@
 """Base scorer."""
 
 from abc import ABC, abstractmethod
-from typing import Any, Callable
 
 from pydantic import BaseModel, ConfigDict
 
@@ -12,16 +11,5 @@ class BaseScorer(BaseModel, ABC):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     @abstractmethod
-    async def evaluate(self, func: Callable, result: Any, args: tuple = None, kwargs: dict = None) -> Score:
-        """Evaluate the quality of the result produced by the function and parameters.
-
-        Args:
-            func (Callable): The function whose result is to be evaluated.
-            result (Any): The result produced by the function.
-            args (Tuple[Any, ...]): The tuple of arguments that were passed to the function.
-            kwargs (Dict[str, Any]): The dictionary of keyword arguments that were passed to the function.
-
-        Example:
-            result = await sample(5, name="foo")
-            score = await scorer.evaluate(sample, result, args=(5), kwargs={"name": "foo"})
-        """
+    async def evaluate(self, req: str, resp: str) -> Score:
+        """Evaluates the quality of a response relative to a given request."""
diff --git a/metagpt/exp_pool/scorers/simple.py b/metagpt/exp_pool/scorers/simple.py
index 1fda189d1..fd7b6537b 100644
--- a/metagpt/exp_pool/scorers/simple.py
+++ b/metagpt/exp_pool/scorers/simple.py
@@ -1,8 +1,6 @@
 """Simple scorer."""
 
-import inspect
 import json
-from typing import Any, Callable
 
 from pydantic import Field
 
@@ -13,24 +11,16 @@ from metagpt.provider.base_llm import BaseLLM
 from metagpt.utils.common import CodeParser
 
 SIMPLE_SCORER_TEMPLATE = """
-Role: You're an expert score evaluator. You specialize in assessing the output of the given function, based on its intended requirement and produced result.
+Role: You are a highly efficient assistant, tasked with evaluating a response to a given request. The response is generated by a large language model (LLM).
+
+I will provide you with a request and a corresponding response. Your task is to assess this response and provide a score from a human perspective.
 
 ## Context
-### Function Name
-{func_name}
+### Request
+{req}
 
-### Function Document
-{func_doc}
-
-### Function Signature
-{func_signature}
-
-### Function Parameters
-args: {func_args}
-kwargs: {func_kwargs}
-
-### Produced Result By Function and Parameters
-{func_result}
+### Response
+{resp}
 
 ## Format Example
 ```json
@@ -41,10 +31,10 @@ kwargs: {func_kwargs}
 ```
 
 ## Instructions
-- Understand the function and requirements given by the user.
-- Analyze the results produced by the function.
-- Grade the results based on level of alignment with the requirements.
-- Provide a score on a scale defined by user or a default scale (1 to 10).
+- Understand the request and response given by the user.
+- Evaluate the response based on its quality relative to the given request.
+- Provide a score from 1 to 10, where 10 is the best.
+- Provide a reason supporting your score.
 
 ## Constraint
 Format: Just print the result in json format like **Format Example**.
@@ -57,26 +47,17 @@ Follow instructions, generate output and make sure it follows the **Constraint**
 class SimpleScorer(BaseScorer):
     llm: BaseLLM = Field(default_factory=LLM)
 
-    async def evaluate(self, func: Callable, result: Any, args: tuple = None, kwargs: dict = None) -> Score:
-        """Evaluates the quality of content by LLM.
+    async def evaluate(self, req: str, resp: str) -> Score:
+        """Evaluates the quality of a response relative to a given request, as scored by an LLM.
 
         Args:
-            func: The function to evaluate.
-            result: The result produced by the function.
-            args: The positional arguments used when calling the function, if any.
-            kwargs: The keyword arguments used when calling the function, if any.
+            req (str): The request.
+            resp (str): The response.
 
         Returns:
-            A Score object containing the evaluation results.
+            Score: An object containing the score (1-10) and the reasoning.
         """
-        prompt = SIMPLE_SCORER_TEMPLATE.format(
-            func_name=func.__name__,
-            func_doc=func.__doc__,
-            func_signature=inspect.signature(func),
-            func_args=args,
-            func_kwargs=kwargs,
-            func_result=result,
-        )
+        prompt = SIMPLE_SCORER_TEMPLATE.format(req=req, resp=resp)
         resp = await self.llm.aask(prompt)
         resp_json = json.loads(CodeParser.parse_code(resp, lang="json"))
 
diff --git a/tests/metagpt/exp_pool/test_decorator.py b/tests/metagpt/exp_pool/test_decorator.py
index c0b3fe36d..0c02dcdfc 100644
--- a/tests/metagpt/exp_pool/test_decorator.py
+++ b/tests/metagpt/exp_pool/test_decorator.py
@@ -2,6 +2,8 @@ import asyncio
 
 import pytest
 
+from metagpt.config2 import Config
+from metagpt.configs.exp_pool_config import ExperiencePoolConfig
 from metagpt.exp_pool.context_builders import SimpleContextBuilder
 from metagpt.exp_pool.decorator import ExpCacheHandler, exp_cache
 from metagpt.exp_pool.manager import ExperienceManager
@@ -20,6 +22,8 @@ class TestExpCacheHandler:
     def mock_exp_manager(self, mocker):
         manager = mocker.MagicMock(spec=ExperienceManager)
         manager.storage = mocker.MagicMock(spec=SimpleEngine)
+        manager.config = mocker.MagicMock(spec=Config)
+        manager.config.exp_pool = ExperiencePoolConfig()
         manager.query_exps = mocker.AsyncMock()
         manager.create_exp = mocker.MagicMock()
         return manager
@@ -131,9 +135,10 @@ class TestExpCacheHandler:
 
 class TestExpCache:
     @pytest.fixture
-    def mock_exp_manager(self, mocker):
+    def mock_exp_manager(self, mocker, mock_config):
         manager = mocker.MagicMock(spec=ExperienceManager)
         manager.storage = mocker.MagicMock(spec=SimpleEngine)
+        manager.config = mock_config
         manager.query_exps = mocker.AsyncMock()
         manager.create_exp = mocker.MagicMock()
         return manager
diff --git a/tests/metagpt/exp_pool/test_scorers/test_simple_scorer.py b/tests/metagpt/exp_pool/test_scorers/test_simple_scorer.py
index 043f105d0..e17edfca8 100644
--- a/tests/metagpt/exp_pool/test_scorers/test_simple_scorer.py
+++ b/tests/metagpt/exp_pool/test_scorers/test_simple_scorer.py
@@ -1,3 +1,5 @@
+import json
+
 import pytest
 
 from metagpt.exp_pool.schema import Score
@@ -20,30 +22,43 @@ class TestSimpleScorer:
         assert isinstance(scorer.llm, BaseLLM)
 
     @pytest.mark.asyncio
-    async def test_evaluate(self, simple_scorer, mock_llm):
-        # Mock function to evaluate
-        def mock_func(a, b):
-            """This is a mock function."""
-            return a + b
+    async def test_evaluate(self, simple_scorer, mock_llm, mocker):
+        # Mock request and response
+        req = "What is the capital of France?"
+        resp = "The capital of France is Paris."
 
         # Mock LLM response
-        mock_llm.aask.return_value = '```json\n{"val": 8, "reason": "Good performance"}\n```'
+        mock_llm_response = '{"val": 9, "reason": "Accurate and concise answer"}'
+        mock_llm.aask.return_value = f"```json\n{mock_llm_response}\n```"
+
+        # Mock CodeParser.parse_code
+        mocker.patch("metagpt.utils.common.CodeParser.parse_code", return_value=mock_llm_response)
 
         # Test evaluate method
-        result = await simple_scorer.evaluate(mock_func, 5, args=(2, 3), kwargs={})
+        result = await simple_scorer.evaluate(req, resp)
 
         # Assert LLM was called with correct prompt
-        expected_prompt = SIMPLE_SCORER_TEMPLATE.format(
-            func_name=mock_func.__name__,
-            func_doc=mock_func.__doc__,
-            func_signature="(a, b)",
-            func_args=(2, 3),
-            func_kwargs={},
-            func_result=5,
-        )
+        expected_prompt = SIMPLE_SCORER_TEMPLATE.format(req=req, resp=resp)
         mock_llm.aask.assert_called_once_with(expected_prompt)
 
         # Assert the result is correct
         assert isinstance(result, Score)
-        assert result.val == 8
-        assert result.reason == "Good performance"
+        assert result.val == 9
+        assert result.reason == "Accurate and concise answer"
+
+    @pytest.mark.asyncio
+    async def test_evaluate_invalid_response(self, simple_scorer, mock_llm, mocker):
+        # Mock request and response
+        req = "What is the capital of France?"
+        resp = "The capital of France is Paris."
+
+        # Mock LLM response with invalid JSON
+        mock_llm_response = "Invalid JSON"
+        mock_llm.aask.return_value = f"```json\n{mock_llm_response}\n```"
+
+        # Mock CodeParser.parse_code
+        mocker.patch("metagpt.utils.common.CodeParser.parse_code", return_value=mock_llm_response)
+
+        # Test evaluate method with invalid response
+        with pytest.raises(json.JSONDecodeError):
+            await simple_scorer.evaluate(req, resp)