mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-08 15:05:17 +02:00
update simple_scorer
This commit is contained in:
parent
f61506bd32
commit
1ead3e4d80
6 changed files with 71 additions and 75 deletions
|
|
@ -1,20 +1,27 @@
|
|||
import asyncio
|
||||
|
||||
from metagpt.exp_pool.scorers import SimpleScorer
|
||||
from metagpt.logs import logger
|
||||
|
||||
REQ = "Write a program to implement quicksort in python."
|
||||
|
||||
def echo(req: str):
|
||||
"""Echo from req."""
|
||||
RESP1 = """
|
||||
def quicksort(arr):
|
||||
return quicksort([x for x in arr[1:] if x <= arr[0]]) + [arr[0]] + quicksort([x for x in arr[1:] if x > arr[0]])
|
||||
"""
|
||||
|
||||
return req
|
||||
RESP2 = """
|
||||
def quicksort(arr):
|
||||
if len(arr) <= 1:
|
||||
return arr
|
||||
return quicksort([x for x in arr[1:] if x <= arr[0]]) + [arr[0]] + quicksort([x for x in arr[1:] if x > arr[0]])
|
||||
"""
|
||||
|
||||
|
||||
async def simple():
|
||||
scorer = SimpleScorer()
|
||||
|
||||
score = await scorer.evaluate(echo, "data", ("data",))
|
||||
logger.info(f"The score is: {score}")
|
||||
await scorer.evaluate(req=REQ, resp=RESP1)
|
||||
await scorer.evaluate(req=REQ, resp=RESP2)
|
||||
|
||||
|
||||
async def main():
|
||||
|
|
|
|||
|
|
@ -159,7 +159,7 @@ class ExpCacheHandler(BaseModel):
|
|||
async def evaluate_experience(self):
|
||||
"""Evaluate the experience, and save the score."""
|
||||
|
||||
self._score = await self.exp_scorer.evaluate(self.func, self._resp, self.args, self.kwargs)
|
||||
self._score = await self.exp_scorer.evaluate(self._req, self._resp)
|
||||
|
||||
def save_experience(self):
|
||||
"""Save the new experience."""
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
"""Base scorer."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Callable
|
||||
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
|
|
@ -12,16 +11,5 @@ class BaseScorer(BaseModel, ABC):
|
|||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
@abstractmethod
|
||||
async def evaluate(self, func: Callable, result: Any, args: tuple = None, kwargs: dict = None) -> Score:
|
||||
"""Evaluate the quality of the result produced by the function and parameters.
|
||||
|
||||
Args:
|
||||
func (Callable): The function whose result is to be evaluated.
|
||||
result (Any): The result produced by the function.
|
||||
args (Tuple[Any, ...]): The tuple of arguments that were passed to the function.
|
||||
kwargs (Dict[str, Any]): The dictionary of keyword arguments that were passed to the function.
|
||||
|
||||
Example:
|
||||
result = await sample(5, name="foo")
|
||||
score = await scorer.evaluate(sample, result, args=(5), kwargs={"name": "foo"})
|
||||
"""
|
||||
async def evaluate(self, req: str, resp: str) -> Score:
|
||||
"""Evaluates the quality of a response relative to a given request."""
|
||||
|
|
|
|||
|
|
@ -1,8 +1,6 @@
|
|||
"""Simple scorer."""
|
||||
|
||||
import inspect
|
||||
import json
|
||||
from typing import Any, Callable
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
|
|
@ -13,24 +11,16 @@ from metagpt.provider.base_llm import BaseLLM
|
|||
from metagpt.utils.common import CodeParser
|
||||
|
||||
SIMPLE_SCORER_TEMPLATE = """
|
||||
Role: You're an expert score evaluator. You specialize in assessing the output of the given function, based on its intended requirement and produced result.
|
||||
Role: You are a highly efficient assistant, tasked with evaluating a response to a given request. The response is generated by a large language model (LLM).
|
||||
|
||||
I will provide you with a request and a corresponding response. Your task is to assess this response and provide a score from a human perspective.
|
||||
|
||||
## Context
|
||||
### Function Name
|
||||
{func_name}
|
||||
### Request
|
||||
{req}
|
||||
|
||||
### Function Document
|
||||
{func_doc}
|
||||
|
||||
### Function Signature
|
||||
{func_signature}
|
||||
|
||||
### Function Parameters
|
||||
args: {func_args}
|
||||
kwargs: {func_kwargs}
|
||||
|
||||
### Produced Result By Function and Parameters
|
||||
{func_result}
|
||||
### Response
|
||||
{resp}
|
||||
|
||||
## Format Example
|
||||
```json
|
||||
|
|
@ -41,10 +31,10 @@ kwargs: {func_kwargs}
|
|||
```
|
||||
|
||||
## Instructions
|
||||
- Understand the function and requirements given by the user.
|
||||
- Analyze the results produced by the function.
|
||||
- Grade the results based on level of alignment with the requirements.
|
||||
- Provide a score on a scale defined by user or a default scale (1 to 10).
|
||||
- Understand the request and response given by the user.
|
||||
- Evaluate the response based on its quality relative to the given request.
|
||||
- Provide a score from 1 to 10, where 10 is the best.
|
||||
- Provide a reason supporting your score.
|
||||
|
||||
## Constraint
|
||||
Format: Just print the result in json format like **Format Example**.
|
||||
|
|
@ -57,26 +47,17 @@ Follow instructions, generate output and make sure it follows the **Constraint**
|
|||
class SimpleScorer(BaseScorer):
|
||||
llm: BaseLLM = Field(default_factory=LLM)
|
||||
|
||||
async def evaluate(self, func: Callable, result: Any, args: tuple = None, kwargs: dict = None) -> Score:
|
||||
"""Evaluates the quality of content by LLM.
|
||||
async def evaluate(self, req: str, resp: str) -> Score:
|
||||
"""Evaluates the quality of a response relative to a given request, as scored by an LLM.
|
||||
|
||||
Args:
|
||||
func: The function to evaluate.
|
||||
result: The result produced by the function.
|
||||
args: The positional arguments used when calling the function, if any.
|
||||
kwargs: The keyword arguments used when calling the function, if any.
|
||||
req (str): The request.
|
||||
resp (str): The response.
|
||||
|
||||
Returns:
|
||||
A Score object containing the evaluation results.
|
||||
Score: An object containing the score (1-10) and the reasoning.
|
||||
"""
|
||||
prompt = SIMPLE_SCORER_TEMPLATE.format(
|
||||
func_name=func.__name__,
|
||||
func_doc=func.__doc__,
|
||||
func_signature=inspect.signature(func),
|
||||
func_args=args,
|
||||
func_kwargs=kwargs,
|
||||
func_result=result,
|
||||
)
|
||||
prompt = SIMPLE_SCORER_TEMPLATE.format(req=req, resp=resp)
|
||||
resp = await self.llm.aask(prompt)
|
||||
resp_json = json.loads(CodeParser.parse_code(resp, lang="json"))
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,8 @@ import asyncio
|
|||
|
||||
import pytest
|
||||
|
||||
from metagpt.config2 import Config
|
||||
from metagpt.configs.exp_pool_config import ExperiencePoolConfig
|
||||
from metagpt.exp_pool.context_builders import SimpleContextBuilder
|
||||
from metagpt.exp_pool.decorator import ExpCacheHandler, exp_cache
|
||||
from metagpt.exp_pool.manager import ExperienceManager
|
||||
|
|
@ -20,6 +22,8 @@ class TestExpCacheHandler:
|
|||
def mock_exp_manager(self, mocker):
|
||||
manager = mocker.MagicMock(spec=ExperienceManager)
|
||||
manager.storage = mocker.MagicMock(spec=SimpleEngine)
|
||||
manager.config = mocker.MagicMock(spec=Config)
|
||||
manager.config.exp_pool = ExperiencePoolConfig()
|
||||
manager.query_exps = mocker.AsyncMock()
|
||||
manager.create_exp = mocker.MagicMock()
|
||||
return manager
|
||||
|
|
@ -131,9 +135,10 @@ class TestExpCacheHandler:
|
|||
|
||||
class TestExpCache:
|
||||
@pytest.fixture
|
||||
def mock_exp_manager(self, mocker):
|
||||
def mock_exp_manager(self, mocker, mock_config):
|
||||
manager = mocker.MagicMock(spec=ExperienceManager)
|
||||
manager.storage = mocker.MagicMock(spec=SimpleEngine)
|
||||
manager.config = mock_config
|
||||
manager.query_exps = mocker.AsyncMock()
|
||||
manager.create_exp = mocker.MagicMock()
|
||||
return manager
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from metagpt.exp_pool.schema import Score
|
||||
|
|
@ -20,30 +22,43 @@ class TestSimpleScorer:
|
|||
assert isinstance(scorer.llm, BaseLLM)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate(self, simple_scorer, mock_llm):
|
||||
# Mock function to evaluate
|
||||
def mock_func(a, b):
|
||||
"""This is a mock function."""
|
||||
return a + b
|
||||
async def test_evaluate(self, simple_scorer, mock_llm, mocker):
|
||||
# Mock request and response
|
||||
req = "What is the capital of France?"
|
||||
resp = "The capital of France is Paris."
|
||||
|
||||
# Mock LLM response
|
||||
mock_llm.aask.return_value = '```json\n{"val": 8, "reason": "Good performance"}\n```'
|
||||
mock_llm_response = '{"val": 9, "reason": "Accurate and concise answer"}'
|
||||
mock_llm.aask.return_value = f"```json\n{mock_llm_response}\n```"
|
||||
|
||||
# Mock CodeParser.parse_code
|
||||
mocker.patch("metagpt.utils.common.CodeParser.parse_code", return_value=mock_llm_response)
|
||||
|
||||
# Test evaluate method
|
||||
result = await simple_scorer.evaluate(mock_func, 5, args=(2, 3), kwargs={})
|
||||
result = await simple_scorer.evaluate(req, resp)
|
||||
|
||||
# Assert LLM was called with correct prompt
|
||||
expected_prompt = SIMPLE_SCORER_TEMPLATE.format(
|
||||
func_name=mock_func.__name__,
|
||||
func_doc=mock_func.__doc__,
|
||||
func_signature="(a, b)",
|
||||
func_args=(2, 3),
|
||||
func_kwargs={},
|
||||
func_result=5,
|
||||
)
|
||||
expected_prompt = SIMPLE_SCORER_TEMPLATE.format(req=req, resp=resp)
|
||||
mock_llm.aask.assert_called_once_with(expected_prompt)
|
||||
|
||||
# Assert the result is correct
|
||||
assert isinstance(result, Score)
|
||||
assert result.val == 8
|
||||
assert result.reason == "Good performance"
|
||||
assert result.val == 9
|
||||
assert result.reason == "Accurate and concise answer"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_evaluate_invalid_response(self, simple_scorer, mock_llm, mocker):
|
||||
# Mock request and response
|
||||
req = "What is the capital of France?"
|
||||
resp = "The capital of France is Paris."
|
||||
|
||||
# Mock LLM response with invalid JSON
|
||||
mock_llm_response = "Invalid JSON"
|
||||
mock_llm.aask.return_value = f"```json\n{mock_llm_response}\n```"
|
||||
|
||||
# Mock CodeParser.parse_code
|
||||
mocker.patch("metagpt.utils.common.CodeParser.parse_code", return_value=mock_llm_response)
|
||||
|
||||
# Test evaluate method with invalid response
|
||||
with pytest.raises(json.JSONDecodeError):
|
||||
await simple_scorer.evaluate(req, resp)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue