Update readme and better optimizer

This commit is contained in:
didi 2024-10-23 12:54:42 +08:00
parent 4564b70d75
commit d2f90dbda0
11 changed files with 173 additions and 7 deletions

View file

@ -0,0 +1,29 @@
# Custom Evaluation Function via Benchmark Class
## How to Use
To create a benchmark for a new dataset, follow these steps:
1. Create a new Python file, e.g., `my_dataset_benchmark.py`
2. Import the base class:
```python
from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark
```
3. Create a new class that inherits from `BaseBenchmark`:
```python
class MyDatasetBenchmark(BaseBenchmark):
    """Benchmark for a custom dataset, built on ``BaseBenchmark``."""

    def __init__(self, name: str, file_path: str, log_path: str):
        # Forward the benchmark name, data file path and log path to the base class.
        super().__init__(name, file_path, log_path)
```
4. Implement the required abstract methods:
- `evaluate_problem`: Evaluate a single problem
- `calculate_score`: Calculate the score for a prediction
- `get_result_columns`: Define column names for the results CSV file
5. Override other methods as needed, such as `load_data` or `save_results_to_csv`
## Example
Refer to the `DROPBenchmark` class in the `drop.py` file for an example of how to implement a benchmark for a specific dataset.
By following these guidelines, you can easily create custom benchmark evaluations for new datasets.

View file

@ -64,8 +64,18 @@ datasets_to_download: Dict[str, Dict[str, str]] = {
}
def is_directory_empty(path: str) -> bool:
    """Return True when the directory at *path* contains no entries.

    Args:
        path: Path of an existing directory.

    Returns:
        True if the directory has no entries, False otherwise.

    Raises:
        OSError: If *path* does not exist or is not a directory
            (for example ``FileNotFoundError`` / ``NotADirectoryError``).
    """
    # os.scandir yields entries lazily, so we can stop at the first one instead
    # of building the complete listing just to compare its length with zero.
    with os.scandir(path) as entries:
        return next(entries, None) is None
def download(datasets):
    """Download and extract every selected dataset that is not already present.

    Args:
        datasets: Iterable of dataset names; each must be a key of
            ``datasets_to_download``.

    Raises:
        KeyError: If a name is not a key of ``datasets_to_download``.
    """
    for dataset_name in datasets:
        dataset = datasets_to_download[dataset_name]
        extract_path = dataset["extract_path"]
        # Check the target folder *before* doing any work: a non-empty folder
        # is treated as an already-downloaded dataset and is left untouched.
        if os.path.exists(extract_path) and not is_directory_empty(extract_path):
            logger.info(f"Target folder {extract_path} for {dataset_name} is not empty, skipping download and extraction.")
            continue
        process_dataset(dataset["url"], dataset["filename"], extract_path)

View file

@ -45,8 +45,10 @@ class Evaluator:
# Use params to configure the graph and benchmark
configured_graph = await self._configure_graph(dataset, graph, params)
va_list = [1, 2, 3] # Use va_list from params, or use default value if not provided
if is_test:
va_list = None # For test data, generally use None to test all
else:
va_list = None # Use None to test all Validation data, or set va_list (e.g., [1, 2, 3]) to use partial data
return await benchmark.run_evaluation(configured_graph, va_list)
async def _configure_graph(self, dataset, graph, params: dict):

View file

@ -100,14 +100,14 @@ class ScEnsemble(Operator):
def __init__(self, llm: LLM, name: str = "ScEnsemble"):
    """Create the operator by forwarding ``llm`` and ``name`` to the base class."""
    super().__init__(llm, name)
async def __call__(self, solutions: List[str]):
async def __call__(self, solutions: List[str], problem: str):
answer_mapping = {}
solution_text = ""
for index, solution in enumerate(solutions):
answer_mapping[chr(65 + index)] = index
solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"
prompt = SC_ENSEMBLE_PROMPT.format(solutions=solution_text)
prompt = SC_ENSEMBLE_PROMPT.format(question=problem, solutions=solution_text)
response = await self._fill_node(ScEnsembleOp, prompt, mode="xml_fill")
answer = response.get("solution_letter", "")

View file

@ -0,0 +1,30 @@
from typing import Literal
import metagpt.ext.aflow.scripts.optimized.MATH.workflows.template.operator as operator
import metagpt.ext.aflow.scripts.optimized.MATH.workflows.round_2.prompt as prompt_custom
from metagpt.provider.llm_provider_registry import create_llm_instance
from metagpt.utils.cost_manager import CostManager
# Closed set of dataset names accepted by the ``dataset`` argument of a workflow.
DatasetType = Literal["HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP"]
class Workflow:
    """Draft, revise, then ensemble: a three-step workflow for one problem."""

    def __init__(self, name: str, llm_config, dataset: DatasetType) -> None:
        """Build the LLM instance and the two operators that share it.

        Args:
            name: Name stored on the workflow.
            llm_config: Configuration handed to ``create_llm_instance``.
            dataset: Dataset name stored on the workflow.
        """
        self.name = name
        self.dataset = dataset

        # A fresh CostManager is attached so the cost reported by __call__
        # is tracked on this workflow's own LLM instance.
        llm = create_llm_instance(llm_config)
        llm.cost_manager = CostManager()
        self.llm = llm

        self.custom = operator.Custom(llm)
        self.sc_ensemble = operator.ScEnsemble(llm)

    async def __call__(self, problem: str):
        """Run the workflow on ``problem``.

        Asks the Custom operator for an initial solution, asks it again for a
        revision given the problem plus that initial solution, then passes both
        candidates to the ScEnsemble operator.

        Returns:
            A tuple ``(response, total_cost)``: the ensemble's ``"response"``
            entry and the accumulated cost from the LLM's cost manager.
        """
        draft = await self.custom(
            input=problem,
            instruction=prompt_custom.INITIAL_SOLUTION_PROMPT,
        )
        draft_text = draft["response"]

        # The revision step sees the original problem followed by the draft.
        revision = await self.custom(
            input=problem + f"\nInitial solution: {draft_text}",
            instruction=prompt_custom.REVISE_SOLUTION_PROMPT,
        )

        candidates = [draft_text, revision["response"]]
        chosen = await self.sc_ensemble(solutions=candidates, problem=problem)
        return chosen["response"], self.llm.cost_manager.total_cost

View file

@ -0,0 +1,17 @@
# Instruction text requesting a step-by-step solution to a math problem.
# The text ends with a "Problem:" marker; the problem statement itself is
# presumably supplied separately by the caller - confirm against the workflow.
INITIAL_SOLUTION_PROMPT = """
You are a math expert tasked with solving a complex problem. Please provide a step-by-step solution to the given problem, showing all your work and explaining your reasoning clearly. If the problem involves calculations, make sure to include them in your response.
Problem:
"""
# Instruction text requesting a review and revision of an existing solution.
# It also ends with a "Problem:" marker; the initial solution is not embedded
# here, so it must reach the model through the accompanying input.
REVISE_SOLUTION_PROMPT = """
You are a math expert tasked with reviewing and improving a solution to a complex problem. An initial solution has been provided, but it may contain errors or be incomplete. Your task is to carefully review the initial solution, identify any mistakes or areas for improvement, and provide a revised, more accurate solution.
Please follow these steps:
1. Review the initial solution thoroughly.
2. Identify any errors or areas that need improvement.
3. Provide a revised solution, explaining your changes and reasoning.
4. Ensure your revised solution is complete, accurate, and clearly explained.
Problem:
"""

View file

@ -42,6 +42,7 @@ class Optimizer:
optimized_path: str = None,
initial_round: int = 1,
max_rounds: int = 20,
validation_rounds: int = 5,
) -> None:
self.optimize_llm_config = opt_llm_config
self.optimize_llm = create_llm_instance(self.optimize_llm_config)
@ -59,6 +60,7 @@ class Optimizer:
self.top_scores = []
self.round = initial_round
self.max_rounds = max_rounds
self.validation_rounds = validation_rounds
self.graph_utils = GraphUtils(self.root_path)
self.data_utils = DataUtils(self.root_path)
@ -116,7 +118,7 @@ class Optimizer:
time.sleep(5)
async def _optimize_graph(self):
validation_n = 2 # validation datasets's execution number
validation_n = self.validation_rounds # validation datasets's execution number
graph_path = f"{self.root_path}/workflows"
data = self.data_utils.load_results(graph_path)