mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-05-24 14:15:17 +02:00
update
This commit is contained in:
parent
56d0af1e9e
commit
27e942cc5e
11 changed files with 19 additions and 58 deletions
|
|
@ -9,6 +9,8 @@ import aiofiles
|
|||
import pandas as pd
|
||||
from tqdm.asyncio import tqdm_asyncio
|
||||
|
||||
from metagpt.logs import logger
|
||||
|
||||
|
||||
class BaseBenchmark(ABC):
|
||||
def __init__(self, name: str, file_path: str, log_path: str):
|
||||
|
|
@ -39,7 +41,7 @@ class BaseBenchmark(ABC):
|
|||
output_file = os.path.join(self.log_path, filename)
|
||||
|
||||
df.to_csv(output_file, index=False)
|
||||
print(f"Results saved to {output_file}")
|
||||
logger.info(f"Results saved to {output_file}")
|
||||
|
||||
return avg_score, a_cost, t_cost
|
||||
|
||||
|
|
@ -95,6 +97,6 @@ class BaseBenchmark(ABC):
|
|||
results = await self.evaluate_all_problems(data, graph, max_concurrent_tasks)
|
||||
columns = self.get_result_columns()
|
||||
average_score, average_cost, total_cost = self.save_results_to_csv(results, columns)
|
||||
print(f"Average score on {self.name} dataset: {average_score:.5f}")
|
||||
print(f"Total Cost: {total_cost:.5f}")
|
||||
logger.info(f"Average score on {self.name} dataset: {average_score:.5f}")
|
||||
logger.info(f"Total Cost: {total_cost:.5f}")
|
||||
return average_score, average_cost, total_cost
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ from typing import Callable, List, Optional, Tuple
|
|||
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
|
||||
|
||||
from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark
|
||||
|
||||
from metagpt.logs import logger
|
||||
|
||||
class GSM8KBenchmark(BaseBenchmark):
|
||||
def __init__(self, name: str, file_path: str, log_path: str):
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ from typing import Callable, List, Tuple
|
|||
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
|
||||
|
||||
from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark
|
||||
|
||||
from metagpt.logs import logger
|
||||
|
||||
class HotpotQABenchmark(BaseBenchmark):
|
||||
def __init__(self, name: str, file_path: str, log_path: str):
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
|
|||
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
|
||||
|
||||
from metagpt.actions.code_sanitize import sanitize
|
||||
from metagpt.logs import logger
|
||||
from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark
|
||||
|
||||
|
||||
|
|
@ -138,11 +139,11 @@ class HumanEvalBenchmark(BaseBenchmark):
|
|||
return input_text, prediction, expected_output, score, cost
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
print("Timeout error. Skipping this sample.")
|
||||
logger.info("Timeout error. Skipping this sample.")
|
||||
return input_text, "Timeout", expected_output, 0.0, 0.0
|
||||
|
||||
except Exception as e:
|
||||
print(f"Maximum retries reached. Skipping this sample. Error: {e}")
|
||||
logger.info(f"Maximum retries reached. Skipping this sample. Error: {e}")
|
||||
return input_text, str(e), expected_output, 0.0, 0.0
|
||||
|
||||
def calculate_score(self, expected_output: str, prediction: str) -> Tuple[float, str]:
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ from sympy.parsing.sympy_parser import parse_expr
|
|||
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
|
||||
|
||||
from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark
|
||||
|
||||
from metagpt.logs import logger
|
||||
|
||||
class MATHBenchmark(BaseBenchmark):
|
||||
def __init__(self, name: str, file_path: str, log_path: str):
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
|
|||
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
|
||||
|
||||
from metagpt.actions.code_sanitize import sanitize
|
||||
from metagpt.logs import logger
|
||||
from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark
|
||||
|
||||
|
||||
|
|
@ -112,7 +113,7 @@ class MBPPBenchmark(BaseBenchmark):
|
|||
return input_text, prediction, expected_output, score, cost
|
||||
|
||||
except Exception as e:
|
||||
print(f"Maximum retries reached. Skipping this sample. Error: {e}")
|
||||
logger.info(f"Maximum retries reached. Skipping this sample. Error: {e}")
|
||||
return input_text, str(e), expected_output, 0.0, 0.0
|
||||
|
||||
def calculate_score(self, expected_output: str, prediction: str) -> Tuple[float, str]:
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ from typing import Dict
|
|||
import requests
|
||||
from tqdm import tqdm
|
||||
|
||||
from metagpt.logs import logger
|
||||
|
||||
def download_file(url: str, filename: str) -> None:
|
||||
"""Download a file from the given URL and show progress."""
|
||||
|
|
@ -33,16 +34,16 @@ def extract_tar_gz(filename: str, extract_path: str) -> None:
|
|||
|
||||
def process_dataset(url: str, filename: str, extract_path: str) -> None:
    """Download, extract, and clean up a dataset.

    Each progress message is emitted once through the project logger
    (previously every message was both printed and logged).

    Args:
        url: Location the archive is downloaded from (passed to ``download_file``).
        filename: Local path the archive is written to; the file is removed
            after extraction.
        extract_path: Extraction target passed to ``extract_tar_gz``.
    """
    logger.info(f"Downloading {filename}...")
    download_file(url, filename)

    logger.info(f"Extracting {filename}...")
    extract_tar_gz(filename, extract_path)

    logger.info(f"{filename} download and extraction completed.")

    # The archive itself is no longer needed once its contents are extracted.
    os.remove(filename)
    logger.info(f"Removed {filename}")
|
||||
|
||||
|
||||
# Define the datasets to be downloaded
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ from typing import Dict, List, Tuple
|
|||
from tenacity import retry, stop_after_attempt, wait_fixed
|
||||
|
||||
from metagpt.actions.action_node import ActionNode
|
||||
from metagpt.logs import logger
|
||||
from metagpt.ext.aflow.scripts.operator_an import (
|
||||
AnswerGenerateOp,
|
||||
CodeGenerateOp,
|
||||
|
|
|
|||
|
|
@ -1,33 +0,0 @@
|
|||
from typing import Literal
|
||||
import metagpt.ext.aflow.scripts.optimized.GSM8K.workflows.template.operator as operator
|
||||
import metagpt.ext.aflow.scripts.optimized.GSM8K.workflows.round_2.prompt as prompt_custom
|
||||
from metagpt.provider.llm_provider_registry import create_llm_instance
|
||||
from metagpt.utils.cost_manager import CostManager
|
||||
|
||||
DatasetType = Literal["HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP"]
|
||||
|
||||
class Workflow:
    """Workflow that samples several answers and combines them with ``ScEnsemble``."""

    def __init__(
        self,
        name: str,
        llm_config,
        dataset: DatasetType,
    ) -> None:
        """Create the LLM instance and the operators used by the workflow.

        Args:
            name: Identifier stored on the instance.
            llm_config: Configuration handed to ``create_llm_instance``.
            dataset: Dataset label stored on the instance.
        """
        self.name = name
        self.dataset = dataset

        # A fresh CostManager is attached so the cost reported by __call__
        # is tracked on this workflow's own LLM instance.
        llm = create_llm_instance(llm_config)
        llm.cost_manager = CostManager()
        self.llm = llm

        self.custom = operator.Custom(llm)
        self.sc_ensemble = operator.ScEnsemble(llm)

    async def __call__(self, problem: str):
        """Run the workflow on one problem.

        Queries the ``Custom`` operator three times, one call after another,
        with ``SOLVE_PROMPT``, then passes the collected responses to the
        ``ScEnsemble`` operator.

        Returns:
            A tuple of the ensemble's ``response`` value and the LLM cost
            manager's ``total_cost`` read after all calls have completed.
        """
        candidates = []
        remaining = 3
        while remaining > 0:
            reply = await self.custom(input=problem, instruction=prompt_custom.SOLVE_PROMPT)
            candidates.append(reply["response"])
            remaining -= 1

        chosen = await self.sc_ensemble(solutions=candidates, problem=problem)
        answer = chosen["response"]
        return answer, self.llm.cost_manager.total_cost
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
SOLVE_PROMPT = """
|
||||
You are a mathematical problem solver. Your task is to solve the given problem step by step, showing all your work. After solving the problem, provide the final numerical answer without any units or explanations. Make sure to:
|
||||
|
||||
1. Break down the problem into clear steps.
|
||||
2. Show all calculations.
|
||||
3. Use proper mathematical notation.
|
||||
4. Double-check your work for accuracy.
|
||||
5. Provide only the final numerical answer at the end, with no additional text.
|
||||
|
||||
Solve the following problem:
|
||||
|
||||
"""
|
||||
Loading…
Add table
Add a link
Reference in a new issue