diff --git a/examples/aflow/optimize.py b/examples/aflow/optimize.py new file mode 100644 index 000000000..a3f64d86a --- /dev/null +++ b/examples/aflow/optimize.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +# @Date : 8/23/2024 20:00 PM +# @Author : didi +# @Desc : Entrance of AFlow. + +from metagpt.ext.aflow.scripts.optimizer import Optimizer +from metagpt.ext.aflow.scripts.evaluator import DatasetType, QuestionType, OptimizerType +from metagpt.ext.aflow.data.download_data import download +from metagpt.configs.models_config import ModelsConfig +from typing import Literal + +# DatasetType, QuestionType, and OptimizerType definitions +# DatasetType = Literal["HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP"] +# QuestionType = Literal["math", "code", "qa"] +# OptimizerType = Literal["Graph", "Test"] + +# When you first use it, please download the datasets and initial rounds; if you want to take a look at the results, please download the results. +# download(["datasets", "results", "initial_rounds"]) + +# Crucial Parameters +dataset: DatasetType = "GSM8K" # Ensure the type is consistent with DatasetType +sample: int = 4 # Sample Count, which means how many workflows will be resampled from generated workflows +question_type: QuestionType = "code" # Ensure the type is consistent with QuestionType +optimized_path: str = "examples/aflow/scripts/optimized" # Optimized Result Save Path +initial_round: int = 1 # Corrected the case from Initial_round to initial_round +max_rounds: int = 20 +check_convergence: bool = True + +# Config llm model, you can modify `config/config2.yaml` to use more llms. +mini_llm_config = ModelsConfig.default().get("gpt-4o-mini") +claude_llm_config = ModelsConfig.default().get("claude-3-5-sonnet-20240620") + +# Config operators. +operators = [ + "Custom", # It's the basic unit of a fixed node. The optimizer can modify its prompt to get various nodes. 
+ # "AnswerGenerate" # It's for qa + # "CustomCodeGenerate", # It's for code + "ScEnsemble", # It's for code, math and qa + # "Test", # It's for code + "Programmer", # It's for math +] + +# Create an optimizer instance +optimizer = Optimizer( + dataset=dataset, # Config dataset + question_type=question_type, # Config Question Type + opt_llm_config=claude_llm_config, # Config Optimizer LLM + exec_llm_config=mini_llm_config, # Config Execution LLM + check_convergence=check_convergence, # Whether Early Stop + operators=operators, # Config Operators you want to use + optimized_path=optimized_path, # Config Optimized workflow's file path + sample=sample, # Only Top(sample) rounds will be selected. + initial_round=initial_round, # Optimize from initial round + max_rounds=max_rounds # The max iteration of AFLOW. +) + +if __name__ == "__main__": + # Optimize workflow via setting the optimizer's mode to 'Graph' + optimizer.optimize("Graph") + # Test workflow via setting the optimizer's mode to 'Test' + # optimizer.optimize("Test") \ No newline at end of file diff --git a/metagpt/ext/aflow/README.md b/metagpt/ext/aflow/README.md new file mode 100644 index 000000000..62b92548d --- /dev/null +++ b/metagpt/ext/aflow/README.md @@ -0,0 +1,70 @@ +# AFlow: Automating Agentic Workflow Generation + +AFlow is a framework for automatically generating and optimizing Agentic Workflows. It uses Monte Carlo tree search in a code-represented workflow space to find effective workflows, replacing manual development with machine effort. Our approach shows potential to outperform handcrafted workflows on various tasks. + +[Read our paper on arXiv](https://arxiv.org/abs/2410.10762) + +[Insert performance graph/image here] + +## Framework Components + +- **Node**: Basic unit of LLM invocation. See `action_node.py` for a flexible interface to control LLM, temperature, format, and prompt. +- **Operator**: Predefined combinations of Nodes to enhance search efficiency. 
Encapsulates common operations like Generate, Format, Review, Revise, Ensemble, Test, and Programmer. +- **Workflow**: A sequence of LLM-invoking nodes connected by edges. Can be represented as graphs, neural networks, or code to express various execution structures. +- **Optimizer**: Uses LLMs within a Monte Carlo Tree Search variant to explore and refine workflows. Iteratively selects, expands, evaluates, and updates workflows based on performance. +- **Evaluator**: Assesses workflow performance on given tasks. Provides feedback to guide the optimization process towards more effective workflows. + +## Datasets + +We provide implementations for [list datasets here]. + +Data is available at [link to data]. + +For custom tasks, [brief instructions or link to documentation]. + +## Quick Start + +1. Configure your search in `optimize.py`: + - Open `examples/aflow/optimize.py` + - Set the following parameters: + ```python + dataset = "HumanEval" # Choose from: "HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP" or your custom dataset name + question_type = "code" # Choose from: "math", "code", "qa" + sample = 5 # Number of samples to use for optimization + check_convergence = True # Whether to check for convergence + optimized_path = "path/to/optimized/workflows" # Path to save optimized workflows + initial_round = 1 # Starting round number + max_rounds = 20 # Maximum number of optimization rounds + ``` + - Adjust these parameters according to your specific requirements and dataset +2. Set up parameters in `config/config2.yaml` (see `metagpt/ext/aflow/config2.example.yaml` for reference) +3. Set the operator you want to use in `optimize.py` and in `xxxx` +4. Download the init round of six datasets and put them in `xxxxxx` +5. 
Add your custom dataset and corresponding evaluation function: + +- Create a new Python file in the `metagpt/ext/aflow/benchmark/` directory, named `{custom_dataset_name}.py` +- Implement the following key functions in this new file: + - `load_data`: for loading the dataset + - `evaluate_problem`: for evaluating a single problem solution + - `evaluate_all_problems`: for evaluating all problems + - `save_results_to_csv`: for saving evaluation results + - `optimize_{custom_dataset_name}_evaluation`: main evaluation function that integrates the above functionalities +- Add your custom dataset name and config val_list in `metagpt/ext/aflow/scripts/evaluator.py` + + +## License + +[License information] + +## Citation + +If you use AFlow in your research, please cite our paper: + +``` +@article{zhang2024aflow, + title={AFlow: Automating Agentic Workflow Generation}, + author={Zhang, Jiayi and Xiang, Jinyu and Yu, Zhaoyang and Teng, Fengwei and Chen, Xionghui and Chen, Jiaqi and Zhuge, Mingchen and Cheng, Xin and Hong, Sirui and Wang, Jinlin and others}, + journal={arXiv preprint arXiv:2410.10762}, + year={2024} +} +``` \ No newline at end of file diff --git a/examples/aflow/benchmark/benchmark.py b/metagpt/ext/aflow/benchmark/benchmark.py similarity index 100% rename from examples/aflow/benchmark/benchmark.py rename to metagpt/ext/aflow/benchmark/benchmark.py diff --git a/examples/aflow/benchmark/drop.py b/metagpt/ext/aflow/benchmark/drop.py similarity index 98% rename from examples/aflow/benchmark/drop.py rename to metagpt/ext/aflow/benchmark/drop.py index e8ee124d7..61e5bb616 100644 --- a/examples/aflow/benchmark/drop.py +++ b/metagpt/ext/aflow/benchmark/drop.py @@ -12,7 +12,7 @@ import pandas as pd from tqdm.asyncio import tqdm_asyncio from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type -from examples.aflow.benchmark.benchmark import BaseBenchmark +from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark class 
DROPBenchmark(BaseBenchmark): def __init__(self, name: str, file_path: str, log_path: str): diff --git a/examples/aflow/benchmark/gsm8k.py b/metagpt/ext/aflow/benchmark/gsm8k.py similarity index 97% rename from examples/aflow/benchmark/gsm8k.py rename to metagpt/ext/aflow/benchmark/gsm8k.py index f3b86644c..1292dded1 100644 --- a/examples/aflow/benchmark/gsm8k.py +++ b/metagpt/ext/aflow/benchmark/gsm8k.py @@ -17,7 +17,7 @@ from datetime import datetime from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type -from examples.aflow.benchmark.benchmark import BaseBenchmark +from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark class GSM8KBenchmark(BaseBenchmark): def __init__(self, name: str, file_path: str, log_path: str): diff --git a/examples/aflow/benchmark/hotpotqa.py b/metagpt/ext/aflow/benchmark/hotpotqa.py similarity index 97% rename from examples/aflow/benchmark/hotpotqa.py rename to metagpt/ext/aflow/benchmark/hotpotqa.py index 2a715c0ca..4b9e81d9f 100644 --- a/examples/aflow/benchmark/hotpotqa.py +++ b/metagpt/ext/aflow/benchmark/hotpotqa.py @@ -9,7 +9,7 @@ import os from collections import Counter from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type -from examples.aflow.benchmark.benchmark import BaseBenchmark +from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark class HotpotQABenchmark(BaseBenchmark): def __init__(self, name: str, file_path: str, log_path: str): diff --git a/examples/aflow/benchmark/humaneval.py b/metagpt/ext/aflow/benchmark/humaneval.py similarity index 98% rename from examples/aflow/benchmark/humaneval.py rename to metagpt/ext/aflow/benchmark/humaneval.py index 75252626e..2fea25bbe 100644 --- a/examples/aflow/benchmark/humaneval.py +++ b/metagpt/ext/aflow/benchmark/humaneval.py @@ -10,7 +10,7 @@ from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_t import pandas as pd -from examples.aflow.benchmark.benchmark import BaseBenchmark 
+from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark from metagpt.actions.code_sanitize import sanitize class HumanEvalBenchmark(BaseBenchmark): diff --git a/examples/aflow/benchmark/math.py b/metagpt/ext/aflow/benchmark/math.py similarity index 98% rename from examples/aflow/benchmark/math.py rename to metagpt/ext/aflow/benchmark/math.py index 1b91c1bde..6ea6189f3 100644 --- a/examples/aflow/benchmark/math.py +++ b/metagpt/ext/aflow/benchmark/math.py @@ -9,7 +9,7 @@ from typing import Any, Callable, Tuple, List from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type -from examples.aflow.benchmark.benchmark import BaseBenchmark +from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark class MATHBenchmark(BaseBenchmark): def __init__(self, name: str, file_path: str, log_path: str): diff --git a/examples/aflow/benchmark/mbpp.py b/metagpt/ext/aflow/benchmark/mbpp.py similarity index 98% rename from examples/aflow/benchmark/mbpp.py rename to metagpt/ext/aflow/benchmark/mbpp.py index 2edede302..d94c57cc5 100644 --- a/examples/aflow/benchmark/mbpp.py +++ b/metagpt/ext/aflow/benchmark/mbpp.py @@ -8,7 +8,7 @@ from typing import List, Tuple, Callable, Any, Optional, Dict from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type from metagpt.actions.code_sanitize import sanitize -from examples.aflow.benchmark.benchmark import BaseBenchmark +from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark class MBPPBenchmark(BaseBenchmark): def __init__(self, name: str, file_path: str, log_path: str): diff --git a/examples/aflow/benchmark/utils.py b/metagpt/ext/aflow/benchmark/utils.py similarity index 100% rename from examples/aflow/benchmark/utils.py rename to metagpt/ext/aflow/benchmark/utils.py diff --git a/metagpt/ext/aflow/config2.example.yaml b/metagpt/ext/aflow/config2.example.yaml new file mode 100644 index 000000000..ebaef33e2 --- /dev/null +++ b/metagpt/ext/aflow/config2.example.yaml @@ -0,0 
+1,12 @@ +models: + "": # model: "gpt-4-turbo" # or gpt-3.5-turbo + api_type: "openai" # or azure / ollama / groq etc. + base_url: "" + api_key: "" + temperature: 0 + "": + api_type: "openai" + base_url: "" + api_key: "" + temperature: 0 +CALC_USAGE: True diff --git a/examples/aflow/data/download_data.py b/metagpt/ext/aflow/data/download_data.py similarity index 100% rename from examples/aflow/data/download_data.py rename to metagpt/ext/aflow/data/download_data.py diff --git a/metagpt/ext/aflow/full data(include baselines).zip b/metagpt/ext/aflow/full data(include baselines).zip new file mode 100644 index 000000000..4ddfadbfd Binary files /dev/null and b/metagpt/ext/aflow/full data(include baselines).zip differ diff --git a/examples/aflow/scripts/evaluator.py b/metagpt/ext/aflow/scripts/evaluator.py similarity index 83% rename from examples/aflow/scripts/evaluator.py rename to metagpt/ext/aflow/scripts/evaluator.py index 14b068168..26a493402 100644 --- a/examples/aflow/scripts/evaluator.py +++ b/metagpt/ext/aflow/scripts/evaluator.py @@ -6,13 +6,13 @@ from typing import Literal, Tuple, Optional, Dict import asyncio -from examples.aflow.benchmark.benchmark import BaseBenchmark -from examples.aflow.benchmark.gsm8k import GSM8KBenchmark -from examples.aflow.benchmark.math import MATHBenchmark -from examples.aflow.benchmark.humaneval import HumanEvalBenchmark -from examples.aflow.benchmark.hotpotqa import HotpotQABenchmark -from examples.aflow.benchmark.mbpp import MBPPBenchmark -from examples.aflow.benchmark.drop import DROPBenchmark +from metagpt.ext.aflow.benchmark.benchmark import BaseBenchmark +from metagpt.ext.aflow.benchmark.gsm8k import GSM8KBenchmark +from metagpt.ext.aflow.benchmark.math import MATHBenchmark +from metagpt.ext.aflow.benchmark.humaneval import HumanEvalBenchmark +from metagpt.ext.aflow.benchmark.hotpotqa import HotpotQABenchmark +from metagpt.ext.aflow.benchmark.mbpp import MBPPBenchmark +from metagpt.ext.aflow.benchmark.drop import 
DROPBenchmark # If you want to customize tasks, add task types here and provide evaluation functions, just like the ones given above DatasetType = Literal["HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP"] diff --git a/examples/aflow/scripts/operator.py b/metagpt/ext/aflow/scripts/operator.py similarity index 98% rename from examples/aflow/scripts/operator.py rename to metagpt/ext/aflow/scripts/operator.py index 7e567eac5..f94a0db32 100644 --- a/examples/aflow/scripts/operator.py +++ b/metagpt/ext/aflow/scripts/operator.py @@ -11,9 +11,9 @@ from typing import Dict, List, Tuple import concurrent.futures from tenacity import retry, stop_after_attempt, wait_fixed -from examples.aflow.scripts.utils import extract_test_cases_from_jsonl +from metagpt.ext.aflow.scripts.utils import extract_test_cases_from_jsonl -from examples.aflow.scripts.operator_an import ( +from metagpt.ext.aflow.scripts.operator_an import ( FormatOp, GenerateOp, CodeGenerateOp, @@ -25,7 +25,7 @@ from examples.aflow.scripts.operator_an import ( ReviseOp, ) -from examples.aflow.scripts.prompts.prompt import ( +from metagpt.ext.aflow.scripts.prompts.prompt import ( FORMAT_PROMPT, ANSWER_GENERATION_PROMPT, SC_ENSEMBLE_PROMPT, @@ -35,7 +35,7 @@ from examples.aflow.scripts.prompts.prompt import ( REVIEW_PROMPT, REVISE_PROMPT, ) -from examples.aflow.scripts.utils import test_case_2_test_function +from metagpt.ext.aflow.scripts.utils import test_case_2_test_function from metagpt.actions.action_node import ActionNode from metagpt.llm import LLM from metagpt.logs import logger diff --git a/examples/aflow/scripts/operator_an.py b/metagpt/ext/aflow/scripts/operator_an.py similarity index 100% rename from examples/aflow/scripts/operator_an.py rename to metagpt/ext/aflow/scripts/operator_an.py diff --git a/examples/aflow/scripts/optimized/__init__.py b/metagpt/ext/aflow/scripts/optimized/__init__.py similarity index 100% rename from examples/aflow/scripts/optimized/__init__.py rename to 
metagpt/ext/aflow/scripts/optimized/__init__.py diff --git a/metagpt/ext/aflow/scripts/optimized/optimized.zip b/metagpt/ext/aflow/scripts/optimized/optimized.zip new file mode 100644 index 000000000..5de192467 Binary files /dev/null and b/metagpt/ext/aflow/scripts/optimized/optimized.zip differ diff --git a/examples/aflow/scripts/optimizer.py b/metagpt/ext/aflow/scripts/optimizer.py similarity index 94% rename from examples/aflow/scripts/optimizer.py rename to metagpt/ext/aflow/scripts/optimizer.py index 5f70d3bcd..f0bb260f3 100644 --- a/examples/aflow/scripts/optimizer.py +++ b/metagpt/ext/aflow/scripts/optimizer.py @@ -12,11 +12,11 @@ from pydantic import BaseModel, Field from metagpt.actions.action_node import ActionNode from metagpt.provider.llm_provider_registry import create_llm_instance from metagpt.logs import logger -from examples.aflow.scripts.optimizer_utils.graph_utils import GraphUtils -from examples.aflow.scripts.optimizer_utils.data_utils import DataUtils -from examples.aflow.scripts.optimizer_utils.experience_utils import ExperienceUtils -from examples.aflow.scripts.optimizer_utils.evaluation_utils import EvaluationUtils -from examples.aflow.scripts.optimizer_utils.convergence_utils import ConvergenceUtils +from metagpt.ext.aflow.scripts.optimizer_utils.graph_utils import GraphUtils +from metagpt.ext.aflow.scripts.optimizer_utils.data_utils import DataUtils +from metagpt.ext.aflow.scripts.optimizer_utils.experience_utils import ExperienceUtils +from metagpt.ext.aflow.scripts.optimizer_utils.evaluation_utils import EvaluationUtils +from metagpt.ext.aflow.scripts.optimizer_utils.convergence_utils import ConvergenceUtils DatasetType = Literal["HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP"] QuestionType = Literal["math", "code", "qa"] diff --git a/examples/aflow/scripts/optimizer_utils/convergence_utils.py b/metagpt/ext/aflow/scripts/optimizer_utils/convergence_utils.py similarity index 100% rename from 
examples/aflow/scripts/optimizer_utils/convergence_utils.py rename to metagpt/ext/aflow/scripts/optimizer_utils/convergence_utils.py diff --git a/examples/aflow/scripts/optimizer_utils/data_utils.py b/metagpt/ext/aflow/scripts/optimizer_utils/data_utils.py similarity index 100% rename from examples/aflow/scripts/optimizer_utils/data_utils.py rename to metagpt/ext/aflow/scripts/optimizer_utils/data_utils.py diff --git a/examples/aflow/scripts/optimizer_utils/evaluation_utils.py b/metagpt/ext/aflow/scripts/optimizer_utils/evaluation_utils.py similarity index 97% rename from examples/aflow/scripts/optimizer_utils/evaluation_utils.py rename to metagpt/ext/aflow/scripts/optimizer_utils/evaluation_utils.py index 2e4947186..79d015666 100644 --- a/examples/aflow/scripts/optimizer_utils/evaluation_utils.py +++ b/metagpt/ext/aflow/scripts/optimizer_utils/evaluation_utils.py @@ -1,4 +1,4 @@ -from examples.aflow.scripts.evaluator import Evaluator +from metagpt.ext.aflow.scripts.evaluator import Evaluator class EvaluationUtils: diff --git a/examples/aflow/scripts/optimizer_utils/experience_utils.py b/metagpt/ext/aflow/scripts/optimizer_utils/experience_utils.py similarity index 100% rename from examples/aflow/scripts/optimizer_utils/experience_utils.py rename to metagpt/ext/aflow/scripts/optimizer_utils/experience_utils.py diff --git a/examples/aflow/scripts/optimizer_utils/graph_utils.py b/metagpt/ext/aflow/scripts/optimizer_utils/graph_utils.py similarity index 98% rename from examples/aflow/scripts/optimizer_utils/graph_utils.py rename to metagpt/ext/aflow/scripts/optimizer_utils/graph_utils.py index 3dca1edce..0471ad306 100644 --- a/examples/aflow/scripts/optimizer_utils/graph_utils.py +++ b/metagpt/ext/aflow/scripts/optimizer_utils/graph_utils.py @@ -6,7 +6,7 @@ import traceback import time from metagpt.logs import logger -from examples.aflow.scripts.prompts.optimize_prompt import ( +from metagpt.ext.aflow.scripts.prompts.optimize_prompt import ( WORKFLOW_CUSTOM_USE, 
WORKFLOW_INPUT, WORKFLOW_OPTIMIZE_PROMPT, diff --git a/examples/aflow/scripts/prompts/optimize_prompt.py b/metagpt/ext/aflow/scripts/prompts/optimize_prompt.py similarity index 96% rename from examples/aflow/scripts/prompts/optimize_prompt.py rename to metagpt/ext/aflow/scripts/prompts/optimize_prompt.py index 1436329e0..231506a37 100644 --- a/examples/aflow/scripts/prompts/optimize_prompt.py +++ b/metagpt/ext/aflow/scripts/prompts/optimize_prompt.py @@ -48,8 +48,8 @@ Note: In custom, the input and instruction are directly concatenated(instruction """ WORKFLOW_TEMPLATE = """from typing import Literal -import examples.aflow.scripts.optimized.{dataset}.workflows.template.operator as operator -import examples.aflow.scripts.optimized.{dataset}.workflows.round_{round}.prompt as prompt_custom +import metagpt.ext.aflow.scripts.optimized.{dataset}.workflows.template.operator as operator +import metagpt.ext.aflow.scripts.optimized.{dataset}.workflows.round_{round}.prompt as prompt_custom from metagpt.provider.llm_provider_registry import create_llm_instance from metagpt.utils.cost_manager import CostManager diff --git a/examples/aflow/scripts/prompts/prompt.py b/metagpt/ext/aflow/scripts/prompts/prompt.py similarity index 100% rename from examples/aflow/scripts/prompts/prompt.py rename to metagpt/ext/aflow/scripts/prompts/prompt.py diff --git a/examples/aflow/scripts/utils.py b/metagpt/ext/aflow/scripts/utils.py similarity index 100% rename from examples/aflow/scripts/utils.py rename to metagpt/ext/aflow/scripts/utils.py diff --git a/examples/aflow/scripts/workflow.py b/metagpt/ext/aflow/scripts/workflow.py similarity index 93% rename from examples/aflow/scripts/workflow.py rename to metagpt/ext/aflow/scripts/workflow.py index e45a9ece2..37133bcc2 100644 --- a/examples/aflow/scripts/workflow.py +++ b/metagpt/ext/aflow/scripts/workflow.py @@ -5,7 +5,7 @@ from typing import Literal -from examples.aflow.scripts.operator import Generate +from metagpt.ext.aflow.scripts.operator 
import Generate from metagpt.provider.llm_provider_registry import create_llm_instance from metagpt.utils.cost_manager import CostManager diff --git a/optimize.py b/optimize.py index dd93f84c9..a3f64d86a 100644 --- a/optimize.py +++ b/optimize.py @@ -3,9 +3,9 @@ # @Author : didi # @Desc : Entrance of AFlow. -from examples.aflow.scripts.optimizer import Optimizer -from examples.aflow.scripts.evaluator import DatasetType, QuestionType, OptimizerType -from examples.aflow.data.download_data import download +from metagpt.ext.aflow.scripts.optimizer import Optimizer +from metagpt.ext.aflow.scripts.evaluator import DatasetType, QuestionType, OptimizerType +from metagpt.ext.aflow.data.download_data import download from metagpt.configs.models_config import ModelsConfig from typing import Literal