Delete unnecessary part & Update Optimize for better use

2026-07-26 17:11:07 +02:00 · 2024-10-25 16:52:49 +08:00 · 2024-10-25 16:52:49 +08:00 · 38c825d04c
commit 38c825d04c
parent 06c191514b
6 changed files with 64 additions and 45 deletions
--- a/examples/aflow/README.md
+++ b/examples/aflow/README.md
@ -38,14 +38,15 @@ ## Quick Start
   - Open `examples/aflow/optimize.py`
   - Set the following parameters:
     ```python
-     dataset = "HumanEval"  # Choose from: "HumanEval", "MBPP", "GSM8K", "MATH", "HotpotQA", "DROP" or your custom dataset name
-     question_type = "code"  # Choose from: "math", "code", "qa"
-     sample = 4  # Number of samples to use for optimization
-     check_convergence = True  # Whether to check for convergence
-     optimized_path = "path/to/optimized/workflows"  # Path to save optimized workflows, defaults to metagpt/ext/aflow/scripts/optimized
-     initial_round = 1  # Starting round number
-     max_rounds = 20  # Maximum number of optimization rounds
-     validation_rounds = 5  # The validation rounds of AFLOW.
+     dataset: DatasetType = "MATH"  # Ensure the type is consistent with DatasetType
+     sample: int = 4  # Sample Count, which means how many workflows will be resampled from generated workflows
+     question_type: QuestionType = "math"  # Ensure the type is consistent with QuestionType
+     optimized_path: str = "metagpt/ext/aflow/scripts/optimized"  # Optimized Result Save Path
+     initial_round: int = 1  # Starting round number
+     max_rounds: int = 20  # The max iteration of AFLOW.
+     check_convergence: bool = True  # Whether Early Stop
+     validation_rounds: int = 5  # The validation rounds of AFLOW.
+     if_fisrt_optimize = True  # Set to False after the first optimization run to skip downloading the data again.
     ```
   - Adjust these parameters according to your specific requirements and dataset
 2. Set up parameters in `config/config2.yaml` (see `examples/aflow/config2.example.yaml` for reference)
@ -66,10 +67,13 @@ ## Citation
 If you use AFlow in your research, please cite our paper:

 ```
-@article{zhang2024aflow,
-  title={AFlow: Automating Agentic Workflow Generation},
-  author={Zhang, Jiayi and Xiang, Jinyu and Yu, Zhaoyang and Teng, Fengwei and Chen, Xionghui and Chen, Jiaqi and Zhuge, Mingchen and Cheng, Xin and Hong, Sirui and Wang, Jinlin and others},
-  journal={arXiv preprint arXiv:2410.10762},
-  year={2024}
+@misc{zhang2024aflow,
+      title={AFlow: Automating Agentic Workflow Generation}, 
+      author={Jiayi Zhang and Jinyu Xiang and Zhaoyang Yu and Fengwei Teng and Xionghui Chen and Jiaqi Chen and Mingchen Zhuge and Xin Cheng and Sirui Hong and Jinlin Wang and Bingnan Zheng and Bang Liu and Yuyu Luo and Chenglin Wu},
+      year={2024},
+      eprint={2410.10762},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI},
+      url={https://arxiv.org/abs/2410.10762}, 
 }
 ```
--- a/examples/aflow/optimize.py
+++ b/examples/aflow/optimize.py
@ -3,6 +3,18 @@
 # @Author  : didi
 # @Desc    : Entrance of AFlow.

+import os
+import sys
+
+
+def setup_environment():
+    current_path = os.path.abspath(__file__)
+    root_path = os.path.dirname(os.path.dirname(os.path.dirname(current_path)))
+    sys.path.insert(0, root_path)
+    os.chdir(root_path)
+
+
+setup_environment()

 from metagpt.configs.models_config import ModelsConfig
 from metagpt.ext.aflow.data.download_data import download
@ -13,9 +25,6 @@ from metagpt.ext.aflow.scripts.optimizer import DatasetType, Optimizer, Question
 # QuestionType = Literal["math", "code", "qa"]
 # OptimizerType = Literal["Graph", "Test"]

-# When you fisrt use, please download the datasets and initial rounds; If you want to get a look of the results, please download the results.
-download(["datasets", "initial_rounds"])
-
 # Crucial Parameters
 dataset: DatasetType = "MATH"  # Ensure the type is consistent with DatasetType
 sample: int = 4  # Sample Count, which means how many workflows will be resampled from generated workflows
@ -25,6 +34,7 @@ initial_round: int = 1  # Corrected the case from Initial_round to initial_round
 max_rounds: int = 20  # The max iteration of AFLOW.
 check_convergence: bool = True  # Whether Early Stop
 validation_rounds: int = 5  # The validation rounds of AFLOW.
+if_fisrt_optimize = True  # Set to False after the first optimization run to skip downloading the data again.

 # Config llm model, you can modify `config/config2.yaml` to use more llms.
 mini_llm_config = ModelsConfig.default().get("gpt-4o-mini")
@ -56,6 +66,8 @@ optimizer = Optimizer(
 )

 if __name__ == "__main__":
+    # On first use, download the datasets and initial rounds; if you want to look at the results, please download the results as well.
+    download(["datasets", "initial_rounds"], if_first_download=if_fisrt_optimize)
    # Optimize workflow via setting the optimizer's mode to 'Graph'
    optimizer.optimize("Graph")
    # Test workflow via setting the optimizer's mode to 'Test'
--- a/metagpt/ext/aflow/benchmark/benchmark.py
+++ b/metagpt/ext/aflow/benchmark/benchmark.py
@ -23,11 +23,9 @@ class BaseBenchmark(ABC):
        async with aiofiles.open(self.file_path, mode="r", encoding="utf-8") as file:
            async for line in file:
                data.append(json.loads(line))
-
        if specific_indices is not None:
            filtered_data = [data[i] for i in specific_indices if i < len(data)]
            return filtered_data
-
        return data

    def save_results_to_csv(self, results: List[Tuple[Any, ...]], columns: List[str]):
@ -35,26 +33,29 @@ class BaseBenchmark(ABC):
        avg_score = df["score"].mean()
        t_cost = df["cost"].max()
        a_cost = t_cost / len(df) if len(df) > 0 else 0
-
        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{avg_score:.5f}_{current_time}.csv"
        output_file = os.path.join(self.log_path, filename)
-
        df.to_csv(output_file, index=False)
        logger.info(f"Results saved to {output_file}")
-
        return avg_score, a_cost, t_cost

-    def log_mismatch(self, problem: str, expected_output: Any, prediction: str, extracted_output: Any):
+    def log_mismatch(
+        self,
+        problem: str,
+        expected_output: Any,
+        prediction: str,
+        extracted_output: Any,
+        extract_answer_code: str = "None",
+    ):
        log_data = {
            "question": problem,
            "right_answer": expected_output,
            "model_output": prediction,
            "extracted_output": extracted_output,
+            "extract_answer_code": extract_answer_code,
        }
-
        log_file = os.path.join(self.log_path, "log.json")
-
        if os.path.exists(log_file):
            with open(log_file, "r", encoding="utf-8") as f:
                try:
@ -63,9 +64,7 @@ class BaseBenchmark(ABC):
                    data = []
        else:
            data = []
-
        data.append(log_data)
-
        with open(log_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4, ensure_ascii=False)

@ -89,7 +88,6 @@ class BaseBenchmark(ABC):
                return await self.evaluate_problem(problem, graph)

        tasks = [sem_evaluate(problem) for problem in data]
-
        return await tqdm_asyncio.gather(*tasks, desc=f"Evaluating {self.name} problems", total=len(data))

    async def run_evaluation(self, graph: Callable, va_list: List[int], max_concurrent_tasks: int = 50):
--- a/metagpt/ext/aflow/benchmark/math.py
+++ b/metagpt/ext/aflow/benchmark/math.py
@ -1,3 +1,4 @@
+import inspect
 import re
 from math import isclose
 from typing import Any, Callable, List, Tuple
@ -98,6 +99,13 @@ class MATHBenchmark(BaseBenchmark):
            pass
        return False

+    def get_function_code(self, func):
+        try:
+            source_code = inspect.getsource(func)
+            return source_code
+        except OSError:
+            return "no code"
+
    @retry(stop=stop_after_attempt(5), wait=wait_fixed(1), retry=retry_if_exception_type(Exception), reraise=True)
    async def _generate_output(self, graph, input_text):
        return await graph(input_text)
@ -111,7 +119,13 @@ class MATHBenchmark(BaseBenchmark):
            uni_score, extracted_output = self.calculate_score(expected_output, output)

            if uni_score == 0:
-                self.log_mismatch(input_text, expected_output, output, extracted_output)
+                self.log_mismatch(
+                    input_text,
+                    expected_output,
+                    output,
+                    extracted_output,
+                    extract_answer_code=self.get_function_code(self.extract_model_answer),
+                )

            return input_text, output, expected_output, uni_score, cost

--- a/metagpt/ext/aflow/data/download_data.py
+++ b/metagpt/ext/aflow/data/download_data.py
@ -68,21 +68,12 @@ datasets_to_download: Dict[str, Dict[str, str]] = {
 }


-def is_directory_empty(path: str) -> bool:
-    """Check if the directory is empty"""
-    return len(os.listdir(path)) == 0
-
-
-def download(datasets):
+def download(required_datasets, if_first_download: bool = True):
    """Main function to process all selected datasets"""
-    for dataset_name in datasets:
-        dataset = datasets_to_download[dataset_name]
-        extract_path = dataset["extract_path"]
-
-        if os.path.exists(extract_path) and not is_directory_empty(extract_path):
-            logger.info(
-                f"Target folder {extract_path} for {dataset_name} is not empty, skipping download and extraction."
-            )
-            continue
-
-        process_dataset(dataset["url"], dataset["filename"], extract_path)
+    if if_first_download:
+        for dataset_name in required_datasets:
+            dataset = datasets_to_download[dataset_name]
+            extract_path = dataset["extract_path"]
+            process_dataset(dataset["url"], dataset["filename"], extract_path)
+    else:
+        logger.info("Skip downloading datasets")
--- a/metagpt/ext/aflow/scripts/optimized/optimized.zip
+++ b/metagpt/ext/aflow/scripts/optimized/optimized.zip