diff --git a/examples/ags/benchmark/gsm8k.py b/examples/ags/benchmark/gsm8k.py
index fc79823a5..be33ce977 100644
--- a/examples/ags/benchmark/gsm8k.py
+++ b/examples/ags/benchmark/gsm8k.py
@@ -52,7 +52,7 @@ def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str
     """Save results to CSV file"""
     df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
     average_score = df["score"].mean()
-    total_cost = df["cost"].iloc[-1]
+    total_cost = df["cost"].max()
 
     output_file = f"{path}/{average_score:.5f}.csv"
     df.to_csv(output_file, index=False)
@@ -103,7 +103,7 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
 async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
     """GSM8K evaluation main function"""
     data = await load_data(file_path, samples, test=test)
-    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5)
+    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=10)
     average_score, total_cost = save_results_to_csv(results, path=path)
     print(f"Average score: {average_score:.5f}")
     print(f"Total Cost: {total_cost:.5f}")
diff --git a/examples/ags/experiments/baselines/cot_gsm8k.py b/examples/ags/experiments/baselines/cot_gsm8k.py
index 9acc91346..90266d546 100644
--- a/examples/ags/experiments/baselines/cot_gsm8k.py
+++ b/examples/ags/experiments/baselines/cot_gsm8k.py
@@ -9,7 +9,7 @@ from pydantic import BaseModel, Field
 from typing import Dict, Any
 
 GSM8K_PROMPT_GPT = """
-{question}\nPlease reason step by step, and put your final answer in the end. Wrap content using xml tags.
+{question}\nPlease reason step by step. At the end, provide the final answer in the format "Answer is <number>", where <number> is a single number, without any additional information or explanation.
 """
 
 GSM8K_PROMPT_DS = """
@@ -39,28 +39,24 @@ class CoTSolveGraph(SolveGraph):
 
     async def __call__(self, problem):
         solution = await self.cot_generate(problem, mode="context_fill")
-        return solution, self.llm.cost_manager.total_cost
+        return solution, self.llm.cost_manager.total_cost # {"solution": solution}
 
 if __name__ == "__main__":
     async def main():
-        # llm_config = ModelsConfig.default().get("deepseek-coder")
+        llm_config = ModelsConfig.default().get("deepseek-coder")
         # llm_config = ModelsConfig.default().get("gpt-4o-mini")
-        llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
+        # llm_config = ModelsConfig.default().get("gpt-4o")
         graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K")
         file_path = "examples/ags/data/gsm8k.jsonl"
-        samples = 1055
-        path = "examples/ags/data/baselines/general"
-        score, cost = await gsm8k_evaluation(graph, file_path, samples, path)
-        return score, cost
+        samples = 264 #264 # 1055 #314  
+        # samples = 100
+        path = "examples/ags/data/baselines/general/gsm8k/"
+        score, cost = await gsm8k_evaluation(graph, file_path, samples, path, test=False)
+        return score, cost 
 
     import asyncio
     asyncio.run(main())
+    
 
-
-# medprompt operator; universal self consistency; 
-
-# IO指的没有任何Trick，看LLM自身的一个效果。使用 model 发布者在对应的 dataset 使用的 prompt。
-
-# deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106
-
-# med ensemble 
\ No newline at end of file
+# self consistency; medprompt -- implement these two methods using the existing Operators
\ No newline at end of file