diff --git a/examples/ags/benchmark/drop.py b/examples/ags/benchmark/drop.py
index 1dbea0f50..a27c82f07 100644
--- a/examples/ags/benchmark/drop.py
+++ b/examples/ags/benchmark/drop.py
@@ -112,7 +112,7 @@ def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]:
     data = [data[i] for i in random_indices]
     return data
 
-async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float]:
+async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float, str]:
     def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str]:
         if "number" in answer and answer["number"]:
             return tuple([str(answer["number"])]), "number"
@@ -133,6 +133,8 @@ async def evaluate_problem(question: str, passage: str, answers: List[Dict[str,
             raise ValueError(f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}")
 
     prediction = await graph(question, passage)
+    cost = prediction[1]  # graph returns (prediction, cost); keep the cost for reporting
+    prediction = prediction[0]  # unwrap the actual prediction before scoring
 
     def get_f1_score(prediction: str, golden_answer: str) -> float:
         predicted_bags = answer_to_bags(prediction)
@@ -152,7 +154,7 @@ async def evaluate_problem(question: str, passage: str, answers: List[Dict[str,
             max_score = score
             best_answer = golden_answer
 
-    return best_answer, prediction, max_score
+    return best_answer, prediction, max_score, cost  # cost is returned as the fourth element
 
 async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], graph: Callable, max_concurrent_tasks: int = 50) -> List[List[Any]]:
     semaphore = asyncio.Semaphore(max_concurrent_tasks)
@@ -166,27 +168,29 @@ async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], g
                 answers = [qa_pair["answer"]]
                 if "validated_answers" in qa_pair and qa_pair["validated_answers"]:
                     answers.extend(qa_pair["validated_answers"])
-                best_answer, prediction, score = await evaluate_problem(question, passage, answers, graph)
-                results.append([id, question, prediction, best_answer, score])
+                best_answer, prediction, score, cost = await evaluate_problem(question, passage, answers, graph)
+                results.append([id, question, prediction, best_answer, score, cost])  # each result row now ends with the cost
 
     tasks = [sem_evaluate(id, annotation) for id, annotation in annotations]
     await tqdm_asyncio.gather(*tasks, desc="Evaluating DROP passages", total=len(annotations))
 
     return results
 
-def save_results_to_csv(results: List[List[Any]], path: str) -> float:
-    df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score"])
+def save_results_to_csv(results: List[List[Any]], path: str) -> Tuple[float, float]:
+    df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score", "cost"])
     average_score = df["score"].mean()
+    total_cost = df["cost"].max()  # each row holds a running total, so the largest value is the overall cost regardless of row order
 
     output_file = f"{path}/{average_score:.5f}.csv"
     df.to_csv(output_file, index=False)
     print(f"Results saved to {output_file}")
 
-    return average_score
+    return average_score, total_cost  # (mean of the score column, total cost)
 
-async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
+async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
     data = load_data(file_path, samples)
     results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20)
-    average_score = save_results_to_csv(results, path=path)
+    average_score, total_cost = save_results_to_csv(results, path=path)
     print(f"Average score on DROP dataset: {average_score:.5f}")
-    return average_score
+    print(f"Total Cost: {total_cost:.5f}")
+    return average_score, total_cost  # callers now receive (average score, total cost) instead of a bare score
diff --git a/examples/ags/benchmark/gsm8k.py b/examples/ags/benchmark/gsm8k.py
index 97dce7bff..ddc4a0169 100644
--- a/examples/ags/benchmark/gsm8k.py
+++ b/examples/ags/benchmark/gsm8k.py
@@ -14,7 +14,7 @@ from tqdm.asyncio import tqdm_asyncio
 from examples.ags.benchmark.utils import generate_random_indices
 
 def extract_number(text: str) -> Optional[float]:
-    """清理文本并提取单个数字"""
+    """Clean text and extract a single number"""
     matches = re.findall(r"[-+]?\d+(?:,\d{3})*(?:\.\d+)?|\d+\.\d+", text)
     if matches:
         last_number = matches[-1].replace(",", "")
@@ -26,7 +26,7 @@ def extract_number(text: str) -> Optional[float]:
         return None
 
 def loose_match_score(expected_output: str, prediction: str, tolerance: float = 1e-6) -> int:
-    """宽松匹配分数计算函数"""
+    """Loose match score calculation function"""
     expected_number = extract_number(expected_output)
     predicted_number = extract_number(prediction)
 
@@ -48,18 +48,19 @@ async def load_data(file_path: str, samples=1) -> List[dict]:
     data = [data[i] for i in random_indices]
     return data
         
-def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> float:
-    """保存结果到CSV文件"""
+def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
+    """Write results to a CSV named after the average score and return (average score, total cost)."""
     df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
     average_score = df["score"].mean()
+    total_cost = df["cost"].max()  # rows follow task order, not completion order, so take the largest running total
 
     output_file = f"{path}/{average_score:.5f}.csv"
     df.to_csv(output_file, index=False)
     print(f"Results saved to {output_file}")
-    return average_score
+    return average_score, total_cost
 
 async def evaluate_problem(input: str, graph: Callable, expected_output: str) -> Tuple[str, str, str, int, str]:
-    """评估单个问题"""
+    """Evaluate a single problem"""
     prompt = input
     max_retries = 5
     retries = 0
@@ -87,7 +88,7 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
     return input, output, expected_output, score, cost
 
 async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 20) -> List[Tuple[str, str, str, int, str]]:
-    """评估所有问题"""
+    """Evaluate all problems"""
     semaphore = asyncio.Semaphore(max_concurrent_tasks)
 
     async def sem_evaluate(problem):
@@ -100,11 +101,11 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
 
     return await tqdm_asyncio.gather(*tasks, desc="Evaluating problems", total=len(data))
 
-async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
-    """GSM8K评估主函数"""
+async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
+    """Load a GSM8K sample, evaluate it with `graph`, save the results and return (average score, total cost)."""
     data = await load_data(file_path, samples)
     results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5)
-    print(results)
-    average_score = save_results_to_csv(results, path=path)
+    average_score, total_cost = save_results_to_csv(results, path=path)
     print(f"Average score: {average_score:.5f}")
-    return average_score
+    print(f"Total Cost: {total_cost:.5f}")
+    return average_score, total_cost
diff --git a/examples/ags/benchmark/hotpotqa.py b/examples/ags/benchmark/hotpotqa.py
index 375882511..0990cc781 100644
--- a/examples/ags/benchmark/hotpotqa.py
+++ b/examples/ags/benchmark/hotpotqa.py
@@ -120,7 +120,7 @@ async def evaluate_problem(input: str, context_str: str, graph: Callable, expect
 
     while retries < max_retries:
         try:
-            prediction, supporting_sentences = await graph(input, context_str) if graph else "None"
+            prediction, supporting_sentences, cost = await graph(input, context_str) if graph else ("None", None, 0)
             predicted_bags = answer_to_bags(prediction)
             gold_bags = answer_to_bags(expected_output)
 
@@ -137,9 +137,10 @@ async def evaluate_problem(input: str, context_str: str, graph: Callable, expect
                 prediction = None
                 supporting_sentences = None
                 score = 0
+                cost = 0
                 break
 
-    return input, prediction, expected_output, supporting_sentences, score
+    return input, prediction, expected_output, supporting_sentences, score, cost
 
 async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50):
     semaphore = asyncio.Semaphore(max_concurrent_tasks)
@@ -156,21 +157,23 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
 
     return await tqdm_asyncio.gather(*tasks, desc="Evaluating HotpotQA problems", total=len(data))
 
-def save_results_to_csv(results: List[Tuple[str, str, str, str, float]], path: str) -> float:
+def save_results_to_csv(results: List[Tuple[str, str, str, str, float, str]], path: str) -> Tuple[float, float]:
     df = pd.DataFrame(
-        results, columns=["question", "prediction", "expected_output", "supporting_sentences", "score"]
+        results, columns=["question", "prediction", "expected_output", "supporting_sentences", "score", "cost"]
     )
     average_score = df["score"].mean()
+    total_cost = df["cost"].max()  # largest running total; the last row may be a failed problem recorded with cost 0
 
     output_file = f"{path}/{average_score:.5f}.csv"
     df.to_csv(output_file, index=False)
     print(f"Results saved to {output_file}")
 
-    return average_score
+    return average_score, total_cost  # (mean of the score column, total cost)
 
-async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
+async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
     data = await load_data(file_path, samples)
     results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
-    average_score = save_results_to_csv(results, path=path)
+    average_score, total_cost = save_results_to_csv(results, path=path)
     print(f"Average score on HotpotQA dataset: {average_score:.5f}")
-    return average_score
+    print(f"Total Cost: {total_cost:.5f}")
+    return average_score, total_cost  # both values come straight from save_results_to_csv
diff --git a/examples/ags/benchmark/humaneval.py b/examples/ags/benchmark/humaneval.py
index fd28cff26..65a7f3f16 100644
--- a/examples/ags/benchmark/humaneval.py
+++ b/examples/ags/benchmark/humaneval.py
@@ -46,13 +46,15 @@ async def check_solution(solution, test_cases, entry_point):
 
     return FAIL, details
 
-async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int]:
+async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, float]:
     max_retries = 5
     retries = 0
 
     while retries < max_retries:
         try:
-            solution = await graph(data["prompt"]) if graph else "None"
+            prediction = await graph(data["prompt"]) if graph else ("None", 0)  # fallback mirrors graph's (solution, cost) shape
+            cost = prediction[1]  # second element is the cost reported by the graph
+            solution = prediction[0]  # first element is the generated solution
             ret = await check_solution(solution, data["test_cases"], data["entry_point"])
 
             score = 1 if ret[0] == PASS else 0
@@ -67,11 +69,12 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str,
                 solution = None
                 ret = (FAIL, [])
                 score = 0
+                cost = 0  # fall back to zero cost on the failure path
                 break
 
-    return data["prompt"], solution, ret[1], score
+    return data["prompt"], solution, ret[1], score, cost  # cost is returned as the fifth element
 
-async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int]]:
+async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int, str]]:
     semaphore = asyncio.Semaphore(max_concurrent_tasks)
 
     async def sem_evaluate(problem):
@@ -86,8 +89,9 @@ import os
 import time
 import json
 
-def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -> float:
+def save_results_to_jsonl(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
     avg_score = 0
+    total_cost = 0.0  # largest running cost seen so far
     timestamp = int(time.time())
     filename = f"humaneval_results_{timestamp}.jsonl"
     full_path = os.path.join(path, filename)
@@ -101,19 +105,23 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -
                         "prediction": result[1],
                         "test_case_details": result[2],
                         "score": result[3],
+                        "cost": result[4],  # persist the cost recorded for this problem
                     }
                 )
                 + "\n"
             )
             avg_score += result[3]
+            total_cost = max(total_cost, float(result[4]))  # costs are running totals: keep the peak, never sum them
     print(f"save to {full_path}")
     avg_score /= len(results)
+    # total_cost is the peak running total; the last row is not trusted because row order need not match completion order
 
-    return round(avg_score, 5)
+    return round(avg_score, 5), round(total_cost, 5)  # (average score, total cost)
 
-async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
+async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
     data = await load_data(file_path, samples)
     results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
-    average_score = save_results_to_jsonl(results, path=path)
+    average_score, total_cost = save_results_to_jsonl(results, path=path)
     print(f"Average score on HumanEval dataset: {average_score:.5f}")
-    return average_score
+    print(f"Total Cost: {total_cost:.5f}")
+    return average_score, total_cost  # callers now receive (average score, total cost) instead of a bare score
diff --git a/examples/ags/benchmark/math.py b/examples/ags/benchmark/math.py
index 329237a51..2a6a5cf4c 100644
--- a/examples/ags/benchmark/math.py
+++ b/examples/ags/benchmark/math.py
@@ -221,14 +221,15 @@ async def load_data(file_path: str, samples: int = 200) -> List[dict]:
     data = [data[i] for i in random_indices]
     return data
 
-def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> float:
+def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
     df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
     average_score = df["score"].mean()
+    total_cost = df["cost"].max()  # rows follow task order, not completion order, so take the largest running total
 
     output_file = f"{path}/{average_score:.5f}.csv"
     df.to_csv(output_file, index=False)
     print(f"Results saved to {output_file}")
-    return average_score
+    return average_score, total_cost  # (mean of the score column, total cost)
 
 async def evaluate_problem(problem: dict, graph: Callable) -> Tuple[str, str, str, int, str]:
     input_text = problem["problem"]
@@ -269,9 +270,10 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
 
     return await tqdm_asyncio.gather(*tasks, desc="Evaluating MATH problems", total=len(data))
 
-async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
+async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
     data = await load_data(file_path, samples)
     results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
-    average_score = save_results_to_csv(results, path=path)
+    average_score, total_cost = save_results_to_csv(results, path=path)
     print(f"Average score on MATH dataset: {average_score:.5f}")
-    return average_score
+    print(f"Total Cost: {total_cost:.5f}")
+    return average_score, total_cost  # both values come straight from save_results_to_csv
diff --git a/examples/ags/benchmark/mbpp.py b/examples/ags/benchmark/mbpp.py
index 14075874a..b0722eb9c 100644
--- a/examples/ags/benchmark/mbpp.py
+++ b/examples/ags/benchmark/mbpp.py
@@ -51,13 +51,15 @@ async def check_solution(solution, test_cases, timeout=1):
 
     return FAIL, details
 
-async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int]:
+async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, float]:
     max_retries = 5
     retries = 0
 
     while retries < max_retries:
         try:
-            solution = await graph(data["prompt"]) if graph else "None"
+            cost = 0  # bound before the call so the final return never hits an unbound name when every attempt fails
+            prediction = await graph(data["prompt"]) if graph else ("None", 0)  # fallback mirrors graph's (solution, cost) shape
+            solution, cost = prediction[0], prediction[1]
             ret = await check_solution(solution, data["test_list"])
 
             score = 1 if ret[0] == PASS else 0
@@ -74,9 +76,9 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str,
                 score = 0
                 break
 
-    return data["prompt"], solution, ret[1], score
+    return data["prompt"], solution, ret[1], score, cost  # cost is returned as the fifth element
 
-async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int]]:
+async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int, str]]:
     semaphore = asyncio.Semaphore(max_concurrent_tasks)
 
     async def sem_evaluate(problem):
@@ -87,19 +89,20 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
 
     return await tqdm_asyncio.gather(*tasks, desc="Evaluating MBPP problems", total=len(data))
 
-def save_results_to_csv(results: List[Tuple[str, str, str, int]], path: str) -> float:
-    df = pd.DataFrame(results, columns=["question", "prediction", "test_case_details", "score"])
+def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
+    df = pd.DataFrame(results, columns=["question", "prediction", "test_case_details", "score", "cost"])
     average_score = df["score"].mean()
+    total_cost = df["cost"].max()  # rows follow task order, not completion order, so take the largest running total
 
     output_file = f"{path}/{average_score:.5f}.csv"
     df.to_csv(output_file, index=False)
     print(f"Results saved to {output_file}")
+    return average_score, total_cost  # (mean of the score column, total cost)
 
-    return average_score
-
-async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
+async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
     data = await load_data(file_path, samples)
     results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
-    average_score = save_results_to_csv(results, path=path)
+    average_score, total_cost = save_results_to_csv(results, path=path)
     print(f"Average score on MBPP dataset: {average_score:.5f}")
-    return average_score
+    print(f"Total Cost: {total_cost:.5f}")
+    return average_score, total_cost  # both values come straight from save_results_to_csv
diff --git a/examples/ags/experiments/baselines/cot_drop.py b/examples/ags/experiments/baselines/cot_drop.py
index d0d3ecb34..692856766 100644
--- a/examples/ags/experiments/baselines/cot_drop.py
+++ b/examples/ags/experiments/baselines/cot_drop.py
@@ -40,7 +40,7 @@ class CoTSolveGraph(SolveGraph):
 
     async def __call__(self, question: str, context: str) -> Tuple[str, str]:
         answer = await self.cot_generate(question, context, mode="context_fill")
-        return answer
+        return answer, self.llm.cost_manager.total_cost  # NOTE(review): total_cost is presumably a running total across calls, not per-call — confirm
 
 if __name__ == "__main__":
     async def main():
@@ -48,7 +48,7 @@ if __name__ == "__main__":
         # llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
         graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="DROP")
         file_path = "examples/ags/data/drop_dataset_dev.json"
-        samples = 3
+        samples = 1
         path = "examples/ags/data/baselines/general/drop"
         score = await drop_evaluation(graph, file_path, samples, path)
         return score
diff --git a/examples/ags/experiments/baselines/cot_gsm8k.py b/examples/ags/experiments/baselines/cot_gsm8k.py
index 3e08ff47d..9acc91346 100644
--- a/examples/ags/experiments/baselines/cot_gsm8k.py
+++ b/examples/ags/experiments/baselines/cot_gsm8k.py
@@ -49,25 +49,18 @@ if __name__ == "__main__":
         graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K")
         file_path = "examples/ags/data/gsm8k.jsonl"
         samples = 1055
-        # samples = 100
         path = "examples/ags/data/baselines/general"
-        score = await gsm8k_evaluation(graph, file_path, samples, path)
-        return score
+        score, cost = await gsm8k_evaluation(graph, file_path, samples, path)
+        return score, cost
 
     import asyncio
     asyncio.run(main())
 
 
-# self consistency operator; universal self consistency; 
+# medprompt operator; universal self consistency; 
 
 # IO指的没有任何Trick，看LLM自身的一个效果。使用 model 发布者在对应的 dataset 使用的 prompt。
 
 # deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106
 
-
-
-GENERATE_PROMPT = """
-Generate Solution for the following problem: {problem_description}
-"""
-
 # med ensemble 
\ No newline at end of file