diff --git a/examples/ags/benchmark/drop.py b/examples/ags/benchmark/drop.py
index 3aa82e8c9..e69de29bb 100644
--- a/examples/ags/benchmark/drop.py
+++ b/examples/ags/benchmark/drop.py
@@ -1,362 +0,0 @@
-import asyncio
-import json
-import re
-import string
-from typing import Any, Callable, Dict, List, Set, Tuple, Union
-
-import numpy as np
-import pandas as pd
-from scipy.optimize import linear_sum_assignment
-from tqdm.asyncio import tqdm_asyncio
-
-from examples.ags.benchmark.utils import generate_random_indices
-
-
-# From here through _normalize_answer was originally copied from:
-# https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
-# Then cleaned up and modified a bit.
-def _remove_articles(text: str) -> str:
-    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
-    return re.sub(regex, " ", text)
-
-
-def _white_space_fix(text: str) -> str:
-    return " ".join(text.split())
-
-
-EXCLUDE = set(string.punctuation)
-
-
-def _is_number(text: str) -> bool:
-    try:
-        float(text)
-        return True
-    except ValueError:
-        return False
-
-
-def _normalize_number(text: str) -> str:
-    if _is_number(text):
-        return str(float(text))
-    else:
-        return text
-
-
-def _remove_punc(text: str) -> str:
-    if not _is_number(text):
-        return "".join(ch for ch in text if ch not in EXCLUDE)
-    else:
-        return text
-
-
-def _lower(text: str) -> str:
-    return text.lower()
-
-
-def _tokenize(text: str) -> List[str]:
-    return re.split(" |-", text)
-
-
-def _normalize_answer(text: str) -> str:
-    """Lower text and remove punctuation, articles and extra whitespace."""
-    parts = [
-        _white_space_fix(_remove_articles(_normalize_number(_remove_punc(_lower(token)))))
-        for token in _tokenize(text)
-    ]
-    parts = [part for part in parts if part.strip()]
-    normalized = " ".join(parts).strip()
-    return normalized
-
-
-def _answer_to_bags(
-    answer: Union[str, List[str], Tuple[str, ...]]
-) -> Tuple[List[str], List[Set[str]]]:
-    if isinstance(answer, (list, tuple)):
-        raw_spans = answer
-    else:
-        raw_spans = [answer]
-    normalized_spans: List[str] = []
-    token_bags = []
-    for raw_span in raw_spans:
-        normalized_span = _normalize_answer(raw_span)
-        normalized_spans.append(normalized_span)
-        token_bags.append(set(normalized_span.split()))
-    return normalized_spans, token_bags
- """ - scores = np.zeros([len(gold), len(predicted)]) - for gold_index, gold_item in enumerate(gold): - for pred_index, pred_item in enumerate(predicted): - if _match_numbers_if_present(gold_item, pred_item): - scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item) - row_ind, col_ind = linear_sum_assignment(-scores) - - max_scores = np.zeros([max(len(gold), len(predicted))]) - for row, column in zip(row_ind, col_ind): - max_scores[row] = max(max_scores[row], scores[row, column]) - return max_scores - - -def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float: - intersection = len(gold_bag.intersection(predicted_bag)) - if not predicted_bag: - precision = 1.0 - else: - precision = intersection / float(len(predicted_bag)) - if not gold_bag: - recall = 1.0 - else: - recall = intersection / float(len(gold_bag)) - f1 = ( - (2 * precision * recall) / (precision + recall) - if not (precision == 0.0 and recall == 0.0) - else 0.0 - ) - return f1 - -def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool: - gold_numbers = set() - predicted_numbers = set() - for word in gold_bag: - if _is_number(word): - gold_numbers.add(word) - for word in predicted_bag: - if _is_number(word): - predicted_numbers.add(word) - if (not gold_numbers) or gold_numbers.intersection(predicted_numbers): - return True - return False - -def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float: - intersection = len(gold_bag.intersection(predicted_bag)) - if not predicted_bag: - precision = 1.0 - else: - precision = intersection / float(len(predicted_bag)) - if not gold_bag: - recall = 1.0 - else: - recall = intersection / float(len(gold_bag)) - f1 = ( - (2 * precision * recall) / (precision + recall) - if not (precision == 0.0 and recall == 0.0) - else 0.0 - ) - return f1 - -def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]: - """ - Takes gold and predicted answer sets and first finds the optimal 1-1 alignment - between them and gets maximum metric values over all the answers. - """ - scores = np.zeros([len(gold), len(predicted)]) - for gold_index, gold_item in enumerate(gold): - for pred_index, pred_item in enumerate(predicted): - if _match_numbers_if_present(gold_item, pred_item): - scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item) - row_ind, col_ind = linear_sum_assignment(-scores) - - max_scores = np.zeros([max(len(gold), len(predicted))]) - for row, column in zip(row_ind, col_ind): - max_scores[row] = max(max_scores[row], scores[row, column]) - return max_scores - -def get_metrics( - predicted: Union[str, List[str], Tuple[str, ...]], gold: Union[str, List[str], Tuple[str, ...]] -) -> Tuple[float, float]: - """ - Takes a predicted answer and a gold answer (that are both either a string or a list of - strings), and returns exact match and the DROP F1 metric for the prediction. If you are - writing a script for evaluating objects in memory (say, the output of predictions during - validation, or while training), this is the function you want to call, after using - :func:`answer_json_to_strings` when reading the gold answer from the released data file. 
- """ - predicted_bags = _answer_to_bags(predicted) - gold_bags = _answer_to_bags(gold) - - if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]): - exact_match = 1.0 - else: - exact_match = 0.0 - - f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1]) - f1 = np.mean(f1_per_bag) - f1 = round(f1, 2) - return exact_match, f1 - -def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str]: - """ - Takes an answer JSON blob from the DROP data release and converts it into strings used for - evaluation. - """ - if "number" in answer and answer["number"]: - return tuple([str(answer["number"])]), "number" - elif "spans" in answer and answer["spans"]: - return tuple(answer["spans"]), "span" if len(answer["spans"]) == 1 else "spans" - elif "date" in answer: - return ( - tuple( - [ - "{0} {1} {2}".format( - answer["date"]["day"], answer["date"]["month"], answer["date"]["year"] - ) - ] - ), - "date", - ) - else: - raise ValueError( - f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}" - ) - -def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]: - with open(file_path, mode="r") as file: - data = json.load(file) - data = list(data.items()) - - random_indices = generate_random_indices(len(data), samples) - data = [data[i] for i in random_indices] - return data - -async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float]: - - max_retries = 5 - retries = 0 - - while retries < max_retries: - try: - prediction = await graph(question, passage) - - - max_score = 0.0 - best_answer = None - for answer in answers: - golden_answer, _ = answer_json_to_strings(answer) - golden_answer = golden_answer[0] - score = get_f1_score(prediction, golden_answer) - if score > max_score: - max_score = score - best_answer = golden_answer - - return best_answer, prediction, max_score, cost # 修改返回值以包含cost - -async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], graph: Callable, max_concurrent_tasks: int = 50) -> List[List[Any]]: - semaphore = asyncio.Semaphore(max_concurrent_tasks) - results = [] - - async def sem_evaluate(id: str, annotation: Dict[str, Any]): - async with semaphore: - passage = annotation["passage"] - for qa_pair in annotation["qa_pairs"]: - question = qa_pair["question"] - answers = [qa_pair["answer"]] - if "validated_answers" in qa_pair and qa_pair["validated_answers"]: - answers += qa_pair["validated_answers"] - best_answer, prediction, score, cost = await evaluate_problem(question, passage, answers, graph) - results.append([id, question, prediction, best_answer, score, cost]) # 修改这行以包含cost - - tasks = [sem_evaluate(id, annotation) for id, annotation in annotations] - await tqdm_asyncio.gather(*tasks, desc="Evaluating DROP passages", total=len(annotations)) - - return results - -def save_results_to_csv(results: List[List[Any]], path: str) -> Tuple[float, float]: - df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score", "cost"]) - average_score = df["score"].mean() - total_cost = df["cost"].iloc[-1] # 添加这行来计算总cost - - output_file = f"{path}/{average_score:.5f}.csv" - df.to_csv(output_file, index=False) - print(f"Results saved to {output_file}") - - return average_score, total_cost # 修改返回值以包含total_cost - -async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]: - data = load_data(file_path, samples) - 
diff --git a/examples/ags/benchmark/hotpotqa.py b/examples/ags/benchmark/hotpotqa.py
index 185ada56a..e69de29bb 100644
--- a/examples/ags/benchmark/hotpotqa.py
+++ b/examples/ags/benchmark/hotpotqa.py
@@ -1,126 +0,0 @@
-import asyncio
-import json
-import re
-import string
-from collections import Counter
-from typing import Callable, List, Tuple
-
-import aiofiles
-import pandas as pd
-from tqdm.asyncio import tqdm_asyncio
-
-from examples.ags.benchmark.utils import generate_random_indices
-
-
-def normalize_answer(s):
-    """
-    Normalize answers for evaluation.
-    """
-
-    def remove_articles(text):
-        return re.sub(r"\b(a|an|the)\b", " ", text)
-
-    def white_space_fix(text):
-        return " ".join(text.split())
-
-    def remove_punc(text):
-        exclude = set(string.punctuation)
-        return "".join(ch for ch in text if ch not in exclude)
-
-    def lower(text):
-        return text.lower()
-
-    return white_space_fix(remove_articles(remove_punc(lower(s))))
-
-
-def f1_score(prediction, ground_truth):
-    """
-    Compute the F1 score between prediction and ground truth answers.
-    """
-    prediction_tokens = normalize_answer(prediction).split()
-    ground_truth_tokens = normalize_answer(ground_truth).split()
-    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
-    num_same = sum(common.values())
-    if num_same == 0:
-        return 0
-    precision = 1.0 * num_same / len(prediction_tokens)
-    recall = 1.0 * num_same / len(ground_truth_tokens)
-    f1 = (2 * precision * recall) / (precision + recall)
-    return f1
-
-
-async def load_data(file_path: str, samples=20, total_length=1000) -> List[dict]:
-    data = []
-    async with aiofiles.open(file_path, mode="r") as file:
-        async for line in file:
-            data.append(json.loads(line))
-    data = data[:total_length]
-    random_indices = generate_random_indices(len(data), samples)
-    data = [data[i] for i in random_indices]
-    return data
-
-
-async def evaluate_problem(input: str, context_str: str, graph: Callable, expected_output: str):
-    max_retries = 5
-    retries = 0
-
-    while retries < max_retries:
-        try:
-            prediction, cost = await graph(input, context_str) if graph else ("None", 0)
-            score = f1_score(prediction, expected_output)
-            break
-        except Exception as e:
-            retries += 1
-            print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
-
-            if retries == max_retries:
-                print("Maximum retries reached. Skipping this sample.")
-                prediction = None
-                score = 0
-                cost = 0
-                break
-
-    return input, prediction, expected_output, score, cost
-
-
-async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50):
-    semaphore = asyncio.Semaphore(max_concurrent_tasks)
-
-    async def sem_evaluate(problem):
-        async with semaphore:
-            input_text = problem["question"]
-            expected_output = problem["answer"]
-            paragraphs = [item[1] for item in problem["context"] if isinstance(item[1], list)]
-            context_str = "\n".join(" ".join(paragraph) for paragraph in paragraphs)
-            return await evaluate_problem(input_text, context_str, graph, expected_output)
-
-    tasks = [sem_evaluate(problem) for problem in data]
-
-    return await tqdm_asyncio.gather(*tasks, desc="Evaluating HotpotQA problems", total=len(data))
-
-
-def save_results_to_csv(results: List[Tuple[str, str, str, float, float]], path: str) -> Tuple[float, float]:
-    df = pd.DataFrame(
-        results, columns=["question", "prediction", "expected_output", "score", "cost"]
-    )
-    average_score = df["score"].mean()
-    total_cost = df["cost"].iloc[-1]  # the cost column is cumulative, so the last row holds the total
-
-    output_file = f"{path}/{average_score:.5f}.csv"
-    df.to_csv(output_file, index=False)
-    print(f"Results saved to {output_file}")
-
-    return average_score, total_cost
-
-
-async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
-    data = await load_data(file_path, samples)
-    results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
-    average_score, total_cost = save_results_to_csv(results, path=path)
-    print(f"Average score on HotpotQA dataset: {average_score:.5f}")
-    print(f"Total Cost: {total_cost:.5f}")
-    return average_score, total_cost
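The deleted hotpotqa.py uses a different F1 from drop.py: a flat token-level F1 over Counter multisets, so a repeated token is credited once per shared occurrence rather than once per bag, and there is no span alignment. A minimal sketch with illustrative strings, with normalization reduced to lowercasing for brevity (the real code also strips punctuation and articles):

    from collections import Counter

    def token_f1(prediction: str, ground_truth: str) -> float:
        # Multiset intersection: a token matches once per shared occurrence.
        pred_tokens = prediction.lower().split()
        gold_tokens = ground_truth.lower().split()
        num_same = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
        if num_same == 0:
            return 0.0
        precision = num_same / len(pred_tokens)
        recall = num_same / len(gold_tokens)
        return 2 * precision * recall / (precision + recall)

    print(token_f1("big apple", "the big apple"))  # 0.8 (2 shared tokens out of 2 and 3)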
diff --git a/examples/ags/experiments/baselines/cot_hotpotqa.py b/examples/ags/experiments/baselines/cot_hotpotqa.py
index b3a919fff..e9f5592de 100644
--- a/examples/ags/experiments/baselines/cot_hotpotqa.py
+++ b/examples/ags/experiments/baselines/cot_hotpotqa.py
@@ -40,7 +40,7 @@ class CoTSolveGraph(SolveGraph):
     async def __call__(self, question: str, context: str) -> Tuple[str, str]:
         answer = await self.cot_generate(question, context, mode="context_fill")
-        return answer
+        return answer, self.llm.cost_manager.total_cost
 
 
 if __name__ == "__main__":
     async def main():
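Both evaluators, and the cot_hotpotqa.py change above, assume the same contract: the graph callable returns (answer, running_total_cost), where the cost is cumulative (llm.cost_manager.total_cost grows across calls). That is why save_results_to_csv reads df["cost"].iloc[-1] rather than summing the column. A sketch of a conforming stub; the class and the per-call cost are hypothetical, for illustration only:

    import asyncio
    from typing import Tuple

    class StubGraph:
        """Hypothetical stand-in for a SolveGraph; not a MetaGPT API."""

        def __init__(self) -> None:
            self.total_cost = 0.0  # mirrors a cumulative cost_manager.total_cost

        async def __call__(self, question: str, context: str) -> Tuple[str, float]:
            self.total_cost += 0.001  # assumed per-call cost, illustrative only
            return f"stub answer to: {question}", self.total_cost

    async def main() -> None:
        graph = StubGraph()
        for q in ["Who wrote Hamlet?", "Where is the Louvre?"]:
            answer, cost = await graph(q, "some context")
            print(answer, cost)  # cost is the running total, not a per-call delta

    asyncio.run(main())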