mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-14 15:25:17 +02:00
添加了cost计算
This commit is contained in:
parent
4ce18d7f48
commit
ab112462a5
8 changed files with 81 additions and 67 deletions
|
|
@ -112,7 +112,7 @@ def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]:
|
|||
data = [data[i] for i in random_indices]
|
||||
return data
|
||||
|
||||
async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float]:
|
||||
async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float, str]:
|
||||
def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str]:
|
||||
if "number" in answer and answer["number"]:
|
||||
return tuple([str(answer["number"])]), "number"
|
||||
|
|
@ -133,6 +133,8 @@ async def evaluate_problem(question: str, passage: str, answers: List[Dict[str,
|
|||
raise ValueError(f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}")
|
||||
|
||||
prediction = await graph(question, passage)
|
||||
cost = prediction[1] # 添加这行来获取cost
|
||||
prediction = prediction[0] # 修改这行以获取实际的预测结果
|
||||
|
||||
def get_f1_score(prediction: str, golden_answer: str) -> float:
|
||||
predicted_bags = answer_to_bags(prediction)
|
||||
|
|
@ -152,7 +154,7 @@ async def evaluate_problem(question: str, passage: str, answers: List[Dict[str,
|
|||
max_score = score
|
||||
best_answer = golden_answer
|
||||
|
||||
return best_answer, prediction, max_score
|
||||
return best_answer, prediction, max_score, cost # 修改返回值以包含cost
|
||||
|
||||
async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], graph: Callable, max_concurrent_tasks: int = 50) -> List[List[Any]]:
|
||||
semaphore = asyncio.Semaphore(max_concurrent_tasks)
|
||||
|
|
@ -166,27 +168,29 @@ async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], g
|
|||
answers = [qa_pair["answer"]]
|
||||
if "validated_answers" in qa_pair and qa_pair["validated_answers"]:
|
||||
answers.extend(qa_pair["validated_answers"])
|
||||
best_answer, prediction, score = await evaluate_problem(question, passage, answers, graph)
|
||||
results.append([id, question, prediction, best_answer, score])
|
||||
best_answer, prediction, score, cost = await evaluate_problem(question, passage, answers, graph)
|
||||
results.append([id, question, prediction, best_answer, score, cost]) # 修改这行以包含cost
|
||||
|
||||
tasks = [sem_evaluate(id, annotation) for id, annotation in annotations]
|
||||
await tqdm_asyncio.gather(*tasks, desc="Evaluating DROP passages", total=len(annotations))
|
||||
|
||||
return results
|
||||
|
||||
def save_results_to_csv(results: List[List[Any]], path: str) -> float:
|
||||
df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score"])
|
||||
def save_results_to_csv(results: List[List[Any]], path: str) -> Tuple[float, float]:
|
||||
df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score", "cost"])
|
||||
average_score = df["score"].mean()
|
||||
total_cost = df["cost"].iloc[-1] # 添加这行来计算总cost
|
||||
|
||||
output_file = f"{path}/{average_score:.5f}.csv"
|
||||
df.to_csv(output_file, index=False)
|
||||
print(f"Results saved to {output_file}")
|
||||
|
||||
return average_score
|
||||
return average_score, total_cost # 修改返回值以包含total_cost
|
||||
|
||||
async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
|
||||
async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
|
||||
data = load_data(file_path, samples)
|
||||
results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20)
|
||||
average_score = save_results_to_csv(results, path=path)
|
||||
average_score, total_cost = save_results_to_csv(results, path=path)
|
||||
print(f"Average score on DROP dataset: {average_score:.5f}")
|
||||
return average_score
|
||||
print(f"Total Cost: {total_cost:.5f}")
|
||||
return average_score, total_cost # 修改返回值以包含total_cost
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ from tqdm.asyncio import tqdm_asyncio
|
|||
from examples.ags.benchmark.utils import generate_random_indices
|
||||
|
||||
def extract_number(text: str) -> Optional[float]:
|
||||
"""清理文本并提取单个数字"""
|
||||
"""Clean text and extract a single number"""
|
||||
matches = re.findall(r"[-+]?\d+(?:,\d{3})*(?:\.\d+)?|\d+\.\d+", text)
|
||||
if matches:
|
||||
last_number = matches[-1].replace(",", "")
|
||||
|
|
@ -26,7 +26,7 @@ def extract_number(text: str) -> Optional[float]:
|
|||
return None
|
||||
|
||||
def loose_match_score(expected_output: str, prediction: str, tolerance: float = 1e-6) -> int:
|
||||
"""宽松匹配分数计算函数"""
|
||||
"""Loose match score calculation function"""
|
||||
expected_number = extract_number(expected_output)
|
||||
predicted_number = extract_number(prediction)
|
||||
|
||||
|
|
@ -48,18 +48,19 @@ async def load_data(file_path: str, samples=1) -> List[dict]:
|
|||
data = [data[i] for i in random_indices]
|
||||
return data
|
||||
|
||||
def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> float:
|
||||
"""保存结果到CSV文件"""
|
||||
def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
|
||||
"""Save results to CSV file"""
|
||||
df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
|
||||
average_score = df["score"].mean()
|
||||
total_cost = df["cost"].iloc[-1]
|
||||
|
||||
output_file = f"{path}/{average_score:.5f}.csv"
|
||||
df.to_csv(output_file, index=False)
|
||||
print(f"Results saved to {output_file}")
|
||||
return average_score
|
||||
return average_score, total_cost
|
||||
|
||||
async def evaluate_problem(input: str, graph: Callable, expected_output: str) -> Tuple[str, str, str, int, str]:
|
||||
"""评估单个问题"""
|
||||
"""Evaluate a single problem"""
|
||||
prompt = input
|
||||
max_retries = 5
|
||||
retries = 0
|
||||
|
|
@ -87,7 +88,7 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
|
|||
return input, output, expected_output, score, cost
|
||||
|
||||
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 20) -> List[Tuple[str, str, str, int, str]]:
|
||||
"""评估所有问题"""
|
||||
"""Evaluate all problems"""
|
||||
semaphore = asyncio.Semaphore(max_concurrent_tasks)
|
||||
|
||||
async def sem_evaluate(problem):
|
||||
|
|
@ -100,11 +101,11 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
|
|||
|
||||
return await tqdm_asyncio.gather(*tasks, desc="Evaluating problems", total=len(data))
|
||||
|
||||
async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
|
||||
"""GSM8K评估主函数"""
|
||||
async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
|
||||
"""GSM8K evaluation main function"""
|
||||
data = await load_data(file_path, samples)
|
||||
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5)
|
||||
print(results)
|
||||
average_score = save_results_to_csv(results, path=path)
|
||||
average_score, total_cost = save_results_to_csv(results, path=path)
|
||||
print(f"Average score: {average_score:.5f}")
|
||||
return average_score
|
||||
print(f"Total Cost: {total_cost:.5f}")
|
||||
return average_score, total_cost
|
||||
|
|
|
|||
|
|
@ -120,7 +120,7 @@ async def evaluate_problem(input: str, context_str: str, graph: Callable, expect
|
|||
|
||||
while retries < max_retries:
|
||||
try:
|
||||
prediction, supporting_sentences = await graph(input, context_str) if graph else "None"
|
||||
prediction, supporting_sentences, cost = await graph(input, context_str) if graph else ("None", None, 0)
|
||||
predicted_bags = answer_to_bags(prediction)
|
||||
gold_bags = answer_to_bags(expected_output)
|
||||
|
||||
|
|
@ -137,9 +137,10 @@ async def evaluate_problem(input: str, context_str: str, graph: Callable, expect
|
|||
prediction = None
|
||||
supporting_sentences = None
|
||||
score = 0
|
||||
cost = 0
|
||||
break
|
||||
|
||||
return input, prediction, expected_output, supporting_sentences, score
|
||||
return input, prediction, expected_output, supporting_sentences, score, cost
|
||||
|
||||
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50):
|
||||
semaphore = asyncio.Semaphore(max_concurrent_tasks)
|
||||
|
|
@ -156,21 +157,23 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
|
|||
|
||||
return await tqdm_asyncio.gather(*tasks, desc="Evaluating HotpotQA problems", total=len(data))
|
||||
|
||||
def save_results_to_csv(results: List[Tuple[str, str, str, str, float]], path: str) -> float:
|
||||
def save_results_to_csv(results: List[Tuple[str, str, str, str, float, str]], path: str) -> Tuple[float, float]:
|
||||
df = pd.DataFrame(
|
||||
results, columns=["question", "prediction", "expected_output", "supporting_sentences", "score"]
|
||||
results, columns=["question", "prediction", "expected_output", "supporting_sentences", "score", "cost"]
|
||||
)
|
||||
average_score = df["score"].mean()
|
||||
total_cost = df["cost"].iloc[-1]
|
||||
|
||||
output_file = f"{path}/{average_score:.5f}.csv"
|
||||
df.to_csv(output_file, index=False)
|
||||
print(f"Results saved to {output_file}")
|
||||
|
||||
return average_score
|
||||
return average_score, total_cost
|
||||
|
||||
async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
|
||||
async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
|
||||
data = await load_data(file_path, samples)
|
||||
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
|
||||
average_score = save_results_to_csv(results, path=path)
|
||||
average_score, total_cost = save_results_to_csv(results, path=path)
|
||||
print(f"Average score on HotpotQA dataset: {average_score:.5f}")
|
||||
return average_score
|
||||
print(f"Total Cost: {total_cost:.5f}")
|
||||
return average_score, total_cost
|
||||
|
|
|
|||
|
|
@ -46,13 +46,15 @@ async def check_solution(solution, test_cases, entry_point):
|
|||
|
||||
return FAIL, details
|
||||
|
||||
async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int]:
|
||||
async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, str]:
|
||||
max_retries = 5
|
||||
retries = 0
|
||||
|
||||
while retries < max_retries:
|
||||
try:
|
||||
solution = await graph(data["prompt"]) if graph else "None"
|
||||
prediction = await graph(data["prompt"]) if graph else "None"
|
||||
cost = prediction[1] # 添加这行来获取cost
|
||||
solution = prediction[0] # 修改这行以获取实际的预测结果
|
||||
ret = await check_solution(solution, data["test_cases"], data["entry_point"])
|
||||
|
||||
score = 1 if ret[0] == PASS else 0
|
||||
|
|
@ -67,11 +69,12 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str,
|
|||
solution = None
|
||||
ret = (FAIL, [])
|
||||
score = 0
|
||||
cost = 0 # 添加这行来处理错误情况下的cost
|
||||
break
|
||||
|
||||
return data["prompt"], solution, ret[1], score
|
||||
return data["prompt"], solution, ret[1], score, cost # 修改返回值以包含cost
|
||||
|
||||
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int]]:
|
||||
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int, str]]:
|
||||
semaphore = asyncio.Semaphore(max_concurrent_tasks)
|
||||
|
||||
async def sem_evaluate(problem):
|
||||
|
|
@ -86,8 +89,9 @@ import os
|
|||
import time
|
||||
import json
|
||||
|
||||
def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -> float:
|
||||
def save_results_to_jsonl(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
|
||||
avg_score = 0
|
||||
total_cost = 0 # 添加这行来计算总cost
|
||||
timestamp = int(time.time())
|
||||
filename = f"humaneval_results_{timestamp}.jsonl"
|
||||
full_path = os.path.join(path, filename)
|
||||
|
|
@ -101,19 +105,23 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -
|
|||
"prediction": result[1],
|
||||
"test_case_details": result[2],
|
||||
"score": result[3],
|
||||
"cost": result[4], # 添加这行来包含cost
|
||||
}
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
avg_score += result[3]
|
||||
total_cost += float(result[4]) # 添加这行来累加cost
|
||||
print(f"save to {full_path}")
|
||||
avg_score /= len(results)
|
||||
total_cost = results[-1][4] # 使用最后一个结果的cost作为总cost
|
||||
|
||||
return round(avg_score, 5)
|
||||
return round(avg_score, 5), round(total_cost, 5) # 修改返回值以包含total_cost
|
||||
|
||||
async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
|
||||
async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
|
||||
data = await load_data(file_path, samples)
|
||||
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
|
||||
average_score = save_results_to_jsonl(results, path=path)
|
||||
average_score, total_cost = save_results_to_jsonl(results, path=path)
|
||||
print(f"Average score on HumanEval dataset: {average_score:.5f}")
|
||||
return average_score
|
||||
print(f"Total Cost: {total_cost:.5f}")
|
||||
return average_score, total_cost # 修改返回值以包含total_cost
|
||||
|
|
|
|||
|
|
@ -221,14 +221,15 @@ async def load_data(file_path: str, samples: int = 200) -> List[dict]:
|
|||
data = [data[i] for i in random_indices]
|
||||
return data
|
||||
|
||||
def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> float:
|
||||
def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
|
||||
df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
|
||||
average_score = df["score"].mean()
|
||||
total_cost = df["cost"].iloc[-1]
|
||||
|
||||
output_file = f"{path}/{average_score:.5f}.csv"
|
||||
df.to_csv(output_file, index=False)
|
||||
print(f"Results saved to {output_file}")
|
||||
return average_score
|
||||
return average_score, total_cost
|
||||
|
||||
async def evaluate_problem(problem: dict, graph: Callable) -> Tuple[str, str, str, int, str]:
|
||||
input_text = problem["problem"]
|
||||
|
|
@ -269,9 +270,10 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
|
|||
|
||||
return await tqdm_asyncio.gather(*tasks, desc="Evaluating MATH problems", total=len(data))
|
||||
|
||||
async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
|
||||
async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
|
||||
data = await load_data(file_path, samples)
|
||||
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
|
||||
average_score = save_results_to_csv(results, path=path)
|
||||
average_score, total_cost = save_results_to_csv(results, path=path)
|
||||
print(f"Average score on MATH dataset: {average_score:.5f}")
|
||||
return average_score
|
||||
print(f"Total Cost: {total_cost:.5f}")
|
||||
return average_score, total_cost
|
||||
|
|
|
|||
|
|
@ -51,13 +51,15 @@ async def check_solution(solution, test_cases, timeout=1):
|
|||
|
||||
return FAIL, details
|
||||
|
||||
async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int]:
|
||||
async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, str]:
|
||||
max_retries = 5
|
||||
retries = 0
|
||||
|
||||
while retries < max_retries:
|
||||
try:
|
||||
solution = await graph(data["prompt"]) if graph else "None"
|
||||
prediction = await graph(data["prompt"]) if graph else "None"
|
||||
cost = prediction[1]
|
||||
solution = prediction[0]
|
||||
ret = await check_solution(solution, data["test_list"])
|
||||
|
||||
score = 1 if ret[0] == PASS else 0
|
||||
|
|
@ -74,9 +76,9 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str,
|
|||
score = 0
|
||||
break
|
||||
|
||||
return data["prompt"], solution, ret[1], score
|
||||
return data["prompt"], solution, ret[1], score, cost
|
||||
|
||||
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int]]:
|
||||
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int, str]]:
|
||||
semaphore = asyncio.Semaphore(max_concurrent_tasks)
|
||||
|
||||
async def sem_evaluate(problem):
|
||||
|
|
@ -87,19 +89,20 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
|
|||
|
||||
return await tqdm_asyncio.gather(*tasks, desc="Evaluating MBPP problems", total=len(data))
|
||||
|
||||
def save_results_to_csv(results: List[Tuple[str, str, str, int]], path: str) -> float:
|
||||
df = pd.DataFrame(results, columns=["question", "prediction", "test_case_details", "score"])
|
||||
def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
|
||||
df = pd.DataFrame(results, columns=["question", "prediction", "test_case_details", "score", "cost"])
|
||||
average_score = df["score"].mean()
|
||||
total_cost = df["cost"].iloc[-1]
|
||||
|
||||
output_file = f"{path}/{average_score:.5f}.csv"
|
||||
df.to_csv(output_file, index=False)
|
||||
print(f"Results saved to {output_file}")
|
||||
return average_score, total_cost
|
||||
|
||||
return average_score
|
||||
|
||||
async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
|
||||
async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
|
||||
data = await load_data(file_path, samples)
|
||||
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
|
||||
average_score = save_results_to_csv(results, path=path)
|
||||
average_score, total_cost = save_results_to_csv(results, path=path)
|
||||
print(f"Average score on MBPP dataset: {average_score:.5f}")
|
||||
return average_score
|
||||
print(f"Total Cost: {total_cost:.5f}")
|
||||
return average_score, total_cost
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ class CoTSolveGraph(SolveGraph):
|
|||
|
||||
async def __call__(self, question: str, context: str) -> Tuple[str, str]:
|
||||
answer = await self.cot_generate(question, context, mode="context_fill")
|
||||
return answer
|
||||
return answer, self.llm.cost_manager.total_cost
|
||||
|
||||
if __name__ == "__main__":
|
||||
async def main():
|
||||
|
|
@ -48,7 +48,7 @@ if __name__ == "__main__":
|
|||
# llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
|
||||
graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="DROP")
|
||||
file_path = "examples/ags/data/drop_dataset_dev.json"
|
||||
samples = 3
|
||||
samples = 1
|
||||
path = "examples/ags/data/baselines/general/drop"
|
||||
score = await drop_evaluation(graph, file_path, samples, path)
|
||||
return score
|
||||
|
|
|
|||
|
|
@ -49,25 +49,18 @@ if __name__ == "__main__":
|
|||
graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K")
|
||||
file_path = "examples/ags/data/gsm8k.jsonl"
|
||||
samples = 1055
|
||||
# samples = 100
|
||||
path = "examples/ags/data/baselines/general"
|
||||
score = await gsm8k_evaluation(graph, file_path, samples, path)
|
||||
return score
|
||||
score, cost = await gsm8k_evaluation(graph, file_path, samples, path)
|
||||
return score, cost
|
||||
|
||||
import asyncio
|
||||
asyncio.run(main())
|
||||
|
||||
|
||||
# self consistency operator; universal self consistency;
|
||||
# medprompt operator; universal self consistency;
|
||||
|
||||
# IO指的没有任何Trick,看LLM自身的一个效果。使用 model 发布者在对应的 dataset 使用的 prompt。
|
||||
|
||||
# deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106
|
||||
|
||||
|
||||
|
||||
GENERATE_PROMPT = """
|
||||
Generate Solution for the following problem: {problem_description}
|
||||
"""
|
||||
|
||||
# med ensemble
|
||||
Loading…
Add table
Add a link
Reference in a new issue