添加了cost计算

This commit is contained in:
didi 2024-09-10 18:51:18 +08:00
parent 4ce18d7f48
commit ab112462a5
8 changed files with 81 additions and 67 deletions

View file

@ -112,7 +112,7 @@ def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]:
data = [data[i] for i in random_indices]
return data
async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float]:
async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float, str]:
def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str]:
if "number" in answer and answer["number"]:
return tuple([str(answer["number"])]), "number"
@ -133,6 +133,8 @@ async def evaluate_problem(question: str, passage: str, answers: List[Dict[str,
raise ValueError(f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}")
prediction = await graph(question, passage)
cost = prediction[1] # 添加这行来获取cost
prediction = prediction[0] # 修改这行以获取实际的预测结果
def get_f1_score(prediction: str, golden_answer: str) -> float:
predicted_bags = answer_to_bags(prediction)
@ -152,7 +154,7 @@ async def evaluate_problem(question: str, passage: str, answers: List[Dict[str,
max_score = score
best_answer = golden_answer
return best_answer, prediction, max_score
return best_answer, prediction, max_score, cost # 修改返回值以包含cost
async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], graph: Callable, max_concurrent_tasks: int = 50) -> List[List[Any]]:
semaphore = asyncio.Semaphore(max_concurrent_tasks)
@ -166,27 +168,29 @@ async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], g
answers = [qa_pair["answer"]]
if "validated_answers" in qa_pair and qa_pair["validated_answers"]:
answers.extend(qa_pair["validated_answers"])
best_answer, prediction, score = await evaluate_problem(question, passage, answers, graph)
results.append([id, question, prediction, best_answer, score])
best_answer, prediction, score, cost = await evaluate_problem(question, passage, answers, graph)
results.append([id, question, prediction, best_answer, score, cost]) # 修改这行以包含cost
tasks = [sem_evaluate(id, annotation) for id, annotation in annotations]
await tqdm_asyncio.gather(*tasks, desc="Evaluating DROP passages", total=len(annotations))
return results
def save_results_to_csv(results: List[List[Any]], path: str) -> float:
df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score"])
def save_results_to_csv(results: List[List[Any]], path: str) -> Tuple[float, float]:
df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score", "cost"])
average_score = df["score"].mean()
total_cost = df["cost"].iloc[-1] # 添加这行来计算总cost
output_file = f"{path}/{average_score:.5f}.csv"
df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")
return average_score
return average_score, total_cost # 修改返回值以包含total_cost
async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
data = load_data(file_path, samples)
results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20)
average_score = save_results_to_csv(results, path=path)
average_score, total_cost = save_results_to_csv(results, path=path)
print(f"Average score on DROP dataset: {average_score:.5f}")
return average_score
print(f"Total Cost: {total_cost:.5f}")
return average_score, total_cost # 修改返回值以包含total_cost

View file

@ -14,7 +14,7 @@ from tqdm.asyncio import tqdm_asyncio
from examples.ags.benchmark.utils import generate_random_indices
def extract_number(text: str) -> Optional[float]:
"""清理文本并提取单个数字"""
"""Clean text and extract a single number"""
matches = re.findall(r"[-+]?\d+(?:,\d{3})*(?:\.\d+)?|\d+\.\d+", text)
if matches:
last_number = matches[-1].replace(",", "")
@ -26,7 +26,7 @@ def extract_number(text: str) -> Optional[float]:
return None
def loose_match_score(expected_output: str, prediction: str, tolerance: float = 1e-6) -> int:
"""宽松匹配分数计算函数"""
"""Loose match score calculation function"""
expected_number = extract_number(expected_output)
predicted_number = extract_number(prediction)
@ -48,18 +48,19 @@ async def load_data(file_path: str, samples=1) -> List[dict]:
data = [data[i] for i in random_indices]
return data
def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> float:
"""保存结果到CSV文件"""
def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
"""Save results to CSV file"""
df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
average_score = df["score"].mean()
total_cost = df["cost"].iloc[-1]
output_file = f"{path}/{average_score:.5f}.csv"
df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")
return average_score
return average_score, total_cost
async def evaluate_problem(input: str, graph: Callable, expected_output: str) -> Tuple[str, str, str, int, str]:
"""评估单个问题"""
"""Evaluate a single problem"""
prompt = input
max_retries = 5
retries = 0
@ -87,7 +88,7 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
return input, output, expected_output, score, cost
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 20) -> List[Tuple[str, str, str, int, str]]:
"""评估所有问题"""
"""Evaluate all problems"""
semaphore = asyncio.Semaphore(max_concurrent_tasks)
async def sem_evaluate(problem):
@ -100,11 +101,11 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
return await tqdm_asyncio.gather(*tasks, desc="Evaluating problems", total=len(data))
async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
"""GSM8K评估主函数"""
async def gsm8k_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
"""GSM8K evaluation main function"""
data = await load_data(file_path, samples)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=5)
print(results)
average_score = save_results_to_csv(results, path=path)
average_score, total_cost = save_results_to_csv(results, path=path)
print(f"Average score: {average_score:.5f}")
return average_score
print(f"Total Cost: {total_cost:.5f}")
return average_score, total_cost

View file

@ -120,7 +120,7 @@ async def evaluate_problem(input: str, context_str: str, graph: Callable, expect
while retries < max_retries:
try:
prediction, supporting_sentences = await graph(input, context_str) if graph else "None"
prediction, supporting_sentences, cost = await graph(input, context_str) if graph else ("None", None, 0)
predicted_bags = answer_to_bags(prediction)
gold_bags = answer_to_bags(expected_output)
@ -137,9 +137,10 @@ async def evaluate_problem(input: str, context_str: str, graph: Callable, expect
prediction = None
supporting_sentences = None
score = 0
cost = 0
break
return input, prediction, expected_output, supporting_sentences, score
return input, prediction, expected_output, supporting_sentences, score, cost
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50):
semaphore = asyncio.Semaphore(max_concurrent_tasks)
@ -156,21 +157,23 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
return await tqdm_asyncio.gather(*tasks, desc="Evaluating HotpotQA problems", total=len(data))
def save_results_to_csv(results: List[Tuple[str, str, str, str, float]], path: str) -> float:
def save_results_to_csv(results: List[Tuple[str, str, str, str, float, str]], path: str) -> Tuple[float, float]:
df = pd.DataFrame(
results, columns=["question", "prediction", "expected_output", "supporting_sentences", "score"]
results, columns=["question", "prediction", "expected_output", "supporting_sentences", "score", "cost"]
)
average_score = df["score"].mean()
total_cost = df["cost"].iloc[-1]
output_file = f"{path}/{average_score:.5f}.csv"
df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")
return average_score
return average_score, total_cost
async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
data = await load_data(file_path, samples)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
average_score = save_results_to_csv(results, path=path)
average_score, total_cost = save_results_to_csv(results, path=path)
print(f"Average score on HotpotQA dataset: {average_score:.5f}")
return average_score
print(f"Total Cost: {total_cost:.5f}")
return average_score, total_cost

View file

@ -46,13 +46,15 @@ async def check_solution(solution, test_cases, entry_point):
return FAIL, details
async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int]:
async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, str]:
max_retries = 5
retries = 0
while retries < max_retries:
try:
solution = await graph(data["prompt"]) if graph else "None"
prediction = await graph(data["prompt"]) if graph else "None"
cost = prediction[1] # 添加这行来获取cost
solution = prediction[0] # 修改这行以获取实际的预测结果
ret = await check_solution(solution, data["test_cases"], data["entry_point"])
score = 1 if ret[0] == PASS else 0
@ -67,11 +69,12 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str,
solution = None
ret = (FAIL, [])
score = 0
cost = 0 # 添加这行来处理错误情况下的cost
break
return data["prompt"], solution, ret[1], score
return data["prompt"], solution, ret[1], score, cost # 修改返回值以包含cost
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int]]:
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int, str]]:
semaphore = asyncio.Semaphore(max_concurrent_tasks)
async def sem_evaluate(problem):
@ -86,8 +89,9 @@ import os
import time
import json
def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -> float:
def save_results_to_jsonl(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
avg_score = 0
total_cost = 0 # 添加这行来计算总cost
timestamp = int(time.time())
filename = f"humaneval_results_{timestamp}.jsonl"
full_path = os.path.join(path, filename)
@ -101,19 +105,23 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int]], path: str) -
"prediction": result[1],
"test_case_details": result[2],
"score": result[3],
"cost": result[4], # 添加这行来包含cost
}
)
+ "\n"
)
avg_score += result[3]
total_cost += float(result[4]) # 添加这行来累加cost
print(f"save to {full_path}")
avg_score /= len(results)
total_cost = results[-1][4] # 使用最后一个结果的cost作为总cost
return round(avg_score, 5)
return round(avg_score, 5), round(total_cost, 5) # 修改返回值以包含total_cost
async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
data = await load_data(file_path, samples)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
average_score = save_results_to_jsonl(results, path=path)
average_score, total_cost = save_results_to_jsonl(results, path=path)
print(f"Average score on HumanEval dataset: {average_score:.5f}")
return average_score
print(f"Total Cost: {total_cost:.5f}")
return average_score, total_cost # 修改返回值以包含total_cost

View file

@ -221,14 +221,15 @@ async def load_data(file_path: str, samples: int = 200) -> List[dict]:
data = [data[i] for i in random_indices]
return data
def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> float:
def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
df = pd.DataFrame(results, columns=["question", "prediction", "expected_output", "score", "cost"])
average_score = df["score"].mean()
total_cost = df["cost"].iloc[-1]
output_file = f"{path}/{average_score:.5f}.csv"
df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")
return average_score
return average_score, total_cost
async def evaluate_problem(problem: dict, graph: Callable) -> Tuple[str, str, str, int, str]:
input_text = problem["problem"]
@ -269,9 +270,10 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
return await tqdm_asyncio.gather(*tasks, desc="Evaluating MATH problems", total=len(data))
async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
data = await load_data(file_path, samples)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
average_score = save_results_to_csv(results, path=path)
average_score, total_cost = save_results_to_csv(results, path=path)
print(f"Average score on MATH dataset: {average_score:.5f}")
return average_score
print(f"Total Cost: {total_cost:.5f}")
return average_score, total_cost

View file

@ -51,13 +51,15 @@ async def check_solution(solution, test_cases, timeout=1):
return FAIL, details
async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int]:
async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str, int, str]:
max_retries = 5
retries = 0
while retries < max_retries:
try:
solution = await graph(data["prompt"]) if graph else "None"
prediction = await graph(data["prompt"]) if graph else "None"
cost = prediction[1]
solution = prediction[0]
ret = await check_solution(solution, data["test_list"])
score = 1 if ret[0] == PASS else 0
@ -74,9 +76,9 @@ async def evaluate_problem(data: dict, graph: Callable) -> Tuple[str, str, str,
score = 0
break
return data["prompt"], solution, ret[1], score
return data["prompt"], solution, ret[1], score, cost
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int]]:
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50) -> List[Tuple[str, str, str, int, str]]:
semaphore = asyncio.Semaphore(max_concurrent_tasks)
async def sem_evaluate(problem):
@ -87,19 +89,20 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
return await tqdm_asyncio.gather(*tasks, desc="Evaluating MBPP problems", total=len(data))
def save_results_to_csv(results: List[Tuple[str, str, str, int]], path: str) -> float:
df = pd.DataFrame(results, columns=["question", "prediction", "test_case_details", "score"])
def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str) -> Tuple[float, float]:
df = pd.DataFrame(results, columns=["question", "prediction", "test_case_details", "score", "cost"])
average_score = df["score"].mean()
total_cost = df["cost"].iloc[-1]
output_file = f"{path}/{average_score:.5f}.csv"
df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")
return average_score, total_cost
return average_score
async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
data = await load_data(file_path, samples)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
average_score = save_results_to_csv(results, path=path)
average_score, total_cost = save_results_to_csv(results, path=path)
print(f"Average score on MBPP dataset: {average_score:.5f}")
return average_score
print(f"Total Cost: {total_cost:.5f}")
return average_score, total_cost

View file

@ -40,7 +40,7 @@ class CoTSolveGraph(SolveGraph):
async def __call__(self, question: str, context: str) -> Tuple[str, str]:
answer = await self.cot_generate(question, context, mode="context_fill")
return answer
return answer, self.llm.cost_manager.total_cost
if __name__ == "__main__":
async def main():
@ -48,7 +48,7 @@ if __name__ == "__main__":
# llm_config = ModelsConfig.default().get("gpt-35-turbo-1106")
graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="DROP")
file_path = "examples/ags/data/drop_dataset_dev.json"
samples = 3
samples = 1
path = "examples/ags/data/baselines/general/drop"
score = await drop_evaluation(graph, file_path, samples, path)
return score

View file

@ -49,25 +49,18 @@ if __name__ == "__main__":
graph = CoTSolveGraph(name="CoT", llm_config=llm_config, dataset="Gsm8K")
file_path = "examples/ags/data/gsm8k.jsonl"
samples = 1055
# samples = 100
path = "examples/ags/data/baselines/general"
score = await gsm8k_evaluation(graph, file_path, samples, path)
return score
score, cost = await gsm8k_evaluation(graph, file_path, samples, path)
return score, cost
import asyncio
asyncio.run(main())
# self consistency operator; universal self consistency;
# medprompt operator; universal self consistency;
# IO指的没有任何Trick看LLM自身的一个效果。使用 model 发布者在对应的 dataset 使用的 prompt。
# deepseek-chat; gpt-4o-mini; gpt-35-turbo-1106
GENERATE_PROMPT = """
Generate Solution for the following problem: {problem_description}
"""
# med ensemble