更新了eval索引的入口

This commit is contained in:
didi 2024-09-11 17:53:52 +08:00
parent b805da0bbe
commit 0704f341de
6 changed files with 36 additions and 44 deletions

View file

@@ -210,12 +210,12 @@ def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str
f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}"
)
def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]:
def load_data(file_path: str, samples: int, test=False) -> List[Tuple[str, Dict[str, Any]]]:
with open(file_path, mode="r") as file:
data = json.load(file)
data = list(data.items())
random_indices = generate_random_indices(len(data), samples)
random_indices = generate_random_indices(len(data), samples, test)
data = [data[i] for i in random_indices]
return data
@@ -287,8 +287,8 @@ def save_results_to_csv(results: List[List[Any]], path: str) -> float:
return average_score
async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
data = load_data(file_path, samples)
async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> float:
data = load_data(file_path, samples, test=test)
results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20)
average_score = save_results_to_csv(results, path=path)
print(f"Average score on DROP dataset: {average_score:.5f}")

View file

@@ -64,33 +64,25 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
prompt = input
max_retries = 5
retries = 0
prediction = await graph(prompt)
cost = prediction[1]
output = prediction[0]["solution"]
while retries < max_retries:
try:
prediction = await graph(prompt)
cost = prediction[1]
output = prediction[0]["solution"]
print(output)
score = loose_match_score(expected_output, output)
break
score = loose_match_score(expected_output, output)
# break
# while retries < max_retries:
# try:
# prediction = await graph(prompt)
# cost = prediction[1]
# output = prediction[0]["solution"]
except Exception as e:
retries += 1
print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
# score = loose_match_score(expected_output, output)
# break
# except Exception as e:
# retries += 1
# print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
# if retries == max_retries:
# print("Maximum retries reached. Skipping this sample.")
# output = None
# cost = None
# score = 0
# break
if retries == max_retries:
print("Maximum retries reached. Skipping this sample.")
output = None
cost = None
score = 0
break
return input, output, expected_output, score, cost

View file

@@ -59,13 +59,13 @@ def f1_score(prediction, ground_truth):
return f1
async def load_data(file_path: str, samples=20, total_length=1000) -> List[dict]:
async def load_data(file_path: str, samples=20, total_length=1000, test=False) -> List[dict]:
data = []
async with aiofiles.open(file_path, mode="r") as file:
async for line in file:
data.append(json.loads(line))
data = data[:total_length]
random_indices = generate_random_indices(len(data), samples)
random_indices = generate_random_indices(len(data), samples, test)
data = [data[i] for i in random_indices]
return data
@@ -119,8 +119,8 @@ def save_results_to_csv(results: List[Tuple[str, str, str, float]], path: str) -
return average_score
async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
data = await load_data(file_path, samples)
async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> float:
data = await load_data(file_path, samples, test=test)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
average_score = save_results_to_csv(results, path=path)
print(f"Average score on HotpotQA dataset: {average_score:.5f}")

View file

@@ -10,12 +10,12 @@ from examples.ags.benchmark.utils import generate_random_indices
PASS = "pass"
FAIL = "fail"
async def load_data(file_path: str, samples=1) -> List[dict]:
async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
data = []
async with aiofiles.open(file_path, mode="r") as file:
async for line in file:
data.append(json.loads(line))
random_indices = generate_random_indices(len(data), samples)
random_indices = generate_random_indices(len(data), samples, test)
data = [data[i] for i in random_indices]
return data
@@ -118,8 +118,8 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int, str]], path: s
return round(avg_score, 5), round(total_cost, 5) # 修改返回值以包含total_cost
async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
data = await load_data(file_path, samples)
async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
data = await load_data(file_path, samples, test=test)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
average_score, total_cost = save_results_to_jsonl(results, path=path)
print(f"Average score on HumanEval dataset: {average_score:.5f}")

View file

@@ -212,12 +212,12 @@ def calculate_score(expected_output: str, prediction: str) -> int:
return 1 if math_equal(predicted_answer, expected_answer) else 0
async def load_data(file_path: str, samples: int = 200) -> List[dict]:
async def load_data(file_path: str, samples: int = 200, test=False) -> List[dict]:
data = []
async with aiofiles.open(file_path, mode="r") as file:
async for line in file:
data.append(json.loads(line))
random_indices = generate_random_indices(len(data), samples)
random_indices = generate_random_indices(len(data), samples, test)
data = [data[i] for i in random_indices]
return data
@@ -270,8 +270,8 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
return await tqdm_asyncio.gather(*tasks, desc="Evaluating MATH problems", total=len(data))
async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
data = await load_data(file_path, samples)
async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
data = await load_data(file_path, samples, test=test)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
average_score, total_cost = save_results_to_csv(results, path=path)
print(f"Average score on MATH dataset: {average_score:.5f}")

View file

@@ -10,12 +10,12 @@ from examples.ags.benchmark.utils import generate_random_indices
PASS = "pass"
FAIL = "fail"
async def load_data(file_path: str, samples=1) -> List[dict]:
async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
data = []
async with aiofiles.open(file_path, mode="r") as file:
async for line in file:
data.append(json.loads(line))
random_indices = generate_random_indices(len(data), samples)
random_indices = generate_random_indices(len(data), samples, test)
data = [data[i] for i in random_indices]
return data
@@ -99,10 +99,10 @@ def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str
print(f"Results saved to {output_file}")
return average_score, total_cost
async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
data = await load_data(file_path, samples)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
average_score, total_cost = save_results_to_csv(results, path=path)
average_score, total_cost = save_results_to_csv(results, path=path, test=test)
print(f"Average score on MBPP dataset: {average_score:.5f}")
print(f"Total Cost: {total_cost:.5f}")
return average_score, total_cost