mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-08 15:05:17 +02:00
更新了eval索引的入口 (Updated the entry point for eval indexing)
This commit is contained in:
parent
b805da0bbe
commit
0704f341de
6 changed files with 36 additions and 44 deletions
|
|
@ -210,12 +210,12 @@ def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str
|
|||
f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}"
|
||||
)
|
||||
|
||||
def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]:
|
||||
def load_data(file_path: str, samples: int, test=False) -> List[Tuple[str, Dict[str, Any]]]:
|
||||
with open(file_path, mode="r") as file:
|
||||
data = json.load(file)
|
||||
data = list(data.items())
|
||||
|
||||
random_indices = generate_random_indices(len(data), samples)
|
||||
random_indices = generate_random_indices(len(data), samples, test)
|
||||
data = [data[i] for i in random_indices]
|
||||
return data
|
||||
|
||||
|
|
@ -287,8 +287,8 @@ def save_results_to_csv(results: List[List[Any]], path: str) -> float:
|
|||
|
||||
return average_score
|
||||
|
||||
async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
|
||||
data = load_data(file_path, samples)
|
||||
async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> float:
|
||||
data = load_data(file_path, samples, test=test)
|
||||
results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20)
|
||||
average_score = save_results_to_csv(results, path=path)
|
||||
print(f"Average score on DROP dataset: {average_score:.5f}")
|
||||
|
|
|
|||
|
|
@ -64,33 +64,25 @@ async def evaluate_problem(input: str, graph: Callable, expected_output: str) ->
|
|||
prompt = input
|
||||
max_retries = 5
|
||||
retries = 0
|
||||
prediction = await graph(prompt)
|
||||
cost = prediction[1]
|
||||
output = prediction[0]["solution"]
|
||||
while retries < max_retries:
|
||||
try:
|
||||
prediction = await graph(prompt)
|
||||
cost = prediction[1]
|
||||
output = prediction[0]["solution"]
|
||||
|
||||
print(output)
|
||||
score = loose_match_score(expected_output, output)
|
||||
break
|
||||
|
||||
score = loose_match_score(expected_output, output)
|
||||
# break
|
||||
# while retries < max_retries:
|
||||
# try:
|
||||
# prediction = await graph(prompt)
|
||||
# cost = prediction[1]
|
||||
# output = prediction[0]["solution"]
|
||||
except Exception as e:
|
||||
retries += 1
|
||||
print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
|
||||
|
||||
# score = loose_match_score(expected_output, output)
|
||||
# break
|
||||
|
||||
# except Exception as e:
|
||||
# retries += 1
|
||||
# print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
|
||||
|
||||
# if retries == max_retries:
|
||||
# print("Maximum retries reached. Skipping this sample.")
|
||||
# output = None
|
||||
# cost = None
|
||||
# score = 0
|
||||
# break
|
||||
if retries == max_retries:
|
||||
print("Maximum retries reached. Skipping this sample.")
|
||||
output = None
|
||||
cost = None
|
||||
score = 0
|
||||
break
|
||||
|
||||
return input, output, expected_output, score, cost
|
||||
|
||||
|
|
|
|||
|
|
@ -59,13 +59,13 @@ def f1_score(prediction, ground_truth):
|
|||
return f1
|
||||
|
||||
|
||||
async def load_data(file_path: str, samples=20, total_length=1000) -> List[dict]:
|
||||
async def load_data(file_path: str, samples=20, total_length=1000, test=False) -> List[dict]:
|
||||
data = []
|
||||
async with aiofiles.open(file_path, mode="r") as file:
|
||||
async for line in file:
|
||||
data.append(json.loads(line))
|
||||
data = data[:total_length]
|
||||
random_indices = generate_random_indices(len(data), samples)
|
||||
random_indices = generate_random_indices(len(data), samples, test)
|
||||
data = [data[i] for i in random_indices]
|
||||
return data
|
||||
|
||||
|
|
@ -119,8 +119,8 @@ def save_results_to_csv(results: List[Tuple[str, str, str, float]], path: str) -
|
|||
|
||||
return average_score
|
||||
|
||||
async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> float:
|
||||
data = await load_data(file_path, samples)
|
||||
async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> float:
|
||||
data = await load_data(file_path, samples, test=test)
|
||||
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
|
||||
average_score = save_results_to_csv(results, path=path)
|
||||
print(f"Average score on HotpotQA dataset: {average_score:.5f}")
|
||||
|
|
|
|||
|
|
@ -10,12 +10,12 @@ from examples.ags.benchmark.utils import generate_random_indices
|
|||
PASS = "pass"
|
||||
FAIL = "fail"
|
||||
|
||||
async def load_data(file_path: str, samples=1) -> List[dict]:
|
||||
async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
|
||||
data = []
|
||||
async with aiofiles.open(file_path, mode="r") as file:
|
||||
async for line in file:
|
||||
data.append(json.loads(line))
|
||||
random_indices = generate_random_indices(len(data), samples)
|
||||
random_indices = generate_random_indices(len(data), samples, test)
|
||||
data = [data[i] for i in random_indices]
|
||||
return data
|
||||
|
||||
|
|
@ -118,8 +118,8 @@ def save_results_to_jsonl(results: List[Tuple[str, str, str, int, str]], path: s
|
|||
|
||||
return round(avg_score, 5), round(total_cost, 5) # Return value changed to also include total_cost
|
||||
|
||||
async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
|
||||
data = await load_data(file_path, samples)
|
||||
async def humaneval_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
|
||||
data = await load_data(file_path, samples, test=test)
|
||||
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
|
||||
average_score, total_cost = save_results_to_jsonl(results, path=path)
|
||||
print(f"Average score on HumanEval dataset: {average_score:.5f}")
|
||||
|
|
|
|||
|
|
@ -212,12 +212,12 @@ def calculate_score(expected_output: str, prediction: str) -> int:
|
|||
|
||||
return 1 if math_equal(predicted_answer, expected_answer) else 0
|
||||
|
||||
async def load_data(file_path: str, samples: int = 200) -> List[dict]:
|
||||
async def load_data(file_path: str, samples: int = 200, test=False) -> List[dict]:
|
||||
data = []
|
||||
async with aiofiles.open(file_path, mode="r") as file:
|
||||
async for line in file:
|
||||
data.append(json.loads(line))
|
||||
random_indices = generate_random_indices(len(data), samples)
|
||||
random_indices = generate_random_indices(len(data), samples, test)
|
||||
data = [data[i] for i in random_indices]
|
||||
return data
|
||||
|
||||
|
|
@ -270,8 +270,8 @@ async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurren
|
|||
|
||||
return await tqdm_asyncio.gather(*tasks, desc="Evaluating MATH problems", total=len(data))
|
||||
|
||||
async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
|
||||
data = await load_data(file_path, samples)
|
||||
async def math_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
|
||||
data = await load_data(file_path, samples, test=test)
|
||||
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
|
||||
average_score, total_cost = save_results_to_csv(results, path=path)
|
||||
print(f"Average score on MATH dataset: {average_score:.5f}")
|
||||
|
|
|
|||
|
|
@ -10,12 +10,12 @@ from examples.ags.benchmark.utils import generate_random_indices
|
|||
PASS = "pass"
|
||||
FAIL = "fail"
|
||||
|
||||
async def load_data(file_path: str, samples=1) -> List[dict]:
|
||||
async def load_data(file_path: str, samples=1, test=False) -> List[dict]:
|
||||
data = []
|
||||
async with aiofiles.open(file_path, mode="r") as file:
|
||||
async for line in file:
|
||||
data.append(json.loads(line))
|
||||
random_indices = generate_random_indices(len(data), samples)
|
||||
random_indices = generate_random_indices(len(data), samples, test)
|
||||
data = [data[i] for i in random_indices]
|
||||
return data
|
||||
|
||||
|
|
@ -99,10 +99,10 @@ def save_results_to_csv(results: List[Tuple[str, str, str, int, str]], path: str
|
|||
print(f"Results saved to {output_file}")
|
||||
return average_score, total_cost
|
||||
|
||||
async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
|
||||
async def mbpp_evaluation(graph: Callable, file_path: str, samples: int, path: str, test=False) -> Tuple[float, float]:
|
||||
data = await load_data(file_path, samples)
|
||||
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
|
||||
average_score, total_cost = save_results_to_csv(results, path=path)
|
||||
average_score, total_cost = save_results_to_csv(results, path=path, test=test)
|
||||
print(f"Average score on MBPP dataset: {average_score:.5f}")
|
||||
print(f"Total Cost: {total_cost:.5f}")
|
||||
return average_score, total_cost
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue