commit bdf865eb0d
Author: didi
Date:   2024-09-10 19:06:17 +08:00

3 changed files with 1 addition and 489 deletions


@ -1,362 +0,0 @@
import json
import asyncio
import pandas as pd
import string
import re
from typing import List, Tuple, Callable, Dict, Any, Set, Union
import numpy as np
from scipy.optimize import linear_sum_assignment
from tqdm.asyncio import tqdm_asyncio
from examples.ags.benchmark.utils import generate_random_indices
def _remove_articles(text: str) -> str:
regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
return re.sub(regex, " ", text)
def _white_space_fix(text: str) -> str:
return " ".join(text.split())
EXCLUDE = set(string.punctuation)
def _is_number(text: str) -> bool:
try:
float(text)
return True
except ValueError:
return False
def _normalize_number(text: str) -> str:
if _is_number(text):
return str(float(text))
else:
return text
def _remove_punc(text: str) -> str:
if not _is_number(text):
return "".join(ch for ch in text if ch not in EXCLUDE)
else:
return text
def _lower(text: str) -> str:
return text.lower()
def _tokenize(text: str) -> List[str]:
return re.split(" |-", text)
def _normalize_answer(text: str) -> str:
"""Lower text and remove punctuation, articles and extra whitespace."""
parts = [
_white_space_fix(_remove_articles(_normalize_number(_remove_punc(_lower(token)))))
for token in _tokenize(text)
]
parts = [part for part in parts if part.strip()]
normalized = " ".join(parts).strip()
return normalized
def _answer_to_bags(
answer: Union[str, List[str], Tuple[str, ...]]
) -> Tuple[List[str], List[Set[str]]]:
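    """Return the normalized answer spans and, for each span, its bag (set) of tokens."""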
if isinstance(answer, (list, tuple)):
raw_spans = answer
else:
raw_spans = [answer]
normalized_spans: List[str] = []
token_bags = []
for raw_span in raw_spans:
normalized_span = _normalize_answer(raw_span)
normalized_spans.append(normalized_span)
token_bags.append(set(normalized_span.split()))
return normalized_spans, token_bags
# The normalization helpers above (through _normalize_answer) were originally copied from:
# https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
# Then cleaned up and modified a bit.
def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]:
"""
Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
between them and gets maximum metric values over all the answers.
"""
scores = np.zeros([len(gold), len(predicted)])
for gold_index, gold_item in enumerate(gold):
for pred_index, pred_item in enumerate(predicted):
if _match_numbers_if_present(gold_item, pred_item):
scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item)
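    # Hungarian matching on the negated scores gives a maximum-score one-to-one alignment of gold and predicted bags.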
row_ind, col_ind = linear_sum_assignment(-scores)
max_scores = np.zeros([max(len(gold), len(predicted))])
for row, column in zip(row_ind, col_ind):
max_scores[row] = max(max_scores[row], scores[row, column])
return max_scores
def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float:
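    """Token-set F1 between a single predicted bag and a single gold bag."""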
intersection = len(gold_bag.intersection(predicted_bag))
if not predicted_bag:
precision = 1.0
else:
precision = intersection / float(len(predicted_bag))
if not gold_bag:
recall = 1.0
else:
recall = intersection / float(len(gold_bag))
f1 = (
(2 * precision * recall) / (precision + recall)
if not (precision == 0.0 and recall == 0.0)
else 0.0
)
return f1
def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool:
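    """Return True if the gold bag contains no numbers, or the two bags share at least one number; used to gate F1."""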
gold_numbers = set()
predicted_numbers = set()
for word in gold_bag:
if _is_number(word):
gold_numbers.add(word)
for word in predicted_bag:
if _is_number(word):
predicted_numbers.add(word)
if (not gold_numbers) or gold_numbers.intersection(predicted_numbers):
return True
return False
def get_metrics(
predicted: Union[str, List[str], Tuple[str, ...]], gold: Union[str, List[str], Tuple[str, ...]]
) -> Tuple[float, float]:
"""
Takes a predicted answer and a gold answer (that are both either a string or a list of
strings), and returns exact match and the DROP F1 metric for the prediction. If you are
writing a script for evaluating objects in memory (say, the output of predictions during
validation, or while training), this is the function you want to call, after using
:func:`answer_json_to_strings` when reading the gold answer from the released data file.
"""
predicted_bags = _answer_to_bags(predicted)
gold_bags = _answer_to_bags(gold)
if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
exact_match = 1.0
else:
exact_match = 0.0
f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1])
f1 = np.mean(f1_per_bag)
f1 = round(f1, 2)
return exact_match, f1
def answer_json_to_strings(answer: Dict[str, Any]) -> Tuple[Tuple[str, ...], str]:
"""
Takes an answer JSON blob from the DROP data release and converts it into strings used for
evaluation.
"""
if "number" in answer and answer["number"]:
return tuple([str(answer["number"])]), "number"
elif "spans" in answer and answer["spans"]:
return tuple(answer["spans"]), "span" if len(answer["spans"]) == 1 else "spans"
elif "date" in answer:
return (
tuple(
[
"{0} {1} {2}".format(
answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]
)
]
),
"date",
)
else:
raise ValueError(
f"Answer type not found, should be one of number, spans or date at: {json.dumps(answer)}"
)
def load_data(file_path: str, samples: int) -> List[Tuple[str, Dict[str, Any]]]:
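    """Load the DROP JSON file and draw a random sample of `samples` passages."""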
with open(file_path, mode="r") as file:
data = json.load(file)
data = list(data.items())
random_indices = generate_random_indices(len(data), samples)
data = [data[i] for i in random_indices]
return data
async def evaluate_problem(question: str, passage: str, answers: List[Dict[str, Any]], graph: Callable) -> Tuple[str, str, float, float]:
    max_retries = 5
    retries = 0
    while retries < max_retries:
        try:
            prediction, cost = await graph(question, passage)
            max_score = 0.0
            best_answer = None
            for answer in answers:
                golden_answer, _ = answer_json_to_strings(answer)
                golden_answer = golden_answer[0]
                _, score = get_metrics(prediction, golden_answer)
                if score > max_score:
                    max_score = score
                    best_answer = golden_answer
            return best_answer, prediction, max_score, cost  # return value changed to include cost
        except Exception as e:
            retries += 1
            print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
            if retries == max_retries:
                print("Maximum retries reached. Skipping this sample.")
                return None, None, 0.0, 0.0
async def evaluate_all_passages(annotations: List[Tuple[str, Dict[str, Any]]], graph: Callable, max_concurrent_tasks: int = 50) -> List[List[Any]]:
semaphore = asyncio.Semaphore(max_concurrent_tasks)
results = []
async def sem_evaluate(id: str, annotation: Dict[str, Any]):
async with semaphore:
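            # Each DROP annotation carries one passage plus its qa_pairs; score every question against the graph.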
passage = annotation["passage"]
for qa_pair in annotation["qa_pairs"]:
question = qa_pair["question"]
answers = [qa_pair["answer"]]
if "validated_answers" in qa_pair and qa_pair["validated_answers"]:
answers += qa_pair["validated_answers"]
best_answer, prediction, score, cost = await evaluate_problem(question, passage, answers, graph)
                results.append([id, question, prediction, best_answer, score, cost])  # changed this line to include cost
tasks = [sem_evaluate(id, annotation) for id, annotation in annotations]
await tqdm_asyncio.gather(*tasks, desc="Evaluating DROP passages", total=len(annotations))
return results
def save_results_to_csv(results: List[List[Any]], path: str) -> Tuple[float, float]:
df = pd.DataFrame(results, columns=["id", "question", "prediction", "best_answer", "score", "cost"])
average_score = df["score"].mean()
total_cost = df["cost"].iloc[-1] # 添加这行来计算总cost
output_file = f"{path}/{average_score:.5f}.csv"
df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")
    return average_score, total_cost  # return value changed to include total_cost
async def drop_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
data = load_data(file_path, samples)
results = await evaluate_all_passages(data, graph, max_concurrent_tasks=20)
average_score, total_cost = save_results_to_csv(results, path=path)
print(f"Average score on DROP dataset: {average_score:.5f}")
print(f"Total Cost: {total_cost:.5f}")
    return average_score, total_cost  # return value changed to include total_cost


@ -1,126 +0,0 @@
import json
import asyncio
import aiofiles
import pandas as pd
import numpy as np
from typing import List, Tuple, Callable, Set
from collections import Counter
from tqdm.asyncio import tqdm_asyncio
from scipy.optimize import linear_sum_assignment
import string
import re
from examples.ags.benchmark.utils import generate_random_indices
def is_number(text: str) -> bool:
try:
float(text)
return True
except ValueError:
return False
def normalize_answer(s):
"""
Normalize answers for evaluation.
"""
def remove_articles(text):
return re.sub(r"\b(a|an|the)\b", " ", text)
def white_space_fix(text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_score(prediction, ground_truth):
"""
Compute the F1 score between prediction and ground truth answers.
"""
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
async def load_data(file_path: str, samples=20, total_length=1000) -> List[dict]:
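    """Load JSONL records, keep the first total_length, then sample `samples` of them at random."""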
data = []
async with aiofiles.open(file_path, mode="r") as file:
async for line in file:
data.append(json.loads(line))
data = data[:total_length]
random_indices = generate_random_indices(len(data), samples)
data = [data[i] for i in random_indices]
return data
async def evaluate_problem(input: str, context_str: str, graph: Callable, expected_output: str):
max_retries = 5
retries = 0
while retries < max_retries:
try:
            prediction, cost = await graph(input, context_str) if graph else ("None", 0)
score = f1_score(prediction, expected_output)
break
except Exception as e:
retries += 1
print(f"Error generating prediction: {e}. Retrying... ({retries}/{max_retries})")
if retries == max_retries:
print("Maximum retries reached. Skipping this sample.")
prediction = None
score = 0
cost = 0
break
return input, prediction, expected_output, score, cost
async def evaluate_all_problems(data: List[dict], graph: Callable, max_concurrent_tasks: int = 50):
semaphore = asyncio.Semaphore(max_concurrent_tasks)
async def sem_evaluate(problem):
async with semaphore:
input_text = problem["question"]
expected_output = problem["answer"]
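            # problem["context"] is a list of [title, sentence_list] pairs; join all sentences into one context string.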
paragraphs = [item[1] for item in problem["context"] if isinstance(item[1], list)]
context_str = "\n".join(" ".join(paragraph) for paragraph in paragraphs)
return await evaluate_problem(input_text, context_str, graph, expected_output)
tasks = [sem_evaluate(problem) for problem in data]
return await tqdm_asyncio.gather(*tasks, desc="Evaluating HotpotQA problems", total=len(data))
def save_results_to_csv(results: List[Tuple[str, str, str, float, float]], path: str) -> Tuple[float, float]:
df = pd.DataFrame(
results, columns=["question", "prediction", "expected_output", "score", "cost"]
)
average_score = df["score"].mean()
total_cost = df["cost"].iloc[-1]
output_file = f"{path}/{average_score:.5f}.csv"
df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")
return average_score, total_cost
async def hotpotqa_evaluation(graph: Callable, file_path: str, samples: int, path: str) -> Tuple[float, float]:
data = await load_data(file_path, samples)
results = await evaluate_all_problems(data, graph, max_concurrent_tasks=20)
average_score, total_cost = save_results_to_csv(results, path=path)
print(f"Average score on HotpotQA dataset: {average_score:.5f}")
print(f"Total Cost: {total_cost:.5f}")
return average_score, total_cost


@@ -40,7 +40,7 @@ class CoTSolveGraph(SolveGraph):
     async def __call__(self, question: str, context: str) -> Tuple[str, str]:
         answer = await self.cot_generate(question, context, mode="context_fill")
-        return answer
+        return answer, self.llm.cost_manager.total_cost
if __name__ == "__main__":
async def main():