"""Benchmark helpers for scoring RAG answers and retrieval results.

Provides lexical metrics (BLEU, ROUGE-L), an embedding-based semantic
similarity score, and simple retrieval metrics (recall, hit rate, MRR),
plus a loader for the example benchmark datasets.
"""

import asyncio
import json
import os
from typing import List, Optional

import evaluate
import jieba
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.evaluation import SemanticSimilarityEvaluator
from llama_index.core.schema import NodeWithScore
from pydantic import BaseModel

from metagpt.const import EXAMPLE_BENCHMARK_PATH
from metagpt.logs import logger
from metagpt.rag.factories import get_rag_embedding
from metagpt.utils.common import read_json_file


class DatasetInfo(BaseModel):
    """A single benchmark dataset.

    ``document_files`` holds the document paths and ``gt_info`` the records
    loaded from the dataset's ground-truth JSON file (see
    ``RAGBenchmark.load_dataset``).
    """

    name: str
    document_files: List[str]
    gt_info: List[dict]


class DatasetConfig(BaseModel):
    """The set of datasets selected for a benchmark run."""

    datasets: List[DatasetInfo]


class RAGBenchmark:
    """Compute generation and retrieval metrics for RAG outputs."""

    def __init__(
        self,
        embed_model: Optional[BaseEmbedding] = None,
    ):
        """Create the benchmark.

        Args:
            embed_model: Embedding model used by the semantic similarity
                evaluator. When omitted, ``get_rag_embedding()`` supplies it.
        """
        self.evaluator = SemanticSimilarityEvaluator(
            embed_model=embed_model or get_rag_embedding(),
        )

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Split *text* into tokens with jieba (used as the metric tokenizer)."""
        return list(jieba.cut(text))

    def _set_metrics(
        self,
        bleu_avg: float = 0.0,
        bleu_1: float = 0.0,
        bleu_2: float = 0.0,
        bleu_3: float = 0.0,
        bleu_4: float = 0.0,
        rouge_l: float = 0.0,
        semantic_similarity: float = 0.0,
        recall: float = 0.0,
        hit_rate: float = 0.0,
        mrr: float = 0.0,
        length: float = 0.0,
        generated_text: Optional[str] = None,
        ground_truth_text: Optional[str] = None,
        question: Optional[str] = None,
    ) -> dict:
        """Bundle individual scores and the evaluated texts into one result.

        Returns:
            A dict with two entries: ``"metrics"`` (the numeric scores keyed
            by metric name) and ``"log"`` (the generated text, the ground
            truth text and the question).
        """
        metrics = {
            "bleu-avg": bleu_avg,
            "bleu-1": bleu_1,
            "bleu-2": bleu_2,
            "bleu-3": bleu_3,
            "bleu-4": bleu_4,
            "rouge-L": rouge_l,
            "semantic similarity": semantic_similarity,
            "recall": recall,
            "hit_rate": hit_rate,
            "mrr": mrr,
            "length": length,
        }

        log = {
            "generated_text": generated_text,
            "ground_truth_text": ground_truth_text,
            "question": question,
        }

        return {"metrics": metrics, "log": log}

    def bleu_score(
        self, response: str, reference: str, with_penalty=False
    ) -> tuple[float, float, float, float, float]:
        """Compute BLEU for *response* against a single *reference*.

        Args:
            response: Generated text.
            reference: Ground-truth text.
            with_penalty: When true, return the overall BLEU as reported by
                the metric. When false, divide the overall BLEU by the
                brevity penalty (or return 0.0 if that penalty is 0).

        Returns:
            ``(bleu_avg, bleu1, bleu2, bleu3, bleu4)`` where the last four
            are the first four entries of the metric's ``precisions`` list.
        """
        bleu = evaluate.load(path="bleu")
        results = bleu.compute(predictions=[response], references=[[reference]], tokenizer=self._tokenize)

        bleu_avg = results["bleu"]
        bleu1, bleu2, bleu3, bleu4 = results["precisions"][:4]
        brevity_penalty = results["brevity_penalty"]

        if not with_penalty:
            # Undo the brevity penalty; a zero penalty would divide by zero.
            bleu_avg = 0.0 if brevity_penalty == 0 else bleu_avg / brevity_penalty

        return bleu_avg, bleu1, bleu2, bleu3, bleu4

    def rougel_score(self, response: str, reference: str) -> float:
        """Return the ROUGE-L score of *response* against *reference*."""
        # pip install rouge_score
        rouge = evaluate.load(path="rouge")

        results = rouge.compute(
            predictions=[response],
            references=[[reference]],
            tokenizer=self._tokenize,
            rouge_types=["rougeL"],
        )
        return results["rougeL"]

    def recall(self, nodes: list[NodeWithScore], reference_docs: list[str]) -> float:
        """Return the fraction of reference docs that contain some node's text.

        Returns 0.0 when either *nodes* or *reference_docs* is empty or None,
        so an empty reference list no longer raises ``ZeroDivisionError``.
        """
        if not nodes or not reference_docs:
            return 0.0

        covered = sum(any(node.text in doc for node in nodes) for doc in reference_docs)
        return covered / len(reference_docs)

    def HitRate(self, nodes: list[NodeWithScore], reference_docs: list[str]) -> float:
        """Return 1.0 if any node's text occurs in any reference doc, else 0.0.

        Returns 0.0 when either argument is empty or None.
        """
        if not nodes or not reference_docs:
            return 0.0

        hit = any(node.text in doc for doc in reference_docs for node in nodes)
        return 1.0 if hit else 0.0

    def MRR(self, nodes: list[NodeWithScore], reference_docs: list[str]) -> float:
        """Return the reciprocal-rank style score used by this benchmark.

        For every reference doc (1-based position ``i``) that contains the
        text of at least one node, ``1 / i`` is added; the sum is divided by
        the number of reference docs. Returns 0.0 when either argument is
        empty or None (``nodes=None`` used to raise ``TypeError``).
        """
        if not nodes or not reference_docs:
            return 0.0

        # NOTE(review): the reciprocal uses the position of the reference
        # document, not the rank of the retrieved node. Conventional MRR ranks
        # the retrieved results -- confirm this weighting is intended.
        mrr_sum = 0.0
        for position, doc in enumerate(reference_docs, start=1):
            if any(node.text in doc for node in nodes):
                mrr_sum += 1.0 / position

        return mrr_sum / len(reference_docs)

    async def SemanticSimilarity(self, response: str, reference: str) -> float:
        """Return the evaluator's similarity score for *response* vs *reference*."""
        result = await self.evaluator.aevaluate(
            response=response,
            reference=reference,
        )

        return result.score

    async def compute_metric(
        self,
        response: str = None,
        reference: str = None,
        nodes: list[NodeWithScore] = None,
        reference_doc: list[str] = None,
        question: str = None,
    ):
        """Compute every metric for one question/answer pair.

        Args:
            response: Generated answer.
            reference: Ground-truth answer.
            nodes: Retrieved nodes used for the retrieval metrics.
            reference_doc: Reference documents the nodes are matched against.
            question: The question, recorded in the result's log section.

        Returns:
            The dict produced by ``_set_metrics`` (``"metrics"`` and ``"log"``).
        """
        recall = self.recall(nodes, reference_doc)
        bleu_avg, bleu1, bleu2, bleu3, bleu4 = self.bleu_score(response, reference)
        rouge_l = self.rougel_score(response, reference)
        hit_rate = self.HitRate(nodes, reference_doc)
        mrr = self.MRR(nodes, reference_doc)

        similarity = await self.SemanticSimilarity(response, reference)

        # Keyword arguments keep the many float scores from being misaligned.
        return self._set_metrics(
            bleu_avg=bleu_avg,
            bleu_1=bleu1,
            bleu_2=bleu2,
            bleu_3=bleu3,
            bleu_4=bleu4,
            rouge_l=rouge_l,
            semantic_similarity=similarity,
            recall=recall,
            hit_rate=hit_rate,
            mrr=mrr,
            length=len(response),
            generated_text=response,
            ground_truth_text=reference,
            question=question,
        )

    @staticmethod
    def load_dataset(ds_names: Optional[list[str]] = None) -> DatasetConfig:
        """Load the benchmark datasets described in ``dataset_info.json``.

        Args:
            ds_names: Names of the datasets to load. ``None`` (the default)
                or any list containing ``"all"`` selects every dataset.

        Returns:
            A ``DatasetConfig`` holding one ``DatasetInfo`` per selected
            dataset, with document paths resolved under
            ``EXAMPLE_BENCHMARK_PATH/<name>/`` and the ground-truth JSON
            already read.
        """
        # A None sentinel replaces the former mutable default ``["all"]``.
        if ds_names is None:
            ds_names = ["all"]
        select_all = "all" in ds_names

        infos = read_json_file(os.path.join(EXAMPLE_BENCHMARK_PATH, "dataset_info.json"))
        dataset_config = DatasetConfig(
            datasets=[
                DatasetInfo(
                    name=name,
                    document_files=[
                        os.path.join(EXAMPLE_BENCHMARK_PATH, name, file) for file in info["document_file"]
                    ],
                    gt_info=read_json_file(os.path.join(EXAMPLE_BENCHMARK_PATH, name, info["gt_file"])),
                )
                for dataset_info in infos
                for name, info in dataset_info.items()
                if select_all or name in ds_names
            ]
        )

        return dataset_config


if __name__ == "__main__":
    # Smoke test: score a sample answer against an unrelated ground truth.
    benchmark = RAGBenchmark()
    answer = "是的,根据提供的信息,2023年7月20日,应急管理部和财政部确实联合发布了《因灾倒塌、损坏住房恢复重建救助工作规范》的通知。这份《规范》旨在进一步规范因灾倒塌、损坏住房的恢复重建救助相关工作。它明确了地方各级政府负责实施救助工作,应急管理部和财政部则负责统筹指导。地方财政应安排足够的资金,中央财政也会提供适当的补助。救助资金将通过专账管理,并采取特定的管理方式。救助对象是那些因自然灾害导致住房倒塌或损坏,并向政府提出申请且符合条件的受灾家庭。相关部门将组织调查统计救助对象信息,并建立档案。此外,《规范》还强调了资金发放的具体方式和公开透明的要求。"
    ground_truth = "“启明行动”是为了防控儿童青少年的近视问题,并发布了《防控儿童青少年近视核心知识十条》。"
    bleu_avg, bleu1, bleu2, bleu3, bleu4 = benchmark.bleu_score(answer, ground_truth)
    logger.info(f"bleu_avg = {bleu_avg}")
    logger.info(f"bleu1 = {bleu1}")
    logger.info(f"bleu2 = {bleu2}")
    logger.info(f"bleu3 = {bleu3}")
    logger.info(f"bleu4 = {bleu4}")

    rougeL_score = benchmark.rougel_score(answer, ground_truth)
    logger.info(f"rougeL_score = {rougeL_score}")

    similarity = asyncio.run(benchmark.SemanticSimilarity(answer, ground_truth))
    logger.info(f"similarity = {similarity}")