From d37954d43a638d1904e409965161e2e924e01345 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8E=98=E6=9D=83=20=E9=A9=AC?= Date: Thu, 19 Sep 2024 17:25:09 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E4=BC=98=E5=8C=96=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- metagpt/tools/libs/index_repo.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/metagpt/tools/libs/index_repo.py b/metagpt/tools/libs/index_repo.py index a5d9fc6aa..b853b38ac 100644 --- a/metagpt/tools/libs/index_repo.py +++ b/metagpt/tools/libs/index_repo.py @@ -93,6 +93,12 @@ class IndexRepo(BaseModel): meta = await self._read_meta() new_files = {} for i in filenames: + if Path(i).suffix.lower() in {".pdf", ".doc", ".docx"}: + if str(i) not in self.fingerprints: + new_files[i] = "" + logger.warning(f'file: "{i}" not indexed') + filter_filenames.add(str(i)) + continue content = await File.read_text_file(i) token_count = len(encoding.encode(content)) if not self._is_buildable( @@ -101,9 +107,7 @@ class IndexRepo(BaseModel): result.append(TextScore(filename=str(i), text=content)) continue file_fingerprint = generate_fingerprint(content) - if str(i) not in self.fingerprints or ( - self.fingerprints.get(str(i)) != file_fingerprint and Path(i).suffix.lower() not in {".pdf"} - ): + if str(i) not in self.fingerprints or (self.fingerprints.get(str(i)) != file_fingerprint): new_files[i] = content logger.warning(f'file: "{i}" changed but not indexed') continue @@ -113,6 +117,7 @@ class IndexRepo(BaseModel): filter_filenames.update([str(i) for i in added]) for i in others: result.append(TextScore(filename=str(i), text=new_files.get(i))) + filter_filenames.discard(str(i)) nodes = await self._search(query=query, filters=filter_filenames) return result + nodes @@ -142,7 +147,21 @@ class IndexRepo(BaseModel): scores = [] query_embedding = await self.embedding.aget_text_embedding(query) for i in flat_nodes: - text_embedding = await self.embedding.aget_text_embedding(i.text) + try: + text_embedding = await self.embedding.aget_text_embedding(i.text) + except Exception as e: # 超过最大长度 + tenth = int(len(i.text) / 10) # DEFAULT_MIN_TOKEN_COUNT = 10000 + logger.warning( + f"{e}, tenth len={tenth}, pre_part_len={len(i.text[: tenth * 6])}, post_part_len={len(i.text[tenth * 4:])}" + ) + pre_win_part = await self.embedding.aget_text_embedding(i.text[: tenth * 6]) + post_win_part = await self.embedding.aget_text_embedding(i.text[tenth * 4 :]) + similarity = max( + self.embedding.similarity(query_embedding, pre_win_part), + self.embedding.similarity(query_embedding, post_win_part), + ) + scores.append((similarity, i)) + continue similarity = self.embedding.similarity(query_embedding, text_embedding) scores.append((similarity, i)) scores.sort(key=lambda x: x[0], reverse=True) @@ -169,6 +188,7 @@ class IndexRepo(BaseModel): file_datas = file_datas or {} for i in filenames: content = file_datas.get(i) or await File.read_text_file(i) + file_datas[i] = content if not self._is_fingerprint_changed(filename=i, content=content): continue token_count = len(encoding.encode(content))