From daea102ee7ee47c23fca0e79147a308170a9fd86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=8E=98=E6=9D=83=20=E9=A9=AC?= <mashenquan@fuzhi.ai>
Date: Wed, 11 Sep 2024 12:30:53 +0800
Subject: [PATCH] =?UTF-8?q?fixbug:=20=E5=A6=82=E6=9E=9Cindex=5Fbuilder?=
 =?UTF-8?q?=E5=BC=82=E5=B8=B8=E6=9C=AA=E5=8F=8A=E6=97=B6=E5=BB=BA=E7=B4=A2?=
 =?UTF-8?q?=E5=BC=95=EF=BC=8C=E5=88=99=E8=87=AA=E5=8A=A8=E5=BB=BA=E7=B4=A2?=
 =?UTF-8?q?=E5=BC=95=E4=BB=A5=E9=98=B2max=20token?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 metagpt/tools/libs/index_repo.py        | 33 +++++++++++++++++--------
 tests/metagpt/tools/libs/test_editor.py |  3 ---
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/metagpt/tools/libs/index_repo.py b/metagpt/tools/libs/index_repo.py
index 24065c4be..9f2fbdb27 100644
--- a/metagpt/tools/libs/index_repo.py
+++ b/metagpt/tools/libs/index_repo.py
@@ -90,6 +90,7 @@ class IndexRepo(BaseModel):
             raise ValueError(f"Unsupported file types: {[str(i) for i in excludes]}")
         filter_filenames = set()
         meta = await self._read_meta()
+        new_files = {}
         for i in filenames:
             content = await File.read_text_file(i)
             token_count = len(encoding.encode(content))
@@ -99,10 +100,16 @@ class IndexRepo(BaseModel):
                 result.append(TextScore(filename=str(i), text=content))
                 continue
             file_fingerprint = generate_fingerprint(content)
-            if self.fingerprints.get(str(i)) != file_fingerprint and Path(i).suffix.lower() not in {".pdf"}:
-                logger.error(f'file: "{i}" changed but not indexed')
+            if str(i) not in self.fingerprints or (
+                self.fingerprints.get(str(i)) != file_fingerprint and Path(i).suffix.lower() not in {".pdf"}
+            ):
+                new_files[i] = content
+                logger.warning(f'file: "{i}" changed but not indexed')
                 continue
             filter_filenames.add(str(i))
+        if new_files:
+            await self.add(paths=list(new_files.keys()), file_datas=new_files)
+            filter_filenames.update([str(i) for i in new_files.keys()])
         nodes = await self._search(query=query, filters=filter_filenames)
         return result + nodes
 
@@ -138,7 +145,7 @@ class IndexRepo(BaseModel):
         scores.sort(key=lambda x: x[0], reverse=True)
         return [i[1] for i in scores][: self.recall_count]
 
-    async def add(self, paths: List[Path]):
+    async def add(self, paths: List[Path], file_datas: Dict[Union[str, Path], str] = None):
         """Add new documents to the index.
 
         Args:
@@ -148,8 +155,9 @@ class IndexRepo(BaseModel):
         filenames, _ = await self._filter(paths)
         filter_filenames = []
         delete_filenames = []
+        file_datas = file_datas or {}
         for i in filenames:
-            content = await File.read_text_file(i)
+            content = file_datas.get(i) or await File.read_text_file(i)
             if not self._is_fingerprint_changed(filename=i, content=content):
                 continue
             token_count = len(encoding.encode(content))
@@ -159,9 +167,14 @@ class IndexRepo(BaseModel):
             else:
                 delete_filenames.append(i)
                 logger.debug(f"{i} not is_buildable: {token_count}, {self.min_token_count}~{self.max_token_count}")
-        await self._add_batch(filenames=filter_filenames, delete_filenames=delete_filenames)
+        await self._add_batch(filenames=filter_filenames, delete_filenames=delete_filenames, file_datas=file_datas)
 
-    async def _add_batch(self, filenames: List[Union[str, Path]], delete_filenames: List[Union[str, Path]]):
+    async def _add_batch(
+        self,
+        filenames: List[Union[str, Path]],
+        delete_filenames: List[Union[str, Path]],
+        file_datas: Dict[Union[str, Path], str],
+    ):
         """Add and remove documents in a batch operation.
 
         Args:
@@ -180,9 +193,9 @@ class IndexRepo(BaseModel):
             )
             try:
                 engine.delete_docs(filenames + delete_filenames)
-                logger.debug(f"delete docs {filenames + delete_filenames}")
+                logger.info(f"delete docs {filenames + delete_filenames}")
                 engine.add_docs(input_files=filenames)
-                logger.debug(f"add docs {filenames}")
+                logger.info(f"add docs {filenames}")
             except NotImplementedError as e:
                 logger.debug(f"{e}")
                 filenames = list(set([str(i) for i in filenames] + list(self.fingerprints.keys())))
@@ -194,10 +207,10 @@ class IndexRepo(BaseModel):
                 retriever_configs=[FAISSRetrieverConfig()],
                 ranker_configs=[LLMRankerConfig()],
             )
-            logger.debug(f"add docs {filenames}")
+            logger.info(f"add docs {filenames}")
         engine.persist(persist_dir=self.persist_path)
         for i in filenames:
-            content = await File.read_text_file(i)
+            content = file_datas.get(i) or await File.read_text_file(i)
             fp = generate_fingerprint(content)
             self.fingerprints[str(i)] = fp
         await awrite(filename=Path(self.persist_path) / self.fingerprint_filename, data=json.dumps(self.fingerprints))
diff --git a/tests/metagpt/tools/libs/test_editor.py b/tests/metagpt/tools/libs/test_editor.py
index c601ee5a4..1adcbc2b7 100644
--- a/tests/metagpt/tools/libs/test_editor.py
+++ b/tests/metagpt/tools/libs/test_editor.py
@@ -10,7 +10,6 @@ from metagpt.tools.libs.index_repo import (
     CHATS_INDEX_ROOT,
     CHATS_ROOT,
     UPLOAD_ROOT,
-    UPLOADS_INDEX_ROOT,
     IndexRepo,
 )
 from metagpt.utils.common import list_files
@@ -677,8 +676,6 @@ async def mock_index_repo():
     os.system(command)
     filenames = list_files(UPLOAD_ROOT)
     uploads_files = [i for i in filenames if Path(i).suffix in {".md", ".txt", ".json", ".pdf"}]
-    uploads_repo = IndexRepo(persist_path=UPLOADS_INDEX_ROOT, root_path=UPLOAD_ROOT, min_token_count=0)
-    await uploads_repo.add(uploads_files)
     assert uploads_files
 
     filenames = list_files(src_path)