feat: +Editor.search_index_repo

2026-06-05 14:55:18 +02:00 · 2024-09-05 17:21:27 +08:00 · 2024-09-05 17:21:27 +08:00 · 4523615dd9
commit 4523615dd9
parent 1eb3b8fb8c
4 changed files with 177 additions and 5 deletions
--- a/metagpt/tools/libs/editor.py
+++ b/metagpt/tools/libs/editor.py
@ -3,6 +3,7 @@ This file is borrowed from OpenDevin
 You can find the original repository here:
 https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/plugins/agent_skills/file_ops/file_ops.py
 """
+import asyncio
 import base64
 import os
 import re
@ -16,6 +17,7 @@ from pydantic import BaseModel, ConfigDict
 from metagpt.config2 import Config
 from metagpt.const import DEFAULT_WORKSPACE_ROOT
 from metagpt.logs import logger
+from metagpt.tools.libs.index_repo import OTHER_TYPE, IndexRepo
 from metagpt.tools.libs.linter import Linter
 from metagpt.tools.tool_registry import register_tool
 from metagpt.utils import read_docx
@ -951,3 +953,52 @@ class Editor(BaseModel):
        if not path.is_absolute():
            path = self.working_dir / path
        return path
+
+    @staticmethod
+    async def search_index_repo(
+        query: str, files_or_paths: List[Union[str, Path]], min_token_count: int = 0
+    ) -> List[str]:
+        """Search the given files or paths for content relevant to a query.
+
+        The inputs are grouped with `IndexRepo.classify_path`. Every group that maps to an index
+        repository is searched through its own `IndexRepo`, while files in the `OTHER_TYPE` group
+        are read directly with `aread`. All lookups are awaited together; index hits are then
+        merged by a default `IndexRepo`, and the directly read texts are appended after them.
+
+        Args:
+            query (str): The search query string to look for.
+            files_or_paths (List[Union[str, Path]]): Files or paths to search within.
+            min_token_count (int, optional): Passed as `min_token_count` to every `IndexRepo`
+                created here. Defaults to 0.
+
+        Returns:
+            List[str]: The text of each merged index hit, followed by the raw text of every
+                file that was read directly.
+        """
+        clusters, roots = IndexRepo.classify_path(files_or_paths)
+        pending = []
+        plain_files = set()
+        for persist_path, filenames in clusters.items():
+            if persist_path != OTHER_TYPE:
+                index_repo = IndexRepo(
+                    persist_path=persist_path,
+                    root_path=roots[persist_path],
+                    min_token_count=min_token_count,
+                )
+                pending.append(index_repo.search(query=query, filenames=list(filenames)))
+            else:
+                plain_files.update(filenames)
+
+        # Direct file reads are queued after every index search.
+        for filename in plain_files:
+            pending.append(aread(filename=filename))
+
+        outcomes = await asyncio.gather(*pending) if pending else []
+
+        # A plain string is a directly read file; anything else is a batch of index hits.
+        raw_texts = [item for item in outcomes if isinstance(item, str)]
+        index_hits = [item for item in outcomes if not isinstance(item, str)]
+
+        merger = IndexRepo(min_token_count=min_token_count)
+        merged_hits = await merger.merge(query=query, indices_list=index_hits)
+        return [hit.text for hit in merged_hits] + raw_texts
--- a/metagpt/tools/libs/index_repo.py
+++ b/metagpt/tools/libs/index_repo.py
@ -2,8 +2,9 @@
 # -*- coding: utf-8 -*-

 import json
+import re
 from pathlib import Path
-from typing import Dict, List, Optional, Set, Union
+from typing import Dict, List, Optional, Set, Tuple, Union

 import tiktoken
 from llama_index.core.base.embeddings.base import BaseEmbedding
@ -18,6 +19,14 @@ from metagpt.rag.schema import FAISSIndexConfig, FAISSRetrieverConfig, LLMRanker
 from metagpt.utils.common import aread, awrite, generate_fingerprint, list_files
 from metagpt.utils.repo_to_markdown import is_text_file

+# Persist path of the index repo that covers uploaded files.
+UPLOADS_INDEX_ROOT = "/data/.index/uploads"
+# Default `IndexRepo.persist_path`.
+DEFAULT_INDEX_ROOT = UPLOADS_INDEX_ROOT
+# Root path of the uploaded files indexed under UPLOADS_INDEX_ROOT.
+UPLOAD_ROOT = "/data/uploads"
+# Default `IndexRepo.root_path`.
+DEFAULT_ROOT = UPLOAD_ROOT
+# Parent directory of the per-chat index repos; `classify_path` appends the chat id to it.
+CHATS_INDEX_ROOT = "/data/.index/chats"
+# Parent directory of the per-chat file roots; `classify_path` appends the chat id to it.
+CHATS_ROOT = "/data/chats/"
+# Cluster key `classify_path` uses for paths that match neither the uploads nor the chats pattern.
+OTHER_TYPE = "other"
+

 class TextScore(BaseModel):
    filename: str
@ -26,8 +35,10 @@ class TextScore(BaseModel):


 class IndexRepo(BaseModel):
-    persist_path: str  # The persist path of the index repo, `/data/.index/uploads/` or `/data/.index/chats/{chat_id}/`
-    root_path: str  # `/data/uploads` or r`/data/chats/\d+`, the root path of files indexed by the index repo.
+    persist_path: str = DEFAULT_INDEX_ROOT  # The persist path of the index repo, `/data/.index/uploads/` or `/data/.index/chats/{chat_id}/`
+    root_path: str = (
+        DEFAULT_ROOT  # `/data/uploads` or r`/data/chats/\d+`, the root path of files indexed by the index repo.
+    )
    fingerprint_filename: str = "fingerprint.json"
    model: Optional[str] = None
    min_token_count: int = 10000
@ -93,6 +104,10 @@ class IndexRepo(BaseModel):
        Returns:
            List[Union[NodeWithScore, TextScore]]: A list of merged results sorted by similarity.
        """
+        flat_nodes = [node for indices in indices_list for node in indices]
+        if len(flat_nodes) <= self.recall_count:
+            return flat_nodes
+
        if not self.embedding:
            config = Config.default()
            if self.model:
@ -102,7 +117,6 @@ class IndexRepo(BaseModel):

        scores = []
        query_embedding = await self.embedding.aget_text_embedding(query)
-        flat_nodes = [node for indices in indices_list for node in indices]
        for i in flat_nodes:
            text_embedding = await self.embedding.aget_text_embedding(i.text)
            similarity = self.embedding.similarity(query_embedding, text_embedding)
@ -262,3 +276,33 @@ class IndexRepo(BaseModel):
            return True
        fp = generate_fingerprint(content)
        return old_fp != fp
+
+    @staticmethod
+    def classify_path(files_or_paths: List[Union[str, Path]]) -> Tuple[Dict[str, Set[Path]], Dict[str, str]]:
+        """Group paths by the index repository that covers them.
+
+        Each input, as given, is tested against the uploads pattern (`/data/uploads`) and then the
+        chats pattern (`/data/chats/<digits>`). Upload paths share the `UPLOADS_INDEX_ROOT` cluster,
+        chat paths get one cluster per chat id below `CHATS_INDEX_ROOT`, and anything else is put
+        in the `OTHER_TYPE` cluster.
+
+        Args:
+            files_or_paths (List[Union[str, Path]]): The files or paths to classify.
+
+        Returns:
+            Tuple[Dict[str, Set[Path]], Dict[str, str]]: `(clusters, roots)`. `clusters` maps a
+                cluster key to the set of absolute paths assigned to it. `roots` maps every
+                uploads/chat cluster key to the root directory of its files; the `OTHER_TYPE`
+                cluster has no entry in `roots`.
+        """
+        patterns = (
+            (UPLOADS_INDEX_ROOT, re.compile(r"^/data/uploads($|/.*)")),
+            (CHATS_INDEX_ROOT, re.compile(r"^/data/chats/\d+($|/.*)")),
+        )
+
+        clusters = {}
+        roots = {}
+        for item in files_or_paths:
+            abs_path = Path(item).absolute()
+            raw = str(item)
+            # First matching pattern wins; unmatched inputs fall back to OTHER_TYPE.
+            key = next((name for name, pattern in patterns if pattern.match(raw)), OTHER_TYPE)
+
+            if key == CHATS_INDEX_ROOT:
+                # For a matched chat path, parts are ("/", "data", "chats", "<chat_id>", ...).
+                chat_id = abs_path.parts[3]
+                key = str(Path(CHATS_INDEX_ROOT) / chat_id)
+                roots[key] = str(Path(CHATS_ROOT) / chat_id)
+            elif key == UPLOADS_INDEX_ROOT:
+                roots[key] = UPLOAD_ROOT
+
+            clusters.setdefault(key, set()).add(abs_path)
+
+        return clusters, roots