feat: +Editor.search_index_repo

This commit is contained in:
莘权 马 2024-09-05 17:21:27 +08:00
parent 1eb3b8fb8c
commit 4523615dd9
4 changed files with 177 additions and 5 deletions

View file

@ -3,6 +3,7 @@ This file is borrowed from OpenDevin
You can find the original repository here:
https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/plugins/agent_skills/file_ops/file_ops.py
"""
import asyncio
import base64
import os
import re
@ -16,6 +17,7 @@ from pydantic import BaseModel, ConfigDict
from metagpt.config2 import Config
from metagpt.const import DEFAULT_WORKSPACE_ROOT
from metagpt.logs import logger
from metagpt.tools.libs.index_repo import OTHER_TYPE, IndexRepo
from metagpt.tools.libs.linter import Linter
from metagpt.tools.tool_registry import register_tool
from metagpt.utils import read_docx
@ -951,3 +953,52 @@ class Editor(BaseModel):
if not path.is_absolute():
path = self.working_dir / path
return path
@staticmethod
async def search_index_repo(
    query: str, files_or_paths: List[Union[str, Path]], min_token_count: int = 0
) -> List[str]:
    """Search the given files for `query`, using index repos where one applies.

    The paths are first grouped with `IndexRepo.classify_path`. Every group that
    belongs to an index repo is queried through `IndexRepo.search`; paths that
    belong to no index repo are read in full with `aread`. All of these
    operations are awaited together, then the index hits are combined with
    `IndexRepo.merge`.

    Args:
        query (str): The text to search for.
        files_or_paths (List[Union[str, Path]]): Files or directories to search.
        min_token_count (int, optional): Forwarded unchanged to every `IndexRepo`
            created here. Defaults to 0.

    Returns:
        List[str]: The `text` of each merged index hit, followed by the raw
            content of every file that was read directly.
    """
    clusters, roots = IndexRepo.classify_path(files_or_paths)

    # Paths filed under OTHER_TYPE matched no index repo and are read verbatim.
    unindexed = set()
    unindexed.update(clusters.get(OTHER_TYPE, ()))

    # One search coroutine per index repo, in the order the clusters were built.
    pending = [
        IndexRepo(
            persist_path=persist_path,
            root_path=roots[persist_path],
            min_token_count=min_token_count,
        ).search(query=query, filenames=list(filenames))
        for persist_path, filenames in clusters.items()
        if persist_path != OTHER_TYPE
    ]
    pending.extend(aread(filename=path) for path in unindexed)

    outcomes = await asyncio.gather(*pending) if pending else []

    # A plain string is file content from `aread`; anything else is an index search result.
    raw_texts = [item for item in outcomes if isinstance(item, str)]
    index_hits = [item for item in outcomes if not isinstance(item, str)]

    merger = IndexRepo(min_token_count=min_token_count)
    ranked = await merger.merge(query=query, indices_list=index_hits)
    return [node.text for node in ranked] + raw_texts

View file

@ -2,8 +2,9 @@
# -*- coding: utf-8 -*-
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Set, Union
from typing import Dict, List, Optional, Set, Tuple, Union
import tiktoken
from llama_index.core.base.embeddings.base import BaseEmbedding
@ -18,6 +19,14 @@ from metagpt.rag.schema import FAISSIndexConfig, FAISSRetrieverConfig, LLMRanker
from metagpt.utils.common import aread, awrite, generate_fingerprint, list_files
from metagpt.utils.repo_to_markdown import is_text_file
# Persist path of the index repo that covers files under `/data/uploads`.
UPLOADS_INDEX_ROOT = "/data/.index/uploads"
# Default value of `IndexRepo.persist_path`.
DEFAULT_INDEX_ROOT = UPLOADS_INDEX_ROOT
# Root directory of the files indexed under UPLOADS_INDEX_ROOT.
UPLOAD_ROOT = "/data/uploads"
# Default value of `IndexRepo.root_path`.
DEFAULT_ROOT = UPLOAD_ROOT
# Parent directory of the per-chat index repos; the chat id is appended to form the persist path.
CHATS_INDEX_ROOT = "/data/.index/chats"
# Parent directory of the per-chat file roots; the chat id is appended to form the root path.
CHATS_ROOT = "/data/chats/"
# Cluster key used by `IndexRepo.classify_path` for paths that match no index repo.
OTHER_TYPE = "other"
class TextScore(BaseModel):
filename: str
@ -26,8 +35,10 @@ class TextScore(BaseModel):
class IndexRepo(BaseModel):
persist_path: str # The persist path of the index repo, `/data/.index/uploads/` or `/data/.index/chats/{chat_id}/`
root_path: str # `/data/uploads` or r`/data/chats/\d+`, the root path of files indexed by the index repo.
persist_path: str = DEFAULT_INDEX_ROOT # The persist path of the index repo, `/data/.index/uploads/` or `/data/.index/chats/{chat_id}/`
root_path: str = (
DEFAULT_ROOT # `/data/uploads` or r`/data/chats/\d+`, the root path of files indexed by the index repo.
)
fingerprint_filename: str = "fingerprint.json"
model: Optional[str] = None
min_token_count: int = 10000
@ -93,6 +104,10 @@ class IndexRepo(BaseModel):
Returns:
List[Union[NodeWithScore, TextScore]]: A list of merged results sorted by similarity.
"""
flat_nodes = [node for indices in indices_list for node in indices]
if len(flat_nodes) <= self.recall_count:
return flat_nodes
if not self.embedding:
config = Config.default()
if self.model:
@ -102,7 +117,6 @@ class IndexRepo(BaseModel):
scores = []
query_embedding = await self.embedding.aget_text_embedding(query)
flat_nodes = [node for indices in indices_list for node in indices]
for i in flat_nodes:
text_embedding = await self.embedding.aget_text_embedding(i.text)
similarity = self.embedding.similarity(query_embedding, text_embedding)
@ -262,3 +276,33 @@ class IndexRepo(BaseModel):
return True
fp = generate_fingerprint(content)
return old_fp != fp
@staticmethod
def classify_path(files_or_paths: List[Union[str, Path]]) -> Tuple[Dict[str, Set[Path]], Dict[str, str]]:
    """Group paths by the index repo that covers them.

    Args:
        files_or_paths (List[Union[str, Path]]): Paths to classify.

    Returns:
        Tuple[Dict[str, Set[Path]], Dict[str, str]]: A pair `(clusters, roots)`.
            `clusters` maps an index persist path, or `OTHER_TYPE` for paths that
            match no index repo, to the set of absolute paths filed under it.
            `roots` maps each index persist path found to the root directory of
            the files it indexes; it has no entry for `OTHER_TYPE`.
    """
    index_patterns = {
        UPLOADS_INDEX_ROOT: re.compile(r"^/data/uploads($|/.*)"),
        CHATS_INDEX_ROOT: re.compile(r"^/data/chats/\d+($|/.*)"),
    }

    clusters = {}
    roots = {}
    for entry in files_or_paths:
        absolute = Path(entry).absolute()
        given = str(entry)
        # NOTE(review): the patterns are tested against the path as given, not the
        # absolutized one, so a relative path always lands in OTHER_TYPE. Confirm intended.
        key = next(
            (index_root for index_root, pattern in index_patterns.items() if pattern.match(given)),
            OTHER_TYPE,
        )
        if key == CHATS_INDEX_ROOT:
            # For a matched `/data/chats/<id>/...` path, parts[3] is the `<id>` segment.
            chat_id = absolute.parts[3]
            key = str(Path(CHATS_INDEX_ROOT) / chat_id)
            roots[key] = str(Path(CHATS_ROOT) / chat_id)
        elif key == UPLOADS_INDEX_ROOT:
            roots[key] = UPLOAD_ROOT
        clusters.setdefault(key, set()).add(absolute)
    return clusters, roots