mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-05 14:55:18 +02:00
feat: +Editor.search_index_repo
This commit is contained in:
parent
1eb3b8fb8c
commit
4523615dd9
4 changed files with 177 additions and 5 deletions
|
|
@ -3,6 +3,7 @@ This file is borrowed from OpenDevin
|
|||
You can find the original repository here:
|
||||
https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/plugins/agent_skills/file_ops/file_ops.py
|
||||
"""
|
||||
import asyncio
|
||||
import base64
|
||||
import os
|
||||
import re
|
||||
|
|
@ -16,6 +17,7 @@ from pydantic import BaseModel, ConfigDict
|
|||
from metagpt.config2 import Config
|
||||
from metagpt.const import DEFAULT_WORKSPACE_ROOT
|
||||
from metagpt.logs import logger
|
||||
from metagpt.tools.libs.index_repo import OTHER_TYPE, IndexRepo
|
||||
from metagpt.tools.libs.linter import Linter
|
||||
from metagpt.tools.tool_registry import register_tool
|
||||
from metagpt.utils import read_docx
|
||||
|
|
@ -951,3 +953,52 @@ class Editor(BaseModel):
|
|||
if not path.is_absolute():
|
||||
path = self.working_dir / path
|
||||
return path
|
||||
|
||||
@staticmethod
async def search_index_repo(
    query: str, files_or_paths: List[Union[str, Path]], min_token_count: int = 0
) -> List[str]:
    """Run `query` against every index repo that covers `files_or_paths`.

    The inputs are grouped with `IndexRepo.classify_path`. Each group that maps to an
    index gets its own `IndexRepo.search` call; paths in the `OTHER_TYPE` group are
    handed to `aread` instead. All of these awaitables run concurrently, then the
    index hits are combined with `IndexRepo.merge` and the plain-string results are
    appended after them.

    Args:
        query (str): The search query string.
        files_or_paths (List[Union[str, Path]]): Files or paths to search within.
        min_token_count (int, optional): Forwarded to every `IndexRepo` created here.
            Defaults to 0.

    Returns:
        List[str]: The `.text` of each merged index hit, followed by every string
            result (presumably the raw content returned by `aread` -- confirm against
            its definition).
    """
    clusters, roots = IndexRepo.classify_path(files_or_paths)

    pending = []
    plain_files = set()
    for persist_path, filenames in clusters.items():
        if persist_path != OTHER_TYPE:
            indexed_repo = IndexRepo(
                persist_path=persist_path,
                root_path=roots[persist_path],
                min_token_count=min_token_count,
            )
            pending.append(indexed_repo.search(query=query, filenames=list(filenames)))
        else:
            # No index covers these paths; they are read directly further down.
            plain_files.update(filenames)

    pending.extend(aread(filename=name) for name in plain_files)

    outcomes = await asyncio.gather(*pending) if pending else []

    # String outcomes are kept verbatim; everything else is treated as a list of index hits.
    raw_texts = [outcome for outcome in outcomes if isinstance(outcome, str)]
    index_hits = [outcome for outcome in outcomes if not isinstance(outcome, str)]

    merger = IndexRepo(min_token_count=min_token_count)
    merged_hits = await merger.merge(query=query, indices_list=index_hits)
    return [hit.text for hit in merged_hits] + raw_texts
|
||||
|
|
|
|||
|
|
@ -2,8 +2,9 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Set, Union
|
||||
from typing import Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
import tiktoken
|
||||
from llama_index.core.base.embeddings.base import BaseEmbedding
|
||||
|
|
@ -18,6 +19,14 @@ from metagpt.rag.schema import FAISSIndexConfig, FAISSRetrieverConfig, LLMRanker
|
|||
from metagpt.utils.common import aread, awrite, generate_fingerprint, list_files
|
||||
from metagpt.utils.repo_to_markdown import is_text_file
|
||||
|
||||
UPLOADS_INDEX_ROOT = "/data/.index/uploads"
|
||||
DEFAULT_INDEX_ROOT = UPLOADS_INDEX_ROOT
|
||||
UPLOAD_ROOT = "/data/uploads"
|
||||
DEFAULT_ROOT = UPLOAD_ROOT
|
||||
CHATS_INDEX_ROOT = "/data/.index/chats"
|
||||
CHATS_ROOT = "/data/chats/"
|
||||
OTHER_TYPE = "other"
|
||||
|
||||
|
||||
class TextScore(BaseModel):
|
||||
filename: str
|
||||
|
|
@ -26,8 +35,10 @@ class TextScore(BaseModel):
|
|||
|
||||
|
||||
class IndexRepo(BaseModel):
|
||||
persist_path: str # The persist path of the index repo, `/data/.index/uploads/` or `/data/.index/chats/{chat_id}/`
|
||||
root_path: str # `/data/uploads` or r`/data/chats/\d+`, the root path of files indexed by the index repo.
|
||||
persist_path: str = DEFAULT_INDEX_ROOT # The persist path of the index repo, `/data/.index/uploads/` or `/data/.index/chats/{chat_id}/`
|
||||
root_path: str = (
|
||||
DEFAULT_ROOT # `/data/uploads` or r`/data/chats/\d+`, the root path of files indexed by the index repo.
|
||||
)
|
||||
fingerprint_filename: str = "fingerprint.json"
|
||||
model: Optional[str] = None
|
||||
min_token_count: int = 10000
|
||||
|
|
@ -93,6 +104,10 @@ class IndexRepo(BaseModel):
|
|||
Returns:
|
||||
List[Union[NodeWithScore, TextScore]]: A list of merged results sorted by similarity.
|
||||
"""
|
||||
flat_nodes = [node for indices in indices_list for node in indices]
|
||||
if len(flat_nodes) <= self.recall_count:
|
||||
return flat_nodes
|
||||
|
||||
if not self.embedding:
|
||||
config = Config.default()
|
||||
if self.model:
|
||||
|
|
@ -102,7 +117,6 @@ class IndexRepo(BaseModel):
|
|||
|
||||
scores = []
|
||||
query_embedding = await self.embedding.aget_text_embedding(query)
|
||||
flat_nodes = [node for indices in indices_list for node in indices]
|
||||
for i in flat_nodes:
|
||||
text_embedding = await self.embedding.aget_text_embedding(i.text)
|
||||
similarity = self.embedding.similarity(query_embedding, text_embedding)
|
||||
|
|
@ -262,3 +276,33 @@ class IndexRepo(BaseModel):
|
|||
return True
|
||||
fp = generate_fingerprint(content)
|
||||
return old_fp != fp
|
||||
|
||||
@staticmethod
def classify_path(files_or_paths: List[Union[str, Path]]) -> Tuple[Dict[str, Set[Path]], Dict[str, str]]:
    """Group paths by the index repo that should serve them.

    Args:
        files_or_paths (List[Union[str, Path]]): Files or paths to classify.

    Returns:
        Tuple[Dict[str, Set[Path]], Dict[str, str]]: A pair `(clusters, roots)`.
            `clusters` maps a group key to the set of absolute paths in that group. The
            key is `UPLOADS_INDEX_ROOT`, `CHATS_INDEX_ROOT/<chat_id>`, or `OTHER_TYPE`
            when neither pattern matches. `roots` maps each index-backed key to the
            root directory of the files it covers; `OTHER_TYPE` has no entry in it.
    """
    index_patterns = {
        UPLOADS_INDEX_ROOT: re.compile(r"^/data/uploads($|/.*)"),
        CHATS_INDEX_ROOT: re.compile(r"^/data/chats/\d+($|/.*)"),
    }

    grouped = {}
    root_of = {}
    for item in files_or_paths:
        abs_path = Path(item).absolute()
        # NOTE: the pattern is tested against the value exactly as the caller passed it,
        # not against the absolute form computed above.
        given = str(item)
        group_key = next(
            (index_root for index_root, regex in index_patterns.items() if regex.match(given)),
            OTHER_TYPE,
        )

        if group_key == CHATS_INDEX_ROOT:
            # Chats are indexed per conversation; the id is the path component after `/data/chats`.
            chat_id = abs_path.parts[3]
            group_key = str(Path(CHATS_INDEX_ROOT) / chat_id)
            root_of[group_key] = str(Path(CHATS_ROOT) / chat_id)
        elif group_key == UPLOADS_INDEX_ROOT:
            root_of[group_key] = UPLOAD_ROOT

        grouped.setdefault(group_key, set()).add(abs_path)

    return grouped, root_of
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue