From 9eba81862b12923f15ae5d9b4cc647d9c2bec8e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8E=98=E6=9D=83=20=E9=A9=AC?= Date: Thu, 16 May 2024 12:40:55 +0800 Subject: [PATCH 01/13] fixbug: circular import --- metagpt/tools/libs/git.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/metagpt/tools/libs/git.py b/metagpt/tools/libs/git.py index eb3fd6822..b4d759bf4 100644 --- a/metagpt/tools/libs/git.py +++ b/metagpt/tools/libs/git.py @@ -9,7 +9,6 @@ from github.Issue import Issue from github.PullRequest import PullRequest from metagpt.tools.tool_registry import register_tool -from metagpt.utils.git_repository import GitBranch, GitRepository @register_tool(tags=["software development", "git", "Commit the changes and push to remote git repository."]) @@ -18,7 +17,7 @@ async def git_push( access_token: str, comments: str = "Commit", new_branch: str = "", -) -> GitBranch: +) -> "GitBranch": """ Pushes changes from a local Git repository to its remote counterpart. @@ -49,6 +48,8 @@ async def git_push( base branch:'master', head branch:'feature/new', repo_name:'iorisa/snake-game' """ + from metagpt.utils.git_repository import GitRepository + if not GitRepository.is_git_dir(local_path): raise ValueError("Invalid local git repository") From 6326526a08d1aed1bc6178393540e98db31d804c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8E=98=E6=9D=83=20=E9=A9=AC?= Date: Fri, 17 May 2024 13:28:28 +0800 Subject: [PATCH 02/13] feat: +vault config demo --- config/vault.example.yaml | 48 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 config/vault.example.yaml diff --git a/config/vault.example.yaml b/config/vault.example.yaml new file mode 100644 index 000000000..0e197d2a8 --- /dev/null +++ b/config/vault.example.yaml @@ -0,0 +1,48 @@ +# Usage: +# 1. Get value. 
+# >>> from metagpt.tools.libs.env import get_env
+# >>> access_token = await get_env(key="access_token", app_name="github")
+# >>> print(access_token)
+# YOUR_ACCESS_TOKEN
+#
+# 2. Get description for LLM understanding.
+# >>> from metagpt.tools.libs.env import get_env_description
+# >>> descriptions = await get_env_description()
+# >>> for k, desc in descriptions.items():
+# >>> print(f"{k}:{desc}")
+# await get_env(key="access_token", app_name="github"):Get github access token
+# await get_env(key="access_token", app_name="gitlab"):Get gitlab access token
+# ...
+
+vault:
+  github:
+    values:
+      access_token: "YOUR_ACCESS_TOKEN"
+    descriptions:
+      access_token: "Get github access token"
+  gitlab:
+    values:
+      access_token: "YOUR_ACCESS_TOKEN"
+    descriptions:
+      access_token: "Get gitlab access token"
+  iflytek_tts:
+    values:
+      api_id: "YOUR_APP_ID"
+      api_key: "YOUR_API_KEY"
+      api_secret: "YOUR_API_SECRET"
+    descriptions:
+      api_id: "Get the API ID of IFlyTek Text to Speech"
+      api_key: "Get the API KEY of IFlyTek Text to Speech"
+      api_secret: "Get the API SECRET of IFlyTek Text to Speech"
+  azure_tts:
+    values:
+      subscription_key: "YOUR_SUBSCRIPTION_KEY"
+      region: "YOUR_REGION"
+    descriptions:
+      subscription_key: "Get the subscription key of Azure Text to Speech."
+      region: "Get the region of Azure Text to Speech."
+  default: # All key-value pairs whose app name is an empty string are placed below
+    values:
+      proxy: "YOUR_PROXY"
+    descriptions:
+      proxy: "Get proxy for tools like requests, playwright, selenium, etc."
\ No newline at end of file From d0486f8e11b9e1ff544444ea237b199b81e95874 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Sat, 11 May 2024 19:47:33 +0800 Subject: [PATCH 03/13] add cr reporter --- metagpt/actions/write_code.py | 2 +- metagpt/actions/write_code_review.py | 16 +++++++++++----- metagpt/utils/report.py | 8 ++++++-- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/metagpt/actions/write_code.py b/metagpt/actions/write_code.py index 67b859d23..dc8a6dee5 100644 --- a/metagpt/actions/write_code.py +++ b/metagpt/actions/write_code.py @@ -153,7 +153,7 @@ class WriteCode(Action): root_path = self.context.src_workspace if self.context.src_workspace else "" coding_context.code_doc = Document(filename=coding_context.filename, root_path=str(root_path)) coding_context.code_doc.content = code - await reporter.async_report(self.repo.workdir / coding_context.code_doc.root_relative_path, "path") + await reporter.async_report(coding_context.code_doc, "document") return coding_context @staticmethod diff --git a/metagpt/actions/write_code_review.py b/metagpt/actions/write_code_review.py index f0faea701..1b9f9554b 100644 --- a/metagpt/actions/write_code_review.py +++ b/metagpt/actions/write_code_review.py @@ -17,6 +17,7 @@ from metagpt.const import REQUIREMENT_FILENAME from metagpt.logs import logger from metagpt.schema import CodingContext from metagpt.utils.common import CodeParser +from metagpt.utils.report import EditorReporter PROMPT_TEMPLATE = """ # System @@ -128,16 +129,21 @@ class WriteCodeReview(Action): i_context: CodingContext = Field(default_factory=CodingContext) @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) - async def write_code_review_and_rewrite(self, context_prompt, cr_prompt, filename): + async def write_code_review_and_rewrite(self, context_prompt, cr_prompt, doc): + filename = doc.filename cr_rsp = await self._aask(context_prompt + cr_prompt) result = CodeParser.parse_block("Code Review Result", 
cr_rsp) if "LGTM" in result: return result, None # if LBTM, rewrite code - rewrite_prompt = f"{context_prompt}\n{cr_rsp}\n{REWRITE_CODE_TEMPLATE.format(filename=filename)}" - code_rsp = await self._aask(rewrite_prompt) - code = CodeParser.parse_code(text=code_rsp) + async with EditorReporter(enable_llm_stream=True) as reporter: + await reporter.async_report({"type": "code", "filename": filename, "src_path": doc.root_relative_path}, "meta") + rewrite_prompt = f"{context_prompt}\n{cr_rsp}\n{REWRITE_CODE_TEMPLATE.format(filename=filename)}" + code_rsp = await self._aask(rewrite_prompt) + code = CodeParser.parse_code(text=code_rsp) + doc.content = code + await reporter.async_report(doc, "document") return result, code async def run(self, *args, **kwargs) -> CodingContext: @@ -182,7 +188,7 @@ class WriteCodeReview(Action): f"len(self.i_context.code_doc.content)={len2}" ) result, rewrited_code = await self.write_code_review_and_rewrite( - context_prompt, cr_prompt, self.i_context.code_doc.filename + context_prompt, cr_prompt, self.i_context.code_doc ) if "LBTM" in result: iterative_code = rewrited_code diff --git a/metagpt/utils/report.py b/metagpt/utils/report.py index a61c77381..616a52f30 100644 --- a/metagpt/utils/report.py +++ b/metagpt/utils/report.py @@ -131,7 +131,11 @@ class ResourceReporter(BaseModel): def _format_data(self, value, name): data = self.model_dump(mode="json", exclude=("callback_url", "llm_stream")) - data["value"] = str(value) if isinstance(value, Path) else value + if isinstance(value, BaseModel): + value = value.model_dump(mode="json") + elif isinstance(value, Path): + value = str(value) + data["value"] = value data["name"] = name role = CURRENT_ROLE.get(None) if role: @@ -263,7 +267,7 @@ class FileReporter(ResourceReporter): """Report file resource synchronously.""" return super().report(value, name) - async def async_report(self, value: Path, name: Literal["path", "meta", "content"] = "path"): + async def async_report(self, value: Path, name: 
Literal["path", "meta", "content", "document"] = "path"): """Report file resource asynchronously.""" return await super().async_report(value, name) From 2079cbf3ae69ff2ae9d67d6d76fb9d963bf0c239 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Wed, 15 May 2024 14:34:15 +0800 Subject: [PATCH 04/13] report abs path --- metagpt/utils/report.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/metagpt/utils/report.py b/metagpt/utils/report.py index 616a52f30..85f9bfa22 100644 --- a/metagpt/utils/report.py +++ b/metagpt/utils/report.py @@ -135,6 +135,9 @@ class ResourceReporter(BaseModel): value = value.model_dump(mode="json") elif isinstance(value, Path): value = str(value) + + if name == "path": + value = os.path.abspath(value) data["value"] = value data["name"] = name role = CURRENT_ROLE.get(None) From 7745e09c397b78846723a4f1bfc5fdf17cd80649 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Fri, 17 May 2024 10:29:25 +0800 Subject: [PATCH 05/13] add thought reporter --- metagpt/roles/di/data_analyst.py | 5 +++-- metagpt/roles/di/data_interpreter.py | 4 +++- metagpt/roles/di/team_leader.py | 4 +++- metagpt/roles/role.py | 5 +++-- metagpt/utils/common.py | 2 +- metagpt/utils/report.py | 11 +++++++++++ 6 files changed, 24 insertions(+), 7 deletions(-) diff --git a/metagpt/roles/di/data_analyst.py b/metagpt/roles/di/data_analyst.py index 0fc95b9d6..fc298ea4c 100644 --- a/metagpt/roles/di/data_analyst.py +++ b/metagpt/roles/di/data_analyst.py @@ -20,6 +20,7 @@ from metagpt.strategy.thinking_command import ( ) from metagpt.tools.tool_recommend import BM25ToolRecommender from metagpt.utils.common import CodeParser +from metagpt.utils.report import ThoughtReporter class DataAnalyst(DataInterpreter): @@ -82,8 +83,8 @@ class DataAnalyst(DataInterpreter): available_commands=prepare_command_prompt(self.available_commands), ) context = self.llm.format_msg(self.working_memory.get() + [Message(content=prompt, role="user")]) - - rsp = await self.llm.aask(context) + async with 
ThoughtReporter(): + rsp = await self.llm.aask(context) self.commands = json.loads(CodeParser.parse_code(block=None, text=rsp)) self.rc.memory.add(Message(content=rsp, role="assistant")) diff --git a/metagpt/roles/di/data_interpreter.py b/metagpt/roles/di/data_interpreter.py index e147cbbe3..bdfc0e294 100644 --- a/metagpt/roles/di/data_interpreter.py +++ b/metagpt/roles/di/data_interpreter.py @@ -15,6 +15,7 @@ from metagpt.schema import Message, Task, TaskResult from metagpt.strategy.task_type import TaskType from metagpt.tools.tool_recommend import BM25ToolRecommender, ToolRecommender from metagpt.utils.common import CodeParser +from metagpt.utils.report import ThoughtReporter REACT_THINK_PROMPT = """ # User Requirement @@ -73,7 +74,8 @@ class DataInterpreter(Role): return True prompt = REACT_THINK_PROMPT.format(user_requirement=self.user_requirement, context=context) - rsp = await self.llm.aask(prompt) + async with ThoughtReporter(): + rsp = await self.llm.aask(prompt) rsp_dict = json.loads(CodeParser.parse_code(text=rsp)) self.working_memory.add(Message(content=rsp_dict["thoughts"], role="assistant")) need_action = rsp_dict["state"] diff --git a/metagpt/roles/di/team_leader.py b/metagpt/roles/di/team_leader.py index a1ef11fa6..2fa782ade 100644 --- a/metagpt/roles/di/team_leader.py +++ b/metagpt/roles/di/team_leader.py @@ -20,6 +20,7 @@ from metagpt.strategy.thinking_command import ( run_commands, ) from metagpt.utils.common import CodeParser +from metagpt.utils.report import ThoughtReporter class TeamLeader(Role): @@ -69,7 +70,8 @@ class TeamLeader(Role): ) context = self.llm.format_msg(self.get_memories(k=10) + [Message(content=prompt, role="user")]) - rsp = await self.llm.aask(context, system_msgs=[SYSTEM_PROMPT]) + async with ThoughtReporter(): + rsp = await self.llm.aask(context, system_msgs=[SYSTEM_PROMPT]) self.commands = json.loads(CodeParser.parse_code(text=rsp)) self.rc.memory.add(Message(content=rsp, role="assistant")) diff --git 
a/metagpt/roles/role.py b/metagpt/roles/role.py index 1eaa77fa3..f6d26eeb1 100644 --- a/metagpt/roles/role.py +++ b/metagpt/roles/role.py @@ -47,6 +47,7 @@ from metagpt.strategy.planner import Planner from metagpt.utils.common import any_to_name, any_to_str, role_raise_decorator from metagpt.utils.project_repo import ProjectRepo from metagpt.utils.repair_llm_raw_output import extract_state_value_from_output +from metagpt.utils.report import ThoughtReporter if TYPE_CHECKING: from metagpt.environment import Environment # noqa: F401 @@ -381,8 +382,8 @@ class Role(SerializationMixin, ContextMixin, BaseModel): n_states=len(self.states) - 1, previous_state=self.rc.state, ) - - next_state = await self.llm.aask(prompt) + async with ThoughtReporter(): + next_state = await self.llm.aask(prompt) next_state = extract_state_value_from_output(next_state) logger.debug(f"{prompt=}") diff --git a/metagpt/utils/common.py b/metagpt/utils/common.py index e2520ef13..fd7fdcb7a 100644 --- a/metagpt/utils/common.py +++ b/metagpt/utils/common.py @@ -646,7 +646,7 @@ def role_raise_decorator(func): raise Exception(format_trackback_info(limit=None)) except Exception as e: if self.latest_observed_msg: - logger.warning( + logger.exception( "There is a exception in role's execution, in order to resume, " "we delete the newest role communication message in the role's memory." 
) diff --git a/metagpt/utils/report.py b/metagpt/utils/report.py index 85f9bfa22..491688f3a 100644 --- a/metagpt/utils/report.py +++ b/metagpt/utils/report.py @@ -39,6 +39,7 @@ class BlockType(str, Enum): GALLERY = "Gallery" NOTEBOOK = "Notebook" DOCS = "Docs" + THOUGHT = "Thought" END_MARKER_NAME = "end_marker" @@ -259,6 +260,16 @@ class TaskReporter(ObjectReporter): block: Literal[BlockType.TASK] = BlockType.TASK +class ThoughtReporter(ObjectReporter): + """Reporter for object resources to Task Block.""" + + block: Literal[BlockType.THOUGHT] = BlockType.THOUGHT + + async def __aenter__(self): + await self.async_report({}) + return await super().__aenter__() + + class FileReporter(ResourceReporter): """File resource callback for reporting complete file paths. From 76e3a14d38caa1a6cbd138598e6cfd5a3b3b0d9d Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Tue, 21 May 2024 10:28:24 +0800 Subject: [PATCH 06/13] add extra field for report --- metagpt/tools/libs/editor.py | 3 ++- metagpt/utils/report.py | 38 ++++++++++++++++++++++++------------ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/metagpt/tools/libs/editor.py b/metagpt/tools/libs/editor.py index e032dcef5..a2670a2bd 100644 --- a/metagpt/tools/libs/editor.py +++ b/metagpt/tools/libs/editor.py @@ -100,7 +100,8 @@ class Editor: file_path=file_path, block_content=block_content, ) - self.resource.report(result.file_path, "path") + self.resource.report(result.file_path, "path", + extra={"type": "search", "line_range": {"start": start, "end": end}}) return result return None diff --git a/metagpt/utils/report.py b/metagpt/utils/report.py index 491688f3a..2d72af111 100644 --- a/metagpt/utils/report.py +++ b/metagpt/utils/report.py @@ -56,23 +56,23 @@ class ResourceReporter(BaseModel): callback_url: str = Field(METAGPT_REPORTER_DEFAULT_URL, description="The URL to which the report should be sent") _llm_task: Optional[asyncio.Task] = PrivateAttr(None) - def report(self, value: Any, name: str): + def 
report(self, value: Any, name: str, extra: Optional[dict] = None): """Synchronously report resource observation data. Args: value: The data to report. name: The type name of the data. """ - return self._report(value, name) + return self._report(value, name, extra) - async def async_report(self, value: Any, name: str): + async def async_report(self, value: Any, name: str, extra: Optional[dict] = None): """Asynchronously report resource observation data. Args: value: The data to report. name: The type name of the data. """ - return await self._async_report(value, name) + return await self._async_report(value, name, extra) @classmethod def set_report_fn(cls, fn: Callable): @@ -101,20 +101,20 @@ class ResourceReporter(BaseModel): """ cls._async_report = fn - def _report(self, value: Any, name: str): + def _report(self, value: Any, name: str, extra: Optional[dict] = None): if not self.callback_url: return - data = self._format_data(value, name) + data = self._format_data(value, name, extra) resp = requests.post(self.callback_url, json=data) resp.raise_for_status() return resp.text - async def _async_report(self, value: Any, name: str): + async def _async_report(self, value: Any, name: str, extra: Optional[dict] = None): if not self.callback_url: return - data = self._format_data(value, name) + data = self._format_data(value, name, extra) url = self.callback_url _result = urlparse(url) sessiion_kwargs = {} @@ -130,7 +130,7 @@ class ResourceReporter(BaseModel): resp.raise_for_status() return await resp.text() - def _format_data(self, value, name): + def _format_data(self, value, name, extra): data = self.model_dump(mode="json", exclude=("callback_url", "llm_stream")) if isinstance(value, BaseModel): value = value.model_dump(mode="json") @@ -147,6 +147,8 @@ class ResourceReporter(BaseModel): else: role_name = os.environ.get("METAGPT_ROLE") data["role"] = role_name + if extra: + data["extra"] = extra return data def __enter__(self): @@ -277,13 +279,23 @@ class 
FileReporter(ResourceReporter): if the file can be partially output for display first, use streaming callback. """ - def report(self, value: Union[Path, dict, Any], name: Literal["path", "meta", "content"] = "path"): + def report( + self, + value: Union[Path, dict, Any], + name: Literal["path", "meta", "content"] = "path", + extra: Optional[dict] = None, + ): """Report file resource synchronously.""" - return super().report(value, name) + return super().report(value, name, extra) - async def async_report(self, value: Path, name: Literal["path", "meta", "content", "document"] = "path"): + async def async_report( + self, + value: Union[Path, dict, Any], + name: Literal["path", "meta", "content"] = "path", + extra: Optional[dict] = None, + ): """Report file resource asynchronously.""" - return await super().async_report(value, name) + return await super().async_report(value, name, extra) class NotebookReporter(FileReporter): From 6a38d5173362b2fd34de6e8b50963a5a05499ea7 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Thu, 23 May 2024 21:16:17 +0800 Subject: [PATCH 07/13] report the search content result --- metagpt/tools/libs/editor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metagpt/tools/libs/editor.py b/metagpt/tools/libs/editor.py index a2670a2bd..78560e375 100644 --- a/metagpt/tools/libs/editor.py +++ b/metagpt/tools/libs/editor.py @@ -100,8 +100,7 @@ class Editor: file_path=file_path, block_content=block_content, ) - self.resource.report(result.file_path, "path", - extra={"type": "search", "line_range": {"start": start, "end": end}}) + self.resource.report(result.file_path, "path", extra={"type": "search", "line": i, "symbol": symbol}) return result return None From 9ab36acdfc6f18ce91843b3a3b027ed868f4fd8f Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Tue, 28 May 2024 23:29:20 +0800 Subject: [PATCH 08/13] coalesce the stream output of the notebook --- metagpt/actions/di/execute_nb_code.py | 12 ++++++++---- 1 file changed, 8 
insertions(+), 4 deletions(-) diff --git a/metagpt/actions/di/execute_nb_code.py b/metagpt/actions/di/execute_nb_code.py index b4fe949fe..64620d9cc 100644 --- a/metagpt/actions/di/execute_nb_code.py +++ b/metagpt/actions/di/execute_nb_code.py @@ -65,7 +65,7 @@ class ExecuteNbCode(Action): """execute notebook code block, return result to llm, and display it.""" nb: NotebookNode - nb_client: NotebookClient = None + nb_client: RealtimeOutputNotebookClient = None console: Console interaction: str timeout: int = 600 @@ -78,11 +78,15 @@ class ExecuteNbCode(Action): interaction=("ipython" if self.is_ipython() else "terminal"), ) self.reporter = NotebookReporter() + self.set_nb_client() + + def set_nb_client(self): self.nb_client = RealtimeOutputNotebookClient( - nb, - timeout=timeout, + self.nb, + timeout=self.timeout, resources={"metadata": {"path": DEFAULT_WORKSPACE_ROOT}}, notebook_reporter=self.reporter, + coalesce_streams=True, ) async def build(self): @@ -118,7 +122,7 @@ class ExecuteNbCode(Action): # sleep 1s to wait for the kernel to be cleaned up completely await asyncio.sleep(1) await self.build() - self.nb_client = NotebookClient(self.nb, timeout=self.timeout) + self.set_nb_client() def add_code_cell(self, code: str): self.nb.cells.append(new_code_cell(source=code)) From 3afb8a87f3ab6525eebf6b414b7fd3e7363aea6c Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Thu, 30 May 2024 11:00:05 +0800 Subject: [PATCH 09/13] undo thought reporter in the base role --- metagpt/roles/role.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/metagpt/roles/role.py b/metagpt/roles/role.py index f6d26eeb1..1eaa77fa3 100644 --- a/metagpt/roles/role.py +++ b/metagpt/roles/role.py @@ -47,7 +47,6 @@ from metagpt.strategy.planner import Planner from metagpt.utils.common import any_to_name, any_to_str, role_raise_decorator from metagpt.utils.project_repo import ProjectRepo from metagpt.utils.repair_llm_raw_output import extract_state_value_from_output -from 
metagpt.utils.report import ThoughtReporter if TYPE_CHECKING: from metagpt.environment import Environment # noqa: F401 @@ -382,8 +381,8 @@ class Role(SerializationMixin, ContextMixin, BaseModel): n_states=len(self.states) - 1, previous_state=self.rc.state, ) - async with ThoughtReporter(): - next_state = await self.llm.aask(prompt) + + next_state = await self.llm.aask(prompt) next_state = extract_state_value_from_output(next_state) logger.debug(f"{prompt=}") From 4f43b905a2aa886203291de33f0bb2301ffeccf5 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Thu, 30 May 2024 20:04:02 +0800 Subject: [PATCH 10/13] add crawler tools --- examples/di/crawl_webpage.py | 12 ++++++---- metagpt/rag/engines/simple.py | 4 +++- metagpt/tools/libs/browser.py | 45 +++++++++++++++++++++++++++++++---- metagpt/utils/file.py | 8 +++++++ metagpt/utils/parse_html.py | 25 ++++++++++++++++++- 5 files changed, 83 insertions(+), 11 deletions(-) diff --git a/examples/di/crawl_webpage.py b/examples/di/crawl_webpage.py index b8226f4f4..10b230f2b 100644 --- a/examples/di/crawl_webpage.py +++ b/examples/di/crawl_webpage.py @@ -6,16 +6,19 @@ """ from metagpt.roles.di.data_interpreter import DataInterpreter +from metagpt.tools.libs.browser import Browser as _ + PAPER_LIST_REQ = """" Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/, -and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key variables* +and save it to a csv file. paper title must include `multiagent` or `large language model`. +**Notice: view the page element before writing scraping code** """ ECOMMERCE_REQ = """ Get products data from website https://scrapeme.live/shop/ and save it as a csv file. 
-**Notice: Firstly parse the web page encoding and the text HTML structure; -The first page product name, price, product URL, and image URL must be saved in the csv;** +The first page product name, price, product URL, and image URL must be saved in the csv. +**Notice: view the page element before writing scraping code** """ NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash 所有初创企业融资的信息, **注意: 这是一个中文网站**; @@ -25,11 +28,12 @@ NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash 3. 反思*快讯的html内容示例*中的规律, 设计正则匹配表达式来获取*`快讯`*的标题、链接、时间; 4. 筛选最近3天的初创企业融资*`快讯`*, 以list[dict]形式打印前5个。 5. 将全部结果存在本地csv中 +**Notice: view the page element before writing scraping code** """ async def main(): - di = DataInterpreter(tools=["scrape_web_playwright"]) + di = DataInterpreter(tools=["Browser"]) await di.run(ECOMMERCE_REQ) diff --git a/metagpt/rag/engines/simple.py b/metagpt/rag/engines/simple.py index 5c5810308..623b3f350 100644 --- a/metagpt/rag/engines/simple.py +++ b/metagpt/rag/engines/simple.py @@ -4,6 +4,7 @@ import json import os from typing import Any, Optional, Union +from fsspec import AbstractFileSystem from llama_index.core import SimpleDirectoryReader, VectorStoreIndex from llama_index.core.callbacks.base import CallbackManager from llama_index.core.embeddings import BaseEmbedding @@ -83,6 +84,7 @@ class SimpleEngine(RetrieverQueryEngine): llm: LLM = None, retriever_configs: list[BaseRetrieverConfig] = None, ranker_configs: list[BaseRankerConfig] = None, + fs: Optional[AbstractFileSystem] = None, ) -> "SimpleEngine": """From docs. 
@@ -100,7 +102,7 @@ class SimpleEngine(RetrieverQueryEngine): if not input_dir and not input_files: raise ValueError("Must provide either `input_dir` or `input_files`.") - documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files).load_data() + documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files, fs=fs).load_data() cls._fix_document_metadata(documents) index = VectorStoreIndex.from_documents( diff --git a/metagpt/tools/libs/browser.py b/metagpt/tools/libs/browser.py index 7fde804fe..223434b3a 100644 --- a/metagpt/tools/libs/browser.py +++ b/metagpt/tools/libs/browser.py @@ -1,9 +1,12 @@ from __future__ import annotations +import contextlib from playwright.async_api import async_playwright - +from metagpt.utils.file import MemoryFileSystem +from uuid import uuid4 from metagpt.const import DEFAULT_WORKSPACE_ROOT from metagpt.tools.tool_registry import register_tool +from metagpt.utils.parse_html import simplify_html from metagpt.utils.report import BrowserReporter @@ -35,16 +38,48 @@ class Browser: print("Now on page ", url) await self._view() - async def open_new_page(self, url: str): + async def open_new_page(self, url: str, timeout: float = 30000): """open a new page in the browser and view the page""" async with self.reporter as reporter: page = await self.browser.new_page() await reporter.async_report(url, "url") - await page.goto(url) + await page.goto(url, timeout=timeout) self.pages[url] = page await self._set_current_page(page, url) await reporter.async_report(page, "page") + async def view_page_element_to_scrape(self, requirement: str, keep_links: bool = False) -> None: + """view the HTML content of current page to understand the structure. When executed, the content will be printed out + + Args: + requirement (str): Providing a clear and detailed requirement helps in focusing the inspection on the desired elements. + keep_links (bool): Whether to keep the hyperlinks in the HTML content. 
Set to True if links are required + """ + html = await self.current_page.content() + html = simplify_html(html, url=self.current_page.url, keep_links=keep_links) + mem_fs = MemoryFileSystem() + filename = f"{uuid4().hex}.html" + with mem_fs.open(filename, "w") as f: + f.write(html) + + with contextlib.suppress(Exception): + + from metagpt.rag.engines import SimpleEngine # avoid circular import + + # TODO make `from_docs` asynchronous + engine = SimpleEngine.from_docs(input_files=[filename], fs=mem_fs) + nodes = await engine.aretrieve(requirement) + html = "\n".join(i.text for i in nodes) + + mem_fs.rm_file(filename) + print(html) + + async def get_page_content(self) -> str: + """Get the HTML content of current page.""" + html = await self.current_page.content() + html_content = html.strip() + return html_content + async def switch_page(self, url: str): """switch to an opened page in the browser and view the page""" if url in self.pages: @@ -152,8 +187,8 @@ class Browser: async def _view(self, keep_len: int = 5000) -> str: """simulate human viewing the current page, return the visible text with links""" - visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS) - print("The visible text and their links (if any): ", visible_text_with_links[:keep_len]) + # visible_text_with_links = await self.current_page.evaluate(VIEW_CONTENT_JS) + # print("The visible text and their links (if any): ", visible_text_with_links[:keep_len]) # html_content = await self._view_page_html(keep_len=keep_len) # print("The html content: ", html_content) diff --git a/metagpt/utils/file.py b/metagpt/utils/file.py index f62b44eb8..a8ed482d9 100644 --- a/metagpt/utils/file.py +++ b/metagpt/utils/file.py @@ -9,6 +9,7 @@ from pathlib import Path import aiofiles +from fsspec.implementations.memory import MemoryFileSystem as _MemoryFileSystem from metagpt.logs import logger from metagpt.utils.exceptions import handle_exception @@ -68,3 +69,10 @@ class File: content = b"".join(chunks) 
logger.debug(f"Successfully read file, the path of file: {file_path}") return content + + +class MemoryFileSystem(_MemoryFileSystem): + + @classmethod + def _strip_protocol(cls, path): + return super()._strip_protocol(str(path)) diff --git a/metagpt/utils/parse_html.py b/metagpt/utils/parse_html.py index 65aa3f236..3aac8ca6c 100644 --- a/metagpt/utils/parse_html.py +++ b/metagpt/utils/parse_html.py @@ -7,6 +7,8 @@ from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup from pydantic import BaseModel, PrivateAttr +import htmlmin + class WebPage(BaseModel): inner_text: str @@ -38,6 +40,22 @@ class WebPage(BaseModel): elif url.startswith(("http://", "https://")): yield urljoin(self.url, url) + def get_slim_soup(self, keep_links: bool = False): + soup = _get_soup(self.html) + keep_attrs = ["class"] + if keep_links: + keep_attrs.append("href") + + for i in soup.find_all(True): + for name in list(i.attrs): + if i[name] and name not in keep_attrs: + del i[name] + + for i in soup.find_all(["svg", "img", "video", "audio"]): + i.decompose() + + return soup + def get_html_content(page: str, base: str): soup = _get_soup(page) @@ -48,7 +66,12 @@ def get_html_content(page: str, base: str): def _get_soup(page: str): soup = BeautifulSoup(page, "html.parser") # https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup - for s in soup(["style", "script", "[document]", "head", "title"]): + for s in soup(["style", "script", "[document]", "head", "title", "footer"]): s.extract() return soup + + +def simplify_html(html: str, url: str, keep_links: bool = False): + html = WebPage(inner_text="", html=html, url=url).get_slim_soup(keep_links).decode() + return htmlmin.minify(html, remove_comments=True, remove_empty_space=True) From aeef03e29c48b6ca64244ec949850129cb1b53d1 Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Thu, 30 May 2024 21:01:35 +0800 Subject: [PATCH 11/13] update the requirements --- requirements.txt | 4 +++- 1 
file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b40c69c9f..83a904156 100644 --- a/requirements.txt +++ b/requirements.txt @@ -71,4 +71,6 @@ dashscope==1.14.1 rank-bm25==0.2.2 # for tool recommendation gymnasium==0.29.1 pylint~=3.0.3 -pygithub~=2.3 \ No newline at end of file +pygithub~=2.3 +htmlmin +fsspec From 9dc5212d4709733d63ffd0b9999660939f7369fd Mon Sep 17 00:00:00 2001 From: shenchucheng Date: Fri, 31 May 2024 15:50:05 +0800 Subject: [PATCH 12/13] Add explanation for error suppression in the method --- metagpt/tools/libs/browser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/metagpt/tools/libs/browser.py b/metagpt/tools/libs/browser.py index 223434b3a..8d6daec11 100644 --- a/metagpt/tools/libs/browser.py +++ b/metagpt/tools/libs/browser.py @@ -62,6 +62,7 @@ class Browser: with mem_fs.open(filename, "w") as f: f.write(html) + # Since RAG is an optional optimization, if it fails, the simplified HTML can be used as a fallback. 
with contextlib.suppress(Exception): from metagpt.rag.engines import SimpleEngine # avoid circular import From 2a0107679e6e8bc1bfe582c3533f83543183cc47 Mon Sep 17 00:00:00 2001 From: seehi <6580@pm.me> Date: Mon, 3 Jun 2024 10:44:03 +0800 Subject: [PATCH 13/13] merge the newest rag in github --- metagpt/config2.py | 4 + metagpt/configs/embedding_config.py | 50 ++++++++++ metagpt/rag/engines/simple.py | 74 ++++++++++---- metagpt/rag/factories/base.py | 20 ++-- metagpt/rag/factories/embedding.py | 88 ++++++++++++++--- metagpt/rag/factories/index.py | 2 +- metagpt/rag/factories/llm.py | 7 +- metagpt/rag/factories/ranker.py | 24 +++++ metagpt/rag/factories/retriever.py | 89 +++++++++++++---- metagpt/rag/retrievers/bm25_retriever.py | 6 +- metagpt/rag/schema.py | 37 ++++++- metagpt/utils/async_helper.py | 15 +++ setup.py | 3 + tests/metagpt/rag/engines/test_simple.py | 24 ++--- tests/metagpt/rag/factories/test_base.py | 5 +- tests/metagpt/rag/factories/test_embedding.py | 97 +++++++++++++++---- tests/metagpt/rag/factories/test_retriever.py | 50 +++++++--- 17 files changed, 482 insertions(+), 113 deletions(-) create mode 100644 metagpt/configs/embedding_config.py diff --git a/metagpt/config2.py b/metagpt/config2.py index 8c61fdbf2..717fe63a9 100644 --- a/metagpt/config2.py +++ b/metagpt/config2.py @@ -12,6 +12,7 @@ from typing import Dict, Iterable, List, Literal, Optional from pydantic import BaseModel, model_validator from metagpt.configs.browser_config import BrowserConfig +from metagpt.configs.embedding_config import EmbeddingConfig from metagpt.configs.llm_config import LLMConfig, LLMType from metagpt.configs.mermaid_config import MermaidConfig from metagpt.configs.redis_config import RedisConfig @@ -48,6 +49,9 @@ class Config(CLIParams, YamlModel): # Key Parameters llm: LLMConfig + # RAG Embedding + embedding: EmbeddingConfig = EmbeddingConfig() + # Global Proxy. Not used by LLM, but by other tools such as browsers. 
proxy: str = "" diff --git a/metagpt/configs/embedding_config.py b/metagpt/configs/embedding_config.py new file mode 100644 index 000000000..20de47999 --- /dev/null +++ b/metagpt/configs/embedding_config.py @@ -0,0 +1,50 @@ +from enum import Enum +from typing import Optional + +from pydantic import field_validator + +from metagpt.utils.yaml_model import YamlModel + + +class EmbeddingType(Enum): + OPENAI = "openai" + AZURE = "azure" + GEMINI = "gemini" + OLLAMA = "ollama" + + +class EmbeddingConfig(YamlModel): + """Config for Embedding. + + Examples: + --------- + api_type: "openai" + api_key: "YOU_API_KEY" + + api_type: "azure" + api_key: "YOU_API_KEY" + base_url: "YOU_BASE_URL" + api_version: "YOU_API_VERSION" + + api_type: "gemini" + api_key: "YOU_API_KEY" + + api_type: "ollama" + base_url: "YOU_BASE_URL" + model: "YOU_MODEL" + """ + + api_type: Optional[EmbeddingType] = None + api_key: Optional[str] = None + base_url: Optional[str] = None + api_version: Optional[str] = None + + model: Optional[str] = None + embed_batch_size: Optional[int] = None + + @field_validator("api_type", mode="before") + @classmethod + def check_api_type(cls, v): + if v == "": + return None + return v diff --git a/metagpt/rag/engines/simple.py b/metagpt/rag/engines/simple.py index 623b3f350..c237dcf69 100644 --- a/metagpt/rag/engines/simple.py +++ b/metagpt/rag/engines/simple.py @@ -4,8 +4,7 @@ import json import os from typing import Any, Optional, Union -from fsspec import AbstractFileSystem -from llama_index.core import SimpleDirectoryReader, VectorStoreIndex +from llama_index.core import SimpleDirectoryReader from llama_index.core.callbacks.base import CallbackManager from llama_index.core.embeddings import BaseEmbedding from llama_index.core.embeddings.mock_embed_model import MockEmbedding @@ -64,7 +63,7 @@ class SimpleEngine(RetrieverQueryEngine): response_synthesizer: Optional[BaseSynthesizer] = None, node_postprocessors: Optional[list[BaseNodePostprocessor]] = None, 
callback_manager: Optional[CallbackManager] = None, - index: Optional[BaseIndex] = None, + transformations: Optional[list[TransformComponent]] = None, ) -> None: super().__init__( retriever=retriever, @@ -72,7 +71,7 @@ class SimpleEngine(RetrieverQueryEngine): node_postprocessors=node_postprocessors, callback_manager=callback_manager, ) - self.index = index + self._transformations = transformations or self._default_transformations() @classmethod def from_docs( @@ -84,7 +83,6 @@ class SimpleEngine(RetrieverQueryEngine): llm: LLM = None, retriever_configs: list[BaseRetrieverConfig] = None, ranker_configs: list[BaseRankerConfig] = None, - fs: Optional[AbstractFileSystem] = None, ) -> "SimpleEngine": """From docs. @@ -102,15 +100,20 @@ class SimpleEngine(RetrieverQueryEngine): if not input_dir and not input_files: raise ValueError("Must provide either `input_dir` or `input_files`.") - documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files, fs=fs).load_data() + documents = SimpleDirectoryReader(input_dir=input_dir, input_files=input_files).load_data() cls._fix_document_metadata(documents) - index = VectorStoreIndex.from_documents( - documents=documents, - transformations=transformations or [SentenceSplitter()], - embed_model=cls._resolve_embed_model(embed_model, retriever_configs), + transformations = transformations or cls._default_transformations() + nodes = run_transformations(documents, transformations=transformations) + + return cls._from_nodes( + nodes=nodes, + transformations=transformations, + embed_model=embed_model, + llm=llm, + retriever_configs=retriever_configs, + ranker_configs=ranker_configs, ) - return cls._from_index(index, llm=llm, retriever_configs=retriever_configs, ranker_configs=ranker_configs) @classmethod def from_objs( @@ -139,12 +142,15 @@ class SimpleEngine(RetrieverQueryEngine): raise ValueError("In BM25RetrieverConfig, Objs must not be empty.") nodes = [ObjectNode(text=obj.rag_key(), 
metadata=ObjectNode.get_obj_metadata(obj)) for obj in objs] - index = VectorStoreIndex( + + return cls._from_nodes( nodes=nodes, - transformations=transformations or [SentenceSplitter()], - embed_model=cls._resolve_embed_model(embed_model, retriever_configs), + transformations=transformations, + embed_model=embed_model, + llm=llm, + retriever_configs=retriever_configs, + ranker_configs=ranker_configs, ) - return cls._from_index(index, llm=llm, retriever_configs=retriever_configs, ranker_configs=ranker_configs) @classmethod def from_index( @@ -163,6 +169,13 @@ class SimpleEngine(RetrieverQueryEngine): """Inplement tools.SearchInterface""" return await self.aquery(content) + def retrieve(self, query: QueryType) -> list[NodeWithScore]: + query_bundle = QueryBundle(query) if isinstance(query, str) else query + + nodes = super().retrieve(query_bundle) + self._try_reconstruct_obj(nodes) + return nodes + async def aretrieve(self, query: QueryType) -> list[NodeWithScore]: """Allow query to be str.""" query_bundle = QueryBundle(query) if isinstance(query, str) else query @@ -178,7 +191,7 @@ class SimpleEngine(RetrieverQueryEngine): documents = SimpleDirectoryReader(input_files=input_files).load_data() self._fix_document_metadata(documents) - nodes = run_transformations(documents, transformations=self.index._transformations) + nodes = run_transformations(documents, transformations=self._transformations) self._save_nodes(nodes) def add_objs(self, objs: list[RAGObject]): @@ -194,6 +207,29 @@ class SimpleEngine(RetrieverQueryEngine): self._persist(str(persist_dir), **kwargs) + @classmethod + def _from_nodes( + cls, + nodes: list[BaseNode], + transformations: Optional[list[TransformComponent]] = None, + embed_model: BaseEmbedding = None, + llm: LLM = None, + retriever_configs: list[BaseRetrieverConfig] = None, + ranker_configs: list[BaseRankerConfig] = None, + ) -> "SimpleEngine": + embed_model = cls._resolve_embed_model(embed_model, retriever_configs) + llm = llm or 
get_rag_llm() + + retriever = get_retriever(configs=retriever_configs, nodes=nodes, embed_model=embed_model) + rankers = get_rankers(configs=ranker_configs, llm=llm) # Default [] + + return cls( + retriever=retriever, + node_postprocessors=rankers, + response_synthesizer=get_response_synthesizer(llm=llm), + transformations=transformations, + ) + @classmethod def _from_index( cls, @@ -203,6 +239,7 @@ class SimpleEngine(RetrieverQueryEngine): ranker_configs: list[BaseRankerConfig] = None, ) -> "SimpleEngine": llm = llm or get_rag_llm() + retriever = get_retriever(configs=retriever_configs, index=index) # Default index.as_retriever rankers = get_rankers(configs=ranker_configs, llm=llm) # Default [] @@ -210,7 +247,6 @@ class SimpleEngine(RetrieverQueryEngine): retriever=retriever, node_postprocessors=rankers, response_synthesizer=get_response_synthesizer(llm=llm), - index=index, ) def _ensure_retriever_modifiable(self): @@ -261,3 +297,7 @@ class SimpleEngine(RetrieverQueryEngine): return MockEmbedding(embed_dim=1) return embed_model or get_rag_embedding() + + @staticmethod + def _default_transformations(): + return [SentenceSplitter()] diff --git a/metagpt/rag/factories/base.py b/metagpt/rag/factories/base.py index fbdfbf1a8..e58643efe 100644 --- a/metagpt/rag/factories/base.py +++ b/metagpt/rag/factories/base.py @@ -26,6 +26,9 @@ class GenericFactory: if creator: return creator(**kwargs) + self._raise_for_key(key) + + def _raise_for_key(self, key: Any): raise ValueError(f"Creator not registered for key: {key}") @@ -33,19 +36,26 @@ class ConfigBasedFactory(GenericFactory): """Designed to get objects based on object type.""" def get_instance(self, key: Any, **kwargs) -> Any: - """Key is config, such as a pydantic model. + """Get instance by the type of key. - Call func by the type of key, and the key will be passed to func. + Key is config, such as a pydantic model, call func by the type of key, and the key will be passed to func. + Raise Exception if key not found. 
""" creator = self._creators.get(type(key)) if creator: return creator(key, **kwargs) + self._raise_for_key(key) + + def _raise_for_key(self, key: Any): raise ValueError(f"Unknown config: `{type(key)}`, {key}") @staticmethod def _val_from_config_or_kwargs(key: str, config: object = None, **kwargs) -> Any: - """It prioritizes the configuration object's value unless it is None, in which case it looks into kwargs.""" + """It prioritizes the configuration object's value unless it is None, in which case it looks into kwargs. + + Return None if not found. + """ if config is not None and hasattr(config, key): val = getattr(config, key) if val is not None: @@ -54,6 +64,4 @@ class ConfigBasedFactory(GenericFactory): if key in kwargs: return kwargs[key] - raise KeyError( - f"The key '{key}' is required but not provided in either configuration object or keyword arguments." - ) + return None diff --git a/metagpt/rag/factories/embedding.py b/metagpt/rag/factories/embedding.py index 4247db256..3613fd228 100644 --- a/metagpt/rag/factories/embedding.py +++ b/metagpt/rag/factories/embedding.py @@ -1,37 +1,103 @@ """RAG Embedding Factory.""" +from __future__ import annotations + +from typing import Any from llama_index.core.embeddings import BaseEmbedding from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding +from llama_index.embeddings.gemini import GeminiEmbedding +from llama_index.embeddings.ollama import OllamaEmbedding from llama_index.embeddings.openai import OpenAIEmbedding from metagpt.config2 import config +from metagpt.configs.embedding_config import EmbeddingType from metagpt.configs.llm_config import LLMType from metagpt.rag.factories.base import GenericFactory class RAGEmbeddingFactory(GenericFactory): - """Create LlamaIndex Embedding with MetaGPT's config.""" + """Create LlamaIndex Embedding with MetaGPT's embedding config.""" def __init__(self): creators = { + EmbeddingType.OPENAI: self._create_openai, + EmbeddingType.AZURE: self._create_azure, + 
EmbeddingType.GEMINI: self._create_gemini, + EmbeddingType.OLLAMA: self._create_ollama, + # For backward compatibility LLMType.OPENAI: self._create_openai, LLMType.AZURE: self._create_azure, } super().__init__(creators) - def get_rag_embedding(self, key: LLMType = None) -> BaseEmbedding: - """Key is LLMType, default use config.llm.api_type.""" - return super().get_instance(key or config.llm.api_type) + def get_rag_embedding(self, key: EmbeddingType = None) -> BaseEmbedding: + """Key is EmbeddingType.""" + return super().get_instance(key or self._resolve_embedding_type()) - def _create_openai(self): - return OpenAIEmbedding(api_key=config.llm.api_key, api_base=config.llm.base_url) + def _resolve_embedding_type(self) -> EmbeddingType | LLMType: + """Resolves the embedding type. - def _create_azure(self): - return AzureOpenAIEmbedding( - azure_endpoint=config.llm.base_url, - api_key=config.llm.api_key, - api_version=config.llm.api_version, + If the embedding type is not specified, for backward compatibility, it checks if the LLM API type is either OPENAI or AZURE. + Raise TypeError if embedding type not found. 
+ """ + if config.embedding.api_type: + return config.embedding.api_type + + if config.llm.api_type in [LLMType.OPENAI, LLMType.AZURE]: + return config.llm.api_type + + raise TypeError("To use RAG, please set your embedding in config2.yaml.") + + def _create_openai(self) -> OpenAIEmbedding: + params = dict( + api_key=config.embedding.api_key or config.llm.api_key, + api_base=config.embedding.base_url or config.llm.base_url, ) + self._try_set_model_and_batch_size(params) + + return OpenAIEmbedding(**params) + + def _create_azure(self) -> AzureOpenAIEmbedding: + params = dict( + api_key=config.embedding.api_key or config.llm.api_key, + azure_endpoint=config.embedding.base_url or config.llm.base_url, + api_version=config.embedding.api_version or config.llm.api_version, + ) + + self._try_set_model_and_batch_size(params) + + return AzureOpenAIEmbedding(**params) + + def _create_gemini(self) -> GeminiEmbedding: + params = dict( + api_key=config.embedding.api_key, + api_base=config.embedding.base_url, + ) + + self._try_set_model_and_batch_size(params) + + return GeminiEmbedding(**params) + + def _create_ollama(self) -> OllamaEmbedding: + params = dict( + base_url=config.embedding.base_url, + ) + + self._try_set_model_and_batch_size(params) + + return OllamaEmbedding(**params) + + def _try_set_model_and_batch_size(self, params: dict): + """Set the model_name and embed_batch_size only when they are specified.""" + if config.embedding.model: + params["model_name"] = config.embedding.model + + if config.embedding.embed_batch_size: + params["embed_batch_size"] = config.embedding.embed_batch_size + + def _raise_for_key(self, key: Any): + raise ValueError(f"The embedding type is currently not supported: `{type(key)}`, {key}") + get_rag_embedding = RAGEmbeddingFactory().get_rag_embedding diff --git a/metagpt/rag/factories/index.py b/metagpt/rag/factories/index.py index a56471359..f897af3ad 100644 --- a/metagpt/rag/factories/index.py +++ b/metagpt/rag/factories/index.py @@ -48,7 
+48,7 @@ class RAGIndexFactory(ConfigBasedFactory): def _create_chroma(self, config: ChromaIndexConfig, **kwargs) -> VectorStoreIndex: db = chromadb.PersistentClient(str(config.persist_path)) - chroma_collection = db.get_or_create_collection(config.collection_name) + chroma_collection = db.get_or_create_collection(config.collection_name, metadata=config.metadata) vector_store = ChromaVectorStore(chroma_collection=chroma_collection) return self._index_from_vector_store(vector_store=vector_store, config=config, **kwargs) diff --git a/metagpt/rag/factories/llm.py b/metagpt/rag/factories/llm.py index 17c499b76..9fd19cab5 100644 --- a/metagpt/rag/factories/llm.py +++ b/metagpt/rag/factories/llm.py @@ -1,5 +1,5 @@ """RAG LLM.""" - +import asyncio from typing import Any from llama_index.core.constants import DEFAULT_CONTEXT_WINDOW @@ -15,7 +15,7 @@ from pydantic import Field from metagpt.config2 import config from metagpt.llm import LLM from metagpt.provider.base_llm import BaseLLM -from metagpt.utils.async_helper import run_coroutine_in_new_loop +from metagpt.utils.async_helper import NestAsyncio from metagpt.utils.token_counter import TOKEN_MAX @@ -39,7 +39,8 @@ class RAGLLM(CustomLLM): @llm_completion_callback() def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse: - return run_coroutine_in_new_loop(self.acomplete(prompt, **kwargs)) + NestAsyncio.apply_once() + return asyncio.get_event_loop().run_until_complete(self.acomplete(prompt, **kwargs)) @llm_completion_callback() async def acomplete(self, prompt: str, formatted: bool = False, **kwargs: Any) -> CompletionResponse: diff --git a/metagpt/rag/factories/ranker.py b/metagpt/rag/factories/ranker.py index 476fe8c1a..7abda162a 100644 --- a/metagpt/rag/factories/ranker.py +++ b/metagpt/rag/factories/ranker.py @@ -8,6 +8,8 @@ from metagpt.rag.factories.base import ConfigBasedFactory from metagpt.rag.rankers.object_ranker import ObjectSortPostprocessor from metagpt.rag.schema import ( BaseRankerConfig, + 
BGERerankConfig, + CohereRerankConfig, ColbertRerankConfig, LLMRankerConfig, ObjectRankerConfig, @@ -22,6 +24,8 @@ class RankerFactory(ConfigBasedFactory): LLMRankerConfig: self._create_llm_ranker, ColbertRerankConfig: self._create_colbert_ranker, ObjectRankerConfig: self._create_object_ranker, + CohereRerankConfig: self._create_cohere_rerank, + BGERerankConfig: self._create_bge_rerank, } super().__init__(creators) @@ -45,6 +49,26 @@ class RankerFactory(ConfigBasedFactory): ) return ColbertRerank(**config.model_dump()) + def _create_cohere_rerank(self, config: CohereRerankConfig, **kwargs) -> LLMRerank: + try: + from llama_index.postprocessor.cohere_rerank import CohereRerank + except ImportError: + raise ImportError( + "`llama-index-postprocessor-cohere-rerank` package not found, please run `pip install llama-index-postprocessor-cohere-rerank`" + ) + return CohereRerank(**config.model_dump()) + + def _create_bge_rerank(self, config: BGERerankConfig, **kwargs) -> LLMRerank: + try: + from llama_index.postprocessor.flag_embedding_reranker import ( + FlagEmbeddingReranker, + ) + except ImportError: + raise ImportError( + "`llama-index-postprocessor-flag-embedding-reranker` package not found, please run `pip install llama-index-postprocessor-flag-embedding-reranker`" + ) + return FlagEmbeddingReranker(**config.model_dump()) + def _create_object_ranker(self, config: ObjectRankerConfig, **kwargs) -> LLMRerank: return ObjectSortPostprocessor(**config.model_dump()) diff --git a/metagpt/rag/factories/retriever.py b/metagpt/rag/factories/retriever.py index 65729002e..1460e131b 100644 --- a/metagpt/rag/factories/retriever.py +++ b/metagpt/rag/factories/retriever.py @@ -1,10 +1,13 @@ """RAG Retriever Factory.""" -import copy + +from functools import wraps import chromadb import faiss from llama_index.core import StorageContext, VectorStoreIndex +from llama_index.core.embeddings import BaseEmbedding +from llama_index.core.schema import BaseNode from 
llama_index.core.vector_stores.types import BasePydanticVectorStore from llama_index.vector_stores.chroma import ChromaVectorStore from llama_index.vector_stores.elasticsearch import ElasticsearchStore @@ -24,10 +27,25 @@ from metagpt.rag.schema import ( ElasticsearchKeywordRetrieverConfig, ElasticsearchRetrieverConfig, FAISSRetrieverConfig, - IndexRetrieverConfig, ) +def get_or_build_index(build_index_func): + """Decorator to get or build an index. + + Get index using `_extract_index` method, if not found, using build_index_func. + """ + + @wraps(build_index_func) + def wrapper(self, config, **kwargs): + index = self._extract_index(config, **kwargs) + if index is not None: + return index + return build_index_func(self, config, **kwargs) + + return wrapper + + class RetrieverFactory(ConfigBasedFactory): """Modify creators for dynamically instance implementation.""" @@ -54,48 +72,79 @@ class RetrieverFactory(ConfigBasedFactory): return SimpleHybridRetriever(*retrievers) if len(retrievers) > 1 else retrievers[0] def _create_default(self, **kwargs) -> RAGRetriever: - return self._extract_index(**kwargs).as_retriever() + index = self._extract_index(None, **kwargs) or self._build_default_index(**kwargs) + + return index.as_retriever() def _create_faiss_retriever(self, config: FAISSRetrieverConfig, **kwargs) -> FAISSRetriever: - vector_store = FaissVectorStore(faiss_index=faiss.IndexFlatL2(config.dimensions)) - config.index = self._build_index_from_vector_store(config, vector_store, **kwargs) + config.index = self._build_faiss_index(config, **kwargs) return FAISSRetriever(**config.model_dump()) def _create_bm25_retriever(self, config: BM25RetrieverConfig, **kwargs) -> DynamicBM25Retriever: - config.index = copy.deepcopy(self._extract_index(config, **kwargs)) + index = self._extract_index(config, **kwargs) + nodes = list(index.docstore.docs.values()) if index else self._extract_nodes(config, **kwargs) - return 
DynamicBM25Retriever(nodes=list(config.index.docstore.docs.values()), **config.model_dump()) + return DynamicBM25Retriever(nodes=nodes, **config.model_dump()) def _create_chroma_retriever(self, config: ChromaRetrieverConfig, **kwargs) -> ChromaRetriever: - db = chromadb.PersistentClient(path=str(config.persist_path)) - chroma_collection = db.get_or_create_collection(config.collection_name) - - vector_store = ChromaVectorStore(chroma_collection=chroma_collection) - config.index = self._build_index_from_vector_store(config, vector_store, **kwargs) + config.index = self._build_chroma_index(config, **kwargs) return ChromaRetriever(**config.model_dump()) def _create_es_retriever(self, config: ElasticsearchRetrieverConfig, **kwargs) -> ElasticsearchRetriever: - vector_store = ElasticsearchStore(**config.store_config.model_dump()) - config.index = self._build_index_from_vector_store(config, vector_store, **kwargs) + config.index = self._build_es_index(config, **kwargs) return ElasticsearchRetriever(**config.model_dump()) def _extract_index(self, config: BaseRetrieverConfig = None, **kwargs) -> VectorStoreIndex: return self._val_from_config_or_kwargs("index", config, **kwargs) + def _extract_nodes(self, config: BaseRetrieverConfig = None, **kwargs) -> list[BaseNode]: + return self._val_from_config_or_kwargs("nodes", config, **kwargs) + + def _extract_embed_model(self, config: BaseRetrieverConfig = None, **kwargs) -> BaseEmbedding: + return self._val_from_config_or_kwargs("embed_model", config, **kwargs) + + def _build_default_index(self, **kwargs) -> VectorStoreIndex: + index = VectorStoreIndex( + nodes=self._extract_nodes(**kwargs), + embed_model=self._extract_embed_model(**kwargs), + ) + + return index + + @get_or_build_index + def _build_faiss_index(self, config: FAISSRetrieverConfig, **kwargs) -> VectorStoreIndex: + vector_store = FaissVectorStore(faiss_index=faiss.IndexFlatL2(config.dimensions)) + + return self._build_index_from_vector_store(config, vector_store, 
**kwargs) + + @get_or_build_index + def _build_chroma_index(self, config: ChromaRetrieverConfig, **kwargs) -> VectorStoreIndex: + db = chromadb.PersistentClient(path=str(config.persist_path)) + chroma_collection = db.get_or_create_collection(config.collection_name, metadata=config.metadata) + vector_store = ChromaVectorStore(chroma_collection=chroma_collection) + + return self._build_index_from_vector_store(config, vector_store, **kwargs) + + @get_or_build_index + def _build_es_index(self, config: ElasticsearchRetrieverConfig, **kwargs) -> VectorStoreIndex: + vector_store = ElasticsearchStore(**config.store_config.model_dump()) + + return self._build_index_from_vector_store(config, vector_store, **kwargs) + def _build_index_from_vector_store( - self, config: IndexRetrieverConfig, vector_store: BasePydanticVectorStore, **kwargs + self, config: BaseRetrieverConfig, vector_store: BasePydanticVectorStore, **kwargs ) -> VectorStoreIndex: storage_context = StorageContext.from_defaults(vector_store=vector_store) - old_index = self._extract_index(config, **kwargs) - new_index = VectorStoreIndex( - nodes=list(old_index.docstore.docs.values()), + index = VectorStoreIndex( + nodes=self._extract_nodes(config, **kwargs), storage_context=storage_context, - embed_model=old_index._embed_model, + embed_model=self._extract_embed_model(config, **kwargs), ) - return new_index + + return index get_retriever = RetrieverFactory().get_retriever diff --git a/metagpt/rag/retrievers/bm25_retriever.py b/metagpt/rag/retrievers/bm25_retriever.py index 241820cf4..3b085cb73 100644 --- a/metagpt/rag/retrievers/bm25_retriever.py +++ b/metagpt/rag/retrievers/bm25_retriever.py @@ -40,8 +40,10 @@ class DynamicBM25Retriever(BM25Retriever): self._corpus = [self._tokenizer(node.get_content()) for node in self._nodes] self.bm25 = BM25Okapi(self._corpus) - self._index.insert_nodes(nodes, **kwargs) + if self._index: + self._index.insert_nodes(nodes, **kwargs) def persist(self, persist_dir: str, **kwargs) -> 
None: """Support persist.""" - self._index.storage_context.persist(persist_dir) + if self._index: + self._index.storage_context.persist(persist_dir) \ No newline at end of file diff --git a/metagpt/rag/schema.py b/metagpt/rag/schema.py index 183f6e0c7..e7b2e5ce9 100644 --- a/metagpt/rag/schema.py +++ b/metagpt/rag/schema.py @@ -1,14 +1,17 @@ """RAG schemas.""" from pathlib import Path -from typing import Any, Literal, Union +from typing import Any, ClassVar, Literal, Optional, Union +from chromadb.api.types import CollectionMetadata from llama_index.core.embeddings import BaseEmbedding from llama_index.core.indices.base import BaseIndex from llama_index.core.schema import TextNode from llama_index.core.vector_stores.types import VectorStoreQueryMode -from pydantic import BaseModel, ConfigDict, Field, PrivateAttr +from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator +from metagpt.config2 import config +from metagpt.configs.embedding_config import EmbeddingType from metagpt.rag.interface import RAGObject @@ -31,7 +34,19 @@ class IndexRetrieverConfig(BaseRetrieverConfig): class FAISSRetrieverConfig(IndexRetrieverConfig): """Config for FAISS-based retrievers.""" - dimensions: int = Field(default=1536, description="Dimensionality of the vectors for FAISS index construction.") + dimensions: int = Field(default=0, description="Dimensionality of the vectors for FAISS index construction.") + + _embedding_type_to_dimensions: ClassVar[dict[EmbeddingType, int]] = { + EmbeddingType.GEMINI: 768, + EmbeddingType.OLLAMA: 4096, + } + + @model_validator(mode="after") + def check_dimensions(self): + if self.dimensions == 0: + self.dimensions = self._embedding_type_to_dimensions.get(config.embedding.api_type, 1536) + + return self class BM25RetrieverConfig(IndexRetrieverConfig): @@ -45,6 +60,9 @@ class ChromaRetrieverConfig(IndexRetrieverConfig): persist_path: Union[str, Path] = Field(default="./chroma_db", description="The directory to save data.") 
collection_name: str = Field(default="metagpt", description="The name of the collection.") + metadata: Optional[CollectionMetadata] = Field( + default=None, description="Optional metadata to associate with the collection" + ) class ElasticsearchStoreConfig(BaseModel): @@ -101,6 +119,16 @@ class ColbertRerankConfig(BaseRankerConfig): keep_retrieval_score: bool = Field(default=False, description="Whether to keep the retrieval score in metadata.") +class CohereRerankConfig(BaseRankerConfig): + model: str = Field(default="rerank-english-v3.0") + api_key: str = Field(default="YOUR_COHERE_API") + + +class BGERerankConfig(BaseRankerConfig): + model: str = Field(default="BAAI/bge-reranker-large", description="BAAI Reranker model name.") + use_fp16: bool = Field(default=True, description="Whether to use fp16 for inference.") + + class ObjectRankerConfig(BaseRankerConfig): field_name: str = Field(..., description="field name of the object, field's value must can be compared.") order: Literal["desc", "asc"] = Field(default="desc", description="the direction of order.") @@ -130,6 +158,9 @@ class ChromaIndexConfig(VectorIndexConfig): """Config for chroma-based index.""" collection_name: str = Field(default="metagpt", description="The name of the collection.") + metadata: Optional[CollectionMetadata] = Field( + default=None, description="Optional metadata to associate with the collection" + ) class BM25IndexConfig(BaseIndexConfig): diff --git a/metagpt/utils/async_helper.py b/metagpt/utils/async_helper.py index ee440ef44..cecb20c5d 100644 --- a/metagpt/utils/async_helper.py +++ b/metagpt/utils/async_helper.py @@ -20,3 +20,18 @@ def run_coroutine_in_new_loop(coroutine) -> Any: new_loop.call_soon_threadsafe(new_loop.stop) t.join() new_loop.close() + + +class NestAsyncio: + """Make asyncio event loop reentrant.""" + + is_applied = False + + @classmethod + def apply_once(cls): + """Ensures `nest_asyncio.apply()` is called only once.""" + if not cls.is_applied: + import nest_asyncio 
+ + nest_asyncio.apply() + cls.is_applied = True diff --git a/setup.py b/setup.py index 382e13a47..79b65ad47 100644 --- a/setup.py +++ b/setup.py @@ -32,12 +32,15 @@ extras_require = { "llama-index-core==0.10.15", "llama-index-embeddings-azure-openai==0.1.6", "llama-index-embeddings-openai==0.1.5", + "llama-index-embeddings-gemini==0.1.6", + "llama-index-embeddings-ollama==0.1.2", "llama-index-llms-azure-openai==0.1.4", "llama-index-readers-file==0.1.4", "llama-index-retrievers-bm25==0.1.3", "llama-index-vector-stores-faiss==0.1.1", "llama-index-vector-stores-elasticsearch==0.1.6", "llama-index-vector-stores-chroma==0.1.6", + "docx2txt==0.8", ], } diff --git a/tests/metagpt/rag/engines/test_simple.py b/tests/metagpt/rag/engines/test_simple.py index 9262ccb07..8c7a15be2 100644 --- a/tests/metagpt/rag/engines/test_simple.py +++ b/tests/metagpt/rag/engines/test_simple.py @@ -25,10 +25,6 @@ class TestSimpleEngine: def mock_simple_directory_reader(self, mocker): return mocker.patch("metagpt.rag.engines.simple.SimpleDirectoryReader") - @pytest.fixture - def mock_vector_store_index(self, mocker): - return mocker.patch("metagpt.rag.engines.simple.VectorStoreIndex.from_documents") - @pytest.fixture def mock_get_retriever(self, mocker): return mocker.patch("metagpt.rag.engines.simple.get_retriever") @@ -45,7 +41,6 @@ class TestSimpleEngine: self, mocker, mock_simple_directory_reader, - mock_vector_store_index, mock_get_retriever, mock_get_rankers, mock_get_response_synthesizer, @@ -81,11 +76,8 @@ class TestSimpleEngine: # Assert mock_simple_directory_reader.assert_called_once_with(input_dir=input_dir, input_files=input_files) - mock_vector_store_index.assert_called_once() - mock_get_retriever.assert_called_once_with( - configs=retriever_configs, index=mock_vector_store_index.return_value - ) - mock_get_rankers.assert_called_once_with(configs=ranker_configs, llm=llm) + mock_get_retriever.assert_called_once() + mock_get_rankers.assert_called_once() 
mock_get_response_synthesizer.assert_called_once_with(llm=llm) assert isinstance(engine, SimpleEngine) @@ -119,7 +111,7 @@ class TestSimpleEngine: # Assert assert isinstance(engine, SimpleEngine) - assert engine.index is not None + assert engine._transformations is not None def test_from_objs_with_bm25_config(self): # Setup @@ -137,6 +129,7 @@ class TestSimpleEngine: def test_from_index(self, mocker, mock_llm, mock_embedding): # Mock mock_index = mocker.MagicMock(spec=VectorStoreIndex) + mock_index.as_retriever.return_value = "retriever" mock_get_index = mocker.patch("metagpt.rag.engines.simple.get_index") mock_get_index.return_value = mock_index @@ -149,7 +142,7 @@ class TestSimpleEngine: # Assert assert isinstance(engine, SimpleEngine) - assert engine.index is mock_index + assert engine._retriever == "retriever" @pytest.mark.asyncio async def test_asearch(self, mocker): @@ -200,14 +193,11 @@ class TestSimpleEngine: mock_retriever = mocker.MagicMock(spec=ModifiableRAGRetriever) - mock_index = mocker.MagicMock(spec=VectorStoreIndex) - mock_index._transformations = mocker.MagicMock() - mock_run_transformations = mocker.patch("metagpt.rag.engines.simple.run_transformations") mock_run_transformations.return_value = ["node1", "node2"] # Setup - engine = SimpleEngine(retriever=mock_retriever, index=mock_index) + engine = SimpleEngine(retriever=mock_retriever) input_files = ["test_file1", "test_file2"] # Exec @@ -230,7 +220,7 @@ class TestSimpleEngine: return "" objs = [CustomTextNode(text=f"text_{i}", metadata={"obj": f"obj_{i}"}) for i in range(2)] - engine = SimpleEngine(retriever=mock_retriever, index=mocker.MagicMock()) + engine = SimpleEngine(retriever=mock_retriever) # Exec engine.add_objs(objs=objs) diff --git a/tests/metagpt/rag/factories/test_base.py b/tests/metagpt/rag/factories/test_base.py index 1d41e1872..0b0a44976 100644 --- a/tests/metagpt/rag/factories/test_base.py +++ b/tests/metagpt/rag/factories/test_base.py @@ -97,6 +97,5 @@ class 
TestConfigBasedFactory: def test_val_from_config_or_kwargs_key_error(self): # Test KeyError when the key is not found in both config object and kwargs config = DummyConfig(name=None) - with pytest.raises(KeyError) as exc_info: - ConfigBasedFactory._val_from_config_or_kwargs("missing_key", config) - assert "The key 'missing_key' is required but not provided" in str(exc_info.value) + val = ConfigBasedFactory._val_from_config_or_kwargs("missing_key", config) + assert val is None diff --git a/tests/metagpt/rag/factories/test_embedding.py b/tests/metagpt/rag/factories/test_embedding.py index 1ded6b4a8..1a9e9b2c9 100644 --- a/tests/metagpt/rag/factories/test_embedding.py +++ b/tests/metagpt/rag/factories/test_embedding.py @@ -1,5 +1,6 @@ import pytest +from metagpt.configs.embedding_config import EmbeddingType from metagpt.configs.llm_config import LLMType from metagpt.rag.factories.embedding import RAGEmbeddingFactory @@ -10,30 +11,51 @@ class TestRAGEmbeddingFactory: self.embedding_factory = RAGEmbeddingFactory() @pytest.fixture - def mock_openai_embedding(self, mocker): + def mock_config(self, mocker): + return mocker.patch("metagpt.rag.factories.embedding.config") + + @staticmethod + def mock_openai_embedding(mocker): return mocker.patch("metagpt.rag.factories.embedding.OpenAIEmbedding") - @pytest.fixture - def mock_azure_embedding(self, mocker): + @staticmethod + def mock_azure_embedding(mocker): return mocker.patch("metagpt.rag.factories.embedding.AzureOpenAIEmbedding") - def test_get_rag_embedding_openai(self, mock_openai_embedding): - # Exec - self.embedding_factory.get_rag_embedding(LLMType.OPENAI) + @staticmethod + def mock_gemini_embedding(mocker): + return mocker.patch("metagpt.rag.factories.embedding.GeminiEmbedding") - # Assert - mock_openai_embedding.assert_called_once() + @staticmethod + def mock_ollama_embedding(mocker): + return mocker.patch("metagpt.rag.factories.embedding.OllamaEmbedding") - def test_get_rag_embedding_azure(self, 
mock_azure_embedding): - # Exec - self.embedding_factory.get_rag_embedding(LLMType.AZURE) - - # Assert - mock_azure_embedding.assert_called_once() - - def test_get_rag_embedding_default(self, mocker, mock_openai_embedding): + @pytest.mark.parametrize( + ("mock_func", "embedding_type"), + [ + (mock_openai_embedding, LLMType.OPENAI), + (mock_azure_embedding, LLMType.AZURE), + (mock_openai_embedding, EmbeddingType.OPENAI), + (mock_azure_embedding, EmbeddingType.AZURE), + (mock_gemini_embedding, EmbeddingType.GEMINI), + (mock_ollama_embedding, EmbeddingType.OLLAMA), + ], + ) + def test_get_rag_embedding(self, mock_func, embedding_type, mocker): # Mock - mock_config = mocker.patch("metagpt.rag.factories.embedding.config") + mock = mock_func(mocker) + + # Exec + self.embedding_factory.get_rag_embedding(embedding_type) + + # Assert + mock.assert_called_once() + + def test_get_rag_embedding_default(self, mocker, mock_config): + # Mock + mock_openai_embedding = self.mock_openai_embedding(mocker) + + mock_config.embedding.api_type = None mock_config.llm.api_type = LLMType.OPENAI # Exec @@ -41,3 +63,44 @@ class TestRAGEmbeddingFactory: # Assert mock_openai_embedding.assert_called_once() + + @pytest.mark.parametrize( + "model, embed_batch_size, expected_params", + [("test_model", 100, {"model_name": "test_model", "embed_batch_size": 100}), (None, None, {})], + ) + def test_try_set_model_and_batch_size(self, mock_config, model, embed_batch_size, expected_params): + # Mock + mock_config.embedding.model = model + mock_config.embedding.embed_batch_size = embed_batch_size + + # Setup + test_params = {} + + # Exec + self.embedding_factory._try_set_model_and_batch_size(test_params) + + # Assert + assert test_params == expected_params + + def test_resolve_embedding_type(self, mock_config): + # Mock + mock_config.embedding.api_type = EmbeddingType.OPENAI + + # Exec + embedding_type = self.embedding_factory._resolve_embedding_type() + + # Assert + assert embedding_type == 
EmbeddingType.OPENAI + + def test_resolve_embedding_type_exception(self, mock_config): + # Mock + mock_config.embedding.api_type = None + mock_config.llm.api_type = LLMType.GEMINI + + # Assert + with pytest.raises(TypeError): + self.embedding_factory._resolve_embedding_type() + + def test_raise_for_key(self): + with pytest.raises(ValueError): + self.embedding_factory._raise_for_key("key") diff --git a/tests/metagpt/rag/factories/test_retriever.py b/tests/metagpt/rag/factories/test_retriever.py index ef1cef7e0..cd55a32db 100644 --- a/tests/metagpt/rag/factories/test_retriever.py +++ b/tests/metagpt/rag/factories/test_retriever.py @@ -1,6 +1,8 @@ import faiss import pytest from llama_index.core import VectorStoreIndex +from llama_index.core.embeddings import MockEmbedding +from llama_index.core.schema import TextNode from llama_index.vector_stores.chroma import ChromaVectorStore from llama_index.vector_stores.elasticsearch import ElasticsearchStore @@ -43,6 +45,14 @@ class TestRetrieverFactory: def mock_es_vector_store(self, mocker): return mocker.MagicMock(spec=ElasticsearchStore) + @pytest.fixture + def mock_nodes(self, mocker): + return [TextNode(text="msg")] + + @pytest.fixture + def mock_embedding(self): + return MockEmbedding(embed_dim=1) + def test_get_retriever_with_faiss_config(self, mock_faiss_index, mocker, mock_vector_store_index): mock_config = FAISSRetrieverConfig(dimensions=128) mocker.patch("faiss.IndexFlatL2", return_value=mock_faiss_index) @@ -52,42 +62,40 @@ class TestRetrieverFactory: assert isinstance(retriever, FAISSRetriever) - def test_get_retriever_with_bm25_config(self, mocker, mock_vector_store_index): + def test_get_retriever_with_bm25_config(self, mocker, mock_nodes): mock_config = BM25RetrieverConfig() mocker.patch("rank_bm25.BM25Okapi.__init__", return_value=None) - mocker.patch.object(self.retriever_factory, "_extract_index", return_value=mock_vector_store_index) - retriever = self.retriever_factory.get_retriever(configs=[mock_config]) 
+ retriever = self.retriever_factory.get_retriever(configs=[mock_config], nodes=mock_nodes) assert isinstance(retriever, DynamicBM25Retriever) - def test_get_retriever_with_multiple_configs_returns_hybrid(self, mocker, mock_vector_store_index): - mock_faiss_config = FAISSRetrieverConfig(dimensions=128) + def test_get_retriever_with_multiple_configs_returns_hybrid(self, mocker, mock_nodes, mock_embedding): + mock_faiss_config = FAISSRetrieverConfig(dimensions=1) mock_bm25_config = BM25RetrieverConfig() mocker.patch("rank_bm25.BM25Okapi.__init__", return_value=None) - mocker.patch.object(self.retriever_factory, "_extract_index", return_value=mock_vector_store_index) - retriever = self.retriever_factory.get_retriever(configs=[mock_faiss_config, mock_bm25_config]) + retriever = self.retriever_factory.get_retriever( + configs=[mock_faiss_config, mock_bm25_config], nodes=mock_nodes, embed_model=mock_embedding + ) assert isinstance(retriever, SimpleHybridRetriever) - def test_get_retriever_with_chroma_config(self, mocker, mock_vector_store_index, mock_chroma_vector_store): + def test_get_retriever_with_chroma_config(self, mocker, mock_chroma_vector_store, mock_embedding): mock_config = ChromaRetrieverConfig(persist_path="/path/to/chroma", collection_name="test_collection") mock_chromadb = mocker.patch("metagpt.rag.factories.retriever.chromadb.PersistentClient") mock_chromadb.get_or_create_collection.return_value = mocker.MagicMock() mocker.patch("metagpt.rag.factories.retriever.ChromaVectorStore", return_value=mock_chroma_vector_store) - mocker.patch.object(self.retriever_factory, "_extract_index", return_value=mock_vector_store_index) - retriever = self.retriever_factory.get_retriever(configs=[mock_config]) + retriever = self.retriever_factory.get_retriever(configs=[mock_config], nodes=[], embed_model=mock_embedding) assert isinstance(retriever, ChromaRetriever) - def test_get_retriever_with_es_config(self, mocker, mock_vector_store_index, mock_es_vector_store): + def 
test_get_retriever_with_es_config(self, mocker, mock_es_vector_store, mock_embedding): mock_config = ElasticsearchRetrieverConfig(store_config=ElasticsearchStoreConfig()) mocker.patch("metagpt.rag.factories.retriever.ElasticsearchStore", return_value=mock_es_vector_store) - mocker.patch.object(self.retriever_factory, "_extract_index", return_value=mock_vector_store_index) - retriever = self.retriever_factory.get_retriever(configs=[mock_config]) + retriever = self.retriever_factory.get_retriever(configs=[mock_config], nodes=[], embed_model=mock_embedding) assert isinstance(retriever, ElasticsearchRetriever) @@ -111,3 +119,19 @@ class TestRetrieverFactory: extracted_index = self.retriever_factory._extract_index(index=mock_vector_store_index) assert extracted_index == mock_vector_store_index + + def test_get_or_build_when_get(self, mocker): + want = "existing_index" + mocker.patch.object(self.retriever_factory, "_extract_index", return_value=want) + + got = self.retriever_factory._build_es_index(None) + + assert got == want + + def test_get_or_build_when_build(self, mocker): + want = "call_build_es_index" + mocker.patch.object(self.retriever_factory, "_build_es_index", return_value=want) + + got = self.retriever_factory._build_es_index(None) + + assert got == want