Merge branch 'check_role_zero' into da_change

2026-06-08 15:05:17 +02:00 · 2024-08-14 22:28:45 +08:00 · 2024-08-14 22:28:45 +08:00 · c79a70517f
commit c79a70517f
parent 9963f90c92 55c5f794d7
54 changed files with 376 additions and 216 deletions
--- a/examples/agent_creator.py
+++ b/examples/agent_creator.py
@ -6,12 +6,13 @@ Author: garylin2099
 import re

 from metagpt.actions import Action
-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.const import METAGPT_ROOT
 from metagpt.logs import logger
 from metagpt.roles import Role
 from metagpt.schema import Message

+config = Config.default()
 EXAMPLE_CODE_FILE = METAGPT_ROOT / "examples/build_customized_agent.py"
 MULTI_ACTION_AGENT_CODE_EXAMPLE = EXAMPLE_CODE_FILE.read_text()

--- a/examples/exp_pool/decorator.py
+++ b/examples/exp_pool/decorator.py
@ -5,7 +5,7 @@ This script demonstrates how to automatically store experiences using @exp_cache
 import asyncio
 import uuid

-from metagpt.exp_pool import exp_cache, exp_manager
+from metagpt.exp_pool import exp_cache, get_exp_manager
 from metagpt.logs import logger


@ -20,7 +20,7 @@ async def main():
    resp = await produce(req=req)
    logger.info(f"The response of `produce({req})` is: {resp}")

-    exps = await exp_manager.query_exps(req)
+    exps = await get_exp_manager().query_exps(req)
    logger.info(f"Find experiences: {exps}")


--- a/metagpt/actions/rebuild_class_view.py
+++ b/metagpt/actions/rebuild_class_view.py
@ -14,7 +14,6 @@ from typing import Optional, Set, Tuple
 import aiofiles

 from metagpt.actions import Action
-from metagpt.config2 import config
 from metagpt.const import (
    AGGREGATION,
    COMPOSITION,
@ -40,7 +39,7 @@ class RebuildClassView(Action):

    graph_db: Optional[GraphRepository] = None

-    async def run(self, with_messages=None, format=config.prompt_schema):
+    async def run(self, with_messages=None, format=None):
        """
        Implementation of `Action`'s `run` method.

@ -48,6 +47,7 @@ class RebuildClassView(Action):
            with_messages (Optional[Type]): An optional argument specifying messages to react to.
            format (str): The format for the prompt schema.
        """
+        format = format if format else self.config.prompt_schema
        graph_repo_pathname = self.context.git_repo.workdir / GRAPH_REPO_FILE_REPO / self.context.git_repo.workdir.name
        self.graph_db = await DiGraphRepository.load_from(str(graph_repo_pathname.with_suffix(".json")))
        repo_parser = RepoParser(base_directory=Path(self.i_context))
--- a/metagpt/actions/rebuild_sequence_view.py
+++ b/metagpt/actions/rebuild_sequence_view.py
@ -18,7 +18,6 @@ from pydantic import BaseModel
 from tenacity import retry, stop_after_attempt, wait_random_exponential

 from metagpt.actions import Action
-from metagpt.config2 import config
 from metagpt.const import GRAPH_REPO_FILE_REPO
 from metagpt.logs import logger
 from metagpt.repo_parser import CodeBlockInfo, DotClassInfo
@ -84,7 +83,7 @@ class RebuildSequenceView(Action):

    graph_db: Optional[GraphRepository] = None

-    async def run(self, with_messages=None, format=config.prompt_schema):
+    async def run(self, with_messages=None, format=None):
        """
        Implementation of `Action`'s `run` method.

@ -92,6 +91,7 @@ class RebuildSequenceView(Action):
            with_messages (Optional[Type]): An optional argument specifying messages to react to.
            format (str): The format for the prompt schema.
        """
+        format = format if format else self.config.prompt_schema
        graph_repo_pathname = self.context.git_repo.workdir / GRAPH_REPO_FILE_REPO / self.context.git_repo.workdir.name
        self.graph_db = await DiGraphRepository.load_from(str(graph_repo_pathname.with_suffix(".json")))
        if not self.i_context:
--- a/metagpt/actions/research.py
+++ b/metagpt/actions/research.py
@ -8,7 +8,6 @@ from typing import Any, Callable, Coroutine, Optional, Union
 from pydantic import TypeAdapter, model_validator

 from metagpt.actions import Action
-from metagpt.config2 import config
 from metagpt.logs import logger
 from metagpt.tools.search_engine import SearchEngine
 from metagpt.tools.web_browser_engine import WebBrowserEngine
@ -134,8 +133,8 @@ class CollectLinks(Action):
                if len(remove) == 0:
                    break

-        model_name = config.llm.model
-        prompt = reduce_message_length(gen_msg(), model_name, system_text, config.llm.max_token)
+        model_name = self.config.llm.model
+        prompt = reduce_message_length(gen_msg(), model_name, system_text, self.config.llm.max_token)
        logger.debug(prompt)
        queries = await self._aask(prompt, [system_text])
        try:
--- a/metagpt/actions/search_enhanced_qa.py
+++ b/metagpt/actions/search_enhanced_qa.py
@ -45,7 +45,9 @@ Follow **Instructions**, generate output and make sure it follows the **Constrai
 SEARCH_ENHANCED_QA_SYSTEM_PROMPT = """
 You are a large language AI assistant built by MGX. You are given a user question, and please write clean, concise and accurate answer to the question. You will be given a set of related contexts to the question, each starting with a reference number like [[citation:x]], where x is a number. Please use the context.

-Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context do not provide sufficient information. Do not include [citation] in your anwser.
+Your answer must be correct, accurate and written by an expert using an unbiased and professional tone. Please limit to 1024 tokens. Do not give any information that is not related to the question, and do not repeat. Say "information is missing on" followed by the related topic, if the given context do not provide sufficient information.
+
+Do not include [citation:x] in your anwser, where x is a number. Other than code and specific names and citations, your answer must be written in the same language as the question.

 Here are the set of contexts:

@ -62,7 +64,7 @@ class SearchEnhancedQA(Action):
    desc: str = "Integrating search engine results to anwser the question."

    collect_links_action: CollectLinks = Field(
-        default=CollectLinks(), description="Action to collect relevant links from a search engine."
+        default_factory=CollectLinks, description="Action to collect relevant links from a search engine."
    )
    web_browse_and_summarize_action: WebBrowseAndSummarize = Field(
        default=None,
@ -95,7 +97,7 @@ class SearchEnhancedQA(Action):
    @model_validator(mode="after")
    def initialize(self):
        if self.web_browse_and_summarize_action is None:
-            self.web_browser_engine = WebBrowserEngine.from_browser_config(
+            web_browser_engine = WebBrowserEngine.from_browser_config(
                self.config.browser,
                proxy=self.config.proxy,
                java_script_enabled=self.java_script_enabled,
@ -103,7 +105,7 @@ class SearchEnhancedQA(Action):
                user_agent=self.user_agent,
            )

-            self.web_browse_and_summarize_action = WebBrowseAndSummarize(web_browser_engine=self.web_browser_engine)
+            self.web_browse_and_summarize_action = WebBrowseAndSummarize(web_browser_engine=web_browser_engine)

        return self

--- a/metagpt/actions/talk_action.py
+++ b/metagpt/actions/talk_action.py
@ -9,7 +9,6 @@
 from typing import Optional

 from metagpt.actions import Action
-from metagpt.config2 import config
 from metagpt.logs import logger
 from metagpt.schema import Message

@ -26,7 +25,7 @@ class TalkAction(Action):

    @property
    def language(self):
-        return self.context.kwargs.language or config.language
+        return self.context.kwargs.language or self.config.language

    @property
    def prompt(self):
--- a/metagpt/config2.py
+++ b/metagpt/config2.py
@ -97,20 +97,21 @@ class Config(CLIParams, YamlModel):
        return Config.from_yaml_file(pathname)

    @classmethod
-    def default(cls):
+    def default(cls, reload: bool = False):
        """Load default config
        - Priority: env < default_config_paths
        - Inside default_config_paths, the latter one overwrites the former one
        """
-        default_config_paths: List[Path] = [
+        default_config_paths = (
            METAGPT_ROOT / "config/config2.yaml",
            CONFIG_ROOT / "config2.yaml",
-        ]
-
-        dicts = [dict(os.environ)]
-        dicts += [Config.read_yaml(path) for path in default_config_paths]
-        final = merge_dict(dicts)
-        return Config(**final)
+        )
+        if reload or default_config_paths not in _CONFIG_CACHE:
+            dicts = [dict(os.environ)]
+            dicts += [Config.read_yaml(path) for path in default_config_paths]
+            final = merge_dict(dicts)
+            _CONFIG_CACHE[default_config_paths] = Config(**final)
+        return _CONFIG_CACHE[default_config_paths]

    @classmethod
    def from_llm_config(cls, llm_config: dict):
@ -160,4 +161,4 @@ def merge_dict(dicts: Iterable[Dict]) -> Dict:
    return result


-config = Config.default()
+_CONFIG_CACHE = {}
--- a/metagpt/configs/browser_config.py
+++ b/metagpt/configs/browser_config.py
@ -5,12 +5,23 @@
@Author  : alexanderwu
@File    : browser_config.py
 """
+from enum import Enum
 from typing import Literal

-from metagpt.tools import WebBrowserEngineType
 from metagpt.utils.yaml_model import YamlModel


+class WebBrowserEngineType(Enum):
+    PLAYWRIGHT = "playwright"
+    SELENIUM = "selenium"
+    CUSTOM = "custom"
+
+    @classmethod
+    def _missing_(cls, key):
+        """Enum lookup-failure hook (`_missing_`, not dict's `__missing__`): unknown values become CUSTOM."""
+        return cls.CUSTOM
+
+
 class BrowserConfig(YamlModel):
    """Config for Browser"""

--- a/metagpt/configs/search_config.py
+++ b/metagpt/configs/search_config.py
@ -5,14 +5,23 @@
@Author  : alexanderwu
@File    : search_config.py
 """
+from enum import Enum
 from typing import Callable, Optional

 from pydantic import ConfigDict, Field

-from metagpt.tools import SearchEngineType
 from metagpt.utils.yaml_model import YamlModel


+class SearchEngineType(Enum):
+    SERPAPI_GOOGLE = "serpapi"
+    SERPER_GOOGLE = "serper"
+    DIRECT_GOOGLE = "google"
+    DUCK_DUCK_GO = "ddg"
+    CUSTOM_ENGINE = "custom"
+    BING = "bing"
+
+
 class SearchConfig(YamlModel):
    """Config for Search"""

--- a/metagpt/context.py
+++ b/metagpt/context.py
@ -10,7 +10,7 @@ from __future__ import annotations
 import os
 from typing import Any, Dict, Optional

-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field

 from metagpt.config2 import Config
 from metagpt.configs.llm_config import LLMConfig, LLMType
@ -61,7 +61,7 @@ class Context(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    kwargs: AttrDict = AttrDict()
-    config: Config = Config.default()
+    config: Config = Field(default_factory=Config.default)

    cost_manager: CostManager = CostManager()

--- a/metagpt/environment/mgx/mgx_env.py
+++ b/metagpt/environment/mgx/mgx_env.py
@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from metagpt.actions import (
    UserRequirement,
    WriteDesign,
@ -6,12 +8,12 @@ from metagpt.actions import (
    WriteTest,
 )
 from metagpt.actions.summarize_code import SummarizeCode
-from metagpt.const import AGENT
+from metagpt.const import AGENT, IMAGES
 from metagpt.environment.base_env import Environment
 from metagpt.logs import get_human_input
 from metagpt.roles import Architect, ProductManager, ProjectManager, Role
 from metagpt.schema import Message, SerializationMixin
-from metagpt.utils.common import any_to_str, any_to_str_set
+from metagpt.utils.common import any_to_str, any_to_str_set, extract_and_encode_images


 class MGXEnv(Environment, SerializationMixin):
@ -27,6 +29,8 @@ class MGXEnv(Environment, SerializationMixin):

    def publish_message(self, message: Message, user_defined_recipient: str = "", publicer: str = "") -> bool:
        """let the team leader take over message publishing"""
+        message = self.attach_images(message)  # for multi-modal message
+
        tl = self.get_role("Mike")  # TeamLeader's name is Mike

        if user_defined_recipient:
@ -119,9 +123,16 @@ class MGXEnv(Environment, SerializationMixin):
            converted_msg.role = "assistant"
        sent_from = converted_msg.metadata[AGENT] if AGENT in converted_msg.metadata else converted_msg.sent_from
        converted_msg.content = (
-            f"[Message] from {sent_from if sent_from else 'User'} to {converted_msg.send_to}: {converted_msg.content}"
+            f"[Message] from {sent_from or 'User'} to {converted_msg.send_to}: {converted_msg.content}"
        )
        return converted_msg

+    def attach_images(self, message: Message) -> Message:  # returns the same message object, possibly with IMAGES metadata added
+        if message.role == "user":  # only user-role messages are inspected
+            images = extract_and_encode_images(message.content)
+            if images:  # skip add_metadata when nothing was extracted
+                message.add_metadata(IMAGES, images)
+        return message
+
    def __repr__(self):
        return "MGXEnv()"
--- a/metagpt/environment/minecraft/minecraft_env.py
+++ b/metagpt/environment/minecraft/minecraft_env.py
@ -11,7 +11,7 @@ from typing import Any, Iterable
 from llama_index.vector_stores.chroma import ChromaVectorStore
 from pydantic import ConfigDict, Field

-from metagpt.config2 import config as CONFIG
+from metagpt.config2 import Config
 from metagpt.environment.base_env import Environment
 from metagpt.environment.minecraft.const import MC_CKPT_DIR
 from metagpt.environment.minecraft.minecraft_ext_env import MinecraftExtEnv
@ -82,7 +82,7 @@ class MinecraftEnv(Environment, MinecraftExtEnv):
            persist_dir=f"{MC_CKPT_DIR}/skill/vectordb",
        )

-        if CONFIG.resume:
+        if Config.default().resume:
            logger.info(f"Loading Action Developer from {MC_CKPT_DIR}/action")
            self.chest_memory = read_json_file(f"{MC_CKPT_DIR}/action/chest_memory.json")

--- a/metagpt/exp_pool/init.py
+++ b/metagpt/exp_pool/init.py
@ -1,6 +1,6 @@
 """Experience pool init."""

-from metagpt.exp_pool.manager import exp_manager
+from metagpt.exp_pool.manager import get_exp_manager
 from metagpt.exp_pool.decorator import exp_cache

-__all__ = ["exp_manager", "exp_cache"]
+__all__ = ["get_exp_manager", "exp_cache"]
--- a/metagpt/exp_pool/decorator.py
+++ b/metagpt/exp_pool/decorator.py
@ -6,9 +6,9 @@ from typing import Any, Callable, Optional, TypeVar

 from pydantic import BaseModel, ConfigDict, model_validator

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.exp_pool.context_builders import BaseContextBuilder, SimpleContextBuilder
-from metagpt.exp_pool.manager import ExperienceManager, exp_manager
+from metagpt.exp_pool.manager import ExperienceManager, get_exp_manager
 from metagpt.exp_pool.perfect_judges import BasePerfectJudge, SimplePerfectJudge
 from metagpt.exp_pool.schema import Experience, Metric, QueryType, Score
 from metagpt.exp_pool.scorers import BaseScorer, SimpleScorer
@ -50,11 +50,14 @@ def exp_cache(
    """

    def decorator(func: Callable[..., ReturnType]) -> Callable[..., ReturnType]:
-        if not config.exp_pool.enabled:
-            return func
-
        @functools.wraps(func)
        async def get_or_create(args: Any, kwargs: Any) -> ReturnType:
+            config = Config.default()
+
+            if not config.exp_pool.enabled:
+                rsp = func(*args, **kwargs)
+                return await rsp if asyncio.iscoroutine(rsp) else rsp
+
            handler = ExpCacheHandler(
                func=func,
                args=args,
@ -114,7 +117,7 @@ class ExpCacheHandler(BaseModel):

        self._validate_params()

-        self.exp_manager = self.exp_manager or exp_manager
+        self.exp_manager = self.exp_manager or get_exp_manager()
        self.exp_scorer = self.exp_scorer or SimpleScorer()
        self.exp_perfect_judge = self.exp_perfect_judge or SimplePerfectJudge()
        self.context_builder = self.context_builder or SimpleContextBuilder()
--- a/metagpt/exp_pool/manager.py
+++ b/metagpt/exp_pool/manager.py
@ -2,9 +2,9 @@

 from typing import TYPE_CHECKING, Any

-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field

-from metagpt.config2 import Config, config
+from metagpt.config2 import Config
 from metagpt.exp_pool.schema import (
    DEFAULT_COLLECTION_NAME,
    DEFAULT_SIMILARITY_TOP_K,
@ -29,7 +29,7 @@ class ExperienceManager(BaseModel):

    model_config = ConfigDict(arbitrary_types_allowed=True)

-    config: Config = config
+    config: Config = Field(default_factory=Config.default)

    _storage: Any = None
    _vector_store: Any = None
@ -113,4 +113,11 @@ class ExperienceManager(BaseModel):
        return self.vector_store._collection.count()


-exp_manager = ExperienceManager()
+_exp_manager = None
+
+
+def get_exp_manager():  # create the module-level ExperienceManager on first call, then reuse it
+    global _exp_manager
+    if _exp_manager is None:  # not constructed yet
+        _exp_manager = ExperienceManager()
+    return _exp_manager
--- a/metagpt/ext/stanford_town/actions/st_action.py
+++ b/metagpt/ext/stanford_town/actions/st_action.py
@ -8,7 +8,6 @@ from pathlib import Path
 from typing import Any, Optional, Union

 from metagpt.actions.action import Action
-from metagpt.config2 import config
 from metagpt.ext.stanford_town.utils.const import PROMPTS_DIR
 from metagpt.logs import logger

@ -62,13 +61,13 @@ class STAction(Action):
    async def _run_gpt35_max_tokens(self, prompt: str, max_tokens: int = 50, retry: int = 3):
        for idx in range(retry):
            try:
-                tmp_max_tokens_rsp = getattr(config.llm, "max_token", 1500)
-                setattr(config.llm, "max_token", max_tokens)
+                tmp_max_tokens_rsp = getattr(self.config.llm, "max_token", 1500)
+                setattr(self.config.llm, "max_token", max_tokens)
                self.llm.use_system_prompt = False  # to make it behave like a non-chat completions

                llm_resp = await self._aask(prompt)

-                setattr(config.llm, "max_token", tmp_max_tokens_rsp)
+                setattr(self.config.llm, "max_token", tmp_max_tokens_rsp)
                logger.info(f"Action: {self.cls_name} llm _run_gpt35_max_tokens raw resp: {llm_resp}")
                if self._func_validate(llm_resp, prompt):
                    return self._func_cleanup(llm_resp, prompt)
--- a/metagpt/ext/stanford_town/utils/utils.py
+++ b/metagpt/ext/stanford_town/utils/utils.py
@ -13,7 +13,7 @@ from typing import Union

 from openai import OpenAI

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.logs import logger


@ -48,6 +48,7 @@ def read_csv_to_list(curr_file: str, header=False, strip_trail=True):


 def get_embedding(text, model: str = "text-embedding-ada-002"):
+    config = Config.default()
    text = text.replace("\n", " ")
    if not text:
        text = "this is blank"
--- a/metagpt/learn/text_to_embedding.py
+++ b/metagpt/learn/text_to_embedding.py
@ -6,12 +6,13 @@
@File    : text_to_embedding.py
@Desc    : Text-to-Embedding skill, which provides text-to-embedding functionality.
 """
-import metagpt.config2
+from typing import Optional
+
 from metagpt.config2 import Config
 from metagpt.tools.openai_text_to_embedding import oas3_openai_text_to_embedding


-async def text_to_embedding(text, model="text-embedding-ada-002", config: Config = metagpt.config2.config):
+async def text_to_embedding(text, model="text-embedding-ada-002", config: Optional[Config] = None):
    """Text to embedding

    :param text: The text used for embedding.
@ -19,6 +20,7 @@ async def text_to_embedding(text, model="text-embedding-ada-002", config: Config
    :param config: OpenAI config with API key, For more details, checkout: `https://platform.openai.com/account/api-keys`
    :return: A json object of :class:`ResultEmbedding` class if successful, otherwise `{}`.
    """
+    config = config if config else Config.default()
    openai_api_key = config.get_openai_llm().api_key
    proxy = config.get_openai_llm().proxy
    return await oas3_openai_text_to_embedding(text, model=model, openai_api_key=openai_api_key, proxy=proxy)
--- a/metagpt/learn/text_to_image.py
+++ b/metagpt/learn/text_to_image.py
@ -7,8 +7,8 @@
@Desc    : Text-to-Image skill, which provides text-to-image functionality.
 """
 import base64
+from typing import Optional

-import metagpt.config2
 from metagpt.config2 import Config
 from metagpt.const import BASE64_FORMAT
 from metagpt.llm import LLM
@ -17,7 +17,7 @@ from metagpt.tools.openai_text_to_image import oas3_openai_text_to_image
 from metagpt.utils.s3 import S3


-async def text_to_image(text, size_type: str = "512x512", config: Config = metagpt.config2.config):
+async def text_to_image(text, size_type: str = "512x512", config: Optional[Config] = None):
    """Text to image

    :param text: The text used for image conversion.
@ -25,6 +25,7 @@ async def text_to_image(text, size_type: str = "512x512", config: Config = metag
    :param config: Config
    :return: The image data is returned in Base64 encoding.
    """
+    config = config if config else Config.default()
    image_declaration = "data:image/png;base64,"

    model_url = config.metagpt_tti_url
--- a/metagpt/learn/text_to_speech.py
+++ b/metagpt/learn/text_to_speech.py
@ -6,7 +6,8 @@
@File    : text_to_speech.py
@Desc    : Text-to-Speech skill, which provides text-to-speech functionality
 """
-import metagpt.config2
+from typing import Optional
+
 from metagpt.config2 import Config
 from metagpt.const import BASE64_FORMAT
 from metagpt.tools.azure_tts import oas3_azsure_tts
@ -20,7 +21,7 @@ async def text_to_speech(
    voice="zh-CN-XiaomoNeural",
    style="affectionate",
    role="Girl",
-    config: Config = metagpt.config2.config,
+    config: Optional[Config] = None,
 ):
    """Text to speech
    For more details, check out:`https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts`
@ -38,7 +39,7 @@ async def text_to_speech(
    :return: Returns the Base64-encoded .wav/.mp3 file data if successful, otherwise an empty string.

    """
-
+    config = config if config else Config.default()
    subscription_key = config.azure_tts_subscription_key
    region = config.azure_tts_region
    if subscription_key and region:
--- a/metagpt/memory/brain_memory.py
+++ b/metagpt/memory/brain_memory.py
@ -12,9 +12,9 @@ import json
 import re
 from typing import Dict, List, Optional

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator

-from metagpt.config2 import config
+from metagpt.config2 import Config as _Config
 from metagpt.const import DEFAULT_MAX_TOKENS, DEFAULT_TOKEN_SIZE
 from metagpt.logs import logger
 from metagpt.provider import MetaGPTLLM
@ -32,6 +32,12 @@ class BrainMemory(BaseModel):
    last_talk: Optional[str] = None
    cacheable: bool = True
    llm: Optional[BaseLLM] = Field(default=None, exclude=True)
+    config: Optional[_Config] = Field(default=None, validate_default=True)  # validate the default too, so an omitted config is filled in
+
+    @field_validator("config")
+    @classmethod
+    def set_default_config(cls, config):  # substitute _Config.default() when config is None/falsy
+        return config if config else _Config.default()

    class Config:
        arbitrary_types_allowed = True
@ -54,9 +60,8 @@ class BrainMemory(BaseModel):
        texts = [m.content for m in self.knowledge]
        return "\n".join(texts)

-    @staticmethod
-    async def loads(redis_key: str) -> "BrainMemory":
-        redis = Redis(config.redis)
+    async def loads(self, redis_key: str) -> "BrainMemory":
+        redis = Redis(self.config.redis)
        if not redis_key:
            return BrainMemory()
        v = await redis.get(key=redis_key)
@ -70,7 +75,7 @@ class BrainMemory(BaseModel):
    async def dumps(self, redis_key: str, timeout_sec: int = 30 * 60):
        if not self.is_dirty:
            return
-        redis = Redis(config.redis)
+        redis = Redis(self.config.redis)
        if not redis_key:
            return False
        v = self.model_dump_json()
@ -140,7 +145,7 @@ class BrainMemory(BaseModel):
            return text
        summary = await self._summarize(text=text, max_words=max_words, keep_language=keep_language, limit=limit)
        if summary:
-            await self.set_history_summary(history_summary=summary, redis_key=config.redis_key)
+            await self.set_history_summary(history_summary=summary, redis_key=self.config.redis_key)
            return summary
        raise ValueError(f"text too long:{text_length}")

@ -164,7 +169,7 @@ class BrainMemory(BaseModel):
        msgs.reverse()
        self.history = msgs
        self.is_dirty = True
-        await self.dumps(redis_key=config.redis.key)
+        await self.dumps(redis_key=self.config.redis.key)
        self.is_dirty = False

        return BrainMemory.to_metagpt_history_format(self.history)
@ -181,7 +186,7 @@ class BrainMemory(BaseModel):

        summary = await self.summarize(llm=llm, max_words=500)

-        language = config.language
+        language = self.config.language
        command = f"Translate the above summary into a {language} title of less than {max_words} words."
        summaries = [summary, command]
        msg = "\n".join(summaries)
--- a/metagpt/prompts/di/role_zero.py
+++ b/metagpt/prompts/di/role_zero.py
@ -38,11 +38,9 @@ class Task(BaseModel):
 {available_commands}
 Special Command: Use {{"command_name": "end"}} to do nothing or indicate completion of all requirements and the end of actions.

-
 # Example
 {example}

-
 # Instruction
 {instruction}
 """
@ -125,30 +123,46 @@ If no issues are detected, the original json data should be returned unchanged.
 Output the JSON data in a format that can be loaded by the json.loads() function.
 """

-QUICK_THINK_PROMPT = """
+QUICK_THINK_SYSTEM_PROMPT = """
+{role_info}
+Your role is to determine the appropriate response category for the given request.
+
 # Response Categories
 ## QUICK: 
-For straightforward questions or requests that can be answered directly. This includes common-sense inquiries, legal or logical questions, basic math, short coding tasks, multiple-choice questions, greetings, casual chat, and inquiries about you or your team.
+For straightforward questions or requests that can be answered directly. This includes common-sense inquiries, legal or logical questions, basic math, short coding tasks, multiple-choice questions, greetings, casual chat, daily planning, and inquiries about you or your team.

 ## SEARCH
 For queries that require retrieving up-to-date or detailed information. This includes time-sensitive or location-specific questions like current events or weather. Use this only if the information isn't readily available.
+If a file or link is provided, you don't need to search for additional information.

 ## TASK
-For complex requests that involve multiple steps or detailed instructions. Examples include software development, project planning, or any task that requires a sequence of actions.
+For requests that involve tool utilizations, computer operations, multiple steps or detailed instructions. Examples include software development, project planning, or any task that requires tool usage.

 ## AMBIGUOUS
 For requests that are unclear, lack sufficient detail, or are outside the system's capabilities. Common characteristics of AMBIGUOUS requests:

- Incomplete Information: Requests that imply complex tasks but lack critical details  (e.g., "Redesign this logo" without providing the original logo or specifying design requirements).
+- Incomplete Information: Requests that imply complex tasks but lack critical details  (e.g., "Redesign this logo" without specifying design requirements).
 - Vagueness: Broad, unspecified, or unclear requests that make it difficult to provide a precise answer. 
- Out of Expertise: Requests for specialized advice (e.g., medical or legal advice) or highly technical tasks beyond the model's scope.
 - Unrealistic Scope: Overly broad requests that are impossible to address meaningfully in a single response (e.g., "Tell me everything about...").
+- Missing files: Requests that refer to specific documents, images, or data without providing them for reference. (when providing a file, website, or data, either the content, link, or path **must** be included)

-**Note:** Before categorizing a request as TASK, consider whether the user has provided sufficient information to proceed with the task. If the request is complex but lacks essential details or the mentioned files, it should fall under AMBIGUOUS.
+**Note:** Before categorizing a request as TASK:
+1. Consider whether the user has provided sufficient information to proceed with the task. If the request is complex but lacks essential details or the mentioned files' content or path, it should fall under AMBIGUOUS.
+2. If the request is a "how-to" question that asks for a general plan, approach or strategy, it should be categorized as QUICK.

 {examples}
+"""

-Respond with a concise thought, then provide the appropriate response category: QUICK, SEARCH, TASK, or AMBIGUOUS. Your response:
+QUICK_THINK_PROMPT = """
+# Instruction
+Determine the previous message's intent.
+Respond with a concise thought, then provide the appropriate response category: QUICK, SEARCH, TASK, or AMBIGUOUS. 
+
+# Format
+Thought: [Your thought here]
+Response Category: [QUICK/SEARCH/TASK/AMBIGUOUS]
+
+# Response:
 """


@ -163,8 +177,8 @@ Response Category: QUICK.
 Thought: This is a general knowledge question that can be answered concisely. 
 Response Category: QUICK.

-3. Request: "Can you help me plan a healthy diet for a week?"
-Thought: The user is requesting a simple plan that can be provided immediately. 
+3. Request: "Please help me write a learning plan for Python web crawlers"
+Thought: Writing a learning plan is a daily planning task that can be answered directly.
 Response Category: QUICK.

 4. Request: "Can you help me find the latest research papers on deep learning?"
@ -176,18 +190,20 @@ Thought: This is a detailed software development task that requires multiple ste
 Response Category: TASK.

 6. Request: "Summarize this document for me."
-Thought: The request mentions summarizing a document but doesn't provide the document itself, making it impossible to fulfill. 
+Thought: The request mentions summarizing a document but doesn't provide the path or content of the document, making it impossible to fulfill. 
 Response Category: AMBIGUOUS.

-7. Request: "Optimize this process." 
+7. Request: "Summarize this document for me '/data/path/docmument.pdf'." 
+Thought: The request mentions summarizing a document and has provided the path to the document. It can be done by reading the document using a tool then summarizing it.
+Response Category: TASK.
+
+8. Request: "Optimize this process." 
 Thought: The request is vague and lacks specifics, requiring clarification on the process to optimize.
 Response Category: AMBIGUOUS.

-8. Request: "Create a poster for our upcoming event." 
-Thought: Critical details like event theme, date, and location are missing, making it impossible to complete the task.
-Response Category: AMBIGUOUS.
-
-# Instruction
 """

-QUICK_THINK_PROMPT = QUICK_THINK_PROMPT.format(examples=QUICK_THINK_EXAMPLES)
+QUICK_RESPONSE_SYSTEM_PROMPT = """
+{role_info}
+However, you MUST respond to the user message by yourself directly, DON'T ask your team members.
+"""
--- a/metagpt/prompts/di/team_leader.py
+++ b/metagpt/prompts/di/team_leader.py
@ -35,11 +35,11 @@ Sixth, describe the requirements as they pertain to software development, data a
 Seventh, describe the technologies you must use.  
 """
 )
-QUICK_THINK_SYSTEM_PROMPT = """
+
+TL_INFO = """
 {role_info}
 Your team member:
 {team_info}
-However, you MUST respond to the user message by yourself directly, DON'T ask your team members.
 """

 FINISH_CURRENT_TASK_CMD = """
--- a/metagpt/provider/base_llm.py
+++ b/metagpt/provider/base_llm.py
@ -24,8 +24,9 @@ from tenacity import (

 from metagpt.configs.compress_msg_config import CompressType
 from metagpt.configs.llm_config import LLMConfig
-from metagpt.const import LLM_API_TIMEOUT, USE_CONFIG_TIMEOUT
+from metagpt.const import IMAGES, LLM_API_TIMEOUT, USE_CONFIG_TIMEOUT
 from metagpt.logs import logger
+from metagpt.provider.constant import MULTI_MODAL_MODELS
 from metagpt.schema import Message
 from metagpt.utils.common import log_and_reraise
 from metagpt.utils.cost_manager import CostManager, Costs
@ -50,7 +51,7 @@ class BaseLLM(ABC):
        pass

    def _user_msg(self, msg: str, images: Optional[Union[str, list[str]]] = None) -> dict[str, Union[str, dict]]:
-        if images:
+        if images and self.support_image_input():
            # as gpt-4v, chat with image
            return self._user_msg_with_imgs(msg, images)
        else:
@ -76,6 +77,9 @@ class BaseLLM(ABC):
    def _system_msg(self, msg: str) -> dict[str, str]:
        return {"role": "system", "content": msg}

+    def support_image_input(self) -> bool:
+        return any([m in self.config.model for m in MULTI_MODAL_MODELS])
+
    def format_msg(self, messages: Union[str, Message, list[dict], list[Message], list[str]]) -> list[dict]:
        """convert messages to list[dict]."""
        from metagpt.schema import Message
@ -91,7 +95,9 @@ class BaseLLM(ABC):
                assert set(msg.keys()) == set(["role", "content"])
                processed_messages.append(msg)
            elif isinstance(msg, Message):
-                processed_messages.append(msg.to_dict())
+                images = msg.metadata.get(IMAGES)
+                processed_msg = self._user_msg(msg=msg.content, images=images) if images else msg.to_dict()
+                processed_messages.append(processed_msg)
            else:
                raise ValueError(
                    f"Only support message type are: str, Message, dict, but got {type(messages).__name__}!"
--- a/metagpt/rag/factories/embedding.py
+++ b/metagpt/rag/factories/embedding.py
@ -1,7 +1,7 @@
 """RAG Embedding Factory."""
 from __future__ import annotations

-from typing import Any
+from typing import Any, Optional

 from llama_index.core.embeddings import BaseEmbedding
 from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
@ -9,7 +9,7 @@ from llama_index.embeddings.gemini import GeminiEmbedding
 from llama_index.embeddings.ollama import OllamaEmbedding
 from llama_index.embeddings.openai import OpenAIEmbedding

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.configs.embedding_config import EmbeddingType
 from metagpt.configs.llm_config import LLMType
 from metagpt.rag.factories.base import GenericFactory
@ -18,7 +18,7 @@ from metagpt.rag.factories.base import GenericFactory
 class RAGEmbeddingFactory(GenericFactory):
    """Create LlamaIndex Embedding with MetaGPT's embedding config."""

-    def __init__(self):
+    def __init__(self, config: Optional[Config] = None):
        creators = {
            EmbeddingType.OPENAI: self._create_openai,
            EmbeddingType.AZURE: self._create_azure,
@ -29,6 +29,7 @@ class RAGEmbeddingFactory(GenericFactory):
            LLMType.AZURE: self._create_azure,
        }
        super().__init__(creators)
+        self.config = config if self.config else Config.default()

    def get_rag_embedding(self, key: EmbeddingType = None) -> BaseEmbedding:
        """Key is EmbeddingType."""
@ -40,18 +41,18 @@ class RAGEmbeddingFactory(GenericFactory):
        If the embedding type is not specified, for backward compatibility, it checks if the LLM API type is either OPENAI or AZURE.
        Raise TypeError if embedding type not found.
        """
-        if config.embedding.api_type:
-            return config.embedding.api_type
+        if self.config.embedding.api_type:
+            return self.config.embedding.api_type

-        if config.llm.api_type in [LLMType.OPENAI, LLMType.AZURE]:
-            return config.llm.api_type
+        if self.config.llm.api_type in [LLMType.OPENAI, LLMType.AZURE]:
+            return self.config.llm.api_type

        raise TypeError("To use RAG, please set your embedding in config2.yaml.")

    def _create_openai(self) -> OpenAIEmbedding:
        params = dict(
-            api_key=config.embedding.api_key or config.llm.api_key,
-            api_base=config.embedding.base_url or config.llm.base_url,
+            api_key=self.config.embedding.api_key or self.config.llm.api_key,
+            api_base=self.config.embedding.base_url or self.config.llm.base_url,
        )

        self._try_set_model_and_batch_size(params)
@ -60,9 +61,9 @@ class RAGEmbeddingFactory(GenericFactory):

    def _create_azure(self) -> AzureOpenAIEmbedding:
        params = dict(
-            api_key=config.embedding.api_key or config.llm.api_key,
-            azure_endpoint=config.embedding.base_url or config.llm.base_url,
-            api_version=config.embedding.api_version or config.llm.api_version,
+            api_key=self.config.embedding.api_key or self.config.llm.api_key,
+            azure_endpoint=self.config.embedding.base_url or self.config.llm.base_url,
+            api_version=self.config.embedding.api_version or self.config.llm.api_version,
        )

        self._try_set_model_and_batch_size(params)
@ -71,8 +72,8 @@ class RAGEmbeddingFactory(GenericFactory):

    def _create_gemini(self) -> GeminiEmbedding:
        params = dict(
-            api_key=config.embedding.api_key,
-            api_base=config.embedding.base_url,
+            api_key=self.config.embedding.api_key,
+            api_base=self.config.embedding.base_url,
        )

        self._try_set_model_and_batch_size(params)
@ -81,7 +82,7 @@ class RAGEmbeddingFactory(GenericFactory):

    def _create_ollama(self) -> OllamaEmbedding:
        params = dict(
-            base_url=config.embedding.base_url,
+            base_url=self.config.embedding.base_url,
        )

        self._try_set_model_and_batch_size(params)
@ -90,14 +91,15 @@ class RAGEmbeddingFactory(GenericFactory):

    def _try_set_model_and_batch_size(self, params: dict):
        """Set the model_name and embed_batch_size only when they are specified."""
-        if config.embedding.model:
-            params["model_name"] = config.embedding.model
+        if self.config.embedding.model:
+            params["model_name"] = self.config.embedding.model

-        if config.embedding.embed_batch_size:
-            params["embed_batch_size"] = config.embedding.embed_batch_size
+        if self.config.embedding.embed_batch_size:
+            params["embed_batch_size"] = self.config.embedding.embed_batch_size

    def _raise_for_key(self, key: Any):
        raise ValueError(f"The embedding type is currently not supported: `{type(key)}`, {key}")


-get_rag_embedding = RAGEmbeddingFactory().get_rag_embedding
+def get_rag_embedding(key: EmbeddingType = None, config: Optional[Config] = None):
+    return RAGEmbeddingFactory(config=config).get_rag_embedding(key)
--- a/metagpt/rag/factories/llm.py
+++ b/metagpt/rag/factories/llm.py
@ -10,9 +10,9 @@ from llama_index.core.llms import (
    LLMMetadata,
 )
 from llama_index.core.llms.callbacks import llm_completion_callback
-from pydantic import Field
+from pydantic import Field, model_validator

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.llm import LLM
 from metagpt.provider.base_llm import BaseLLM
 from metagpt.utils.async_helper import NestAsyncio
@ -26,9 +26,23 @@ class RAGLLM(CustomLLM):
    """

    model_infer: BaseLLM = Field(..., description="The MetaGPT's LLM.")
-    context_window: int = TOKEN_MAX.get(config.llm.model, DEFAULT_CONTEXT_WINDOW)
-    num_output: int = config.llm.max_token
-    model_name: str = config.llm.model
+    context_window: int = -1
+    num_output: int = -1
+    model_name: str = ""
+
+    @model_validator(mode="after")
+    def update_from_config(self):
+        config = Config.default()
+        if self.context_window < 0:
+            self.context_window = TOKEN_MAX.get(config.llm.model, DEFAULT_CONTEXT_WINDOW)
+
+        if self.num_output < 0:
+            self.num_output = config.llm.max_token
+
+        if not self.model_name:
+            self.model_name = config.llm.model
+
+        return self

    @property
    def metadata(self) -> LLMMetadata:
--- a/metagpt/rag/schema.py
+++ b/metagpt/rag/schema.py
@ -10,7 +10,7 @@ from llama_index.core.schema import TextNode
 from llama_index.core.vector_stores.types import VectorStoreQueryMode
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.configs.embedding_config import EmbeddingType
 from metagpt.logs import logger
 from metagpt.rag.interface import RAGObject
@ -45,6 +45,7 @@ class FAISSRetrieverConfig(IndexRetrieverConfig):
    @model_validator(mode="after")
    def check_dimensions(self):
        if self.dimensions == 0:
+            config = Config.default()
            self.dimensions = config.embedding.dimensions or self._embedding_type_to_dimensions.get(
                config.embedding.api_type, 1536
            )
--- a/metagpt/roles/di/role_zero.py
+++ b/metagpt/roles/di/role_zero.py
@ -2,7 +2,6 @@ from __future__ import annotations

 import inspect
 import json
-import os
 import re
 import traceback
 from typing import Annotated, Callable, Dict, List, Literal, Optional, Tuple
@ -13,6 +12,7 @@ from metagpt.actions import Action, UserRequirement
 from metagpt.actions.analyze_requirements import AnalyzeRequirementsRestrictions
 from metagpt.actions.di.run_command import RunCommand
 from metagpt.actions.search_enhanced_qa import SearchEnhancedQA
+from metagpt.const import IMAGES
 from metagpt.exp_pool import exp_cache
 from metagpt.exp_pool.context_builders import RoleZeroContextBuilder
 from metagpt.exp_pool.serializers import RoleZeroSerializer
@ -22,6 +22,9 @@ from metagpt.prompts.di.role_zero import (
    CMD_PROMPT,
    JSON_REPAIR_PROMPT,
    QUICK_THINK_PROMPT,
+    QUICK_THINK_EXAMPLES,
+    QUICK_THINK_SYSTEM_PROMPT,
+    QUICK_RESPONSE_SYSTEM_PROMPT,
    REGENERATE_PROMPT,
    ROLE_INSTRUCTION,
    SYSTEM_PROMPT,
@ -35,13 +38,7 @@ from metagpt.tools.libs.browser import Browser
 from metagpt.tools.libs.editor import Editor
 from metagpt.tools.tool_recommend import BM25ToolRecommender, ToolRecommender
 from metagpt.tools.tool_registry import register_tool
-from metagpt.utils.common import (
-    CodeParser,
-    any_to_str,
-    encode_image,
-    extract_image_paths,
-    is_support_image_input,
-)
+from metagpt.utils.common import CodeParser, any_to_str, extract_and_encode_images
 from metagpt.utils.repair_llm_raw_output import (
    RepairType,
    repair_escape_error,
@ -219,15 +216,14 @@ class RoleZero(Role):
        return memory

    def parse_images(self, memory: list[Message]) -> list[Message]:
-        if not is_support_image_input(self.llm.model):
+        if not self.llm.support_image_input():
            return memory
-        for i, msg in enumerate(memory):
-            if msg.role == "user" and isinstance(msg.content, str) and extract_image_paths(msg.content):
-                images = []
-                for path in extract_image_paths(msg.content):
-                    if os.path.exists(path):
-                        images.append(encode_image(path))
-                memory[i] = self.llm._user_msg_with_imgs(msg.content, images=images)
+        for msg in memory:
+            if IMAGES in msg.metadata or msg.role != "user":
+                continue
+            images = extract_and_encode_images(msg.content)
+            if images:
+                msg.add_metadata(IMAGES, images)
        return memory

    async def _act(self) -> Message:
@ -273,6 +269,10 @@ class RoleZero(Role):
            rsp = await self._act()
            actions_taken += 1
        return rsp  # return output from the last action
+    
+    def format_quick_system_prompt(self) -> str:
+        """Format the system prompt for quick thinking."""
+        return QUICK_THINK_SYSTEM_PROMPT.format(examples=QUICK_THINK_EXAMPLES, role_info=self._get_prefix())

    async def _quick_think(self) -> Tuple[Message, str]:
        answer = ""
@ -284,12 +284,12 @@ class RoleZero(Role):
        # routing
        memory = self.get_memories(k=4)  # FIXME: A magic number for two rounds of Q&A
        context = self.llm.format_msg(memory + [UserMessage(content=QUICK_THINK_PROMPT)])
-        intent_result = await self.llm.aask(context)
+        intent_result = await self.llm.aask(context, system_msgs=[self.format_quick_system_prompt()])

-        if "QUICK" in intent_result or "AMBIGUOUS " in intent_result:  # llm call with the original context
+        if "QUICK" in intent_result or "AMBIGUOUS" in intent_result:  # llm call with the original context
            async with ThoughtReporter(enable_llm_stream=True) as reporter:
                await reporter.async_report({"type": "quick"})
-                answer = await self.llm.aask(self.llm.format_msg(memory))
+                answer = await self.llm.aask(self.llm.format_msg(memory), system_msgs=[QUICK_RESPONSE_SYSTEM_PROMPT.format(role_info=self._get_prefix())])
        elif "SEARCH" in intent_result:
            query = "\n".join(str(msg) for msg in memory)
            answer = await SearchEnhancedQA().run(query)
@ -404,11 +404,10 @@ class RoleZero(Role):
        """command requiring special check or parsing"""
        command_output = ""

-        if cmd["command_name"] == "Plan.finish_current_task" and not self.planner.plan.is_plan_finished():
-            # task_result = TaskResult(code=str(commands), result=outputs, is_success=is_success)
-            # self.planner.plan.current_task.update_task_result(task_result=task_result)
-            self.planner.plan.finish_current_task()
-            command_output = "Current task is finished. "
+        if cmd["command_name"] == "Plan.finish_current_task":
+            if not self.planner.plan.is_plan_finished():
+                self.planner.plan.finish_current_task()
+            command_output = "Current task is finished. If all tasks are finished, use 'end' to stop."

        elif cmd["command_name"] == "end":
            self._set_state(-1)
@ -438,7 +437,19 @@ class RoleZero(Role):
            if self.planner.plan.current_task
            else ""
        )
-        return plan_status, current_task
+        # format plan status
+        # Example:
+        # [GOAL] create a 2048 game
+        # [TASK_ID 1] (finished) Create a Product Requirement Document (PRD) for the 2048 game. This task depends on tasks[]. [Assign to Alice]
+        # [TASK_ID 2] (        ) Design the system architecture for the 2048 game. This task depends on tasks[1]. [Assign to Bob]
+        formatted_plan_status = f"[GOAL] {plan_status['goal']}\n"
+        if len(plan_status["tasks"]) > 0:
+            formatted_plan_status += "[Plan]\n"
+            for task in plan_status["tasks"]:
+                formatted_plan_status += f"[TASK_ID {task['task_id']}] ({'finished' if task['is_finished'] else '    '}){task['instruction']} This task depends on tasks{task['dependent_task_ids']}. [Assign to {task['assignee']}]\n"
+        else:
+            formatted_plan_status += "No Plan \n"
+        return formatted_plan_status, current_task

    def _retrieve_experience(self) -> str:
        """Default implementation of experience retrieval. Can be overwritten in subclasses."""
--- a/metagpt/roles/di/team_leader.py
+++ b/metagpt/roles/di/team_leader.py
@ -7,7 +7,7 @@ from pydantic import Field
 from metagpt.actions.di.run_command import RunCommand
 from metagpt.prompts.di.team_leader import (
    FINISH_CURRENT_TASK_CMD,
-    QUICK_THINK_SYSTEM_PROMPT,
+    TL_INFO,
    TL_INSTRUCTION,
    TL_THOUGHT_GUIDANCE,
 )
@ -47,15 +47,12 @@ class TeamLeader(RoleZero):
            #     continue
            team_info += f"{role.name}: {role.profile}, {role.goal}\n"
        return team_info
-
-    async def _quick_think(self) -> Message:
-        # insert team info for quick question
-        self.llm.system_prompt = QUICK_THINK_SYSTEM_PROMPT.format(
-            role_info=super()._get_prefix(),
-            team_info=self._get_team_info(),
-        )
-        return await super()._quick_think()
-
+    
+    def _get_prefix(self) -> str:
+        role_info = super()._get_prefix()
+        team_info = self._get_team_info()
+        return TL_INFO.format(role_info=role_info, team_info=team_info)
+    
    async def _think(self) -> bool:
        self.instruction = TL_INSTRUCTION.format(team_info=self._get_team_info())
        return await super()._think()
--- a/metagpt/schema.py
+++ b/metagpt/schema.py
@ -121,7 +121,7 @@ class SerializationMixin(BaseModel, extra="forbid"):

        if class_type is None:
            # TODO could try dynamic import
-            raise TypeError("Trying to instantiate {class_full_name}, which has not yet been defined!")
+            raise TypeError(f"Trying to instantiate {class_full_name}, which has not yet been defined!")

        return class_type(**value)

--- a/metagpt/software_company.py
+++ b/metagpt/software_company.py
@ -27,7 +27,7 @@ def generate_repo(
    recover_path=None,
 ):
    """Run the startup logic. Can be called from CLI or other Python scripts."""
-    from metagpt.config2 import config
+    from metagpt.config2 import Config
    from metagpt.context import Context
    from metagpt.roles import (
        Architect,
@ -38,6 +38,8 @@ def generate_repo(
    )
    from metagpt.team import Team

+    config = Config.default()
+
    config.update_via_cli(project_path, project_name, inc, reqa_file, max_auto_summarize_code)
    ctx = Context(config=config)

--- a/metagpt/tools/__init__.py
+++ b/metagpt/tools/__init__.py
@ -6,33 +6,18 @@
@File    : __init__.py
 """

-from enum import Enum
 from metagpt.tools import libs  # this registers all tools
 from metagpt.tools.tool_registry import TOOL_REGISTRY
+from metagpt.configs.search_config import SearchEngineType
+from metagpt.configs.browser_config import WebBrowserEngineType
+

 _ = libs, TOOL_REGISTRY  # Avoid pre-commit error


-class SearchEngineType(Enum):
-    SERPAPI_GOOGLE = "serpapi"
-    SERPER_GOOGLE = "serper"
-    DIRECT_GOOGLE = "google"
-    DUCK_DUCK_GO = "ddg"
-    CUSTOM_ENGINE = "custom"
-    BING = "bing"
-
-
-class WebBrowserEngineType(Enum):
-    PLAYWRIGHT = "playwright"
-    SELENIUM = "selenium"
-    CUSTOM = "custom"
-
-    @classmethod
-    def __missing__(cls, key):
-        """Default type conversion"""
-        return cls.CUSTOM
-
-
 class SearchInterface:
    async def asearch(self, *args, **kwargs):
        ...
+
+
+__all__ = ["SearchEngineType", "WebBrowserEngineType", "TOOL_REGISTRY"]
--- a/metagpt/tools/libs/browser.py
+++ b/metagpt/tools/libs/browser.py
@ -75,7 +75,7 @@ class Browser(BaseModel):
    page: Optional[Page] = None
    accessibility_tree: list = Field(default_factory=list)
    headless: bool = True
-    proxy: Optional[str] = Field(default_factory=get_proxy_from_env)
+    proxy: Optional[dict] = Field(default_factory=get_proxy_from_env)
    is_empty_page: bool = True
    reporter: BrowserReporter = Field(default_factory=BrowserReporter)

--- a/metagpt/tools/libs/editor.py
+++ b/metagpt/tools/libs/editor.py
@ -10,7 +10,7 @@ from pydantic import BaseModel, ConfigDict
 from metagpt.logs import logger
 from metagpt.tools.tool_registry import register_tool
 from metagpt.utils import read_docx
-from metagpt.utils.common import aread_bin, awrite_bin
+from metagpt.utils.common import aread, aread_bin, awrite_bin
 from metagpt.utils.repo_to_markdown import is_text_file
 from metagpt.utils.report import EditorReporter

@ -51,7 +51,7 @@ class Editor(BaseModel):
        """Read the whole content of a file. Using absolute paths as the argument for specifying the file location."""
        is_text, mime_type = await is_text_file(path)
        if is_text:
-            lines = self._read_text(path)
+            lines = await self._read_text(path)
        elif mime_type == "application/pdf":
            lines = await self._read_pdf(path)
        elif mime_type in {
@ -221,9 +221,9 @@ class Editor(BaseModel):
        return lint_passed, lint_message

    @staticmethod
-    def _read_text(path: Union[str, Path]) -> List[str]:
-        with open(str(path), "r") as f:
-            lines = f.readlines()
+    async def _read_text(path: Union[str, Path]) -> List[str]:
+        content = await aread(path)
+        lines = content.split("\n")
        return lines

    @staticmethod
--- a/metagpt/tools/libs/gpt_v_generator.py
+++ b/metagpt/tools/libs/gpt_v_generator.py
@ -7,7 +7,9 @@
 """
 import re
 from pathlib import Path
+from typing import Optional

+from metagpt.config2 import Config
 from metagpt.const import DEFAULT_WORKSPACE_ROOT
 from metagpt.logs import logger
 from metagpt.tools.tool_registry import register_tool
@ -36,11 +38,11 @@ class GPTvGenerator:
    It utilizes a vision model to analyze the layout from an image and generate webpage codes accordingly.
    """

-    def __init__(self):
+    def __init__(self, config: Optional[Config]):
        """Initialize GPTvGenerator class with default values from the configuration."""
-        from metagpt.config2 import config
        from metagpt.llm import LLM

+        config = config if config else Config.default()
        self.llm = LLM(llm_config=config.get_openai_llm())
        self.llm.model = "gpt-4-vision-preview"

--- a/metagpt/tools/ut_writer.py
+++ b/metagpt/tools/ut_writer.py
@ -4,7 +4,7 @@
 import json
 from pathlib import Path

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.provider.openai_api import OpenAILLM as GPTAPI
 from metagpt.utils.common import awrite

@ -282,6 +282,7 @@ class UTGenerator:
        """Choose based on different calling methods"""
        result = ""
        if self.chatgpt_method == "API":
+            config = Config.default()
            result = await GPTAPI(config.get_openai_llm()).aask_code(messages=messages)

        return result
--- a/metagpt/utils/common.py
+++ b/metagpt/utils/common.py
@ -566,7 +566,7 @@ def general_after_log(i: "loguru.Logger", sec_format: str = "%0.3f") -> Callable
    return log_it


-def read_json_file(json_file: str, encoding="utf-8") -> list[Any]:
+def read_json_file(json_file: str, encoding: str = "utf-8") -> list[Any]:
    if not Path(json_file).exists():
        raise FileNotFoundError(f"json_file: {json_file} not exist, return []")

@ -595,7 +595,7 @@ def handle_unknown_serialization(x: Any) -> str:
    return f"<Unserializable {type(x).__name__} object>"


-def write_json_file(json_file: str, data: Any, encoding: str = None, indent: int = 4, use_fallback: bool = False):
+def write_json_file(json_file: str, data: Any, encoding: str = "utf-8", indent: int = 4, use_fallback: bool = False):
    folder_path = Path(json_file).parent
    if not folder_path.exists():
        folder_path.mkdir(parents=True, exist_ok=True)
@ -840,12 +840,6 @@ def decode_image(img_url_or_b64: str) -> Image:
    return img


-def is_support_image_input(model_name: str) -> bool:
-    # model name can be gpt-4o-2024-08-06
-    support_models = ["gpt-4o", "gpt-4o-mini"]  # FIXME: hard code for now
-    return any([m in model_name for m in support_models])
-
-
 def extract_image_paths(content: str) -> bool:
    # We require that the path must have a space preceding it, like "xxx /an/absolute/path.jpg xxx"
    pattern = r"[^\s]+\.(?:png|jpe?g|gif|bmp|tiff)"
@ -853,6 +847,14 @@ def extract_image_paths(content: str) -> bool:
    return image_paths


+def extract_and_encode_images(content: str) -> list[str]:
+    images = []
+    for path in extract_image_paths(content):
+        if os.path.exists(path):
+            images.append(encode_image(path))
+    return images
+
+
 def log_and_reraise(retry_state: RetryCallState):
    logger.error(f"Retry attempts exhausted. Last exception: {retry_state.outcome.exception()}")
    logger.warning(
--- a/metagpt/utils/embedding.py
+++ b/metagpt/utils/embedding.py
@ -7,10 +7,11 @@
 """
 from llama_index.embeddings.openai import OpenAIEmbedding

-from metagpt.config2 import config
+from metagpt.config2 import Config


 def get_embedding() -> OpenAIEmbedding:
+    config = Config.default()
    llm = config.get_openai_llm()
    if llm is None:
        raise ValueError("To use OpenAIEmbedding, please ensure that config.llm.api_type is correctly set to 'openai'.")
--- a/metagpt/utils/make_sk_kernel.py
+++ b/metagpt/utils/make_sk_kernel.py
@ -13,10 +13,11 @@ from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion impo
    OpenAIChatCompletion,
 )

-from metagpt.config2 import config
+from metagpt.config2 import Config


 def make_sk_kernel():
+    config = Config.default()
    kernel = sk.Kernel()
    if llm := config.get_azure_llm():
        kernel.add_chat_service(
--- a/metagpt/utils/mermaid.py
+++ b/metagpt/utils/mermaid.py
@ -9,12 +9,14 @@ import asyncio
 import os
 from pathlib import Path

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.logs import logger
 from metagpt.utils.common import awrite, check_cmd_exists


-async def mermaid_to_file(engine, mermaid_code, output_file_without_suffix, width=2048, height=2048) -> int:
+async def mermaid_to_file(
+    engine, mermaid_code, output_file_without_suffix, width=2048, height=2048, config=None
+) -> int:
    """suffix: png/svg/pdf

    :param mermaid_code: mermaid code
@ -24,6 +26,7 @@ async def mermaid_to_file(engine, mermaid_code, output_file_without_suffix, widt
    :return: 0 if succeed, -1 if failed
    """
    # Write the Mermaid code to a temporary file
+    config = config if config else Config.default()
    dir_name = os.path.dirname(output_file_without_suffix)
    if dir_name and not os.path.exists(dir_name):
        os.makedirs(dir_name)
--- a/metagpt/utils/mmdc_pyppeteer.py
+++ b/metagpt/utils/mmdc_pyppeteer.py
@ -10,11 +10,11 @@ from urllib.parse import urljoin

 from pyppeteer import launch

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.logs import logger


-async def mermaid_to_file(mermaid_code, output_file_without_suffix, width=2048, height=2048) -> int:
+async def mermaid_to_file(mermaid_code, output_file_without_suffix, width=2048, height=2048, config=None) -> int:
    """
    Converts the given Mermaid code to various output formats and saves them to files.

@ -27,6 +27,7 @@ async def mermaid_to_file(mermaid_code, output_file_without_suffix, width=2048,
    Returns:
        int: Returns 1 if the conversion and saving were successful, -1 otherwise.
    """
+    config = config if config else Config.default()
    suffixes = ["png", "svg", "pdf"]
    __dirname = os.path.dirname(os.path.abspath(__file__))

--- a/metagpt/utils/repair_llm_raw_output.py
+++ b/metagpt/utils/repair_llm_raw_output.py
@ -4,12 +4,12 @@

 import copy
 from enum import Enum
-from typing import Callable, Union
+from typing import Callable, Optional, Union

 import regex as re
 from tenacity import RetryCallState, retry, stop_after_attempt, wait_fixed

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.logs import logger
 from metagpt.utils.custom_decoder import CustomDecoder

@ -154,7 +154,9 @@ def _repair_llm_raw_output(output: str, req_key: str, repair_type: RepairType =
    return output


-def repair_llm_raw_output(output: str, req_keys: list[str], repair_type: RepairType = None) -> str:
+def repair_llm_raw_output(
+    output: str, req_keys: list[str], repair_type: RepairType = None, config: Optional[Config] = None
+) -> str:
    """
    in open-source llm model, it usually can't follow the instruction well, the output may be incomplete,
    so here we try to repair it and use all repair methods by default.
@ -169,6 +171,7 @@ def repair_llm_raw_output(output: str, req_keys: list[str], repair_type: RepairT
            target: { xxx }
            output: { xxx }]
    """
+    config = config if config else Config.default()
    if not config.repair_llm_output:
        return output

@ -256,6 +259,7 @@ def run_after_exp_and_passon_next_retry(logger: "loguru.Logger") -> Callable[["R
                "next_action":"None"
            }
        """
+        config = Config.default()
        if retry_state.outcome.failed:
            if retry_state.args:
                # # can't be used as args=retry_state.args
@ -276,8 +280,12 @@ def run_after_exp_and_passon_next_retry(logger: "loguru.Logger") -> Callable[["R
    return run_and_passon


+def repair_stop_after_attempt(retry_state):
+    return stop_after_attempt(3 if Config.default().repair_llm_output else 0)(retry_state)
+
+
@retry(
-    stop=stop_after_attempt(3 if config.repair_llm_output else 0),
+    stop=repair_stop_after_attempt,
    wait=wait_fixed(1),
    after=run_after_exp_and_passon_next_retry(logger),
 )
--- a/tests/metagpt/provider/test_base_llm.py
+++ b/tests/metagpt/provider/test_base_llm.py
@ -10,8 +10,9 @@ import pytest

 from metagpt.configs.compress_msg_config import CompressType
 from metagpt.configs.llm_config import LLMConfig
+from metagpt.const import IMAGES
 from metagpt.provider.base_llm import BaseLLM
-from metagpt.schema import Message
+from metagpt.schema import AIMessage, Message, UserMessage
 from tests.metagpt.provider.mock_llm_config import mock_llm_config
 from tests.metagpt.provider.req_resp_const import (
    default_resp_cont,
@ -163,3 +164,41 @@ def test_compress_messages_long_no_sys_msg(compress_type):
    print(compressed)
    assert compressed
    assert len(compressed[0]["content"]) < len(messages[0]["content"])
+
+
+def test_format_msg(mocker):
+    base_llm = MockBaseLLM()
+    messages = [UserMessage(content="req"), AIMessage(content="rsp")]
+    formatted_msgs = base_llm.format_msg(messages)
+    assert formatted_msgs == [{"role": "user", "content": "req"}, {"role": "assistant", "content": "rsp"}]
+
+
+def test_format_msg_w_images(mocker):
+    base_llm = MockBaseLLM()
+    base_llm.config.model = "gpt-4o"
+    msg_w_images = UserMessage(content="req1")
+    msg_w_images.add_metadata(IMAGES, ["base64 string 1", "base64 string 2"])
+    msg_w_empty_images = UserMessage(content="req2")
+    msg_w_empty_images.add_metadata(IMAGES, [])
+    messages = [
+        msg_w_images,  # should be converted
+        AIMessage(content="rsp"),
+        msg_w_empty_images,  # should not be converted
+    ]
+    formatted_msgs = base_llm.format_msg(messages)
+    assert formatted_msgs == [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "req1"},
+                {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,base64 string 1"}},
+                {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,base64 string 2"}},
+            ],
+        },
+        {"role": "assistant", "content": "rsp"},
+        {"role": "user", "content": "req2"},
+    ]
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-s"])
--- a/tests/metagpt/roles/di/run_swe_agent_for_benchmark.py
+++ b/tests/metagpt/roles/di/run_swe_agent_for_benchmark.py
@ -2,13 +2,14 @@ import asyncio
 import json
 from datetime import datetime

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.const import DEFAULT_WORKSPACE_ROOT, METAGPT_ROOT
 from metagpt.logs import logger
 from metagpt.roles.di.swe_agent import SWEAgent
 from metagpt.tools.libs.terminal import Terminal
 from metagpt.tools.swe_agent_commands.swe_agent_utils import load_hf_dataset

+config = Config.default()
 # Specify by yourself
 TEST_REPO_DIR = METAGPT_ROOT / "data" / "test_repo"
 DATA_DIR = METAGPT_ROOT / "data/hugging_face"
--- a/tests/metagpt/test_document.py
+++ b/tests/metagpt/test_document.py
@ -5,10 +5,12 @@
@Author  : alexanderwu
@File    : test_document.py
 """
-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.document import Repo
 from metagpt.logs import logger

+config = Config.default()
+

 def set_existing_repo(path):
    repo1 = Repo.from_path(path)
--- a/tests/metagpt/tools/test_azure_tts.py
+++ b/tests/metagpt/tools/test_azure_tts.py
@ -12,9 +12,11 @@ from pathlib import Path
 import pytest
 from azure.cognitiveservices.speech import ResultReason, SpeechSynthesizer

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.tools.azure_tts import AzureTTS

+config = Config.default()
+

@pytest.mark.asyncio
 async def test_azure_tts(mocker):
--- a/tests/metagpt/tools/test_metagpt_text_to_image.py
+++ b/tests/metagpt/tools/test_metagpt_text_to_image.py
@ -10,9 +10,11 @@ from unittest.mock import AsyncMock

 import pytest

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.tools.metagpt_text_to_image import oas3_metagpt_text_to_image

+config = Config.default()
+

@pytest.mark.asyncio
 async def test_draw(mocker):
--- a/tests/metagpt/tools/test_moderation.py
+++ b/tests/metagpt/tools/test_moderation.py
@ -8,10 +8,12 @@

 import pytest

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.llm import LLM
 from metagpt.tools.moderation import Moderation

+config = Config.default()
+

@pytest.mark.asyncio
@pytest.mark.parametrize(
--- a/tests/metagpt/tools/test_openai_text_to_image.py
+++ b/tests/metagpt/tools/test_openai_text_to_image.py
@ -11,7 +11,7 @@ import openai
 import pytest
 from pydantic import BaseModel

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.llm import LLM
 from metagpt.tools.openai_text_to_image import (
    OpenAIText2Image,
@ -19,6 +19,8 @@ from metagpt.tools.openai_text_to_image import (
 )
 from metagpt.utils.s3 import S3

+config = Config.default()
+

@pytest.mark.asyncio
 async def test_draw(mocker):
--- a/tests/metagpt/tools/test_ut_writer.py
+++ b/tests/metagpt/tools/test_ut_writer.py
@ -20,10 +20,12 @@ from openai.types.chat.chat_completion_message_tool_call import (
    Function,
 )

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.const import API_QUESTIONS_PATH, UT_PY_PATH
 from metagpt.tools.ut_writer import YFT_PROMPT_PREFIX, UTGenerator

+config = Config.default()
+

 class TestUTWriter:
    @pytest.mark.asyncio
--- a/tests/metagpt/utils/test_common.py
+++ b/tests/metagpt/utils/test_common.py
@ -29,9 +29,9 @@ from metagpt.utils.common import (
    awrite,
    check_cmd_exists,
    concat_namespace,
+    extract_and_encode_images,
    extract_image_paths,
    import_class_inst,
-    is_support_image_input,
    parse_recipient,
    print_members,
    read_file_block,
@ -231,9 +231,8 @@ def test_extract_image_paths():
    assert not extract_image_paths(content)


-def test_is_support_image_input():
-    assert is_support_image_input("gpt-4o-2024-08-06")
-    assert not is_support_image_input("deepseek-coder")
+def test_extract_and_encode_images():
+    assert not extract_and_encode_images("a non-existing.jpg")


 if __name__ == "__main__":
--- a/tests/metagpt/utils/test_repair_llm_raw_output.py
+++ b/tests/metagpt/utils/test_repair_llm_raw_output.py
@ -2,7 +2,9 @@
 # -*- coding: utf-8 -*-
 # @Desc   : unittest of repair_llm_raw_output

-from metagpt.config2 import config
+from metagpt.config2 import Config
+
+config = Config.default()

 """
 CONFIG.repair_llm_output should be True before retry_parse_json_text imported.
--- a/tests/mock/mock_llm.py
+++ b/tests/mock/mock_llm.py
@ -1,7 +1,7 @@
 import json
 from typing import Optional, Union

-from metagpt.config2 import config
+from metagpt.config2 import Config
 from metagpt.configs.llm_config import LLMType
 from metagpt.const import LLM_API_TIMEOUT
 from metagpt.logs import logger
@ -10,6 +10,8 @@ from metagpt.provider.constant import GENERAL_FUNCTION_SCHEMA
 from metagpt.provider.openai_api import OpenAILLM
 from metagpt.schema import Message

+config = Config.default()
+
 OriginalLLM = OpenAILLM if config.llm.api_type == LLMType.OPENAI else AzureOpenAILLM