Merge pull request #685 from garylin2099/llm_mock

Reduce test time with a global LLM mock
2026-07-17 16:41:05 +02:00 · 2024-01-05 17:11:25 +08:00 · 2024-01-05 17:11:25 +08:00 · 230192f5e0
commit 230192f5e0
parent 136b3f5d28 bd4a35fd94
51 changed files with 289 additions and 217 deletions
--- a/metagpt/actions/invoice_ocr.py
+++ b/metagpt/actions/invoice_ocr.py
@ -88,6 +88,8 @@ class InvoiceOCR(Action):
    async def _ocr(invoice_file_path: Path):
        ocr = PaddleOCR(use_angle_cls=True, lang="ch", page_num=1)
        ocr_result = ocr.ocr(str(invoice_file_path), cls=True)
+        for result in ocr_result[0]:
+            result[1] = (result[1][0], round(result[1][1], 2))  # round long confidence scores to reduce token costs
        return ocr_result

    async def run(self, file_path: Path, *args, **kwargs) -> list:
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -12,7 +12,6 @@ import logging
 import os
 import re
 import uuid
-from typing import Optional

 import pytest

@ -20,49 +19,13 @@ from metagpt.config import CONFIG, Config
 from metagpt.const import DEFAULT_WORKSPACE_ROOT, TEST_DATA_PATH
 from metagpt.llm import LLM
 from metagpt.logs import logger
-from metagpt.provider.openai_api import OpenAILLM
 from metagpt.utils.git_repository import GitRepository
+from tests.mock.mock_llm import MockLLM

-
-class MockLLM(OpenAILLM):
-    rsp_cache: dict = {}
-
-    async def original_aask(
-        self,
-        msg: str,
-        system_msgs: Optional[list[str]] = None,
-        format_msgs: Optional[list[dict[str, str]]] = None,
-        timeout=3,
-        stream=True,
-    ):
-        """A copy of metagpt.provider.base_llm.BaseLLM.aask, we can't use super().aask because it will be mocked"""
-        if system_msgs:
-            message = self._system_msgs(system_msgs)
-        else:
-            message = [self._default_system_msg()] if self.use_system_prompt else []
-        if format_msgs:
-            message.extend(format_msgs)
-        message.append(self._user_msg(msg))
-        rsp = await self.acompletion_text(message, stream=stream, timeout=timeout)
-        return rsp
-
-    async def aask(
-        self,
-        msg: str,
-        system_msgs: Optional[list[str]] = None,
-        format_msgs: Optional[list[dict[str, str]]] = None,
-        timeout=3,
-        stream=True,
-    ) -> str:
-        if msg not in self.rsp_cache:
-            # Call the original unmocked method
-            rsp = await self.original_aask(msg, system_msgs, format_msgs, timeout, stream)
-            logger.info(f"Added '{rsp[:20]}' ... to response cache")
-            self.rsp_cache[msg] = rsp
-            return rsp
-        else:
-            logger.info("Use response cache")
-            return self.rsp_cache[msg]
+RSP_CACHE_NEW = {}  # collected globally to produce a new response cache holding only the entries actually used by passing tests
+ALLOW_OPENAI_API_CALL = os.environ.get(
+    "ALLOW_OPENAI_API_CALL", True
+)  # NOTE: should change to default False once mock is complete


@pytest.fixture(scope="session")
@ -76,16 +39,37 @@ def rsp_cache():
    else:
        rsp_cache_json = {}
    yield rsp_cache_json
-    with open(new_rsp_cache_file_path, "w") as f2:
+    with open(rsp_cache_file_path, "w") as f2:
        json.dump(rsp_cache_json, f2, indent=4, ensure_ascii=False)
+    with open(new_rsp_cache_file_path, "w") as f2:
+        json.dump(RSP_CACHE_NEW, f2, indent=4, ensure_ascii=False)


-@pytest.fixture(scope="function")
-def llm_mock(rsp_cache, mocker):
-    llm = MockLLM()
+# Hook to capture the test result
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    outcome = yield
+    rep = outcome.get_result()
+    if rep.when == "call":
+        item.test_outcome = rep
+
+
+@pytest.fixture(scope="function", autouse=True)
+def llm_mock(rsp_cache, mocker, request):
+    llm = MockLLM(allow_open_api_call=ALLOW_OPENAI_API_CALL)
    llm.rsp_cache = rsp_cache
    mocker.patch("metagpt.provider.base_llm.BaseLLM.aask", llm.aask)
+    mocker.patch("metagpt.provider.base_llm.BaseLLM.aask_batch", llm.aask_batch)
    yield mocker
+    if hasattr(request.node, "test_outcome") and request.node.test_outcome.passed:
+        if llm.rsp_candidates:
+            for rsp_candidate in llm.rsp_candidates:
+                cand_key = list(rsp_candidate.keys())[0]
+                cand_value = list(rsp_candidate.values())[0]
+                if cand_key not in llm.rsp_cache:
+                    logger.info(f"Added '{cand_key[:100]} ... -> {cand_value[:20]} ...' to response cache")
+                    llm.rsp_cache.update(rsp_candidate)
+                RSP_CACHE_NEW.update(rsp_candidate)


 class Context:
@ -173,6 +157,13 @@ def init_config():
    Config()


+@pytest.fixture(scope="function")
+def new_filename(mocker):
+    # NOTE: Mock new_filename so that LLM aask prompts are reproducible; consider changing this after implementing requirement segmentation
+    mocker.patch("metagpt.utils.file_repository.FileRepository.new_filename", lambda: "20240101")
+    yield mocker
+
+
@pytest.fixture
 def aiohttp_mocker(mocker):
    class MockAioResponse:
--- a/tests/data/rsp_cache.json
+++ b/tests/data/rsp_cache.json
--- a/tests/metagpt/actions/test_debug_error.py
+++ b/tests/metagpt/actions/test_debug_error.py
@ -117,7 +117,6 @@ if __name__ == '__main__':


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_debug_error():
    CONFIG.src_workspace = CONFIG.git_repo.workdir / uuid.uuid4().hex
    ctx = RunCodeContext(
--- a/tests/metagpt/actions/test_design_api.py
+++ b/tests/metagpt/actions/test_design_api.py
@ -17,7 +17,6 @@ from tests.metagpt.actions.mock_markdown import PRD_SAMPLE


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_design_api():
    inputs = ["我们需要一个音乐播放器，它应该有播放、暂停、上一曲、下一曲等功能。", PRD_SAMPLE]
    for prd in inputs:
--- a/tests/metagpt/actions/test_design_api_review.py
+++ b/tests/metagpt/actions/test_design_api_review.py
@ -11,7 +11,6 @@ from metagpt.actions.design_api_review import DesignReview


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_design_api_review():
    prd = "我们需要一个音乐播放器，它应该有播放、暂停、上一曲、下一曲等功能。"
    api_design = """
--- a/tests/metagpt/actions/test_generate_questions.py
+++ b/tests/metagpt/actions/test_generate_questions.py
@ -20,7 +20,6 @@ context = """


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_generate_questions():
    action = GenerateQuestions()
    rsp = await action.run(context)
--- a/tests/metagpt/actions/test_invoice_ocr.py
+++ b/tests/metagpt/actions/test_invoice_ocr.py
@ -54,7 +54,6 @@ async def test_generate_table(invoice_path: Path, expected_result: dict):
    ("invoice_path", "query", "expected_result"),
    [(Path("invoices/invoice-1.pdf"), "Invoicing date", "2023年02月03日")],
 )
-@pytest.mark.usefixtures("llm_mock")
 async def test_reply_question(invoice_path: Path, query: dict, expected_result: str):
    invoice_path = TEST_DATA_PATH / invoice_path
    ocr_result = await InvoiceOCR().run(file_path=Path(invoice_path))
--- a/tests/metagpt/actions/test_prepare_interview.py
+++ b/tests/metagpt/actions/test_prepare_interview.py
@ -12,7 +12,6 @@ from metagpt.logs import logger


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_prepare_interview():
    action = PrepareInterview()
    rsp = await action.run("I just graduated and hope to find a job as a Python engineer")
--- a/tests/metagpt/actions/test_project_management.py
+++ b/tests/metagpt/actions/test_project_management.py
@ -18,7 +18,6 @@ from tests.metagpt.actions.mock_json import DESIGN, PRD


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_design_api():
    await FileRepository.save_file("1.txt", content=str(PRD), relative_path=PRDS_FILE_REPO)
    await FileRepository.save_file("1.txt", content=str(DESIGN), relative_path=SYSTEM_DESIGN_FILE_REPO)
--- a/tests/metagpt/actions/test_research.py
+++ b/tests/metagpt/actions/test_research.py
@ -8,14 +8,7 @@

 import pytest

-from metagpt.actions import CollectLinks, research
-
-
-@pytest.mark.asyncio
-async def test_action():
-    action = CollectLinks()
-    result = await action.run(topic="baidu")
-    assert result
+from metagpt.actions import research


@pytest.mark.asyncio
--- a/tests/metagpt/actions/test_summarize_code.py
+++ b/tests/metagpt/actions/test_summarize_code.py
@ -177,7 +177,6 @@ class Snake:


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_summarize_code():
    CONFIG.src_workspace = CONFIG.git_repo.workdir / "src"
    await FileRepository.save_file(filename="1.json", relative_path=SYSTEM_DESIGN_FILE_REPO, content=DESIGN_CONTENT)
--- a/tests/metagpt/actions/test_talk_action.py
+++ b/tests/metagpt/actions/test_talk_action.py
@ -33,7 +33,6 @@ from metagpt.schema import Message
        ),
    ],
 )
-@pytest.mark.usefixtures("llm_mock")
 async def test_prompt(agent_description, language, context, knowledge, history_summary):
    # Prerequisites
    CONFIG.agent_description = agent_description
--- a/tests/metagpt/actions/test_write_code.py
+++ b/tests/metagpt/actions/test_write_code.py
@ -28,7 +28,6 @@ from tests.metagpt.actions.mock_markdown import TASKS_2, WRITE_CODE_PROMPT_SAMPL


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_code():
    context = CodingContext(
        filename="task_filename.py", design_doc=Document(content="设计一个名为'add'的函数，该函数接受两个整数作为输入，并返回它们的和。")
@ -45,7 +44,6 @@ async def test_write_code():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_code_directly():
    prompt = WRITE_CODE_PROMPT_SAMPLE + "\n" + TASKS_2[0]
    llm = LLM()
@ -54,7 +52,6 @@ async def test_write_code_directly():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_code_deps():
    # Prerequisites
    CONFIG.src_workspace = CONFIG.git_repo.workdir / "snake1/snake1"
--- a/tests/metagpt/actions/test_write_code_review.py
+++ b/tests/metagpt/actions/test_write_code_review.py
@ -12,7 +12,6 @@ from metagpt.schema import CodingContext, Document


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_code_review(capfd):
    code = """
 def add(a, b):
--- a/tests/metagpt/actions/test_write_docstring.py
+++ b/tests/metagpt/actions/test_write_docstring.py
@ -27,14 +27,12 @@ class Person:
    ],
    ids=["google", "numpy", "sphinx"],
 )
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_docstring(style: str, part: str):
    ret = await WriteDocstring().run(code, style=style)
    assert part in ret


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write():
    code = await WriteDocstring.write_docstring(__file__)
    assert code
--- a/tests/metagpt/actions/test_write_prd.py
+++ b/tests/metagpt/actions/test_write_prd.py
@ -18,8 +18,7 @@ from metagpt.utils.file_repository import FileRepository


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
-async def test_write_prd():
+async def test_write_prd(new_filename):
    product_manager = ProductManager()
    requirements = "开发一个基于大语言模型与私有知识库的搜索引擎，希望可以基于大语言模型进行搜索总结"
    await FileRepository.save_file(filename=REQUIREMENT_FILENAME, content=requirements, relative_path=DOCS_FILE_REPO)
--- a/tests/metagpt/actions/test_write_prd_review.py
+++ b/tests/metagpt/actions/test_write_prd_review.py
@ -11,7 +11,6 @@ from metagpt.actions.write_prd_review import WritePRDReview


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_prd_review():
    prd = """
    Introduction: This is a new feature for our product.
--- a/tests/metagpt/actions/test_write_review.py
+++ b/tests/metagpt/actions/test_write_review.py
@ -46,7 +46,6 @@ CONTEXT = """


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_review():
    write_review = WriteReview()
    review = await write_review.run(CONTEXT)
--- a/tests/metagpt/actions/test_write_teaching_plan.py
+++ b/tests/metagpt/actions/test_write_teaching_plan.py
@ -16,7 +16,6 @@ from metagpt.actions.write_teaching_plan import WriteTeachingPlanPart
    ("topic", "context"),
    [("Title", "Lesson 1: Learn to draw an apple."), ("Teaching Content", "Lesson 1: Learn to draw an apple.")],
 )
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_teaching_plan_part(topic, context):
    action = WriteTeachingPlanPart(topic=topic, context=context)
    rsp = await action.run()
--- a/tests/metagpt/actions/test_write_test.py
+++ b/tests/metagpt/actions/test_write_test.py
@ -13,7 +13,6 @@ from metagpt.schema import Document, TestingContext


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_test():
    code = """
    import random
@ -40,7 +39,6 @@ async def test_write_test():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_code_invalid_code(mocker):
    # Mock the _aask method to return an invalid code string
    mocker.patch.object(WriteTest, "_aask", return_value="Invalid Code String")
--- a/tests/metagpt/actions/test_write_tutorial.py
+++ b/tests/metagpt/actions/test_write_tutorial.py
@ -14,7 +14,6 @@ from metagpt.actions.write_tutorial import WriteContent, WriteDirectory

@pytest.mark.asyncio
@pytest.mark.parametrize(("language", "topic"), [("English", "Write a tutorial about Python")])
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_directory(language: str, topic: str):
    ret = await WriteDirectory(language=language).run(topic=topic)
    assert isinstance(ret, dict)
@ -30,7 +29,6 @@ async def test_write_directory(language: str, topic: str):
    ("language", "topic", "directory"),
    [("English", "Write a tutorial about Python", {"Introduction": ["What is Python?", "Why learn Python?"]})],
 )
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_content(language: str, topic: str, directory: Dict):
    ret = await WriteContent(language=language, directory=directory).run(topic=topic)
    assert isinstance(ret, str)
--- a/tests/metagpt/document_store/test_qdrant_store.py
+++ b/tests/metagpt/document_store/test_qdrant_store.py
@ -29,6 +29,16 @@ points = [
 ]


+def assert_almost_equal(actual, expected):
+    delta = 1e-10
+    if isinstance(expected, list):
+        assert len(actual) == len(expected)
+        for ac, exp in zip(actual, expected):
+            assert abs(ac - exp) <= delta, f"{ac} is not within {delta} of {exp}"
+    else:
+        assert abs(actual - expected) <= delta, f"{actual} is not within {delta} of {expected}"
+
+
 def test_qdrant_store():
    qdrant_connection = QdrantConnection(memory=True)
    vectors_config = VectorParams(size=2, distance=Distance.COSINE)
@ -42,30 +52,30 @@ def test_qdrant_store():
    qdrant_store.add("Book", points)
    results = qdrant_store.search("Book", query=[1.0, 1.0])
    assert results[0]["id"] == 2
-    assert results[0]["score"] == 0.999106722578389
+    assert_almost_equal(results[0]["score"], 0.999106722578389)
    assert results[1]["id"] == 7
-    assert results[1]["score"] == 0.9961650411397226
+    assert_almost_equal(results[1]["score"], 0.9961650411397226)
    results = qdrant_store.search("Book", query=[1.0, 1.0], return_vector=True)
    assert results[0]["id"] == 2
-    assert results[0]["score"] == 0.999106722578389
-    assert results[0]["vector"] == [0.7363563179969788, 0.6765939593315125]
+    assert_almost_equal(results[0]["score"], 0.999106722578389)
+    assert_almost_equal(results[0]["vector"], [0.7363563179969788, 0.6765939593315125])
    assert results[1]["id"] == 7
-    assert results[1]["score"] == 0.9961650411397226
-    assert results[1]["vector"] == [0.7662628889083862, 0.6425272226333618]
+    assert_almost_equal(results[1]["score"], 0.9961650411397226)
+    assert_almost_equal(results[1]["vector"], [0.7662628889083862, 0.6425272226333618])
    results = qdrant_store.search(
        "Book",
        query=[1.0, 1.0],
        query_filter=Filter(must=[FieldCondition(key="rand_number", range=Range(gte=8))]),
    )
    assert results[0]["id"] == 8
-    assert results[0]["score"] == 0.9100373450784073
+    assert_almost_equal(results[0]["score"], 0.9100373450784073)
    assert results[1]["id"] == 9
-    assert results[1]["score"] == 0.7127610621127889
+    assert_almost_equal(results[1]["score"], 0.7127610621127889)
    results = qdrant_store.search(
        "Book",
        query=[1.0, 1.0],
        query_filter=Filter(must=[FieldCondition(key="rand_number", range=Range(gte=8))]),
        return_vector=True,
    )
-    assert results[0]["vector"] == [0.35037919878959656, 0.9366079568862915]
-    assert results[1]["vector"] == [0.9999677538871765, 0.00802854634821415]
+    assert_almost_equal(results[0]["vector"], [0.35037919878959656, 0.9366079568862915])
+    assert_almost_equal(results[1]["vector"], [0.9999677538871765, 0.00802854634821415])
--- a/tests/metagpt/provider/conftest.py
+++ b/tests/metagpt/provider/conftest.py
@ -0,0 +1,8 @@
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def llm_mock(rsp_cache, mocker, request):
+    # An empty fixture to override the global llm_mock fixture,
+    # because in the provider folder we want to test the aask and aask_batch functions of the specific models
+    pass
--- a/tests/metagpt/roles/test_architect.py
+++ b/tests/metagpt/roles/test_architect.py
@ -22,7 +22,6 @@ from tests.metagpt.roles.mock import MockMessages


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_architect():
    # Prerequisites
    filename = uuid.uuid4().hex + ".json"
--- a/tests/metagpt/roles/test_assistant.py
+++ b/tests/metagpt/roles/test_assistant.py
@ -13,7 +13,6 @@ from pydantic import BaseModel
 from metagpt.actions.skill_action import SkillAction
 from metagpt.actions.talk_action import TalkAction
 from metagpt.config import CONFIG
-from metagpt.logs import logger
 from metagpt.memory.brain_memory import BrainMemory
 from metagpt.roles.assistant import Assistant
 from metagpt.schema import Message
@ -21,7 +20,6 @@ from metagpt.utils.common import any_to_str


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_run():
    CONFIG.language = "Chinese"

@ -88,7 +86,7 @@ async def test_run():
            if not has_action:
                break
            msg: Message = await role.act()
-            logger.info(msg)
+            # logger.info(msg)
            assert msg
            assert msg.cause_by == seed.cause_by
            assert msg.content
--- a/tests/metagpt/roles/test_engineer.py
+++ b/tests/metagpt/roles/test_engineer.py
@ -30,7 +30,6 @@ from tests.metagpt.roles.mock import STRS_FOR_PARSING, TASKS, MockMessages


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_engineer():
    # Prerequisites
    rqno = "20231221155954.json"
@ -114,7 +113,6 @@ def test_todo():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_new_coding_context():
    # Prerequisites
    demo_path = Path(__file__).parent / "../../data/demo_project"
--- a/tests/metagpt/roles/test_invoice_ocr_assistant.py
+++ b/tests/metagpt/roles/test_invoice_ocr_assistant.py
@ -41,7 +41,6 @@ from metagpt.schema import Message
        ),
    ],
 )
-@pytest.mark.usefixtures("llm_mock")
 async def test_invoice_ocr_assistant(query: str, invoice_path: Path, invoice_table_path: Path, expected_result: dict):
    invoice_path = TEST_DATA_PATH / invoice_path
    role = InvoiceOCRAssistant()
--- a/tests/metagpt/roles/test_product_manager.py
+++ b/tests/metagpt/roles/test_product_manager.py
@ -13,8 +13,7 @@ from tests.metagpt.roles.mock import MockMessages


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
-async def test_product_manager():
+async def test_product_manager(new_filename):
    product_manager = ProductManager()
    rsp = await product_manager.run(MockMessages.req)
    logger.info(rsp)
--- a/tests/metagpt/roles/test_project_manager.py
+++ b/tests/metagpt/roles/test_project_manager.py
@ -13,7 +13,6 @@ from tests.metagpt.roles.mock import MockMessages


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_project_manager():
    project_manager = ProjectManager()
    rsp = await project_manager.run(MockMessages.system_design)
--- a/tests/metagpt/roles/test_teacher.py
+++ b/tests/metagpt/roles/test_teacher.py
@ -103,7 +103,6 @@ async def test_new_file_name():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_run():
    CONFIG.set_context({"language": "Chinese", "teaching_language": "English"})
    lesson = """
--- a/tests/metagpt/roles/test_tutorial_assistant.py
+++ b/tests/metagpt/roles/test_tutorial_assistant.py
@ -15,7 +15,6 @@ from metagpt.roles.tutorial_assistant import TutorialAssistant

@pytest.mark.asyncio
@pytest.mark.parametrize(("language", "topic"), [("Chinese", "Write a tutorial about pip")])
-@pytest.mark.usefixtures("llm_mock")
 async def test_tutorial_assistant(language: str, topic: str):
    role = TutorialAssistant(language=language)
    msg = await role.run(topic)
--- a/tests/metagpt/serialize_deserialize/test_action.py
+++ b/tests/metagpt/serialize_deserialize/test_action.py
@ -21,7 +21,6 @@ def test_action_serialize():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_action_deserialize():
    action = Action()
    serialized_data = action.model_dump()
--- a/tests/metagpt/serialize_deserialize/test_architect_deserialize.py
+++ b/tests/metagpt/serialize_deserialize/test_architect_deserialize.py
@ -17,7 +17,6 @@ def test_architect_serialize():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_architect_deserialize():
    role = Architect()
    ser_role_dict = role.model_dump(by_alias=True)
--- a/tests/metagpt/serialize_deserialize/test_prepare_interview.py
+++ b/tests/metagpt/serialize_deserialize/test_prepare_interview.py
@ -8,7 +8,6 @@ from metagpt.actions.prepare_interview import PrepareInterview


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_action_deserialize():
    action = PrepareInterview()
    serialized_data = action.model_dump()
--- a/tests/metagpt/serialize_deserialize/test_product_manager.py
+++ b/tests/metagpt/serialize_deserialize/test_product_manager.py
@ -10,8 +10,7 @@ from metagpt.schema import Message


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
-async def test_product_manager_deserialize():
+async def test_product_manager_deserialize(new_filename):
    role = ProductManager()
    ser_role_dict = role.model_dump(by_alias=True)
    new_role = ProductManager(**ser_role_dict)
--- a/tests/metagpt/serialize_deserialize/test_project_manager.py
+++ b/tests/metagpt/serialize_deserialize/test_project_manager.py
@ -18,7 +18,6 @@ def test_project_manager_serialize():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_project_manager_deserialize():
    role = ProjectManager()
    ser_role_dict = role.model_dump(by_alias=True)
--- a/tests/metagpt/serialize_deserialize/test_role.py
+++ b/tests/metagpt/serialize_deserialize/test_role.py
@ -69,7 +69,6 @@ def test_engineer_serialize():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_engineer_deserialize():
    role = Engineer(use_code_review=True)
    ser_role_dict = role.model_dump()
@ -97,7 +96,6 @@ def test_role_serdeser_save():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_role_serdeser_interrupt():
    role_c = RoleC()
    shutil.rmtree(SERDESER_PATH.joinpath("team"), ignore_errors=True)
--- a/tests/metagpt/serialize_deserialize/test_team.py
+++ b/tests/metagpt/serialize_deserialize/test_team.py
@ -109,7 +109,6 @@ async def test_team_recover_save():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_team_recover_multi_roles_save():
    idea = "write a snake game"
    stg_path = SERDESER_PATH.joinpath("team")
--- a/tests/metagpt/serialize_deserialize/test_write_code.py
+++ b/tests/metagpt/serialize_deserialize/test_write_code.py
@ -17,7 +17,6 @@ def test_write_design_serialize():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_code_deserialize():
    context = CodingContext(
        filename="test_code.py", design_doc=Document(content="write add function to calculate two numbers")
--- a/tests/metagpt/serialize_deserialize/test_write_code_review.py
+++ b/tests/metagpt/serialize_deserialize/test_write_code_review.py
@ -9,7 +9,6 @@ from metagpt.schema import CodingContext, Document


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_code_review_deserialize():
    code_content = """
 def div(a: int, b: int = 0):
--- a/tests/metagpt/serialize_deserialize/test_write_design.py
+++ b/tests/metagpt/serialize_deserialize/test_write_design.py
@ -22,7 +22,6 @@ def test_write_task_serialize():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_design_deserialize():
    action = WriteDesign()
    serialized_data = action.model_dump()
@ -32,7 +31,6 @@ async def test_write_design_deserialize():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_task_deserialize():
    action = WriteTasks()
    serialized_data = action.model_dump()
--- a/tests/metagpt/serialize_deserialize/test_write_docstring.py
+++ b/tests/metagpt/serialize_deserialize/test_write_docstring.py
@ -29,7 +29,6 @@ class Person:
    ],
    ids=["google", "numpy", "sphinx"],
 )
-@pytest.mark.usefixtures("llm_mock")
 async def test_action_deserialize(style: str, part: str):
    action = WriteDocstring()
    serialized_data = action.model_dump()
--- a/tests/metagpt/serialize_deserialize/test_write_prd.py
+++ b/tests/metagpt/serialize_deserialize/test_write_prd.py
@ -9,7 +9,7 @@ from metagpt.actions import WritePRD
 from metagpt.schema import Message


-def test_action_serialize():
+def test_action_serialize(new_filename):
    action = WritePRD()
    ser_action_dict = action.model_dump()
    assert "name" in ser_action_dict
@ -17,8 +17,7 @@ def test_action_serialize():


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
-async def test_action_deserialize():
+async def test_action_deserialize(new_filename):
    action = WritePRD()
    serialized_data = action.model_dump()
    new_action = WritePRD(**serialized_data)
--- a/tests/metagpt/serialize_deserialize/test_write_review.py
+++ b/tests/metagpt/serialize_deserialize/test_write_review.py
@ -42,7 +42,6 @@ CONTEXT = """


@pytest.mark.asyncio
-@pytest.mark.usefixtures("llm_mock")
 async def test_action_deserialize():
    action = WriteReview()
    serialized_data = action.model_dump()
--- a/tests/metagpt/serialize_deserialize/test_write_tutorial.py
+++ b/tests/metagpt/serialize_deserialize/test_write_tutorial.py
@ -9,7 +9,6 @@ from metagpt.actions.write_tutorial import WriteContent, WriteDirectory

@pytest.mark.asyncio
@pytest.mark.parametrize(("language", "topic"), [("English", "Write a tutorial about Python")])
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_directory_deserialize(language: str, topic: str):
    action = WriteDirectory()
    serialized_data = action.model_dump()
@ -31,7 +30,6 @@ async def test_write_directory_deserialize(language: str, topic: str):
    ("language", "topic", "directory"),
    [("English", "Write a tutorial about Python", {"Introduction": ["What is Python?", "Why learn Python?"]})],
 )
-@pytest.mark.usefixtures("llm_mock")
 async def test_write_content_deserialize(language: str, topic: str, directory: Dict):
    action = WriteContent(language=language, directory=directory)
    serialized_data = action.model_dump()
--- a/tests/metagpt/test_environment.py
+++ b/tests/metagpt/test_environment.py
@ -45,7 +45,7 @@ def test_get_roles(env: Environment):


@pytest.mark.asyncio
-async def test_publish_and_process_message(env: Environment):
+async def test_publish_and_process_message(env: Environment, new_filename):
    if CONFIG.git_repo:
        CONFIG.git_repo.delete_repository()
        CONFIG.git_repo = None
--- a/tests/metagpt/test_startup.py
+++ b/tests/metagpt/test_startup.py
@ -16,14 +16,14 @@ runner = CliRunner()


@pytest.mark.asyncio
-async def test_empty_team():
+async def test_empty_team(new_filename):
    # FIXME: we're now using "metagpt" cli, so the entrance should be replaced instead.
    company = Team()
    history = await company.run(idea="Build a simple search system. I will upload my files later.")
    logger.info(history)


-def test_startup():
+def test_startup(new_filename):
    args = ["Make a cli snake game"]
    result = runner.invoke(app, args)
    logger.info(result)
--- a/tests/metagpt/tools/test_sd_tool.py
+++ b/tests/metagpt/tools/test_sd_tool.py
@ -1,26 +0,0 @@
-# -*- coding: utf-8 -*-
-# @Date    : 2023/7/22 02:40
-# @Author  : stellahong (stellahong@deepwisdom.ai)
-#
-import os
-
-from metagpt.config import CONFIG
-from metagpt.tools.sd_engine import SDEngine
-
-
-def test_sd_engine_init():
-    sd_engine = SDEngine()
-    assert sd_engine.payload["seed"] == -1
-
-
-def test_sd_engine_generate_prompt():
-    sd_engine = SDEngine()
-    sd_engine.construct_payload(prompt="test")
-    assert sd_engine.payload["prompt"] == "test"
-
-
-async def test_sd_engine_run_t2i():
-    sd_engine = SDEngine()
-    await sd_engine.run_t2i(prompts=["test"])
-    img_path = CONFIG.workspace_path / "resources" / "SD_Output" / "output_0.png"
-    assert os.path.exists(img_path)
--- a/tests/metagpt/tools/test_translate.py
+++ b/tests/metagpt/tools/test_translate.py
@ -14,7 +14,6 @@ from metagpt.tools.translator import Translator

@pytest.mark.asyncio
@pytest.mark.usefixtures("llm_api")
-@pytest.mark.usefixtures("llm_mock")
 async def test_translate(llm_api):
    poetries = [
        ("Let life be beautiful like summer flowers", "花"),
--- a/tests/mock/mock_llm.py
+++ b/tests/mock/mock_llm.py
@ -0,0 +1,95 @@
+from typing import Optional
+
+from metagpt.logs import log_llm_stream, logger
+from metagpt.provider.openai_api import OpenAILLM
+
+
+class MockLLM(OpenAILLM):
+    def __init__(self, allow_open_api_call):
+        super().__init__()
+        self.allow_open_api_call = allow_open_api_call
+        self.rsp_cache: dict = {}
+        self.rsp_candidates: list[dict] = []  # a test can have multiple calls with the same llm, thus a list
+
+    async def acompletion_text(self, messages: list[dict], stream=False, timeout=3) -> str:
+        """Overwrite original acompletion_text to cancel retry"""
+        if stream:
+            resp = self._achat_completion_stream(messages, timeout=timeout)
+
+            collected_messages = []
+            async for i in resp:
+                log_llm_stream(i)
+                collected_messages.append(i)
+
+            full_reply_content = "".join(collected_messages)
+            usage = self._calc_usage(messages, full_reply_content)
+            self._update_costs(usage)
+            return full_reply_content
+
+        rsp = await self._achat_completion(messages, timeout=timeout)
+        return self.get_choice_text(rsp)
+
+    async def original_aask(
+        self,
+        msg: str,
+        system_msgs: Optional[list[str]] = None,
+        format_msgs: Optional[list[dict[str, str]]] = None,
+        timeout=3,
+        stream=True,
+    ):
+        """A copy of metagpt.provider.base_llm.BaseLLM.aask, we can't use super().aask because it will be mocked"""
+        if system_msgs:
+            message = self._system_msgs(system_msgs)
+        else:
+            message = [self._default_system_msg()] if self.use_system_prompt else []
+        if format_msgs:
+            message.extend(format_msgs)
+        message.append(self._user_msg(msg))
+        rsp = await self.acompletion_text(message, stream=stream, timeout=timeout)
+        return rsp
+
+    async def original_aask_batch(self, msgs: list, timeout=3) -> str:
+        """A copy of metagpt.provider.base_llm.BaseLLM.aask_batch, we can't use super().aask_batch because it will be mocked"""
+        context = []
+        for msg in msgs:
+            umsg = self._user_msg(msg)
+            context.append(umsg)
+            rsp_text = await self.acompletion_text(context, timeout=timeout)
+            context.append(self._assistant_msg(rsp_text))
+        return self._extract_assistant_rsp(context)
+
+    async def aask(
+        self,
+        msg: str,
+        system_msgs: Optional[list[str]] = None,
+        format_msgs: Optional[list[dict[str, str]]] = None,
+        timeout=3,
+        stream=True,
+    ) -> str:
+        msg_key = msg  # used to identify if a message has been called before
+        if system_msgs:
+            joined_system_msg = "#MSG_SEP#".join(system_msgs) + "#SYSTEM_MSG_END#"
+            msg_key = joined_system_msg + msg_key
+        rsp = await self._mock_rsp(msg_key, self.original_aask, msg, system_msgs, format_msgs, timeout, stream)
+        return rsp
+
+    async def aask_batch(self, msgs: list, timeout=3) -> str:
+        msg_key = "#MSG_SEP#".join([msg if isinstance(msg, str) else msg.content for msg in msgs])
+        rsp = await self._mock_rsp(msg_key, self.original_aask_batch, msgs, timeout)
+        return rsp
+
+    async def _mock_rsp(self, msg_key, ask_func, *args, **kwargs):
+        if msg_key not in self.rsp_cache:
+            if not self.allow_open_api_call:
+                raise ValueError(
+                    "In current test setting, api call is not allowed, you should properly mock your tests, "
+                    "or add expected api response in tests/data/rsp_cache.json. "
+                    f"The prompt you want for api call: {msg_key}"
+                )
+            # Call the original unmocked method
+            rsp = await ask_func(*args, **kwargs)
+        else:
+            logger.warning("Use response cache")
+            rsp = self.rsp_cache[msg_key]
+        self.rsp_candidates.append({msg_key: rsp})
+        return rsp