diff --git a/metagpt/tools/libs/editor.py b/metagpt/tools/libs/editor.py index c5c828ee2..a37c01421 100644 --- a/metagpt/tools/libs/editor.py +++ b/metagpt/tools/libs/editor.py @@ -10,10 +10,11 @@ import tempfile from pathlib import Path from typing import List, Optional, Union +import tiktoken from pydantic import BaseModel, ConfigDict from metagpt.const import DEFAULT_WORKSPACE_ROOT -from metagpt.tools.libs.index_repo import IndexRepo +from metagpt.tools.libs.index_repo import DEFAULT_MIN_TOKEN_COUNT, IndexRepo from metagpt.tools.libs.linter import Linter from metagpt.tools.tool_registry import register_tool from metagpt.utils.file import File @@ -106,6 +107,11 @@ class Editor(BaseModel): content = await File.read_text_file(path) if not content: return FileBlock(file_path=str(path), block_content="") + if self.is_large_file(content=content): + return FileBlock( + file_path=str(path), + block_content="The file is too large to read. Use `Editor.similarity_search` to read the file instead.", + ) self.resource.report(str(path), "path") lines = content.splitlines(keepends=True) @@ -959,3 +965,10 @@ class Editor(BaseModel): >>> print(texts) """ return await IndexRepo.cross_repo_search(query=query, file_or_path=file_or_path) + + @staticmethod + def is_large_file(content: str, mix_token_count: int = 0) -> bool: + encoding = tiktoken.get_encoding("cl100k_base") + token_count = len(encoding.encode(content)) + mix_token_count = mix_token_count or DEFAULT_MIN_TOKEN_COUNT + return token_count >= mix_token_count diff --git a/tests/metagpt/tools/libs/test_editor.py b/tests/metagpt/tools/libs/test_editor.py index 9a6e4b62f..40b6d1426 100644 --- a/tests/metagpt/tools/libs/test_editor.py +++ b/tests/metagpt/tools/libs/test_editor.py @@ -737,5 +737,13 @@ async def test_similarity_search(query, filename): save_to.unlink(missing_ok=True) +@pytest.mark.skip +@pytest.mark.asyncio +async def test_read(): + editor = Editor() + content = await editor.read(str(TEST_DATA_PATH / "pdf/9112674.pdf")) + assert "similarity_search" in content.block_content + + if __name__ == "__main__": pytest.main([__file__, "-s"])