Merge branch 'fixbug/index_max_token' into 'mgx_ops'

feat: IndexRepo + config

See merge request pub/MetaGPT!376
This commit is contained in:
林义章 2024-09-25 07:14:55 +00:00
commit a9ec57dbbb
8 changed files with 185 additions and 34 deletions

View file

@ -0,0 +1,21 @@
import pytest
from metagpt.const import TEST_DATA_PATH
from metagpt.roles.di.data_analyst import DataAnalyst
@pytest.mark.skip
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("query", "filename"),
    [
        (
            "similarity search about '有哪些需求描述?' in document ",
            TEST_DATA_PATH / "requirements/2.pdf",
        ),
    ],
)
async def test_similarity_search(query, filename):
    """Send a similarity-search request naming a document to a DataAnalyst.

    The document path is appended to the query wrapped in single quotes, and
    the test passes when `DataAnalyst.run` returns a truthy response.
    """
    analyst = DataAnalyst()
    # Build the final prompt: query text immediately followed by the quoted path.
    prompt = query + f"'{filename!s}'"
    response = await analyst.run(prompt)
    assert response
if __name__ == "__main__":
    # Allow running this test module directly; "-s" turns off pytest's output capture.
    pytest.main(args=[__file__, "-s"])

View file

@ -9,8 +9,8 @@ from metagpt.tools.libs.editor import Editor
from metagpt.tools.libs.index_repo import (
CHATS_INDEX_ROOT,
CHATS_ROOT,
DEFAULT_MIN_TOKEN_COUNT,
UPLOAD_ROOT,
UPLOADS_INDEX_ROOT,
IndexRepo,
)
from metagpt.utils.common import list_files
@ -756,8 +756,6 @@ async def mock_index_repo():
os.system(command)
filenames = list_files(UPLOAD_ROOT)
uploads_files = [i for i in filenames if Path(i).suffix in {".md", ".txt", ".json", ".pdf"}]
uploads_repo = IndexRepo(persist_path=UPLOADS_INDEX_ROOT, root_path=UPLOAD_ROOT, min_token_count=0)
await uploads_repo.add(uploads_files)
assert uploads_files
filenames = list_files(src_path)
@ -771,19 +769,63 @@ async def mock_index_repo():
@pytest.mark.asyncio
async def test_index_repo():
    """Check `Editor.similarity_search` on the three locations prepared by `mock_index_repo`.

    The chat, upload and source paths returned by `mock_index_repo` are each
    queried and must yield a truthy response. `CHATS_ROOT` and `UPLOAD_ROOT`
    are removed afterwards, including when an assertion fails.
    """
    # mock data
    # The upload location gets its own local name so the imported UPLOAD_ROOT
    # constant is not shadowed and the cleanup below removes the real root.
    chat_path, upload_path, src_path = await mock_index_repo()

    editor = Editor()
    try:
        for target in (chat_path, upload_path, src_path):
            rsp = await editor.similarity_search(query="业务线", file_or_path=target)
            assert rsp, f"similarity_search returned nothing for {target}"
    finally:
        # Always drop the mock data so a failed run does not leak into the next one.
        shutil.rmtree(CHATS_ROOT)
        shutil.rmtree(UPLOAD_ROOT)
@pytest.mark.skip
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("query", "filename"),
    [
        (
            "In this document, who are the legal representatives of both parties?",
            TEST_DATA_PATH / "pdf/20210709逗你学云豆付费课程协议.pdf",
        ),
        (
            "What is the short name of the company in this document?",
            TEST_DATA_PATH / "pdf/company_stock_code.pdf",
        ),
        ("平安创新推出中国版的什么模式,将差异化的医疗健康服务与作为支付方的金融业务无缝结合", TEST_DATA_PATH / "pdf/9112674.pdf"),
        (
            "What principle is introduced by the author to explain the conditions necessary for the emergence of complexity?",
            TEST_DATA_PATH / "pdf/9781444323498.ch2_1.pdf",
        ),
        ("行高的继承性的代码示例是?", TEST_DATA_PATH / "pdf/02-CSS.pdf"),
    ],
)
async def test_similarity_search(query, filename):
    """Copy a PDF fixture under `UPLOAD_ROOT` and query it with `Editor.similarity_search`.

    Args:
        query: Question to search for in the document.
        filename: Path of the source PDF fixture.

    The copied file is deleted afterwards, including when the assertion fails.
    """
    filename = Path(filename)
    save_to = Path(UPLOAD_ROOT) / filename.name
    save_to.parent.mkdir(parents=True, exist_ok=True)
    # Copy through the standard library rather than a shell `cp` string: paths
    # containing spaces or shell metacharacters are handled correctly, and a
    # failed copy raises here instead of being silently ignored.
    shutil.copy(filename, save_to)
    try:
        editor = Editor()
        rsp = await editor.similarity_search(query=query, file_or_path=save_to)
        assert rsp
    finally:
        save_to.unlink(missing_ok=True)
@pytest.mark.skip
@pytest.mark.asyncio
async def test_read():
    """Read a PDF fixture through `Editor.read` and check the returned block content.

    Passes when the block content contains the text "similarity_search" and the
    fixture's size in bytes is greater than 5 * DEFAULT_MIN_TOKEN_COUNT.
    """
    reader = Editor()
    pdf_path = TEST_DATA_PATH / "pdf/9112674.pdf"
    content = await reader.read(str(pdf_path))
    byte_size = pdf_path.stat().st_size

    mentions_search = "similarity_search" in content.block_content
    exceeds_threshold = byte_size > 5 * DEFAULT_MIN_TOKEN_COUNT
    assert mentions_search and exceeds_threshold
if __name__ == "__main__":
    # Allow running this test module directly; "-s" turns off pytest's output capture.
    pytest.main(args=[__file__, "-s"])