From 6c39c80d94467a7aec1331b2163837178eb1fb8a Mon Sep 17 00:00:00 2001
From: liuminhui <huidbk@163.com>
Date: Tue, 23 Jul 2024 10:26:31 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BC=98=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/rag/omniparse.py         | 1 -
 metagpt/rag/engines/simple.py     | 2 +-
 metagpt/utils/omniparse_client.py | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/rag/omniparse.py b/examples/rag/omniparse.py
index af8207c5a..b9159dae5 100644
--- a/examples/rag/omniparse.py
+++ b/examples/rag/omniparse.py
@@ -11,7 +11,6 @@ TEST_DOCX = EXAMPLE_DATA_PATH / "omniparse/test01.docx"
 TEST_PDF = EXAMPLE_DATA_PATH / "omniparse/test02.pdf"
 TEST_VIDEO = EXAMPLE_DATA_PATH / "omniparse/test03.mp4"
 TEST_AUDIO = EXAMPLE_DATA_PATH / "omniparse/test04.mp3"
-TEST_WEBSITE_URL = "https://github.com/geekan/MetaGPT"
 
 
 async def omniparse_client_example():
diff --git a/metagpt/rag/engines/simple.py b/metagpt/rag/engines/simple.py
index e015b7b7f..a03e0149c 100644
--- a/metagpt/rag/engines/simple.py
+++ b/metagpt/rag/engines/simple.py
@@ -315,7 +315,7 @@ class SimpleEngine(RetrieverQueryEngine):
     def _get_file_extractor() -> dict[str:BaseReader]:
         """
         Get the file extractor.
-        Currently, only PDF use OmniParse
+        Currently, only PDF files are parsed with OmniParse; other document types use llama_index's built-in readers.
 
         Returns:
             dict[file_type: BaseReader]
diff --git a/metagpt/utils/omniparse_client.py b/metagpt/utils/omniparse_client.py
index 12c5ac392..e7c5a3d44 100644
--- a/metagpt/utils/omniparse_client.py
+++ b/metagpt/utils/omniparse_client.py
@@ -122,6 +122,7 @@ class OmniParseClient:
             OmniParsedResult: The result of the pdf parsing.
         """
         self.verify_file_ext(file_input, {".pdf"})
+        # parse_pdf only accepts the file's byte data, so request the bytes only.
         file_info = await self.get_file_info(file_input, only_bytes=True)
         endpoint = f"{self.parse_document_endpoint}/pdf"
         resp = await self._request_parse(endpoint=endpoint, files={"file": file_info})