From 6c39c80d94467a7aec1331b2163837178eb1fb8a Mon Sep 17 00:00:00 2001 From: liuminhui Date: Tue, 23 Jul 2024 10:26:31 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/rag/omniparse.py | 1 - metagpt/rag/engines/simple.py | 2 +- metagpt/utils/omniparse_client.py | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/rag/omniparse.py b/examples/rag/omniparse.py index af8207c5a..b9159dae5 100644 --- a/examples/rag/omniparse.py +++ b/examples/rag/omniparse.py @@ -11,7 +11,6 @@ TEST_DOCX = EXAMPLE_DATA_PATH / "omniparse/test01.docx" TEST_PDF = EXAMPLE_DATA_PATH / "omniparse/test02.pdf" TEST_VIDEO = EXAMPLE_DATA_PATH / "omniparse/test03.mp4" TEST_AUDIO = EXAMPLE_DATA_PATH / "omniparse/test04.mp3" -TEST_WEBSITE_URL = "https://github.com/geekan/MetaGPT" async def omniparse_client_example(): diff --git a/metagpt/rag/engines/simple.py b/metagpt/rag/engines/simple.py index e015b7b7f..a03e0149c 100644 --- a/metagpt/rag/engines/simple.py +++ b/metagpt/rag/engines/simple.py @@ -315,7 +315,7 @@ class SimpleEngine(RetrieverQueryEngine): def _get_file_extractor() -> dict[str:BaseReader]: """ Get the file extractor. - Currently, only PDF use OmniParse + Currently, only PDF uses OmniParse. Other document types use the built-in readers from llama_index. Returns: dict[file_type: BaseReader] diff --git a/metagpt/utils/omniparse_client.py b/metagpt/utils/omniparse_client.py index 12c5ac392..e7c5a3d44 100644 --- a/metagpt/utils/omniparse_client.py +++ b/metagpt/utils/omniparse_client.py @@ -122,6 +122,7 @@ class OmniParseClient: OmniParsedResult: The result of the pdf parsing. """ self.verify_file_ext(file_input, {".pdf"}) + # parse_pdf accepts only the byte data of the file, so request bytes only. 
file_info = await self.get_file_info(file_input, only_bytes=True) endpoint = f"{self.parse_document_endpoint}/pdf" resp = await self._request_parse(endpoint=endpoint, files={"file": file_info})