代码优化

2026-07-17 16:41:05 +02:00 · 2024-07-23 10:26:31 +08:00 · 2024-07-23 10:26:31 +08:00 · 6c39c80d94
commit 6c39c80d94
parent f9d3a8c521
3 changed files with 2 additions and 2 deletions
--- a/examples/rag/omniparse.py
+++ b/examples/rag/omniparse.py
@@ -11,7 +11,6 @@ TEST_DOCX = EXAMPLE_DATA_PATH / "omniparse/test01.docx"
 TEST_PDF = EXAMPLE_DATA_PATH / "omniparse/test02.pdf"
 TEST_VIDEO = EXAMPLE_DATA_PATH / "omniparse/test03.mp4"
 TEST_AUDIO = EXAMPLE_DATA_PATH / "omniparse/test04.mp3"
-TEST_WEBSITE_URL = "https://github.com/geekan/MetaGPT"


 async def omniparse_client_example():
--- a/metagpt/rag/engines/simple.py
+++ b/metagpt/rag/engines/simple.py
@@ -315,7 +315,7 @@ class SimpleEngine(RetrieverQueryEngine):
    def _get_file_extractor() -> dict[str:BaseReader]:
        """
        Get the file extractor.
-        Currently, only PDF use OmniParse
+        Currently, only PDF use OmniParse. Other document types use the built-in reader from llama_index.

        Returns:
            dict[file_type: BaseReader]
--- a/metagpt/utils/omniparse_client.py
+++ b/metagpt/utils/omniparse_client.py
@@ -122,6 +122,7 @@ class OmniParseClient:
            OmniParsedResult: The result of the pdf parsing.
        """
        self.verify_file_ext(file_input, {".pdf"})
+        # parse_pdf supports parsing by accepting only the byte data of the file.
        file_info = await self.get_file_info(file_input, only_bytes=True)
        endpoint = f"{self.parse_document_endpoint}/pdf"
        resp = await self._request_parse(endpoint=endpoint, files={"file": file_info})