mg集成omniparse

2026-06-14 15:25:17 +02:00 · 2024-07-18 20:40:20 +08:00 · 2024-07-18 20:40:20 +08:00 · 22b9990ccf
commit 22b9990ccf
parent 39eb534ca0
14 changed files with 381 additions and 14 deletions
--- a/examples/rag/omniparse_client.py
+++ b/examples/rag/omniparse_client.py
@ -0,0 +1,45 @@
+import asyncio
+
+from llama_parse import ResultType
+
+from metagpt.config2 import config
+from metagpt.logs import logger
+from metagpt.rag.parser.omniparse.client import OmniParseClient
+from metagpt.rag.parser.omniparse.parse import OmniParse
+from metagpt.rag.schema import OmniParseOptions, OmniParseType
+
+
+async def omniparse_client_example():
+    """Parse a DOCX and a PDF through ``OmniParseClient`` and log both results.
+
+    The DOCX is sent as raw bytes together with an explicit ``bytes_filename``;
+    the PDF is sent as a file path.
+    """
+    from pathlib import Path
+
+    # Resolve the sample files relative to this script instead of the current
+    # working directory, so the example runs no matter where it is launched from.
+    data_dir = Path(__file__).resolve().parent.parent / "data" / "rag"
+
+    client = OmniParseClient(base_url=config.omniparse.base_url)
+
+    # Raw bytes have no file name of their own, so one is passed alongside them
+    # (presumably so the service can tell the document type -- confirm in the client).
+    docx_bytes = (data_dir / "test01.docx").read_bytes()
+    parse_document_ret = await client.parse_document(filelike=docx_bytes, bytes_filename="test_01.docx")
+    logger.info(parse_document_ret)
+
+    parse_pdf_ret = await client.parse_pdf(filelike=str(data_dir / "test02.pdf"))
+    logger.info(parse_pdf_ret)
+
+
+async def omniparse_example():
+    """Load documents through the ``OmniParse`` parser and log the results.
+
+    First a single PDF is loaded with the parser configured for PDF input and
+    Markdown output; then the parser's ``parse_type`` is switched to
+    ``DOCUMENT`` and a list of two files is loaded in one call.
+    """
+    from pathlib import Path
+
+    # Resolve the sample files relative to this script instead of the current
+    # working directory, so the example runs no matter where it is launched from.
+    data_dir = Path(__file__).resolve().parent.parent / "data" / "rag"
+    docx_path = str(data_dir / "test01.docx")
+    pdf_path = str(data_dir / "test02.pdf")
+
+    parser = OmniParse(
+        api_key=config.omniparse.api_key,
+        base_url=config.omniparse.base_url,
+        parse_options=OmniParseOptions(parse_type=OmniParseType.PDF, result_type=ResultType.MD),
+    )
+    ret = await parser.aload_data(file_path=pdf_path)
+    logger.info(ret)
+
+    # Reuse the same parser for a batch of files, this time as generic documents.
+    parser.parse_type = OmniParseType.DOCUMENT
+    ret = await parser.aload_data(file_path=[docx_path, pdf_path])
+    logger.info(ret)
+
+
+async def main():
+    """Run the client example and then the parser example, one after the other."""
+    for example in (omniparse_client_example, omniparse_example):
+        await example()
+
+
+# Script entry point: only run the examples when executed directly, not on import.
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/examples/rag/rag_pipeline.py
+++ b/examples/rag/rag_pipeline.py
@ -0,0 +1,260 @@
+"""RAG pipeline"""
+
+import asyncio
+
+from pydantic import BaseModel
+
+from metagpt.const import DATA_PATH, EXAMPLE_DATA_PATH
+from metagpt.logs import logger
+from metagpt.rag.engines import SimpleEngine
+from metagpt.rag.schema import (
+    ChromaIndexConfig,
+    ChromaRetrieverConfig,
+    ElasticsearchIndexConfig,
+    ElasticsearchRetrieverConfig,
+    ElasticsearchStoreConfig,
+    FAISSRetrieverConfig,
+    LLMRankerConfig,
+)
+from metagpt.utils.exceptions import handle_exception
+
+# Instruction appended to each question defined below.
+LLM_TIP = "If you not sure, just answer I don't know."
+
+# Document indexed by RAGExample's lazily built engine, and the default question asked about it.
+DOC_PATH = EXAMPLE_DATA_PATH / "rag/writer.txt"
+QUESTION = f"What are key qualities to be a good writer? {LLM_TIP}"
+
+# Document and question used by the add-docs, ChromaDB and Elasticsearch examples.
+TRAVEL_DOC_PATH = EXAMPLE_DATA_PATH / "rag/travel.txt"
+TRAVEL_QUESTION = f"What does Bob like? {LLM_TIP}"
+
+
+class Player(BaseModel):
+    """Sample object used to demonstrate adding objects to a RAG engine."""
+
+    name: str = ""
+    goal: str = "Win The 100-meter Sprint."
+    tool: str = "Red Bull Energy Drink."
+
+    def rag_key(self) -> str:
+        """Return the text used for search, which is this player's ``goal``."""
+        search_text = self.goal
+        return search_text
+
+
+class RAGExample:
+    """Show how to use RAG.
+
+    Each public coroutine is one self-contained demonstration that logs its
+    output. The engine is built lazily on first access unless one is injected.
+    """
+
+    def __init__(self, engine: SimpleEngine = None, use_llm_ranker: bool = True):
+        """Store the optional pre-built engine and the ranker preference.
+
+        Args:
+            engine: Engine to use; when None, one is created on first access of ``engine``.
+            use_llm_ranker: Whether the lazily created engine gets an ``LLMRankerConfig``.
+        """
+        self._engine = engine
+        self._use_llm_ranker = use_llm_ranker
+
+    @property
+    def engine(self):
+        """Return the engine, building it from ``DOC_PATH`` with a FAISS retriever if not set yet."""
+        # Compare against None explicitly so that an injected engine is never
+        # replaced just because it happens to evaluate as falsy.
+        if self._engine is None:
+            ranker_configs = [LLMRankerConfig()] if self._use_llm_ranker else None
+
+            self._engine = SimpleEngine.from_docs(
+                input_files=[DOC_PATH],
+                retriever_configs=[FAISSRetrieverConfig()],
+                ranker_configs=ranker_configs,
+            )
+        return self._engine
+
+    @engine.setter
+    def engine(self, value: SimpleEngine):
+        """Replace the engine used by the examples."""
+        self._engine = value
+
+    @handle_exception
+    async def run_pipeline(self, question=QUESTION, print_title=True):
+        """This example run rag pipeline, use faiss retriever and llm ranker, will print something like:
+
+        Retrieve Result:
+        0. Productivi..., 10.0
+        1. I wrote cu..., 7.0
+        2. I highly r..., 5.0
+
+        Query Result:
+        Passion, adaptability, open-mindedness, creativity, discipline, and empathy are key qualities to be a good writer.
+
+        Args:
+            question: Question used for both the retrieve and the query step.
+            print_title: Whether to log the section title first.
+        """
+        if print_title:
+            self._print_title("Run Pipeline")
+
+        nodes = await self.engine.aretrieve(question)
+        self._print_retrieve_result(nodes)
+
+        answer = await self.engine.aquery(question)
+        self._print_query_result(answer)
+
+    @handle_exception
+    async def add_docs(self):
+        """This example show how to add docs.
+
+        Before add docs llm answer I don't know.
+        After add docs llm give the correct answer, will print something like:
+
+        [Before add docs]
+        Retrieve Result:
+
+        Query Result:
+        Empty Response
+
+        [After add docs]
+        Retrieve Result:
+        0. Bob like..., 10.0
+
+        Query Result:
+        Bob likes traveling.
+        """
+        self._print_title("Add Docs")
+
+        logger.info("[Before add docs]")
+        await self.run_pipeline(question=TRAVEL_QUESTION, print_title=False)
+
+        logger.info("[After add docs]")
+        self.engine.add_docs([TRAVEL_DOC_PATH])
+        await self.run_pipeline(question=TRAVEL_QUESTION, print_title=False)
+
+    @handle_exception
+    async def add_objects(self, print_title=True):
+        """This example show how to add objects.
+
+        Before add objects, engine retrieve nothing.
+        After add objects, engine give the correct answer, will print something like:
+
+        [Before add objs]
+        Retrieve Result:
+
+        [After add objs]
+        Retrieve Result:
+        0. 100m Sprin..., 10.0
+
+        [Object Detail]
+        Mike
+
+        Args:
+            print_title: Whether to log the section title first.
+        """
+        if print_title:
+            self._print_title("Add Objects")
+
+        player = Player(name="Mike")
+        question = player.rag_key()
+
+        logger.info("[Before add objs]")
+        await self._retrieve_and_print(question)
+
+        logger.info("[After add objs]")
+        self.engine.add_objs([player])
+
+        # Best-effort demo: an empty or malformed retrieve result is logged, not raised.
+        try:
+            nodes = await self._retrieve_and_print(question)
+
+            logger.info("[Object Detail]")
+            # The first retrieved node is read back as the stored object.
+            retrieved: Player = nodes[0].metadata["obj"]
+            logger.info(retrieved.name)
+        except Exception as e:
+            logger.error(f"nodes is empty, llm don't answer correctly, exception: {e}")
+
+    @handle_exception
+    async def init_objects(self):
+        """This example show how to from objs, will print something like:
+
+        Same as add_objects.
+        """
+        self._print_title("Init Objects")
+
+        pre_engine = self.engine
+        self.engine = SimpleEngine.from_objs(retriever_configs=[FAISSRetrieverConfig()])
+        try:
+            await self.add_objects(print_title=False)
+        finally:
+            # Always put the previous engine back, even if the demo step fails.
+            self.engine = pre_engine
+
+    @handle_exception
+    async def init_and_query_chromadb(self):
+        """This example show how to use chromadb. how to save and load index. will print something like:
+
+        Query Result:
+        Bob likes traveling.
+        """
+        self._print_title("Init And Query ChromaDB")
+
+        # 1. save index
+        output_dir = DATA_PATH / "rag"
+        SimpleEngine.from_docs(
+            input_files=[TRAVEL_DOC_PATH],
+            retriever_configs=[ChromaRetrieverConfig(persist_path=output_dir)],
+        )
+
+        # 2. load index
+        engine = SimpleEngine.from_index(index_config=ChromaIndexConfig(persist_path=output_dir))
+
+        # 3. query
+        answer = await engine.aquery(TRAVEL_QUESTION)
+        self._print_query_result(answer)
+
+    @handle_exception
+    async def init_and_query_es(self):
+        """This example show how to use es. how to save and load index. will print something like:
+
+        Query Result:
+        Bob likes traveling.
+        """
+        self._print_title("Init And Query Elasticsearch")
+
+        # 1. create es index and save docs
+        # The returned engine is not kept; step 2 loads a fresh one from the index.
+        store_config = ElasticsearchStoreConfig(index_name="travel", es_url="http://127.0.0.1:9200")
+        SimpleEngine.from_docs(
+            input_files=[TRAVEL_DOC_PATH],
+            retriever_configs=[ElasticsearchRetrieverConfig(store_config=store_config)],
+        )
+
+        # 2. load index
+        engine = SimpleEngine.from_index(index_config=ElasticsearchIndexConfig(store_config=store_config))
+
+        # 3. query
+        answer = await engine.aquery(TRAVEL_QUESTION)
+        self._print_query_result(answer)
+
+    @staticmethod
+    def _print_title(title):
+        """Log ``title`` framed by 30 ``#`` characters on each side."""
+        logger.info(f"{'#'*30} {title} {'#'*30}")
+
+    @staticmethod
+    def _print_retrieve_result(result):
+        """Print retrieve result: index, first 10 characters of the text, and score of each node."""
+        logger.info("Retrieve Result:")
+
+        for i, node in enumerate(result):
+            logger.info(f"{i}. {node.text[:10]}..., {node.score}")
+
+        logger.info("")
+
+    @staticmethod
+    def _print_query_result(result):
+        """Print query result."""
+        logger.info("Query Result:")
+
+        logger.info(f"{result}\n")
+
+    async def _retrieve_and_print(self, question):
+        """Retrieve nodes for ``question``, log them, and return them."""
+        nodes = await self.engine.aretrieve(question)
+        self._print_retrieve_result(nodes)
+        return nodes
+
+
+async def main():
+    """Run every RAG example in sequence, with the LLM ranker disabled.
+
+    Note:
+    1. If `use_llm_ranker` is True, then it will use LLM Reranker to get better result, but it is not always guaranteed that the output will be parseable for reranking,
+       prefer `gpt-4-turbo`, otherwise might encounter `IndexError: list index out of range` or `ValueError: invalid literal for int() with base 10`.
+    """
+    example = RAGExample(use_llm_ranker=False)
+
+    steps = (
+        example.run_pipeline,
+        example.add_docs,
+        example.add_objects,
+        example.init_objects,
+        example.init_and_query_chromadb,
+        example.init_and_query_es,
+    )
+    for step in steps:
+        await step()
+
+
+# Script entry point: only run the examples when executed directly, not on import.
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/examples/rag/rag_search.py
+++ b/examples/rag/rag_search.py
@ -0,0 +1,21 @@
+"""Agent with RAG search."""
+
+import asyncio
+
+from examples.rag.rag_pipeline import DOC_PATH, QUESTION
+from metagpt.logs import logger
+from metagpt.rag.engines import SimpleEngine
+from metagpt.roles import Sales
+
+
+async def search():
+    """Run a ``Sales`` role backed by an engine built from ``DOC_PATH`` on ``QUESTION`` and log the result."""
+    sales = Sales(profile="Sales", store=SimpleEngine.from_docs(input_files=[DOC_PATH]))
+    answer = await sales.run(QUESTION)
+    logger.info(answer)
+
+
+# Script entry point: only run the search when executed directly, not on import.
+if __name__ == "__main__":
+    asyncio.run(search())