rm

2026-07-14 16:32:16 +02:00 · 2024-10-11 16:48:08 +08:00 · 2024-10-11 16:48:08 +08:00 · a7efa27ce0
commit a7efa27ce0
parent f6cbf06e7b 5be77dfd73
55 changed files with 1009 additions and 72 deletions
--- a/metagpt/utils/omniparse_client.py
+++ b/metagpt/utils/omniparse_client.py
@ -0,0 +1,239 @@
+import mimetypes
+import os
+from pathlib import Path
+from typing import Union
+
+import httpx
+
+from metagpt.rag.schema import OmniParsedResult
+from metagpt.utils.common import aread_bin
+
+
+class OmniParseClient:
+    """
+    OmniParse Server Client
+    This client interacts with the OmniParse server to parse different types of media, documents.
+
+    OmniParse API Documentation: https://docs.cognitivelab.in/api
+
+    Attributes:
+        ALLOWED_DOCUMENT_EXTENSIONS (set): A set of supported document file extensions.
+        ALLOWED_AUDIO_EXTENSIONS (set): A set of supported audio file extensions.
+        ALLOWED_VIDEO_EXTENSIONS (set): A set of supported video file extensions.
+    """
+
+    ALLOWED_DOCUMENT_EXTENSIONS = {".pdf", ".ppt", ".pptx", ".doc", ".docx"}
+    ALLOWED_AUDIO_EXTENSIONS = {".mp3", ".wav", ".aac"}
+    ALLOWED_VIDEO_EXTENSIONS = {".mp4", ".mkv", ".avi", ".mov"}
+
+    def __init__(self, api_key: str = None, base_url: str = "http://localhost:8000", max_timeout: int = 120):
+        """
+        Args:
+            api_key: Default None, can be used for authentication later.
+            base_url: Base URL for the API.
+            max_timeout: Maximum request timeout in seconds.
+        """
+        self.api_key = api_key
+        self.base_url = base_url
+        self.max_timeout = max_timeout
+
+        self.parse_media_endpoint = "/parse_media"
+        self.parse_website_endpoint = "/parse_website"
+        self.parse_document_endpoint = "/parse_document"
+
+    async def _request_parse(
+        self,
+        endpoint: str,
+        method: str = "POST",
+        files: dict = None,
+        params: dict = None,
+        data: dict = None,
+        json: dict = None,
+        headers: dict = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Request OmniParse API to parse a document.
+
+        Args:
+            endpoint (str): API endpoint.
+            method (str, optional): HTTP method to use. Default is "POST".
+            files (dict, optional): Files to include in the request.
+            params (dict, optional): Query string parameters.
+            data (dict, optional): Form data to include in the request body.
+            json (dict, optional): JSON data to include in the request body.
+            headers (dict, optional): HTTP headers to include in the request.
+            **kwargs: Additional keyword arguments for httpx.AsyncClient.request()
+
+        Returns:
+            dict: JSON response data.
+        """
+        url = f"{self.base_url}{endpoint}"
+        method = method.upper()
+        headers = headers or {}
+        _headers = {"Authorization": f"Bearer {self.api_key}"} if self.api_key else {}
+        headers.update(**_headers)
+        async with httpx.AsyncClient() as client:
+            response = await client.request(
+                url=url,
+                method=method,
+                files=files,
+                params=params,
+                json=json,
+                data=data,
+                headers=headers,
+                timeout=self.max_timeout,
+                **kwargs,
+            )
+            response.raise_for_status()
+            return response.json()
+
+    async def parse_document(self, file_input: Union[str, bytes, Path], bytes_filename: str = None) -> OmniParsedResult:
+        """
+        Parse document-type data (supports ".pdf", ".ppt", ".pptx", ".doc", ".docx").
+
+        Args:
+            file_input: File path or file byte data.
+            bytes_filename: Filename for byte data, useful for determining MIME type for the HTTP request.
+
+        Raises:
+            ValueError: If the file extension is not allowed.
+
+        Returns:
+            OmniParsedResult: The result of the document parsing.
+        """
+        self.verify_file_ext(file_input, self.ALLOWED_DOCUMENT_EXTENSIONS, bytes_filename)
+        file_info = await self.get_file_info(file_input, bytes_filename)
+        resp = await self._request_parse(self.parse_document_endpoint, files={"file": file_info})
+        data = OmniParsedResult(**resp)
+        return data
+
+    async def parse_pdf(self, file_input: Union[str, bytes, Path]) -> OmniParsedResult:
+        """
+        Parse pdf document.
+
+        Args:
+            file_input: File path or file byte data.
+
+        Raises:
+            ValueError: If the file extension is not allowed.
+
+        Returns:
+            OmniParsedResult: The result of the pdf parsing.
+        """
+        self.verify_file_ext(file_input, {".pdf"})
+        # parse_pdf supports parsing by accepting only the byte data of the file.
+        file_info = await self.get_file_info(file_input, only_bytes=True)
+        endpoint = f"{self.parse_document_endpoint}/pdf"
+        resp = await self._request_parse(endpoint=endpoint, files={"file": file_info})
+        data = OmniParsedResult(**resp)
+        return data
+
+    async def parse_video(self, file_input: Union[str, bytes, Path], bytes_filename: str = None) -> dict:
+        """
+        Parse video-type data (supports ".mp4", ".mkv", ".avi", ".mov").
+
+        Args:
+            file_input: File path or file byte data.
+            bytes_filename: Filename for byte data, useful for determining MIME type for the HTTP request.
+
+        Raises:
+            ValueError: If the file extension is not allowed.
+
+        Returns:
+            dict: JSON response data.
+        """
+        self.verify_file_ext(file_input, self.ALLOWED_VIDEO_EXTENSIONS, bytes_filename)
+        file_info = await self.get_file_info(file_input, bytes_filename)
+        return await self._request_parse(f"{self.parse_media_endpoint}/video", files={"file": file_info})
+
+    async def parse_audio(self, file_input: Union[str, bytes, Path], bytes_filename: str = None) -> dict:
+        """
+        Parse audio-type data (supports ".mp3", ".wav", ".aac").
+
+        Args:
+            file_input: File path or file byte data.
+            bytes_filename: Filename for byte data, useful for determining MIME type for the HTTP request.
+
+        Raises:
+            ValueError: If the file extension is not allowed.
+
+        Returns:
+            dict: JSON response data.
+        """
+        self.verify_file_ext(file_input, self.ALLOWED_AUDIO_EXTENSIONS, bytes_filename)
+        file_info = await self.get_file_info(file_input, bytes_filename)
+        return await self._request_parse(f"{self.parse_media_endpoint}/audio", files={"file": file_info})
+
+    @staticmethod
+    def verify_file_ext(file_input: Union[str, bytes, Path], allowed_file_extensions: set, bytes_filename: str = None):
+        """
+        Verify the file extension.
+
+        Args:
+            file_input: File path or file byte data.
+            allowed_file_extensions: Set of allowed file extensions.
+            bytes_filename: Filename to use for verification when `file_input` is byte data.
+
+        Raises:
+            ValueError: If the file extension is not allowed.
+
+        Returns:
+        """
+        verify_file_path = None
+        if isinstance(file_input, (str, Path)):
+            verify_file_path = str(file_input)
+        elif isinstance(file_input, bytes) and bytes_filename:
+            verify_file_path = bytes_filename
+
+        if not verify_file_path:
+            # Do not verify if only byte data is provided
+            return
+
+        file_ext = os.path.splitext(verify_file_path)[1].lower()
+        if file_ext not in allowed_file_extensions:
+            raise ValueError(f"Not allowed {file_ext} File extension must be one of {allowed_file_extensions}")
+
+    @staticmethod
+    async def get_file_info(
+        file_input: Union[str, bytes, Path],
+        bytes_filename: str = None,
+        only_bytes: bool = False,
+    ) -> Union[bytes, tuple]:
+        """
+        Get file information.
+
+        Args:
+            file_input: File path or file byte data.
+            bytes_filename: Filename to use when uploading byte data, useful for determining MIME type.
+            only_bytes: Whether to return only byte data. Default is False, which returns a tuple.
+
+        Raises:
+            ValueError: If bytes_filename is not provided when file_input is bytes or if file_input is not a valid type.
+
+        Notes:
+            Since `parse_document`,`parse_video`, `parse_audio` supports parsing various file types,
+            the MIME type of the file must be specified when uploading.
+
+        Returns: [bytes, tuple]
+            Returns bytes if only_bytes is True, otherwise returns a tuple (filename, file_bytes, mime_type).
+        """
+        if isinstance(file_input, (str, Path)):
+            filename = os.path.basename(str(file_input))
+            file_bytes = await aread_bin(file_input)
+
+            if only_bytes:
+                return file_bytes
+
+            mime_type = mimetypes.guess_type(file_input)[0]
+            return filename, file_bytes, mime_type
+        elif isinstance(file_input, bytes):
+            if only_bytes:
+                return file_input
+            if not bytes_filename:
+                raise ValueError("bytes_filename must be set when passing bytes")
+
+            mime_type = mimetypes.guess_type(bytes_filename)[0]
+            return bytes_filename, file_input, mime_type
+        else:
+            raise ValueError("file_input must be a string (file path) or bytes.")
--- a/metagpt/utils/token_counter.py
+++ b/metagpt/utils/token_counter.py
@ -40,11 +40,20 @@ TOKEN_COSTS = {
    "gpt-4-vision-preview": {"prompt": 0.01, "completion": 0.03},  # TODO add extra image price calculator
    "gpt-4-1106-vision-preview": {"prompt": 0.01, "completion": 0.03},
    "gpt-4o": {"prompt": 0.005, "completion": 0.015},
+    "gpt-4o-mini": {"prompt": 0.00015, "completion": 0.0006},
+    "gpt-4o-mini-2024-07-18": {"prompt": 0.00015, "completion": 0.0006},
    "gpt-4o-2024-05-13": {"prompt": 0.005, "completion": 0.015},
+    "gpt-4o-2024-08-06": {"prompt": 0.0025, "completion": 0.01},
+    "o1-preview": {"prompt": 0.015, "completion": 0.06},
+    "o1-preview-2024-09-12": {"prompt": 0.015, "completion": 0.06},
+    "o1-mini": {"prompt": 0.003, "completion": 0.012},
+    "o1-mini-2024-09-12": {"prompt": 0.003, "completion": 0.012},
    "text-embedding-ada-002": {"prompt": 0.0004, "completion": 0.0},
    "glm-3-turbo": {"prompt": 0.0007, "completion": 0.0007},  # 128k version, prompt + completion tokens=0.005￥/k-tokens
    "glm-4": {"prompt": 0.014, "completion": 0.014},  # 128k version, prompt + completion tokens=0.1￥/k-tokens
-    "gemini-pro": {"prompt": 0.00025, "completion": 0.0005},
+    "gemini-1.5-flash": {"prompt": 0.000075, "completion": 0.0003},
+    "gemini-1.5-pro": {"prompt": 0.0035, "completion": 0.0105},
+    "gemini-1.0-pro": {"prompt": 0.0005, "completion": 0.0015},
    "moonshot-v1-8k": {"prompt": 0.012, "completion": 0.012},  # prompt + completion tokens=0.012￥/k-tokens
    "moonshot-v1-32k": {"prompt": 0.024, "completion": 0.024},
    "moonshot-v1-128k": {"prompt": 0.06, "completion": 0.06},
@ -68,15 +77,20 @@ TOKEN_COSTS = {
    "llama3-70b-8192": {"prompt": 0.0059, "completion": 0.0079},
    "openai/gpt-3.5-turbo-0125": {"prompt": 0.0005, "completion": 0.0015},
    "openai/gpt-4-turbo-preview": {"prompt": 0.01, "completion": 0.03},
+    "openai/o1-preview": {"prompt": 0.015, "completion": 0.06},
+    "openai/o1-mini": {"prompt": 0.003, "completion": 0.012},
+    "anthropic/claude-3-opus": {"prompt": 0.015, "completion": 0.075},
+    "anthropic/claude-3.5-sonnet": {"prompt": 0.003, "completion": 0.015},
+    "google/gemini-pro-1.5": {"prompt": 0.0025, "completion": 0.0075},  # for openrouter, end
    "deepseek-chat": {"prompt": 0.00014, "completion": 0.00028},
    "deepseek-coder": {"prompt": 0.00014, "completion": 0.00028},
    # For ark model https://www.volcengine.com/docs/82379/1099320
-    "doubao-lite-4k-240515": {"prompt": 0.000042, "completion": 0.000084},
-    "doubao-lite-32k-240515": {"prompt": 0.000042, "completion": 0.000084},
-    "doubao-lite-128k-240515": {"prompt": 0.00011, "completion": 0.00013},
-    "doubao-pro-4k-240515": {"prompt": 0.00011, "completion": 0.00028},
-    "doubao-pro-32k-240515": {"prompt": 0.00011, "completion": 0.00028},
-    "doubao-pro-128k-240515": {"prompt": 0.0007, "completion": 0.0012},
+    "doubao-lite-4k-240515": {"prompt": 0.000043, "completion": 0.000086},
+    "doubao-lite-32k-240515": {"prompt": 0.000043, "completion": 0.000086},
+    "doubao-lite-128k-240515": {"prompt": 0.00011, "completion": 0.00014},
+    "doubao-pro-4k-240515": {"prompt": 0.00011, "completion": 0.00029},
+    "doubao-pro-32k-240515": {"prompt": 0.00011, "completion": 0.00029},
+    "doubao-pro-128k-240515": {"prompt": 0.0007, "completion": 0.0013},
    "llama3-70b-llama3-70b-instruct": {"prompt": 0.0, "completion": 0.0},
    "llama3-8b-llama3-8b-instruct": {"prompt": 0.0, "completion": 0.0},
 }
@ -137,8 +151,17 @@ QIANFAN_ENDPOINT_TOKEN_COSTS = {
 """
 DashScope Token price https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-thousand-questions-metering-and-billing
 Different model has different detail page. Attention, some model are free for a limited time.
+Some new model published by Alibaba will be prioritized to be released on the Model Studio instead of the Dashscope.
+Token price on Model Studio shows on https://help.aliyun.com/zh/model-studio/getting-started/models#ced16cb6cdfsy
 """
 DASHSCOPE_TOKEN_COSTS = {
+    "qwen2.5-72b-instruct": {"prompt": 0.00057, "completion": 0.0017},  # per 1k tokens
+    "qwen2.5-32b-instruct": {"prompt": 0.0005, "completion": 0.001},
+    "qwen2.5-14b-instruct": {"prompt": 0.00029, "completion": 0.00086},
+    "qwen2.5-7b-instruct": {"prompt": 0.00014, "completion": 0.00029},
+    "qwen2.5-3b-instruct": {"prompt": 0.0, "completion": 0.0},
+    "qwen2.5-1.5b-instruct": {"prompt": 0.0, "completion": 0.0},
+    "qwen2.5-0.5b-instruct": {"prompt": 0.0, "completion": 0.0},
    "qwen2-72b-instruct": {"prompt": 0.000714, "completion": 0.001428},
    "qwen2-57b-a14b-instruct": {"prompt": 0.0005, "completion": 0.001},
    "qwen2-7b-instruct": {"prompt": 0.000143, "completion": 0.000286},
@ -187,10 +210,26 @@ FIREWORKS_GRADE_TOKEN_COSTS = {
    "mixtral-8x7b": {"prompt": 0.4, "completion": 1.6},
 }

+# https://console.volcengine.com/ark/region:ark+cn-beijing/model
+DOUBAO_TOKEN_COSTS = {
+    "doubao-lite": {"prompt": 0.000043, "completion": 0.000086},
+    "doubao-lite-128k": {"prompt": 0.00011, "completion": 0.00014},
+    "doubao-pro": {"prompt": 0.00011, "completion": 0.00029},
+    "doubao-pro-128k": {"prompt": 0.00071, "completion": 0.0013},
+    "doubao-pro-256k": {"prompt": 0.00071, "completion": 0.0013},
+}
+
 # https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
 TOKEN_MAX = {
-    "gpt-4o-2024-05-13": 128000,
+    "o1-preview": 128000,
+    "o1-preview-2024-09-12": 128000,
+    "o1-mini": 128000,
+    "o1-mini-2024-09-12": 128000,
    "gpt-4o": 128000,
+    "gpt-4o-2024-05-13": 128000,
+    "gpt-4o-2024-08-06": 128000,
+    "gpt-4o-mini-2024-07-18": 128000,
+    "gpt-4o-mini": 128000,
    "gpt-4-turbo-2024-04-09": 128000,
    "gpt-4-0125-preview": 128000,
    "gpt-4-turbo-preview": 128000,
@ -212,7 +251,9 @@ TOKEN_MAX = {
    "text-embedding-ada-002": 8192,
    "glm-3-turbo": 128000,
    "glm-4": 128000,
-    "gemini-pro": 32768,
+    "gemini-1.5-flash": 1000000,
+    "gemini-1.5-pro": 2000000,
+    "gemini-1.0-pro": 32000,
    "moonshot-v1-8k": 8192,
    "moonshot-v1-32k": 32768,
    "moonshot-v1-128k": 128000,
@ -236,6 +277,11 @@ TOKEN_MAX = {
    "llama3-70b-8192": 8192,
    "openai/gpt-3.5-turbo-0125": 16385,
    "openai/gpt-4-turbo-preview": 128000,
+    "openai/o1-preview": 128000,
+    "openai/o1-mini": 128000,
+    "anthropic/claude-3-opus": 200000,
+    "anthropic/claude-3.5-sonnet": 200000,
+    "google/gemini-pro-1.5": 4000000,
    "deepseek-chat": 32768,
    "deepseek-coder": 16385,
    "doubao-lite-4k-240515": 4000,
@ -245,6 +291,13 @@ TOKEN_MAX = {
    "doubao-pro-32k-240515": 32000,
    "doubao-pro-128k-240515": 128000,
    # Qwen https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-7b-14b-72b-api-detailes?spm=a2c4g.11186623.0.i20
+    "qwen2.5-72b-instruct": 131072,
+    "qwen2.5-32b-instruct": 131072,
+    "qwen2.5-14b-instruct": 131072,
+    "qwen2.5-7b-instruct": 131072,
+    "qwen2.5-3b-instruct": 32768,
+    "qwen2.5-1.5b-instruct": 32768,
+    "qwen2.5-0.5b-instruct": 32768,
    "qwen2-57b-a14b-instruct": 32768,
    "qwen2-72b-instruct": 131072,
    "qwen2-7b-instruct": 32768,
@ -344,11 +397,19 @@ def count_input_tokens(messages, model="gpt-3.5-turbo-0125"):
        "gpt-4-turbo",
        "gpt-4-turbo-preview",
        "gpt-4-0125-preview",
+        "gpt-4-1106-preview",
        "gpt-4-turbo",
        "gpt-4-vision-preview",
        "gpt-4-1106-vision-preview",
-        "gpt-4o-2024-05-13",
        "gpt-4o",
+        "gpt-4o-2024-05-13",
+        "gpt-4o-2024-08-06",
+        "gpt-4o-mini",
+        "gpt-4o-mini-2024-07-18",
+        "o1-preview",
+        "o1-preview-2024-09-12",
+        "o1-mini",
+        "o1-mini-2024-09-12",
    }:
        tokens_per_message = 3  # # every reply is primed with <|start|>assistant<|message|>
        tokens_per_name = 1