feat: Editor + read pdf/docx...

2026-06-23 15:48:11 +02:00 · 2024-08-07 20:13:11 +08:00 · 2024-08-07 20:13:11 +08:00 · dc578d8b0b
commit dc578d8b0b
parent fa06a67a64
15 changed files with 447 additions and 30 deletions
--- a/metagpt/actions/di/execute_nb_code.py
+++ b/metagpt/actions/di/execute_nb_code.py
@ -191,7 +191,7 @@ class ExecuteNbCode(Action):
                output_text = remove_log_and_warning_lines(output_text)
            # The useful information of the exception is at the end,
            # the useful information of normal output is at the begining.
-            if '<!DOCTYPE html>' not in output_text:
+            if "<!DOCTYPE html>" not in output_text:
                output_text = output_text[:keep_len] if is_success else output_text[-keep_len:]

            parsed_output.append(output_text)
@ -286,11 +286,7 @@ class ExecuteNbCode(Action):
 def remove_log_and_warning_lines(input_str: str) -> str:
    delete_lines = ["[warning]", "warning:", "[cv]", "[info]"]
    result = "\n".join(
-        [
-            line
-            for line in input_str.split("\n")
-            if not any(dl in line.lower() for dl in delete_lines)
-        ]
+        [line for line in input_str.split("\n") if not any(dl in line.lower() for dl in delete_lines)]
    ).strip()
    return result

--- a/metagpt/rag/schema.py
+++ b/metagpt/rag/schema.py
@ -1,7 +1,7 @@
 """RAG schemas."""
-
+from enum import Enum
 from pathlib import Path
-from typing import Any, ClassVar, Literal, Optional, Union
+from typing import Any, ClassVar, List, Literal, Optional, Union

 from chromadb.api.types import CollectionMetadata
 from llama_index.core.embeddings import BaseEmbedding
@ -12,6 +12,7 @@ from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator

 from metagpt.config2 import config
 from metagpt.configs.embedding_config import EmbeddingType
+from metagpt.logs import logger
 from metagpt.rag.interface import RAGObject


@ -44,7 +45,13 @@ class FAISSRetrieverConfig(IndexRetrieverConfig):
    @model_validator(mode="after")
    def check_dimensions(self):
        if self.dimensions == 0:
-            self.dimensions = self._embedding_type_to_dimensions.get(config.embedding.api_type, 1536)
+            self.dimensions = config.embedding.dimensions or self._embedding_type_to_dimensions.get(
+                config.embedding.api_type, 1536
+            )
+            if not config.embedding.dimensions and config.embedding.api_type not in self._embedding_type_to_dimensions:
+                logger.warning(
+                    f"You didn't set dimensions in config when using {config.embedding.api_type}, default to 1536"
+                )

        return self

@ -207,3 +214,51 @@ class ObjectNode(TextNode):
        )

        return metadata.model_dump()
+
+
+class OmniParseType(str, Enum):
+    """Values accepted for the OmniParse `parse_type` option."""
+
+    # Members also subclass `str`, so each one compares equal to its literal value.
+    PDF = "PDF"
+    DOCUMENT = "DOCUMENT"
+
+
+class ParseResultType(str, Enum):
+    """The result type for the parser."""
+
+    # Member names are abbreviations; the string values are the spelled-out format names.
+    TXT = "text"
+    MD = "markdown"
+    JSON = "json"
+
+class OmniParseOptions(BaseModel):
+    """Options for an OmniParse request: result format, parse type, timeout and concurrency."""
+
+    # Requested result format; markdown unless overridden.
+    result_type: ParseResultType = Field(default=ParseResultType.MD, description="OmniParse result_type")
+    # Requested parse type; DOCUMENT unless overridden.
+    parse_type: OmniParseType = Field(default=OmniParseType.DOCUMENT, description="OmniParse parse_type")
+    # Defaults to 120 (presumably seconds -- TODO confirm against the client); None is also accepted.
+    max_timeout: Optional[int] = Field(default=120, description="Maximum timeout for OmniParse service requests")
+    # The exclusive bounds gt=0 / lt=10 restrict this to the range 1..9.
+    num_workers: int = Field(
+        default=5,
+        gt=0,
+        lt=10,
+        description="Number of concurrent requests for multiple files",
+    )
+
+
+class OminParseImage(BaseModel):
+    """One image entry of an `OmniParsedResult` (see its `images` field)."""
+
+    # Image payload as a string; the editor tool base64-decodes it before writing it to disk.
+    image: str = Field(default="", description="image str bytes")
+    # Name of the image; the editor tool uses it as the file name of the saved image.
+    image_name: str = Field(default="", description="image name")
+    # Free-form extra information about the image; None is also accepted.
+    image_info: Optional[dict] = Field(default={}, description="image info")
+
+
+class OmniParsedResult(BaseModel):
+    """Parsed document returned by OmniParse: markdown, plain text, images and metadata."""
+
+    markdown: str = Field(default="", description="markdown text")
+    text: str = Field(default="", description="plain text")
+    images: Optional[List[OminParseImage]] = Field(default=[], description="images")
+    metadata: Optional[dict] = Field(default={}, description="metadata")
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_markdown(cls, values):
+        """Fall back to `text` when `markdown` is missing or empty.
+
+        Runs before field validation. Non-dict inputs are passed through untouched
+        so that pydantic reports them itself.
+        """
+        if isinstance(values, dict) and not values.get("markdown"):
+            # Build a new dict rather than mutating the caller's input. Default to ""
+            # so that a payload without `text` does not inject None into the `str`
+            # field, which would make even `OmniParsedResult()` fail validation.
+            values = {**values, "markdown": values.get("text") or ""}
+        return values
--- a/metagpt/strategy/task_type.py
+++ b/metagpt/strategy/task_type.py
@ -8,7 +8,8 @@ from metagpt.prompts.task_type import (
    FEATURE_ENGINEERING_PROMPT,
    IMAGE2WEBPAGE_PROMPT,
    MODEL_EVALUATE_PROMPT,
-    MODEL_TRAIN_PROMPT, WEB_SCRAPING_PROMPT,
+    MODEL_TRAIN_PROMPT,
+    WEB_SCRAPING_PROMPT,
 )


--- a/metagpt/tools/libs/init.py
+++ b/metagpt/tools/libs/init.py
@ -17,7 +17,7 @@ from metagpt.tools.libs import (
    deployer,
    git,
 )
-from metagpt.tools.libs.env import get_env, set_get_env_entry, default_get_env, get_env_description
+from metagpt.tools.libs.env import get_env, set_get_env_entry, default_get_env, get_env_description, get_env_default

 _ = (
    data_preprocess,
@ -32,6 +32,7 @@ _ = (
    deployer,
    git,
    get_env,
+    get_env_default,
    get_env_description,
    set_get_env_entry,
    default_get_env,
--- a/metagpt/tools/libs/editor.py
+++ b/metagpt/tools/libs/editor.py
@ -1,11 +1,18 @@
+import base64
 import os
 import shutil
 import subprocess
+from pathlib import Path
+from typing import List, Optional, Union

 from pydantic import BaseModel

 from metagpt.const import DEFAULT_WORKSPACE_ROOT
+from metagpt.logs import logger
 from metagpt.tools.tool_registry import register_tool
+from metagpt.utils import read_docx
+from metagpt.utils.common import aread_bin, awrite_bin, run_coroutine_sync
+from metagpt.utils.repo_to_markdown import is_text_file
 from metagpt.utils.report import EditorReporter


@ -40,12 +47,26 @@ class Editor:

    def read(self, path: str) -> FileBlock:
        """Read the whole content of a file. Using absolute paths as the argument for specifying the file location."""
-        with open(path, "r") as f:
-            self.resource.report(path, "path")
-            lines = f.readlines()
+        is_text, mime_type = run_coroutine_sync(is_text_file, path)
+        if is_text:
+            lines = self._read_text(path)
+        elif mime_type == "application/pdf":
+            lines = self._read_pdf(path)
+        elif mime_type in {
+            "application/msword",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "application/vnd.ms-word.document.macroEnabled.12",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+            "application/vnd.ms-word.template.macroEnabled.12",
+        }:
+            lines = self._read_docx(path)
+        else:
+            return FileBlock(file_path=str(path), block_content="")
+        self.resource.report(str(path), "path")
+
        lines_with_num = [f"{i + 1:03}|{line}" for i, line in enumerate(lines)]
        result = FileBlock(
-            file_path=path,
+            file_path=str(path),
            block_content="".join(lines_with_num),
        )
        return result
@ -196,3 +217,63 @@ class Editor:
        lint_passed = result.returncode == 0
        lint_message = result.stdout
        return lint_passed, lint_message
+
+    @staticmethod
+    def _read_text(path: Union[str, Path]) -> List[str]:
+        """Return every line of the text file at `path`, line endings included."""
+        with open(str(path), "r") as handle:
+            return list(handle)
+
+    @staticmethod
+    def _read_pdf(path: Union[str, Path]) -> List[str]:
+        """Read a PDF, preferring the OmniParse result and falling back to llama-index's PDFReader."""
+        parsed = run_coroutine_sync(Editor._omniparse_read_file, path)
+        if not parsed:
+            # OmniParse returned nothing usable: read the PDF locally instead.
+            from llama_index.readers.file import PDFReader
+
+            documents = PDFReader().load_data(file=Path(path))
+            parsed = [doc.text for doc in documents]
+        return parsed
+
+    @staticmethod
+    def _read_docx(path: Union[str, Path]) -> List[str]:
+        """Read a Word document, preferring the OmniParse result and falling back to `read_docx`."""
+        parsed = run_coroutine_sync(Editor._omniparse_read_file, path)
+        # An empty or None OmniParse result triggers the local reader.
+        return parsed if parsed else read_docx(str(path))
+
+    @staticmethod
+    async def _omniparse_read_file(path: Union[str, Path]) -> Optional[List[str]]:
+        """Parse `path` through the OmniParse service when one is configured.
+
+        The `base_url`, `api_key` and `timeout` settings are looked up with
+        `get_env_default` under the `OmniParse` app name. Images in the result are
+        written to a sibling directory named after the file (dots replaced by
+        underscores, suffixed with `_images`) and referenced as markdown image links.
+
+        Args:
+            path: Location of the file to parse.
+
+        Returns:
+            None when no `base_url` is configured, when the request fails, or when
+            the result has neither text nor images. Otherwise a list whose first
+            item is the parsed text, followed by one markdown image link per saved image.
+        """
+        from metagpt.tools.libs import get_env_default
+        from metagpt.utils.omniparse_client import OmniParseClient
+
+        base_url = await get_env_default(key="base_url", app_name="OmniParse", default_value="")
+        if not base_url:
+            return None
+        api_key = await get_env_default(key="api_key", app_name="OmniParse", default_value="")
+        v = await get_env_default(key="timeout", app_name="OmniParse", default_value="120")
+        try:
+            timeout = int(v)
+        except (TypeError, ValueError):
+            timeout = 120
+        if timeout <= 0:
+            # Zero and negative values are not meaningful timeouts; use the default.
+            timeout = 120
+
+        try:
+            client = OmniParseClient(api_key=api_key, base_url=base_url, max_timeout=timeout)
+            file_data = await aread_bin(filename=path)
+            ret = await client.parse_document(file_input=file_data, bytes_filename=str(path))
+        except Exception as e:
+            # Best effort: any failure makes the caller fall back to a local reader.
+            logger.exception(f"{path}: {e}")
+            return None
+        if not ret.images:
+            return [ret.text] if ret.text else None
+
+        result = [ret.text]
+        img_dir = Path(path).parent / (Path(path).name.replace(".", "_") + "_images")
+        img_dir.mkdir(parents=True, exist_ok=True)
+        for i in ret.images:
+            # The image name comes from a remote service: keep only its final path
+            # component so it cannot escape `img_dir` via "../" or an absolute path.
+            safe_name = Path(i.image_name).name
+            if safe_name in {"", ".", ".."}:
+                logger.warning(f"{path}: skip image with invalid name {i.image_name!r}")
+                continue
+            byte_data = base64.b64decode(i.image)
+            filename = img_dir / safe_name
+            await awrite_bin(filename=filename, data=byte_data)
+            result.append(f"![{i.image_name}]({str(filename)})")
+        return result
--- a/metagpt/tools/libs/env.py
+++ b/metagpt/tools/libs/env.py
@ -7,7 +7,7 @@
@Desc: Implement `get_env`. RFC 216 2.4.2.4.2.
 """
 import os
-from typing import Dict
+from typing import Dict, Optional


 class EnvKeyNotFoundError(Exception):
@ -15,14 +15,26 @@ class EnvKeyNotFoundError(Exception):
        super().__init__(info)


+def to_app_key(key: str, app_name: str = None) -> str:
+    """Build the lookup key: `<app_name>-<key>` when an app name is given, else `key` unchanged."""
+    if not app_name:
+        return key
+    return f"{app_name}-{key}"
+
+
+def split_app_key(app_key: str) -> (str, str):
+    """Split `app_key` at its first "-" into `(app_name, key)`; without a "-" the app name is ""."""
+    prefix, dash, remainder = app_key.partition("-")
+    if dash:
+        return prefix, remainder
+    return "", app_key
+
+
 async def default_get_env(key: str, app_name: str = None) -> str:
-    if key in os.environ:
-        return os.environ[key]
+    app_key = to_app_key(key=key, app_name=app_name)
+    if app_key in os.environ:
+        return os.environ[app_key]

    from metagpt.context import Context

    context = Context()
-    val = context.kwargs.get(key, None)
+    val = context.kwargs.get(app_key, None)
    if val is not None:
        return val

@ -32,14 +44,16 @@ async def default_get_env(key: str, app_name: str = None) -> str:
 async def default_get_env_description() -> Dict[str, str]:
    result = {}
    for k in os.environ.keys():
-        call = f'await get_env(key="{k}", app_name="")'
+        app_name, key = split_app_key(k)
+        call = f'await get_env(key="{key}", app_name="{app_name}")'
        result[call] = f"Return the value of environment variable `{k}`."

    from metagpt.context import Context

    context = Context()
    for k in context.kwargs.__dict__.keys():
-        call = f'await get_env(key="{k}", app_name="")'
+        app_name, key = split_app_key(k)
+        call = f'await get_env(key="{key}", app_name="{app_name}")'
        result[call] = f"Get the value of environment variable `{k}`."
    return result

@ -84,6 +98,37 @@ async def get_env(key: str, app_name: str = None) -> str:
    return await default_get_env(key=key, app_name=app_name)


+async def get_env_default(key: str, app_name: str = None, default_value: str = None) -> Optional[str]:
+    """
+    Retrieves the value for the specified environment variable key. If the key is not found,
+    returns the default value.
+
+    The lookup is delegated to `get_env`; only `EnvKeyNotFoundError` is turned into the default,
+    any other exception propagates to the caller.
+
+    Args:
+        key (str): The name of the environment variable to retrieve.
+        app_name (str, optional): The name of the application or component to associate with the environment variable.
+        default_value (str, optional): The default value to return if the environment variable is not found.
+
+    Returns:
+        str or None: The value of the environment variable if found, otherwise the default value.
+
+    Example:
+        >>> from metagpt.tools.libs.env import get_env_default
+        >>> api_key = await get_env_default(key="NOT_EXISTS_API_KEY", default_value="<API_KEY>")
+        >>> print(api_key)
+        <API_KEY>
+
+        >>> from metagpt.tools.libs.env import get_env_default
+        >>> api_key = await get_env_default(key="NOT_EXISTS_API_KEY", app_name="GITHUB", default_value="<API_KEY>")
+        >>> print(api_key)
+        <API_KEY>
+
+    """
+    try:
+        return await get_env(key=key, app_name=app_name)
+    except EnvKeyNotFoundError:
+        return default_value
+
+
 async def get_env_description() -> Dict[str, str]:
    global _get_env_description_entry

--- a/metagpt/utils/common.py
+++ b/metagpt/utils/common.py
@ -12,6 +12,7 @@
 from __future__ import annotations

 import ast
+import asyncio
 import base64
 import contextlib
 import csv
@ -852,7 +853,10 @@ async def get_mime_type(filename: str | Path, force_read: bool = False) -> str:
    }

    try:
-        stdout, _, _ = await shell_execute(f"file --mime-type {str(filename)}")
+        stdout, stderr, _ = await shell_execute(f"file --mime-type {str(filename)}")
+        if stderr:
+            logger.debug(f"file:{filename}, error:{stderr}")
+            return guess_mime_type
        ix = stdout.rfind(" ")
        mime_type = stdout[ix:].strip()
        if mime_type == "text/plain" and guess_mime_type in text_set:
@ -1050,6 +1054,32 @@ def tool2name(cls, methods: List[str], entry) -> Dict[str, Any]:
    return mappings


+def run_coroutine_sync(coroutine, *args, **kwargs):
+    """
+    Runs a coroutine function synchronously by encapsulating its invocation as a non-coroutine function call.
+
+    When no event loop is running in the calling thread, the coroutine is driven with `asyncio.run`.
+    When the caller is itself running inside an event loop, the coroutine is driven by a fresh
+    event loop in a worker thread. Scheduling it on the caller's own loop and then blocking on
+    the result would deadlock, because the blocked thread is the one that has to run it.
+    The calling loop stays blocked until the coroutine finishes, and the coroutine must not
+    depend on objects bound to the caller's loop.
+
+    Args:
+        coroutine: The coroutine function to be encapsulated.
+        *args: Positional arguments to be passed to the coroutine.
+        **kwargs: Keyword arguments to be passed to the coroutine.
+
+    Returns:
+        The return value of the coroutine.
+
+    Raises:
+        Any exception raised by the coroutine is re-raised to the caller.
+    """
+    try:
+        asyncio.get_running_loop()
+    except RuntimeError:  # No running event loop in this thread
+        return asyncio.run(coroutine(*args, **kwargs))
+
+    # A loop is already running in this thread: hand the work to a helper thread with its own loop.
+    from concurrent.futures import ThreadPoolExecutor
+
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        return executor.submit(asyncio.run, coroutine(*args, **kwargs)).result()
+
+
 def log_time(method):
    """A time-consuming decorator for printing execution duration."""

--- a/metagpt/utils/repo_to_markdown.py
+++ b/metagpt/utils/repo_to_markdown.py
@ -7,7 +7,7 @@ from __future__ import annotations

 import re
 from pathlib import Path
-from typing import Tuple
+from typing import Tuple, Union

 from gitignore_parser import parse_gitignore

@ -82,7 +82,7 @@ async def _write_files(repo_path, gitignore_rules=None) -> str:


 async def _write_file(filename: Path, repo_path: Path) -> str:
-    is_text, mime_type = await _is_text_file(filename)
+    is_text, mime_type = await is_text_file(filename)
    if not is_text:
        logger.info(f"Ignore content: {filename}")
        return ""
@ -100,7 +100,17 @@ async def _write_file(filename: Path, repo_path: Path) -> str:
        return ""


-async def _is_text_file(filename: Path) -> Tuple[bool, str]:
+async def is_text_file(filename: Union[str, Path]) -> Tuple[bool, str]:
+    """
+    Determines if the specified file is a text file based on its MIME type.
+
+    Args:
+        filename (Union[str, Path]): The path to the file.
+
+    Returns:
+        Tuple[bool, str]: A tuple where the first element indicates if the file is a text file
+        (True for text file, False otherwise), and the second element is the MIME type of the file.
+    """
    pass_set = {
        "application/json",
        "application/vnd.chipnuts.karaoke-mmd",
@ -129,7 +139,7 @@ async def _is_text_file(filename: Path) -> Tuple[bool, str]:
        "image/vnd.microsoft.icon",
        "video/mp4",
    }
-    mime_type = await get_mime_type(filename, force_read=True)
+    mime_type = await get_mime_type(Path(filename), force_read=True)
    v = "text/" in mime_type or mime_type in pass_set
    if v:
        return True, mime_type