rm Chinese comments

2026-06-17 15:35:21 +02:00 · 2024-03-19 23:51:57 +08:00 · 2024-03-19 23:51:57 +08:00 · 3fac156d66
commit 3fac156d66
parent 6e28eafcd7
1 changed files with 17 additions and 6 deletions
--- a/data/inference/make_datasets/parse_utils.py
+++ b/data/inference/make_datasets/parse_utils.py
@@ -2,26 +2,37 @@ import re


 def extract_scripts_from_codetext(codetext: str):
-    script_names = []
-    # 提供的文本内容，可能包含多个 [start of ... .py]
    """
+    Extracts Python script file names from a given text that contains multiple sections.
+    Each section starts with '[start of <script_name>.py]' and ends with '[end of <script_name>.py]'.
+
+    Parameters:
+    - codetext (str): A string that may contain multiple sections, each indicating the start of a Python script file.
+
+    Returns:
+    - list: A list of extracted Python script file names.
+
+    Example of codetext:
+    '''
    [end of README.rst]
    [start of sklearn/compose/_target.py]
-    ... 文件内容 ...
+    ... file content ...
    [end of sklearn/compose/_target.py]
    [start of another_module/example.py]
-    ... 文件内容 ...
+    ... file content ...
    [end of another_module/example.py]
+    '''
    """
+    script_names = []

-    # 使用正则表达式匹配所有 “[start of 任意字符.py]”
+    # Match all occurrences of '[start of <script_name>.py]'
    matches = re.findall(r"\[start of ([^\]]+\.py)\]", codetext)

    if matches:
-        # 遍历所有匹配的文件名并打印
        for script_name in matches:
            print("Extracted script name:", script_name)
            script_names.append(script_name)
    else:
        print("No script names found in the text.")
+
    return script_names