# Post-patch contents of data/inference/make_datasets/parse_utils.py,
# reconstructed from a unified diff that had been collapsed onto one line.
import re


def extract_scripts_from_codetext(codetext: str) -> list:
    """Extract Python script paths announced by section markers in a text.

    The text is expected to be split into sections, each opened by a
    ``[start of <path>]`` marker and closed by a matching ``[end of <path>]``
    marker. Only the *start* markers whose path ends in ``.py`` are collected;
    the section contents and the end markers are ignored.

    Parameters:
    - codetext (str): Text that may contain any number of
      ``[start of <path>.py]`` markers.

    Returns:
    - list: The matched paths as strings, in order of appearance. Repeated
      markers yield repeated entries. Empty when nothing matches.

    Side effects:
    - Prints one ``Extracted script name: <path>`` line per match, or
      ``No script names found in the text.`` when there is none.

    Example of codetext:
    '''
    [end of README.rst]
    [start of sklearn/compose/_target.py]
    ... file content ...
    [end of sklearn/compose/_target.py]
    [start of another_module/example.py]
    ... file content ...
    [end of another_module/example.py]
    '''
    For the text above the result is
    ``['sklearn/compose/_target.py', 'another_module/example.py']``.
    """
    # With a single capture group, findall returns the captured paths directly,
    # so no manual copy into a second list is needed. ``[^\]]+`` keeps a match
    # from running past the closing bracket of its own marker.
    script_names = re.findall(r"\[start of ([^\]]+\.py)\]", codetext)

    if not script_names:
        print("No script names found in the text.")

    for script_name in script_names:
        print("Extracted script name:", script_name)

    return script_names