diff --git a/data/inference/make_datasets/parse_utils.py b/data/inference/make_datasets/parse_utils.py
new file mode 100644
index 000000000..ace137a44
--- /dev/null
+++ b/data/inference/make_datasets/parse_utils.py
@@ -0,0 +1,27 @@
+import re
+
+
+def extract_scripts_from_codetext(codetext: str):
+    """Collect every ``.py`` path named by a ``[start of ...]`` marker.
+
+    The text may hold several file sections, each delimited like::
+
+        [start of sklearn/compose/_target.py]
+        ... file contents ...
+        [end of sklearn/compose/_target.py]
+
+    Only markers whose bracketed path ends in ``.py`` are matched.
+    Each extracted path is printed; a notice is printed when none is found.
+    Returns the paths as a list, in order of appearance.
+    """
+    # Capture the path inside each "[start of <anything>.py]" marker.
+    marker = re.compile(r"\[start of ([^\]]+\.py)\]")
+    found = marker.findall(codetext)
+
+    if not found:
+        print("No script names found in the text.")
+        return []
+
+    for path in found:
+        print("Extracted script name:", path)
+    return list(found)