diff --git a/expo/MCTS.py b/expo/MCTS.py
index a8410748e..749850dd6 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -8,7 +8,7 @@ import shutil
 import numpy as np
 import pandas as pd
 
-from expo.data.custom_task import get_mle_bench_requirements
+from expo.data.custom_task import get_mle_bench_requirements, get_mle_task_id
 from expo.data.dataset import generate_task_requirement, get_split_dataset_path
 from expo.evaluation.evaluation import evaluate_score
 from expo.insights.instruction_generator import InstructionGenerator
@@ -35,6 +35,8 @@ def create_initial_state(
         datasets_dir = args.custom_dataset_dir
         requirement = get_mle_bench_requirements(args.custom_dataset_dir, data_config)
         exp_pool_path = None
+        # NOTE: external eval must stay off for custom datasets; main() in run_experiment.py sets args.external_eval = False
+        task = get_mle_task_id(args.custom_dataset_dir)
     else:
         dataset_config = data_config["datasets"][task]
         datasets_dir = get_split_dataset_path(task, data_config)
@@ -120,7 +122,7 @@ class Node:
             return f"{self.parent.id}-{num_sibling}"
 
     def is_terminal(self):
-        return int(self.state["start_task_id"]) == self.max_depth + 1
+        return int(self.state["start_task_id"]) == self.max_depth + 1  # TODO: confirm the bound -- should a node be terminal at max_depth or at max_depth + 1?
 
     def is_fully_expanded(self):
         return len(self.children) > 0
diff --git a/expo/data/custom_task.py b/expo/data/custom_task.py
index 14eb6aac2..f66b4aa58 100644
--- a/expo/data/custom_task.py
+++ b/expo/data/custom_task.py
@@ -22,19 +22,26 @@ COMPETITION INSTRUCTIONS
 ## More Instructions
 - output_dir: {output_dir}
 - Besides `submission.csv`, you should also save your output in the output directory.
-- Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory.
-
-Do not make visualizations.
+- You should split the training data into train and dev set.
+- Save the prediction results of BOTH the dev set and test set in `dev_predictions.csv` and `test_predictions.csv` respectively in the output directory. They should be in the same format as the `submission.csv`.
+- Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. 
+**Do not make any plots or visualizations.**
 """
 
 
+def get_mle_task_id(dataset_dir):
+    return dataset_dir.rstrip("/").split("/")[-3]  # task id = third-from-last path component; rstrip so a trailing "/" cannot shift the index
+
+
 def get_mle_bench_requirements(dataset_dir, data_config, obfuscated=False):
     work_dir = data_config["work_dir"]
-    output_dir = f"{work_dir}/output"
+    task = get_mle_task_id(dataset_dir)
+    output_dir = f"{work_dir}/{task}"
+    final_output_dir = f"{work_dir}/submission"
     os.makedirs(output_dir, exist_ok=True)
 
     if obfuscated:
-        instructions = INSTRUCTIONS_OBFUSCATED.format(dataset_dir=dataset_dir, output_dir=output_dir)
+        instructions = INSTRUCTIONS_OBFUSCATED.format(dataset_dir=dataset_dir, output_dir=final_output_dir)
         task_file = "description_obfuscated.md"
     else:
         instructions = INSTRUCTIONS.format(dataset_dir=dataset_dir, output_dir=output_dir)
diff --git a/expo/evaluation/evaluation.py b/expo/evaluation/evaluation.py
index 1ba7fa60f..2c19b81fc 100644
--- a/expo/evaluation/evaluation.py
+++ b/expo/evaluation/evaluation.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import numpy as np
 from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, roc_auc_score
 
@@ -33,4 +35,14 @@ def node_evaluate_score_sela(node):
 
 def node_evaluate_score_mlebench(node):
     # TODO
-    return 0
+    from mlebench.grade import grade_csv  # function-scope import: mlebench is only loaded when this evaluator is called
+    from mlebench.registry import registry
+
+    competition_id = node.state["task"]  # presumably the MLE-bench competition id -- confirm against create_initial_state
+    pred_path = node.get_predictions_path("test")
+    new_registry = registry.set_data_dir(Path(registry.get_data_dir()))  # NOTE(review): re-applies the registry's own data dir -- confirm this is needed
+    competition = new_registry.get_competition(competition_id)
+    submission = Path(pred_path)
+    report = grade_csv(submission, competition).to_dict()
+    report["submission_path"] = str(submission)  # keep the graded file's path alongside the grading result
+    return report  # a dict, whereas the stub this replaces returned 0
diff --git a/expo/run_experiment.py b/expo/run_experiment.py
index 53fcdd18c..bf90cb07a 100644
--- a/expo/run_experiment.py
+++ b/expo/run_experiment.py
@@ -60,6 +60,11 @@ def get_di_args(parser):
 
 
 async def main(args):
+    if args.custom_dataset_dir:
+        args.external_eval = False
+        args.eval_func = "mlebench"
+        args.from_scratch = True
+
     if args.exp_mode == "mcts":
         experimenter = MCTSExperimenter(args)
     elif args.exp_mode == "greedy":