diff --git a/apt/utils/models/model.py b/apt/utils/models/model.py
index 157158e..f470de7 100644
--- a/apt/utils/models/model.py
+++ b/apt/utils/models/model.py
@@ -29,7 +29,15 @@ class ScoringMethod(Enum):
 
 
 def is_one_hot(y: OUTPUT_DATA_ARRAY_TYPE) -> bool:
-    return len(y.shape) == 2 and y.shape[1] > 1
+    return len(y.shape) == 2 and y.shape[1] > 1 and np.all(np.around(np.sum(y, axis=1), decimals=4) == 1)
+
+
+def is_multi_label(y: OUTPUT_DATA_ARRAY_TYPE) -> bool:
+    return len(y.shape) == 2 and y.shape[1] > 1 and not is_one_hot(y)
+
+
+def is_multi_label_binary(y: OUTPUT_DATA_ARRAY_TYPE) -> bool:
+    return is_multi_label(y) and np.max(y) <= 1
 
 
 def get_nb_classes(y: OUTPUT_DATA_ARRAY_TYPE) -> int:
@@ -38,7 +46,7 @@ def get_nb_classes(y: OUTPUT_DATA_ARRAY_TYPE) -> int:
 
     :param y: The labels
     :type y: numpy array
-    :return: The number of classes as integer
+    :return: The number of classes as integer, or the number of labels (columns) for multi-label
     """
     if y is None:
         return 0
@@ -48,6 +56,10 @@ def get_nb_classes(y: OUTPUT_DATA_ARRAY_TYPE) -> int:
 
     if is_one_hot(y):
         return y.shape[1]
+    elif is_multi_label(y):
+        # for now just return the number of labels
+        return y.shape[1]
+        # return [int(np.max(y.T[i]) + 1) for i in range(y.shape[1])]
     else:
         return int(np.max(y) + 1)
 
@@ -61,7 +73,7 @@ def check_correct_model_output(y: OUTPUT_DATA_ARRAY_TYPE, output_type: ModelOutp
     :type output_type: ModelOutputType
     :raises: ValueError (in case of mismatch)
     """
-    if not is_one_hot(y):  # 1D array
+    if not is_one_hot(y) and not is_multi_label(y):  # 1D array
         if output_type == ModelOutputType.CLASSIFIER_PROBABILITIES or output_type == ModelOutputType.CLASSIFIER_LOGITS:
             raise ValueError("Incompatible model output types. Model outputs 1D array of categorical scalars while "
                              "output type is set to ", output_type)
@@ -167,7 +179,8 @@ class Model(metaclass=ABCMeta):
 
 class BlackboxClassifier(Model):
     """
-    Wrapper for black-box ML classification models.
+    Wrapper for black-box ML classification models. This is an abstract class and must be instantiated as either
+    BlackboxClassifierPredictFunction or BlackboxClassifierPredictions.
 
     :param model: The training and/or test data along with the model's predictions for the data or a callable predict
                   method.
@@ -266,7 +279,8 @@ class BlackboxClassifier(Model):
         check_correct_model_output(predictions, self.output_type)
         return predictions
 
-    def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs):
+    def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY,
+              binary_threshold: Optional[float] = 0.5, **kwargs):
         """
         Score the model using test data.
 
@@ -274,14 +288,25 @@ class BlackboxClassifier(Model):
         :type train_data: `Dataset`
         :param scoring_method: The method for scoring predictions. Default is ACCURACY.
         :type scoring_method: `ScoringMethod`, optional
+        :param binary_threshold: The threshold to use on binary classification probabilities to assign the positive
+                                 class. Only applied when the labels are multi-label binary.
+        :type binary_threshold: float, optional. Default is 0.5.
         :return: the score as float (for classifiers, between 0 and 1)
         """
         if test_data.get_samples() is None or test_data.get_labels() is None:
             raise ValueError('score can only be computed when test data and labels are available')
         predicted = self._art_model.predict(test_data.get_samples())
-        y = check_and_transform_label_format(test_data.get_labels(), nb_classes=self._nb_classes)
+        y = test_data.get_labels()
+        if not is_multi_label(y):
+            y = check_and_transform_label_format(y, nb_classes=self._nb_classes)
         if scoring_method == ScoringMethod.ACCURACY:
-            return np.count_nonzero(np.argmax(y, axis=1) == np.argmax(predicted, axis=1)) / predicted.shape[0]
+            if not is_multi_label(y):
+                return np.count_nonzero(np.argmax(y, axis=1) == np.argmax(predicted, axis=1)) / predicted.shape[0]
+            else:
+                if is_multi_label_binary(y):
+                    predicted[predicted < binary_threshold] = 0
+                    predicted[predicted >= binary_threshold] = 1
+                return np.count_nonzero(y == predicted) / (predicted.shape[0] * y.shape[1])
         else:
             raise NotImplementedError
 
diff --git a/tests/test_model.py b/tests/test_model.py
index b8fb8f1..b7b2909 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -156,6 +156,54 @@ def test_blackbox_classifier_predictions_y():
     assert model.model_type is None
 
 
+def test_blackbox_classifier_predictions_multi_label_cat():
+    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()
+
+    # make multi-label categorical
+    y_train = np.column_stack((y_train, y_train, y_train))
+    y_test = np.column_stack((y_test, y_test, y_test))
+
+    train = DatasetWithPredictions(y_train, x_train, y_train)
+    test = DatasetWithPredictions(y_test, x_test, y_test)
+    data = Data(train, test)
+    model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_SCALAR)
+    pred = model.predict(test)
+    assert (pred.shape[0] == x_test.shape[0])
+
+    score = model.score(test)
+    assert (score == 1.0)
+
+    assert model.model_type is None
+
+
+def test_blackbox_classifier_predictions_multi_label_binary():
+    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()
+
+    # make multi-label binary
+    y_train = np.column_stack((y_train, y_train, y_train))
+    y_train[y_train > 1] = 1
+    pred_train = y_train.copy().astype(float)
+    pred_train[pred_train == 0] = 0.2
+    pred_train[pred_train == 1] = 0.6
+    y_test = np.column_stack((y_test, y_test, y_test))
+    y_test[y_test > 1] = 1
+    pred_test = y_test.copy().astype(float)
+    pred_test[pred_test == 0] = 0.2
+    pred_test[pred_test == 1] = 0.6
+
+    train = DatasetWithPredictions(pred_train, x_train, y_train)
+    test = DatasetWithPredictions(pred_test, x_test, y_test)
+    data = Data(train, test)
+    model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_SCALAR)
+    pred = model.predict(test)
+    assert (pred.shape[0] == x_test.shape[0])
+
+    score = model.score(test)
+    assert (score == 1.0)
+
+    assert model.model_type is None
+
+
 def test_blackbox_classifier_mismatch():
     (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()