diff --git a/apt/utils/models/__init__.py b/apt/utils/models/__init__.py index fe1721b..65861f3 100644 --- a/apt/utils/models/__init__.py +++ b/apt/utils/models/__init__.py @@ -1,4 +1,5 @@ from apt.utils.models.model import Model, BlackboxClassifier, ModelOutputType, ScoringMethod, \ - BlackboxClassifierPredictions, BlackboxClassifierPredictFunction, get_nb_classes, is_one_hot + BlackboxClassifierPredictions, BlackboxClassifierPredictFunction, get_nb_classes, is_one_hot, \ + check_correct_model_output from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor from apt.utils.models.keras_model import KerasClassifier diff --git a/apt/utils/models/keras_model.py b/apt/utils/models/keras_model.py index 0cb7252..6bce043 100644 --- a/apt/utils/models/keras_model.py +++ b/apt/utils/models/keras_model.py @@ -7,7 +7,7 @@ import tensorflow as tf from tensorflow import keras tf.compat.v1.disable_eager_execution() -from apt.utils.models import Model, ModelOutputType, ScoringMethod +from apt.utils.models import Model, ModelOutputType, ScoringMethod, check_correct_model_output from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE from art.utils import check_and_transform_label_format @@ -68,7 +68,9 @@ class KerasClassifier(KerasModel): :type x: `Dataset` :return: Predictions from the model as numpy array (class probabilities, if supported). 
""" - return self._art_model.predict(x.get_samples(), **kwargs) + predictions = self._art_model.predict(x.get_samples(), **kwargs) + check_correct_model_output(predictions, self.output_type) + return predictions def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs): """ diff --git a/apt/utils/models/model.py b/apt/utils/models/model.py index 9e8379d..902a22c 100644 --- a/apt/utils/models/model.py +++ b/apt/utils/models/model.py @@ -8,30 +8,6 @@ from art.estimators.classification import BlackBoxClassifier from art.utils import check_and_transform_label_format -def is_one_hot(y: OUTPUT_DATA_ARRAY_TYPE) -> bool: - return len(y.shape) == 2 and y.shape[1] > 1 - - -def get_nb_classes(y: OUTPUT_DATA_ARRAY_TYPE) -> int: - """ - Get the number of classes from an array of labels - - :param y: the labels - :type y: numpy array - :return: the number of classes as integer - """ - if y is None: - return 0 - - if type(y) != np.ndarray: - raise ValueError("Input should be numpy array") - - if is_one_hot(y): - return y.shape[1] - else: - return int(np.max(y) + 1) - - class ModelOutputType(Enum): CLASSIFIER_PROBABILITIES = auto() # vector of probabilities CLASSIFIER_LOGITS = auto() # vector of logits @@ -49,6 +25,45 @@ class ScoringMethod(Enum): MEAN_SQUARED_ERROR = auto() # mean squared error between the predictions and true labels +def is_one_hot(y: OUTPUT_DATA_ARRAY_TYPE) -> bool: + return len(y.shape) == 2 and y.shape[1] > 1 + + +def get_nb_classes(y: OUTPUT_DATA_ARRAY_TYPE) -> int: + """ + Get the number of classes from an array of labels + + :param y: The labels + :type y: numpy array + :return: The number of classes as integer + """ + if y is None: + return 0 + + if type(y) != np.ndarray: + raise ValueError("Input should be numpy array") + + if is_one_hot(y): + return y.shape[1] + else: + return int(np.max(y) + 1) + + +def check_correct_model_output(y: OUTPUT_DATA_ARRAY_TYPE, output_type: ModelOutputType): + """ + 
Checks whether there is a mismatch between the declared model output type and its actual output.
+
+    :param y: Model output
+    :type y: numpy array
+    :param output_type: Declared output type (provided at init)
+    :type output_type: ModelOutputType
+    :raises: ValueError (in case of mismatch)
+    """
+    if not is_one_hot(y):  # 1D array
+        if output_type == ModelOutputType.CLASSIFIER_PROBABILITIES or output_type == ModelOutputType.CLASSIFIER_LOGITS:
+            raise ValueError("Incompatible model output types. Model outputs 1D array of categorical scalars "
+                             f"while output type is set to {output_type}")
+
+
 class Model(metaclass=ABCMeta):
     """
     Abstract base class for ML model wrappers.
@@ -147,8 +162,6 @@ class Model(metaclass=ABCMeta):
 
         return self._unlimited_queries
 
-
-
 class BlackboxClassifier(Model):
     """
     Wrapper for black-box ML classification models.
@@ -168,7 +181,6 @@ class BlackboxClassifier(Model):
     :type model_type: Either a (unfitted) model object of the underlying framework, or a ModelType representing
                       the type of the model, optional.
     """
-
     def __init__(self, model: Any, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
                  unlimited_queries: Optional[bool] = True, model_type: Optional[Union[Any, ModelType]] = None,
                  **kwargs):
@@ -220,7 +232,9 @@ class BlackboxClassifier(Model):
         :type x: `Dataset`
         :return: Predictions from the model as numpy array.
""" - return self._art_model.predict(x.get_samples()) + predictions = self._art_model.predict(x.get_samples()) + check_correct_model_output(predictions, self.output_type) + return predictions def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs): """ @@ -266,6 +280,11 @@ class BlackboxClassifierPredictions(BlackboxClassifier): x_test_pred = model.get_test_samples() y_test_pred = model.get_test_labels() + if y_train_pred is not None: + check_correct_model_output(y_train_pred, self.output_type) + if y_test_pred is not None: + check_correct_model_output(y_test_pred, self.output_type) + if y_train_pred is not None and len(y_train_pred.shape) == 1: self._nb_classes = get_nb_classes(y_train_pred) y_train_pred = check_and_transform_label_format(y_train_pred, nb_classes=self._nb_classes) diff --git a/apt/utils/models/sklearn_model.py b/apt/utils/models/sklearn_model.py index 8becc18..c378a24 100644 --- a/apt/utils/models/sklearn_model.py +++ b/apt/utils/models/sklearn_model.py @@ -3,7 +3,7 @@ from typing import Optional from sklearn.preprocessing import OneHotEncoder from sklearn.base import BaseEstimator -from apt.utils.models import Model, ModelOutputType, get_nb_classes +from apt.utils.models import Model, ModelOutputType, get_nb_classes, check_correct_model_output from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier @@ -71,7 +71,9 @@ class SklearnClassifier(SklearnModel): :type x: `Dataset` :return: Predictions from the model as numpy array (class probabilities, if supported). 
""" - return self._art_model.predict(x.get_samples(), **kwargs) + predictions = self._art_model.predict(x.get_samples(), **kwargs) + check_correct_model_output(predictions, self.output_type) + return predictions class SklearnRegressor(SklearnModel): diff --git a/tests/test_model.py b/tests/test_model.py index 195ad81..8b4769c 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -72,7 +72,7 @@ def test_blackbox_classifier(): train = ArrayDataset(x_train, y_train) test = ArrayDataset(x_test, y_test) data = Data(train, test) - model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_PROBABILITIES) + model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_SCALAR) pred = model.predict(test) assert(pred.shape[0] == x_test.shape[0]) @@ -81,13 +81,24 @@ def test_blackbox_classifier(): assert model.model_type is None + +def test_blackbox_classifier_mismatch(): + (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np() + + train = ArrayDataset(x_train, y_train) + test = ArrayDataset(x_test, y_test) + data = Data(train, test) + with pytest.raises(ValueError): + model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_PROBABILITIES) + + def test_blackbox_classifier_no_test(): (x_train, y_train), (_, _) = dataset_utils.get_iris_dataset_np() train = ArrayDataset(x_train, y_train) data = Data(train) - model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_PROBABILITIES) + model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_SCALAR) pred = model.predict(train) assert(pred.shape[0] == x_train.shape[0]) @@ -100,7 +111,7 @@ def test_blackbox_classifier_no_train(): test = ArrayDataset(x_test, y_test) data = Data(test=test) - model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_PROBABILITIES) + model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_SCALAR) pred = model.predict(test) assert(pred.shape[0] == x_test.shape[0]) @@ -114,7 +125,7 @@ def 
test_blackbox_classifier_no_test_y(): train = ArrayDataset(x_train, y_train) test = ArrayDataset(x_test) data = Data(train, test) - model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_PROBABILITIES) + model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_SCALAR) pred = model.predict(train) assert(pred.shape[0] == x_train.shape[0]) @@ -136,7 +147,7 @@ def test_blackbox_classifier_no_train_y(): train = ArrayDataset(x_train) test = ArrayDataset(x_test, y_test) data = Data(train, test) - model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_PROBABILITIES) + model = BlackboxClassifierPredictions(data, ModelOutputType.CLASSIFIER_SCALAR) pred = model.predict(test) assert (pred.shape[0] == x_test.shape[0]) @@ -171,7 +182,7 @@ def test_blackbox_classifier_probabilities(): def test_blackbox_classifier_predict(): def predict(x): - return [0.23, 0.56, 0.21] + return np.array([[0.23, 0.56, 0.21] for i in range(x.shape[0])]) (x_train, y_train), (_, _) = dataset_utils.get_iris_dataset_np() y_train = np.array([[0.23, 0.56, 0.21] for i in range(105)]) @@ -187,6 +198,7 @@ def test_blackbox_classifier_predict(): score = model.score(train) assert (score == 1.0) + def test_is_one_hot(): (_, y_train), (_, _) = dataset_utils.get_iris_dataset_np()