Merge pull request #1012 from SheffieldML/devel

Release 1.12.0
2026-06-14 15:25:15 +02:00 · 2023-04-28 15:50:52 +01:00 · 2023-04-28 15:50:52 +01:00 · 6254451513
commit 6254451513
parent b9686fbeb4 9ec3e50e3b
33 changed files with 1291 additions and 654 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -16,7 +16,6 @@ addons:
 #  - "$HOME/install/"

 env:
-  - PYTHON_VERSION=3.5
  - PYTHON_VERSION=3.6
  - PYTHON_VERSION=3.7
  - PYTHON_VERSION=3.8
--- a/GPy/version.py
+++ b/GPy/version.py
@ -1 +1 @@
-__version__ = "1.11.0"
+__version__ = "1.12.0"
--- a/GPy/core/init.py
+++ b/GPy/core/init.py
@ -77,7 +77,7 @@ def randomize(self, rand_gen=None, *args, **kwargs):
    # now draw from prior where possible
    x = self.param_array.copy()
    [np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.items() if not p is None]
-    unfixlist = np.ones((self.size,),dtype=np.bool)
+    unfixlist = np.ones((self.size,),dtype=bool)
    from paramz.transformations import __fixed__
    unfixlist[self.constraints[__fixed__]] = False
    self.param_array.flat[unfixlist] = x.view(np.ndarray).ravel()[unfixlist]
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@ -21,7 +21,7 @@ class GP(Model):

    :param X: input observations
    :param Y: output observations
-    :param kernel: a GPy kernel, defaults to rbf+white
+    :param kernel: a GPy kernel
    :param likelihood: a GPy likelihood
    :param inference_method: The :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference method to use for this GP
    :rtype: model object
@ -134,9 +134,10 @@ class GP(Model):
        if self.mean_function is not None:
            input_dict["mean_function"] = self.mean_function.to_dict()
        input_dict["inference_method"] = self.inference_method.to_dict()
-        #FIXME: Assumes the Y_metadata is serializable. We should create a Metadata class
+        # TODO: We should create a Metadata class
        if self.Y_metadata is not None:
-            input_dict["Y_metadata"] = self.Y_metadata
+            # make Y_metadata serializable
+            input_dict["Y_metadata"] = {k: self.Y_metadata[k].tolist() for k in self.Y_metadata.keys()}
        if self.normalizer is not None:
            input_dict["normalizer"] = self.normalizer.to_dict()
        return input_dict
@ -162,9 +163,12 @@ class GP(Model):
            input_dict["mean_function"] = mean_function
        input_dict["inference_method"] = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(input_dict["inference_method"])

-        #FIXME: Assumes the Y_metadata is serializable. We should create a Metadata class
+        # converts Y_metadata from serializable to array. We should create a Metadata class
        Y_metadata = input_dict.get("Y_metadata")
-        input_dict["Y_metadata"] = Y_metadata
+        if isinstance(Y_metadata, dict):
+            input_dict["Y_metadata"] = {k: np.array(Y_metadata[k]) for k in Y_metadata.keys()}
+        else:
+            input_dict["Y_metadata"] = Y_metadata

        normalizer = input_dict.get("normalizer")
        if normalizer is not None:
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@ -200,37 +200,50 @@ class MultivariateGaussian(Prior):

    def __new__(cls, mu=0, var=1):  # Singleton:
        if cls._instances:
-            cls._instances[:] = [instance for instance in cls._instances if instance()]
+            cls._instances[:] = [instance for instance in cls._instances if
+                                 instance()]
            for instance in cls._instances:
-                if np.all(instance().mu == mu) and np.all(instance().var == var):
+                if np.all(instance().mu == mu) and np.all(
+                        instance().var == var):
                    return instance()
-        o = super(Prior, cls).__new__(cls, mu, var)
+        newfunc = super(Prior, cls).__new__
+        if newfunc is object.__new__:
+            o = newfunc(cls)
+        else:
+            o = newfunc(cls, mu, var)
        cls._instances.append(weakref.ref(o))
        return cls._instances[-1]()

    def __init__(self, mu, var):
        self.mu = np.array(mu).flatten()
        self.var = np.array(var)
-        assert len(self.var.shape) == 2
-        assert self.var.shape[0] == self.var.shape[1]
+        assert len(self.var.shape) == 2, 'Covariance must be a matrix'
+        assert self.var.shape[0] == self.var.shape[1], \
+            'Covariance must be a square matrix'
        assert self.var.shape[0] == self.mu.size
        self.input_dim = self.mu.size
-        self.inv, self.hld = pdinv(self.var)
-        self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld
+        self.inv, _, self.hld, _ = pdinv(self.var)
+        self.constant = -0.5 * (self.input_dim * np.log(2 * np.pi) + self.hld)
+
+    def __str__(self):
+        return 'MultiN(' + str(self.mu) + ', ' + str(np.diag(self.var)) + ')'

    def summary(self):
        raise NotImplementedError

    def pdf(self, x):
+        x = np.array(x).flatten()
        return np.exp(self.lnpdf(x))

    def lnpdf(self, x):
+        x = np.array(x).flatten()
        d = x - self.mu
-        return self.constant - 0.5 * np.sum(d * np.dot(d, self.inv), 1)
+        return self.constant - 0.5 * np.dot(d.T, np.dot(self.inv, d))

    def lnpdf_grad(self, x):
+        x = np.array(x).flatten()
        d = x - self.mu
-        return -np.dot(self.inv, d)
+        return - np.dot(self.inv, d)

    def rvs(self, n):
        return np.random.multivariate_normal(self.mu, self.var, n)
@ -247,14 +260,15 @@ class MultivariateGaussian(Prior):
        return self.mu, self.var

    def __setstate__(self, state):
-        self.mu = state[0]
+        self.mu = np.array(state[0]).flatten()
        self.var = state[1]
-        assert len(self.var.shape) == 2
-        assert self.var.shape[0] == self.var.shape[1]
+        assert len(self.var.shape) == 2, 'Covariance must be a matrix'
+        assert self.var.shape[0] == self.var.shape[1], \
+            'Covariance must be a square matrix'
        assert self.var.shape[0] == self.mu.size
        self.input_dim = self.mu.size
-        self.inv, self.hld = pdinv(self.var)
-        self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld
+        self.inv, _, self.hld, _ = pdinv(self.var)
+        self.constant = -0.5 * (self.input_dim * np.log(2 * np.pi) + self.hld)

 def gamma_from_EV(E, V):
    warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning)
@ -357,24 +371,17 @@ class InverseGamma(Gamma):
    """
    domain = _POSITIVE
    _instances = []
-    def __new__(cls, a=1, b=.5): # Singleton:
-        if cls._instances:
-            cls._instances[:] = [instance for instance in cls._instances if instance()]
-            for instance in cls._instances:
-                if instance().a == a and instance().b == b:
-                    return instance()
-        o = super(Prior, cls).__new__(cls, a, b)
-        cls._instances.append(weakref.ref(o))
-        return cls._instances[-1]()
-
-    def __init__(self, a, b):
-        self._a = float(a)
-        self._b = float(b)
-        self.constant = -gammaln(self.a) + a * np.log(b)

    def __str__(self):
        return "iGa({:.2g}, {:.2g})".format(self.a, self.b)

+    def summary(self):
+        return {}
+
+    @staticmethod
+    def from_EV(E, V):
+        raise NotImplementedError
+
    def lnpdf(self, x):
        return self.constant - (self.a + 1) * np.log(x) - self.b / x

@ -384,7 +391,6 @@ class InverseGamma(Gamma):
    def rvs(self, n):
        return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)

-
 class DGPLVM_KFDA(Prior):
    """
    Implementation of the Discriminative Gaussian Process Latent Variable function using
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@ -106,7 +106,7 @@ class VariationalPosterior(Parameterized):
        self.link_parameters(self.mean, self.variance)
        self.num_data, self.input_dim = self.mean.shape
        if self.has_uncertain_inputs():
-            assert self.variance.shape == self.mean.shape, "need one variance per sample and dimenion"
+            assert self.variance.shape == self.mean.shape, "need one variance per sample and dimension"

    def set_gradients(self, grad):
        self.mean.gradient, self.variance.gradient = grad
--- a/GPy/core/sparse_gp_mpi.py
+++ b/GPy/core/sparse_gp_mpi.py
@ -108,6 +108,7 @@ class SparseGP_MPI(SparseGP):
                            raise
                        self._fail_count += 1
                elif flag==-1:
+                    ret = None
                    break
                else:
                    self._IN_OPTIMIZATION_ = False
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -3,43 +3,57 @@
 """
 Gaussian Processes classification examples
 """
+MPL_AVAILABLE = True
+try:
+    import matplotlib.pyplot as plt
+except ImportError:
+    MPL_AVAILABLE = False
+
 import GPy

 default_seed = 10000

+
 def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
    """
    Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.

    """
-    try:import pods
-    except ImportError:raise ImportWarning('Need pods for example datasets. See https://github.com/sods/ods, or pip install pods.')
+    try:
+        import pods
+    except ImportError:
+        raise ImportWarning(
+            "Need pods for example datasets. See https://github.com/sods/ods, or pip install pods."
+        )
    data = pods.datasets.oil()
-    X = data['X']
-    Xtest = data['Xtest']
-    Y = data['Y'][:, 0:1]
-    Ytest = data['Ytest'][:, 0:1]
-    Y[Y.flatten()==-1] = 0
-    Ytest[Ytest.flatten()==-1] = 0
+    X = data["X"]
+    Xtest = data["Xtest"]
+    Y = data["Y"][:, 0:1]
+    Ytest = data["Ytest"][:, 0:1]
+    Y[Y.flatten() == -1] = 0
+    Ytest[Ytest.flatten() == -1] = 0

    # Create GP model
-    m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, num_inducing=num_inducing)
+    m = GPy.models.SparseGPClassification(
+        X, Y, kernel=kernel, num_inducing=num_inducing
+    )
    m.Ytest = Ytest

    # Contrain all parameters to be positive
-    #m.tie_params('.*len')
-    m['.*len'] = 10.
+    # m.tie_params('.*len')
+    m[".*len"] = 10.0

    # Optimize
    if optimize:
        m.optimize(messages=1)
    print(m)

-    #Test
+    # Test
    probs = m.predict(Xtest)[0]
    GPy.util.classification.conf_matrix(probs, Ytest)
    return m

+
 def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
    """
    Simple 1D classification example using EP approximation
@ -48,26 +62,29 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
    :type seed: int

    """
-    try:import pods
-    except ImportError:raise ImportWarning('Need pods for example datasets. See https://github.com/sods/ods, or pip install pods.')
+    try:
+        import pods
+    except ImportError:
+        raise ImportWarning(
+            "Need pods for example datasets. See https://github.com/sods/ods, or pip install pods."
+        )
    data = pods.datasets.toy_linear_1d_classification(seed=seed)
-    Y = data['Y'][:, 0:1]
+    Y = data["Y"][:, 0:1]
    Y[Y.flatten() == -1] = 0

    # Model definition
-    m = GPy.models.GPClassification(data['X'], Y)
+    m = GPy.models.GPClassification(data["X"], Y)

    # Optimize
    if optimize:
-        #m.update_likelihood_approximation()
+        # m.update_likelihood_approximation()
        # Parameters optimization:
        m.optimize()
-        #m.update_likelihood_approximation()
-        #m.pseudo_EM()
+        # m.update_likelihood_approximation()
+        # m.pseudo_EM()

    # Plot
-    if plot:
-        from matplotlib import pyplot as plt
+    if MPL_AVAILABLE and plot:
        fig, axes = plt.subplots(2, 1)
        m.plot_f(ax=axes[0])
        m.plot(ax=axes[1])
@ -75,6 +92,7 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
    print(m)
    return m

+
 def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=True):
    """
    Simple 1D classification example using Laplace approximation
@ -84,10 +102,12 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=

    """

-    try:import pods
-    except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
+    try:
+        import pods
+    except ImportError:
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
    data = pods.datasets.toy_linear_1d_classification(seed=seed)
-    Y = data['Y'][:, 0:1]
+    Y = data["Y"][:, 0:1]
    Y[Y.flatten() == -1] = 0

    likelihood = GPy.likelihoods.Bernoulli()
@ -95,18 +115,18 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
    kernel = GPy.kern.RBF(1)

    # Model definition
-    m = GPy.core.GP(data['X'], Y, kernel=kernel, likelihood=likelihood, inference_method=laplace_inf)
+    m = GPy.core.GP(
+        data["X"], Y, kernel=kernel, likelihood=likelihood, inference_method=laplace_inf
+    )

    # Optimize
    if optimize:
-        try:
-            m.optimize('scg', messages=1)
-        except Exception as e:
-            return m
+        m.optimize("scg", messages=True)
+
+    return m

    # Plot
-    if plot:
-        from matplotlib import pyplot as plt
+    if MPL_AVAILABLE and plot:
        fig, axes = plt.subplots(2, 1)
        m.plot_f(ax=axes[0])
        m.plot(ax=axes[1])
@ -114,7 +134,10 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
    print(m)
    return m

-def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, optimize=True, plot=True):
+
+def sparse_toy_linear_1d_classification(
+    num_inducing=10, seed=default_seed, optimize=True, plot=True
+):
    """
    Sparse 1D classification example

@ -123,23 +146,24 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti

    """

-    try:import pods
-    except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
+    try:
+        import pods
+    except ImportError:
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
    data = pods.datasets.toy_linear_1d_classification(seed=seed)
-    Y = data['Y'][:, 0:1]
+    Y = data["Y"][:, 0:1]
    Y[Y.flatten() == -1] = 0

    # Model definition
-    m = GPy.models.SparseGPClassification(data['X'], Y, num_inducing=num_inducing)
-    m['.*len'] = 4.
+    m = GPy.models.SparseGPClassification(data["X"], Y, num_inducing=num_inducing)
+    m[".*len"] = 4.0

    # Optimize
    if optimize:
        m.optimize()

    # Plot
-    if plot:
-        from matplotlib import pyplot as plt
+    if MPL_AVAILABLE and plot:
        fig, axes = plt.subplots(2, 1)
        m.plot_f(ax=axes[0])
        m.plot(ax=axes[1])
@ -147,7 +171,10 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
    print(m)
    return m

-def sparse_toy_linear_1d_classification_uncertain_input(num_inducing=10, seed=default_seed, optimize=True, plot=True):
+
+def sparse_toy_linear_1d_classification_uncertain_input(
+    num_inducing=10, seed=default_seed, optimize=True, plot=True
+):
    """
    Sparse 1D classification example

@ -156,26 +183,30 @@ def sparse_toy_linear_1d_classification_uncertain_input(num_inducing=10, seed=de

    """

-    try:import pods
-    except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
+    try:
+        import pods
+    except ImportError:
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
    import numpy as np
+
    data = pods.datasets.toy_linear_1d_classification(seed=seed)
-    Y = data['Y'][:, 0:1]
+    Y = data["Y"][:, 0:1]
    Y[Y.flatten() == -1] = 0
-    X = data['X']
-    X_var = np.random.uniform(0.3,0.5,X.shape)
+    X = data["X"]
+    X_var = np.random.uniform(0.3, 0.5, X.shape)

    # Model definition
-    m = GPy.models.SparseGPClassificationUncertainInput(X, X_var, Y, num_inducing=num_inducing)
-    m['.*len'] = 4.
+    m = GPy.models.SparseGPClassificationUncertainInput(
+        X, X_var, Y, num_inducing=num_inducing
+    )
+    m[".*len"] = 4.0

    # Optimize
    if optimize:
        m.optimize()

    # Plot
-    if plot:
-        from matplotlib import pyplot as plt
+    if MPL_AVAILABLE and plot:
        fig, axes = plt.subplots(2, 1)
        m.plot_f(ax=axes[0])
        m.plot(ax=axes[1])
@ -183,6 +214,7 @@ def sparse_toy_linear_1d_classification_uncertain_input(num_inducing=10, seed=de
    print(m)
    return m

+
 def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
    """
    Simple 1D classification example using a heavy side gp transformation
@ -192,29 +224,39 @@ def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):

    """

-    try:import pods
-    except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
+    try:
+        import pods
+    except ImportError:
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
    data = pods.datasets.toy_linear_1d_classification(seed=seed)
-    Y = data['Y'][:, 0:1]
+    Y = data["Y"][:, 0:1]
    Y[Y.flatten() == -1] = 0

    # Model definition
    kernel = GPy.kern.RBF(1)
-    likelihood = GPy.likelihoods.Bernoulli(gp_link=GPy.likelihoods.link_functions.Heaviside())
+    likelihood = GPy.likelihoods.Bernoulli(
+        gp_link=GPy.likelihoods.link_functions.Heaviside()
+    )
    ep = GPy.inference.latent_function_inference.expectation_propagation.EP()
-    m = GPy.core.GP(X=data['X'], Y=Y, kernel=kernel, likelihood=likelihood, inference_method=ep, name='gp_classification_heaviside')
-    #m = GPy.models.GPClassification(data['X'], likelihood=likelihood)
+    m = GPy.core.GP(
+        X=data["X"],
+        Y=Y,
+        kernel=kernel,
+        likelihood=likelihood,
+        inference_method=ep,
+        name="gp_classification_heaviside",
+    )
+    # m = GPy.models.GPClassification(data['X'], likelihood=likelihood)

    # Optimize
    if optimize:
        # Parameters optimization:
        for _ in range(5):
-            m.optimize(max_iters=int(max_iters/5))
+            m.optimize(max_iters=int(max_iters / 5))
        print(m)

    # Plot
-    if plot:
-        from matplotlib import pyplot as plt
+    if MPL_AVAILABLE and plot:
        fig, axes = plt.subplots(2, 1)
        m.plot_f(ax=axes[0])
        m.plot(ax=axes[1])
@ -222,7 +264,15 @@ def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
    print(m)
    return m

-def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=None, optimize=True, plot=True):
+
+def crescent_data(
+    model_type="Full",
+    num_inducing=10,
+    seed=default_seed,
+    kernel=None,
+    optimize=True,
+    plot=True,
+):
    """
    Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.

@ -234,26 +284,32 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
    :param kernel: kernel to use in the model
    :type kernel: a GPy kernel
    """
-    try:import pods
-    except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
+    try:
+        import pods
+    except ImportError:
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
    data = pods.datasets.crescent_data(seed=seed)
-    Y = data['Y']
-    Y[Y.flatten()==-1] = 0
+    Y = data["Y"]
+    Y[Y.flatten() == -1] = 0

-    if model_type == 'Full':
-        m = GPy.models.GPClassification(data['X'], Y, kernel=kernel)
+    if model_type == "Full":
+        m = GPy.models.GPClassification(data["X"], Y, kernel=kernel)

-    elif model_type == 'DTC':
-        m = GPy.models.SparseGPClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
-        m['.*len'] = 10.
+    elif model_type == "DTC":
+        m = GPy.models.SparseGPClassification(
+            data["X"], Y, kernel=kernel, num_inducing=num_inducing
+        )
+        m[".*len"] = 10.0

-    elif model_type == 'FITC':
-        m = GPy.models.FITCClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
-        m['.*len'] = 3.
+    elif model_type == "FITC":
+        m = GPy.models.FITCClassification(
+            data["X"], Y, kernel=kernel, num_inducing=num_inducing
+        )
+        m[".*len"] = 3.0
    if optimize:
        m.optimize(messages=1)

-    if plot:
+    if MPL_AVAILABLE and plot:
        m.plot()

    print(m)
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -1,10 +1,12 @@
 # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as _np
+
 default_seed = 123344

 # default_seed = _np.random.seed(123344)

+
 def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan=False):
    """
    model for testing purposes. Samples from a GP with rbf kernel and learns
@ -24,7 +26,7 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan
    # generate GPLVM-like data
    X = _np.random.rand(num_inputs, input_dim)
    lengthscales = _np.random.rand(input_dim)
-    k = GPy.kern.RBF(input_dim, .5, lengthscales, ARD=True)
+    k = GPy.kern.RBF(input_dim, 0.5, lengthscales, ARD=True)
    K = k.K(X)
    Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, (output_dim,)).T

@ -35,87 +37,102 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan
    # k = GPy.kern.RBF(input_dim, .5, 2., ARD=0) + GPy.kern.RBF(input_dim, .3, .2, ARD=0)
    # k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)

-    p = .3
+    p = 0.3

    m = GPy.models.BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)

    if nan:
-        m.inference_method = GPy.inference.latent_function_inference.var_dtc.VarDTCMissingData()
+        m.inference_method = (
+            GPy.inference.latent_function_inference.var_dtc.VarDTCMissingData()
+        )
        m.Y[_np.random.binomial(1, p, size=(Y.shape)).astype(bool)] = _np.nan
        m.parameters_changed()

-    #===========================================================================
+    # ===========================================================================
    # randomly obstruct data with percentage p
-    #===========================================================================
+    # ===========================================================================
    # m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing)
    # m.lengthscales = lengthscales

    if plot:
        import matplotlib.pyplot as pb
+
        m.plot()
-        pb.title('PCA initialisation')
+        pb.title("PCA initialisation")
        # m2.plot()
        # pb.title('PCA initialisation')

    if optimize:
-        m.optimize('scg', messages=verbose)
+        m.optimize("scg", messages=verbose)
        # m2.optimize('scg', messages=verbose)
        if plot:
            m.plot()
-            pb.title('After optimisation')
+            pb.title("After optimisation")
            # m2.plot()
            # pb.title('After optimisation')

    return m

+
 def gplvm_oil_100(optimize=True, verbose=1, plot=True):
    import GPy
    import pods
+
    data = pods.datasets.oil_100()
-    Y = data['X']
+    Y = data["X"]
    # create simple GP model
    kernel = GPy.kern.RBF(6, ARD=True) + GPy.kern.Bias(6)
    m = GPy.models.GPLVM(Y, 6, kernel=kernel)
-    m.data_labels = data['Y'].argmax(axis=1)
-    if optimize: m.optimize('scg', messages=verbose)
-    if plot: m.plot_latent(labels=m.data_labels)
+    m.data_labels = data["Y"].argmax(axis=1)
+    if optimize:
+        m.optimize("scg", messages=verbose)
+    if plot:
+        m.plot_latent(labels=m.data_labels)
    return m

-def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_inducing=15, max_iters=50):
+
+def sparse_gplvm_oil(
+    optimize=True, verbose=0, plot=True, N=100, Q=6, num_inducing=15, max_iters=50
+):
    import GPy
    import pods

    _np.random.seed(0)
    data = pods.datasets.oil()
-    Y = data['X'][:N]
+    Y = data["X"][:N]
    Y = Y - Y.mean(0)
    Y /= Y.std(0)
    # Create the model
    kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q)
    m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing)
-    m.data_labels = data['Y'][:N].argmax(axis=1)
+    m.data_labels = data["Y"][:N].argmax(axis=1)

-    if optimize: m.optimize('scg', messages=verbose, max_iters=max_iters)
+    if optimize:
+        m.optimize("scg", messages=verbose, max_iters=max_iters)
    if plot:
        m.plot_latent(labels=m.data_labels)
        m.kern.plot_ARD()
    return m

-def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=25, Q=4, sigma=.2):
+
+def swiss_roll(
+    optimize=True, verbose=1, plot=True, N=1000, num_inducing=25, Q=4, sigma=0.2
+):
    import GPy
    from pods.datasets import swiss_roll_generated
    from GPy.models import BayesianGPLVM

    data = swiss_roll_generated(num_samples=N, sigma=sigma)
-    Y = data['Y']
+    Y = data["Y"]
    Y -= Y.mean()
    Y /= Y.std()

-    t = data['t']
-    c = data['colors']
+    t = data["t"]
+    c = data["colors"]

    try:
        from sklearn.manifold.isomap import Isomap
+
        iso = Isomap().fit(Y)
        X = iso.embedding_
        if Q > 2:
@ -126,8 +143,9 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=25, Q=4
    if plot:
        import matplotlib.pyplot as plt
        from mpl_toolkits.mplot3d import Axes3D  # @UnusedImport
+
        fig = plt.figure("Swiss Roll Data")
-        ax = fig.add_subplot(121, projection='3d')
+        ax = fig.add_subplot(121, projection="3d")
        ax.scatter(*Y.T, c=c)
        ax.set_title("Swiss Roll")

@ -135,60 +153,96 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=25, Q=4
        ax.scatter(*X.T[:2], c=c)
        ax.set_title("BGPLVM init")

-    var = .5
-    S = (var * _np.ones_like(X) + _np.clip(_np.random.randn(N, Q) * var ** 2,
-                                         - (1 - var),
-                                         (1 - var))) + .001
+    var = 0.5
+    S = (
+        var * _np.ones_like(X)
+        + _np.clip(_np.random.randn(N, Q) * var ** 2, -(1 - var), (1 - var))
+    ) + 0.001
    Z = _np.random.permutation(X)[:num_inducing]

-    kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q, _np.exp(-2)) + GPy.kern.White(Q, _np.exp(-2))
+    kernel = (
+        GPy.kern.RBF(Q, ARD=True)
+        + GPy.kern.Bias(Q, _np.exp(-2))
+        + GPy.kern.White(Q, _np.exp(-2))
+    )

-    m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
+    m = BayesianGPLVM(
+        Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel
+    )
    m.data_colors = c
    m.data_t = t

    if optimize:
-        m.optimize('bfgs', messages=verbose, max_iters=2e3)
+        m.optimize("bfgs", messages=verbose, max_iters=2e3)

    if plot:
-        fig = plt.figure('fitted')
+        fig = plt.figure("fitted")
        ax = fig.add_subplot(111)
        s = m.input_sensitivity().argsort()[::-1][:2]
        ax.scatter(*m.X.mean.T[s], c=c)

    return m

-def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
+
+def bgplvm_oil(
+    optimize=True,
+    verbose=1,
+    plot=True,
+    N=200,
+    Q=7,
+    num_inducing=40,
+    max_iters=1000,
+    **k
+):
    import GPy
    from matplotlib import pyplot as plt
    import numpy as np
+
    _np.random.seed(0)
    try:
        import pods
+
        data = pods.datasets.oil()
    except ImportError:
        data = GPy.util.datasets.oil()

-
-    kernel = GPy.kern.RBF(Q, 1., 1. / _np.random.uniform(0, 1, (Q,)), ARD=True)  # + GPy.kern.Bias(Q, _np.exp(-2))
-    Y = data['X'][:N]
+    kernel = GPy.kern.RBF(
+        Q, 1.0, 1.0 / _np.random.uniform(0, 1, (Q,)), ARD=True
+    )  # + GPy.kern.Bias(Q, _np.exp(-2))
+    Y = data["X"][:N]
    m = GPy.models.BayesianGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k)
-    m.data_labels = data['Y'][:N].argmax(axis=1)
+    m.data_labels = data["Y"][:N].argmax(axis=1)

    if optimize:
-        m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05)
+        m.optimize("bfgs", messages=verbose, max_iters=max_iters, gtol=0.05)

    if plot:
        fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
        m.plot_latent(ax=latent_axes, labels=m.data_labels)
        data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0, :]))
-        lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1, :],  # @UnusedVariable
-            m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels)
-        input('Press enter to finish')
+        lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(
+            m.X.mean.values[0:1, :],  # @UnusedVariable
+            m,
+            data_show,
+            latent_axes=latent_axes,
+            sense_axes=sense_axes,
+            labels=m.data_labels,
+        )
+        input("Press enter to finish")
        plt.close(fig)
    return m

-def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
+
+def ssgplvm_oil(
+    optimize=True,
+    verbose=1,
+    plot=True,
+    N=200,
+    Q=7,
+    num_inducing=40,
+    max_iters=1000,
+    **k
+):
    import GPy
    from matplotlib import pyplot as plt
    import pods
@ -196,39 +250,57 @@ def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40
    _np.random.seed(0)
    data = pods.datasets.oil()

-    kernel = GPy.kern.RBF(Q, 1., 1. / _np.random.uniform(0, 1, (Q,)), ARD=True)  # + GPy.kern.Bias(Q, _np.exp(-2))
-    Y = data['X'][:N]
+    kernel = GPy.kern.RBF(
+        Q, 1.0, 1.0 / _np.random.uniform(0, 1, (Q,)), ARD=True
+    )  # + GPy.kern.Bias(Q, _np.exp(-2))
+    Y = data["X"][:N]
    m = GPy.models.SSGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k)
-    m.data_labels = data['Y'][:N].argmax(axis=1)
+    m.data_labels = data["Y"][:N].argmax(axis=1)

    if optimize:
-        m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05)
+        m.optimize("bfgs", messages=verbose, max_iters=max_iters, gtol=0.05)

    if plot:
        fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
        m.plot_latent(ax=latent_axes, labels=m.data_labels)
        data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0, :]))
-        lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1, :],  # @UnusedVariable
-            m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels)
-        input('Press enter to finish')
+        lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(
+            m.X.mean.values[0:1, :],  # @UnusedVariable
+            m,
+            data_show,
+            latent_axes=latent_axes,
+            sense_axes=sense_axes,
+            labels=m.data_labels,
+        )
+        input("Press enter to finish")
        plt.close(fig)
    return m

+
 def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
    """Simulate some data drawn from a matern covariance and a periodic exponential for use in MRD demos."""
    Q_signal = 4
    import GPy
    import numpy as np
+
    np.random.seed(3000)

-    k = GPy.kern.Matern32(Q_signal, 1., lengthscale=(np.random.uniform(1, 6, Q_signal)), ARD=1)
+    k = GPy.kern.Matern32(
+        Q_signal, 1.0, lengthscale=(np.random.uniform(1, 6, Q_signal)), ARD=1
+    )
    for i in range(Q_signal):
-        k += GPy.kern.PeriodicExponential(1, variance=1., active_dims=[i], period=3., lower=-2, upper=6)
+        k += GPy.kern.PeriodicExponential(
+            1, variance=1.0, active_dims=[i], period=3.0, lower=-2, upper=6
+        )
    t = np.c_[[np.linspace(-1, 5, N) for _ in range(Q_signal)]].T
    K = k.K(t)
-    s2, s1, s3, sS = np.random.multivariate_normal(np.zeros(K.shape[0]), K, size=(4))[:, :, None]
+    s2, s1, s3, sS = np.random.multivariate_normal(np.zeros(K.shape[0]), K, size=(4))[
+        :, :, None
+    ]

-    Y1, Y2, Y3, S1, S2, S3 = _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS)
+    Y1, Y2, Y3, S1, S2, S3 = _generate_high_dimensional_output(
+        D1, D2, D3, s1, s2, s3, sS
+    )

    slist = [sS, s1, s2, s3]
    slist_names = ["sS", "s1", "s2", "s3"]
@ -238,6 +310,7 @@ def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
        from matplotlib import pyplot as plt
        import matplotlib.cm as cm
        import itertools
+
        fig = plt.figure("MRD Simulation Data", figsize=(8, 6))
        fig.clf()
        ax = fig.add_subplot(2, 1, 1)
@ -247,13 +320,14 @@ def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
        ax.legend()
        for i, Y in enumerate(Ylist):
            ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i)
-            ax.imshow(Y, aspect='auto', cmap=cm.gray)  # @UndefinedVariable
+            ax.imshow(Y, aspect="auto", cmap=cm.gray)  # @UndefinedVariable
            ax.set_title("Y{}".format(i + 1))
        plt.draw()
        plt.tight_layout()

    return slist, [S1, S2, S3], Ylist

+
 def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
    """Simulate some data drawn from sine and cosine for use in demos of MRD"""
    _np.random.seed(1234)
@ -261,7 +335,7 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
    x = _np.linspace(0, 4 * _np.pi, N)[:, None]
    s1 = _np.vectorize(lambda x: _np.sin(x))
    s2 = _np.vectorize(lambda x: _np.cos(x))
-    s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x)))
+    s3 = _np.vectorize(lambda x: -_np.exp(-_np.cos(2 * x)))
    sS = _np.vectorize(lambda x: _np.cos(x))

    s1 = s1(x)
@ -269,12 +343,18 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
    s3 = s3(x)
    sS = sS(x)

-    s1 -= s1.mean(); s1 /= s1.std(0)
-    s2 -= s2.mean(); s2 /= s2.std(0)
-    s3 -= s3.mean(); s3 /= s3.std(0)
-    sS -= sS.mean(); sS /= sS.std(0)
+    s1 -= s1.mean()
+    s1 /= s1.std(0)
+    s2 -= s2.mean()
+    s2 /= s2.std(0)
+    s3 -= s3.mean()
+    s3 /= s3.std(0)
+    sS -= sS.mean()
+    sS /= sS.std(0)

-    Y1, Y2, Y3, S1, S2, S3 = _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS)
+    Y1, Y2, Y3, S1, S2, S3 = _generate_high_dimensional_output(
+        D1, D2, D3, s1, s2, s3, sS
+    )

    slist = [sS, s1, s2, s3]
    slist_names = ["sS", "s1", "s2", "s3"]
@ -284,6 +364,7 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
        from matplotlib import pyplot as plt
        import matplotlib.cm as cm
        import itertools
+
        fig = plt.figure("MRD Simulation Data", figsize=(8, 6))
        fig.clf()
        ax = fig.add_subplot(2, 1, 1)
@ -293,13 +374,14 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
        ax.legend()
        for i, Y in enumerate(Ylist):
            ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i)
-            ax.imshow(Y, aspect='auto', cmap=cm.gray)  # @UndefinedVariable
+            ax.imshow(Y, aspect="auto", cmap=cm.gray)  # @UndefinedVariable
            ax.set_title("Y{}".format(i + 1))
        plt.draw()
        plt.tight_layout()

    return slist, [S1, S2, S3], Ylist

+
 def _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS):
    S1 = _np.hstack([s1, sS])
    S2 = _np.hstack([sS])
@ -307,9 +389,9 @@ def _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS):
    Y1 = S1.dot(_np.random.randn(S1.shape[1], D1))
    Y2 = S2.dot(_np.random.randn(S2.shape[1], D2))
    Y3 = S3.dot(_np.random.randn(S3.shape[1], D3))
-    Y1 += .3 * _np.random.randn(*Y1.shape)
-    Y2 += .2 * _np.random.randn(*Y2.shape)
-    Y3 += .25 * _np.random.randn(*Y3.shape)
+    Y1 += 0.3 * _np.random.randn(*Y1.shape)
+    Y2 += 0.2 * _np.random.randn(*Y2.shape)
+    Y3 += 0.25 * _np.random.randn(*Y3.shape)
    Y1 -= Y1.mean(0)
    Y2 -= Y2.mean(0)
    Y3 -= Y3.mean(0)
@ -318,10 +400,10 @@ def _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS):
    Y3 /= Y3.std(0)
    return Y1, Y2, Y3, S1, S2, S3

-def bgplvm_simulation(optimize=True, verbose=1,
-                      plot=True, plot_sim=False,
-                      max_iters=2e4,
-                      ):
+
+def bgplvm_simulation(
+    optimize=True, verbose=1, plot=True, plot_sim=False, max_iters=2e4,
+):
    from GPy import kern
    from GPy.models import BayesianGPLVM

@ -331,22 +413,21 @@ def bgplvm_simulation(optimize=True, verbose=1,
    k = kern.Linear(Q, ARD=True)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
    # k = kern.RBF(Q, ARD=True, lengthscale=10.)
    m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
-    m.X.variance[:] = _np.random.uniform(0, .01, m.X.shape)
-    m.likelihood.variance = .1
+    m.X.variance[:] = _np.random.uniform(0, 0.01, m.X.shape)
+    m.likelihood.variance = 0.1

    if optimize:
        print("Optimizing model:")
-        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
-                   gtol=.05)
+        m.optimize("bfgs", messages=verbose, max_iters=max_iters, gtol=0.05)
    if plot:
        m.X.plot("BGPLVM Latent Space 1D")
        m.kern.plot_ARD()
    return m

-def gplvm_simulation(optimize=True, verbose=1,
-                      plot=True, plot_sim=False,
-                      max_iters=2e4,
-                      ):
+
+def gplvm_simulation(
+    optimize=True, verbose=1, plot=True, plot_sim=False, max_iters=2e4,
+):
    from GPy import kern
    from GPy.models import GPLVM

@ -356,20 +437,20 @@ def gplvm_simulation(optimize=True, verbose=1,
    k = kern.Linear(Q, ARD=True)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
    # k = kern.RBF(Q, ARD=True, lengthscale=10.)
    m = GPLVM(Y, Q, init="PCA", kernel=k)
-    m.likelihood.variance = .1
+    m.likelihood.variance = 0.1

    if optimize:
        print("Optimizing model:")
-        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
-                   gtol=.05)
+        m.optimize("bfgs", messages=verbose, max_iters=max_iters, gtol=0.05)
    if plot:
        m.X.plot("BGPLVM Latent Space 1D")
        m.kern.plot_ARD()
    return m
-def ssgplvm_simulation(optimize=True, verbose=1,
-                      plot=True, plot_sim=False,
-                      max_iters=2e4, useGPU=False
-                      ):
+
+
+def ssgplvm_simulation(
+    optimize=True, verbose=1, plot=True, plot_sim=False, max_iters=2e4, useGPU=False
+):
    from GPy import kern
    from GPy.models import SSGPLVM

@ -378,23 +459,30 @@ def ssgplvm_simulation(optimize=True, verbose=1,
    Y = Ylist[0]
    k = kern.Linear(Q, ARD=True)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
    # k = kern.RBF(Q, ARD=True, lengthscale=10.)
-    m = SSGPLVM(Y, Q, init="rand", num_inducing=num_inducing, kernel=k, group_spike=True)
-    m.X.variance[:] = _np.random.uniform(0, .01, m.X.shape)
-    m.likelihood.variance = .01
+    m = SSGPLVM(
+        Y, Q, init="rand", num_inducing=num_inducing, kernel=k, group_spike=True
+    )
+    m.X.variance[:] = _np.random.uniform(0, 0.01, m.X.shape)
+    m.likelihood.variance = 0.01

    if optimize:
        print("Optimizing model:")
-        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
-                   gtol=.05)
+        m.optimize("bfgs", messages=verbose, max_iters=max_iters, gtol=0.05)
    if plot:
        m.X.plot("SSGPLVM Latent Space 1D")
        m.kern.plot_ARD()
    return m

-def bgplvm_simulation_missing_data(optimize=True, verbose=1,
-                      plot=True, plot_sim=False,
-                      max_iters=2e4, percent_missing=.1, d=13,
-                      ):
+
+def bgplvm_simulation_missing_data(
+    optimize=True,
+    verbose=1,
+    plot=True,
+    plot_sim=False,
+    max_iters=2e4,
+    percent_missing=0.1,
+    d=13,
+):
    from GPy import kern
    from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch

@ -403,28 +491,42 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
    Y = Ylist[0]
    k = kern.Linear(Q, ARD=True)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)

-    inan = _np.random.binomial(1, percent_missing, size=Y.shape).astype(bool)  # 80% missing data
+    inan = _np.random.binomial(1, percent_missing, size=Y.shape).astype(
+        bool
+    )  # 80% missing data
    Ymissing = Y.copy()
    Ymissing[inan] = _np.nan

-    m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
-                      kernel=k, missing_data=True)
+    m = BayesianGPLVMMiniBatch(
+        Ymissing,
+        Q,
+        init="random",
+        num_inducing=num_inducing,
+        kernel=k,
+        missing_data=True,
+    )

    m.Yreal = Y

    if optimize:
        print("Optimizing model:")
-        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
-                   gtol=.05)
+        m.optimize("bfgs", messages=verbose, max_iters=max_iters, gtol=0.05)
    if plot:
        m.X.plot("BGPLVM Latent Space 1D")
        m.kern.plot_ARD()
    return m

-def bgplvm_simulation_missing_data_stochastics(optimize=True, verbose=1,
-                      plot=True, plot_sim=False,
-                      max_iters=2e4, percent_missing=.1, d=13, batchsize=2,
-                      ):
+
+def bgplvm_simulation_missing_data_stochastics(
+    optimize=True,
+    verbose=1,
+    plot=True,
+    plot_sim=False,
+    max_iters=2e4,
+    percent_missing=0.1,
+    d=13,
+    batchsize=2,
+):
    from GPy import kern
    from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch

@ -433,19 +535,28 @@ def bgplvm_simulation_missing_data_stochastics(optimize=True, verbose=1,
    Y = Ylist[0]
    k = kern.Linear(Q, ARD=True)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)

-    inan = _np.random.binomial(1, percent_missing, size=Y.shape).astype(bool)  # 80% missing data
+    inan = _np.random.binomial(1, percent_missing, size=Y.shape).astype(
+        bool
+    )  # 80% missing data
    Ymissing = Y.copy()
    Ymissing[inan] = _np.nan

-    m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
-                      kernel=k, missing_data=True, stochastic=True, batchsize=batchsize)
+    m = BayesianGPLVMMiniBatch(
+        Ymissing,
+        Q,
+        init="random",
+        num_inducing=num_inducing,
+        kernel=k,
+        missing_data=True,
+        stochastic=True,
+        batchsize=batchsize,
+    )

    m.Yreal = Y

    if optimize:
        print("Optimizing model:")
-        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
-                   gtol=.05)
+        m.optimize("bfgs", messages=verbose, max_iters=max_iters, gtol=0.05)
    if plot:
        m.X.plot("BGPLVM Latent Space 1D")
        m.kern.plot_ARD()
@ -460,9 +571,17 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim)

    k = kern.Linear(Q, ARD=True) + kern.White(Q, variance=1e-4)
-    m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernel=k, initx="PCA_concat", initz='permute', **kw)
+    m = MRD(
+        Ylist,
+        input_dim=Q,
+        num_inducing=num_inducing,
+        kernel=k,
+        initx="PCA_concat",
+        initz="permute",
+        **kw
+    )

-    m['.*noise'] = [Y.var() / 40. for Y in Ylist]
+    m[".*noise"] = [Y.var() / 40.0 for Y in Ylist]

    if optimize:
        print("Optimizing Model:")
@ -472,7 +591,10 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
        m.plot_scales()
    return m

-def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
+
+def mrd_simulation_missing_data(
+    optimize=True, verbose=True, plot=True, plot_sim=True, **kw
+):
    from GPy import kern
    from GPy.models import MRD

@ -483,29 +605,37 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim
    inanlist = []

    for Y in Ylist:
-        inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool)
+        inan = _np.random.binomial(1, 0.6, size=Y.shape).astype(bool)
        inanlist.append(inan)
        Y[inan] = _np.nan

-    m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing,
-            kernel=k, inference_method=None,
-            initx="random", initz='permute', **kw)
+    m = MRD(
+        Ylist,
+        input_dim=Q,
+        num_inducing=num_inducing,
+        kernel=k,
+        inference_method=None,
+        initx="random",
+        initz="permute",
+        **kw
+    )

    if optimize:
        print("Optimizing Model:")
-        m.optimize('bfgs', messages=verbose, max_iters=8e3, gtol=.1)
+        m.optimize("bfgs", messages=verbose, max_iters=8e3, gtol=0.1)
    if plot:
        m.X.plot("MRD Latent Space 1D")
        m.plot_scales()
    return m

+
 def brendan_faces(optimize=True, verbose=True, plot=True):
    import GPy
    import pods

    data = pods.datasets.brendan_faces()
    Q = 2
-    Y = data['Y']
+    Y = data["Y"]
    Yn = Y - Y.mean()
    Yn /= Yn.std()

@ -513,39 +643,56 @@ def brendan_faces(optimize=True, verbose=True, plot=True):

    # optimize

-    if optimize: m.optimize('bfgs', messages=verbose, max_iters=1000)
+    if optimize:
+        m.optimize("bfgs", messages=verbose, max_iters=1000)

    if plot:
        ax = m.plot_latent(which_indices=(0, 1))
        y = m.Y[0, :]
-        data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
-        lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax)
-        input('Press enter to finish')
+        data_show = GPy.plotting.matplot_dep.visualize.image_show(
+            y[None, :],
+            dimensions=(20, 28),
+            transpose=True,
+            order="F",
+            invert=False,
+            scale=False,
+        )
+        lvm = GPy.plotting.matplot_dep.visualize.lvm(
+            m.X.mean[0, :].copy(), m, data_show, ax
+        )
+        input("Press enter to finish")

    return m

+
 def olivetti_faces(optimize=True, verbose=True, plot=True):
    import GPy
    import pods

    data = pods.datasets.olivetti_faces()
    Q = 2
-    Y = data['Y']
+    Y = data["Y"]
    Yn = Y - Y.mean()
    Yn /= Yn.std()

    m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=20)

-    if optimize: m.optimize('bfgs', messages=verbose, max_iters=1000)
+    if optimize:
+        m.optimize("bfgs", messages=verbose, max_iters=1000)
    if plot:
        ax = m.plot_latent(which_indices=(0, 1))
        y = m.Y[0, :]
-        data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
-        lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax)
-        input('Press enter to finish')
+        data_show = GPy.plotting.matplot_dep.visualize.image_show(
+            y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False
+        )
+        lvm = GPy.plotting.matplot_dep.visualize.lvm(
+            m.X.mean[0, :].copy(), m, data_show, ax
+        )
+        input("Press enter to finish")

    return m

+
 def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=True):
    import GPy
    import pods
@ -553,15 +700,18 @@ def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=Tru
    data = pods.datasets.osu_run1()
    # optimize
    if range == None:
-        Y = data['Y'].copy()
+        Y = data["Y"].copy()
    else:
-        Y = data['Y'][range[0]:range[1], :].copy()
+        Y = data["Y"][range[0] : range[1], :].copy()
    if plot:
        y = Y[0, :]
-        data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
+        data_show = GPy.plotting.matplot_dep.visualize.stick_show(
+            y[None, :], connect=data["connect"]
+        )
        GPy.plotting.matplot_dep.visualize.data_play(Y, data_show, frame_rate)
    return Y

+
 def stick(kernel=None, optimize=True, verbose=True, plot=True):
    from matplotlib import pyplot as plt
    import GPy
@ -569,19 +719,25 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True):

    data = pods.datasets.osu_run1()
    # optimize
-    m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
-    if optimize: m.optimize('bfgs', messages=verbose, max_f_eval=10000)
+    m = GPy.models.GPLVM(data["Y"], 2, kernel=kernel)
+    if optimize:
+        m.optimize("bfgs", messages=verbose, max_f_eval=10000)
    if plot:
        plt.clf
        ax = m.plot_latent()
        y = m.Y[0, :]
-        data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
-        lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[:1, :].copy(), m, data_show, latent_axes=ax)
-        input('Press enter to finish')
+        data_show = GPy.plotting.matplot_dep.visualize.stick_show(
+            y[None, :], connect=data["connect"]
+        )
+        lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(
+            m.X[:1, :].copy(), m, data_show, latent_axes=ax
+        )
+        input("Press enter to finish")
        lvm_visualizer.close()
        data_show.close()
    return m

+
 def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
    from matplotlib import pyplot as plt
    import GPy
@ -589,19 +745,23 @@ def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):

    data = pods.datasets.osu_run1()
    # optimize
-    mapping = GPy.mappings.Linear(data['Y'].shape[1], 2)
-    m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
-    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    mapping = GPy.mappings.Linear(data["Y"].shape[1], 2)
+    m = GPy.models.BCGPLVM(data["Y"], 2, kernel=kernel, mapping=mapping)
+    if optimize:
+        m.optimize(messages=verbose, max_f_eval=10000)
    if plot and GPy.plotting.matplot_dep.visualize.visual_available:
        plt.clf
        ax = m.plot_latent()
        y = m.likelihood.Y[0, :]
-        data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
+        data_show = GPy.plotting.matplot_dep.visualize.stick_show(
+            y[None, :], connect=data["connect"]
+        )
        GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
-        input('Press enter to finish')
+        input("Press enter to finish")

    return m

+
 def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
    from matplotlib import pyplot as plt
    import GPy
@ -609,20 +769,24 @@ def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):

    data = pods.datasets.osu_run1()
    # optimize
-    back_kernel = GPy.kern.RBF(data['Y'].shape[1], lengthscale=5.)
-    mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel)
-    m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
-    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    back_kernel = GPy.kern.RBF(data["Y"].shape[1], lengthscale=5.0)
+    mapping = GPy.mappings.Kernel(X=data["Y"], output_dim=2, kernel=back_kernel)
+    m = GPy.models.BCGPLVM(data["Y"], 2, kernel=kernel, mapping=mapping)
+    if optimize:
+        m.optimize(messages=verbose, max_f_eval=10000)
    if plot and GPy.plotting.matplot_dep.visualize.visual_available:
        plt.clf
        ax = m.plot_latent()
        y = m.likelihood.Y[0, :]
-        data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
+        data_show = GPy.plotting.matplot_dep.visualize.stick_show(
+            y[None, :], connect=data["connect"]
+        )
        GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
        # input('Press enter to finish')

    return m

+
 def robot_wireless(optimize=True, verbose=True, plot=True):
    from matplotlib import pyplot as plt
    import GPy
@ -630,13 +794,15 @@ def robot_wireless(optimize=True, verbose=True, plot=True):

    data = pods.datasets.robot_wireless()
    # optimize
-    m = GPy.models.BayesianGPLVM(data['Y'], 4, num_inducing=25)
-    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    m = GPy.models.BayesianGPLVM(data["Y"], 4, num_inducing=25)
+    if optimize:
+        m.optimize(messages=verbose, max_f_eval=10000)
    if plot:
        m.plot_latent()

    return m

+
 def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
    """Interactive visualisation of the Stick Man data from Ohio State University with the Bayesian GPLVM."""
    from GPy.models import BayesianGPLVM
@ -647,15 +813,16 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):

    data = pods.datasets.osu_run1()
    Q = 6
-    kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True)
-    m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
+    kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(0.5, Q), ARD=True)
+    m = BayesianGPLVM(data["Y"], Q, init="PCA", num_inducing=20, kernel=kernel)

    m.data = data
    m.likelihood.variance = 0.001

    # optimize
    try:
-        if optimize: m.optimize('bfgs', messages=verbose, max_iters=5e3, bfgs_factor=10)
+        if optimize:
+            m.optimize("bfgs", messages=verbose, max_iters=5e3, bfgs_factor=10)
    except KeyboardInterrupt:
        print("Keyboard interrupt, continuing to plot and return")

@ -664,44 +831,68 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
        plt.sca(latent_axes)
        m.plot_latent(ax=latent_axes)
        y = m.Y[:1, :].copy()
-        data_show = GPy.plotting.matplot_dep.visualize.stick_show(y, connect=data['connect'])
-        dim_select = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
+        data_show = GPy.plotting.matplot_dep.visualize.stick_show(
+            y, connect=data["connect"]
+        )
+        dim_select = GPy.plotting.matplot_dep.visualize.lvm_dimselect(
+            m.X.mean[:1, :].copy(),
+            m,
+            data_show,
+            latent_axes=latent_axes,
+            sense_axes=sense_axes,
+        )
        fig.canvas.draw()
        # Canvas.show doesn't work on OSX.
-        #fig.canvas.show()
-        input('Press enter to finish')
+        # fig.canvas.show()
+        input("Press enter to finish")

    return m


-def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose=True, plot=True):
+def cmu_mocap(
+    subject="35", motion=["01"], in_place=True, optimize=True, verbose=True, plot=True
+):
+    import matplotlib.pyplot as plt
    import GPy
    import pods

    data = pods.datasets.cmu_mocap(subject, motion)
    if in_place:
        # Make figure move in place.
-        data['Y'][:, 0:3] = 0.0
-    Y = data['Y']
-    Y_mean = Y.mean(0)
-    Y_std = Y.std(0)
-    m = GPy.models.GPLVM((Y - Y_mean) / Y_std, 2)
+        data["Y"][:, 0:3] = 0.0
+    Y = data["Y"]
+    m = GPy.models.GPLVM(Y, 2, normalizer=True)

-    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if optimize:
+        m.optimize(messages=verbose, max_f_eval=10000)
    if plot:
-        ax = m.plot_latent()
+        fig, _ = plt.subplots(figsize=(8, 5))
+        latent_axes = fig.add_subplot(131)
+        sense_axes = fig.add_subplot(132)
+        viz_axes = fig.add_subplot(133, projection="3d")
+
+        m.plot_latent(ax=latent_axes)
+        latent_axes.set_aspect("equal")
+
        y = m.Y[0, :]
-        data_show = GPy.plotting.matplot_dep.visualize.skeleton_show(y[None, :], data['skel'])
-        lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0].copy(), m, data_show, latent_axes=ax)
-        input('Press enter to finish')
+        data_show = GPy.plotting.matplot_dep.visualize.skeleton_show(
+            y[None, :], data["skel"], viz_axes
+        )
+
+        lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(
+            m.X[0].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes
+        )
+        input("Press enter to finish")
        lvm_visualizer.close()
        data_show.close()

    return m

+
 def ssgplvm_simulation_linear():
    import numpy as np
    import GPy
+
    N, D, Q = 1000, 20, 5
    pi = 0.2

@ -712,7 +903,7 @@ def ssgplvm_simulation_linear():
            if dies[q] < pi:
                x[q] = np.random.randn()
            else:
-                x[q] = 0.
+                x[q] = 0.0
        return x

    Y = np.empty((N, D))
@ -722,4 +913,3 @@ def ssgplvm_simulation_linear():
        X[n] = sample_X(Q, pi)
        w = np.random.randn(D, Q)
        Y[n] = np.dot(w, X[n])
-
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@ -3,39 +3,41 @@

 import GPy
 import numpy as np
-from GPy.util import datasets
+
+MPL_AVAILABLE = True
 try:
    import matplotlib.pyplot as plt
-except:
-    pass
+except ImportError:
+    MPL_AVAILABLE = False
+

 def student_t_approx(optimize=True, plot=True):
    """
    Example of regressing with a student t likelihood using Laplace
    """
    real_std = 0.1
-    #Start a function, any function
-    X = np.linspace(0.0, np.pi*2, 100)[:, None]
-    Y = np.sin(X) + np.random.randn(*X.shape)*real_std
-    Y = Y/Y.max()
+    # Start a function, any function
+    X = np.linspace(0.0, np.pi * 2, 100)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape) * real_std
+    Y = Y / Y.max()
    Yc = Y.copy()

-    X_full = np.linspace(0.0, np.pi*2, 500)[:, None]
+    X_full = np.linspace(0.0, np.pi * 2, 500)[:, None]
    Y_full = np.sin(X_full)
-    Y_full = Y_full/Y_full.max()
+    Y_full = Y_full / Y_full.max()

-    #Slightly noisy data
+    # Slightly noisy data
    Yc[75:80] += 1

-    #Very noisy data
-    #Yc[10] += 100
-    #Yc[25] += 10
-    #Yc[23] += 10
-    #Yc[26] += 1000
-    #Yc[24] += 10
-    #Yc = Yc/Yc.max()
+    # Very noisy data
+    # Yc[10] += 100
+    # Yc[25] += 10
+    # Yc[23] += 10
+    # Yc[26] += 1000
+    # Yc[24] += 10
+    # Yc = Yc/Yc.max()

-    #Add student t random noise to datapoints
+    # Add student t random noise to datapoints
    deg_free = 1
    print("Real noise: ", real_std)
    initial_var_guess = 0.5
@ -47,45 +49,50 @@ def student_t_approx(optimize=True, plot=True):
    kernel3 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
    kernel4 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])

-    #Gaussian GP model on clean data
+    # Gaussian GP model on clean data
    m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
    # optimize
-    m1['.*white'].constrain_fixed(1e-5)
+    m1[".*white"].constrain_fixed(1e-5)
    m1.randomize()

-    #Gaussian GP model on corrupt data
+    # Gaussian GP model on corrupt data
    m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2)
-    m2['.*white'].constrain_fixed(1e-5)
+    m2[".*white"].constrain_fixed(1e-5)
    m2.randomize()

-    #Student t GP model on clean data
+    # Student t GP model on clean data
    t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
    laplace_inf = GPy.inference.latent_function_inference.Laplace()
-    m3 = GPy.core.GP(X, Y.copy(), kernel3, likelihood=t_distribution, inference_method=laplace_inf)
-    m3['.*t_scale2'].constrain_bounded(1e-6, 10.)
-    m3['.*white'].constrain_fixed(1e-5)
+    m3 = GPy.core.GP(
+        X, Y.copy(), kernel3, likelihood=t_distribution, inference_method=laplace_inf
+    )
+    m3[".*t_scale2"].constrain_bounded(1e-6, 10.0)
+    m3[".*white"].constrain_fixed(1e-5)
    m3.randomize()

-    #Student t GP model on corrupt data
+    # Student t GP model on corrupt data
    t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
    laplace_inf = GPy.inference.latent_function_inference.Laplace()
-    m4 = GPy.core.GP(X, Yc.copy(), kernel4, likelihood=t_distribution, inference_method=laplace_inf)
-    m4['.*t_scale2'].constrain_bounded(1e-6, 10.)
-    m4['.*white'].constrain_fixed(1e-5)
+    m4 = GPy.core.GP(
+        X, Yc.copy(), kernel4, likelihood=t_distribution, inference_method=laplace_inf
+    )
+    m4[".*t_scale2"].constrain_bounded(1e-6, 10.0)
+    m4[".*white"].constrain_fixed(1e-5)
    m4.randomize()
    print(m4)
-    debug=True
+    debug = True
    if debug:
        m4.optimize(messages=1)
        from matplotlib import pyplot as pb
+
        pb.plot(m4.X, m4.inference_method.f_hat)
-        pb.plot(m4.X, m4.Y, 'rx')
+        pb.plot(m4.X, m4.Y, "rx")
        m4.plot()
        print(m4)
        return m4

    if optimize:
-        optimizer='scg'
+        optimizer = "scg"
        print("Clean Gaussian")
        m1.optimize(optimizer, messages=1)
        print("Corrupt Gaussian")
@ -95,79 +102,98 @@ def student_t_approx(optimize=True, plot=True):
        print("Corrupt student t")
        m4.optimize(optimizer, messages=1)

-    if plot:
+    if MPL_AVAILABLE and plot:
        plt.figure(1)
-        plt.suptitle('Gaussian likelihood')
+        plt.suptitle("Gaussian likelihood")
        ax = plt.subplot(211)
        m1.plot(ax=ax)
        plt.plot(X_full, Y_full)
        plt.ylim(-1.5, 1.5)
-        plt.title('Gaussian clean')
+        plt.title("Gaussian clean")

        ax = plt.subplot(212)
        m2.plot(ax=ax)
        plt.plot(X_full, Y_full)
        plt.ylim(-1.5, 1.5)
-        plt.title('Gaussian corrupt')
+        plt.title("Gaussian corrupt")

        plt.figure(2)
-        plt.suptitle('Student-t likelihood')
+        plt.suptitle("Student-t likelihood")
        ax = plt.subplot(211)
        m3.plot(ax=ax)
        plt.plot(X_full, Y_full)
        plt.ylim(-1.5, 1.5)
-        plt.title('Student-t rasm clean')
+        plt.title("Student-t rasm clean")

        ax = plt.subplot(212)
        m4.plot(ax=ax)
        plt.plot(X_full, Y_full)
        plt.ylim(-1.5, 1.5)
-        plt.title('Student-t rasm corrupt')
+        plt.title("Student-t rasm corrupt")

    return m1, m2, m3, m4

+
 def boston_example(optimize=True, plot=True):
    raise NotImplementedError("Needs updating")
    import sklearn
    from sklearn.cross_validation import KFold
-    optimizer='bfgs'
-    messages=0
-    data = datasets.boston_housing()
+
+    optimizer = "bfgs"
+    messages = 0
+    try:
+        import pods
+    except ImportError:
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
+        return
+    data = pods.datasets.boston_housing()
    degrees_freedoms = [3, 5, 8, 10]
-    X = data['X'].copy()
-    Y = data['Y'].copy()
-    X = X-X.mean(axis=0)
-    X = X/X.std(axis=0)
-    Y = Y-Y.mean()
-    Y = Y/Y.std()
+    X = data["X"].copy()
+    Y = data["Y"].copy()
+    X = X - X.mean(axis=0)
+    X = X / X.std(axis=0)
+    Y = Y - Y.mean()
+    Y = Y / Y.std()
    num_folds = 10
    kf = KFold(len(Y), n_folds=num_folds, indices=True)
-    num_models = len(degrees_freedoms) + 3 #3 for baseline, gaussian, gaussian laplace approx
+    num_models = (
+        len(degrees_freedoms) + 3
+    )  # 3 for baseline, gaussian, gaussian laplace approx
    score_folds = np.zeros((num_models, num_folds))
    pred_density = score_folds.copy()

    def rmse(Y, Ystar):
-        return np.sqrt(np.mean((Y-Ystar)**2))
+        return np.sqrt(np.mean((Y - Ystar) ** 2))

    for n, (train, test) in enumerate(kf):
        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
        print("Fold {}".format(n))

-        noise = 1e-1 #np.exp(-2)
+        noise = 1e-1  # np.exp(-2)
        rbf_len = 0.5
        data_axis_plot = 4
-        kernelstu = GPy.kern.RBF(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
-        kernelgp = GPy.kern.RBF(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+        kernelstu = (
+            GPy.kern.RBF(X.shape[1])
+            + GPy.kern.white(X.shape[1])
+            + GPy.kern.bias(X.shape[1])
+        )
+        kernelgp = (
+            GPy.kern.RBF(X.shape[1])
+            + GPy.kern.white(X.shape[1])
+            + GPy.kern.bias(X.shape[1])
+        )

-        #Baseline
+        # Baseline
        score_folds[0, n] = rmse(Y_test, np.mean(Y_train))

-        #Gaussian GP
+        # Gaussian GP
        print("Gauss GP")
-        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
-        mgp.constrain_fixed('.*white', 1e-5)
-        mgp['.*len'] = rbf_len
-        mgp['.*noise'] = noise
+        mgp = GPy.models.GPRegression(
+            X_train.copy(), Y_train.copy(), kernel=kernelgp.copy()
+        )
+        mgp.constrain_fixed(".*white", 1e-5)
+        mgp[".*len"] = rbf_len
+        mgp[".*noise"] = noise
        print(mgp)
        if optimize:
            mgp.optimize(optimizer=optimizer, messages=messages)
@ -179,13 +205,20 @@ def boston_example(optimize=True, plot=True):

        print("Gaussian Laplace GP")
        N, D = Y_train.shape
-        g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
+        g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(
+            variance=noise, N=N, D=D
+        )
        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
-        mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=g_likelihood)
-        mg.constrain_positive('noise_variance')
-        mg.constrain_fixed('.*white', 1e-5)
-        mg['rbf_len'] = rbf_len
-        mg['noise'] = noise
+        mg = GPy.models.GPRegression(
+            X_train.copy(),
+            Y_train.copy(),
+            kernel=kernelstu.copy(),
+            likelihood=g_likelihood,
+        )
+        mg.constrain_positive("noise_variance")
+        mg.constrain_fixed(".*white", 1e-5)
+        mg["rbf_len"] = rbf_len
+        mg["noise"] = noise
        print(mg)
        if optimize:
            mg.optimize(optimizer=optimizer, messages=messages)
@ -196,101 +229,113 @@ def boston_example(optimize=True, plot=True):
        print(mg)

        for stu_num, df in enumerate(degrees_freedoms):
-            #Student T
+            # Student T
            print("Student-T GP {}df".format(df))
-            t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise)
+            t_distribution = GPy.likelihoods.noise_model_constructors.student_t(
+                deg_free=df, sigma2=noise
+            )
            stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
-            mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
-            mstu_t.constrain_fixed('.*white', 1e-5)
-            mstu_t.constrain_bounded('.*t_scale2', 0.0001, 1000)
-            mstu_t['rbf_len'] = rbf_len
-            mstu_t['.*t_scale2'] = noise
+            mstu_t = GPy.models.GPRegression(
+                X_train.copy(),
+                Y_train.copy(),
+                kernel=kernelstu.copy(),
+                likelihood=stu_t_likelihood,
+            )
+            mstu_t.constrain_fixed(".*white", 1e-5)
+            mstu_t.constrain_bounded(".*t_scale2", 0.0001, 1000)
+            mstu_t["rbf_len"] = rbf_len
+            mstu_t[".*t_scale2"] = noise
            print(mstu_t)
            if optimize:
                mstu_t.optimize(optimizer=optimizer, messages=messages)
            Y_test_pred = mstu_t.predict(X_test)
-            score_folds[3+stu_num, n] = rmse(Y_test, Y_test_pred[0])
-            pred_density[3+stu_num, n] = np.mean(mstu_t.log_predictive_density(X_test, Y_test))
+            score_folds[3 + stu_num, n] = rmse(Y_test, Y_test_pred[0])
+            pred_density[3 + stu_num, n] = np.mean(
+                mstu_t.log_predictive_density(X_test, Y_test)
+            )
            print(pred_density)
            print(mstu_t)

-    if plot:
+    if MPL_AVAILABLE and plot:
        plt.figure()
        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-        plt.title('GP gauss')
+        plt.scatter(X_test[:, data_axis_plot], Y_test, c="r", marker="x")
+        plt.title("GP gauss")

        plt.figure()
        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-        plt.title('Lap gauss')
+        plt.scatter(X_test[:, data_axis_plot], Y_test, c="r", marker="x")
+        plt.title("Lap gauss")

        plt.figure()
        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-        plt.title('Stu t {}df'.format(df))
+        plt.scatter(X_test[:, data_axis_plot], Y_test, c="r", marker="x")
+        plt.title("Stu t {}df".format(df))

    print("Average scores: {}".format(np.mean(score_folds, 1)))
    print("Average pred density: {}".format(np.mean(pred_density, 1)))

-    if plot:
-        #Plotting
-        stu_t_legends = ['Student T, df={}'.format(df) for df in degrees_freedoms]
-        legends = ['Baseline', 'Gaussian', 'Laplace Approx Gaussian'] + stu_t_legends
+    if MPL_AVAILABLE and plot:
+        # Plotting
+        stu_t_legends = ["Student T, df={}".format(df) for df in degrees_freedoms]
+        legends = ["Baseline", "Gaussian", "Laplace Approx Gaussian"] + stu_t_legends

-        #Plot boxplots for RMSE density
+        # Plot boxplots for RMSE density
        fig = plt.figure()
-        ax=fig.add_subplot(111)
-        plt.title('RMSE')
-        bp = ax.boxplot(score_folds.T, notch=0, sym='+', vert=1, whis=1.5)
-        plt.setp(bp['boxes'], color='black')
-        plt.setp(bp['whiskers'], color='black')
-        plt.setp(bp['fliers'], color='red', marker='+')
+        ax = fig.add_subplot(111)
+        plt.title("RMSE")
+        bp = ax.boxplot(score_folds.T, notch=0, sym="+", vert=1, whis=1.5)
+        plt.setp(bp["boxes"], color="black")
+        plt.setp(bp["whiskers"], color="black")
+        plt.setp(bp["fliers"], color="red", marker="+")
        xtickNames = plt.setp(ax, xticklabels=legends)
        plt.setp(xtickNames, rotation=45, fontsize=8)
-        ax.set_ylabel('RMSE')
-        ax.set_xlabel('Distribution')
-        #Make grid and put it below boxes
-        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
-                alpha=0.5)
+        ax.set_ylabel("RMSE")
+        ax.set_xlabel("Distribution")
+        # Make grid and put it below boxes
+        ax.yaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
        ax.set_axisbelow(True)

-        #Plot boxplots for predictive density
+        # Plot boxplots for predictive density
        fig = plt.figure()
-        ax=fig.add_subplot(111)
-        plt.title('Predictive density')
-        bp = ax.boxplot(pred_density[1:,:].T, notch=0, sym='+', vert=1, whis=1.5)
-        plt.setp(bp['boxes'], color='black')
-        plt.setp(bp['whiskers'], color='black')
-        plt.setp(bp['fliers'], color='red', marker='+')
+        ax = fig.add_subplot(111)
+        plt.title("Predictive density")
+        bp = ax.boxplot(pred_density[1:, :].T, notch=0, sym="+", vert=1, whis=1.5)
+        plt.setp(bp["boxes"], color="black")
+        plt.setp(bp["whiskers"], color="black")
+        plt.setp(bp["fliers"], color="red", marker="+")
        xtickNames = plt.setp(ax, xticklabels=legends[1:])
        plt.setp(xtickNames, rotation=45, fontsize=8)
-        ax.set_ylabel('Mean Log probability P(Y*|Y)')
-        ax.set_xlabel('Distribution')
-        #Make grid and put it below boxes
-        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
-                alpha=0.5)
+        ax.set_ylabel("Mean Log probability P(Y*|Y)")
+        ax.set_xlabel("Distribution")
+        # Make grid and put it below boxes
+        ax.yaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
        ax.set_axisbelow(True)
    return mstu_t

-#def precipitation_example():
-    #import sklearn
-    #from sklearn.cross_validation import KFold
-    #data = datasets.boston_housing()
-    #X = data['X'].copy()
-    #Y = data['Y'].copy()
-    #X = X-X.mean(axis=0)
-    #X = X/X.std(axis=0)
-    #Y = Y-Y.mean()
-    #Y = Y/Y.std()
-    #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
-    #num_folds = 10
-    #kf = KFold(len(Y), n_folds=num_folds, indices=True)
-    #score_folds = np.zeros((4, num_folds))
-    #def rmse(Y, Ystar):
-        #return np.sqrt(np.mean((Y-Ystar)**2))
-    ##for train, test in kf:
-    #for n, (train, test) in enumerate(kf):
-        #X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
-        #print "Fold {}".format(n)

+# def precipitation_example():
+#     import sklearn
+#     from sklearn.cross_validation import KFold
+
+#     data = datasets.boston_housing()
+#     X = data["X"].copy()
+#     Y = data["Y"].copy()
+#     X = X - X.mean(axis=0)
+#     X = X / X.std(axis=0)
+#     Y = Y - Y.mean()
+#     Y = Y / Y.std()
+#     import ipdb
+
+#     ipdb.set_trace()  # XXX BREAKPOINT
+#     num_folds = 10
+#     kf = KFold(len(Y), n_folds=num_folds, indices=True)
+#     score_folds = np.zeros((4, num_folds))
+
+#     def rmse(Y, Ystar):
+#         return np.sqrt(np.mean((Y - Ystar) ** 2))
+
+#     for train, test in kf:
+#         for n, (train, test) in enumerate(kf):
+#             X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
+#             print("Fold {}".format(n))
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -4,95 +4,120 @@
 """
 Gaussian Processes regression examples
 """
+MPL_AVAILABLE = True
 try:
-    from matplotlib import pyplot as pb
-except:
-    pass
+    import matplotlib.pyplot as plt
+except ImportError:
+    MPL_AVAILABLE = False
+
 import numpy as np
 import GPy

+
 def olympic_marathon_men(optimize=True, plot=True):
    """Run a standard Gaussian process regression on the Olympic marathon data."""
-    try:import pods
+    try:
+        import pods
    except ImportError:
-        print('pods unavailable, see https://github.com/sods/ods for example datasets')
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
        return
    data = pods.datasets.olympic_marathon_men()

    # create simple GP Model
-    m = GPy.models.GPRegression(data['X'], data['Y'])
+    m = GPy.models.GPRegression(data["X"], data["Y"])

    # set the lengthscale to be something sensible (defaults to 1)
-    m.kern.lengthscale = 10.
+    m.kern.lengthscale = 10.0

    if optimize:
-        m.optimize('bfgs', max_iters=200)
-    if plot:
+        m.optimize("bfgs", max_iters=200)
+    if MPL_AVAILABLE and plot:
        m.plot(plot_limits=(1850, 2050))

    return m

+
 def coregionalization_toy(optimize=True, plot=True):
    """
    A simple demonstration of coregionalization on two sinusoidal functions.
    """
-    #build a design matrix with a column of integers indicating the output
+    # build a design matrix with a column of integers indicating the output
    X1 = np.random.rand(50, 1) * 8
    X2 = np.random.rand(30, 1) * 5

-    #build a suitable set of observed variables
+    # build a suitable set of observed variables
    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
-    Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.
+    Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.0

-    m = GPy.models.GPCoregionalizedRegression(X_list=[X1,X2], Y_list=[Y1,Y2])
+    m = GPy.models.GPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2])

    if optimize:
-        m.optimize('bfgs', max_iters=100)
+        m.optimize("bfgs", max_iters=100)

-    if plot:
-        slices = GPy.util.multioutput.get_slices([X1,X2])
-        m.plot(fixed_inputs=[(1,0)],which_data_rows=slices[0],Y_metadata={'output_index':0})
-        m.plot(fixed_inputs=[(1,1)],which_data_rows=slices[1],Y_metadata={'output_index':1},ax=pb.gca())
+    if MPL_AVAILABLE and plot:
+        slices = GPy.util.multioutput.get_slices([X1, X2])
+        m.plot(
+            fixed_inputs=[(1, 0)],
+            which_data_rows=slices[0],
+            Y_metadata={"output_index": 0},
+        )
+        m.plot(
+            fixed_inputs=[(1, 1)],
+            which_data_rows=slices[1],
+            Y_metadata={"output_index": 1},
+            ax=plt.gca(),
+        )
    return m

+
 def coregionalization_sparse(optimize=True, plot=True):
-    """
-    A simple demonstration of coregionalization on two sinusoidal functions using sparse approximations.
-    """
-    #build a design matrix with a column of integers indicating the output
+    """A simple demonstration of coregionalization on two sinusoidal
+    functions using sparse approximations. """
+    # build a design matrix with a column of integers indicating the output
    X1 = np.random.rand(50, 1) * 8
    X2 = np.random.rand(30, 1) * 5

-    #build a suitable set of observed variables
+    # build a suitable set of observed variables
    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
-    Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.
+    Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.0

-    m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1,X2], Y_list=[Y1,Y2])
+    m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2])

    if optimize:
-        m.optimize('bfgs', max_iters=100)
+        m.optimize("bfgs", max_iters=100)

-    if plot:
-        slices = GPy.util.multioutput.get_slices([X1,X2])
-        m.plot(fixed_inputs=[(1,0)],which_data_rows=slices[0],Y_metadata={'output_index':0})
-        m.plot(fixed_inputs=[(1,1)],which_data_rows=slices[1],Y_metadata={'output_index':1},ax=pb.gca())
-        pb.ylim(-3,)
+    if MPL_AVAILABLE and plot:
+        slices = GPy.util.multioutput.get_slices([X1, X2])
+        m.plot(
+            fixed_inputs=[(1, 0)],
+            which_data_rows=slices[0],
+            Y_metadata={"output_index": 0},
+        )
+        m.plot(
+            fixed_inputs=[(1, 1)],
+            which_data_rows=slices[1],
+            Y_metadata={"output_index": 1},
+            ax=plt.gca(),
+        )
+        plt.ylim(-3,)

    return m

+
 def epomeo_gpx(max_iters=200, optimize=True, plot=True):
    """
    Perform Gaussian process regression on the latitude and longitude data
    from the Mount Epomeo runs. Requires gpxpy to be installed on your system
    to load in the data.
    """
-    try:import pods
+    try:
+        import pods
    except ImportError:
-        print('pods unavailable, see https://github.com/sods/ods for example datasets')
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
        return
    data = pods.datasets.epomeo_gpx()
    num_data_list = []
-    for Xpart in data['X']:
+    for Xpart in data["X"]:
        num_data_list.append(Xpart.shape[0])

    num_data_array = np.array(num_data_list)
@ -100,29 +125,43 @@ def epomeo_gpx(max_iters=200, optimize=True, plot=True):
    Y = np.zeros((num_data, 2))
    t = np.zeros((num_data, 2))
    start = 0
-    for Xpart, index in zip(data['X'], range(len(data['X']))):
-        end = start+Xpart.shape[0]
-        t[start:end, :] = np.hstack((Xpart[:, 0:1],
-                                    index*np.ones((Xpart.shape[0], 1))))
+    for Xpart, index in zip(data["X"], range(len(data["X"]))):
+        end = start + Xpart.shape[0]
+        t[start:end, :] = np.hstack(
+            (Xpart[:, 0:1], index * np.ones((Xpart.shape[0], 1)))
+        )
        Y[start:end, :] = Xpart[:, 1:3]

    num_inducing = 200
-    Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None],
-                   np.random.randint(0, 4, num_inducing)[:, None]))
+    Z = np.hstack(
+        (
+            np.linspace(t[:, 0].min(), t[:, 0].max(), num_inducing)[:, None],
+            np.random.randint(0, 4, num_inducing)[:, None],
+        )
+    )

    k1 = GPy.kern.RBF(1)
    k2 = GPy.kern.Coregionalize(output_dim=5, rank=5)
-    k = k1**k2
+    k = k1 ** k2

    m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z, normalize_Y=True)
-    m.constrain_fixed('.*variance', 1.)
+    m.constrain_fixed(".*variance", 1.0)
    m.inducing_inputs.constrain_fixed()
    m.Gaussian_noise.variance.constrain_bounded(1e-3, 1e-1)
-    m.optimize(max_iters=max_iters,messages=True)
+    m.optimize(max_iters=max_iters, messages=True)

    return m

-def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300, optimize=True, plot=True):
+
+def multiple_optima(
+    gene_number=937,
+    resolution=80,
+    model_restarts=10,
+    seed=10000,
+    max_iters=300,
+    optimize=True,
+    plot=True,
+):
    """
    Show an example of a multimodal error surface for Gaussian process
    regression. Gene 939 has bimodal behaviour where the noisy mode is
@ -130,25 +169,30 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
    """

    # Contour over a range of length scales and signal/noise ratios.
-    length_scales = np.linspace(0.1, 60., resolution)
-    log_SNRs = np.linspace(-3., 4., resolution)
+    length_scales = np.linspace(0.1, 60.0, resolution)
+    log_SNRs = np.linspace(-3.0, 4.0, resolution)

-    try:import pods
+    try:
+        import pods
    except ImportError:
-        print('pods unavailable, see https://github.com/sods/ods for example datasets')
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
        return
-    data = pods.datasets.della_gatta_TRP63_gene_expression(data_set='della_gatta',gene_number=gene_number)
+    data = pods.datasets.della_gatta_TRP63_gene_expression(
+        data_set="della_gatta", gene_number=gene_number
+    )
    # data['Y'] = data['Y'][0::2, :]
    # data['X'] = data['X'][0::2, :]

-    data['Y'] = data['Y'] - np.mean(data['Y'])
+    data["Y"] = data["Y"] - np.mean(data["Y"])

-    lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.RBF)
-    if plot:
-        pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
-        ax = pb.gca()
-        pb.xlabel('length scale')
-        pb.ylabel('log_10 SNR')
+    lls = GPy.examples.regression._contour_data(
+        data, length_scales, log_SNRs, GPy.kern.RBF
+    )
+    if MPL_AVAILABLE and plot:
+        plt.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=plt.cm.jet)
+        ax = plt.gca()
+        plt.xlabel("length scale")
+        plt.ylabel("log_10 SNR")

        xlim = ax.get_xlim()
        ylim = ax.get_ylim()
@ -159,29 +203,44 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
    optim_point_y = np.empty(2)
    np.random.seed(seed=seed)
    for i in range(0, model_restarts):
-        # kern = GPy.kern.RBF(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.))
-        kern = GPy.kern.RBF(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50))
+        # kern = GPy.kern.RBF(
+        #     1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)
+        # )
+        kern = GPy.kern.RBF(
+            1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50)
+        )

-        m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern)
+        m = GPy.models.GPRegression(data["X"], data["Y"], kernel=kern)
        m.likelihood.variance = np.random.uniform(1e-3, 1)
        optim_point_x[0] = m.rbf.lengthscale
-        optim_point_y[0] = np.log10(m.rbf.variance) - np.log10(m.likelihood.variance);
+        optim_point_y[0] = np.log10(m.rbf.variance) - np.log10(m.likelihood.variance)

        # optimize
        if optimize:
-            m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)
+            m.optimize("scg", xtol=1e-6, ftol=1e-6, max_iters=max_iters)

        optim_point_x[1] = m.rbf.lengthscale
-        optim_point_y[1] = np.log10(m.rbf.variance) - np.log10(m.likelihood.variance);
+        optim_point_y[1] = np.log10(m.rbf.variance) - np.log10(m.likelihood.variance)

-        if plot:
-            pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
+        if MPL_AVAILABLE and plot:
+            plt.arrow(
+                optim_point_x[0],
+                optim_point_y[0],
+                optim_point_x[1] - optim_point_x[0],
+                optim_point_y[1] - optim_point_y[0],
+                label=str(i),
+                head_length=1,
+                head_width=0.5,
+                fc="k",
+                ec="k",
+            )
        models.append(m)

-    if plot:
+    if MPL_AVAILABLE and plot:
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
-    return m # (models, lls)
+    return m  # (models, lls)
+

 def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.RBF):
    """
@ -195,19 +254,19 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.RBF):
    """

    lls = []
-    total_var = np.var(data['Y'])
-    kernel = kernel_call(1, variance=1., lengthscale=1.)
-    model = GPy.models.GPRegression(data['X'], data['Y'], kernel=kernel)
+    total_var = np.var(data["Y"])
+    kernel = kernel_call(1, variance=1.0, lengthscale=1.0)
+    model = GPy.models.GPRegression(data["X"], data["Y"], kernel=kernel)
    for log_SNR in log_SNRs:
-        SNR = 10.**log_SNR
-        noise_var = total_var / (1. + SNR)
+        SNR = 10.0 ** log_SNR
+        noise_var = total_var / (1.0 + SNR)
        signal_var = total_var - noise_var
-        model.kern['.*variance'] = signal_var
+        model.kern[".*variance"] = signal_var
        model.likelihood.variance = noise_var
        length_scale_lls = []

        for length_scale in length_scales:
-            model['.*lengthscale'] = length_scale
+            model[".*lengthscale"] = length_scale
            length_scale_lls.append(model.log_likelihood())

        lls.append(length_scale_lls)
@ -217,86 +276,99 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.RBF):

 def olympic_100m_men(optimize=True, plot=True):
    """Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
-    try:import pods
+    try:
+        import pods
    except ImportError:
-        print('pods unavailable, see https://github.com/sods/ods for example datasets')
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
        return
    data = pods.datasets.olympic_100m_men()

    # create simple GP Model
-    m = GPy.models.GPRegression(data['X'], data['Y'])
+    m = GPy.models.GPRegression(data["X"], data["Y"])

    # set the lengthscale to be something sensible (defaults to 1)
    m.rbf.lengthscale = 10

    if optimize:
-        m.optimize('bfgs', max_iters=200)
+        m.optimize("bfgs", max_iters=200)

-    if plot:
+    if MPL_AVAILABLE and plot:
        m.plot(plot_limits=(1850, 2050))
    return m

+
 def toy_rbf_1d(optimize=True, plot=True):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
-    try:import pods
+    try:
+        import pods
    except ImportError:
-        print('pods unavailable, see https://github.com/sods/ods for example datasets')
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
        return
    data = pods.datasets.toy_rbf_1d()

    # create simple GP Model
-    m = GPy.models.GPRegression(data['X'], data['Y'])
+    m = GPy.models.GPRegression(data["X"], data["Y"])

    if optimize:
-        m.optimize('bfgs')
-    if plot:
+        m.optimize("bfgs")
+    if MPL_AVAILABLE and plot:
        m.plot()

    return m

+
 def toy_rbf_1d_50(optimize=True, plot=True):
-    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
-    try:import pods
+    """Run a simple demonstration of a standard Gaussian process fitting
+it to data sampled from an RBF covariance."""
+
+    try:
+        import pods
    except ImportError:
-        print('pods unavailable, see https://github.com/sods/ods for example datasets')
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
        return
    data = pods.datasets.toy_rbf_1d_50()

    # create simple GP Model
-    m = GPy.models.GPRegression(data['X'], data['Y'])
+    m = GPy.models.GPRegression(data["X"], data["Y"])

    if optimize:
-        m.optimize('bfgs')
-    if plot:
+        m.optimize("bfgs")
+    if MPL_AVAILABLE and plot:
        m.plot()

    return m

+
 def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
-    optimizer='scg'
+    optimizer = "scg"
    x_len = 100
    X = np.linspace(0, 10, x_len)[:, None]
    f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.RBF(1).K(X))
-    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
+    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:, None]

    kern = GPy.kern.RBF(1)
    poisson_lik = GPy.likelihoods.Poisson()
    laplace_inf = GPy.inference.latent_function_inference.Laplace()

    # create simple GP Model
-    m = GPy.core.GP(X, Y, kernel=kern, likelihood=poisson_lik, inference_method=laplace_inf)
+    m = GPy.core.GP(
+        X, Y, kernel=kern, likelihood=poisson_lik, inference_method=laplace_inf
+    )

    if optimize:
        m.optimize(optimizer)
-    if plot:
+    if MPL_AVAILABLE and plot:
        m.plot()
        # plot the real underlying rate function
-        pb.plot(X, np.exp(f_true), '--k', linewidth=2)
+        plt.plot(X, np.exp(f_true), "--k", linewidth=2)

    return m

-def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
+
+def toy_ARD(
+    max_iters=1000, kernel_type="linear", num_samples=300, D=4, optimize=True, plot=True
+):
    # Create an artificial dataset where the values in the targets (Y)
    # only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
    # see if this dependency can be recovered
@ -310,14 +382,14 @@ def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize
    Y2 = np.asarray(4 * (X[:, 2] - 1.5 * X[:, 0])).reshape(-1, 1)
    Y = np.hstack((Y1, Y2))

-    Y = np.dot(Y, np.random.rand(2, D));
+    Y = np.dot(Y, np.random.rand(2, D))
    Y = Y + 0.2 * np.random.randn(Y.shape[0], Y.shape[1])
    Y -= Y.mean()
    Y /= Y.std()

-    if kernel_type == 'linear':
+    if kernel_type == "linear":
        kernel = GPy.kern.Linear(X.shape[1], ARD=1)
-    elif kernel_type == 'rbf_inv':
+    elif kernel_type == "rbf_inv":
        kernel = GPy.kern.RBF_inv(X.shape[1], ARD=1)
    else:
        kernel = GPy.kern.RBF(X.shape[1], ARD=1)
@ -327,14 +399,17 @@ def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize
    # m.set_prior('.*lengthscale',len_prior)

    if optimize:
-        m.optimize(optimizer='scg', max_iters=max_iters)
+        m.optimize(optimizer="scg", max_iters=max_iters)

-    if plot:
+    if MPL_AVAILABLE and plot:
        m.kern.plot_ARD()

    return m

-def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
+
+def toy_ARD_sparse(
+    max_iters=1000, kernel_type="linear", num_samples=300, D=4, optimize=True, plot=True
+):
    # Create an artificial dataset where the values in the targets (Y)
    # only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
    # see if this dependency can be recovered
@ -348,69 +423,73 @@ def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, o
    Y2 = np.asarray(4 * (X[:, 2] - 1.5 * X[:, 0]))[:, None]
    Y = np.hstack((Y1, Y2))

-    Y = np.dot(Y, np.random.rand(2, D));
+    Y = np.dot(Y, np.random.rand(2, D))
    Y = Y + 0.2 * np.random.randn(Y.shape[0], Y.shape[1])
    Y -= Y.mean()
    Y /= Y.std()

-    if kernel_type == 'linear':
+    if kernel_type == "linear":
        kernel = GPy.kern.Linear(X.shape[1], ARD=1)
-    elif kernel_type == 'rbf_inv':
+    elif kernel_type == "rbf_inv":
        kernel = GPy.kern.RBF_inv(X.shape[1], ARD=1)
    else:
        kernel = GPy.kern.RBF(X.shape[1], ARD=1)
-    #kernel += GPy.kern.Bias(X.shape[1])
+    # kernel += GPy.kern.Bias(X.shape[1])
    X_variance = np.ones(X.shape) * 0.5
    m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance)
    # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
    # m.set_prior('.*lengthscale',len_prior)

    if optimize:
-        m.optimize(optimizer='scg', max_iters=max_iters)
+        m.optimize(optimizer="scg", max_iters=max_iters)

-    if plot:
+    if MPL_AVAILABLE and plot:
        m.kern.plot_ARD()

    return m

+
 def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
    """Predict the location of a robot given wirelss signal strength readings."""
-    try:import pods
+    try:
+        import pods
    except ImportError:
-        print('pods unavailable, see https://github.com/sods/ods for example datasets')
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
        return
    data = pods.datasets.robot_wireless()

    # create simple GP Model
-    m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel)
+    m = GPy.models.GPRegression(data["Y"], data["X"], kernel=kernel)

    # optimize
    if optimize:
        m.optimize(max_iters=max_iters)

-    Xpredict = m.predict(data['Ytest'])[0]
-    if plot:
-        pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-')
-        pb.plot(Xpredict[:, 0], Xpredict[:, 1], 'b-')
-        pb.axis('equal')
-        pb.title('WiFi Localization with Gaussian Processes')
-        pb.legend(('True Location', 'Predicted Location'))
+    Xpredict = m.predict(data["Ytest"])[0]
+    if MPL_AVAILABLE and plot:
+        plt.plot(data["Xtest"][:, 0], data["Xtest"][:, 1], "r-")
+        plt.plot(Xpredict[:, 0], Xpredict[:, 1], "b-")
+        plt.axis("equal")
+        plt.title("WiFi Localization with Gaussian Processes")
+        plt.legend(("True Location", "Predicted Location"))

-    sse = ((data['Xtest'] - Xpredict)**2).sum()
+    sse = ((data["Xtest"] - Xpredict) ** 2).sum()

-    print(('Sum of squares error on test data: ' + str(sse)))
+    print(("Sum of squares error on test data: " + str(sse)))
    return m

+
 def silhouette(max_iters=100, optimize=True, plot=True):
    """Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
-    try:import pods
+    try:
+        import pods
    except ImportError:
-        print('pods unavailable, see https://github.com/sods/ods for example datasets')
+        print("pods unavailable, see https://github.com/sods/ods for example datasets")
        return
    data = pods.datasets.silhouette()

    # create simple GP Model
-    m = GPy.models.GPRegression(data['X'], data['Y'])
+    m = GPy.models.GPRegression(data["X"], data["Y"])

    # optimize
    if optimize:
@ -419,10 +498,18 @@ def silhouette(max_iters=100, optimize=True, plot=True):
    print(m)
    return m

-def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True, checkgrad=False):
+
+def sparse_GP_regression_1D(
+    num_samples=400,
+    num_inducing=5,
+    max_iters=100,
+    optimize=True,
+    plot=True,
+    checkgrad=False,
+):
    """Run a 1D example of a sparse GP regression."""
    # sample inputs and outputs
-    X = np.random.uniform(-3., 3., (num_samples, 1))
+    X = np.random.uniform(-3.0, 3.0, (num_samples, 1))
    Y = np.sin(X) + np.random.randn(num_samples, 1) * 0.05
    # construct kernel
    rbf = GPy.kern.RBF(1)
@ -433,20 +520,23 @@ def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, opti
        m.checkgrad()

    if optimize:
-        m.optimize('tnc', max_iters=max_iters)
+        m.optimize("tnc", max_iters=max_iters)

-    if plot:
+    if MPL_AVAILABLE and plot:
        m.plot()

    return m

-def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, optimize=True, plot=True, nan=False):
+
+def sparse_GP_regression_2D(
+    num_samples=400, num_inducing=50, max_iters=100, optimize=True, plot=True, nan=False
+):
    """Run a 2D example of a sparse GP regression."""
    np.random.seed(1234)
-    X = np.random.uniform(-3., 3., (num_samples, 2))
+    X = np.random.uniform(-3.0, 3.0, (num_samples, 2))
    Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05
    if nan:
-        inan = np.random.binomial(1,.2,size=Y.shape)
+        inan = np.random.binomial(1, 0.2, size=Y.shape)
        Y[inan] = np.nan

    # construct kernel
@ -456,183 +546,228 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, opt
    m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)

    # contrain all parameters to be positive (but not inducing inputs)
-    m['.*len'] = 2.
+    m[".*len"] = 2.0

    m.checkgrad()

    # optimize
    if optimize:
-        m.optimize('tnc', messages=1, max_iters=max_iters)
+        m.optimize("tnc", messages=1, max_iters=max_iters)

    # plot
-    if plot:
+    if MPL_AVAILABLE and plot:
        m.plot()

    print(m)
    return m

+
 def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
    """Run a 1D example of a sparse GP regression with uncertain inputs."""
-    fig, axes = pb.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
+    if MPL_AVAILABLE and plot:
+        fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)

    # sample inputs and outputs
    S = np.ones((20, 1))
-    X = np.random.uniform(-3., 3., (20, 1))
+    X = np.random.uniform(-3.0, 3.0, (20, 1))
    Y = np.sin(X) + np.random.randn(20, 1) * 0.05
    # likelihood = GPy.likelihoods.Gaussian(Y)
-    Z = np.random.uniform(-3., 3., (7, 1))
+    Z = np.random.uniform(-3.0, 3.0, (7, 1))

    k = GPy.kern.RBF(1)
    # create simple GP Model - no input uncertainty on this one
    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)

    if optimize:
-        m.optimize('scg', messages=1, max_iters=max_iters)
+        m.optimize("scg", messages=1, max_iters=max_iters)

-    if plot:
+    if MPL_AVAILABLE and plot:
        m.plot(ax=axes[0])
-        axes[0].set_title('no input uncertainty')
+        axes[0].set_title("no input uncertainty")
    print(m)

    # the same Model with uncertainty
    m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.RBF(1), Z=Z, X_variance=S)
    if optimize:
-        m.optimize('scg', messages=1, max_iters=max_iters)
-    if plot:
+        m.optimize("scg", messages=1, max_iters=max_iters)
+    if MPL_AVAILABLE and plot:
        m.plot(ax=axes[1])
-        axes[1].set_title('with input uncertainty')
+        axes[1].set_title("with input uncertainty")
        fig.canvas.draw()

    print(m)
    return m

+
 def simple_mean_function(max_iters=100, optimize=True, plot=True):
    """
    The simplest possible mean function. No parameters, just a simple Sinusoid.
    """
-    #create  simple mean function
-    mf = GPy.core.Mapping(1,1)
+    # create  simple mean function
+    mf = GPy.core.Mapping(1, 1)
    mf.f = np.sin
-    mf.update_gradients = lambda a,b: None
+    mf.update_gradients = lambda a, b: None

-    X = np.linspace(0,10,50).reshape(-1,1)
-    Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
+    X = np.linspace(0, 10, 50).reshape(-1, 1)
+    Y = np.sin(X) + 0.5 * np.cos(3 * X) + 0.1 * np.random.randn(*X.shape)

-    k =GPy.kern.RBF(1)
+    k = GPy.kern.RBF(1)
    lik = GPy.likelihoods.Gaussian()
    m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
    if optimize:
        m.optimize(max_iters=max_iters)
-    if plot:
-        m.plot(plot_limits=(-10,15))
+    if MPL_AVAILABLE and plot:
+        m.plot(plot_limits=(-10, 15))
    return m

+
 def parametric_mean_function(max_iters=100, optimize=True, plot=True):
    """
    A linear mean function with parameters that we'll learn alongside the kernel
    """
-    #create  simple mean function
-    mf = GPy.core.Mapping(1,1)
+    # create  simple mean function
+    mf = GPy.core.Mapping(1, 1)
    mf.f = np.sin

-    X = np.linspace(0,10,50).reshape(-1,1)
-    Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X
+    X = np.linspace(0, 10, 50).reshape(-1, 1)
+    Y = np.sin(X) + 0.5 * np.cos(3 * X) + 0.1 * np.random.randn(*X.shape) + 3 * X

-    mf = GPy.mappings.Linear(1,1)
+    mf = GPy.mappings.Linear(1, 1)

-    k =GPy.kern.RBF(1)
+    k = GPy.kern.RBF(1)
    lik = GPy.likelihoods.Gaussian()
    m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
    if optimize:
        m.optimize(max_iters=max_iters)
-    if plot:
+    if MPL_AVAILABLE and plot:
        m.plot()
    return m


-def warped_gp_cubic_sine(max_iters=100):
+def warped_gp_cubic_sine(max_iters=100, plot=True):
    """
    A test replicating the cubic sine regression problem from
    Snelson's paper.
    """
    X = (2 * np.pi) * np.random.random(151) - np.pi
-    Y = np.sin(X) + np.random.normal(0,0.2,151)
-    Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y])
+    Y = np.sin(X) + np.random.normal(0, 0.2, 151)
+    Y = np.array([np.power(abs(y), float(1) / 3) * (1, -1)[y < 0] for y in Y])
    X = X[:, None]
    Y = Y[:, None]

    warp_k = GPy.kern.RBF(1)
    warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
    warp_m = GPy.models.WarpedGP(X, Y, kernel=warp_k, warping_function=warp_f)
-    warp_m['.*\.d'].constrain_fixed(1.0)
+    warp_m[".*\\.d"].constrain_fixed(1.0)
    m = GPy.models.GPRegression(X, Y)
-    m.optimize_restarts(parallel=False, robust=True, num_restarts=5, max_iters=max_iters)
-    warp_m.optimize_restarts(parallel=False, robust=True, num_restarts=5, max_iters=max_iters)
-    #m.optimize(max_iters=max_iters)
-    #warp_m.optimize(max_iters=max_iters)
+    m.optimize_restarts(
+        parallel=False, robust=True, num_restarts=5, max_iters=max_iters
+    )
+    warp_m.optimize_restarts(
+        parallel=False, robust=True, num_restarts=5, max_iters=max_iters
+    )
+    # m.optimize(max_iters=max_iters)
+    # warp_m.optimize(max_iters=max_iters)

    print(warp_m)
-    print(warp_m['.*warp.*'])
+    print(warp_m[".*warp.*"])

-    warp_m.predict_in_warped_space = False
-    warp_m.plot(title="Warped GP - Latent space")
-    warp_m.predict_in_warped_space = True
-    warp_m.plot(title="Warped GP - Warped space")
-    m.plot(title="Standard GP")
-    warp_m.plot_warping()
-    pb.show()
+    if MPL_AVAILABLE and plot:
+        warp_m.predict_in_warped_space = False
+        warp_m.plot(title="Warped GP - Latent space")
+        warp_m.predict_in_warped_space = True
+        warp_m.plot(title="Warped GP - Warped space")
+        m.plot(title="Standard GP")
+        warp_m.plot_warping()
+        plt.show()
    return warp_m


+def multioutput_gp_with_derivative_observations(plot=True):

-def multioutput_gp_with_derivative_observations():
-    def plot_gp_vs_real(m, x, yreal, size_inputs, title, fixed_input=1, xlim=[0,11], ylim=[-1.5,3]):
-        fig, ax = pb.subplots()
-        ax.set_title(title)
-        pb.plot(x, yreal, "r", label='Real function')
-        rows = slice(0, size_inputs[0]) if fixed_input == 0 else slice(size_inputs[0], size_inputs[0]+size_inputs[1])
-        m.plot(fixed_inputs=[(1, fixed_input)], which_data_rows=rows, xlim=xlim, ylim=ylim, ax=ax)
-    f = lambda x: np.sin(x)+0.1*(x-2.)**2-0.005*x**3
-    fd = lambda x: np.cos(x)+0.2*(x-2.)-0.015*x**2
-    N=10 # Number of observations
-    M=10 # Number of derivative observations
-    Npred=100 # Number of prediction points
-    sigma = 0.05 # Noise of observations
-    sigma_der = 0.05 # Noise of derivative observations
-    x = np.array([np.linspace(1,10,N)]).T
-    y = f(x) + np.array(sigma*np.random.normal(0,1,(N,1)))
+    f = lambda x: np.sin(x) + 0.1 * (x - 2.0) ** 2 - 0.005 * x ** 3
+    fd = lambda x: np.cos(x) + 0.2 * (x - 2.0) - 0.015 * x ** 2
+    N = 10  # Number of observations
+    M = 10  # Number of derivative observations
+    Npred = 100  # Number of prediction points
+    sigma = 0.05  # Noise of observations
+    sigma_der = 0.05  # Noise of derivative observations
+    x = np.array([np.linspace(1, 10, N)]).T
+    y = f(x) + np.array(sigma * np.random.normal(0, 1, (N, 1)))

-    xd = np.array([np.linspace(2,8,M)]).T
-    yd = fd(xd) + np.array(sigma_der*np.random.normal(0,1,(M,1)))
+    xd = np.array([np.linspace(2, 8, M)]).T
+    yd = fd(xd) + np.array(sigma_der * np.random.normal(0, 1, (M, 1)))

-    xpred = np.array([np.linspace(0,11,Npred)]).T
+    xpred = np.array([np.linspace(0, 11, Npred)]).T
    ypred_true = f(xpred)
    ydpred_true = fd(xpred)

    # squared exponential kernel:
-    se = GPy.kern.RBF(input_dim = 1, lengthscale=1.5, variance=0.2)
+    se = GPy.kern.RBF(input_dim=1, lengthscale=1.5, variance=0.2)
    # We need to generate separate kernel for the derivative observations and give the created kernel as an input:
    se_der = GPy.kern.DiffKern(se, 0)

-    #Then
-    gauss = GPy.likelihoods.Gaussian(variance=sigma**2)
-    gauss_der = GPy.likelihoods.Gaussian(variance=sigma_der**2)
+    # Then
+    gauss = GPy.likelihoods.Gaussian(variance=sigma ** 2)
+    gauss_der = GPy.likelihoods.Gaussian(variance=sigma_der ** 2)

    # Then create the model, we give everything in lists, the order of the inputs indicates the order of the outputs
    # Now we have the regular observations first and derivative observations second, meaning that the kernels and
    # the likelihoods must follow the same order. Crosscovariances are automatically taken care of
-    m = GPy.models.MultioutputGP(X_list=[x, xd], Y_list=[y, yd],
-                                 kernel_list=[se, se_der],
-                                 likelihood_list=[gauss, gauss_der])
+    m = GPy.models.MultioutputGP(
+        X_list=[x, xd],
+        Y_list=[y, yd],
+        kernel_list=[se, se_der],
+        likelihood_list=[gauss, gauss_der],
+    )
+

    # Optimize the model
    m.optimize(messages=0, ipython_notebook=False)

-    #Plot the model, the syntax is same as for multioutput models:
-    plot_gp_vs_real(m, xpred, ydpred_true, [x.shape[0], xd.shape[0]], title='Latent function derivatives', fixed_input=1, xlim=[0,11], ylim=[-1.5,3])
-    plot_gp_vs_real(m, xpred, ypred_true, [x.shape[0], xd.shape[0]], title='Latent function', fixed_input=0, xlim=[0,11], ylim=[-1.5,3])
+    if MPL_AVAILABLE and plot:
+        def plot_gp_vs_real(
+            m, x, yreal, size_inputs, title, fixed_input=1, xlim=[0, 11], ylim=[-1.5, 3]
+        ):
+            fig, ax = plt.subplots()
+            ax.set_title(title)
+            plt.plot(x, yreal, "r", label="Real function")
+            rows = (
+                slice(0, size_inputs[0])
+                if fixed_input == 0
+                else slice(size_inputs[0], size_inputs[0] + size_inputs[1])
+            )
+            m.plot(
+                fixed_inputs=[(1, fixed_input)],
+                which_data_rows=rows,
+                xlim=xlim,
+                ylim=ylim,
+                ax=ax,
+            )
+        
+        # Plot the model, the syntax is same as for multioutput models:
+        plot_gp_vs_real(
+            m,
+            xpred,
+            ydpred_true,
+            [x.shape[0], xd.shape[0]],
+            title="Latent function derivatives",
+            fixed_input=1,
+            xlim=[0, 11],
+            ylim=[-1.5, 3],
+        )
+        plot_gp_vs_real(
+            m,
+            xpred,
+            ypred_true,
+            [x.shape[0], xd.shape[0]],
+            title="Latent function",
+            fixed_input=0,
+            xlim=[0, 11],
+            ylim=[-1.5, 3],
+        )

-    #making predictions for the values:
-    mu, var = m.predict_noiseless(Xnew=[xpred, np.empty((0,1))])
+    # making predictions for the values:
+    mu, var = m.predict_noiseless(Xnew=[xpred, np.empty((0, 1))])

    return m
--- a/GPy/examples/state_space.py
+++ b/GPy/examples/state_space.py
@ -4,26 +4,26 @@ import matplotlib.pyplot as plt

 import GPy.models.state_space_model as SS_model

+
 def state_space_example():
    X = np.linspace(0, 10, 2000)[:, None]
-    Y = np.sin(X) + np.random.randn(*X.shape)*0.1
+    Y = np.sin(X) + np.random.randn(*X.shape) * 0.1

    kernel1 = GPy.kern.Matern32(X.shape[1])
-    m1  = GPy.models.GPRegression(X,Y, kernel1)
+    m1 = GPy.models.GPRegression(X, Y, kernel1)

    print(m1)
-    m1.optimize(optimizer='bfgs',messages=True)
+    m1.optimize(optimizer="bfgs", messages=True)

    print(m1)

    kernel2 = GPy.kern.sde_Matern32(X.shape[1])
-    #m2  = SS_model.StateSpace(X,Y, kernel2)
-    m2 = GPy.models.StateSpace(X,Y, kernel2)
+    # m2  = SS_model.StateSpace(X,Y, kernel2)
+    m2 = GPy.models.StateSpace(X, Y, kernel2)
    print(m2)

-    m2.optimize(optimizer='bfgs',messages=True)
+    m2.optimize(optimizer="bfgs", messages=True)

    print(m2)

    return m1, m2
-
--- a/GPy/inference/mcmc/samplers.py
+++ b/GPy/inference/mcmc/samplers.py
@ -38,7 +38,7 @@ class Metropolis_Hastings(object):
    def sample(self, Ntotal=10000, Nburn=1000, Nthin=10, tune=True, tune_throughout=False, tune_interval=400):
        current = self.model.optimizer_array
        fcurrent = self.model.log_likelihood() + self.model.log_prior() 
-        accepted = np.zeros(Ntotal,dtype=np.bool)
+        accepted = np.zeros(Ntotal,dtype=bool)
        for it in range(Ntotal):
            print("sample %d of %d\r"%(it+1,Ntotal),end="")
            sys.stdout.flush()
--- a/GPy/kern/init.py
+++ b/GPy/kern/init.py
@ -38,7 +38,7 @@ from .src.rbf import RBF
 from .src.linear import Linear, LinearFull
 from .src.static import Bias, White, Fixed, WhiteHeteroscedastic, Precomputed
 from .src.brownian import Brownian
-from .src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
+from .src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine, Sinc, ExpQuadCosine
 from .src.mlp import MLP
 from .src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
 from .src.standard_periodic import StdPeriodic
--- a/GPy/kern/src/coregionalize.py
+++ b/GPy/kern/src/coregionalize.py
@ -134,3 +134,28 @@ class Coregionalize(Kern):

    def gradients_X_diag(self, dL_dKdiag, X):
        return np.zeros(X.shape)
+
+    def to_dict(self):
+        """
+        Convert the object into a json serializable dictionary.
+
+        Note: It uses the private method _save_to_input_dict of the parent.
+
+        :return dict: json serializable dictionary containing the needed information to instantiate the object
+        """
+
+        input_dict = super(Coregionalize, self)._save_to_input_dict()
+        input_dict["class"] = "GPy.kern.Coregionalize"
+        # W and kappa must be serializable
+        input_dict["W"] = self.W.values.tolist()
+        input_dict["kappa"] = self.kappa.values.tolist()
+        input_dict["output_dim"] = self.output_dim
+        return input_dict
+
+    @staticmethod
+    def _build_from_input_dict(kernel_class, input_dict):
+        useGPU = input_dict.pop('useGPU', None)
+        # W and kappa must be converted back to numpy arrays
+        input_dict['W'] = np.array(input_dict['W'])
+        input_dict['kappa'] = np.array(input_dict['kappa'])
+        return Coregionalize(**input_dict)
--- a/GPy/kern/src/static.py
+++ b/GPy/kern/src/static.py
@ -68,6 +68,11 @@ class White(Static):
        input_dict = super(White, self)._save_to_input_dict()
        input_dict["class"] = "GPy.kern.White"
        return input_dict
+
+    @staticmethod
+    def _build_from_input_dict(kernel_class, input_dict):
+        useGPU = input_dict.pop('useGPU', None)
+        return White(**input_dict)
    
    def K(self, X, X2=None):
        if X2 is None:
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@ -621,10 +621,10 @@ class ExpQuad(Stationary):

    .. math::

-       k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r)
+       k(r) = \sigma^2 \exp(- 0.5 r^2)

    notes::
-     - Yes, this is exactly the same as the RBF covariance function, but the
+     - This is exactly the same as the RBF covariance function, but the
       RBF implementation also has some features for doing variational kernels
       (the psi-statistics).

@ -657,6 +657,14 @@ class ExpQuad(Stationary):
        return -r*self.K_of_r(r)

 class Cosine(Stationary):
+    """
+    Cosine Covariance function
+    
+    .. math::
+
+        k(r) = \sigma^2 \cos(r)
+
+    """
    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Cosine'):
        super(Cosine, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)

@ -666,6 +674,62 @@ class Cosine(Stationary):
    def dK_dr(self, r):
        return -self.variance * np.sin(r)

+class ExpQuadCosine(Stationary):
+    """
+    Exponentiated quadratic multiplied by cosine covariance function (spectral mixture kernel).
+    
+    .. math::
+
+        k(r) = \sigma^2 \exp(-2\pi^2r^2)\cos(2\pi r/T)
+
+    """
+    
+    def __init__(self, input_dim, variance=1., lengthscale=None, period=1., ARD=False, active_dims=None, name='ExpQuadCosine'):
+        super(ExpQuadCosine, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
+        self.period = Param('period', period, Logexp())
+        self.link_parameters(self.period)
+        
+    def K_of_r(self, r):
+        return self.variance * np.exp(-2*np.pi**2*r**2)*np.cos(2*np.pi*r/self.period)
+
+    def dK_dr(self, r):
+        return -4*np.pi**2*r*self.K_of_r(r) - self.variance * 2*np.pi/self.period*np.exp(-2*np.pi**2*r**2)*np.sin(2*np.pi*r/self.period)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        super(ExpQuadCosine, self).update_gradients_full(dL_dK, X, X2)
+        r = self._scaled_dist(X, X2)
+        r2 = np.square(r)
+        dK_dperiod = self.variance * 2*np.pi*r/self.period**2*np.exp(-2*np.pi**2*r**2)*np.sin(2*np.pi*r/self.period)
+        grad = np.sum(dL_dK*dK_dperiod)
+        self.period.gradient = grad
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        super(ExpQuadCosine, self).update_gradients_diag(dL_dKdiag, X)
+        self.period.gradient = 0.
+
+    
+    
+class Sinc(Stationary):
+    """
+    Sinc Covariance function
+    
+    .. math::
+
+        k(r) = \sigma^2 \sinc(\pi r)
+
+    """
+    
+    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Sinc'):
+        super(Sinc, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
+
+    def K_of_r(self, r):
+        return self.variance * np.sinc(2*r)
+
+    def dK_dr(self, r):
+        # small angle approximation to avoid divide by zero errors.
+        return np.where(r<1e-5, -self.variance*4/3*np.pi*np.pi*r, self.variance/r * (np.cos(2*np.pi*r)-np.sinc(2*r)))
+
+    

 class RatQuad(Stationary):
    """
@ -677,7 +741,6 @@ class RatQuad(Stationary):

    """

-
    def __init__(self, input_dim, variance=1., lengthscale=None, power=2., ARD=False, active_dims=None, name='RatQuad'):
        super(RatQuad, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
        self.power = Param('power', power, Logexp())
--- a/GPy/likelihoods/mixed_noise.py
+++ b/GPy/likelihoods/mixed_noise.py
@ -80,3 +80,32 @@ class MixedNoise(Likelihood):
            _ysim = np.array([np.random.normal(lik.gp_link.transf(gpj), scale=np.sqrt(lik.variance), size=1) for gpj in gp_filtered.flatten()])
            Ysim[flt,:] = _ysim.reshape(n1,N2)
        return Ysim
+
+    def to_dict(self):
+        """
+        Convert the object into a json serializable dictionary.
+
+        Note: It uses the private method _save_to_input_dict of the parent.
+
+        :return dict: json serializable dictionary containing the needed information to instantiate the object
+        """
+
+        # input_dict = super(MixedNoise, self)._save_to_input_dict()
+        input_dict = {"name": self.name,
+                      "class": "GPy.likelihoods.MixedNoise",
+                      "likelihoods_list": []}
+        for ii in range(len(self.likelihoods_list)):
+            input_dict["likelihoods_list"].append(self.likelihoods_list[ii].to_dict())
+
+        return input_dict
+
+    @staticmethod
+    def _build_from_input_dict(likelihood_class, input_dict):
+        import copy
+        input_dict = copy.deepcopy(input_dict)
+        # gp_link_dict = input_dict.pop('gp_link_dict')
+        # import GPy
+        # gp_link = GPy.likelihoods.link_functions.GPTransformation.from_dict(gp_link_dict)
+        # input_dict["gp_link"] = gp_link
+        input_dict['likelihoods_list'] = [Likelihood.from_dict(l) for l in input_dict['likelihoods_list']]
+        return likelihood_class(**input_dict)
--- a/GPy/models/gp_multiout_regression_md.py
+++ b/GPy/models/gp_multiout_regression_md.py
@ -72,6 +72,10 @@ class GPMultioutRegressionMD(SparseGP):
            kernel = kern.RBF(X.shape[1])
        if kernel_row is None:
            kernel_row = kern.RBF(Xr_dim,name='kern_row')
+            
+        if num_inducing[1] > self.output_dim:
+            msg = 'Number of inducing points ({}) in latent space must be <= output dim ({})'
+            raise ValueError(msg.format(num_inducing[1], self.output_dim))

        if init=='GP':
            from . import SparseGPRegression, BayesianGPLVM
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@ -14,7 +14,7 @@ class GPLVM(GP):


    """
-    def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, name="gplvm"):
+    def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, name="gplvm", Y_metadata=None, normalizer=False):

        """
        :param Y: observed data
@ -23,6 +23,11 @@ class GPLVM(GP):
        :type input_dim: int
        :param init: initialisation method for the latent space
        :type init: 'PCA'|'random'
+        :param normalizer:
+          normalize the outputs Y.
+          If normalizer is True, we will normalize using Standardize.
+          If normalizer is False (the default), no normalization will be done.
+        :type normalizer: bool
        """
        if X is None:
            from ..util.initialization import initialize_latent
@ -34,7 +39,7 @@ class GPLVM(GP):

        likelihood = Gaussian()

-        super(GPLVM, self).__init__(X, Y, kernel, likelihood, name='GPLVM')
+        super(GPLVM, self).__init__(X, Y, kernel, likelihood, name='GPLVM', Y_metadata=Y_metadata, normalizer=normalizer)

        self.X = Param('latent_mean', X)
        self.link_parameter(self.X, index=0)
--- a/GPy/plotting/matplot_dep/base_plots.py
+++ b/GPy/plotting/matplot_dep/base_plots.py
@ -82,9 +82,9 @@ def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
            y2 = np.ones_like(x) * y2

        if where is None:
-            where = np.ones(len(x), np.bool)
+            where = np.ones(len(x), bool)
        else:
-            where = np.asarray(where, np.bool)
+            where = np.asarray(where, bool)

        if not (x.shape == y1.shape == y2.shape == where.shape):
            raise ValueError("Argument dimensions are incompatible")
--- a/GPy/plotting/matplot_dep/plot_definitions.py
+++ b/GPy/plotting/matplot_dep/plot_definitions.py
@ -255,9 +255,9 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                y2 = np.ones_like(x) * y2

            if where is None:
-                where = np.ones(len(x), np.bool)
+                where = np.ones(len(x), bool)
            else:
-                where = np.asarray(where, np.bool)
+                where = np.asarray(where, bool)

            if not (x.shape == y1.shape == y2.shape == where.shape):
                raise ValueError("Argument dimensions are incompatible")
--- a/GPy/plotting/matplot_dep/priors_plots.py
+++ b/GPy/plotting/matplot_dep/priors_plots.py
@ -11,7 +11,7 @@ except:

 def univariate_plot(prior):
    rvs = prior.rvs(1000)
-    pb.hist(rvs, 100, normed=True)
+    pb.hist(rvs, 100, density=True)
    xmin, xmax = pb.xlim()
    xx = np.linspace(xmin, xmax, 1000)
    pb.plot(xx, prior.pdf(xx), 'r', linewidth=2)
--- a/GPy/plotting/matplot_dep/visualize.py
+++ b/GPy/plotting/matplot_dep/visualize.py
@ -39,7 +39,7 @@ class vpython_show(data_show):
        super(vpython_show, self).__init__(vals)
        # If no axes are defined, create some.

-        if scene==None:
+        if scene is None:
            self.scene = visual.display(title='Data Visualization')
        else:
            self.scene = scene
@ -57,7 +57,7 @@ class matplotlib_show(data_show):
        super(matplotlib_show, self).__init__(vals)
        # If no axes are defined, create some.

-        if axes==None:
+        if axes is None:
            fig = plt.figure()
            self.axes = fig.add_subplot(111)
        else:
@ -190,7 +190,7 @@ class lvm_subplots(lvm):
    def __init__(self, vals, Model, data_visualize, latent_axes=None, sense_axes=None):
        self.nplots = int(np.ceil(Model.input_dim/2.))+1
        assert len(latent_axes)==self.nplots
-        if vals==None:
+        if vals is None:
            vals = Model.X[0, :]
        self.latent_values = vals

@ -215,9 +215,9 @@ class lvm_dimselect(lvm):

    """
    def __init__(self, vals, model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0, 1], labels=None):
-        if latent_axes==None and sense_axes==None:
+        if latent_axes is None and sense_axes is None:
            self.fig,(latent_axes,self.sense_axes) = plt.subplots(1,2)
-        elif sense_axes==None:
+        elif sense_axes is None:
            fig=plt.figure()
            self.sense_axes = fig.add_subplot(111)
        else:
@ -300,7 +300,7 @@ class image_show(matplotlib_show):
        self.set_image(self.vals)
        if not self.palette == []: # Can just show the image (self.set_image() took care of setting the palette)
            self.handle = self.axes.imshow(self.vals, interpolation='nearest')
-        elif cmap==None: # Use a jet map.
+        elif cmap is None: # Use a jet map.
            self.handle = self.axes.imshow(self.vals, cmap=plt.cm.jet, interpolation='nearest') # @UndefinedVariable
        else: # Use the selected map.
            self.handle = self.axes.imshow(self.vals, cmap=cmap, interpolation='nearest') # @UndefinedVariable
@ -368,7 +368,7 @@ class mocap_data_show_vpython(vpython_show):
    def draw_edges(self):
        self.rods = []
        self.line_handle = []
-        if not self.connect==None:
+        if self.connect is not None:
            self.I, self.J = np.nonzero(self.connect)
            for i, j in zip(self.I, self.J):
                pos, axis = self.pos_axis(i, j)
@ -380,7 +380,7 @@ class mocap_data_show_vpython(vpython_show):

    def modify_edges(self):
        self.line_handle = []
-        if not self.connect==None:
+        if self.connect is not None:
            self.I, self.J = np.nonzero(self.connect)
            for rod, i, j in zip(self.rods, self.I, self.J):
                rod.pos, rod.axis = self.pos_axis(i, j)
@ -409,9 +409,9 @@ class mocap_data_show(matplotlib_show):
    """Base class for visualizing motion capture data."""

    def __init__(self, vals, axes=None, connect=None, color='b'):
-        if axes==None:
+        if axes is None:
            fig = plt.figure()
-            axes = fig.add_subplot(111, projection='3d', aspect='equal')
+            axes = fig.add_subplot(111, projection='3d') #, aspect='equal') aspect equal not implemented in 3D plots currently see this issue: https://github.com/matplotlib/matplotlib/issues/17172
        super(mocap_data_show, self).__init__(vals, axes)

        self.color = color
@ -428,7 +428,7 @@ class mocap_data_show(matplotlib_show):

    def draw_edges(self):
        self.line_handle = []
-        if not self.connect==None:
+        if self.connect is not None:
            x = []
            y = []
            z = []
@ -499,7 +499,12 @@ class stick_show(mocap_data_show):
        super(stick_show, self).__init__(vals, axes=axes, connect=connect)

    def process_values(self):
-        self.vals = self.vals.reshape((3, self.vals.shape[1]/3)).T
+        """Convert vector of values into a matrix for use as a 3-D point cloud."""
+        try:
+            self.vals = self.vals.reshape((3, self.vals.shape[1]//3)).T
+        except ValueError as e:
+            raise ValueError("Passed values to stick_show need to have a dimension which is divisible by 3 for display as they should be a point cloud of 3-D points.") from e
+            

 class skeleton_show(mocap_data_show):
    """data_show class for visualizing motion capture data encoded as a skeleton with angles."""
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@ -426,6 +426,22 @@ class KernelGradientTestsContinuous(unittest.TestCase):
        k.randomize()
        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))

+    def test_Cosine(self):
+        # Don't test Cosine directly as it fails positive definite test.
+        k = GPy.kern.RBF(self.D-1, ARD=False)*GPy.kern.Cosine(self.D-1, ARD=True)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
+    def test_ExpQuadCosine(self):
+        k = GPy.kern.ExpQuadCosine(self.D-1, ARD=True)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
+    def test_Sinc(self):
+        k = GPy.kern.Sinc(self.D-1, ARD=True)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
    def test_RatQuad(self):
        k = GPy.kern.RatQuad(self.D-1, ARD=True)
        k.randomize()
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@ -936,6 +936,34 @@ class GradientTests(np.testing.TestCase):
        #import ipdb;ipdb.set_trace()
        #m.constrain_fixed('.*rbf_var', 1.)
        self.assertTrue(m.checkgrad())
+    
+    def test_simple_MultivariateGaussian_prior(self):
+        X = np.random.multivariate_normal(
+            [1, 5], np.diag([0.5, 0.3]), (100, 1)).reshape(100, 2)
+        Y = X + np.random.randn(100, 2) * 0.05
+        kernel = GPy.kern.RBF(input_dim=2, variance=1,lengthscale=1, ARD=True)
+        kernel.unconstrain()
+        kernel.variance.set_prior(GPy.priors.Gaussian(150, 5))
+        kernel.lengthscale.set_prior(GPy.priors.MultivariateGaussian(
+            np.array([20, 20]), np.diag([5, 5])))
+        m = GPy.models.GPRegression(X, Y, kernel=kernel)
+        m.optimize()
+        print(m.kern.variance)
+        print(m.kern.lengthscale)
+
+    def test_simple_MultivariateGaussian_prior_matrixmean(self):
+        X = np.random.multivariate_normal(
+            [1, 5], np.diag([0.5, 0.3]), (100, 1)).reshape(100, 2)
+        Y = X + np.random.randn(100, 2) * 0.05
+        kernel = GPy.kern.RBF(input_dim=2, variance=1,lengthscale=1, ARD=True)
+        kernel.unconstrain()
+        kernel.variance.set_prior(GPy.priors.Gaussian(150, 5))
+        kernel.lengthscale.set_prior(GPy.priors.MultivariateGaussian(
+            np.array([[20, 20]]), np.diag([5, 5])))
+        m = GPy.models.GPRegression(X, Y, kernel=kernel)
+        m.optimize()
+        print(m.kern.variance)
+        print(m.kern.lengthscale)

    def test_multioutput_sparse_regression_1D(self):
        X1 = np.random.rand(500, 1) * 8
--- a/GPy/testing/prior_tests.py
+++ b/GPy/testing/prior_tests.py
@ -55,6 +55,21 @@ class PriorTests(unittest.TestCase):
        m.randomize()
        self.assertTrue(m.checkgrad())

+    def test_InverseGamma(self):
+        # Test that this prior object can be instantiated and performs its basic functions
+        # in integration.
+        xmin, xmax = 1, 2.5*np.pi
+        b, C, SNR = 1, 0, 0.1
+        X = np.linspace(xmin, xmax, 500)
+        y  = b*X + C + 1*np.sin(X)
+        y += 0.05*np.random.randn(len(X))
+        X, y = X[:, None], y[:, None]
+        m = GPy.models.GPRegression(X, y)
+        InverseGamma = GPy.priors.InverseGamma(1, 1)
+        m.rbf.set_prior(InverseGamma)
+        m.randomize()
+        self.assertTrue(m.checkgrad())
+
    def test_incompatibility(self):
        xmin, xmax = 1, 2.5*np.pi
        b, C, SNR = 1, 0, 0.1
--- a/GPy/testing/state_space_main_tests.py
+++ b/GPy/testing/state_space_main_tests.py
@ -722,9 +722,9 @@ class StateSpaceKernelsTests(np.testing.TestCase):
            # plotting <-
        # 2D measurement, 3 ts_no <-
            
-    def test_continuos_ss(self,plot=False):
+    def test_continuous_ss(self,plot=False):
        """
-        This function tests the continuos state-space model.
+        This function tests the continuous state-space model.
        """                    
                
        # 1D measurements, 1 ts_no ->
@ -974,4 +974,4 @@ if __name__ == '__main__':
    #res = tt.test_discrete_ss_2D(plot=False) 
    #res = tt.test_continuos_ss(plot=True)
    
- 
+ 
--- a/GPy/util/normalizer.py
+++ b/GPy/util/normalizer.py
@ -4,7 +4,7 @@ Created on Aug 27, 2014
@author: Max Zwiessele
 '''
 import numpy as np
-
+import warnings

 class _Norm(object):
    def __init__(self):
@ -90,6 +90,10 @@ class Standardize(_Norm):
        Y = np.ma.masked_invalid(Y, copy=False)
        self.mean = Y.mean(0).view(np.ndarray)
        self.std = Y.std(0).view(np.ndarray)
+        if np.any(self.std == 0):
+            warnings.warn("Some values of Y have standard deviation of zero. Resetting to 1.0 to avoid divide by zero errors.")
+           # Choice of setting to 1.0 is somewhat arbitrary. It avoids a divide by zero error, but setting to EPS would also do this. Don't have strong reasons for choosing 1.0, it was just first instinct 
+            self.std[np.where(self.std==0)]=1. 

    def normalize(self, Y):
        super(Standardize, self).normalize(Y)
--- a/appveyor.yml
+++ b/appveyor.yml
@ -3,18 +3,20 @@ environment:
    secure: 8/ZjXFwtd1S7ixd7PJOpptupKKEDhm2da/q3unabJ00=
  COVERALLS_REPO_TOKEN:
    secure: d3Luic/ESkGaWnZrvWZTKrzO+xaVwJWaRCEP0F+K/9DQGPSRZsJ/Du5g3s4XF+tS
-  gpy_version: 1.9.9
+  gpy_version: 1.12.0
  matrix:
-    - PYTHON_VERSION: 3.5
-      MINICONDA: C:\Miniconda35-x64
    - PYTHON_VERSION: 3.6
-      MINICONDA: C:\Miniconda36-x64
+      MINICONDA: C:\Miniconda3-x64
+      MPL_VERSION: 3.3.4
    - PYTHON_VERSION: 3.7
-      MINICONDA: C:\Miniconda36-x64
+      MINICONDA: C:\Miniconda3-x64
+      MPL_VERSION: 3.3.4
    - PYTHON_VERSION: 3.8
-      MINICONDA: C:\Miniconda36-x64
+      MINICONDA: C:\Miniconda3-x64
+      MPL_VERSION: 3.3.4
    - PYTHON_VERSION: 3.9
-      MINICONDA: C:\Miniconda36-x64
+      MINICONDA: C:\Miniconda3-x64
+      MPL_VERSION: 3.3.4

 #configuration:
 #  - Debug
@ -25,7 +27,8 @@ install:
 - conda config --set always_yes yes --set changeps1 no
 - conda update -q conda
 - conda info -a
- - "conda create -q -n build-environment python=%PYTHON_VERSION% numpy scipy matplotlib"
+# github issue #955: freeze build version of matplotlib
+ - "conda create -q -n build-environment python=%PYTHON_VERSION% numpy scipy matplotlib=%MPL_VERSION%"
 - activate build-environment
 # We need wheel installed to build wheels
 - python -m pip install wheel
--- a/doc/source/tuto_creating_new_kernels.rst
+++ b/doc/source/tuto_creating_new_kernels.rst
@ -7,7 +7,7 @@ We will see in this tutorial how to create new kernels in GPy. We will also give
 Structure of a kernel in GPy
 ============================

-In GPy a kernel object is made of a list of kernpart objects, which correspond to symetric positive definite functions. More precisely, the kernel should be understood as the sum of the kernparts. In order to implement a new covariance, the following steps must be followed
+In GPy a kernel object is made of a list of kernpart objects, which correspond to symmetric positive definite functions. More precisely, the kernel should be understood as the sum of the kernparts. In order to implement a new covariance, the following steps must be followed

    1. implement the new covariance as a :py:class:`GPy.kern.src.kern.Kern` object
    2. update the :py:mod:`GPy.kern.src` file
--- a/setup.cfg
+++ b/setup.cfg
@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.9.9
+current_version = 1.12.0
 tag = True
 commit = True

--- a/setup.py
+++ b/setup.py
@ -118,10 +118,8 @@ except ModuleNotFoundError:
    ext_mods = []

 install_requirements = ['numpy>=1.7', 'six', 'paramz>=0.9.0', 'cython>=0.29']
-if sys.version_info < (3, 6):
-    install_requirements += ['scipy>=1.3.0,<1.5.0']
-else:
-    install_requirements += ['scipy>=1.3.0']
+matplotlib_version = 'matplotlib==3.3.4'
+install_requirements += ['scipy>=1.3.0']

 setup(name = 'GPy',
      version = __version__,
@ -174,7 +172,8 @@ setup(name = 'GPy',
                        'optional':['mpi4py',
                                    'ipython>=4.0.0',
                                    ],
-                        'plotting':['matplotlib >= 3.0',
+                        #matplotlib Version see github issue #955
+                        'plotting':[matplotlib_version,
                                    'plotly >= 1.8.6'],
                        'notebook':['jupyter_client >= 4.0.6',
                                    'ipywidgets >= 4.0.3',