replace np.int by int

2026-05-21 14:05:14 +02:00 · 2023-10-16 21:20:17 +02:00 · 2023-10-16 21:20:17 +02:00 · 65af6ee35e
commit 65af6ee35e
parent a6d78d79aa
15 changed files with 3889 additions and 2375 deletions
--- a/GPy/kern/src/coregionalize.py
+++ b/GPy/kern/src/coregionalize.py
@ -5,13 +5,16 @@ from .kern import Kern
 import numpy as np
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
-from ...util.config import config # for assesing whether to use cython
+from ...util.config import config  # for assesing whether to use cython

 try:
    from . import coregionalize_cython
-    use_coregionalize_cython = config.getboolean('cython', 'working')
+
+    use_coregionalize_cython = config.getboolean("cython", "working")
 except ImportError:
-    print('warning in coregionalize: failed to import cython module: falling back to numpy')
+    print(
+        "warning in coregionalize: failed to import cython module: falling back to numpy"
+    )
    use_coregionalize_cython = False


@ -43,22 +46,34 @@ class Coregionalize(Kern):

    .. note: see coregionalization examples in GPy.examples.regression for some usage.
    """
-    def __init__(self, input_dim, output_dim, rank=1, W=None, kappa=None, active_dims=None, name='coregion'):
+
+    def __init__(
+        self,
+        input_dim,
+        output_dim,
+        rank=1,
+        W=None,
+        kappa=None,
+        active_dims=None,
+        name="coregion",
+    ):
        super(Coregionalize, self).__init__(input_dim, active_dims, name=name)
        self.output_dim = output_dim
        self.rank = rank
-        if self.rank>output_dim:
-            print("Warning: Unusual choice of rank, it should normally be less than the output_dim.")
+        if self.rank > output_dim:
+            print(
+                "Warning: Unusual choice of rank, it should normally be less than the output_dim."
+            )
        if W is None:
-            W = 0.5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
+            W = 0.5 * np.random.randn(self.output_dim, self.rank) / np.sqrt(self.rank)
        else:
-            assert W.shape==(self.output_dim, self.rank)
-        self.W = Param('W', W)
+            assert W.shape == (self.output_dim, self.rank)
+        self.W = Param("W", W)
        if kappa is None:
-            kappa = 0.5*np.ones(self.output_dim)
+            kappa = 0.5 * np.ones(self.output_dim)
        else:
-            assert kappa.shape==(self.output_dim, )
-        self.kappa = Param('kappa', kappa, Logexp())
+            assert kappa.shape == (self.output_dim,)
+        self.kappa = Param("kappa", kappa, Logexp())
        self.link_parameters(self.W, self.kappa)

    def parameters_changed(self):
@ -70,63 +85,69 @@ class Coregionalize(Kern):
        else:
            return self._K_numpy(X, X2)

-
    def _K_numpy(self, X, X2=None):
-        index = np.asarray(X, dtype=np.int)
+        index = np.asarray(X, dtype=int)
        if X2 is None:
-            return self.B[index,index.T]
+            return self.B[index, index.T]
        else:
-            index2 = np.asarray(X2, dtype=np.int)
-            return self.B[index,index2.T]
+            index2 = np.asarray(X2, dtype=int)
+            return self.B[index, index2.T]

    def _K_cython(self, X, X2=None):
        if X2 is None:
-            return coregionalize_cython.K_symmetric(self.B, np.asarray(X, dtype=np.int64)[:,0])
-        return coregionalize_cython.K_asymmetric(self.B, np.asarray(X, dtype=np.int64)[:,0], np.asarray(X2, dtype=np.int64)[:,0])
-
+            return coregionalize_cython.K_symmetric(
+                self.B, np.asarray(X, dtype=np.int64)[:, 0]
+            )
+        return coregionalize_cython.K_asymmetric(
+            self.B,
+            np.asarray(X, dtype=np.int64)[:, 0],
+            np.asarray(X2, dtype=np.int64)[:, 0],
+        )

    def Kdiag(self, X):
-        return np.diag(self.B)[np.asarray(X, dtype=np.int).flatten()]
+        return np.diag(self.B)[np.asarray(X, dtype=int).flatten()]

    def update_gradients_full(self, dL_dK, X, X2=None):
-        index = np.asarray(X, dtype=np.int)
+        index = np.asarray(X, dtype=int)
        if X2 is None:
            index2 = index
        else:
-            index2 = np.asarray(X2, dtype=np.int)
+            index2 = np.asarray(X2, dtype=int)

-        #attempt to use cython for a nasty double indexing loop: fall back to numpy
+        # attempt to use cython for a nasty double indexing loop: fall back to numpy
        if use_coregionalize_cython:
            dL_dK_small = self._gradient_reduce_cython(dL_dK, index, index2)
        else:
            dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)

-
        dkappa = np.diag(dL_dK_small).copy()
        dL_dK_small += dL_dK_small.T
-        dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
+        dW = (self.W[:, None, :] * dL_dK_small[:, :, None]).sum(0)

        self.W.gradient = dW
        self.kappa.gradient = dkappa

    def _gradient_reduce_numpy(self, dL_dK, index, index2):
-        index, index2 = index[:,0], index2[:,0]
+        index, index2 = index[:, 0], index2[:, 0]
        dL_dK_small = np.zeros_like(self.B)
        for i in range(self.output_dim):
-            tmp1 = dL_dK[index==i]
+            tmp1 = dL_dK[index == i]
            for j in range(self.output_dim):
-                dL_dK_small[j,i] = tmp1[:,index2==j].sum()
+                dL_dK_small[j, i] = tmp1[:, index2 == j].sum()
        return dL_dK_small

    def _gradient_reduce_cython(self, dL_dK, index, index2):
-        index, index2 = np.int64(index[:,0]), np.int64(index2[:,0])
-        return coregionalize_cython.gradient_reduce(self.B.shape[0], dL_dK, index, index2)
-
+        index, index2 = np.int64(index[:, 0]), np.int64(index2[:, 0])
+        return coregionalize_cython.gradient_reduce(
+            self.B.shape[0], dL_dK, index, index2
+        )

    def update_gradients_diag(self, dL_dKdiag, X):
-        index = np.asarray(X, dtype=np.int).flatten()
-        dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in range(self.output_dim)])
-        self.W.gradient = 2.*self.W*dL_dKdiag_small[:, None]
+        index = np.asarray(X, dtype=int).flatten()
+        dL_dKdiag_small = np.array(
+            [dL_dKdiag[index == i].sum() for i in range(self.output_dim)]
+        )
+        self.W.gradient = 2.0 * self.W * dL_dKdiag_small[:, None]
        self.kappa.gradient = dL_dKdiag_small

    def gradients_X(self, dL_dK, X, X2=None):
@ -154,8 +175,8 @@ class Coregionalize(Kern):

    @staticmethod
    def _build_from_input_dict(kernel_class, input_dict):
-        useGPU = input_dict.pop('useGPU', None)
+        useGPU = input_dict.pop("useGPU", None)
        # W and kappa must be converted back to numpy arrays
-        input_dict['W'] = np.array(input_dict['W'])
-        input_dict['kappa'] = np.array(input_dict['kappa'])
+        input_dict["W"] = np.array(input_dict["W"])
+        input_dict["kappa"] = np.array(input_dict["kappa"])
        return Coregionalize(**input_dict)
--- a/GPy/kern/src/eq_ode1.py
+++ b/GPy/kern/src/eq_ode1.py
@ -8,6 +8,7 @@ from ...core.parameterization import Param
 from paramz.transformations import Logexp
 from paramz.caching import Cache_this

+
 class EQ_ODE1(Kern):
    """
    Covariance function for first order differential equation driven by an exponentiated quadratic covariance.
@ -33,23 +34,36 @@ class EQ_ODE1(Kern):

    .. Note: see first order differential equation examples in GPy.examples.regression for some usage.
    """
-    def __init__(self, input_dim=2, output_dim=1, rank=1, W = None, lengthscale=None,  decay=None, active_dims=None, name='eq_ode1'):
+
+    def __init__(
+        self,
+        input_dim=2,
+        output_dim=1,
+        rank=1,
+        W=None,
+        lengthscale=None,
+        decay=None,
+        active_dims=None,
+        name="eq_ode1",
+    ):
        assert input_dim == 2, "only defined for 1 input dims"
-        super(EQ_ODE1, self).__init__(input_dim=input_dim, active_dims=active_dims, name=name)
+        super(EQ_ODE1, self).__init__(
+            input_dim=input_dim, active_dims=active_dims, name=name
+        )

        self.rank = rank
        self.output_dim = output_dim

        if lengthscale is None:
-            lengthscale = .5 + np.random.rand(self.rank)
+            lengthscale = 0.5 + np.random.rand(self.rank)
        else:
            lengthscale = np.asarray(lengthscale)
            assert lengthscale.size in [1, self.rank], "Bad number of lengthscales"
            if lengthscale.size != self.rank:
-                lengthscale = np.ones(self.rank)*lengthscale
+                lengthscale = np.ones(self.rank) * lengthscale

        if W is None:
-            W = .5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
+            W = 0.5 * np.random.randn(self.output_dim, self.rank) / np.sqrt(self.rank)
        else:
            assert W.shape == (self.output_dim, self.rank)

@ -59,168 +73,181 @@ class EQ_ODE1(Kern):
            decay = np.asarray(decay)
            assert decay.size in [1, self.output_dim], "Bad number of decay"
            if decay.size != self.output_dim:
-                decay = np.ones(self.output_dim)*decay
+                decay = np.ones(self.output_dim) * decay

-#        if kappa is None:
-#            self.kappa = np.ones(self.output_dim)
-#        else:
-#            kappa = np.asarray(kappa)
-#            assert kappa.size in [1, self.output_dim], "Bad number of kappa"
-#            if decay.size != self.output_dim:
-#                decay = np.ones(self.output_dim)*kappa
+        #        if kappa is None:
+        #            self.kappa = np.ones(self.output_dim)
+        #        else:
+        #            kappa = np.asarray(kappa)
+        #            assert kappa.size in [1, self.output_dim], "Bad number of kappa"
+        #            if decay.size != self.output_dim:
+        #                decay = np.ones(self.output_dim)*kappa

-        #self.kappa = Param('kappa', kappa, Logexp())
-        #self.delay = Param('delay', delay, Logexp())
-        #self.is_normalized = True
-        #self.is_stationary = False
-        #self.gaussian_initial = False
+        # self.kappa = Param('kappa', kappa, Logexp())
+        # self.delay = Param('delay', delay, Logexp())
+        # self.is_normalized = True
+        # self.is_stationary = False
+        # self.gaussian_initial = False

-        self.lengthscale = Param('lengthscale', lengthscale, Logexp())
-        self.decay = Param('decay', decay, Logexp())
-        self.W = Param('W', W)
+        self.lengthscale = Param("lengthscale", lengthscale, Logexp())
+        self.decay = Param("decay", decay, Logexp())
+        self.W = Param("W", W)
        self.link_parameters(self.lengthscale, self.decay, self.W)

    @Cache_this(limit=3)
    def K(self, X, X2=None):
-        #This way is not working, indexes are lost after using k._slice_X
-        #index = np.asarray(X, dtype=np.int)
-        #index = index.reshape(index.size,)
-        if hasattr(X, 'values'):
+        # This way is not working, indexes are lost after using k._slice_X
+        # index = np.asarray(X, dtype=int)
+        # index = index.reshape(index.size,)
+        if hasattr(X, "values"):
            X = X.values
        index = np.int_(np.round(X[:, 1]))
-        index = index.reshape(index.size,)
+        index = index.reshape(
+            index.size,
+        )
        X_flag = index[0] >= self.output_dim
        if X2 is None:
            if X_flag:
-                #Calculate covariance function for the latent functions
+                # Calculate covariance function for the latent functions
                index -= self.output_dim
                return self._Kuu(X, index)
            else:
                raise NotImplementedError
        else:
-            #This way is not working, indexes are lost after using k._slice_X
-            #index2 = np.asarray(X2, dtype=np.int)
-            #index2 = index2.reshape(index2.size,)
-            if hasattr(X2, 'values'):
+            # This way is not working, indexes are lost after using k._slice_X
+            # index2 = np.asarray(X2, dtype=int)
+            # index2 = index2.reshape(index2.size,)
+            if hasattr(X2, "values"):
                X2 = X2.values
            index2 = np.int_(np.round(X2[:, 1]))
-            index2 = index2.reshape(index2.size,)
+            index2 = index2.reshape(
+                index2.size,
+            )
            X2_flag = index2[0] >= self.output_dim
-            #Calculate cross-covariance function
+            # Calculate cross-covariance function
            if not X_flag and X2_flag:
                index2 -= self.output_dim
-                return self._Kfu(X, index, X2, index2) #Kfu
+                return self._Kfu(X, index, X2, index2)  # Kfu
            elif X_flag and not X2_flag:
                index -= self.output_dim
-                return self._Kfu(X2, index2, X, index).T #Kuf
+                return self._Kfu(X2, index2, X, index).T  # Kuf
            elif X_flag and X2_flag:
                index -= self.output_dim
                index2 -= self.output_dim
-                return self._Kusu(X, index, X2, index2) #Ku_s u
+                return self._Kusu(X, index, X2, index2)  # Ku_s u
            else:
-                raise NotImplementedError #Kf_s f
+                raise NotImplementedError  # Kf_s f

-    #Calculate the covariance function for diag(Kff(X,X))
+    # Calculate the covariance function for diag(Kff(X,X))
    def Kdiag(self, X):
-        if hasattr(X, 'values'):
+        if hasattr(X, "values"):
            index = np.int_(np.round(X[:, 1].values))
        else:
            index = np.int_(np.round(X[:, 1]))
-        index = index.reshape(index.size,)
+        index = index.reshape(
+            index.size,
+        )
        X_flag = index[0] >= self.output_dim

-        if X_flag: #Kuudiag        
-            return np.ones(X[:,0].shape)
-        else: #Kffdiag
+        if X_flag:  # Kuudiag
+            return np.ones(X[:, 0].shape)
+        else:  # Kffdiag
            kdiag = self._Kdiag(X)
            return np.sum(kdiag, axis=1)

    def _Kdiag(self, X):
-        #This way is not working, indexes are lost after using k._slice_X
-        #index = np.asarray(X, dtype=np.int)
-        #index = index.reshape(index.size,)
-        if hasattr(X, 'values'):
+        # This way is not working, indexes are lost after using k._slice_X
+        # index = np.asarray(X, dtype=int)
+        # index = index.reshape(index.size,)
+        if hasattr(X, "values"):
            X = X.values
        index = np.int_(X[:, 1])
-        index = index.reshape(index.size,)
+        index = index.reshape(
+            index.size,
+        )

-        #terms that move along t
+        # terms that move along t
        t = X[:, 0].reshape(X.shape[0], 1)
-        d = np.unique(index) #Output Indexes
+        d = np.unique(index)  # Output Indexes
        B = self.decay.values[d]
        S = self.W.values[d, :]
-        #Index transformation
+        # Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]

        B = B.reshape(B.size, 1)
-        #Terms that move along q
+        # Terms that move along q
        lq = self.lengthscale.values.reshape(1, self.rank)
-        S2 = S*S
-        kdiag = np.empty((t.size, ))
+        S2 = S * S
+        kdiag = np.empty((t.size,))

-        #Dx1 terms
-        c0 = (S2/B)*((.5*np.sqrt(np.pi))*lq)
+        # Dx1 terms
+        c0 = (S2 / B) * ((0.5 * np.sqrt(np.pi)) * lq)

-        #DxQ terms
-        nu = lq*(B*.5)
-        nu2 = nu*nu
-        #Nx1 terms
-        gamt = -2.*B
-        gamt = gamt[index]*t
+        # DxQ terms
+        nu = lq * (B * 0.5)
+        nu2 = nu * nu
+        # Nx1 terms
+        gamt = -2.0 * B
+        gamt = gamt[index] * t

-        #NxQ terms
-        t_lq = t/lq
+        # NxQ terms
+        t_lq = t / lq

        # Upsilon Calculations
        # Using wofz
-        #erfnu = erf(nu)
+        # erfnu = erf(nu)

-        upm = np.exp(nu2[index, :] + lnDifErf( nu[index, :] ,t_lq+nu[index,:] ))
-        upm[t[:, 0] == 0, :] = 0.
+        upm = np.exp(nu2[index, :] + lnDifErf(nu[index, :], t_lq + nu[index, :]))
+        upm[t[:, 0] == 0, :] = 0.0

+        upv = np.exp(
+            nu2[index, :] + gamt + lnDifErf(-t_lq + nu[index, :], nu[index, :])
+        )
+        upv[t[:, 0] == 0, :] = 0.0

-        upv = np.exp(nu2[index, :] + gamt + lnDifErf( -t_lq+nu[index,:], nu[index, :] ) )
-        upv[t[:, 0] == 0, :] = 0.
-
-        #Covariance calculation
-        #kdiag = np.sum(c0[index, :]*(upm-upv), axis=1)
-        kdiag = c0[index, :]*(upm-upv)
+        # Covariance calculation
+        # kdiag = np.sum(c0[index, :]*(upm-upv), axis=1)
+        kdiag = c0[index, :] * (upm - upv)
        return kdiag

-    def update_gradients_full(self, dL_dK, X, X2 = None):
-        #index = np.asarray(X, dtype=np.int)
-        #index = index.reshape(index.size,)
-        if hasattr(X, 'values'):
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        # index = np.asarray(X, dtype=int)
+        # index = index.reshape(index.size,)
+        if hasattr(X, "values"):
            X = X.values
        self.decay.gradient = np.zeros(self.decay.shape)
        self.W.gradient = np.zeros(self.W.shape)
        self.lengthscale.gradient = np.zeros(self.lengthscale.shape)
        index = np.int_(np.round(X[:, 1]))
-        index = index.reshape(index.size,)
+        index = index.reshape(
+            index.size,
+        )
        X_flag = index[0] >= self.output_dim
        if X2 is None:
-            if X_flag: #Kuu or Kmm
+            if X_flag:  # Kuu or Kmm
                index -= self.output_dim
-                tmp = dL_dK*self._gkuu_lq(X, index)
+                tmp = dL_dK * self._gkuu_lq(X, index)
                for q in np.unique(index):
                    ind = np.where(index == q)
                    self.lengthscale.gradient[q] = tmp[np.ix_(ind[0], ind[0])].sum()
            else:
                raise NotImplementedError
-        else: #Kfu or Knm
-            #index2 = np.asarray(X2, dtype=np.int)
-            #index2 = index2.reshape(index2.size,)
-            if hasattr(X2, 'values'):
+        else:  # Kfu or Knm
+            # index2 = np.asarray(X2, dtype=int)
+            # index2 = index2.reshape(index2.size,)
+            if hasattr(X2, "values"):
                X2 = X2.values
            index2 = np.int_(np.round(X2[:, 1]))
-            index2 = index2.reshape(index2.size,)
+            index2 = index2.reshape(
+                index2.size,
+            )
            X2_flag = index2[0] >= self.output_dim
-            if not X_flag and X2_flag: #Kfu
+            if not X_flag and X2_flag:  # Kfu
                index2 -= self.output_dim
-            else: #Kuf
-                dL_dK = dL_dK.T #so we obtaing dL_Kfu
+            else:  # Kuf
+                dL_dK = dL_dK.T  # so we obtaing dL_Kfu
                indtemp = index - self.output_dim
                Xtemp = X
                X = X2
@ -228,12 +255,12 @@ class EQ_ODE1(Kern):
                index = index2
                index2 = indtemp
            glq, gSdq, gB = self._gkfu(X, index, X2, index2)
-            tmp = dL_dK*glq
+            tmp = dL_dK * glq
            for q in np.unique(index2):
                ind = np.where(index2 == q)
                self.lengthscale.gradient[q] = tmp[:, ind].sum()
-            tmpB = dL_dK*gB
-            tmp = dL_dK*gSdq
+            tmpB = dL_dK * gB
+            tmp = dL_dK * gSdq
            for d in np.unique(index):
                ind = np.where(index == d)
                self.decay.gradient[d] = tmpB[ind, :].sum()
@ -242,404 +269,459 @@ class EQ_ODE1(Kern):
                    self.W.gradient[d, q] = tmp[np.ix_(ind[0], ind2[0])].sum()

    def update_gradients_diag(self, dL_dKdiag, X):
-        #index = np.asarray(X, dtype=np.int)
-        #index = index.reshape(index.size,)
-        if hasattr(X, 'values'):
+        # index = np.asarray(X, dtype=int)
+        # index = index.reshape(index.size,)
+        if hasattr(X, "values"):
            X = X.values
        self.decay.gradient = np.zeros(self.decay.shape)
        self.W.gradient = np.zeros(self.W.shape)
        self.lengthscale.gradient = np.zeros(self.lengthscale.shape)
        index = np.int_(X[:, 1])
-        index = index.reshape(index.size,)
+        index = index.reshape(
+            index.size,
+        )

        glq, gS, gB = self._gkdiag(X, index)
        if dL_dKdiag.size == X.shape[0]:
            dL_dKdiag = np.reshape(dL_dKdiag, (index.size, 1))
-        tmp = dL_dKdiag*glq
+        tmp = dL_dKdiag * glq
        self.lengthscale.gradient = tmp.sum(0)
-        tmpB = dL_dKdiag*gB
-        tmp = dL_dKdiag*gS
+        tmpB = dL_dKdiag * gB
+        tmp = dL_dKdiag * gS
        for d in np.unique(index):
            ind = np.where(index == d)
            self.decay.gradient[d] = tmpB[ind, :].sum()
            self.W.gradient[d, :] = tmp[ind].sum(0)

    def gradients_X(self, dL_dK, X, X2=None):
-        #index = np.asarray(X, dtype=np.int)
-        #index = index.reshape(index.size,)
-        if hasattr(X, 'values'):
+        # index = np.asarray(X, dtype=int)
+        # index = index.reshape(index.size,)
+        if hasattr(X, "values"):
            X = X.values
        index = np.int_(np.round(X[:, 1]))
-        index = index.reshape(index.size,)
+        index = index.reshape(
+            index.size,
+        )
        X_flag = index[0] >= self.output_dim
-        #If input_dim == 1, use this
-        #gX = np.zeros((X.shape[0], 1))
-        #Cheat to allow gradient for input_dim==2
+        # If input_dim == 1, use this
+        # gX = np.zeros((X.shape[0], 1))
+        # Cheat to allow gradient for input_dim==2
        gX = np.zeros(X.shape)
-        if X2 is None: #Kuu or Kmm
+        if X2 is None:  # Kuu or Kmm
            if X_flag:
                index -= self.output_dim
-                gX[:, 0] = 2.*(dL_dK*self._gkuu_X(X, index)).sum(0)
+                gX[:, 0] = 2.0 * (dL_dK * self._gkuu_X(X, index)).sum(0)
                return gX
            else:
                raise NotImplementedError
-        else: #Kuf or Kmn
-            #index2 = np.asarray(X2, dtype=np.int)
-            #index2 = index2.reshape(index2.size,)
-            if hasattr(X2, 'values'):
+        else:  # Kuf or Kmn
+            # index2 = np.asarray(X2, dtype=int)
+            # index2 = index2.reshape(index2.size,)
+            if hasattr(X2, "values"):
                X2 = X2.values
            index2 = np.int_(np.round(X2[:, 1]))
-            index2 = index2.reshape(index2.size,)
+            index2 = index2.reshape(
+                index2.size,
+            )
            X2_flag = index2[0] >= self.output_dim
-            if X_flag and not X2_flag: #gradient of Kuf(Z, X) wrt Z
+            if X_flag and not X2_flag:  # gradient of Kuf(Z, X) wrt Z
                index -= self.output_dim
-                gX[:, 0] = (dL_dK*self._gkfu_z(X2, index2, X, index).T).sum(1)
+                gX[:, 0] = (dL_dK * self._gkfu_z(X2, index2, X, index).T).sum(1)
                return gX
            else:
                raise NotImplementedError

-    #---------------------------------------#
+    # ---------------------------------------#
    #             Helper functions          #
-    #---------------------------------------#
+    # ---------------------------------------#

-    #Evaluation of squared exponential for LFM
+    # Evaluation of squared exponential for LFM
    def _Kuu(self, X, index):
-        index = index.reshape(index.size,)
-        t = X[:, 0].reshape(X.shape[0],)
-        lq = self.lengthscale.values.reshape(self.rank,)
-        lq2 = lq*lq
-        #Covariance matrix initialization
+        index = index.reshape(
+            index.size,
+        )
+        t = X[:, 0].reshape(
+            X.shape[0],
+        )
+        lq = self.lengthscale.values.reshape(
+            self.rank,
+        )
+        lq2 = lq * lq
+        # Covariance matrix initialization
        kuu = np.zeros((t.size, t.size))
-        #Assign 1. to diagonal terms
-        kuu[np.diag_indices(t.size)] = 1.
-        #Upper triangular indices
+        # Assign 1. to diagonal terms
+        kuu[np.diag_indices(t.size)] = 1.0
+        # Upper triangular indices
        indtri1, indtri2 = np.triu_indices(t.size, 1)
-        #Block Diagonal indices among Upper Triangular indices
+        # Block Diagonal indices among Upper Triangular indices
        ind = np.where(index[indtri1] == index[indtri2])
        indr = indtri1[ind]
        indc = indtri2[ind]
        r = t[indr] - t[indc]
-        r2 = r*r
-        #Calculation of  covariance function
-        kuu[indr, indc] = np.exp(-r2/lq2[index[indr]])
-        #Completion of lower triangular part
+        r2 = r * r
+        # Calculation of  covariance function
+        kuu[indr, indc] = np.exp(-r2 / lq2[index[indr]])
+        # Completion of lower triangular part
        kuu[indc, indr] = kuu[indr, indc]
        return kuu

    def _Kusu(self, X, index, X2, index2):
-        index = index.reshape(index.size,)
-        index2 = index2.reshape(index2.size,)
-        t = X[:, 0].reshape(X.shape[0],1)
-        t2 = X2[:, 0].reshape(1,X2.shape[0])
-        lq = self.lengthscale.values.reshape(self.rank,)
-        #Covariance matrix initialization
+        index = index.reshape(
+            index.size,
+        )
+        index2 = index2.reshape(
+            index2.size,
+        )
+        t = X[:, 0].reshape(X.shape[0], 1)
+        t2 = X2[:, 0].reshape(1, X2.shape[0])
+        lq = self.lengthscale.values.reshape(
+            self.rank,
+        )
+        # Covariance matrix initialization
        kuu = np.zeros((t.size, t2.size))
        for q in range(self.rank):
            ind1 = index == q
            ind2 = index2 == q
-            r = t[ind1]/lq[q] - t2[0,ind2]/lq[q]
-            r2 = r*r
-            #Calculation of  covariance function
+            r = t[ind1] / lq[q] - t2[0, ind2] / lq[q]
+            r2 = r * r
+            # Calculation of  covariance function
            kuu[np.ix_(ind1, ind2)] = np.exp(-r2)
        return kuu

-    #Evaluation of cross-covariance function
+    # Evaluation of cross-covariance function
    def _Kfu(self, X, index, X2, index2):
-        #terms that move along t
+        # terms that move along t
        t = X[:, 0].reshape(X.shape[0], 1)
-        d = np.unique(index) #Output Indexes
+        d = np.unique(index)  # Output Indexes
        B = self.decay.values[d]
        S = self.W.values[d, :]
-        #Index transformation
+        # Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]
-        #Output related variables must be column-wise
+        # Output related variables must be column-wise
        B = B.reshape(B.size, 1)
-        #Input related variables must be row-wise
+        # Input related variables must be row-wise
        z = X2[:, 0].reshape(1, X2.shape[0])
        lq = self.lengthscale.values.reshape((1, self.rank))

        kfu = np.empty((t.size, z.size))

-        #DxQ terms
-        c0 = S*((.5*np.sqrt(np.pi))*lq)
-        nu = B*(.5*lq)
+        # DxQ terms
+        c0 = S * ((0.5 * np.sqrt(np.pi)) * lq)
+        nu = B * (0.5 * lq)
        nu2 = nu**2
-        #1xM terms
-        z_lq = z/lq[0, index2]
-        #NxM terms
-        tz = t-z
-        tz_lq = tz/lq[0, index2]
+        # 1xM terms
+        z_lq = z / lq[0, index2]
+        # NxM terms
+        tz = t - z
+        tz_lq = tz / lq[0, index2]

        # Upsilon Calculations
        fullind = np.ix_(index, index2)

-        upsi = np.exp(nu2[fullind] - B[index]*tz + lnDifErf( -tz_lq + nu[fullind], z_lq+nu[fullind]))
-        upsi[t[:, 0] == 0, :] = 0.
-        #Covariance calculation
-        kfu = c0[fullind]*upsi
+        upsi = np.exp(
+            nu2[fullind]
+            - B[index] * tz
+            + lnDifErf(-tz_lq + nu[fullind], z_lq + nu[fullind])
+        )
+        upsi[t[:, 0] == 0, :] = 0.0
+        # Covariance calculation
+        kfu = c0[fullind] * upsi

        return kfu

-    #Gradient of Kuu wrt lengthscale
+    # Gradient of Kuu wrt lengthscale
    def _gkuu_lq(self, X, index):
-        t = X[:, 0].reshape(X.shape[0],)
-        index = index.reshape(X.shape[0],)
-        lq = self.lengthscale.values.reshape(self.rank,)
-        lq2 = lq*lq
-        #Covariance matrix initialization
+        t = X[:, 0].reshape(
+            X.shape[0],
+        )
+        index = index.reshape(
+            X.shape[0],
+        )
+        lq = self.lengthscale.values.reshape(
+            self.rank,
+        )
+        lq2 = lq * lq
+        # Covariance matrix initialization
        glq = np.zeros((t.size, t.size))
-        #Upper triangular indices
+        # Upper triangular indices
        indtri1, indtri2 = np.triu_indices(t.size, 1)
-        #Block Diagonal indices among Upper Triangular indices
+        # Block Diagonal indices among Upper Triangular indices
        ind = np.where(index[indtri1] == index[indtri2])
        indr = indtri1[ind]
        indc = indtri2[ind]
        r = t[indr] - t[indc]
-        r2 = r*r
-        r2_lq2 = r2/lq2[index[indr]]
-        #Calculation of  covariance function
+        r2 = r * r
+        r2_lq2 = r2 / lq2[index[indr]]
+        # Calculation of  covariance function
        er2_lq2 = np.exp(-r2_lq2)
-        #Gradient wrt lq
-        c = 2.*r2_lq2/lq[index[indr]]
-        glq[indr, indc] = er2_lq2*c
-        #Complete the lower triangular
+        # Gradient wrt lq
+        c = 2.0 * r2_lq2 / lq[index[indr]]
+        glq[indr, indc] = er2_lq2 * c
+        # Complete the lower triangular
        glq[indc, indr] = glq[indr, indc]
        return glq

-    #Be careful this derivative should be transpose it
-    def _gkuu_X(self, X, index): #Diagonal terms are always zero
-        t = X[:, 0].reshape(X.shape[0],)
-        index = index.reshape(index.size,)
-        lq = self.lengthscale.values.reshape(self.rank,)
-        lq2 = lq*lq
-        #Covariance matrix initialization
+    # Be careful this derivative should be transpose it
+    def _gkuu_X(self, X, index):  # Diagonal terms are always zero
+        t = X[:, 0].reshape(
+            X.shape[0],
+        )
+        index = index.reshape(
+            index.size,
+        )
+        lq = self.lengthscale.values.reshape(
+            self.rank,
+        )
+        lq2 = lq * lq
+        # Covariance matrix initialization
        gt = np.zeros((t.size, t.size))
-        #Upper triangular indices
-        indtri1, indtri2 = np.triu_indices(t.size, 1) #Offset of 1 from the diagonal
-        #Block Diagonal indices among Upper Triangular indices
+        # Upper triangular indices
+        indtri1, indtri2 = np.triu_indices(t.size, 1)  # Offset of 1 from the diagonal
+        # Block Diagonal indices among Upper Triangular indices
        ind = np.where(index[indtri1] == index[indtri2])
        indr = indtri1[ind]
        indc = indtri2[ind]
        r = t[indr] - t[indc]
-        r2 = r*r
-        r2_lq2 = r2/(-lq2[index[indr]])
-        #Calculation of  covariance function
+        r2 = r * r
+        r2_lq2 = r2 / (-lq2[index[indr]])
+        # Calculation of  covariance function
        er2_lq2 = np.exp(r2_lq2)
-        #Gradient wrt t
-        c = 2.*r/lq2[index[indr]]
-        gt[indr, indc] = er2_lq2*c
-        #Complete the lower triangular
+        # Gradient wrt t
+        c = 2.0 * r / lq2[index[indr]]
+        gt[indr, indc] = er2_lq2 * c
+        # Complete the lower triangular
        gt[indc, indr] = -gt[indr, indc]
        return gt

-    #Gradients for Diagonal Kff
+    # Gradients for Diagonal Kff
    def _gkdiag(self, X, index):
-        index = index.reshape(index.size,)
-        #terms that move along t
+        index = index.reshape(
+            index.size,
+        )
+        # terms that move along t
        d = np.unique(index)
        B = self.decay[d].values
        S = self.W[d, :].values
-        #Index transformation
+        # Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]
-        #Output related variables must be column-wise
+        # Output related variables must be column-wise
        t = X[:, 0].reshape(X.shape[0], 1)
        B = B.reshape(B.size, 1)
-        S2 = S*S
+        S2 = S * S

-        #Input related variables must be row-wise
+        # Input related variables must be row-wise
        lq = self.lengthscale.values.reshape(1, self.rank)

        gB = np.empty((t.size,))
        glq = np.empty((t.size, lq.size))
        gS = np.empty((t.size, lq.size))

-        #Dx1 terms
-        c0 = S2*lq*np.sqrt(np.pi)
+        # Dx1 terms
+        c0 = S2 * lq * np.sqrt(np.pi)

-        #DxQ terms
-        nu = (.5*lq)*B
-        nu2 = nu*nu
+        # DxQ terms
+        nu = (0.5 * lq) * B
+        nu2 = nu * nu

-        #Nx1 terms
-        gamt = -B[index]*t
+        # Nx1 terms
+        gamt = -B[index] * t
        egamt = np.exp(gamt)
-        e2gamt = egamt*egamt
+        e2gamt = egamt * egamt

-        #NxQ terms
-        t_lq = t/lq
-        t2_lq2 = -t_lq*t_lq
+        # NxQ terms
+        t_lq = t / lq
+        t2_lq2 = -t_lq * t_lq

-        etlq2gamt = np.exp(t2_lq2 + gamt) #NXQ
+        etlq2gamt = np.exp(t2_lq2 + gamt)  # NXQ

        ##Upsilon calculations
-        #erfnu = erf(nu) #TODO: This can be improved
+        # erfnu = erf(nu) #TODO: This can be improved

-        upm = np.exp(nu2[index, :] + lnDifErf( nu[index, :], t_lq + nu[index, :]) )
-        upm[t[:, 0] == 0, :] = 0.
+        upm = np.exp(nu2[index, :] + lnDifErf(nu[index, :], t_lq + nu[index, :]))
+        upm[t[:, 0] == 0, :] = 0.0

-        upv = np.exp(nu2[index, :] + 2.*gamt + lnDifErf(-t_lq + nu[index, :], nu[index, :]) ) #egamt*upv
-        upv[t[:, 0] == 0, :] = 0.
+        upv = np.exp(
+            nu2[index, :] + 2.0 * gamt + lnDifErf(-t_lq + nu[index, :], nu[index, :])
+        )  # egamt*upv
+        upv[t[:, 0] == 0, :] = 0.0

-        #Gradient wrt S
-        c0_S = (S/B)*(lq*np.sqrt(np.pi))
+        # Gradient wrt S
+        c0_S = (S / B) * (lq * np.sqrt(np.pi))

-        gS = c0_S[index]*(upm - upv)
+        gS = c0_S[index] * (upm - upv)

-        #For B
-        CB1 = (.5*lq)**2 - .5/B**2 #DXQ
-        lq2_2B = (.5*lq**2)*(S2/B) #DXQ
-        CB2 = 2.*etlq2gamt - e2gamt - 1. #NxQ
+        # For B
+        CB1 = (0.5 * lq) ** 2 - 0.5 / B**2  # DXQ
+        lq2_2B = (0.5 * lq**2) * (S2 / B)  # DXQ
+        CB2 = 2.0 * etlq2gamt - e2gamt - 1.0  # NxQ

        # gradient wrt B NxZ
-        gB = c0[index, :]*(CB1[index, :]*upm - (CB1[index, :] - t/B[index])*upv) + \
-        lq2_2B[index, :]*CB2
+        gB = (
+            c0[index, :] * (CB1[index, :] * upm - (CB1[index, :] - t / B[index]) * upv)
+            + lq2_2B[index, :] * CB2
+        )

-        #Gradient wrt lengthscale
-        #DxQ terms
-        c0 = (.5*np.sqrt(np.pi))*(S2/B)*(1.+.5*(lq*B)**2)
-        Clq1 = S2*(lq*.5)
-        glq = c0[index]*(upm - upv) + Clq1[index]*CB2
+        # Gradient wrt lengthscale
+        # DxQ terms
+        c0 = (0.5 * np.sqrt(np.pi)) * (S2 / B) * (1.0 + 0.5 * (lq * B) ** 2)
+        Clq1 = S2 * (lq * 0.5)
+        glq = c0[index] * (upm - upv) + Clq1[index] * CB2

        return glq, gS, gB

    def _gkfu(self, X, index, Z, index2):
-        index = index.reshape(index.size,)
-        #TODO: reduce memory usage
-        #terms that move along t
+        index = index.reshape(
+            index.size,
+        )
+        # TODO: reduce memory usage
+        # terms that move along t
        d = np.unique(index)
        B = self.decay[d].values
        S = self.W[d, :].values

-        #Index transformation
+        # Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]
-        #t column
+        # t column
        t = X[:, 0].reshape(X.shape[0], 1)
        B = B.reshape(B.size, 1)
-        #z row
+        # z row
        z = Z[:, 0].reshape(1, Z.shape[0])
-        index2 = index2.reshape(index2.size,)
+        index2 = index2.reshape(
+            index2.size,
+        )
        lq = self.lengthscale.values.reshape((1, self.rank))

-        #kfu = np.empty((t.size, z.size))
+        # kfu = np.empty((t.size, z.size))
        glq = np.empty((t.size, z.size))
        gSdq = np.empty((t.size, z.size))
        gB = np.empty((t.size, z.size))

-        #Dx1 terms
-        B_2 = B*.5
-        S_pi = S*(.5*np.sqrt(np.pi))
-        #DxQ terms
-        c0 = S_pi*lq #lq*Sdq*sqrt(pi)
-        nu = B*lq*.5
-        nu2 = nu*nu
+        # Dx1 terms
+        B_2 = B * 0.5
+        S_pi = S * (0.5 * np.sqrt(np.pi))
+        # DxQ terms
+        c0 = S_pi * lq  # lq*Sdq*sqrt(pi)
+        nu = B * lq * 0.5
+        nu2 = nu * nu

-        #1xM terms
-        z_lq = z/lq[0, index2]
+        # 1xM terms
+        z_lq = z / lq[0, index2]

-        #NxM terms
-        tz = t-z
-        tz_lq = tz/lq[0, index2]
-        etz_lq2 = -np.exp(-tz_lq*tz_lq)
-        ez_lq_Bt = np.exp(-z_lq*z_lq -B[index]*t)
+        # NxM terms
+        tz = t - z
+        tz_lq = tz / lq[0, index2]
+        etz_lq2 = -np.exp(-tz_lq * tz_lq)
+        ez_lq_Bt = np.exp(-z_lq * z_lq - B[index] * t)

        # Upsilon calculations
        fullind = np.ix_(index, index2)
-        upsi = np.exp(nu2[fullind] - B[index]*tz + lnDifErf( -tz_lq + nu[fullind], z_lq+nu[fullind] ) )
-        upsi[t[:, 0] == 0., :] = 0.
+        upsi = np.exp(
+            nu2[fullind]
+            - B[index] * tz
+            + lnDifErf(-tz_lq + nu[fullind], z_lq + nu[fullind])
+        )
+        upsi[t[:, 0] == 0.0, :] = 0.0

-        #Gradient wrt S
-        #DxQ term
-        Sa1 = lq*(.5*np.sqrt(np.pi))
+        # Gradient wrt S
+        # DxQ term
+        Sa1 = lq * (0.5 * np.sqrt(np.pi))

-        gSdq = Sa1[0,index2]*upsi
+        gSdq = Sa1[0, index2] * upsi

-        #Gradient wrt lq
-        la1 = S_pi*(1. + 2.*nu2)
-        Slq = S*lq
-        uplq = etz_lq2*(tz_lq/lq[0, index2] + B_2[index])
-        uplq += ez_lq_Bt*(-z_lq/lq[0, index2] + B_2[index])
+        # Gradient wrt lq
+        la1 = S_pi * (1.0 + 2.0 * nu2)
+        Slq = S * lq
+        uplq = etz_lq2 * (tz_lq / lq[0, index2] + B_2[index])
+        uplq += ez_lq_Bt * (-z_lq / lq[0, index2] + B_2[index])

-        glq = la1[fullind]*upsi
-        glq += Slq[fullind]*uplq
+        glq = la1[fullind] * upsi
+        glq += Slq[fullind] * uplq

-        #Gradient wrt B
-        Slq = Slq*lq
-        nulq = nu*lq
+        # Gradient wrt B
+        Slq = Slq * lq
+        nulq = nu * lq
        upBd = etz_lq2 + ez_lq_Bt
-        gB = c0[fullind]*(nulq[fullind] - tz)*upsi + .5*Slq[fullind]*upBd
+        gB = c0[fullind] * (nulq[fullind] - tz) * upsi + 0.5 * Slq[fullind] * upBd

        return glq, gSdq, gB

-    #TODO: reduce memory usage
-    def _gkfu_z(self, X, index, Z, index2): #Kfu(t,z)
-        index = index.reshape(index.size,)
-        #terms that move along t
+    # TODO: reduce memory usage
+    def _gkfu_z(self, X, index, Z, index2):  # Kfu(t,z)
+        index = index.reshape(
+            index.size,
+        )
+        # terms that move along t
        d = np.unique(index)
        B = self.decay[d].values
        S = self.W[d, :].values
-        #Index transformation
+        # Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]

-        #t column
+        # t column
        t = X[:, 0].reshape(X.shape[0], 1)
        B = B.reshape(B.size, 1)
-        #z row
+        # z row
        z = Z[:, 0].reshape(1, Z.shape[0])
-        index2 = index2.reshape(index2.size,)
+        index2 = index2.reshape(
+            index2.size,
+        )
        lq = self.lengthscale.values.reshape((1, self.rank))

-        #kfu = np.empty((t.size, z.size))
+        # kfu = np.empty((t.size, z.size))
        gz = np.empty((t.size, z.size))

-        #Dx1 terms
-        S_pi =S*(.5*np.sqrt(np.pi))
-        #DxQ terms
-        #Slq = S*lq
-        c0 = S_pi*lq #lq*Sdq*sqrt(pi)
-        nu = (.5*lq)*B
-        nu2 = nu*nu
+        # Dx1 terms
+        S_pi = S * (0.5 * np.sqrt(np.pi))
+        # DxQ terms
+        # Slq = S*lq
+        c0 = S_pi * lq  # lq*Sdq*sqrt(pi)
+        nu = (0.5 * lq) * B
+        nu2 = nu * nu

-        #1xM terms
-        z_lq = z/lq[0, index2]
-        z_lq2 = -z_lq*z_lq
-        #NxQ terms
-        t_lq = t/lq
-        #NxM terms
+        # 1xM terms
+        z_lq = z / lq[0, index2]
+        z_lq2 = -z_lq * z_lq
+        # NxQ terms
+        t_lq = t / lq
+        # NxM terms
        zt_lq = z_lq - t_lq[:, index2]
-        zt_lq2 = -zt_lq*zt_lq
+        zt_lq2 = -zt_lq * zt_lq

        # Upsilon calculations
        fullind = np.ix_(index, index2)
        z2 = z_lq + nu[fullind]
        z1 = z2 - t_lq[:, index2]
-        upsi = np.exp(nu2[fullind] - B[index]*(t-z) + lnDifErf(z1,z2) )
-        upsi[t[:, 0] == 0., :] = 0.
+        upsi = np.exp(nu2[fullind] - B[index] * (t - z) + lnDifErf(z1, z2))
+        upsi[t[:, 0] == 0.0, :] = 0.0

-        #Gradient wrt z
-        za1 = c0*B
-        #za2 = S_w
-        gz = za1[fullind]*upsi + S[fullind]*( np.exp(z_lq2 - B[index]*t) -np.exp(zt_lq2) )
+        # Gradient wrt z
+        za1 = c0 * B
+        # za2 = S_w
+        gz = za1[fullind] * upsi + S[fullind] * (
+            np.exp(z_lq2 - B[index] * t) - np.exp(zt_lq2)
+        )

        return gz

-def lnDifErf(z1,z2):
-    #Z2 is always positive
+
+def lnDifErf(z1, z2):
+    # Z2 is always positive
    logdiferf = np.zeros(z1.shape)
-    ind = np.where(z1>0.)
-    ind2 = np.where(z1<=0.)
+    ind = np.where(z1 > 0.0)
+    ind2 = np.where(z1 <= 0.0)
    if ind[0].shape > 0:
        z1i = z1[ind]
-        z12 = z1i*z1i
+        z12 = z1i * z1i
        z2i = z2[ind]
-        logdiferf[ind] = -z12 + np.log(erfcx(z1i) - erfcx(z2i)*np.exp(z12-z2i**2))
+        logdiferf[ind] = -z12 + np.log(erfcx(z1i) - erfcx(z2i) * np.exp(z12 - z2i**2))

    if ind2[0].shape > 0:
        z1i = z1[ind2]
--- a/GPy/kern/src/eq_ode2.py
+++ b/GPy/kern/src/eq_ode2.py
--- a/GPy/kern/src/todo/eq_ode1.py
+++ b/GPy/kern/src/todo/eq_ode1.py
@ -121,7 +121,7 @@ class Eq_ode1(Kernpart):
            target+=self.initial_variance * np.exp(- self.decay * (t1_mat + t2_mat))

    def Kdiag(self,index,target):
-        #target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()]
+        #target += np.diag(self.B)[np.asarray(index,dtype=int).flatten()]
        pass
    
    def _param_grad_helper(self,dL_dK,X,X2,target):
@ -203,7 +203,7 @@ class Eq_ode1(Kernpart):
        self._t = X[:, 0]
        if not X.shape[1] == 2:
            raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
-        self._index = np.asarray(X[:, 1],dtype=np.int)
+        self._index = np.asarray(X[:, 1],dtype=int)
        # Sort indices so that outputs are in blocks for computational
        # convenience.
        self._order = self._index.argsort()
@ -220,7 +220,7 @@ class Eq_ode1(Kernpart):
            if not X2.shape[1] == 2:
                raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
            self._t2 = X2[:, 0]
-            self._index2 = np.asarray(X2[:, 1],dtype=np.int)
+            self._index2 = np.asarray(X2[:, 1],dtype=int)
            self._order2 = self._index2.argsort()
            self._index2 = self._index2[self._order2]
            self._t2 = self._t2[self._order2]
--- a/GPy/models/sparse_gp_coregionalized_regression.py
+++ b/GPy/models/sparse_gp_coregionalized_regression.py
@ -7,6 +7,7 @@ from ..inference.latent_function_inference import VarDTC
 from .. import kern
 from .. import util

+
 class SparseGPCoregionalizedRegression(SparseGP):
    """
    Sparse Gaussian Process model for heteroscedastic multioutput regression
@ -34,34 +35,65 @@ class SparseGPCoregionalizedRegression(SparseGP):
    :type kernel_name: string
    """

-    def __init__(self, X_list, Y_list, Z_list=[], kernel=None, likelihoods_list=None, num_inducing=10, X_variance=None, name='SGPCR',W_rank=1,kernel_name='coreg'):
-
-        #Input and Output
-        X,Y,self.output_index = util.multioutput.build_XY(X_list,Y_list)
+    def __init__(
+        self,
+        X_list,
+        Y_list,
+        Z_list=[],
+        kernel=None,
+        likelihoods_list=None,
+        num_inducing=10,
+        X_variance=None,
+        name="SGPCR",
+        W_rank=1,
+        kernel_name="coreg",
+    ):
+        # Input and Output
+        X, Y, self.output_index = util.multioutput.build_XY(X_list, Y_list)
        Ny = len(Y_list)

-        #Kernel
+        # Kernel
        if kernel is None:
-            kernel = kern.RBF(X.shape[1]-1)
+            kernel = kern.RBF(X.shape[1] - 1)

-            kernel = util.multioutput.ICM(input_dim=X.shape[1]-1, num_outputs=Ny, kernel=kernel, W_rank=W_rank, name=kernel_name)
+            kernel = util.multioutput.ICM(
+                input_dim=X.shape[1] - 1,
+                num_outputs=Ny,
+                kernel=kernel,
+                W_rank=W_rank,
+                name=kernel_name,
+            )

-        #Likelihood
-        likelihood = util.multioutput.build_likelihood(Y_list,self.output_index,likelihoods_list)
+        # Likelihood
+        likelihood = util.multioutput.build_likelihood(
+            Y_list, self.output_index, likelihoods_list
+        )

-        #Inducing inputs list
+        # Inducing inputs list
        if len(Z_list):
-            assert len(Z_list) == Ny, 'Number of outputs do not match length of inducing inputs list.'
+            assert (
+                len(Z_list) == Ny
+            ), "Number of outputs do not match length of inducing inputs list."
        else:
-            if isinstance(num_inducing,np.int):
+            if isinstance(num_inducing, int):
                num_inducing = [num_inducing] * Ny
            num_inducing = np.asarray(num_inducing)
-            assert num_inducing.size == Ny, 'Number of outputs do not match length of inducing inputs list.'
-            for ni,Xi in zip(num_inducing,X_list):
+            assert (
+                num_inducing.size == Ny
+            ), "Number of outputs do not match length of inducing inputs list."
+            for ni, Xi in zip(num_inducing, X_list):
                i = np.random.permutation(Xi.shape[0])[:ni]
                Z_list.append(Xi[i].copy())

        Z, _, Iz = util.multioutput.build_XY(Z_list)

-        super(SparseGPCoregionalizedRegression, self).__init__(X, Y, Z, kernel, likelihood, inference_method=VarDTC(), Y_metadata={'output_index':self.output_index})
-        self['.*inducing'][:,-1].fix()
+        super(SparseGPCoregionalizedRegression, self).__init__(
+            X,
+            Y,
+            Z,
+            kernel,
+            likelihood,
+            inference_method=VarDTC(),
+            Y_metadata={"output_index": self.output_index},
+        )
+        self[".*inducing"][:, -1].fix()
--- a/GPy/models/ss_mrd.py
+++ b/GPy/models/ss_mrd.py
@ -5,51 +5,109 @@ The Maniforld Relevance Determination model with the spike-and-slab prior
 import numpy as np
 from ..core import Model
 from .ss_gplvm import SSGPLVM
-from GPy.core.parameterization.variational import SpikeAndSlabPrior,NormalPosterior,VariationalPrior
+from GPy.core.parameterization.variational import (
+    SpikeAndSlabPrior,
+    NormalPosterior,
+    VariationalPrior,
+)
 from ..util.misc import param_to_array
 from ..kern import RBF
 from ..core import Param
 from numpy.linalg.linalg import LinAlgError

-class SSMRD(Model):

-    def __init__(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx = 'PCA_concat', initz = 'permute', 
-                 num_inducing=10, Zs=None, kernels=None, inference_methods=None, likelihoods=None, group_spike=True,
-                 pi=0.5, name='ss_mrd', Ynames=None, mpi_comm=None, IBP=False, alpha=2., taus=None, ):
+class SSMRD(Model):
+    def __init__(
+        self,
+        Ylist,
+        input_dim,
+        X=None,
+        X_variance=None,
+        Gammas=None,
+        initx="PCA_concat",
+        initz="permute",
+        num_inducing=10,
+        Zs=None,
+        kernels=None,
+        inference_methods=None,
+        likelihoods=None,
+        group_spike=True,
+        pi=0.5,
+        name="ss_mrd",
+        Ynames=None,
+        mpi_comm=None,
+        IBP=False,
+        alpha=2.0,
+        taus=None,
+    ):
        super(SSMRD, self).__init__(name)
        self.mpi_comm = mpi_comm
        self._PROPAGATE_ = False

        # initialize X for individual models
-        X, X_variance, Gammas, fracs = self._init_X(Ylist, input_dim, X, X_variance, Gammas, initx)
+        X, X_variance, Gammas, fracs = self._init_X(
+            Ylist, input_dim, X, X_variance, Gammas, initx
+        )
        self.X = NormalPosterior(means=X, variances=X_variance)

        if kernels is None:
-            kernels = [RBF(input_dim, lengthscale=1./fracs, ARD=True) for i in range(len(Ylist))]
+            kernels = [
+                RBF(input_dim, lengthscale=1.0 / fracs, ARD=True)
+                for i in range(len(Ylist))
+            ]
        if Zs is None:
-            Zs = [None]* len(Ylist)
+            Zs = [None] * len(Ylist)
        if likelihoods is None:
-            likelihoods = [None]* len(Ylist)
+            likelihoods = [None] * len(Ylist)
        if inference_methods is None:
-            inference_methods = [None]* len(Ylist)
+            inference_methods = [None] * len(Ylist)

        if IBP:
-            self.var_priors = [IBPPrior_SSMRD(len(Ylist),input_dim,alpha=alpha) for i in range(len(Ylist))]
+            self.var_priors = [
+                IBPPrior_SSMRD(len(Ylist), input_dim, alpha=alpha)
+                for i in range(len(Ylist))
+            ]
        else:
-            self.var_priors = [SpikeAndSlabPrior_SSMRD(nModels=len(Ylist),pi=pi,learnPi=False, group_spike=group_spike) for i in range(len(Ylist))]
-        self.models = [SSGPLVM(y, input_dim, X=X.copy(), X_variance=X_variance.copy(), Gamma=Gammas[i], num_inducing=num_inducing,Z=Zs[i], learnPi=False, group_spike=group_spike,
-                               kernel=kernels[i],inference_method=inference_methods[i],likelihood=likelihoods[i], variational_prior=self.var_priors[i], IBP=IBP, tau=None if taus is None else taus[i],
-                               name='model_'+str(i), mpi_comm=mpi_comm, sharedX=True) for i,y in enumerate(Ylist)]
-        self.link_parameters(*(self.models+[self.X]))
+            self.var_priors = [
+                SpikeAndSlabPrior_SSMRD(
+                    nModels=len(Ylist), pi=pi, learnPi=False, group_spike=group_spike
+                )
+                for i in range(len(Ylist))
+            ]
+        self.models = [
+            SSGPLVM(
+                y,
+                input_dim,
+                X=X.copy(),
+                X_variance=X_variance.copy(),
+                Gamma=Gammas[i],
+                num_inducing=num_inducing,
+                Z=Zs[i],
+                learnPi=False,
+                group_spike=group_spike,
+                kernel=kernels[i],
+                inference_method=inference_methods[i],
+                likelihood=likelihoods[i],
+                variational_prior=self.var_priors[i],
+                IBP=IBP,
+                tau=None if taus is None else taus[i],
+                name="model_" + str(i),
+                mpi_comm=mpi_comm,
+                sharedX=True,
+            )
+            for i, y in enumerate(Ylist)
+        ]
+        self.link_parameters(*(self.models + [self.X]))

    def _propogate_X_val(self):
-        if self._PROPAGATE_: return
+        if self._PROPAGATE_:
+            return
        for m in self.models:
            m.X.mean.values[:] = self.X.mean.values
            m.X.variance.values[:] = self.X.variance.values
        varp_list = [m.X for m in self.models]
        [vp._update_inernal(varp_list) for vp in self.var_priors]
-        self._PROPAGATE_=True
+        self._PROPAGATE_ = True

    def _collate_X_gradient(self):
        self._PROPAGATE_ = False
@ -62,82 +120,88 @@ class SSMRD(Model):
    def parameters_changed(self):
        super(SSMRD, self).parameters_changed()
        [m.parameters_changed() for m in self.models]
-        self._log_marginal_likelihood = sum([m._log_marginal_likelihood for m in self.models])
+        self._log_marginal_likelihood = sum(
+            [m._log_marginal_likelihood for m in self.models]
+        )
        self._collate_X_gradient()

    def log_likelihood(self):
        return self._log_marginal_likelihood

-    def _init_X(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx='PCA_concat'):
-        
+    def _init_X(
+        self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx="PCA_concat"
+    ):
        # Divide latent dimensions
-        idx = np.empty((input_dim,),dtype=np.int)
-        residue = (input_dim)%(len(Ylist))
+        idx = np.empty((input_dim,), dtype=int)
+        residue = (input_dim) % (len(Ylist))
        for i in range(len(Ylist)):
            if i < residue:
-                size = input_dim/len(Ylist)+1
-                idx[i*size:(i+1)*size] = i
+                size = input_dim / len(Ylist) + 1
+                idx[i * size : (i + 1) * size] = i
            else:
-                size = input_dim/len(Ylist)
-                idx[i*size+residue:(i+1)*size+residue] = i
+                size = input_dim / len(Ylist)
+                idx[i * size + residue : (i + 1) * size + residue] = i

        if X is None:
-            if initx == 'PCA_concat':
-                X = np.empty((Ylist[0].shape[0],input_dim))
+            if initx == "PCA_concat":
+                X = np.empty((Ylist[0].shape[0], input_dim))
                fracs = np.empty((input_dim,))
                from ..util.initialization import initialize_latent
+
                for i in range(len(Ylist)):
                    Y = Ylist[i]
-                    dim = (idx==i).sum()
-                    if dim>0:
-                        x, fr = initialize_latent('PCA', dim, Y)
-                        X[:,idx==i] = x
-                        fracs[idx==i] = fr
-            elif initx=='PCA_joint':
+                    dim = (idx == i).sum()
+                    if dim > 0:
+                        x, fr = initialize_latent("PCA", dim, Y)
+                        X[:, idx == i] = x
+                        fracs[idx == i] = fr
+            elif initx == "PCA_joint":
                y = np.hstack(Ylist)
                from ..util.initialization import initialize_latent
-                X, fracs = initialize_latent('PCA', input_dim, y)
+
+                X, fracs = initialize_latent("PCA", input_dim, y)
            else:
                X = np.random.randn(Ylist[0].shape[0], input_dim)
                fracs = np.ones(input_dim)
        else:
            fracs = np.ones(input_dim)

-    
-        if X_variance is None: # The variance of the variational approximation (S)
-            X_variance = np.random.uniform(0,.1,X.shape)
+        if X_variance is None:  # The variance of the variational approximation (S)
+            X_variance = np.random.uniform(0, 0.1, X.shape)

        if Gammas is None:
            Gammas = []
            for x in X:
-                gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+                gamma = np.empty_like(
+                    X
+                )  # The posterior probabilities of the binary variable in the variational approximation
                gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
-                gamma[gamma>1.-1e-9] = 1.-1e-9
-                gamma[gamma<1e-9] = 1e-9
+                gamma[gamma > 1.0 - 1e-9] = 1.0 - 1e-9
+                gamma[gamma < 1e-9] = 1e-9
                Gammas.append(gamma)
        return X, X_variance, Gammas, fracs

    @Model.optimizer_array.setter
    def optimizer_array(self, p):
        if self.mpi_comm != None:
-            if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0:
-                self.mpi_comm.Bcast(np.int32(1),root=0)
+            if self._IN_OPTIMIZATION_ and self.mpi_comm.rank == 0:
+                self.mpi_comm.Bcast(np.int32(1), root=0)
            self.mpi_comm.Bcast(p, root=0)
-        Model.optimizer_array.fset(self,p)
+        Model.optimizer_array.fset(self, p)

    def optimize(self, optimizer=None, start=None, **kwargs):
        self._IN_OPTIMIZATION_ = True
-        if self.mpi_comm==None:
-            super(SSMRD, self).optimize(optimizer,start,**kwargs)
-        elif self.mpi_comm.rank==0:
-            super(SSMRD, self).optimize(optimizer,start,**kwargs)
-            self.mpi_comm.Bcast(np.int32(-1),root=0)
-        elif self.mpi_comm.rank>0:
+        if self.mpi_comm == None:
+            super(SSMRD, self).optimize(optimizer, start, **kwargs)
+        elif self.mpi_comm.rank == 0:
+            super(SSMRD, self).optimize(optimizer, start, **kwargs)
+            self.mpi_comm.Bcast(np.int32(-1), root=0)
+        elif self.mpi_comm.rank > 0:
            x = self.optimizer_array.copy()
-            flag = np.empty(1,dtype=np.int32)
+            flag = np.empty(1, dtype=np.int32)
            while True:
-                self.mpi_comm.Bcast(flag,root=0)
-                if flag==1:
+                self.mpi_comm.Bcast(flag, root=0)
+                if flag == 1:
                    try:
                        self.optimizer_array = x
                        self._fail_count = 0
@ -145,7 +209,7 @@ class SSMRD(Model):
                        if self._fail_count >= self._allowed_failures:
                            raise
                        self._fail_count += 1
-                elif flag==-1:
+                elif flag == -1:
                    break
                else:
                    self._IN_OPTIMIZATION_ = False
@ -154,20 +218,42 @@ class SSMRD(Model):


 class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
-    def __init__(self, nModels, pi=0.5, learnPi=False, group_spike=True, variance = 1.0, name='SSMRDPrior', **kw):
+    def __init__(
+        self,
+        nModels,
+        pi=0.5,
+        learnPi=False,
+        group_spike=True,
+        variance=1.0,
+        name="SSMRDPrior",
+        **kw
+    ):
        self.nModels = nModels
        self._b_prob_all = 0.5
-        super(SpikeAndSlabPrior_SSMRD, self).__init__(pi=pi,learnPi=learnPi,group_spike=group_spike,variance=variance, name=name, **kw)
+        super(SpikeAndSlabPrior_SSMRD, self).__init__(
+            pi=pi,
+            learnPi=learnPi,
+            group_spike=group_spike,
+            variance=variance,
+            name=name,
+            **kw
+        )

    def _update_inernal(self, varp_list):
        """Make an update of the internal status by gathering the variational posteriors for all the individual models."""
        # The probability for the binary variable for the same latent dimension of any of the models is on.
        if self.group_spike:
-            self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
-            [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+            self._b_prob_all = 1.0 - param_to_array(varp_list[0].gamma_group)
+            [
+                np.multiply(self._b_prob_all, 1.0 - vp.gamma_group, self._b_prob_all)
+                for vp in varp_list[1:]
+            ]
        else:
-            self._b_prob_all = 1.-param_to_array(varp_list[0].binary_prob)
-            [np.multiply(self._b_prob_all, 1.-vp.binary_prob, self._b_prob_all) for vp in varp_list[1:]]            
+            self._b_prob_all = 1.0 - param_to_array(varp_list[0].binary_prob)
+            [
+                np.multiply(self._b_prob_all, 1.0 - vp.binary_prob, self._b_prob_all)
+                for vp in varp_list[1:]
+            ]

    def KL_divergence(self, variational_posterior):
        mu = variational_posterior.mean
@ -176,16 +262,20 @@ class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
            gamma = variational_posterior.binary_prob[0]
        else:
            gamma = variational_posterior.binary_prob
-        if len(self.pi.shape)==2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+        if len(self.pi.shape) == 2:
+            idx = np.unique(gamma._raveled_index() / gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi

-        var_mean = np.square(mu)/self.variance
-        var_S = (S/self.variance - np.log(S))
-        var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
-        return var_gamma +((1.-self._b_prob_all)*(np.log(self.variance)-1. +var_mean + var_S)).sum()/(2.*self.nModels)
+        var_mean = np.square(mu) / self.variance
+        var_S = S / self.variance - np.log(S)
+        var_gamma = (gamma * np.log(gamma / pi)).sum() + (
+            (1 - gamma) * np.log((1 - gamma) / (1 - pi))
+        ).sum()
+        return var_gamma + (
+            (1.0 - self._b_prob_all) * (np.log(self.variance) - 1.0 + var_mean + var_S)
+        ).sum() / (2.0 * self.nModels)

    def update_gradients_KL(self, variational_posterior):
        mu = variational_posterior.mean
@ -195,63 +285,141 @@ class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
            gamma = variational_posterior.binary_prob.values[0]
        else:
            gamma = variational_posterior.binary_prob.values
-        if len(self.pi.shape)==2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+        if len(self.pi.shape) == 2:
+            idx = np.unique(gamma._raveled_index() / gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi

        if self.group_spike:
-            tmp = self._b_prob_all/(1.-gamma)
-            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))/N +tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+            tmp = self._b_prob_all / (1.0 - gamma)
+            variational_posterior.binary_prob.gradient -= (
+                np.log((1 - pi) / pi * gamma / (1.0 - gamma)) / N
+                + tmp
+                * (
+                    (np.square(mu) + S) / self.variance
+                    - np.log(S)
+                    + np.log(self.variance)
+                    - 1.0
+                )
+                / 2.0
+            )
        else:
-            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
-        mu.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
-        S.gradient -= (1./self.variance - 1./S) * (1.-self._b_prob_all) /(2.*self.nModels)
+            variational_posterior.binary_prob.gradient -= (
+                np.log((1 - pi) / pi * gamma / (1.0 - gamma))
+                + (
+                    (np.square(mu) + S) / self.variance
+                    - np.log(S)
+                    + np.log(self.variance)
+                    - 1.0
+                )
+                / 2.0
+            )
+        mu.gradient -= (1.0 - self._b_prob_all) * mu / (self.variance * self.nModels)
+        S.gradient -= (
+            (1.0 / self.variance - 1.0 / S)
+            * (1.0 - self._b_prob_all)
+            / (2.0 * self.nModels)
+        )
        if self.learnPi:
-            raise 'Not Supported!'
+            raise "Not Supported!"
+

 class IBPPrior_SSMRD(VariationalPrior):
-    def __init__(self, nModels, input_dim, alpha =2., tau=None, name='IBPPrior', **kw):
+    def __init__(self, nModels, input_dim, alpha=2.0, tau=None, name="IBPPrior", **kw):
        super(IBPPrior_SSMRD, self).__init__(name=name, **kw)
        from paramz.transformations import Logexp, __fixed__
+
        self.nModels = nModels
        self._b_prob_all = 0.5
        self.input_dim = input_dim
-        self.variance = 1.
-        self.alpha = Param('alpha', alpha, __fixed__)
+        self.variance = 1.0
+        self.alpha = Param("alpha", alpha, __fixed__)
        self.link_parameter(self.alpha)

    def _update_inernal(self, varp_list):
        """Make an update of the internal status by gathering the variational posteriors for all the individual models."""
        # The probability for the binary variable for the same latent dimension of any of the models is on.
-        self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
-        [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+        self._b_prob_all = 1.0 - param_to_array(varp_list[0].gamma_group)
+        [
+            np.multiply(self._b_prob_all, 1.0 - vp.gamma_group, self._b_prob_all)
+            for vp in varp_list[1:]
+        ]

    def KL_divergence(self, variational_posterior):
-        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+        mu, S, gamma, tau = (
+            variational_posterior.mean.values,
+            variational_posterior.variance.values,
+            variational_posterior.gamma_group.values,
+            variational_posterior.tau.values,
+        )

-        var_mean = np.square(mu)/self.variance
-        var_S = (S/self.variance - np.log(S))
-        part1 = ((1.-self._b_prob_all)* (np.log(self.variance)-1. +var_mean + var_S)).sum()/(2.*self.nModels)
+        var_mean = np.square(mu) / self.variance
+        var_S = S / self.variance - np.log(S)
+        part1 = (
+            (1.0 - self._b_prob_all) * (np.log(self.variance) - 1.0 + var_mean + var_S)
+        ).sum() / (2.0 * self.nModels)

-        ad = self.alpha/self.input_dim
-        from scipy.special import betaln,digamma
-        part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + (betaln(ad,1.)*self.input_dim -betaln(tau[:,0], tau[:,1]).sum())/self.nModels \
-                 + (( (tau[:,0]-ad)/self.nModels -gamma)*digamma(tau[:,0])).sum() + \
-                (((tau[:,1]-1.)/self.nModels+gamma-1.)*digamma(tau[:,1])).sum() + (((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*digamma(tau.sum(axis=1))).sum()
-        return part1+part2
+        ad = self.alpha / self.input_dim
+        from scipy.special import betaln, digamma
+
+        part2 = (
+            (gamma * np.log(gamma)).sum()
+            + ((1.0 - gamma) * np.log(1.0 - gamma)).sum()
+            + (betaln(ad, 1.0) * self.input_dim - betaln(tau[:, 0], tau[:, 1]).sum())
+            / self.nModels
+            + (((tau[:, 0] - ad) / self.nModels - gamma) * digamma(tau[:, 0])).sum()
+            + (
+                ((tau[:, 1] - 1.0) / self.nModels + gamma - 1.0) * digamma(tau[:, 1])
+            ).sum()
+            + (
+                ((1.0 + ad - tau[:, 0] - tau[:, 1]) / self.nModels + 1.0)
+                * digamma(tau.sum(axis=1))
+            ).sum()
+        )
+        return part1 + part2

    def update_gradients_KL(self, variational_posterior):
-        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+        mu, S, gamma, tau = (
+            variational_posterior.mean.values,
+            variational_posterior.variance.values,
+            variational_posterior.gamma_group.values,
+            variational_posterior.tau.values,
+        )

-        variational_posterior.mean.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
-        variational_posterior.variance.gradient -= (1./self.variance - 1./S) * (1.-self._b_prob_all) /(2.*self.nModels)
-        from scipy.special import digamma,polygamma
-        tmp = self._b_prob_all/(1.-gamma)
-        dgamma = (np.log(gamma/(1.-gamma))+ digamma(tau[:,1])-digamma(tau[:,0]))/variational_posterior.num_data
-        variational_posterior.binary_prob.gradient -= dgamma+tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
-        ad = self.alpha/self.input_dim
-        common = ((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*polygamma(1,tau.sum(axis=1))
-        variational_posterior.tau.gradient[:,0] = -(((tau[:,0]-ad)/self.nModels -gamma)*polygamma(1,tau[:,0])+common)
-        variational_posterior.tau.gradient[:,1] = -(((tau[:,1]-1.)/self.nModels+gamma-1.)*polygamma(1,tau[:,1])+common)
+        variational_posterior.mean.gradient -= (
+            (1.0 - self._b_prob_all) * mu / (self.variance * self.nModels)
+        )
+        variational_posterior.variance.gradient -= (
+            (1.0 / self.variance - 1.0 / S)
+            * (1.0 - self._b_prob_all)
+            / (2.0 * self.nModels)
+        )
+        from scipy.special import digamma, polygamma
+
+        tmp = self._b_prob_all / (1.0 - gamma)
+        dgamma = (
+            np.log(gamma / (1.0 - gamma)) + digamma(tau[:, 1]) - digamma(tau[:, 0])
+        ) / variational_posterior.num_data
+        variational_posterior.binary_prob.gradient -= (
+            dgamma
+            + tmp
+            * (
+                (np.square(mu) + S) / self.variance
+                - np.log(S)
+                + np.log(self.variance)
+                - 1.0
+            )
+            / 2.0
+        )
+        ad = self.alpha / self.input_dim
+        common = ((1.0 + ad - tau[:, 0] - tau[:, 1]) / self.nModels + 1.0) * polygamma(
+            1, tau.sum(axis=1)
+        )
+        variational_posterior.tau.gradient[:, 0] = -(
+            ((tau[:, 0] - ad) / self.nModels - gamma) * polygamma(1, tau[:, 0]) + common
+        )
+        variational_posterior.tau.gradient[:, 1] = -(
+            ((tau[:, 1] - 1.0) / self.nModels + gamma - 1.0) * polygamma(1, tau[:, 1])
+            + common
+        )
--- a/GPy/models/state_space_main.py
+++ b/GPy/models/state_space_main.py
--- a/GPy/plotting/matplot_dep/base_plots.py
+++ b/GPy/plotting/matplot_dep/base_plots.py
@ -5,6 +5,7 @@ import numpy as np

 from .util import align_subplot_array, align_subplots

+
 def ax_default(fignum, ax):
    if ax is None:
        fig = plt.figure(fignum)
@ -13,11 +14,23 @@ def ax_default(fignum, ax):
        fig = ax.figure
    return fig, ax

-def meanplot(x, mu, color='#3300FF', ax=None, fignum=None, linewidth=2,**kw):
-    _, axes = ax_default(fignum, ax)
-    return axes.plot(x,mu,color=color,linewidth=linewidth,**kw)

-def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, fignum=None, **kwargs):
+def meanplot(x, mu, color="#3300FF", ax=None, fignum=None, linewidth=2, **kw):
+    _, axes = ax_default(fignum, ax)
+    return axes.plot(x, mu, color=color, linewidth=linewidth, **kw)
+
+
+def gpplot(
+    x,
+    mu,
+    lower,
+    upper,
+    edgecol="#3300FF",
+    fillcol="#33CCFF",
+    ax=None,
+    fignum=None,
+    **kwargs
+):
    _, axes = ax_default(fignum, ax)

    mu = mu.flatten()
@ -27,51 +40,62 @@ def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, f

    plots = []

-    #here's the mean
+    # here's the mean
    plots.append(meanplot(x, mu, edgecol, axes))

-    #here's the box
-    kwargs['linewidth']=0.5
-    if not 'alpha' in kwargs.keys():
-        kwargs['alpha'] = 0.3
-    plots.append(axes.fill(np.hstack((x,x[::-1])),np.hstack((upper,lower[::-1])),color=fillcol,**kwargs))
+    # here's the box
+    kwargs["linewidth"] = 0.5
+    if not "alpha" in kwargs.keys():
+        kwargs["alpha"] = 0.3
+    plots.append(
+        axes.fill(
+            np.hstack((x, x[::-1])),
+            np.hstack((upper, lower[::-1])),
+            color=fillcol,
+            **kwargs
+        )
+    )

-    #this is the edge:
-    plots.append(meanplot(x, upper,color=edgecol, linewidth=0.2, ax=axes))
-    plots.append(meanplot(x, lower,color=edgecol, linewidth=0.2, ax=axes))
+    # this is the edge:
+    plots.append(meanplot(x, upper, color=edgecol, linewidth=0.2, ax=axes))
+    plots.append(meanplot(x, lower, color=edgecol, linewidth=0.2, ax=axes))

    return plots

+
 def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
    _, ax = ax_default(fignum, ax)

    plots = []

-    #here's the box
-    if 'linewidth' not in kwargs:
-        kwargs['linewidth'] = 0.5
-    if not 'alpha' in kwargs.keys():
-        kwargs['alpha'] = 1./(len(percentiles))
+    # here's the box
+    if "linewidth" not in kwargs:
+        kwargs["linewidth"] = 0.5
+    if not "alpha" in kwargs.keys():
+        kwargs["alpha"] = 1.0 / (len(percentiles))

    # pop where from kwargs
-    where = kwargs.pop('where') if 'where' in kwargs else None
+    where = kwargs.pop("where") if "where" in kwargs else None
    # pop interpolate, which we actually do not do here!
-    if 'interpolate' in kwargs: kwargs.pop('interpolate')
+    if "interpolate" in kwargs:
+        kwargs.pop("interpolate")

    def pairwise(inlist):
        l = len(inlist)
-        for i in range(int(np.ceil(l/2.))):
-            yield inlist[:][i], inlist[:][(l-1)-i]
+        for i in range(int(np.ceil(l / 2.0))):
+            yield inlist[:][i], inlist[:][(l - 1) - i]

    polycol = []
    for y1, y2 in pairwise(percentiles):
        import matplotlib.mlab as mlab
+
        # Handle united data, such as dates
        ax._process_unit_info(xdata=x, ydata=y1)
        ax._process_unit_info(ydata=y2)

        # Convert the arrays so we can work with them
        from numpy import ma
+
        x = ma.masked_invalid(ax.convert_xunits(x))
        y1 = ma.masked_invalid(ax.convert_yunits(y1))
        y2 = ma.masked_invalid(ax.convert_yunits(y2))
@ -103,7 +127,7 @@ def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
                continue

            N = len(xslice)
-            X = np.zeros((2 * N + 2, 2), np.float)
+            X = np.zeros((2 * N + 2, 2), float)

            # the purpose of the next two lines is for when y2 is a
            # scalar like 0 and we want the fill to go all the way
@ -114,19 +138,21 @@ def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
            X[0] = start
            X[N + 1] = end

-            X[1:N + 1, 0] = xslice
-            X[1:N + 1, 1] = y1slice
-            X[N + 2:, 0] = xslice[::-1]
-            X[N + 2:, 1] = y2slice[::-1]
+            X[1 : N + 1, 0] = xslice
+            X[1 : N + 1, 1] = y1slice
+            X[N + 2 :, 0] = xslice[::-1]
+            X[N + 2 :, 1] = y2slice[::-1]

            polys.append(X)
        polycol.extend(polys)
    from matplotlib.collections import PolyCollection
+
    plots.append(PolyCollection(polycol, **kwargs))
    ax.add_collection(plots[-1], autolim=True)
    ax.autoscale_view()
    return plots

+
 def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
    _, axes = ax_default(fignum, ax)

@ -138,17 +164,19 @@ def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
    plots = []

    if edgecol is None:
-        edgecol='#3300FF'
+        edgecol = "#3300FF"

-    if not 'alpha' in kwargs.keys():
-        kwargs['alpha'] = 1.
+    if not "alpha" in kwargs.keys():
+        kwargs["alpha"] = 1.0

+    if not "lw" in kwargs.keys():
+        kwargs["lw"] = 1.0

-    if not 'lw' in kwargs.keys():
-        kwargs['lw'] = 1.
-
-
-    plots.append(axes.errorbar(x,mu,yerr=np.vstack([mu-lower,upper-mu]),color=edgecol,**kwargs))
+    plots.append(
+        axes.errorbar(
+            x, mu, yerr=np.vstack([mu - lower, upper - mu]), color=edgecol, **kwargs
+        )
+    )
    plots[-1][0].remove()
    return plots

@ -156,53 +184,60 @@ def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
 def removeRightTicks(ax=None):
    ax = ax or plt.gca()
    for i, line in enumerate(ax.get_yticklines()):
-        if i%2 == 1:   # odd indices
+        if i % 2 == 1:  # odd indices
            line.set_visible(False)

+
 def removeUpperTicks(ax=None):
    ax = ax or plt.gca()
    for i, line in enumerate(ax.get_xticklines()):
-        if i%2 == 1:   # odd indices
+        if i % 2 == 1:  # odd indices
            line.set_visible(False)

-def fewerXticks(ax=None,divideby=2):
+
+def fewerXticks(ax=None, divideby=2):
    ax = ax or plt.gca()
    ax.set_xticks(ax.get_xticks()[::divideby])

-def x_frame1D(X,plot_limits=None,resolution=None):
+
+def x_frame1D(X, plot_limits=None, resolution=None):
    """
    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
    """
-    assert X.shape[1] ==1, "x_frame1D is defined for one-dimensional inputs"
+    assert X.shape[1] == 1, "x_frame1D is defined for one-dimensional inputs"
    if plot_limits is None:
        from ...core.parameterization.variational import VariationalPosterior
+
        if isinstance(X, VariationalPosterior):
-            xmin,xmax = X.mean.min(0),X.mean.max(0)
+            xmin, xmax = X.mean.min(0), X.mean.max(0)
        else:
-            xmin,xmax = X.min(0),X.max(0)
-        xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
-    elif len(plot_limits)==2:
+            xmin, xmax = X.min(0), X.max(0)
+        xmin, xmax = xmin - 0.2 * (xmax - xmin), xmax + 0.2 * (xmax - xmin)
+    elif len(plot_limits) == 2:
        xmin, xmax = plot_limits
    else:
        raise ValueError("Bad limits for plotting")

-    Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
+    Xnew = np.linspace(xmin, xmax, resolution or 200)[:, None]
    return Xnew, xmin, xmax

-def x_frame2D(X,plot_limits=None,resolution=None):
+
+def x_frame2D(X, plot_limits=None, resolution=None):
    """
    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
    """
-    assert X.shape[1] ==2, "x_frame2D is defined for two-dimensional inputs"
+    assert X.shape[1] == 2, "x_frame2D is defined for two-dimensional inputs"
    if plot_limits is None:
-        xmin,xmax = X.min(0),X.max(0)
-        xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
-    elif len(plot_limits)==2:
+        xmin, xmax = X.min(0), X.max(0)
+        xmin, xmax = xmin - 0.2 * (xmax - xmin), xmax + 0.2 * (xmax - xmin)
+    elif len(plot_limits) == 2:
        xmin, xmax = plot_limits
    else:
        raise ValueError("Bad limits for plotting")

    resolution = resolution or 50
-    xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
-    Xnew = np.vstack((xx.flatten(),yy.flatten())).T
+    xx, yy = np.mgrid[
+        xmin[0] : xmax[0] : 1j * resolution, xmin[1] : xmax[1] : 1j * resolution
+    ]
+    Xnew = np.vstack((xx.flatten(), yy.flatten())).T
    return Xnew, xx, yy, xmin, xmax
--- a/GPy/plotting/matplot_dep/plot_definitions.py
+++ b/GPy/plotting/matplot_dep/plot_definitions.py
@ -1,4 +1,4 @@
-#===============================================================================
+# ===============================================================================
 # Copyright (c) 2015, Max Zwiessele
 # All rights reserved.
 #
@ -26,7 +26,7 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
+# ===============================================================================
 import numpy as np
 from matplotlib import pyplot as plt
 from ..abstract_plotting_library import AbstractPlottingLibrary
@ -37,6 +37,7 @@ from .controllers import ImshowController, ImAnnotateController
 import itertools
 from .util import legend_ontop

+
 class MatplotlibPlots(AbstractPlottingLibrary):
    def __init__(self):
        super(MatplotlibPlots, self).__init__()
@ -49,54 +50,86 @@ class MatplotlibPlots(AbstractPlottingLibrary):
        fig.gridspec = plt.GridSpec(rows, cols, **gridspec_kwargs)
        return fig

-    def new_canvas(self, figure=None, row=1, col=1, projection='2d', xlabel=None, ylabel=None, zlabel=None, title=None, xlim=None, ylim=None, zlim=None, **kwargs):
-        if projection == '3d':
+    def new_canvas(
+        self,
+        figure=None,
+        row=1,
+        col=1,
+        projection="2d",
+        xlabel=None,
+        ylabel=None,
+        zlabel=None,
+        title=None,
+        xlim=None,
+        ylim=None,
+        zlim=None,
+        **kwargs
+    ):
+        if projection == "3d":
            from mpl_toolkits.mplot3d import Axes3D
-        elif projection == '2d':
+        elif projection == "2d":
            projection = None
-        if 'ax' in kwargs:
-            ax = kwargs.pop('ax')
+        if "ax" in kwargs:
+            ax = kwargs.pop("ax")
        else:
            if figure is not None:
                fig = figure
-            elif 'num' in kwargs and 'figsize' in kwargs:
-                fig = self.figure(num=kwargs.pop('num'), figsize=kwargs.pop('figsize'))
-            elif 'num' in kwargs:
-                fig = self.figure(num=kwargs.pop('num'))
-            elif 'figsize' in kwargs:
-                fig = self.figure(figsize=kwargs.pop('figsize'))
+            elif "num" in kwargs and "figsize" in kwargs:
+                fig = self.figure(num=kwargs.pop("num"), figsize=kwargs.pop("figsize"))
+            elif "num" in kwargs:
+                fig = self.figure(num=kwargs.pop("num"))
+            elif "figsize" in kwargs:
+                fig = self.figure(figsize=kwargs.pop("figsize"))
            else:
                fig = self.figure()

-            #if hasattr(fig, 'rows') and hasattr(fig, 'cols'):
-            ax = fig.add_subplot(fig.gridspec[row-1, col-1], projection=projection)
+            # if hasattr(fig, 'rows') and hasattr(fig, 'cols'):
+            ax = fig.add_subplot(fig.gridspec[row - 1, col - 1], projection=projection)

-        if xlim is not None: ax.set_xlim(xlim)
-        if ylim is not None: ax.set_ylim(ylim)
-        if xlabel is not None: ax.set_xlabel(xlabel)
-        if ylabel is not None: ax.set_ylabel(ylabel)
-        if title is not None: ax.set_title(title)
-        if projection == '3d':
-            if zlim is not None: ax.set_zlim(zlim)
-            if zlabel is not None: ax.set_zlabel(zlabel)
+        if xlim is not None:
+            ax.set_xlim(xlim)
+        if ylim is not None:
+            ax.set_ylim(ylim)
+        if xlabel is not None:
+            ax.set_xlabel(xlabel)
+        if ylabel is not None:
+            ax.set_ylabel(ylabel)
+        if title is not None:
+            ax.set_title(title)
+        if projection == "3d":
+            if zlim is not None:
+                ax.set_zlim(zlim)
+            if zlabel is not None:
+                ax.set_zlabel(zlabel)
        return ax, kwargs

    def add_to_canvas(self, ax, plots, legend=False, title=None, **kwargs):
-        #ax.autoscale_view()
-        fontdict=dict(family='sans-serif', weight='light', size=9)
+        # ax.autoscale_view()
+        fontdict = dict(family="sans-serif", weight="light", size=9)
        if legend is True:
            ax.legend(*ax.get_legend_handles_labels())
        elif legend >= 1:
-            #ax.legend(prop=fontdict)
+            # ax.legend(prop=fontdict)
            legend_ontop(ax, ncol=legend, fontdict=fontdict)
-        if title is not None: ax.figure.suptitle(title)
+        if title is not None:
+            ax.figure.suptitle(title)
        return plots

    def show_canvas(self, ax, **kwargs):
        ax.figure.canvas.draw()
        return ax.figure

-    def scatter(self, ax, X, Y, Z=None, color=Tango.colorsHex['mediumBlue'], label=None, marker='o', **kwargs):
+    def scatter(
+        self,
+        ax,
+        X,
+        Y,
+        Z=None,
+        color=Tango.colorsHex["mediumBlue"],
+        label=None,
+        marker="o",
+        **kwargs
+    ):
        if Z is not None:
            return ax.scatter(X, Y, c=color, zs=Z, label=label, marker=marker, **kwargs)
        return ax.scatter(X, Y, c=color, label=label, marker=marker, **kwargs)
@ -106,129 +139,258 @@ class MatplotlibPlots(AbstractPlottingLibrary):
            return ax.plot(X, Y, color=color, zs=Z, label=label, **kwargs)
        return ax.plot(X, Y, color=color, label=label, **kwargs)

-    def plot_axis_lines(self, ax, X, color=Tango.colorsHex['darkRed'], label=None, **kwargs):
+    def plot_axis_lines(
+        self, ax, X, color=Tango.colorsHex["darkRed"], label=None, **kwargs
+    ):
        from matplotlib import transforms
        from matplotlib.path import Path
-        if 'marker' not in kwargs:
-            kwargs['marker'] = Path([[-.2,0.],    [-.2,.5],    [0.,1.],    [.2,.5],     [.2,0.],     [-.2,0.]],
-                                    [Path.MOVETO, Path.LINETO, Path.LINETO, Path.LINETO, Path.LINETO, Path.CLOSEPOLY])
-        if 'transform' not in kwargs:
+
+        if "marker" not in kwargs:
+            kwargs["marker"] = Path(
+                [
+                    [-0.2, 0.0],
+                    [-0.2, 0.5],
+                    [0.0, 1.0],
+                    [0.2, 0.5],
+                    [0.2, 0.0],
+                    [-0.2, 0.0],
+                ],
+                [
+                    Path.MOVETO,
+                    Path.LINETO,
+                    Path.LINETO,
+                    Path.LINETO,
+                    Path.LINETO,
+                    Path.CLOSEPOLY,
+                ],
+            )
+        if "transform" not in kwargs:
            if X.shape[1] == 1:
-                kwargs['transform'] = transforms.blended_transform_factory(ax.transData, ax.transAxes)
+                kwargs["transform"] = transforms.blended_transform_factory(
+                    ax.transData, ax.transAxes
+                )
        if X.shape[1] == 2:
-            return ax.scatter(X[:,0], X[:,1], ax.get_zlim()[0], c=color, label=label, **kwargs)
+            return ax.scatter(
+                X[:, 0], X[:, 1], ax.get_zlim()[0], c=color, label=label, **kwargs
+            )
        return ax.scatter(X, np.zeros_like(X), c=color, label=label, **kwargs)

-    def barplot(self, ax, x, height, width=0.8, bottom=0, color=Tango.colorsHex['mediumBlue'], label=None, **kwargs):
-        if 'align' not in kwargs:
-            kwargs['align'] = 'center'
-        return ax.bar(x=x, height=height, width=width,
-               bottom=bottom, label=label, color=color,
-               **kwargs)
+    def barplot(
+        self,
+        ax,
+        x,
+        height,
+        width=0.8,
+        bottom=0,
+        color=Tango.colorsHex["mediumBlue"],
+        label=None,
+        **kwargs
+    ):
+        if "align" not in kwargs:
+            kwargs["align"] = "center"
+        return ax.bar(
+            x=x,
+            height=height,
+            width=width,
+            bottom=bottom,
+            label=label,
+            color=color,
+            **kwargs
+        )

-    def xerrorbar(self, ax, X, Y, error, color=Tango.colorsHex['darkRed'], label=None, **kwargs):
-        if not('linestyle' in kwargs or 'ls' in kwargs):
-            kwargs['ls'] = 'none'
-        #if Z is not None:
+    def xerrorbar(
+        self, ax, X, Y, error, color=Tango.colorsHex["darkRed"], label=None, **kwargs
+    ):
+        if not ("linestyle" in kwargs or "ls" in kwargs):
+            kwargs["ls"] = "none"
+        # if Z is not None:
        #    return ax.errorbar(X, Y, Z, xerr=error, ecolor=color, label=label, **kwargs)
        return ax.errorbar(X, Y, xerr=error, ecolor=color, label=label, **kwargs)

-    def yerrorbar(self, ax, X, Y, error, color=Tango.colorsHex['darkRed'], label=None, **kwargs):
-        if not('linestyle' in kwargs or 'ls' in kwargs):
-            kwargs['ls'] = 'none'
-        #if Z is not None:
+    def yerrorbar(
+        self, ax, X, Y, error, color=Tango.colorsHex["darkRed"], label=None, **kwargs
+    ):
+        if not ("linestyle" in kwargs or "ls" in kwargs):
+            kwargs["ls"] = "none"
+        # if Z is not None:
        #    return ax.errorbar(X, Y, Z, yerr=error, ecolor=color, label=label, **kwargs)
        return ax.errorbar(X, Y, yerr=error, ecolor=color, label=label, **kwargs)

-    def imshow(self, ax, X, extent=None, label=None, vmin=None, vmax=None, **imshow_kwargs):
-        if 'origin' not in imshow_kwargs:
-            imshow_kwargs['origin'] = 'lower'
-        #xmin, xmax, ymin, ymax = extent
-        #xoffset, yoffset = (xmax - xmin) / (2. * X.shape[0]), (ymax - ymin) / (2. * X.shape[1])
-        #xmin, xmax, ymin, ymax = extent = xmin-xoffset, xmax+xoffset, ymin-yoffset, ymax+yoffset
-        return ax.imshow(X, label=label, extent=extent, vmin=vmin, vmax=vmax, **imshow_kwargs)
+    def imshow(
+        self, ax, X, extent=None, label=None, vmin=None, vmax=None, **imshow_kwargs
+    ):
+        if "origin" not in imshow_kwargs:
+            imshow_kwargs["origin"] = "lower"
+        # xmin, xmax, ymin, ymax = extent
+        # xoffset, yoffset = (xmax - xmin) / (2. * X.shape[0]), (ymax - ymin) / (2. * X.shape[1])
+        # xmin, xmax, ymin, ymax = extent = xmin-xoffset, xmax+xoffset, ymin-yoffset, ymax+yoffset
+        return ax.imshow(
+            X, label=label, extent=extent, vmin=vmin, vmax=vmax, **imshow_kwargs
+        )

-    def imshow_interact(self, ax, plot_function, extent, label=None, resolution=None, vmin=None, vmax=None, **imshow_kwargs):
-        if imshow_kwargs is None: imshow_kwargs = {}
-        if 'origin' not in imshow_kwargs:
-            imshow_kwargs['origin'] = 'lower'
-        return ImshowController(ax, plot_function, extent, resolution=resolution, vmin=vmin, vmax=vmax, **imshow_kwargs)
+    def imshow_interact(
+        self,
+        ax,
+        plot_function,
+        extent,
+        label=None,
+        resolution=None,
+        vmin=None,
+        vmax=None,
+        **imshow_kwargs
+    ):
+        if imshow_kwargs is None:
+            imshow_kwargs = {}
+        if "origin" not in imshow_kwargs:
+            imshow_kwargs["origin"] = "lower"
+        return ImshowController(
+            ax,
+            plot_function,
+            extent,
+            resolution=resolution,
+            vmin=vmin,
+            vmax=vmax,
+            **imshow_kwargs
+        )

-    def annotation_heatmap(self, ax, X, annotation, extent=None, label=None, imshow_kwargs=None, **annotation_kwargs):
-        if imshow_kwargs is None: imshow_kwargs = {}
-        if 'origin' not in imshow_kwargs:
-            imshow_kwargs['origin'] = 'lower'
-        if ('ha' not in annotation_kwargs) and ('horizontalalignment' not in annotation_kwargs):
-            annotation_kwargs['ha'] = 'center'
-        if ('va' not in annotation_kwargs) and ('verticalalignment' not in annotation_kwargs):
-            annotation_kwargs['va'] = 'center'
+    def annotation_heatmap(
+        self,
+        ax,
+        X,
+        annotation,
+        extent=None,
+        label=None,
+        imshow_kwargs=None,
+        **annotation_kwargs
+    ):
+        if imshow_kwargs is None:
+            imshow_kwargs = {}
+        if "origin" not in imshow_kwargs:
+            imshow_kwargs["origin"] = "lower"
+        if ("ha" not in annotation_kwargs) and (
+            "horizontalalignment" not in annotation_kwargs
+        ):
+            annotation_kwargs["ha"] = "center"
+        if ("va" not in annotation_kwargs) and (
+            "verticalalignment" not in annotation_kwargs
+        ):
+            annotation_kwargs["va"] = "center"
        imshow = self.imshow(ax, X, extent, label, **imshow_kwargs)
        if extent is None:
            extent = (0, X.shape[0], 0, X.shape[1])
        xmin, xmax, ymin, ymax = extent
-        xoffset, yoffset = (xmax - xmin) / (2. * X.shape[0]), (ymax - ymin) / (2. * X.shape[1])
+        xoffset, yoffset = (xmax - xmin) / (2.0 * X.shape[0]), (ymax - ymin) / (
+            2.0 * X.shape[1]
+        )
        xlin = np.linspace(xmin, xmax, X.shape[0], endpoint=False)
        ylin = np.linspace(ymin, ymax, X.shape[1], endpoint=False)
        annotations = []
        for [i, x], [j, y] in itertools.product(enumerate(xlin), enumerate(ylin)):
-            annotations.append(ax.text(x+xoffset, y+yoffset, "{}".format(annotation[j, i]), **annotation_kwargs))
+            annotations.append(
+                ax.text(
+                    x + xoffset,
+                    y + yoffset,
+                    "{}".format(annotation[j, i]),
+                    **annotation_kwargs
+                )
+            )
        return imshow, annotations

-    def annotation_heatmap_interact(self, ax, plot_function, extent, label=None, resolution=15, imshow_kwargs=None, **annotation_kwargs):
-        if imshow_kwargs is None: imshow_kwargs = {}
-        if 'origin' not in imshow_kwargs:
-            imshow_kwargs['origin'] = 'lower'
-        return ImAnnotateController(ax, plot_function, extent, resolution=resolution, imshow_kwargs=imshow_kwargs or {}, **annotation_kwargs)
+    def annotation_heatmap_interact(
+        self,
+        ax,
+        plot_function,
+        extent,
+        label=None,
+        resolution=15,
+        imshow_kwargs=None,
+        **annotation_kwargs
+    ):
+        if imshow_kwargs is None:
+            imshow_kwargs = {}
+        if "origin" not in imshow_kwargs:
+            imshow_kwargs["origin"] = "lower"
+        return ImAnnotateController(
+            ax,
+            plot_function,
+            extent,
+            resolution=resolution,
+            imshow_kwargs=imshow_kwargs or {},
+            **annotation_kwargs
+        )

    def contour(self, ax, X, Y, C, levels=20, label=None, **kwargs):
-        return ax.contour(X, Y, C, levels=np.linspace(C.min(), C.max(), levels), label=label, **kwargs)
+        return ax.contour(
+            X, Y, C, levels=np.linspace(C.min(), C.max(), levels), label=label, **kwargs
+        )

    def surface(self, ax, X, Y, Z, color=None, label=None, **kwargs):
        return ax.plot_surface(X, Y, Z, label=label, **kwargs)

-    def fill_between(self, ax, X, lower, upper, color=Tango.colorsHex['mediumBlue'], label=None, **kwargs):
+    def fill_between(
+        self,
+        ax,
+        X,
+        lower,
+        upper,
+        color=Tango.colorsHex["mediumBlue"],
+        label=None,
+        **kwargs
+    ):
        return ax.fill_between(X, lower, upper, facecolor=color, label=label, **kwargs)

-    def fill_gradient(self, canvas, X, percentiles, color=Tango.colorsHex['mediumBlue'], label=None, **kwargs):
+    def fill_gradient(
+        self,
+        canvas,
+        X,
+        percentiles,
+        color=Tango.colorsHex["mediumBlue"],
+        label=None,
+        **kwargs
+    ):
        ax = canvas
        plots = []

-        if 'edgecolors' not in kwargs:
-            kwargs['edgecolors'] = 'none'
+        if "edgecolors" not in kwargs:
+            kwargs["edgecolors"] = "none"

-        if 'facecolors' in kwargs:
-            color = kwargs.pop('facecolors')
+        if "facecolors" in kwargs:
+            color = kwargs.pop("facecolors")

-        if 'array' in kwargs:
-            array = kwargs.pop('array')
+        if "array" in kwargs:
+            array = kwargs.pop("array")
        else:
-            array = 1.-np.abs(np.linspace(-.97, .97, len(percentiles)-1))
+            array = 1.0 - np.abs(np.linspace(-0.97, 0.97, len(percentiles) - 1))

-        if 'alpha' in kwargs:
-            alpha = kwargs.pop('alpha')
+        if "alpha" in kwargs:
+            alpha = kwargs.pop("alpha")
        else:
-            alpha = .8
+            alpha = 0.8

-        if 'cmap' in kwargs:
-            cmap = kwargs.pop('cmap')
+        if "cmap" in kwargs:
+            cmap = kwargs.pop("cmap")
        else:
-            cmap = LinearSegmentedColormap.from_list('WhToColor', (color, color), N=array.size)
+            cmap = LinearSegmentedColormap.from_list(
+                "WhToColor", (color, color), N=array.size
+            )
        cmap._init()
-        cmap._lut[:-3, -1] = alpha*array
+        cmap._lut[:-3, -1] = alpha * array

-        kwargs['facecolors'] = [cmap(i) for i in np.linspace(0,1,cmap.N)]
+        kwargs["facecolors"] = [cmap(i) for i in np.linspace(0, 1, cmap.N)]

        # pop where from kwargs
-        where = kwargs.pop('where') if 'where' in kwargs else None
+        where = kwargs.pop("where") if "where" in kwargs else None
        # pop interpolate, which we actually do not do here!
-        if 'interpolate' in kwargs: kwargs.pop('interpolate')
+        if "interpolate" in kwargs:
+            kwargs.pop("interpolate")

        def pairwise(iterable):
            "s -> (s0,s1), (s1,s2), (s2, s3), ..."
            from itertools import tee
-            #try:
+
+            # try:
            #    from itertools import izip as zip
-            #except ImportError:
+            # except ImportError:
            #    pass
            a, b = tee(iterable)
            next(b, None)
@ -245,6 +407,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
            ax._process_unit_info(ydata=y2)
            # Convert the arrays so we can work with them
            from numpy import ma
+
            x = ma.masked_invalid(ax.convert_xunits(X))
            y1 = ma.masked_invalid(ax.convert_yunits(y1))
            y2 = ma.masked_invalid(ax.convert_yunits(y2))
@ -263,6 +426,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                raise ValueError("Argument dimensions are incompatible")

            from functools import reduce
+
            mask = reduce(ma.mask_or, [ma.getmask(a) for a in (x, y1, y2)])
            if mask is not ma.nomask:
                where &= ~mask
@ -277,7 +441,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                    continue

                N = len(xslice)
-                p = np.zeros((2 * N + 2, 2), np.float)
+                p = np.zeros((2 * N + 2, 2), float)

                # the purpose of the next two lines is for when y2 is a
                # scalar like 0 and we want the fill to go all the way
@ -288,16 +452,17 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                p[0] = start
                p[N + 1] = end

-                p[1:N + 1, 0] = xslice
-                p[1:N + 1, 1] = y1slice
-                p[N + 2:, 0] = xslice[::-1]
-                p[N + 2:, 1] = y2slice[::-1]
+                p[1 : N + 1, 0] = xslice
+                p[1 : N + 1, 1] = y1slice
+                p[N + 2 :, 0] = xslice[::-1]
+                p[N + 2 :, 1] = y2slice[::-1]

                polys.append(p)
            polycol.extend(polys)
        from matplotlib.collections import PolyCollection
-        if 'zorder' not in kwargs:
-            kwargs['zorder'] = 0
+
+        if "zorder" not in kwargs:
+            kwargs["zorder"] = 0
        plots.append(PolyCollection(polycol, label=label, **kwargs))
        ax.add_collection(plots[-1], autolim=True)
        ax.autoscale_view()
--- a/GPy/testing/test_ep_likelihood.py
+++ b/GPy/testing/test_ep_likelihood.py
@ -24,7 +24,7 @@ class TestObservationModels:
        self.Y = (np.sin(self.X[:, 0] * 2 * np.pi) + noise)[:, None]
        self.num_points = self.X.shape[0]
        self.f = np.random.rand(self.N, 1)
-        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
+        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=int)[:, None]
        # self.binary_Y[self.binary_Y == 0.0] = -1.0
        self.positive_Y = np.exp(self.Y.copy())

--- a/GPy/testing/test_likelihood.py
+++ b/GPy/testing/test_likelihood.py
@ -136,7 +136,7 @@ class TestNoiseModels:
        noise = np.random.randn(*self.X[:, 0].shape) * self.real_std
        self.Y = (np.sin(self.X[:, 0] * 2 * np.pi) + noise)[:, None]
        self.f = np.random.rand(self.N, 1)
-        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
+        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=int)[:, None]
        self.binary_Y[self.binary_Y == 0.0] = -1.0
        self.positive_Y = np.exp(self.Y.copy())
        tmp = (
--- a/GPy/testing/test_model.py
+++ b/GPy/testing/test_model.py
@ -1432,8 +1432,8 @@ class TestGradient:
        y = np.zeros((D * N_train,))
        x_test = np.zeros((D * (N - N_train),))
        y_test = np.zeros((D * (N - N_train),))
-        indexD = np.zeros((D * N_train), dtype=np.int)
-        indexD_test = np.zeros((D * (N - N_train)), dtype=np.int)
+        indexD = np.zeros((D * N_train), dtype=int)
+        indexD_test = np.zeros((D * (N - N_train)), dtype=int)

        offset_all = 0
        offset_train = 0
--- a/GPy/testing/test_pickle.py
+++ b/GPy/testing/test_pickle.py
@ -53,7 +53,7 @@ class TestPickleSupport(ListDictTestCase):
        assert par.param_array.tolist() == pcopy.param_array.tolist()
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        assert str(par) == str(pcopy)
-        assert par.param_array != pcopy.param_array
+        assert np.all(par.param_array != pcopy.param_array)
        assert par.gradient_full != pcopy.gradient_full
        assert pcopy.checkgrad()
        assert np.any(pcopy.gradient != 0.0)
@ -72,7 +72,7 @@ class TestPickleSupport(ListDictTestCase):
        np.testing.assert_allclose(par.param_array, pcopy.param_array)
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        assert str(par) == str(pcopy)
-        assert par.param_array != pcopy.param_array
+        assert np.all(par.param_array != pcopy.param_array)
        assert par.gradient_full != pcopy.gradient_full
        assert pcopy.checkgrad()
        assert np.any(pcopy.gradient != 0.0)
@ -97,7 +97,7 @@ class TestPickleSupport(ListDictTestCase):
        assert par.param_array.tolist() == pcopy.param_array.tolist()
        assert par.gradient_full.tolist() == pcopy.gradient_full.tolist()
        assert str(par) == str(pcopy)
-        assert par.param_array != pcopy.param_array
+        assert np.all(par.param_array != pcopy.param_array)
        assert par.gradient_full != pcopy.gradient_full
        with tempfile.TemporaryFile("w+b") as f:
            par.pickle(f)
@ -116,7 +116,7 @@ class TestPickleSupport(ListDictTestCase):
        assert par.param_array.tolist() == pcopy.param_array.tolist()
        assert par.gradient_full.tolist() == pcopy.gradient_full.tolist()
        assert str(par) == str(pcopy)
-        assert par.param_array != pcopy.param_array
+        assert np.all(par.param_array != pcopy.param_array)
        assert par.gradient_full != pcopy.gradient_full
        assert par.checkgrad()
        assert pcopy.checkgrad()
--- a/GPy/util/classification.py
+++ b/GPy/util/classification.py
@ -2,7 +2,8 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np

-def conf_matrix(p,labels,names=['1','0'],threshold=.5,show=True):
+
+def conf_matrix(p, labels, names=["1", "0"], threshold=0.5, show=True):
    """
    Returns error rate and true/false positives in a binary classification problem
    - Actual classes are displayed by column.
@ -16,18 +17,18 @@ def conf_matrix(p,labels,names=['1','0'],threshold=.5,show=True):
    :type show: False|True
    """
    assert p.size == labels.size, "Arrays p and labels have different dimensions."
-    decision = np.ones((labels.size,1))
-    decision[p<threshold] = 0
+    decision = np.ones((labels.size, 1))
+    decision[p < threshold] = 0
    diff = decision - labels
    false_0 = diff[diff == -1].size
    false_1 = diff[diff == 1].size
-    true_1 = np.sum(decision[diff ==0])
+    true_1 = np.sum(decision[diff == 0])
    true_0 = labels.size - true_1 - false_0 - false_1
-    error = (false_1 + false_0)/np.float(labels.size)
+    error = (false_1 + false_0) / float(labels.size)
    if show:
-        print(100. - error * 100,'% instances correctly classified')
-        print('%-10s|  %-10s|  %-10s| ' % ('',names[0],names[1]))
-        print('----------|------------|------------|')
-        print('%-10s|  %-10s|  %-10s| ' % (names[0],true_1,false_0))
-        print('%-10s|  %-10s|  %-10s| ' % (names[1],false_1,true_0))
-    return error,true_1, false_1, true_0, false_0
+        print(100.0 - error * 100, "% instances correctly classified")
+        print("%-10s|  %-10s|  %-10s| " % ("", names[0], names[1]))
+        print("----------|------------|------------|")
+        print("%-10s|  %-10s|  %-10s| " % (names[0], true_1, false_0))
+        print("%-10s|  %-10s|  %-10s| " % (names[1], false_1, true_0))
+    return error, true_1, false_1, true_0, false_0
--- a/GPy/util/multioutput.py
+++ b/GPy/util/multioutput.py
@ -2,6 +2,7 @@ import numpy as np
 import warnings
 import GPy

+
 def index_to_slices(index):
    """
    take a numpy array of integers (index) and return a  nested list of slices such that the slices describe the start, stop points for each integer in the index.
@ -16,28 +17,35 @@ def index_to_slices(index):
    returns
    >>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
    """
-    if len(index)==0:
-        return[]
+    if len(index) == 0:
+        return []

-    #contruct the return structure
-    ind = np.asarray(index,dtype=np.int)
-    ret = [[] for i in range(ind.max()+1)]
+    # contruct the return structure
+    ind = np.asarray(index, dtype=int)
+    ret = [[] for i in range(ind.max() + 1)]

-    #find the switchpoints
-    ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
-    switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
+    # find the switchpoints
+    ind_ = np.hstack((ind, ind[0] + ind[-1] + 1))
+    switchpoints = np.nonzero(ind_ - np.roll(ind_, +1))[0]

-    [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
+    [
+        ret[ind_i].append(slice(*indexes_i))
+        for ind_i, indexes_i in zip(
+            ind[switchpoints[:-1]], zip(switchpoints, switchpoints[1:])
+        )
+    ]
    return ret

+
 def get_slices(input_list):
    num_outputs = len(input_list)
-    _s = [0] + [ _x.shape[0] for _x in input_list ]
+    _s = [0] + [_x.shape[0] for _x in input_list]
    _s = np.cumsum(_s)
-    slices = [slice(a,b) for a,b in zip(_s[:-1],_s[1:])]
+    slices = [slice(a, b) for a, b in zip(_s[:-1], _s[1:])]
    return slices

-def build_XY(input_list,output_list=None,index=None):
+
+def build_XY(input_list, output_list=None, index=None):
    num_outputs = len(input_list)
    if output_list is not None:
        assert num_outputs == len(output_list)
@ -47,27 +55,35 @@ def build_XY(input_list,output_list=None,index=None):

    if index is not None:
        assert len(index) == num_outputs
-        I = np.hstack( [np.repeat(j,_x.shape[0]) for _x,j in zip(input_list,index)] )
+        I = np.hstack([np.repeat(j, _x.shape[0]) for _x, j in zip(input_list, index)])
    else:
-        I = np.hstack( [np.repeat(j,_x.shape[0]) for _x,j in zip(input_list,range(num_outputs))] )
+        I = np.hstack(
+            [np.repeat(j, _x.shape[0]) for _x, j in zip(input_list, range(num_outputs))]
+        )

    X = np.vstack(input_list)
-    X = np.hstack([X,I[:,None]])
+    X = np.hstack([X, I[:, None]])

-    return X,Y,I[:,None]#slices
+    return X, Y, I[:, None]  # slices

-def build_likelihood(Y_list,noise_index,likelihoods_list=None):
+
+def build_likelihood(Y_list, noise_index, likelihoods_list=None):
    Ny = len(Y_list)
    if likelihoods_list is None:
-       likelihoods_list = [GPy.likelihoods.Gaussian(name="Gaussian_noise_%s" %j) for y,j in zip(Y_list,range(Ny))]
+        likelihoods_list = [
+            GPy.likelihoods.Gaussian(name="Gaussian_noise_%s" % j)
+            for y, j in zip(Y_list, range(Ny))
+        ]
    else:
        assert len(likelihoods_list) == Ny
-    #likelihood = GPy.likelihoods.mixed_noise.MixedNoise(likelihoods_list=likelihoods_list, noise_index=noise_index)
-    likelihood = GPy.likelihoods.mixed_noise.MixedNoise(likelihoods_list=likelihoods_list)
+    # likelihood = GPy.likelihoods.mixed_noise.MixedNoise(likelihoods_list=likelihoods_list, noise_index=noise_index)
+    likelihood = GPy.likelihoods.mixed_noise.MixedNoise(
+        likelihoods_list=likelihoods_list
+    )
    return likelihood


-def ICM(input_dim, num_outputs, kernel, W_rank=1,W=None,kappa=None,name='ICM'):
+def ICM(input_dim, num_outputs, kernel, W_rank=1, W=None, kappa=None, name="ICM"):
    """
    Builds a kernel for an Intrinsic Coregionalization Model

@ -80,13 +96,26 @@ def ICM(input_dim, num_outputs, kernel, W_rank=1,W=None,kappa=None,name='ICM'):
    """
    if kernel.input_dim != input_dim:
        kernel.input_dim = input_dim
-        warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
+        warnings.warn(
+            "kernel's input dimension overwritten to fit input_dim parameter."
+        )

-    K = kernel.prod(GPy.kern.Coregionalize(1, num_outputs, active_dims=[input_dim], rank=W_rank,W=W,kappa=kappa,name='B'),name=name)
+    K = kernel.prod(
+        GPy.kern.Coregionalize(
+            1,
+            num_outputs,
+            active_dims=[input_dim],
+            rank=W_rank,
+            W=W,
+            kappa=kappa,
+            name="B",
+        ),
+        name=name,
+    )
    return K


-def LCM(input_dim, num_outputs, kernels_list, W_rank=1,name='ICM'):
+def LCM(input_dim, num_outputs, kernels_list, W_rank=1, name="ICM"):
    """
    Builds a kernel for an Linear Coregionalization Model

@ -98,15 +127,15 @@ def LCM(input_dim, num_outputs, kernels_list, W_rank=1,name='ICM'):
    :type W_rank: integer
    """
    Nk = len(kernels_list)
-    K = ICM(input_dim,num_outputs,kernels_list[0],W_rank,name='%s%s' %(name,0))
+    K = ICM(input_dim, num_outputs, kernels_list[0], W_rank, name="%s%s" % (name, 0))
    j = 1
    for kernel in kernels_list[1:]:
-        K += ICM(input_dim,num_outputs,kernel,W_rank,name='%s%s' %(name,j))
+        K += ICM(input_dim, num_outputs, kernel, W_rank, name="%s%s" % (name, j))
        j += 1
    return K


-def Private(input_dim, num_outputs, kernel, output, kappa=None,name='X'):
+def Private(input_dim, num_outputs, kernel, output, kappa=None, name="X"):
    """
    Builds a kernel for an Intrinsic Coregionalization Model

@ -117,7 +146,7 @@ def Private(input_dim, num_outputs, kernel, output, kappa=None,name='X'):
    :param W_rank: number tuples of the corregionalization parameters 'W'
    :type W_rank: integer
    """
-    K = ICM(input_dim,num_outputs,kernel,W_rank=1,kappa=kappa,name=name)
+    K = ICM(input_dim, num_outputs, kernel, W_rank=1, kappa=kappa, name=name)
    K.B.W.fix(0)
    _range = range(num_outputs)
    _range.pop(output)