diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3d43591e..7d56ed1f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ## Unreleased
 
++ update import in `.plotting.matplot_dep.defaults` due to change in matplotlib
+
 ## v1.13.1 (2024-01-14)
 
 * limit `scipy<1.12` as macos and linux jobs install some pre-release version of `scipy==1.12` which breaks tests
diff --git a/GPy/kern/src/sde_standard_periodic.py b/GPy/kern/src/sde_standard_periodic.py
index be32f7b2..2963eb66 100644
--- a/GPy/kern/src/sde_standard_periodic.py
+++ b/GPy/kern/src/sde_standard_periodic.py
@@ -13,213 +13,267 @@ import warnings
 
 from scipy import special as special
 
+
 class sde_StdPeriodic(StdPeriodic):
     """
-    
+
     Class provide extra functionality to transfer this covariance function into
     SDE form.
-    
+
     Standard Periodic kernel:
 
     .. math::
 
-       k(x,y) = \theta_1 \exp \left[  - \frac{1}{2} {}\sum_{i=1}^{input\_dim}  
+       k(x,y) = \theta_1 \exp \left[  - \frac{1}{2} {}\sum_{i=1}^{input\_dim}
        \left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
 
     """
+
     # TODO: write comment to the constructor arguments
     def __init__(self, *args, **kwargs):
         """
         Init constructior.
-        
-        Two optinal extra parameters are added in addition to the ones in 
+
+        Two optinal extra parameters are added in addition to the ones in
         StdPeriodic kernel.
-        
+
         :param approx_order: approximation order for the RBF covariance. (Default 7)
         :type approx_order: int
-        
+
         :param balance: Whether to balance this kernel separately. (Defaulf False). Model has a separate parameter for balancing.
         :type balance: bool
         """
-        
-        #import pdb; pdb.set_trace()
-        
-        if 'approx_order' in kwargs:
-            self.approx_order = kwargs.get('approx_order')
-            del kwargs['approx_order']
+
+        # import pdb; pdb.set_trace()
+
+        if "approx_order" in kwargs:
+            self.approx_order = kwargs.get("approx_order")
+            del kwargs["approx_order"]
         else:
             self.approx_order = 7
-        
-        
-        if 'balance' in kwargs:
-            self.balance = bool( kwargs.get('balance') )
-            del kwargs['balance']
+
+        if "balance" in kwargs:
+            self.balance = bool(kwargs.get("balance"))
+            del kwargs["balance"]
         else:
             self.balance = False
-        
+
         super(sde_StdPeriodic, self).__init__(*args, **kwargs)
-        
+
     def sde_update_gradient_full(self, gradients):
         """
         Update gradient in the order in which parameters are represented in the
         kernel
         """
-    
+
         self.variance.gradient = gradients[0]
         self.period.gradient = gradients[1]
         self.lengthscale.gradient = gradients[2]
-        
-    def sde(self): 
-        """ 
+
+    def sde(self):
+        """
         Return the state space representation of the standard periodic covariance.
-        
-        
+
+
         ! Note: one must constrain lengthscale not to drop below 0.2. (independently of approximation order)
         After this Bessel functions of the first becomes NaN. Rescaling
         time variable might help.
-        
+
         ! Note: one must keep period also not very low. Because then
-        the gradients wrt wavelength become ustable. 
+        the gradients wrt wavelength become ustable.
         However this might depend on the data. For test example with
-        300 data points the low limit is 0.15. 
-        """ 
-        
-        #import pdb; pdb.set_trace()
+        300 data points the low limit is 0.15.
+        """
+
+        # import pdb; pdb.set_trace()
         # Params to use: (in that order)
-        #self.variance
-        #self.period
-        #self.lengthscale
+        # self.variance
+        # self.period
+        # self.lengthscale
         if self.approx_order is not None:
             N = int(self.approx_order)
         else:
-            N = 7 # approximation order        
-        
-        p_period = float(self.period)        
-        p_lengthscale = 2*float(self.lengthscale)
-        p_variance = float(self.variance)        
-        
-        w0 = 2*np.pi/p_period # frequency
+            N = 7  # approximation order
+
+        p_period = float(self.period)
+        p_lengthscale = 2 * float(self.lengthscale)
+        p_variance = float(self.variance)
+
+        w0 = 2 * np.pi / p_period  # frequency
         # lengthscale is multiplied by 2 because of different definition of lengthscale
-        
-        [q2,dq2l] = seriescoeff(N, p_lengthscale, p_variance)        
-        
-        dq2l = 2*dq2l  # This is because the lengthscale if multiplied by 2.
-        
+
+        [q2, dq2l] = seriescoeff(N, p_lengthscale, p_variance)
+
+        dq2l = 2 * dq2l  # This is because the lengthscale if multiplied by 2.
+
         eps = 1e-12
-        if np.any( np.isfinite(q2) == False) or np.any( np.abs(q2) > 1.0/eps) or np.any( np.abs(q2) < eps):
-            warnings.warn("sde_Periodic:  Infinite, too small, or too large (eps={0:e}) values in q2 :".format(eps) + q2.__format__("") )
-                                
-        if np.any( np.isfinite(dq2l) == False) or np.any( np.abs(dq2l) > 1.0/eps) or np.any( np.abs(dq2l) < eps):
-            warnings.warn("sde_Periodic:  Infinite, too small, or too large (eps={0:e}) values in dq2l :".format(eps) + q2.__format__("") )
-                 
-                 
-        F    = np.kron(np.diag(range(0,N+1)),np.array( ((0, -w0), (w0, 0)) ) )
-        L    = np.eye(2*(N+1))
-        Qc   = np.zeros((2*(N+1), 2*(N+1)))
-        P_inf = np.kron(np.diag(q2),np.eye(2))
-        H    = np.kron(np.ones((1,N+1)),np.array((1,0)) )
+        if (
+            np.any(np.isfinite(q2) == False)
+            or np.any(np.abs(q2) > 1.0 / eps)
+            or np.any(np.abs(q2) < eps)
+        ):
+            warnings.warn(
+                "sde_Periodic:  Infinite, too small, or too large (eps={0:e}) values in q2 :".format(
+                    eps
+                )
+                + q2.__format__("")
+            )
+
+        if (
+            np.any(np.isfinite(dq2l) == False)
+            or np.any(np.abs(dq2l) > 1.0 / eps)
+            or np.any(np.abs(dq2l) < eps)
+        ):
+            warnings.warn(
+                "sde_Periodic:  Infinite, too small, or too large (eps={0:e}) values in dq2l :".format(
+                    eps
+                )
+                + q2.__format__("")
+            )
+
+        F = np.kron(np.diag(range(0, N + 1)), np.array(((0, -w0), (w0, 0))))
+        L = np.eye(2 * (N + 1))
+        Qc = np.zeros((2 * (N + 1), 2 * (N + 1)))
+        P_inf = np.kron(np.diag(q2), np.eye(2))
+        H = np.kron(np.ones((1, N + 1)), np.array((1, 0)))
         P0 = P_inf.copy()
-        
+
         # Derivatives
         dF = np.empty((F.shape[0], F.shape[1], 3))
         dQc = np.empty((Qc.shape[0], Qc.shape[1], 3))
-        dP_inf = np.empty((P_inf.shape[0], P_inf.shape[1], 3))         
-        
+        dP_inf = np.empty((P_inf.shape[0], P_inf.shape[1], 3))
+
         # Derivatives wrt self.variance
-        dF[:,:,0] = np.zeros(F.shape)
-        dQc[:,:,0] = np.zeros(Qc.shape)
-        dP_inf[:,:,0] = P_inf / p_variance
+        dF[:, :, 0] = np.zeros(F.shape)
+        dQc[:, :, 0] = np.zeros(Qc.shape)
+        dP_inf[:, :, 0] = P_inf / p_variance
 
         # Derivatives self.period
-        dF[:,:,1] = np.kron(np.diag(range(0,N+1)),np.array( ((0,  w0), (-w0, 0)) ) / p_period );
-        dQc[:,:,1] = np.zeros(Qc.shape)
-        dP_inf[:,:,1] = np.zeros(P_inf.shape)      
-        
-        # Derivatives self.lengthscales        
-        dF[:,:,2] = np.zeros(F.shape)
-        dQc[:,:,2] = np.zeros(Qc.shape)
-        dP_inf[:,:,2] = np.kron(np.diag(dq2l),np.eye(2))
+        dF[:, :, 1] = np.kron(
+            np.diag(range(0, N + 1)), np.array(((0, w0), (-w0, 0))) / p_period
+        )
+        dQc[:, :, 1] = np.zeros(Qc.shape)
+        dP_inf[:, :, 1] = np.zeros(P_inf.shape)
+
+        # Derivatives self.lengthscales
+        dF[:, :, 2] = np.zeros(F.shape)
+        dQc[:, :, 2] = np.zeros(Qc.shape)
+        dP_inf[:, :, 2] = np.kron(np.diag(dq2l), np.eye(2))
         dP0 = dP_inf.copy()
-        
+
         if self.balance:
             # Benefits of this are not very sound.
             import GPy.models.state_space_main as ssm
-            (F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf,dP0) = ssm.balance_ss_model(F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0 )
-            
+
+            (F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0) = ssm.balance_ss_model(
+                F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0
+            )
+
         return (F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0)
-        
-        
-        
-        
-def seriescoeff(m=6,lengthScale=1.0,magnSigma2=1.0, true_covariance=False):
+
+
+def seriescoeff(m=6, lengthScale=1.0, magnSigma2=1.0, true_covariance=False):
     """
-    Calculate the coefficients q_j^2 for the covariance function 
+    Calculate the coefficients q_j^2 for the covariance function
     approximation:
-    
+
         k(\tau) =  \sum_{j=0}^{+\infty} q_j^2 \cos(j\omega_0 \tau)
-    
+
     Reference is:
 
-    [1] Arno Solin and Simo Särkkä (2014). Explicit link between periodic 
-        covariance functions and state space models. In Proceedings of the 
-        Seventeenth International Conference on Artifcial Intelligence and 
-        Statistics (AISTATS 2014). JMLR: W&CP, volume 33.    
-    
-    Note! Only the infinite approximation (through Bessel function) 
+    [1] Arno Solin and Simo Särkkä (2014). Explicit link between periodic
+        covariance functions and state space models. In Proceedings of the
+        Seventeenth International Conference on Artifcial Intelligence and
+        Statistics (AISTATS 2014). JMLR: W&CP, volume 33.
+
+    Note! Only the infinite approximation (through Bessel function)
           is currently implemented.
 
     Input:
     ----------------
-    
+
     m: int
         Degree of approximation. Default 6.
     lengthScale: float
         Length scale parameter in the kerenl
     magnSigma2:float
         Multiplier in front of the kernel.
-        
-    
+
+
     Output:
     -----------------
-    
+
     coeffs: array(m+1)
         Covariance series coefficients
-    
+
     coeffs_dl: array(m+1)
         Derivatives of the coefficients with respect to lengthscale.
-    
+
     """
-    
+
     if true_covariance:
-        
-        bb = lambda j,m: (1.0 + np.array((j != 0), dtype=np.float64) ) / (2**(j)) *\
-            sp.special.binom(j, sp.floor( (j-m)/2.0 * np.array(m<=j, dtype=np.float64) ))*\
-            np.array(m<=j, dtype=np.float64) *np.array(sp.mod(j-m,2)==0, dtype=np.float64)
-                
-        M,J = np.meshgrid(range(0,m+1),range(0,m+1))
-        
-        coeffs = bb(J,M) / sp.misc.factorial(J) * sp.exp( -lengthScale**(-2) ) *\
-             (lengthScale**(-2))**J  *magnSigma2
-        
-        coeffs_dl = np.sum( coeffs*lengthScale**(-3)*(2.0-2.0*J*lengthScale**2),0)         
-        
-        coeffs = np.sum(coeffs,0)
-        
+
+        bb = (
+            lambda j, m: (1.0 + np.array((j != 0), dtype=np.float64))
+            / (2 ** (j))
+            * sp.special.binom(
+                j, sp.floor((j - m) / 2.0 * np.array(m <= j, dtype=np.float64))
+            )
+            * np.array(m <= j, dtype=np.float64)
+            * np.array(sp.mod(j - m, 2) == 0, dtype=np.float64)
+        )
+
+        M, J = np.meshgrid(range(0, m + 1), range(0, m + 1))
+
+        coeffs = (
+            bb(J, M)
+            / sp.misc.factorial(J)
+            * np.exp(-(lengthScale ** (-2)))
+            * (lengthScale ** (-2)) ** J
+            * magnSigma2
+        )
+
+        coeffs_dl = np.sum(
+            coeffs * lengthScale ** (-3) * (2.0 - 2.0 * J * lengthScale**2), 0
+        )
+
+        coeffs = np.sum(coeffs, 0)
+
     else:
-        coeffs = 2*magnSigma2*sp.exp( -lengthScale**(-2) ) * special.iv(range(0,m+1),1.0/lengthScale**(2))
-        if np.any( np.isfinite(coeffs) == False):
+        coeffs = (
+            2
+            * magnSigma2
+            * np.exp(-(lengthScale ** (-2)))
+            * special.iv(range(0, m + 1), 1.0 / lengthScale ** (2))
+        )
+        if np.any(np.isfinite(coeffs) == False):
             raise ValueError("sde_standard_periodic: Coefficients are not finite!")
-        #import pdb; pdb.set_trace()
-        coeffs[0] = 0.5*coeffs[0]
-        #print(coeffs)
+        # import pdb; pdb.set_trace()
+        coeffs[0] = 0.5 * coeffs[0]
+        # print(coeffs)
         # Derivatives wrt (lengthScale)
-        coeffs_dl = np.zeros(m+1)
-        coeffs_dl[1:] = magnSigma2*lengthScale**(-3) * sp.exp(-lengthScale**(-2))*\
-        (-4*special.iv(range(0,m),lengthScale**(-2)) + 4*(1+np.arange(1,m+1)*lengthScale**(2))*special.iv(range(1,m+1),lengthScale**(-2)) )    
-            
+        coeffs_dl = np.zeros(m + 1)
+        coeffs_dl[1:] = (
+            magnSigma2
+            * lengthScale ** (-3)
+            * np.exp(-(lengthScale ** (-2)))
+            * (
+                -4 * special.iv(range(0, m), lengthScale ** (-2))
+                + 4
+                * (1 + np.arange(1, m + 1) * lengthScale ** (2))
+                * special.iv(range(1, m + 1), lengthScale ** (-2))
+            )
+        )
+
         # The first element
-        coeffs_dl[0] = magnSigma2*lengthScale**(-3) * np.exp(-lengthScale**(-2))*\
-            (2*special.iv(0,lengthScale**(-2)) - 2*special.iv(1,lengthScale**(-2)) )     
-        
+        coeffs_dl[0] = (
+            magnSigma2
+            * lengthScale ** (-3)
+            * np.exp(-(lengthScale ** (-2)))
+            * (
+                2 * special.iv(0, lengthScale ** (-2))
+                - 2 * special.iv(1, lengthScale ** (-2))
+            )
+        )
 
     return coeffs.squeeze(), coeffs_dl.squeeze()
diff --git a/GPy/kern/src/sde_stationary.py b/GPy/kern/src/sde_stationary.py
index 9802905c..61516799 100644
--- a/GPy/kern/src/sde_stationary.py
+++ b/GPy/kern/src/sde_stationary.py
@@ -11,12 +11,14 @@ from .stationary import RatQuad
 
 import numpy as np
 import scipy as sp
+
 try:
     from scipy.linalg import solve_continuous_lyapunov as lyap
 except ImportError:
     from scipy.linalg import solve_lyapunov as lyap
 import warnings
 
+
 class sde_RBF(RBF):
     """
 
@@ -30,37 +32,35 @@ class sde_RBF(RBF):
         k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \\ \\ \\ \\  \text{ where  } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
 
     """
+
     def __init__(self, *args, **kwargs):
         """
         Init constructior.
-        
-        Two optinal extra parameters are added in addition to the ones in 
+
+        Two optinal extra parameters are added in addition to the ones in
         RBF kernel.
-        
+
         :param approx_order: approximation order for the RBF covariance. (Default 10)
         :type approx_order: int
-        
+
         :param balance: Whether to balance this kernel separately. (Defaulf True). Model has a separate parameter for balancing.
         :type balance: bool
         """
-        
-        if 'balance' in kwargs:
-            self.balance = bool( kwargs.get('balance') )
-            del kwargs['balance']
+
+        if "balance" in kwargs:
+            self.balance = bool(kwargs.get("balance"))
+            del kwargs["balance"]
         else:
             self.balance = True
-        
-        
-        if 'approx_order' in kwargs:
-            self.approx_order = kwargs.get('approx_order')
-            del kwargs['approx_order']
+
+        if "approx_order" in kwargs:
+            self.approx_order = kwargs.get("approx_order")
+            del kwargs["approx_order"]
         else:
             self.approx_order = 6
-        
-        
-        
+
         super(sde_RBF, self).__init__(*args, **kwargs)
-        
+
     def sde_update_gradient_full(self, gradients):
         """
         Update gradient in the order in which parameters are represented in the
@@ -73,86 +73,111 @@ class sde_RBF(RBF):
     def sde(self):
         """
         Return the state space representation of the covariance.
-        
+
         Note! For Sparse GP inference too small or two high values of lengthscale
         lead to instabilities. This is because Qc are too high or too low
         and P_inf are not full rank. This effect depends on approximatio order.
         For N = 10. lengthscale must be in (0.8,8). For other N tests must be conducted.
         N=6: (0.06,31)
         Variance should be within reasonable bounds as well, but its dependence is linear.
-        
+
         The above facts do not take into accout regularization.
         """
-        #import pdb; pdb.set_trace()
+        # import pdb; pdb.set_trace()
         if self.approx_order is not None:
             N = self.approx_order
         else:
-            N = 10# approximation order ( number of terms in exponent series expansion)
-            
+            N = 10  # approximation order ( number of terms in exponent series expansion)
+
         roots_rounding_decimals = 6
 
         fn = np.math.factorial(N)
 
-        p_lengthscale = float( self.lengthscale )
+        p_lengthscale = float(self.lengthscale)
         p_variance = float(self.variance)
-        kappa = 1.0/2.0/p_lengthscale**2
+        kappa = 1.0 / 2.0 / p_lengthscale**2
+
+        Qc = np.array(((p_variance * np.sqrt(np.pi / kappa) * fn * (4 * kappa) ** N,),))
 
-        Qc = np.array( ((p_variance*np.sqrt(np.pi/kappa)*fn*(4*kappa)**N,),) )
-        
         eps = 1e-12
-        if (float(Qc) > 1.0/eps) or (float(Qc) < eps):
-            warnings.warn("""sde_RBF kernel: the noise variance Qc is either very large or very small. 
-                                It influece conditioning of P_inf: {0:e}""".format(float(Qc)) )
+        if (float(Qc) > 1.0 / eps) or (float(Qc) < eps):
+            warnings.warn(
+                """sde_RBF kernel: the noise variance Qc is either very large or very small. 
+                                It influece conditioning of P_inf: {0:e}""".format(
+                    float(Qc)
+                )
+            )
 
-        pp1 = np.zeros((2*N+1,)) # array of polynomial coefficients from higher power to lower
+        pp1 = np.zeros(
+            (2 * N + 1,)
+        )  # array of polynomial coefficients from higher power to lower
 
-        for n in range(0, N+1): # (2N+1) - number of polynomial coefficients
-            pp1[2*(N-n)] = fn*(4.0*kappa)**(N-n)/np.math.factorial(n)*(-1)**n
-            
-        pp = sp.poly1d(pp1)
-        roots = sp.roots(pp)
+        for n in range(0, N + 1):  # (2N+1) - number of polynomial coefficients
+            pp1[2 * (N - n)] = (
+                fn * (4.0 * kappa) ** (N - n) / np.math.factorial(n) * (-1) ** n
+            )
 
-        neg_real_part_roots = roots[np.round(np.real(roots) ,roots_rounding_decimals) < 0]
-        aa = sp.poly1d(neg_real_part_roots, r=True).coeffs
+        pp = np.poly1d(pp1)
+        roots = np.roots(pp)
 
-        F = np.diag(np.ones((N-1,)),1)
-        F[-1,:] = -aa[-1:0:-1]
+        neg_real_part_roots = roots[
+            np.round(np.real(roots), roots_rounding_decimals) < 0
+        ]
+        aa = np.poly1d(neg_real_part_roots, r=True).coeffs
 
-        L= np.zeros((N,1))
-        L[N-1,0] = 1
+        F = np.diag(np.ones((N - 1,)), 1)
+        F[-1, :] = -aa[-1:0:-1]
 
-        H = np.zeros((1,N))
-        H[0,0] = 1
+        L = np.zeros((N, 1))
+        L[N - 1, 0] = 1
+
+        H = np.zeros((1, N))
+        H[0, 0] = 1
 
         # Infinite covariance:
-        Pinf = lyap(F, -np.dot(L,np.dot( Qc[0,0],L.T)))
-        Pinf = 0.5*(Pinf + Pinf.T)
+        Pinf = lyap(F, -np.dot(L, np.dot(Qc[0, 0], L.T)))
+        Pinf = 0.5 * (Pinf + Pinf.T)
         # Allocating space for derivatives
-        dF    = np.empty([F.shape[0],F.shape[1],2])
-        dQc   = np.empty([Qc.shape[0],Qc.shape[1],2])
-        dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2])
+        dF = np.empty([F.shape[0], F.shape[1], 2])
+        dQc = np.empty([Qc.shape[0], Qc.shape[1], 2])
+        dPinf = np.empty([Pinf.shape[0], Pinf.shape[1], 2])
 
         # Derivatives:
         dFvariance = np.zeros(F.shape)
         dFlengthscale = np.zeros(F.shape)
-        dFlengthscale[-1,:] = -aa[-1:0:-1]/p_lengthscale * np.arange(-N,0,1)
+        dFlengthscale[-1, :] = -aa[-1:0:-1] / p_lengthscale * np.arange(-N, 0, 1)
 
-        dQcvariance = Qc/p_variance
-        dQclengthscale = np.array(( (p_variance*np.sqrt(2*np.pi)*fn*2**N*p_lengthscale**(-2*N)*(1-2*N),),))
-        
-        dPinf_variance = Pinf/p_variance
+        dQcvariance = Qc / p_variance
+        dQclengthscale = np.array(
+            (
+                (
+                    p_variance
+                    * np.sqrt(2 * np.pi)
+                    * fn
+                    * 2**N
+                    * p_lengthscale ** (-2 * N)
+                    * (1 - 2 * N),
+                ),
+            )
+        )
+
+        dPinf_variance = Pinf / p_variance
 
         lp = Pinf.shape[0]
-        coeff = np.arange(1,lp+1).reshape(lp,1) + np.arange(1,lp+1).reshape(1,lp) - 2
-        coeff[np.mod(coeff,2) != 0] = 0
-        dPinf_lengthscale = -1/p_lengthscale*Pinf*coeff
+        coeff = (
+            np.arange(1, lp + 1).reshape(lp, 1)
+            + np.arange(1, lp + 1).reshape(1, lp)
+            - 2
+        )
+        coeff[np.mod(coeff, 2) != 0] = 0
+        dPinf_lengthscale = -1 / p_lengthscale * Pinf * coeff
 
-        dF[:,:,0]    = dFvariance
-        dF[:,:,1]    = dFlengthscale
-        dQc[:,:,0]   = dQcvariance
-        dQc[:,:,1]   = dQclengthscale
-        dPinf[:,:,0] = dPinf_variance
-        dPinf[:,:,1] = dPinf_lengthscale
+        dF[:, :, 0] = dFvariance
+        dF[:, :, 1] = dFlengthscale
+        dQc[:, :, 0] = dQcvariance
+        dQc[:, :, 1] = dQclengthscale
+        dPinf[:, :, 0] = dPinf_variance
+        dPinf[:, :, 1] = dPinf_lengthscale
 
         P0 = Pinf.copy()
         dP0 = dPinf.copy()
@@ -161,10 +186,14 @@ class sde_RBF(RBF):
             # Benefits of this are not very sound. Helps only in one case:
             # SVD Kalman + RBF kernel
             import GPy.models.state_space_main as ssm
-            (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf,dP0) = ssm.balance_ss_model(F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0 )
+
+            (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0) = ssm.balance_ss_model(
+                F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0
+            )
 
         return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
 
+
 class sde_Exponential(Exponential):
     """
 
@@ -195,30 +224,31 @@ class sde_Exponential(Exponential):
         variance = float(self.variance.values)
         lengthscale = float(self.lengthscale)
 
-        F  = np.array(((-1.0/lengthscale,),))
-        L  = np.array(((1.0,),))
-        Qc = np.array( ((2.0*variance/lengthscale,),) )
+        F = np.array(((-1.0 / lengthscale,),))
+        L = np.array(((1.0,),))
+        Qc = np.array(((2.0 * variance / lengthscale,),))
         H = np.array(((1.0,),))
         Pinf = np.array(((variance,),))
         P0 = Pinf.copy()
 
-        dF = np.zeros((1,1,2));
-        dQc = np.zeros((1,1,2));
-        dPinf = np.zeros((1,1,2));
+        dF = np.zeros((1, 1, 2))
+        dQc = np.zeros((1, 1, 2))
+        dPinf = np.zeros((1, 1, 2))
 
-        dF[:,:,0] = 0.0
-        dF[:,:,1] = 1.0/lengthscale**2
+        dF[:, :, 0] = 0.0
+        dF[:, :, 1] = 1.0 / lengthscale**2
 
-        dQc[:,:,0] = 2.0/lengthscale
-        dQc[:,:,1] = -2.0*variance/lengthscale**2
+        dQc[:, :, 0] = 2.0 / lengthscale
+        dQc[:, :, 1] = -2.0 * variance / lengthscale**2
 
-        dPinf[:,:,0] = 1.0
-        dPinf[:,:,1] = 0.0
+        dPinf[:, :, 0] = 1.0
+        dPinf[:, :, 1] = 0.0
 
         dP0 = dPinf.copy()
 
         return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
 
+
 class sde_RatQuad(RatQuad):
     """
 
@@ -238,12 +268,12 @@ class sde_RatQuad(RatQuad):
         Return the state space representation of the covariance.
         """
 
-        assert False, 'Not Implemented'
+        assert False, "Not Implemented"
 
         # Params to use:
 
         # self.lengthscale
         # self.variance
-        #self.power
+        # self.power
 
-        #return (F, L, Qc, H, Pinf, dF, dQc, dPinf)
+        # return (F, L, Qc, H, Pinf, dF, dQc, dPinf)
diff --git a/GPy/plotting/matplot_dep/defaults.py b/GPy/plotting/matplot_dep/defaults.py
index 8518b9d0..553922d6 100644
--- a/GPy/plotting/matplot_dep/defaults.py
+++ b/GPy/plotting/matplot_dep/defaults.py
@@ -1,4 +1,4 @@
-#===============================================================================
+# ===============================================================================
 # Copyright (c) 2015, Max Zwiessele
 # All rights reserved.
 #
@@ -26,12 +26,12 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
+# ===============================================================================
 
-from matplotlib import cm
+from matplotlib import pyplot
 from .. import Tango
 
-'''
+"""
 This file is for defaults for the gpy plot, specific to the plotting library.
 
 Create a kwargs dictionary with the right name for the plotting function
@@ -40,36 +40,55 @@ the plotting library will be used.
 
 In the code, always ise plotting.gpy_plots.defaults to get the defaults, as
 it gives back an empty default, when defaults are not defined.
-'''
+"""
 
 # Data plots:
-data_1d = dict(lw=1.5, marker='x', color='k')
-data_2d = dict(s=35, edgecolors='none', linewidth=0., cmap=cm.get_cmap('hot'), alpha=.5)
-inducing_1d = dict(lw=0, s=500, color=Tango.colorsHex['darkRed'])
-inducing_2d = dict(s=17, edgecolor='k', linewidth=.4, color='white', alpha=.5, marker='^')
-inducing_3d = dict(lw=.3, s=500, color=Tango.colorsHex['darkRed'], edgecolor='k')
-xerrorbar = dict(color='k', fmt='none', elinewidth=.5, alpha=.5)
-yerrorbar = dict(color=Tango.colorsHex['darkRed'], fmt='none', elinewidth=.5, alpha=.5)
+data_1d = dict(lw=1.5, marker="x", color="k")
+data_2d = dict(
+    s=35, edgecolors="none", linewidth=0.0, cmap=pyplot.get_cmap("hot"), alpha=0.5
+)
+inducing_1d = dict(lw=0, s=500, color=Tango.colorsHex["darkRed"])
+inducing_2d = dict(
+    s=17, edgecolor="k", linewidth=0.4, color="white", alpha=0.5, marker="^"
+)
+inducing_3d = dict(lw=0.3, s=500, color=Tango.colorsHex["darkRed"], edgecolor="k")
+xerrorbar = dict(color="k", fmt="none", elinewidth=0.5, alpha=0.5)
+yerrorbar = dict(
+    color=Tango.colorsHex["darkRed"], fmt="none", elinewidth=0.5, alpha=0.5
+)
 
 # GP plots:
-meanplot_1d = dict(color=Tango.colorsHex['mediumBlue'], linewidth=2)
-meanplot_2d = dict(cmap='hot', linewidth=.5)
-meanplot_3d = dict(linewidth=0, antialiased=True, cstride=1, rstride=1, cmap='hot', alpha=.3)
-samples_1d = dict(color=Tango.colorsHex['mediumBlue'], linewidth=.3)
-samples_3d = dict(cmap='hot', alpha=.1, antialiased=True, cstride=1, rstride=1, linewidth=0)
-confidence_interval = dict(edgecolor=Tango.colorsHex['darkBlue'], linewidth=.5, color=Tango.colorsHex['lightBlue'],alpha=.2)
-density = dict(alpha=.5, color=Tango.colorsHex['lightBlue'])
+meanplot_1d = dict(color=Tango.colorsHex["mediumBlue"], linewidth=2)
+meanplot_2d = dict(cmap="hot", linewidth=0.5)
+meanplot_3d = dict(
+    linewidth=0, antialiased=True, cstride=1, rstride=1, cmap="hot", alpha=0.3
+)
+samples_1d = dict(color=Tango.colorsHex["mediumBlue"], linewidth=0.3)
+samples_3d = dict(
+    cmap="hot", alpha=0.1, antialiased=True, cstride=1, rstride=1, linewidth=0
+)
+confidence_interval = dict(
+    edgecolor=Tango.colorsHex["darkBlue"],
+    linewidth=0.5,
+    color=Tango.colorsHex["lightBlue"],
+    alpha=0.2,
+)
+density = dict(alpha=0.5, color=Tango.colorsHex["lightBlue"])
 
 # GPLVM plots:
-data_y_1d = dict(linewidth=0, cmap='RdBu', s=40)
-data_y_1d_plot = dict(color='k', linewidth=1.5)
+data_y_1d = dict(linewidth=0, cmap="RdBu", s=40)
+data_y_1d_plot = dict(color="k", linewidth=1.5)
 
 # Kernel plots:
-ard = dict(edgecolor='k', linewidth=1.2)
+ard = dict(edgecolor="k", linewidth=1.2)
 
 # Input plots:
-latent = dict(aspect='auto', cmap='Greys', interpolation='bicubic')
-gradient = dict(aspect='auto', cmap='RdBu', interpolation='nearest', alpha=.7)
-magnification = dict(aspect='auto', cmap='Greys', interpolation='bicubic')
-latent_scatter = dict(s=20, linewidth=.2, edgecolor='k', alpha=.9)
-annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7)
+latent = dict(aspect="auto", cmap="Greys", interpolation="bicubic")
+gradient = dict(aspect="auto", cmap="RdBu", interpolation="nearest", alpha=0.7)
+magnification = dict(aspect="auto", cmap="Greys", interpolation="bicubic")
+latent_scatter = dict(s=20, linewidth=0.2, edgecolor="k", alpha=0.9)
+annotation = dict(
+    fontdict=dict(family="sans-serif", weight="light", fontsize=9),
+    zorder=0.3,
+    alpha=0.7,
+)
diff --git a/GPy/plotting/matplot_dep/variational_plots.py b/GPy/plotting/matplot_dep/variational_plots.py
index 3f20efeb..17481a80 100644
--- a/GPy/plotting/matplot_dep/variational_plots.py
+++ b/GPy/plotting/matplot_dep/variational_plots.py
@@ -1,4 +1,6 @@
-from matplotlib import pyplot as pb, numpy as np
+from matplotlib import pyplot as pb
+import numpy as np
+
 
 def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
     """
@@ -17,6 +19,7 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
     if colors is None:
         from ..Tango import mediumList
         from itertools import cycle
+
         colors = cycle(mediumList)
         pb.clf()
     else:
@@ -33,21 +36,30 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
             a = ax[i]
         else:
             raise ValueError("Need one ax per latent dimension input_dim")
-        bg_lines.append(a.plot(means, c='k', alpha=.3))
-        lines.extend(a.plot(x, means.T[i], c=next(colors), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
-        fills.append(a.fill_between(x,
-                        means.T[i] - 2 * np.sqrt(variances.T[i]),
-                        means.T[i] + 2 * np.sqrt(variances.T[i]),
-                        facecolor=lines[-1].get_color(),
-                        alpha=.3))
-        a.legend(borderaxespad=0.)
+        bg_lines.append(a.plot(means, c="k", alpha=0.3))
+        lines.extend(
+            a.plot(
+                x, means.T[i], c=next(colors), label=r"$\mathbf{{X_{{{}}}}}$".format(i)
+            )
+        )
+        fills.append(
+            a.fill_between(
+                x,
+                means.T[i] - 2 * np.sqrt(variances.T[i]),
+                means.T[i] + 2 * np.sqrt(variances.T[i]),
+                facecolor=lines[-1].get_color(),
+                alpha=0.3,
+            )
+        )
+        a.legend(borderaxespad=0.0)
         a.set_xlim(x.min(), x.max())
         if i < means.shape[1] - 1:
-            a.set_xticklabels('')
+            a.set_xticklabels("")
     pb.draw()
-    a.figure.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
+    a.figure.tight_layout(h_pad=0.01)  # , rect=(0, 0, 1, .95))
     return dict(lines=lines, fills=fills, bg_lines=bg_lines)
 
+
 def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_side=True):
     """
     Plot latent space X in 1D:
@@ -62,45 +74,60 @@ def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_sid
     """
     if ax is None:
         if side_by_side:
-            fig = pb.figure(num=fignum, figsize=(16, min(12, (2 * parameterized.mean.shape[1]))))
+            fig = pb.figure(
+                num=fignum, figsize=(16, min(12, (2 * parameterized.mean.shape[1])))
+            )
         else:
-            fig = pb.figure(num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1]))))
+            fig = pb.figure(
+                num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1])))
+            )
     if colors is None:
         from ..Tango import mediumList
         from itertools import cycle
+
         colors = cycle(mediumList)
         pb.clf()
     else:
         colors = iter(colors)
     plots = []
-    means, variances, gamma = parameterized.mean, parameterized.variance, parameterized.binary_prob
+    means, variances, gamma = (
+        parameterized.mean,
+        parameterized.variance,
+        parameterized.binary_prob,
+    )
     x = np.arange(means.shape[0])
     for i in range(means.shape[1]):
         if side_by_side:
-            sub1 = (means.shape[1],2,2*i+1)
-            sub2 = (means.shape[1],2,2*i+2)
+            sub1 = (means.shape[1], 2, 2 * i + 1)
+            sub2 = (means.shape[1], 2, 2 * i + 2)
         else:
-            sub1 = (means.shape[1]*2,1,2*i+1)
-            sub2 = (means.shape[1]*2,1,2*i+2)
+            sub1 = (means.shape[1] * 2, 1, 2 * i + 1)
+            sub2 = (means.shape[1] * 2, 1, 2 * i + 2)
 
         # mean and variance plot
         a = fig.add_subplot(*sub1)
-        a.plot(means, c='k', alpha=.3)
-        plots.extend(a.plot(x, means.T[i], c=next(colors), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
-        a.fill_between(x,
-                        means.T[i] - 2 * np.sqrt(variances.T[i]),
-                        means.T[i] + 2 * np.sqrt(variances.T[i]),
-                        facecolor=plots[-1].get_color(),
-                        alpha=.3)
-        a.legend(borderaxespad=0.)
+        a.plot(means, c="k", alpha=0.3)
+        plots.extend(
+            a.plot(
+                x, means.T[i], c=next(colors), label=r"$\mathbf{{X_{{{}}}}}$".format(i)
+            )
+        )
+        a.fill_between(
+            x,
+            means.T[i] - 2 * np.sqrt(variances.T[i]),
+            means.T[i] + 2 * np.sqrt(variances.T[i]),
+            facecolor=plots[-1].get_color(),
+            alpha=0.3,
+        )
+        a.legend(borderaxespad=0.0)
         a.set_xlim(x.min(), x.max())
         if i < means.shape[1] - 1:
-            a.set_xticklabels('')
+            a.set_xticklabels("")
         # binary prob plot
         a = fig.add_subplot(*sub2)
-        a.bar(x,gamma[:,i],bottom=0.,linewidth=1.,width=1.0,align='center')
+        a.bar(x, gamma[:, i], bottom=0.0, linewidth=1.0, width=1.0, align="center")
         a.set_xlim(x.min(), x.max())
-        a.set_ylim([0.,1.])
+        a.set_ylim([0.0, 1.0])
     pb.draw()
-    fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
+    fig.tight_layout(h_pad=0.01)  # , rect=(0, 0, 1, .95))
     return fig
diff --git a/GPy/testing/test_grid.py b/GPy/testing/test_grid.py
index f46b95a6..dbd61183 100644
--- a/GPy/testing/test_grid.py
+++ b/GPy/testing/test_grid.py
@@ -29,6 +29,7 @@ class TestGridModel:
         self.dim = self.X.shape[1]
 
     def test_alpha_match(self):
+        self.setup()
         kernel = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
         m = GPy.models.GPRegressionGrid(self.X, self.Y, kernel)
 
@@ -38,6 +39,7 @@ class TestGridModel:
         np.testing.assert_almost_equal(m.posterior.alpha, m2.posterior.woodbury_vector)
 
     def test_gradient_match(self):
+        self.setup()
         kernel = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
         m = GPy.models.GPRegressionGrid(self.X, self.Y, kernel)
 
@@ -55,6 +57,7 @@ class TestGridModel:
         )
 
     def test_prediction_match(self):
+        self.setup()
         kernel = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
         m = GPy.models.GPRegressionGrid(self.X, self.Y, kernel)
 
diff --git a/GPy/testing/test_kernel.py b/GPy/testing/test_kernel.py
index bae1ed0b..2862e195 100644
--- a/GPy/testing/test_kernel.py
+++ b/GPy/testing/test_kernel.py
@@ -544,6 +544,7 @@ class TestKernelGradientContinuous:
         assert check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)
 
     def test_OU(self):
+        self.setup()
         k = GPy.kern.OU(self.D - 1, ARD=True)
         k.randomize()
         assert check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)