diff --git a/GPy/core/parameterization/priors.py b/GPy/core/parameterization/priors.py
index c4dfbc2a..3550a8b5 100644
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@@ -13,14 +13,15 @@ import weakref
 class Prior(object):
     domain = None
     _instance = None
+
     def __new__(cls, *args, **kwargs):
         if not cls._instance or cls._instance.__class__ is not cls:
-                newfunc = super(Prior, cls).__new__
-                if newfunc is object.__new__:
-                    cls._instance = newfunc(cls)
-                else:
-                    cls._instance = newfunc(cls, *args, **kwargs)
-                return cls._instance
+            newfunc = super(Prior, cls).__new__
+            if newfunc is object.__new__:
+                cls._instance = newfunc(cls)
+            else:
+                cls._instance = newfunc(cls, *args, **kwargs)
+            return cls._instance
 
     def pdf(self, x):
         return np.exp(self.lnpdf(x))
@@ -47,6 +48,7 @@ class Gaussian(Prior):
     .. Note:: Bishop 2006 notation is used throughout the code
 
     """
+
     domain = _REAL
     _instances = []
 
@@ -82,6 +84,7 @@ class Gaussian(Prior):
     def rvs(self, n):
         return np.random.randn(n) * self.sigma + self.mu
 
+
 #     def __getstate__(self):
 #         return self.mu, self.sigma
 #
@@ -91,6 +94,7 @@ class Gaussian(Prior):
 #         self.sigma2 = np.square(self.sigma)
 #         self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
 
+
 class Uniform(Prior):
     _instances = []
 
@@ -132,6 +136,7 @@ class Uniform(Prior):
     def rvs(self, n):
         return np.random.uniform(self.lower, self.upper, size=n)
 
+
 #     def __getstate__(self):
 #         return self.lower, self.upper
 #
@@ -139,6 +144,7 @@ class Uniform(Prior):
 #         self.lower = state[0]
 #         self.upper = state[1]
 
+
 class LogGaussian(Gaussian):
     """
     Implementation of the univariate *log*-Gaussian probability function, coupled with random variables.
@@ -149,6 +155,7 @@ class LogGaussian(Gaussian):
     .. Note:: Bishop 2006 notation is used throughout the code
 
     """
+
     domain = _POSITIVE
     _instances = []
 
@@ -160,7 +167,7 @@ class LogGaussian(Gaussian):
                     return instance()
         newfunc = super(Prior, cls).__new__
         if newfunc is object.__new__:
-            o = newfunc(cls)  
+            o = newfunc(cls)
         else:
             o = newfunc(cls, mu, sigma)
         cls._instances.append(weakref.ref(o))
@@ -176,10 +183,14 @@ class LogGaussian(Gaussian):
         return "lnN({:.2g}, {:.2g})".format(self.mu, self.sigma)
 
     def lnpdf(self, x):
-        return self.constant - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2 - np.log(x)
+        return (
+            self.constant
+            - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2
+            - np.log(x)
+        )
 
     def lnpdf_grad(self, x):
-        return -((np.log(x) - self.mu) / self.sigma2 + 1.) / x
+        return -((np.log(x) - self.mu) / self.sigma2 + 1.0) / x
 
     def rvs(self, n):
         return np.exp(np.random.randn(int(n)) * self.sigma + self.mu)
@@ -195,16 +206,15 @@ class MultivariateGaussian(Prior):
     .. Note:: Bishop 2006 notation is used throughout the code
 
     """
+
     domain = _REAL
     _instances = []
 
     def __new__(cls, mu=0, var=1):  # Singleton:
         if cls._instances:
-            cls._instances[:] = [instance for instance in cls._instances if
-                                 instance()]
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
-                if np.all(instance().mu == mu) and np.all(
-                        instance().var == var):
+                if np.all(instance().mu == mu) and np.all(instance().var == var):
                     return instance()
         newfunc = super(Prior, cls).__new__
         if newfunc is object.__new__:
@@ -217,16 +227,17 @@ class MultivariateGaussian(Prior):
     def __init__(self, mu, var):
         self.mu = np.array(mu).flatten()
         self.var = np.array(var)
-        assert len(self.var.shape) == 2, 'Covariance must be a matrix'
-        assert self.var.shape[0] == self.var.shape[1], \
-            'Covariance must be a square matrix'
+        assert len(self.var.shape) == 2, "Covariance must be a matrix"
+        assert (
+            self.var.shape[0] == self.var.shape[1]
+        ), "Covariance must be a square matrix"
         assert self.var.shape[0] == self.mu.size
         self.input_dim = self.mu.size
         self.inv, _, self.hld, _ = pdinv(self.var)
         self.constant = -0.5 * (self.input_dim * np.log(2 * np.pi) + self.hld)
 
     def __str__(self):
-        return 'MultiN(' + str(self.mu) + ', ' + str(np.diag(self.var)) + ')'
+        return "MultiN(" + str(self.mu) + ", " + str(np.diag(self.var)) + ")"
 
     def summary(self):
         raise NotImplementedError
@@ -243,7 +254,7 @@ class MultivariateGaussian(Prior):
     def lnpdf_grad(self, x):
         x = np.array(x).flatten()
         d = x - self.mu
-        return - np.dot(self.inv, d)
+        return -np.dot(self.inv, d)
 
     def rvs(self, n):
         return np.random.multivariate_normal(self.mu, self.var, n)
@@ -262,14 +273,16 @@ class MultivariateGaussian(Prior):
     def __setstate__(self, state):
         self.mu = np.array(state[0]).flatten()
         self.var = state[1]
-        assert len(self.var.shape) == 2, 'Covariance must be a matrix'
-        assert self.var.shape[0] == self.var.shape[1], \
-            'Covariance must be a square matrix'
+        assert len(self.var.shape) == 2, "Covariance must be a matrix"
+        assert (
+            self.var.shape[0] == self.var.shape[1]
+        ), "Covariance must be a square matrix"
         assert self.var.shape[0] == self.mu.size
         self.input_dim = self.mu.size
         self.inv, _, self.hld, _ = pdinv(self.var)
         self.constant = -0.5 * (self.input_dim * np.log(2 * np.pi) + self.hld)
 
+
 def gamma_from_EV(E, V):
     warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning)
     return Gamma.from_EV(E, V)
@@ -285,10 +298,11 @@ class Gamma(Prior):
     .. Note:: Bishop 2006 notation is used throughout the code
 
     """
+
     domain = _POSITIVE
     _instances = []
 
-    def __new__(cls, a=1, b=.5):  # Singleton:
+    def __new__(cls, a=1, b=0.5):  # Singleton:
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
@@ -319,24 +333,29 @@ class Gamma(Prior):
         return "Ga({:.2g}, {:.2g})".format(self.a, self.b)
 
     def summary(self):
-        ret = {"E[x]": self.a / self.b, \
-               "E[ln x]": digamma(self.a) - np.log(self.b), \
-               "var[x]": self.a / self.b / self.b, \
-               "Entropy": gammaln(self.a) - (self.a - 1.) * digamma(self.a) - np.log(self.b) + self.a}
+        ret = {
+            "E[x]": self.a / self.b,
+            "E[ln x]": digamma(self.a) - np.log(self.b),
+            "var[x]": self.a / self.b / self.b,
+            "Entropy": gammaln(self.a)
+            - (self.a - 1.0) * digamma(self.a)
+            - np.log(self.b)
+            + self.a,
+        }
         if self.a > 1:
-            ret['Mode'] = (self.a - 1.) / self.b
+            ret["Mode"] = (self.a - 1.0) / self.b
         else:
-            ret['mode'] = np.nan
+            ret["mode"] = np.nan
         return ret
 
     def lnpdf(self, x):
         return self.constant + (self.a - 1) * np.log(x) - self.b * x
 
     def lnpdf_grad(self, x):
-        return (self.a - 1.) / x - self.b
+        return (self.a - 1.0) / x - self.b
 
     def rvs(self, n):
-        return np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
+        return np.random.gamma(scale=1.0 / self.b, shape=self.a, size=n)
 
     @staticmethod
     def from_EV(E, V):
@@ -359,6 +378,7 @@ class Gamma(Prior):
         self._b = state[1]
         self.constant = -gammaln(self.a) + self.a * np.log(self.b)
 
+
 class InverseGamma(Gamma):
     """
     Implementation of the inverse-Gamma probability function, coupled with random variables.
@@ -369,6 +389,7 @@ class InverseGamma(Gamma):
     .. Note:: Bishop 2006 notation is used throughout the code
 
     """
+
     domain = _POSITIVE
     _instances = []
 
@@ -386,10 +407,11 @@ class InverseGamma(Gamma):
         return self.constant - (self.a + 1) * np.log(x) - self.b / x
 
     def lnpdf_grad(self, x):
-        return -(self.a + 1.) / x + self.b / x ** 2
+        return -(self.a + 1.0) / x + self.b / x**2
 
     def rvs(self, n):
-        return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
+        return 1.0 / np.random.gamma(scale=1.0 / self.b, shape=self.a, size=n)
+
 
 class DGPLVM_KFDA(Prior):
     """
@@ -403,6 +425,7 @@ class DGPLVM_KFDA(Prior):
     .. Note:: Surpassing Human-Level Face paper dgplvm implementation
 
     """
+
     domain = _REAL
     # _instances = []
     # def __new__(cls, lambdaa, sigma2):  # Singleton:
@@ -459,8 +482,8 @@ class DGPLVM_KFDA(Prior):
         lst_ni = []
         lst_ni1 = []
         lst_ni2 = []
-        f1 = (np.where(self.lbl[:, 0] == 1)[0])
-        f2 = (np.where(self.lbl[:, 1] == 1)[0])
+        f1 = np.where(self.lbl[:, 0] == 1)[0]
+        f2 = np.where(self.lbl[:, 1] == 1)[0]
         for idx in f1:
             lst_ni1.append(idx)
         for idx in f2:
@@ -474,11 +497,11 @@ class DGPLVM_KFDA(Prior):
         count = 0
         for N_i in lst_ni:
             if N_i == lst_ni[0]:
-                a[count:count + N_i] = (float(1) / N_i) * a[count]
+                a[count : count + N_i] = (float(1) / N_i) * a[count]
                 count += N_i
             else:
                 if N_i == lst_ni[1]:
-                    a[count: count + N_i] = -(float(1) / N_i) * a[count]
+                    a[count : count + N_i] = -(float(1) / N_i) * a[count]
                     count += N_i
         return a
 
@@ -486,8 +509,12 @@ class DGPLVM_KFDA(Prior):
         A = np.zeros((self.datanum, self.datanum))
         idx = 0
         for N_i in lst_ni:
-            B = float(1) / np.sqrt(N_i) * (np.eye(N_i) - ((float(1) / N_i) * np.ones((N_i, N_i))))
-            A[idx:idx + N_i, idx:idx + N_i] = B
+            B = (
+                float(1)
+                / np.sqrt(N_i)
+                * (np.eye(N_i) - ((float(1) / N_i) * np.ones((N_i, N_i))))
+            )
+            A[idx : idx + N_i, idx : idx + N_i] = B
             idx += N_i
         return A
 
@@ -498,9 +525,11 @@ class DGPLVM_KFDA(Prior):
         a_trans = np.transpose(self.a)
         paran = self.lambdaa * np.eye(x.shape[0]) + self.A.dot(K).dot(self.A)
         inv_part = pdinv(paran)[0]
-        J = a_trans.dot(K).dot(self.a) - a_trans.dot(K).dot(self.A).dot(inv_part).dot(self.A).dot(K).dot(self.a)
-        J_star = (1. / self.lambdaa) * J
-        return (-1. / self.sigma2) * J_star
+        J = a_trans.dot(K).dot(self.a) - a_trans.dot(K).dot(self.A).dot(inv_part).dot(
+            self.A
+        ).dot(K).dot(self.a)
+        J_star = (1.0 / self.lambdaa) * J
+        return (-1.0 / self.sigma2) * J_star
 
     # Here gradient function
     def lnpdf_grad(self, x):
@@ -511,15 +540,15 @@ class DGPLVM_KFDA(Prior):
         b = self.A.dot(inv_part).dot(self.A).dot(K).dot(self.a)
         a_Minus_b = self.a - b
         a_b_trans = np.transpose(a_Minus_b)
-        DJ_star_DK = (1. / self.lambdaa) * (a_Minus_b.dot(a_b_trans))
+        DJ_star_DK = (1.0 / self.lambdaa) * (a_Minus_b.dot(a_b_trans))
         DJ_star_DX = self.kern.gradients_X(DJ_star_DK, x)
-        return (-1. / self.sigma2) * DJ_star_DX
+        return (-1.0 / self.sigma2) * DJ_star_DX
 
     def rvs(self, n):
         return np.random.rand(n)  # A WRONG implementation
 
     def __str__(self):
-        return 'DGPLVM_prior'
+        return "DGPLVM_prior"
 
     def __getstate___(self):
         return self.lbl, self.lambdaa, self.sigma2, self.kern, self.x_shape
@@ -547,6 +576,7 @@ class DGPLVM(Prior):
     .. Note:: DGPLVM for Classification paper implementation
 
     """
+
     domain = _REAL
 
     def __new__(cls, sigma2, lbl, x_shape):
@@ -606,7 +636,7 @@ class DGPLVM(Prior):
         for i in data_idx:
             if len(lst_idx) == 0:
                 pass
-                #Do nothing, because it is the first time list is created so is empty
+                # Do nothing, because it is the first time list is created so is empty
             else:
                 lst_idx = []
             # Here we put indices of each class in to the list called lst_idx_all
@@ -631,9 +661,9 @@ class DGPLVM(Prior):
             N_i = float(len(cls[i]))
             W_WT = np.zeros((self.dim, self.dim))
             for xk in cls[i]:
-                W = (xk - M_i[i])
+                W = xk - M_i[i]
                 W_WT += np.outer(W, W)
-            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+            Sw += (N_i / self.datanum) * ((1.0 / N_i) * W_WT)
         return Sw
 
     # Calculating beta and Bi for Sb
@@ -658,7 +688,6 @@ class DGPLVM(Prior):
         Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
         return Sig_beta_B_i_all
 
-
     # Calculating W_j s separately so we can access all the W_j s anytime
     def compute_wj(self, data_idx, M_i):
         W_i = np.zeros((self.datanum, self.dim))
@@ -667,7 +696,7 @@ class DGPLVM(Prior):
             for tpl in data_idx[i]:
                 xj = tpl[1]
                 j = tpl[0]
-                W_i[j] = (xj - M_i[i])
+                W_i[j] = xj - M_i[i]
         return W_i
 
     # Calculating alpha and Wj for Sw
@@ -680,11 +709,11 @@ class DGPLVM(Prior):
                 for j in lst_idx_all[i]:
                     if k == j:
                         alpha = 1 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
                     else:
                         alpha = 0 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
-        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
+        Sig_alpha_W_i = (1.0 / self.datanum) * np.transpose(Sig_alpha_W_i)
         return Sig_alpha_W_i
 
     # This function calculates log of our prior
@@ -696,9 +725,9 @@ class DGPLVM(Prior):
         Sb = self.compute_Sb(cls, M_i, M_0)
         Sw = self.compute_Sw(cls, M_i)
         # sb_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
         return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
 
     # This function calculates derivative of the log of prior function
@@ -717,19 +746,20 @@ class DGPLVM(Prior):
 
         # Calculating inverse of Sb and its transpose and minus
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
         Sb_inv_N_trans = np.transpose(Sb_inv_N)
         Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
         Sw_trans = np.transpose(Sw)
 
         # Calculating DJ/DXk
         DJ_Dxk = 2 * (
-            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
-                Sig_alpha_W_i))
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
+            + Sb_inv_N_trans.dot(Sig_alpha_W_i)
+        )
         # Calculating derivative of the log of the prior
-        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        DPx_Dx = (-1 / self.sigma2) * DJ_Dxk
         return DPx_Dx.T
 
     # def frb(self, x):
@@ -744,7 +774,7 @@ class DGPLVM(Prior):
         return np.random.rand(n)  # A WRONG implementation
 
     def __str__(self):
-        return 'DGPLVM_prior_Raq'
+        return "DGPLVM_prior_Raq"
 
 
 # ******************************************
@@ -752,6 +782,7 @@ class DGPLVM(Prior):
 from . import Parameterized
 from . import Param
 
+
 class DGPLVM_Lamda(Prior, Parameterized):
     """
     Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
@@ -761,6 +792,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
     .. Note:: DGPLVM for Classification paper implementation
 
     """
+
     domain = _REAL
     # _instances = []
     # def __new__(cls, mu, sigma): # Singleton:
@@ -773,7 +805,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
     #     cls._instances.append(weakref.ref(o))
     #     return cls._instances[-1]()
 
-    def __init__(self, sigma2, lbl, x_shape, lamda, name='DP_prior'):
+    def __init__(self, sigma2, lbl, x_shape, lamda, name="DP_prior"):
         super(DGPLVM_Lamda, self).__init__(name=name)
         self.sigma2 = sigma2
         # self.x = x
@@ -783,7 +815,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
         self.datanum = lbl.shape[0]
         self.x_shape = x_shape
         self.dim = x_shape[1]
-        self.lamda = Param('lamda', np.diag(lamda))
+        self.lamda = Param("lamda", np.diag(lamda))
         self.link_parameter(self.lamda)
 
     def get_class_label(self, y):
@@ -831,7 +863,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
         for i in data_idx:
             if len(lst_idx) == 0:
                 pass
-                #Do nothing, because it is the first time list is created so is empty
+                # Do nothing, because it is the first time list is created so is empty
             else:
                 lst_idx = []
             # Here we put indices of each class in to the list called lst_idx_all
@@ -856,9 +888,9 @@ class DGPLVM_Lamda(Prior, Parameterized):
             N_i = float(len(cls[i]))
             W_WT = np.zeros((self.dim, self.dim))
             for xk in cls[i]:
-                W = (xk - M_i[i])
+                W = xk - M_i[i]
                 W_WT += np.outer(W, W)
-            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+            Sw += (N_i / self.datanum) * ((1.0 / N_i) * W_WT)
         return Sw
 
     # Calculating beta and Bi for Sb
@@ -883,7 +915,6 @@ class DGPLVM_Lamda(Prior, Parameterized):
         Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
         return Sig_beta_B_i_all
 
-
     # Calculating W_j s separately so we can access all the W_j s anytime
     def compute_wj(self, data_idx, M_i):
         W_i = np.zeros((self.datanum, self.dim))
@@ -892,7 +923,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
             for tpl in data_idx[i]:
                 xj = tpl[1]
                 j = tpl[0]
-                W_i[j] = (xj - M_i[i])
+                W_i[j] = xj - M_i[i]
         return W_i
 
     # Calculating alpha and Wj for Sw
@@ -905,11 +936,11 @@ class DGPLVM_Lamda(Prior, Parameterized):
                 for j in lst_idx_all[i]:
                     if k == j:
                         alpha = 1 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
                     else:
                         alpha = 0 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
-        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
+        Sig_alpha_W_i = (1.0 / self.datanum) * np.transpose(Sig_alpha_W_i)
         return Sig_alpha_W_i
 
     # This function calculates log of our prior
@@ -917,7 +948,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
         x = x.reshape(self.x_shape)
 
         #!!!!!!!!!!!!!!!!!!!!!!!!!!!
-        #self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()
+        # self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()
 
         xprime = x.dot(np.diagflat(self.lamda))
         x = xprime
@@ -928,9 +959,9 @@ class DGPLVM_Lamda(Prior, Parameterized):
         Sb = self.compute_Sb(cls, M_i, M_0)
         Sw = self.compute_Sw(cls, M_i)
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.9)[0]
         return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
 
     # This function calculates derivative of the log of prior function
@@ -952,19 +983,20 @@ class DGPLVM_Lamda(Prior, Parameterized):
 
         # Calculating inverse of Sb and its transpose and minus
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.9)[0]
         Sb_inv_N_trans = np.transpose(Sb_inv_N)
         Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
         Sw_trans = np.transpose(Sw)
 
         # Calculating DJ/DXk
         DJ_Dxk = 2 * (
-            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
-                Sig_alpha_W_i))
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
+            + Sb_inv_N_trans.dot(Sig_alpha_W_i)
+        )
         # Calculating derivative of the log of the prior
-        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        DPx_Dx = (-1 / self.sigma2) * DJ_Dxk
 
         DPxprim_Dx = np.diagflat(self.lamda).dot(DPx_Dx)
 
@@ -980,7 +1012,6 @@ class DGPLVM_Lamda(Prior, Parameterized):
         # print DPxprim_Dx
         return DPxprim_Dx
 
-
     # def frb(self, x):
     #     from functools import partial
     #     from GPy.models import GradientChecker
@@ -993,10 +1024,12 @@ class DGPLVM_Lamda(Prior, Parameterized):
         return np.random.rand(n)  # A WRONG implementation
 
     def __str__(self):
-        return 'DGPLVM_prior_Raq_Lamda'
+        return "DGPLVM_prior_Raq_Lamda"
+
 
 # ******************************************
 
+
 class DGPLVM_T(Prior):
     """
     Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
@@ -1006,6 +1039,7 @@ class DGPLVM_T(Prior):
     .. Note:: DGPLVM for Classification paper implementation
 
     """
+
     domain = _REAL
     # _instances = []
     # def __new__(cls, mu, sigma): # Singleton:
@@ -1028,7 +1062,6 @@ class DGPLVM_T(Prior):
         self.dim = x_shape[1]
         self.vec = vec
 
-
     def get_class_label(self, y):
         for idx, v in enumerate(y):
             if v == 1:
@@ -1075,7 +1108,7 @@ class DGPLVM_T(Prior):
         for i in data_idx:
             if len(lst_idx) == 0:
                 pass
-                #Do nothing, because it is the first time list is created so is empty
+                # Do nothing, because it is the first time list is created so is empty
             else:
                 lst_idx = []
             # Here we put indices of each class in to the list called lst_idx_all
@@ -1100,9 +1133,9 @@ class DGPLVM_T(Prior):
             N_i = float(len(cls[i]))
             W_WT = np.zeros((self.dim, self.dim))
             for xk in cls[i]:
-                W = (xk - M_i[i])
+                W = xk - M_i[i]
                 W_WT += np.outer(W, W)
-            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+            Sw += (N_i / self.datanum) * ((1.0 / N_i) * W_WT)
         return Sw
 
     # Calculating beta and Bi for Sb
@@ -1127,7 +1160,6 @@ class DGPLVM_T(Prior):
         Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
         return Sig_beta_B_i_all
 
-
     # Calculating W_j s separately so we can access all the W_j s anytime
     def compute_wj(self, data_idx, M_i):
         W_i = np.zeros((self.datanum, self.dim))
@@ -1136,7 +1168,7 @@ class DGPLVM_T(Prior):
             for tpl in data_idx[i]:
                 xj = tpl[1]
                 j = tpl[0]
-                W_i[j] = (xj - M_i[i])
+                W_i[j] = xj - M_i[i]
         return W_i
 
     # Calculating alpha and Wj for Sw
@@ -1149,11 +1181,11 @@ class DGPLVM_T(Prior):
                 for j in lst_idx_all[i]:
                     if k == j:
                         alpha = 1 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
                     else:
                         alpha = 0 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
-        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
+        Sig_alpha_W_i = (1.0 / self.datanum) * np.transpose(Sig_alpha_W_i)
         return Sig_alpha_W_i
 
     # This function calculates log of our prior
@@ -1168,10 +1200,10 @@ class DGPLVM_T(Prior):
         Sb = self.compute_Sb(cls, M_i, M_0)
         Sw = self.compute_Sw(cls, M_i)
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #print 'SB_inv: ', Sb_inv_N
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # print 'SB_inv: ', Sb_inv_N
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
         return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
 
     # This function calculates derivative of the log of prior function
@@ -1193,20 +1225,21 @@ class DGPLVM_T(Prior):
 
         # Calculating inverse of Sb and its transpose and minus
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #print 'SB_inv: ',Sb_inv_N
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # print 'SB_inv: ',Sb_inv_N
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
         Sb_inv_N_trans = np.transpose(Sb_inv_N)
         Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
         Sw_trans = np.transpose(Sw)
 
         # Calculating DJ/DXk
         DJ_Dxk = 2 * (
-            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
-                Sig_alpha_W_i))
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
+            + Sb_inv_N_trans.dot(Sig_alpha_W_i)
+        )
         # Calculating derivative of the log of the prior
-        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        DPx_Dx = (-1 / self.sigma2) * DJ_Dxk
         return DPx_Dx.T
 
     # def frb(self, x):
@@ -1221,9 +1254,7 @@ class DGPLVM_T(Prior):
         return np.random.rand(n)  # A WRONG implementation
 
     def __str__(self):
-        return 'DGPLVM_prior_Raq_TTT'
-
-
+        return "DGPLVM_prior_Raq_TTT"
 
 
 class HalfT(Prior):
@@ -1234,6 +1265,7 @@ class HalfT(Prior):
     :param nu: degrees of freedom
 
     """
+
     domain = _POSITIVE
     _instances = []
 
@@ -1250,13 +1282,22 @@ class HalfT(Prior):
     def __init__(self, A, nu):
         self.A = float(A)
         self.nu = float(nu)
-        self.constant = gammaln(.5*(self.nu+1.)) - gammaln(.5*self.nu) - .5*np.log(np.pi*self.A*self.nu)
+        self.constant = (
+            gammaln(0.5 * (self.nu + 1.0))
+            - gammaln(0.5 * self.nu)
+            - 0.5 * np.log(np.pi * self.A * self.nu)
+        )
 
     def __str__(self):
         return "hT({:.2g}, {:.2g})".format(self.A, self.nu)
 
     def lnpdf(self, theta):
-        return (theta > 0) * (self.constant - .5*(self.nu + 1) * np.log(1. + (1./self.nu) * (theta/self.A)**2))
+        return (theta > 0) * (
+            self.constant
+            - 0.5
+            * (self.nu + 1)
+            * np.log(1.0 + (1.0 / self.nu) * (theta / self.A) ** 2)
+        )
 
         # theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
         # lnpdfs = np.zeros_like(theta)
@@ -1268,7 +1309,7 @@ class HalfT(Prior):
         # lnpdfs[above_zero] = (+ gammaln((v + 1) * 0.5)
         #     - gammaln(v * 0.5)
         #     - 0.5*np.log(sigma2 * v * np.pi)
-        #     - 0.5*(v + 1)*np.log(1 + (1/np.float(v))*((theta[above_zero][0]**2)/sigma2))
+        #     - 0.5*(v + 1)*np.log(1 + (1/float(v))*((theta[above_zero][0]**2)/sigma2))
         # )
         # return lnpdfs
 
@@ -1278,12 +1319,18 @@ class HalfT(Prior):
         above_zero = theta > 1e-6
         v = self.nu
         sigma2 = self.A
-        grad[above_zero] = -0.5*(v+1)*(2*theta[above_zero])/(v*sigma2 + theta[above_zero][0]**2)
+        grad[above_zero] = (
+            -0.5
+            * (v + 1)
+            * (2 * theta[above_zero])
+            / (v * sigma2 + theta[above_zero][0] ** 2)
+        )
         return grad
 
     def rvs(self, n):
         # return np.random.randn(n) * self.sigma + self.mu
         from scipy.stats import t
+
         # [np.abs(x) for x in t.rvs(df=4,loc=0,scale=50, size=10000)])
         ret = t.rvs(self.nu, loc=0, scale=self.A, size=n)
         ret[ret < 0] = 0
@@ -1298,6 +1345,7 @@ class Exponential(Prior):
     :param l: shape parameter
 
     """
+
     domain = _POSITIVE
     _instances = []
 
@@ -1318,22 +1366,25 @@ class Exponential(Prior):
         return "Exp({:.2g})".format(self.l)
 
     def summary(self):
-        ret = {"E[x]": 1. / self.l,
-               "E[ln x]": np.nan,
-               "var[x]": 1. / self.l**2,
-               "Entropy": 1. - np.log(self.l),
-               "Mode": 0.}
+        ret = {
+            "E[x]": 1.0 / self.l,
+            "E[ln x]": np.nan,
+            "var[x]": 1.0 / self.l**2,
+            "Entropy": 1.0 - np.log(self.l),
+            "Mode": 0.0,
+        }
         return ret
 
     def lnpdf(self, x):
         return np.log(self.l) - self.l * x
 
     def lnpdf_grad(self, x):
-        return - self.l
+        return -self.l
 
     def rvs(self, n):
         return np.random.exponential(scale=self.l, size=n)
 
+
 class StudentT(Prior):
     """
     Implementation of the student t probability function, coupled with random variables.
@@ -1345,6 +1396,7 @@ class StudentT(Prior):
     .. Note:: Bishop 2006 notation is used throughout the code
 
     """
+
     domain = _REAL
     _instances = []
 
@@ -1352,7 +1404,11 @@ class StudentT(Prior):
         if cls._instances:
             cls._instances[:] = [instance for instance in cls._instances if instance()]
             for instance in cls._instances:
-                if instance().mu == mu and instance().sigma == sigma and instance().nu == nu:
+                if (
+                    instance().mu == mu
+                    and instance().sigma == sigma
+                    and instance().nu == nu
+                ):
                     return instance()
         newfunc = super(Prior, cls).__new__
         if newfunc is object.__new__:
@@ -1373,13 +1429,18 @@ class StudentT(Prior):
 
     def lnpdf(self, x):
         from scipy.stats import t
-        return t.logpdf(x,self.nu,self.mu,self.sigma)
+
+        return t.logpdf(x, self.nu, self.mu, self.sigma)
 
     def lnpdf_grad(self, x):
-        return -(self.nu + 1.)*(x - self.mu)/( self.nu*self.sigma2 + np.square(x - self.mu) )
+        return (
+            -(self.nu + 1.0)
+            * (x - self.mu)
+            / (self.nu * self.sigma2 + np.square(x - self.mu))
+        )
 
     def rvs(self, n):
         from scipy.stats import t
+
         ret = t.rvs(self.nu, loc=self.mu, scale=self.sigma, size=n)
         return ret
-
diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py
index e8de3c40..6c97a5d8 100644
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@@ -12,6 +12,7 @@ from ..core.parameterization import Param
 from paramz.transformations import Logexp
 from scipy.special import psi as digamma
 
+
 class StudentT(Likelihood):
     """
     Student T likelihood
@@ -22,17 +23,18 @@ class StudentT(Likelihood):
         p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
 
     """
-    def __init__(self,gp_link=None, deg_free=5, sigma2=2):
+
+    def __init__(self, gp_link=None, deg_free=5, sigma2=2):
         if gp_link is None:
             gp_link = link_functions.Identity()
 
-        super(StudentT, self).__init__(gp_link, name='Student_T')
+        super(StudentT, self).__init__(gp_link, name="Student_T")
         # sigma2 is not a noise parameter, it is a squared scale.
-        self.sigma2 = Param('t_scale2', float(sigma2), Logexp())
-        self.v = Param('deg_free', float(deg_free), Logexp())
+        self.sigma2 = Param("t_scale2", float(sigma2), Logexp())
+        self.v = Param("deg_free", float(deg_free), Logexp())
         self.link_parameter(self.sigma2)
         self.link_parameter(self.v)
-        #self.v.constrain_fixed()
+        # self.v.constrain_fixed()
 
         self.log_concave = False
 
@@ -61,11 +63,14 @@ class StudentT(Likelihood):
         """
         assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
         e = y - inv_link_f
-        #Careful gamma(big_number) is infinity!
-        objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
-                     / (np.sqrt(self.v * np.pi * self.sigma2)))
-                     * ((1 + (1./float(self.v))*((e**2)/float(self.sigma2)))**(-0.5*(self.v + 1)))
-                    )
+        # Careful gamma(big_number) is infinity!
+        objective = (
+            np.exp(gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5))
+            / (np.sqrt(self.v * np.pi * self.sigma2))
+        ) * (
+            (1 + (1.0 / float(self.v)) * ((e**2) / float(self.sigma2)))
+            ** (-0.5 * (self.v + 1))
+        )
         return np.prod(objective)
 
     def logpdf_link(self, inv_link_f, y, Y_metadata=None):
@@ -85,15 +90,16 @@ class StudentT(Likelihood):
 
         """
         e = y - inv_link_f
-        #FIXME:
-        #Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?!
-        #But np.log(1 + (1/float(self.v))*((y-inv_link_f)**2)/self.sigma2) throws it correctly
-        #print - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
-        objective = (+ gammaln((self.v + 1) * 0.5)
-                    - gammaln(self.v * 0.5)
-                    - 0.5*np.log(self.sigma2 * self.v * np.pi)
-                    - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
-                    )
+        # FIXME:
+        # Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?!
+        # But np.log(1 + (1/float(self.v))*((y-inv_link_f)**2)/self.sigma2) throws it correctly
+        # print - 0.5*(self.v + 1)*np.log(1 + (1/(self.v))*((e**2)/self.sigma2))
+        objective = (
+            +gammaln((self.v + 1) * 0.5)
+            - gammaln(self.v * 0.5)
+            - 0.5 * np.log(self.sigma2 * self.v * np.pi)
+            - 0.5 * (self.v + 1) * np.log(1 + (1 / (self.v)) * ((e**2) / self.sigma2))
+        )
         return objective
 
     def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
@@ -138,7 +144,9 @@ class StudentT(Likelihood):
             (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
         """
         e = y - inv_link_f
-        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
+        hess = ((self.v + 1) * (e**2 - self.v * self.sigma2)) / (
+            (self.sigma2 * self.v + e**2) ** 2
+        )
         return hess
 
     def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
@@ -157,9 +165,9 @@ class StudentT(Likelihood):
         :rtype: Nx1 array
         """
         e = y - inv_link_f
-        d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
-                       ((e**2 + self.sigma2*self.v)**3)
-                    )
+        d3lik_dlink3 = -(
+            2 * (self.v + 1) * (-e) * (e**2 - 3 * self.v * self.sigma2)
+        ) / ((e**2 + self.sigma2 * self.v) ** 3)
         return d3lik_dlink3
 
     def dlogpdf_link_dvar(self, inv_link_f, y, Y_metadata=None):
@@ -179,7 +187,11 @@ class StudentT(Likelihood):
         """
         e = y - inv_link_f
         e2 = np.square(e)
-        dlogpdf_dvar = self.v*(e2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e2))
+        dlogpdf_dvar = (
+            self.v
+            * (e2 - self.sigma2)
+            / (2 * self.sigma2 * (self.sigma2 * self.v + e2))
+        )
         return dlogpdf_dvar
 
     def dlogpdf_dlink_dvar(self, inv_link_f, y, Y_metadata=None):
@@ -198,7 +210,9 @@ class StudentT(Likelihood):
         :rtype: Nx1 array
         """
         e = y - inv_link_f
-        dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
+        dlogpdf_dlink_dvar = (self.v * (self.v + 1) * (-e)) / (
+            (self.sigma2 * self.v + e**2) ** 2
+        )
         return dlogpdf_dlink_dvar
 
     def d2logpdf_dlink2_dvar(self, inv_link_f, y, Y_metadata=None):
@@ -217,9 +231,9 @@ class StudentT(Likelihood):
         :rtype: Nx1 array
         """
         e = y - inv_link_f
-        d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
-                              / ((self.sigma2*self.v + (e**2))**3)
-                           )
+        d2logpdf_dlink2_dvar = (
+            self.v * (self.v + 1) * (self.sigma2 * self.v - 3 * (e**2))
+        ) / ((self.sigma2 * self.v + (e**2)) ** 3)
         return d2logpdf_dlink2_dvar
 
     def dlogpdf_link_dv(self, inv_link_f, y, Y_metadata=None):
@@ -227,9 +241,11 @@ class StudentT(Likelihood):
         e2 = np.square(e)
         df = float(self.v[:])
         s2 = float(self.sigma2[:])
-        dlogpdf_dv =  0.5*digamma(0.5*(df+1)) - 0.5*digamma(0.5*df) - 1.0/(2*df)
-        dlogpdf_dv += 0.5*(df+1)*e2/(df*(e2 + s2*df))
-        dlogpdf_dv -= 0.5*np.log1p(e2/(s2*df))
+        dlogpdf_dv = (
+            0.5 * digamma(0.5 * (df + 1)) - 0.5 * digamma(0.5 * df) - 1.0 / (2 * df)
+        )
+        dlogpdf_dv += 0.5 * (df + 1) * e2 / (df * (e2 + s2 * df))
+        dlogpdf_dv -= 0.5 * np.log1p(e2 / (s2 * df))
         return dlogpdf_dv
 
     def dlogpdf_dlink_dv(self, inv_link_f, y, Y_metadata=None):
@@ -237,7 +253,7 @@ class StudentT(Likelihood):
         e2 = np.square(e)
         df = float(self.v[:])
         s2 = float(self.sigma2[:])
-        dlogpdf_df_dv = e*(e2 - self.sigma2)/(e2 + s2*df)**2
+        dlogpdf_df_dv = e * (e2 - self.sigma2) / (e2 + s2 * df) ** 2
         return dlogpdf_df_dv
 
     def d2logpdf_dlink2_dv(self, inv_link_f, y, Y_metadata=None):
@@ -245,8 +261,10 @@ class StudentT(Likelihood):
         e2 = np.square(e)
         df = float(self.v[:])
         s2 = float(self.sigma2[:])
-        e2_s2v = e**2 + s2*df
-        d2logpdf_df2_dv = (-s2*(df+1) + e2 - s2*df)/e2_s2v**2 - 2*s2*(df+1)*(e2 - s2*df)/e2_s2v**3
+        e2_s2v = e**2 + s2 * df
+        d2logpdf_df2_dv = (-s2 * (df + 1) + e2 - s2 * df) / e2_s2v**2 - 2 * s2 * (
+            df + 1
+        ) * (e2 - s2 * df) / e2_s2v**3
         return d2logpdf_df2_dv
 
     def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
@@ -266,19 +284,23 @@ class StudentT(Likelihood):
 
     def predictive_mean(self, mu, sigma, Y_metadata=None):
         # The comment here confuses mean and median.
-        return self.gp_link.transf(mu) # only true if link is monotonic, which it is.
+        return self.gp_link.transf(mu)  # only true if link is monotonic, which it is.
 
-    def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
-        if self.deg_free<=2.:
-            return np.empty(mu.shape)*np.nan # does not exist for degrees of freedom <= 2.
+    def predictive_variance(self, mu, variance, predictive_mean=None, Y_metadata=None):
+        if self.deg_free <= 2.0:
+            return (
+                np.empty(mu.shape) * np.nan
+            )  # does not exist for degrees of freedom <= 2.
         else:
-            return super(StudentT, self).predictive_variance(mu, variance, predictive_mean, Y_metadata)
+            return super(StudentT, self).predictive_variance(
+                mu, variance, predictive_mean, Y_metadata
+            )
 
     def conditional_mean(self, gp):
         return self.gp_link.transf(gp)
 
     def conditional_variance(self, gp):
-        return self.deg_free/(self.deg_free - 2.)
+        return self.deg_free / (self.deg_free - 2.0)
 
     def samples(self, gp, Y_metadata=None):
         """
@@ -288,11 +310,10 @@ class StudentT(Likelihood):
         """
         orig_shape = gp.shape
         gp = gp.flatten()
-        #FIXME: Very slow as we are computing a new random variable per input!
-        #Can't get it to sample all at the same time
-        #student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
-        dfs = np.ones_like(gp)*self.v
-        scales = np.ones_like(gp)*np.sqrt(self.sigma2)
-        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp),
-                                        scale=scales)
+        # FIXME: Very slow as we are computing a new random variable per input!
+        # Can't get it to sample all at the same time
+        # student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
+        dfs = np.ones_like(gp) * self.v
+        scales = np.ones_like(gp) * np.sqrt(self.sigma2)
+        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp), scale=scales)
         return student_t_samples.reshape(orig_shape)
diff --git a/benchmarks/regression/evaluation.py b/benchmarks/regression/evaluation.py
index c57bce7e..7de8d5ae 100644
--- a/benchmarks/regression/evaluation.py
+++ b/benchmarks/regression/evaluation.py
@@ -4,18 +4,19 @@
 import abc
 import numpy as np
 
+
 class Evaluation(object):
     __metaclass__ = abc.ABCMeta
-    
+
     @abc.abstractmethod
     def evaluate(self, gt, pred):
         """Compute a scalar for access the performance"""
         return None
 
+
 class RMSE(Evaluation):
     "Rooted Mean Square Error"
-    name = 'RMSE'
-    
+    name = "RMSE"
+
     def evaluate(self, gt, pred):
-        return np.sqrt(np.square(gt-pred).astype(np.float).mean())
-    
+        return np.sqrt(np.square(gt - pred).astype(float).mean())