Merge pull request #1040 from SheffieldML/1030-move-from-nose-to-pytests

1030 move from nose to pytests
2026-07-23 17:01:06 +02:00 · 2023-11-16 18:50:29 +01:00 · 2023-11-16 18:50:29 +01:00 · 328312f342
commit 328312f342
parent 279f8e83f1 caafcbf1d7
75 changed files with 11948 additions and 7446 deletions
--- a/.gitignore
+++ b/.gitignore
@ -55,4 +55,9 @@ iterate.dat
 GPy*.rst

 # vscode
-settings.json
+settings.json
+
+# local dev
+.eggs
+.venv
+.env
--- a/.travis.yml
+++ b/.travis.yml
@ -20,12 +20,17 @@ env:
  - PYTHON_VERSION=3.7
  - PYTHON_VERSION=3.8
  - PYTHON_VERSION=3.9
+  - PYTHON_VERSION=3.10
+  - PYTHON_VERSION=3.11
+  - PYTHON_VERSION=3.12
+  # TODO: add more recent Python versions? To be addressed later in the issue about our claim that we follow NumPy.

 before_install:
 - wget https://github.com/mzwiessele/travis_scripts/raw/master/download_miniconda.sh
 - wget https://github.com/mzwiessele/travis_scripts/raw/master/install_retry.sh
 - source download_miniconda.sh
 - echo $PATH
+# Why not clone a miniconda container instead?

 install:
 - echo $PATH
@ -39,7 +44,6 @@ install:
 - pip install pypandoc
 - pip install git+git://github.com/BRML/climin.git
 - pip install autograd
- pip install nose-show-skipped
 - python setup.py develop

 script:
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,8 @@
 # Changelog

+## Unreleased
+
+* Change the test runner from `nose` (`nosetests`) to `pytest`

 ## v1.9.8 (2019-05-17)

--- a/GPy/init.py
+++ b/GPy/init.py
@ -1,6 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import warnings
+
 warnings.filterwarnings("ignore", category=DeprecationWarning)

 from . import core
@ -18,30 +19,25 @@ from .util import normalizer

 # backwards compatibility
 import sys
-backwards_compatibility = ['lists_and_dicts', 'observable_array', 'index_operations']
+
+backwards_compatibility = ["lists_and_dicts", "observable_array", "index_operations"]
 for bc in backwards_compatibility:
-    sys.modules['GPy.core.parameterization.{!s}'.format(bc)] = getattr(core.parameterization, bc)
+    sys.modules["GPy.core.parameterization.{!s}".format(bc)] = getattr(
+        core.parameterization, bc
+    )

 # Direct imports for convenience:
 from .core import Model
 from .core.parameterization import priors
-from .core.parameterization import Param, Parameterized, ObsAr, transformations as constraints
+from .core.parameterization import (
+    Param,
+    Parameterized,
+    ObsAr,
+    transformations as constraints,
+)

 from .__version__ import __version__

-from numpy.testing import Tester
-
-with warnings.catch_warnings():
-    warnings.simplefilter('ignore')
-    try:
-        #Get rid of nose dependency by only ignoring if you have nose installed
-        from nose.tools import nottest
-        @nottest
-        def tests(verbose=10):
-            Tester(testing).test(verbose=verbose)
-    except:
-        def tests(verbose=10):
-            Tester(testing).test(verbose=verbose)

 def load(file_or_path):
    """
@ -52,10 +48,12 @@ def load(file_or_path):
    # This is the pickling pain when changing _src -> src
    import sys
    import inspect
-    sys.modules['GPy.kern._src'] = kern.src
+
+    sys.modules["GPy.kern._src"] = kern.src
    for name, module in inspect.getmembers(kern.src):
-        if not name.startswith('_'):
-            sys.modules['GPy.kern._src.{}'.format(name)] = module
-    sys.modules['GPy.inference.optimization'] = inference.optimization
+        if not name.startswith("_"):
+            sys.modules["GPy.kern._src.{}".format(name)] = module
+    sys.modules["GPy.inference.optimization"] = inference.optimization
    import paramz
+
    return paramz.load(file_or_path)
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@ -13,14 +13,15 @@ import weakref
 class Prior(object):
    domain = None
    _instance = None
+
    def __new__(cls, *args, **kwargs):
        if not cls._instance or cls._instance.__class__ is not cls:
-                newfunc = super(Prior, cls).__new__
-                if newfunc is object.__new__:
-                    cls._instance = newfunc(cls)
-                else:
-                    cls._instance = newfunc(cls, *args, **kwargs)
-                return cls._instance
+            newfunc = super(Prior, cls).__new__
+            if newfunc is object.__new__:
+                cls._instance = newfunc(cls)
+            else:
+                cls._instance = newfunc(cls, *args, **kwargs)
+            return cls._instance

    def pdf(self, x):
        return np.exp(self.lnpdf(x))
@ -47,6 +48,7 @@ class Gaussian(Prior):
    .. Note:: Bishop 2006 notation is used throughout the code

    """
+
    domain = _REAL
    _instances = []

@ -82,6 +84,7 @@ class Gaussian(Prior):
    def rvs(self, n):
        return np.random.randn(n) * self.sigma + self.mu

+
 #     def __getstate__(self):
 #         return self.mu, self.sigma
 #
@ -91,6 +94,7 @@ class Gaussian(Prior):
 #         self.sigma2 = np.square(self.sigma)
 #         self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)

+
 class Uniform(Prior):
    _instances = []

@ -132,6 +136,7 @@ class Uniform(Prior):
    def rvs(self, n):
        return np.random.uniform(self.lower, self.upper, size=n)

+
 #     def __getstate__(self):
 #         return self.lower, self.upper
 #
@ -139,6 +144,7 @@ class Uniform(Prior):
 #         self.lower = state[0]
 #         self.upper = state[1]

+
 class LogGaussian(Gaussian):
    """
    Implementation of the univariate *log*-Gaussian probability function, coupled with random variables.
@ -149,6 +155,7 @@ class LogGaussian(Gaussian):
    .. Note:: Bishop 2006 notation is used throughout the code

    """
+
    domain = _POSITIVE
    _instances = []

@ -160,7 +167,7 @@ class LogGaussian(Gaussian):
                    return instance()
        newfunc = super(Prior, cls).__new__
        if newfunc is object.__new__:
-            o = newfunc(cls)  
+            o = newfunc(cls)
        else:
            o = newfunc(cls, mu, sigma)
        cls._instances.append(weakref.ref(o))
@ -176,10 +183,14 @@ class LogGaussian(Gaussian):
        return "lnN({:.2g}, {:.2g})".format(self.mu, self.sigma)

    def lnpdf(self, x):
-        return self.constant - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2 - np.log(x)
+        return (
+            self.constant
+            - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2
+            - np.log(x)
+        )

    def lnpdf_grad(self, x):
-        return -((np.log(x) - self.mu) / self.sigma2 + 1.) / x
+        return -((np.log(x) - self.mu) / self.sigma2 + 1.0) / x

    def rvs(self, n):
        return np.exp(np.random.randn(int(n)) * self.sigma + self.mu)
@ -195,16 +206,15 @@ class MultivariateGaussian(Prior):
    .. Note:: Bishop 2006 notation is used throughout the code

    """
+
    domain = _REAL
    _instances = []

    def __new__(cls, mu=0, var=1):  # Singleton:
        if cls._instances:
-            cls._instances[:] = [instance for instance in cls._instances if
-                                 instance()]
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
            for instance in cls._instances:
-                if np.all(instance().mu == mu) and np.all(
-                        instance().var == var):
+                if np.all(instance().mu == mu) and np.all(instance().var == var):
                    return instance()
        newfunc = super(Prior, cls).__new__
        if newfunc is object.__new__:
@ -217,16 +227,17 @@ class MultivariateGaussian(Prior):
    def __init__(self, mu, var):
        self.mu = np.array(mu).flatten()
        self.var = np.array(var)
-        assert len(self.var.shape) == 2, 'Covariance must be a matrix'
-        assert self.var.shape[0] == self.var.shape[1], \
-            'Covariance must be a square matrix'
+        assert len(self.var.shape) == 2, "Covariance must be a matrix"
+        assert (
+            self.var.shape[0] == self.var.shape[1]
+        ), "Covariance must be a square matrix"
        assert self.var.shape[0] == self.mu.size
        self.input_dim = self.mu.size
        self.inv, _, self.hld, _ = pdinv(self.var)
        self.constant = -0.5 * (self.input_dim * np.log(2 * np.pi) + self.hld)

    def __str__(self):
-        return 'MultiN(' + str(self.mu) + ', ' + str(np.diag(self.var)) + ')'
+        return "MultiN(" + str(self.mu) + ", " + str(np.diag(self.var)) + ")"

    def summary(self):
        raise NotImplementedError
@ -243,7 +254,7 @@ class MultivariateGaussian(Prior):
    def lnpdf_grad(self, x):
        x = np.array(x).flatten()
        d = x - self.mu
-        return - np.dot(self.inv, d)
+        return -np.dot(self.inv, d)

    def rvs(self, n):
        return np.random.multivariate_normal(self.mu, self.var, n)
@ -262,14 +273,16 @@ class MultivariateGaussian(Prior):
    def __setstate__(self, state):
        self.mu = np.array(state[0]).flatten()
        self.var = state[1]
-        assert len(self.var.shape) == 2, 'Covariance must be a matrix'
-        assert self.var.shape[0] == self.var.shape[1], \
-            'Covariance must be a square matrix'
+        assert len(self.var.shape) == 2, "Covariance must be a matrix"
+        assert (
+            self.var.shape[0] == self.var.shape[1]
+        ), "Covariance must be a square matrix"
        assert self.var.shape[0] == self.mu.size
        self.input_dim = self.mu.size
        self.inv, _, self.hld, _ = pdinv(self.var)
        self.constant = -0.5 * (self.input_dim * np.log(2 * np.pi) + self.hld)

+
 def gamma_from_EV(E, V):
    warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning)
    return Gamma.from_EV(E, V)
@ -285,10 +298,11 @@ class Gamma(Prior):
    .. Note:: Bishop 2006 notation is used throughout the code

    """
+
    domain = _POSITIVE
    _instances = []

-    def __new__(cls, a=1, b=.5):  # Singleton:
+    def __new__(cls, a=1, b=0.5):  # Singleton:
        if cls._instances:
            cls._instances[:] = [instance for instance in cls._instances if instance()]
            for instance in cls._instances:
@ -319,24 +333,29 @@ class Gamma(Prior):
        return "Ga({:.2g}, {:.2g})".format(self.a, self.b)

    def summary(self):
-        ret = {"E[x]": self.a / self.b, \
-               "E[ln x]": digamma(self.a) - np.log(self.b), \
-               "var[x]": self.a / self.b / self.b, \
-               "Entropy": gammaln(self.a) - (self.a - 1.) * digamma(self.a) - np.log(self.b) + self.a}
+        ret = {
+            "E[x]": self.a / self.b,
+            "E[ln x]": digamma(self.a) - np.log(self.b),
+            "var[x]": self.a / self.b / self.b,
+            "Entropy": gammaln(self.a)
+            - (self.a - 1.0) * digamma(self.a)
+            - np.log(self.b)
+            + self.a,
+        }
        if self.a > 1:
-            ret['Mode'] = (self.a - 1.) / self.b
+            ret["Mode"] = (self.a - 1.0) / self.b
        else:
-            ret['mode'] = np.nan
+            ret["mode"] = np.nan
        return ret

    def lnpdf(self, x):
        return self.constant + (self.a - 1) * np.log(x) - self.b * x

    def lnpdf_grad(self, x):
-        return (self.a - 1.) / x - self.b
+        return (self.a - 1.0) / x - self.b

    def rvs(self, n):
-        return np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
+        return np.random.gamma(scale=1.0 / self.b, shape=self.a, size=n)

    @staticmethod
    def from_EV(E, V):
@ -359,6 +378,7 @@ class Gamma(Prior):
        self._b = state[1]
        self.constant = -gammaln(self.a) + self.a * np.log(self.b)

+
 class InverseGamma(Gamma):
    """
    Implementation of the inverse-Gamma probability function, coupled with random variables.
@ -369,6 +389,7 @@ class InverseGamma(Gamma):
    .. Note:: Bishop 2006 notation is used throughout the code

    """
+
    domain = _POSITIVE
    _instances = []

@ -386,10 +407,11 @@ class InverseGamma(Gamma):
        return self.constant - (self.a + 1) * np.log(x) - self.b / x

    def lnpdf_grad(self, x):
-        return -(self.a + 1.) / x + self.b / x ** 2
+        return -(self.a + 1.0) / x + self.b / x**2

    def rvs(self, n):
-        return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
+        return 1.0 / np.random.gamma(scale=1.0 / self.b, shape=self.a, size=n)
+

 class DGPLVM_KFDA(Prior):
    """
@ -403,6 +425,7 @@ class DGPLVM_KFDA(Prior):
    .. Note:: Surpassing Human-Level Face paper dgplvm implementation

    """
+
    domain = _REAL
    # _instances = []
    # def __new__(cls, lambdaa, sigma2):  # Singleton:
@ -459,8 +482,8 @@ class DGPLVM_KFDA(Prior):
        lst_ni = []
        lst_ni1 = []
        lst_ni2 = []
-        f1 = (np.where(self.lbl[:, 0] == 1)[0])
-        f2 = (np.where(self.lbl[:, 1] == 1)[0])
+        f1 = np.where(self.lbl[:, 0] == 1)[0]
+        f2 = np.where(self.lbl[:, 1] == 1)[0]
        for idx in f1:
            lst_ni1.append(idx)
        for idx in f2:
@ -474,11 +497,11 @@ class DGPLVM_KFDA(Prior):
        count = 0
        for N_i in lst_ni:
            if N_i == lst_ni[0]:
-                a[count:count + N_i] = (float(1) / N_i) * a[count]
+                a[count : count + N_i] = (float(1) / N_i) * a[count]
                count += N_i
            else:
                if N_i == lst_ni[1]:
-                    a[count: count + N_i] = -(float(1) / N_i) * a[count]
+                    a[count : count + N_i] = -(float(1) / N_i) * a[count]
                    count += N_i
        return a

@ -486,8 +509,12 @@ class DGPLVM_KFDA(Prior):
        A = np.zeros((self.datanum, self.datanum))
        idx = 0
        for N_i in lst_ni:
-            B = float(1) / np.sqrt(N_i) * (np.eye(N_i) - ((float(1) / N_i) * np.ones((N_i, N_i))))
-            A[idx:idx + N_i, idx:idx + N_i] = B
+            B = (
+                float(1)
+                / np.sqrt(N_i)
+                * (np.eye(N_i) - ((float(1) / N_i) * np.ones((N_i, N_i))))
+            )
+            A[idx : idx + N_i, idx : idx + N_i] = B
            idx += N_i
        return A

@ -498,9 +525,11 @@ class DGPLVM_KFDA(Prior):
        a_trans = np.transpose(self.a)
        paran = self.lambdaa * np.eye(x.shape[0]) + self.A.dot(K).dot(self.A)
        inv_part = pdinv(paran)[0]
-        J = a_trans.dot(K).dot(self.a) - a_trans.dot(K).dot(self.A).dot(inv_part).dot(self.A).dot(K).dot(self.a)
-        J_star = (1. / self.lambdaa) * J
-        return (-1. / self.sigma2) * J_star
+        J = a_trans.dot(K).dot(self.a) - a_trans.dot(K).dot(self.A).dot(inv_part).dot(
+            self.A
+        ).dot(K).dot(self.a)
+        J_star = (1.0 / self.lambdaa) * J
+        return (-1.0 / self.sigma2) * J_star

    # Here gradient function
    def lnpdf_grad(self, x):
@ -511,15 +540,15 @@ class DGPLVM_KFDA(Prior):
        b = self.A.dot(inv_part).dot(self.A).dot(K).dot(self.a)
        a_Minus_b = self.a - b
        a_b_trans = np.transpose(a_Minus_b)
-        DJ_star_DK = (1. / self.lambdaa) * (a_Minus_b.dot(a_b_trans))
+        DJ_star_DK = (1.0 / self.lambdaa) * (a_Minus_b.dot(a_b_trans))
        DJ_star_DX = self.kern.gradients_X(DJ_star_DK, x)
-        return (-1. / self.sigma2) * DJ_star_DX
+        return (-1.0 / self.sigma2) * DJ_star_DX

    def rvs(self, n):
        return np.random.rand(n)  # A WRONG implementation

    def __str__(self):
-        return 'DGPLVM_prior'
+        return "DGPLVM_prior"

    def __getstate___(self):
        return self.lbl, self.lambdaa, self.sigma2, self.kern, self.x_shape
@ -547,6 +576,7 @@ class DGPLVM(Prior):
    .. Note:: DGPLVM for Classification paper implementation

    """
+
    domain = _REAL

    def __new__(cls, sigma2, lbl, x_shape):
@ -606,7 +636,7 @@ class DGPLVM(Prior):
        for i in data_idx:
            if len(lst_idx) == 0:
                pass
-                #Do nothing, because it is the first time list is created so is empty
+                # Do nothing, because it is the first time list is created so is empty
            else:
                lst_idx = []
            # Here we put indices of each class in to the list called lst_idx_all
@ -631,9 +661,9 @@ class DGPLVM(Prior):
            N_i = float(len(cls[i]))
            W_WT = np.zeros((self.dim, self.dim))
            for xk in cls[i]:
-                W = (xk - M_i[i])
+                W = xk - M_i[i]
                W_WT += np.outer(W, W)
-            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+            Sw += (N_i / self.datanum) * ((1.0 / N_i) * W_WT)
        return Sw

    # Calculating beta and Bi for Sb
@ -658,7 +688,6 @@ class DGPLVM(Prior):
        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
        return Sig_beta_B_i_all

-
    # Calculating W_j s separately so we can access all the W_j s anytime
    def compute_wj(self, data_idx, M_i):
        W_i = np.zeros((self.datanum, self.dim))
@ -667,7 +696,7 @@ class DGPLVM(Prior):
            for tpl in data_idx[i]:
                xj = tpl[1]
                j = tpl[0]
-                W_i[j] = (xj - M_i[i])
+                W_i[j] = xj - M_i[i]
        return W_i

    # Calculating alpha and Wj for Sw
@ -680,11 +709,11 @@ class DGPLVM(Prior):
                for j in lst_idx_all[i]:
                    if k == j:
                        alpha = 1 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
                    else:
                        alpha = 0 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
-        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
+        Sig_alpha_W_i = (1.0 / self.datanum) * np.transpose(Sig_alpha_W_i)
        return Sig_alpha_W_i

    # This function calculates log of our prior
@ -696,9 +725,9 @@ class DGPLVM(Prior):
        Sb = self.compute_Sb(cls, M_i, M_0)
        Sw = self.compute_Sw(cls, M_i)
        # sb_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))

    # This function calculates derivative of the log of prior function
@ -717,19 +746,20 @@ class DGPLVM(Prior):

        # Calculating inverse of Sb and its transpose and minus
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)

        # Calculating DJ/DXk
        DJ_Dxk = 2 * (
-            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
-                Sig_alpha_W_i))
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
+            + Sb_inv_N_trans.dot(Sig_alpha_W_i)
+        )
        # Calculating derivative of the log of the prior
-        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        DPx_Dx = (-1 / self.sigma2) * DJ_Dxk
        return DPx_Dx.T

    # def frb(self, x):
@ -744,7 +774,7 @@ class DGPLVM(Prior):
        return np.random.rand(n)  # A WRONG implementation

    def __str__(self):
-        return 'DGPLVM_prior_Raq'
+        return "DGPLVM_prior_Raq"


 # ******************************************
@ -752,6 +782,7 @@ class DGPLVM(Prior):
 from . import Parameterized
 from . import Param

+
 class DGPLVM_Lamda(Prior, Parameterized):
    """
    Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
@ -761,6 +792,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
    .. Note:: DGPLVM for Classification paper implementation

    """
+
    domain = _REAL
    # _instances = []
    # def __new__(cls, mu, sigma): # Singleton:
@ -773,7 +805,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
    #     cls._instances.append(weakref.ref(o))
    #     return cls._instances[-1]()

-    def __init__(self, sigma2, lbl, x_shape, lamda, name='DP_prior'):
+    def __init__(self, sigma2, lbl, x_shape, lamda, name="DP_prior"):
        super(DGPLVM_Lamda, self).__init__(name=name)
        self.sigma2 = sigma2
        # self.x = x
@ -783,7 +815,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
        self.datanum = lbl.shape[0]
        self.x_shape = x_shape
        self.dim = x_shape[1]
-        self.lamda = Param('lamda', np.diag(lamda))
+        self.lamda = Param("lamda", np.diag(lamda))
        self.link_parameter(self.lamda)

    def get_class_label(self, y):
@ -831,7 +863,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
        for i in data_idx:
            if len(lst_idx) == 0:
                pass
-                #Do nothing, because it is the first time list is created so is empty
+                # Do nothing, because it is the first time list is created so is empty
            else:
                lst_idx = []
            # Here we put indices of each class in to the list called lst_idx_all
@ -856,9 +888,9 @@ class DGPLVM_Lamda(Prior, Parameterized):
            N_i = float(len(cls[i]))
            W_WT = np.zeros((self.dim, self.dim))
            for xk in cls[i]:
-                W = (xk - M_i[i])
+                W = xk - M_i[i]
                W_WT += np.outer(W, W)
-            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+            Sw += (N_i / self.datanum) * ((1.0 / N_i) * W_WT)
        return Sw

    # Calculating beta and Bi for Sb
@ -883,7 +915,6 @@ class DGPLVM_Lamda(Prior, Parameterized):
        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
        return Sig_beta_B_i_all

-
    # Calculating W_j s separately so we can access all the W_j s anytime
    def compute_wj(self, data_idx, M_i):
        W_i = np.zeros((self.datanum, self.dim))
@ -892,7 +923,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
            for tpl in data_idx[i]:
                xj = tpl[1]
                j = tpl[0]
-                W_i[j] = (xj - M_i[i])
+                W_i[j] = xj - M_i[i]
        return W_i

    # Calculating alpha and Wj for Sw
@ -905,11 +936,11 @@ class DGPLVM_Lamda(Prior, Parameterized):
                for j in lst_idx_all[i]:
                    if k == j:
                        alpha = 1 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
                    else:
                        alpha = 0 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
-        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
+        Sig_alpha_W_i = (1.0 / self.datanum) * np.transpose(Sig_alpha_W_i)
        return Sig_alpha_W_i

    # This function calculates log of our prior
@ -917,7 +948,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
        x = x.reshape(self.x_shape)

        #!!!!!!!!!!!!!!!!!!!!!!!!!!!
-        #self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()
+        # self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()

        xprime = x.dot(np.diagflat(self.lamda))
        x = xprime
@ -928,9 +959,9 @@ class DGPLVM_Lamda(Prior, Parameterized):
        Sb = self.compute_Sb(cls, M_i, M_0)
        Sw = self.compute_Sw(cls, M_i)
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.9)[0]
        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))

    # This function calculates derivative of the log of prior function
@ -952,19 +983,20 @@ class DGPLVM_Lamda(Prior, Parameterized):

        # Calculating inverse of Sb and its transpose and minus
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.9)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)

        # Calculating DJ/DXk
        DJ_Dxk = 2 * (
-            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
-                Sig_alpha_W_i))
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
+            + Sb_inv_N_trans.dot(Sig_alpha_W_i)
+        )
        # Calculating derivative of the log of the prior
-        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        DPx_Dx = (-1 / self.sigma2) * DJ_Dxk

        DPxprim_Dx = np.diagflat(self.lamda).dot(DPx_Dx)

@ -980,7 +1012,6 @@ class DGPLVM_Lamda(Prior, Parameterized):
        # print DPxprim_Dx
        return DPxprim_Dx

-
    # def frb(self, x):
    #     from functools import partial
    #     from GPy.models import GradientChecker
@ -993,10 +1024,12 @@ class DGPLVM_Lamda(Prior, Parameterized):
        return np.random.rand(n)  # A WRONG implementation

    def __str__(self):
-        return 'DGPLVM_prior_Raq_Lamda'
+        return "DGPLVM_prior_Raq_Lamda"
+

 # ******************************************

+
 class DGPLVM_T(Prior):
    """
    Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
@ -1006,6 +1039,7 @@ class DGPLVM_T(Prior):
    .. Note:: DGPLVM for Classification paper implementation

    """
+
    domain = _REAL
    # _instances = []
    # def __new__(cls, mu, sigma): # Singleton:
@ -1028,7 +1062,6 @@ class DGPLVM_T(Prior):
        self.dim = x_shape[1]
        self.vec = vec

-
    def get_class_label(self, y):
        for idx, v in enumerate(y):
            if v == 1:
@ -1075,7 +1108,7 @@ class DGPLVM_T(Prior):
        for i in data_idx:
            if len(lst_idx) == 0:
                pass
-                #Do nothing, because it is the first time list is created so is empty
+                # Do nothing, because it is the first time list is created so is empty
            else:
                lst_idx = []
            # Here we put indices of each class in to the list called lst_idx_all
@ -1100,9 +1133,9 @@ class DGPLVM_T(Prior):
            N_i = float(len(cls[i]))
            W_WT = np.zeros((self.dim, self.dim))
            for xk in cls[i]:
-                W = (xk - M_i[i])
+                W = xk - M_i[i]
                W_WT += np.outer(W, W)
-            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+            Sw += (N_i / self.datanum) * ((1.0 / N_i) * W_WT)
        return Sw

    # Calculating beta and Bi for Sb
@ -1127,7 +1160,6 @@ class DGPLVM_T(Prior):
        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
        return Sig_beta_B_i_all

-
    # Calculating W_j s separately so we can access all the W_j s anytime
    def compute_wj(self, data_idx, M_i):
        W_i = np.zeros((self.datanum, self.dim))
@ -1136,7 +1168,7 @@ class DGPLVM_T(Prior):
            for tpl in data_idx[i]:
                xj = tpl[1]
                j = tpl[0]
-                W_i[j] = (xj - M_i[i])
+                W_i[j] = xj - M_i[i]
        return W_i

    # Calculating alpha and Wj for Sw
@ -1149,11 +1181,11 @@ class DGPLVM_T(Prior):
                for j in lst_idx_all[i]:
                    if k == j:
                        alpha = 1 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
                    else:
                        alpha = 0 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
-        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
+        Sig_alpha_W_i = (1.0 / self.datanum) * np.transpose(Sig_alpha_W_i)
        return Sig_alpha_W_i

    # This function calculates log of our prior
@ -1168,10 +1200,10 @@ class DGPLVM_T(Prior):
        Sb = self.compute_Sb(cls, M_i, M_0)
        Sw = self.compute_Sw(cls, M_i)
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #print 'SB_inv: ', Sb_inv_N
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # print 'SB_inv: ', Sb_inv_N
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))

    # This function calculates derivative of the log of prior function
@ -1193,20 +1225,21 @@ class DGPLVM_T(Prior):

        # Calculating inverse of Sb and its transpose and minus
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #print 'SB_inv: ',Sb_inv_N
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # print 'SB_inv: ',Sb_inv_N
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)

        # Calculating DJ/DXk
        DJ_Dxk = 2 * (
-            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
-                Sig_alpha_W_i))
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
+            + Sb_inv_N_trans.dot(Sig_alpha_W_i)
+        )
        # Calculating derivative of the log of the prior
-        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        DPx_Dx = (-1 / self.sigma2) * DJ_Dxk
        return DPx_Dx.T

    # def frb(self, x):
@ -1221,9 +1254,7 @@ class DGPLVM_T(Prior):
        return np.random.rand(n)  # A WRONG implementation

    def __str__(self):
-        return 'DGPLVM_prior_Raq_TTT'
-
-
+        return "DGPLVM_prior_Raq_TTT"


 class HalfT(Prior):
@ -1234,6 +1265,7 @@ class HalfT(Prior):
    :param nu: degrees of freedom

    """
+
    domain = _POSITIVE
    _instances = []

@ -1250,13 +1282,22 @@ class HalfT(Prior):
    def __init__(self, A, nu):
        self.A = float(A)
        self.nu = float(nu)
-        self.constant = gammaln(.5*(self.nu+1.)) - gammaln(.5*self.nu) - .5*np.log(np.pi*self.A*self.nu)
+        self.constant = (
+            gammaln(0.5 * (self.nu + 1.0))
+            - gammaln(0.5 * self.nu)
+            - 0.5 * np.log(np.pi * self.A * self.nu)
+        )

    def __str__(self):
        return "hT({:.2g}, {:.2g})".format(self.A, self.nu)

    def lnpdf(self, theta):
-        return (theta > 0) * (self.constant - .5*(self.nu + 1) * np.log(1. + (1./self.nu) * (theta/self.A)**2))
+        return (theta > 0) * (
+            self.constant
+            - 0.5
+            * (self.nu + 1)
+            * np.log(1.0 + (1.0 / self.nu) * (theta / self.A) ** 2)
+        )

        # theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
        # lnpdfs = np.zeros_like(theta)
@ -1268,7 +1309,7 @@ class HalfT(Prior):
        # lnpdfs[above_zero] = (+ gammaln((v + 1) * 0.5)
        #     - gammaln(v * 0.5)
        #     - 0.5*np.log(sigma2 * v * np.pi)
-        #     - 0.5*(v + 1)*np.log(1 + (1/np.float(v))*((theta[above_zero][0]**2)/sigma2))
+        #     - 0.5*(v + 1)*np.log(1 + (1/float(v))*((theta[above_zero][0]**2)/sigma2))
        # )
        # return lnpdfs

@ -1278,12 +1319,18 @@ class HalfT(Prior):
        above_zero = theta > 1e-6
        v = self.nu
        sigma2 = self.A
-        grad[above_zero] = -0.5*(v+1)*(2*theta[above_zero])/(v*sigma2 + theta[above_zero][0]**2)
+        grad[above_zero] = (
+            -0.5
+            * (v + 1)
+            * (2 * theta[above_zero])
+            / (v * sigma2 + theta[above_zero][0] ** 2)
+        )
        return grad

    def rvs(self, n):
        # return np.random.randn(n) * self.sigma + self.mu
        from scipy.stats import t
+
        # [np.abs(x) for x in t.rvs(df=4,loc=0,scale=50, size=10000)])
        ret = t.rvs(self.nu, loc=0, scale=self.A, size=n)
        ret[ret < 0] = 0
@ -1298,6 +1345,7 @@ class Exponential(Prior):
    :param l: rate parameter

    """
+
    domain = _POSITIVE
    _instances = []

@ -1318,22 +1366,25 @@ class Exponential(Prior):
        return "Exp({:.2g})".format(self.l)

    def summary(self):
-        ret = {"E[x]": 1. / self.l,
-               "E[ln x]": np.nan,
-               "var[x]": 1. / self.l**2,
-               "Entropy": 1. - np.log(self.l),
-               "Mode": 0.}
+        ret = {
+            "E[x]": 1.0 / self.l,
+            "E[ln x]": np.nan,
+            "var[x]": 1.0 / self.l**2,
+            "Entropy": 1.0 - np.log(self.l),
+            "Mode": 0.0,
+        }
        return ret

    def lnpdf(self, x):
        return np.log(self.l) - self.l * x

    def lnpdf_grad(self, x):
-        return - self.l
+        return -self.l

    def rvs(self, n):
        return np.random.exponential(scale=self.l, size=n)

+
 class StudentT(Prior):
    """
    Implementation of the student t probability function, coupled with random variables.
@ -1345,6 +1396,7 @@ class StudentT(Prior):
    .. Note:: Bishop 2006 notation is used throughout the code

    """
+
    domain = _REAL
    _instances = []

@ -1352,7 +1404,11 @@ class StudentT(Prior):
        if cls._instances:
            cls._instances[:] = [instance for instance in cls._instances if instance()]
            for instance in cls._instances:
-                if instance().mu == mu and instance().sigma == sigma and instance().nu == nu:
+                if (
+                    instance().mu == mu
+                    and instance().sigma == sigma
+                    and instance().nu == nu
+                ):
                    return instance()
        newfunc = super(Prior, cls).__new__
        if newfunc is object.__new__:
@ -1373,13 +1429,18 @@ class StudentT(Prior):

    def lnpdf(self, x):
        from scipy.stats import t
-        return t.logpdf(x,self.nu,self.mu,self.sigma)
+
+        return t.logpdf(x, self.nu, self.mu, self.sigma)

    def lnpdf_grad(self, x):
-        return -(self.nu + 1.)*(x - self.mu)/( self.nu*self.sigma2 + np.square(x - self.mu) )
+        return (
+            -(self.nu + 1.0)
+            * (x - self.mu)
+            / (self.nu * self.sigma2 + np.square(x - self.mu))
+        )

    def rvs(self, n):
        from scipy.stats import t
+
        ret = t.rvs(self.nu, loc=self.mu, scale=self.sigma, size=n)
        return ret
-
--- a/GPy/kern/src/coregionalize.py
+++ b/GPy/kern/src/coregionalize.py
@ -5,13 +5,16 @@ from .kern import Kern
 import numpy as np
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
-from ...util.config import config # for assesing whether to use cython
+from ...util.config import config  # for assessing whether to use cython

 try:
    from . import coregionalize_cython
-    use_coregionalize_cython = config.getboolean('cython', 'working')
+
+    use_coregionalize_cython = config.getboolean("cython", "working")
 except ImportError:
-    print('warning in coregionalize: failed to import cython module: falling back to numpy')
+    print(
+        "warning in coregionalize: failed to import cython module: falling back to numpy"
+    )
    use_coregionalize_cython = False


@ -43,22 +46,34 @@ class Coregionalize(Kern):

    .. note:: see coregionalization examples in GPy.examples.regression for some usage.
    """
-    def __init__(self, input_dim, output_dim, rank=1, W=None, kappa=None, active_dims=None, name='coregion'):
+
+    def __init__(
+        self,
+        input_dim,
+        output_dim,
+        rank=1,
+        W=None,
+        kappa=None,
+        active_dims=None,
+        name="coregion",
+    ):
        super(Coregionalize, self).__init__(input_dim, active_dims, name=name)
        self.output_dim = output_dim
        self.rank = rank
-        if self.rank>output_dim:
-            print("Warning: Unusual choice of rank, it should normally be less than the output_dim.")
+        if self.rank > output_dim:
+            print(
+                "Warning: Unusual choice of rank, it should normally be less than the output_dim."
+            )
        if W is None:
-            W = 0.5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
+            W = 0.5 * np.random.randn(self.output_dim, self.rank) / np.sqrt(self.rank)
        else:
-            assert W.shape==(self.output_dim, self.rank)
-        self.W = Param('W', W)
+            assert W.shape == (self.output_dim, self.rank)
+        self.W = Param("W", W)
        if kappa is None:
-            kappa = 0.5*np.ones(self.output_dim)
+            kappa = 0.5 * np.ones(self.output_dim)
        else:
-            assert kappa.shape==(self.output_dim, )
-        self.kappa = Param('kappa', kappa, Logexp())
+            assert kappa.shape == (self.output_dim,)
+        self.kappa = Param("kappa", kappa, Logexp())
        self.link_parameters(self.W, self.kappa)

    def parameters_changed(self):
@ -70,63 +85,69 @@ class Coregionalize(Kern):
        else:
            return self._K_numpy(X, X2)

-
    def _K_numpy(self, X, X2=None):
-        index = np.asarray(X, dtype=np.int)
+        index = np.asarray(X, dtype=int)
        if X2 is None:
-            return self.B[index,index.T]
+            return self.B[index, index.T]
        else:
-            index2 = np.asarray(X2, dtype=np.int)
-            return self.B[index,index2.T]
+            index2 = np.asarray(X2, dtype=int)
+            return self.B[index, index2.T]

    def _K_cython(self, X, X2=None):
        if X2 is None:
-            return coregionalize_cython.K_symmetric(self.B, np.asarray(X, dtype=np.int64)[:,0])
-        return coregionalize_cython.K_asymmetric(self.B, np.asarray(X, dtype=np.int64)[:,0], np.asarray(X2, dtype=np.int64)[:,0])
-
+            return coregionalize_cython.K_symmetric(
+                self.B, np.asarray(X, dtype=np.int64)[:, 0]
+            )
+        return coregionalize_cython.K_asymmetric(
+            self.B,
+            np.asarray(X, dtype=np.int64)[:, 0],
+            np.asarray(X2, dtype=np.int64)[:, 0],
+        )

    def Kdiag(self, X):
-        return np.diag(self.B)[np.asarray(X, dtype=np.int).flatten()]
+        return np.diag(self.B)[np.asarray(X, dtype=int).flatten()]

    def update_gradients_full(self, dL_dK, X, X2=None):
-        index = np.asarray(X, dtype=np.int)
+        index = np.asarray(X, dtype=int)
        if X2 is None:
            index2 = index
        else:
-            index2 = np.asarray(X2, dtype=np.int)
+            index2 = np.asarray(X2, dtype=int)

-        #attempt to use cython for a nasty double indexing loop: fall back to numpy
+        # attempt to use cython for a nasty double indexing loop: fall back to numpy
        if use_coregionalize_cython:
            dL_dK_small = self._gradient_reduce_cython(dL_dK, index, index2)
        else:
            dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)

-
        dkappa = np.diag(dL_dK_small).copy()
        dL_dK_small += dL_dK_small.T
-        dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
+        dW = (self.W[:, None, :] * dL_dK_small[:, :, None]).sum(0)

        self.W.gradient = dW
        self.kappa.gradient = dkappa

    def _gradient_reduce_numpy(self, dL_dK, index, index2):
-        index, index2 = index[:,0], index2[:,0]
+        index, index2 = index[:, 0], index2[:, 0]
        dL_dK_small = np.zeros_like(self.B)
        for i in range(self.output_dim):
-            tmp1 = dL_dK[index==i]
+            tmp1 = dL_dK[index == i]
            for j in range(self.output_dim):
-                dL_dK_small[j,i] = tmp1[:,index2==j].sum()
+                dL_dK_small[j, i] = tmp1[:, index2 == j].sum()
        return dL_dK_small

    def _gradient_reduce_cython(self, dL_dK, index, index2):
-        index, index2 = np.int64(index[:,0]), np.int64(index2[:,0])
-        return coregionalize_cython.gradient_reduce(self.B.shape[0], dL_dK, index, index2)
-
+        index, index2 = np.int64(index[:, 0]), np.int64(index2[:, 0])
+        return coregionalize_cython.gradient_reduce(
+            self.B.shape[0], dL_dK, index, index2
+        )

    def update_gradients_diag(self, dL_dKdiag, X):
-        index = np.asarray(X, dtype=np.int).flatten()
-        dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in range(self.output_dim)])
-        self.W.gradient = 2.*self.W*dL_dKdiag_small[:, None]
+        index = np.asarray(X, dtype=int).flatten()
+        dL_dKdiag_small = np.array(
+            [dL_dKdiag[index == i].sum() for i in range(self.output_dim)]
+        )
+        self.W.gradient = 2.0 * self.W * dL_dKdiag_small[:, None]
        self.kappa.gradient = dL_dKdiag_small

    def gradients_X(self, dL_dK, X, X2=None):
@ -154,8 +175,8 @@ class Coregionalize(Kern):

    @staticmethod
    def _build_from_input_dict(kernel_class, input_dict):
-        useGPU = input_dict.pop('useGPU', None)
+        useGPU = input_dict.pop("useGPU", None)
        # W and kappa must be converted back to numpy arrays
-        input_dict['W'] = np.array(input_dict['W'])
-        input_dict['kappa'] = np.array(input_dict['kappa'])
+        input_dict["W"] = np.array(input_dict["W"])
+        input_dict["kappa"] = np.array(input_dict["kappa"])
        return Coregionalize(**input_dict)
--- a/GPy/kern/src/eq_ode1.py
+++ b/GPy/kern/src/eq_ode1.py
--- a/GPy/kern/src/eq_ode2.py
+++ b/GPy/kern/src/eq_ode2.py
--- a/GPy/kern/src/todo/eq_ode1.py
+++ b/GPy/kern/src/todo/eq_ode1.py
@ -121,7 +121,7 @@ class Eq_ode1(Kernpart):
            target+=self.initial_variance * np.exp(- self.decay * (t1_mat + t2_mat))

    def Kdiag(self,index,target):
-        #target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()]
+        #target += np.diag(self.B)[np.asarray(index,dtype=int).flatten()]
        pass
    
    def _param_grad_helper(self,dL_dK,X,X2,target):
@ -203,7 +203,7 @@ class Eq_ode1(Kernpart):
        self._t = X[:, 0]
        if not X.shape[1] == 2:
            raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
-        self._index = np.asarray(X[:, 1],dtype=np.int)
+        self._index = np.asarray(X[:, 1],dtype=int)
        # Sort indices so that outputs are in blocks for computational
        # convenience.
        self._order = self._index.argsort()
@ -220,7 +220,7 @@ class Eq_ode1(Kernpart):
            if not X2.shape[1] == 2:
                raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
            self._t2 = X2[:, 0]
-            self._index2 = np.asarray(X2[:, 1],dtype=np.int)
+            self._index2 = np.asarray(X2[:, 1],dtype=int)
            self._order2 = self._index2.argsort()
            self._index2 = self._index2[self._order2]
            self._t2 = self._t2[self._order2]
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@ -12,6 +12,7 @@ from ..core.parameterization import Param
 from paramz.transformations import Logexp
 from scipy.special import psi as digamma

+
 class StudentT(Likelihood):
    """
    Student T likelihood
@ -22,17 +23,18 @@ class StudentT(Likelihood):
        p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{-\\frac{v+1}{2}}

    """
-    def __init__(self,gp_link=None, deg_free=5, sigma2=2):
+
+    def __init__(self, gp_link=None, deg_free=5, sigma2=2):
        if gp_link is None:
            gp_link = link_functions.Identity()

-        super(StudentT, self).__init__(gp_link, name='Student_T')
+        super(StudentT, self).__init__(gp_link, name="Student_T")
        # sigma2 is not a noise parameter, it is a squared scale.
-        self.sigma2 = Param('t_scale2', float(sigma2), Logexp())
-        self.v = Param('deg_free', float(deg_free), Logexp())
+        self.sigma2 = Param("t_scale2", float(sigma2), Logexp())
+        self.v = Param("deg_free", float(deg_free), Logexp())
        self.link_parameter(self.sigma2)
        self.link_parameter(self.v)
-        #self.v.constrain_fixed()
+        # self.v.constrain_fixed()

        self.log_concave = False

@ -61,11 +63,14 @@ class StudentT(Likelihood):
        """
        assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
        e = y - inv_link_f
-        #Careful gamma(big_number) is infinity!
-        objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
-                     / (np.sqrt(self.v * np.pi * self.sigma2)))
-                     * ((1 + (1./float(self.v))*((e**2)/float(self.sigma2)))**(-0.5*(self.v + 1)))
-                    )
+        # Careful gamma(big_number) is infinity!
+        objective = (
+            np.exp(gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5))
+            / (np.sqrt(self.v * np.pi * self.sigma2))
+        ) * (
+            (1 + (1.0 / float(self.v)) * ((e**2) / float(self.sigma2)))
+            ** (-0.5 * (self.v + 1))
+        )
        return np.prod(objective)

    def logpdf_link(self, inv_link_f, y, Y_metadata=None):
@ -85,15 +90,16 @@ class StudentT(Likelihood):

        """
        e = y - inv_link_f
-        #FIXME:
-        #Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?!
-        #But np.log(1 + (1/float(self.v))*((y-inv_link_f)**2)/self.sigma2) throws it correctly
-        #print - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
-        objective = (+ gammaln((self.v + 1) * 0.5)
-                    - gammaln(self.v * 0.5)
-                    - 0.5*np.log(self.sigma2 * self.v * np.pi)
-                    - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
-                    )
+        # FIXME:
+        # Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?!
+        # But np.log(1 + (1/float(self.v))*((y-inv_link_f)**2)/self.sigma2) throws it correctly
+        # print - 0.5*(self.v + 1)*np.log(1 + (1/(self.v))*((e**2)/self.sigma2))
+        objective = (
+            +gammaln((self.v + 1) * 0.5)
+            - gammaln(self.v * 0.5)
+            - 0.5 * np.log(self.sigma2 * self.v * np.pi)
+            - 0.5 * (self.v + 1) * np.log(1 + (1 / (self.v)) * ((e**2) / self.sigma2))
+        )
        return objective

    def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
@ -138,7 +144,9 @@ class StudentT(Likelihood):
            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
        """
        e = y - inv_link_f
-        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
+        hess = ((self.v + 1) * (e**2 - self.v * self.sigma2)) / (
+            (self.sigma2 * self.v + e**2) ** 2
+        )
        return hess

    def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
@ -157,9 +165,9 @@ class StudentT(Likelihood):
        :rtype: Nx1 array
        """
        e = y - inv_link_f
-        d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
-                       ((e**2 + self.sigma2*self.v)**3)
-                    )
+        d3lik_dlink3 = -(
+            2 * (self.v + 1) * (-e) * (e**2 - 3 * self.v * self.sigma2)
+        ) / ((e**2 + self.sigma2 * self.v) ** 3)
        return d3lik_dlink3

    def dlogpdf_link_dvar(self, inv_link_f, y, Y_metadata=None):
@ -179,7 +187,11 @@ class StudentT(Likelihood):
        """
        e = y - inv_link_f
        e2 = np.square(e)
-        dlogpdf_dvar = self.v*(e2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e2))
+        dlogpdf_dvar = (
+            self.v
+            * (e2 - self.sigma2)
+            / (2 * self.sigma2 * (self.sigma2 * self.v + e2))
+        )
        return dlogpdf_dvar

    def dlogpdf_dlink_dvar(self, inv_link_f, y, Y_metadata=None):
@ -198,7 +210,9 @@ class StudentT(Likelihood):
        :rtype: Nx1 array
        """
        e = y - inv_link_f
-        dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
+        dlogpdf_dlink_dvar = (self.v * (self.v + 1) * (-e)) / (
+            (self.sigma2 * self.v + e**2) ** 2
+        )
        return dlogpdf_dlink_dvar

    def d2logpdf_dlink2_dvar(self, inv_link_f, y, Y_metadata=None):
@ -217,9 +231,9 @@ class StudentT(Likelihood):
        :rtype: Nx1 array
        """
        e = y - inv_link_f
-        d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
-                              / ((self.sigma2*self.v + (e**2))**3)
-                           )
+        d2logpdf_dlink2_dvar = (
+            self.v * (self.v + 1) * (self.sigma2 * self.v - 3 * (e**2))
+        ) / ((self.sigma2 * self.v + (e**2)) ** 3)
        return d2logpdf_dlink2_dvar

    def dlogpdf_link_dv(self, inv_link_f, y, Y_metadata=None):
@ -227,9 +241,11 @@ class StudentT(Likelihood):
        e2 = np.square(e)
        df = float(self.v[:])
        s2 = float(self.sigma2[:])
-        dlogpdf_dv =  0.5*digamma(0.5*(df+1)) - 0.5*digamma(0.5*df) - 1.0/(2*df)
-        dlogpdf_dv += 0.5*(df+1)*e2/(df*(e2 + s2*df))
-        dlogpdf_dv -= 0.5*np.log1p(e2/(s2*df))
+        dlogpdf_dv = (
+            0.5 * digamma(0.5 * (df + 1)) - 0.5 * digamma(0.5 * df) - 1.0 / (2 * df)
+        )
+        dlogpdf_dv += 0.5 * (df + 1) * e2 / (df * (e2 + s2 * df))
+        dlogpdf_dv -= 0.5 * np.log1p(e2 / (s2 * df))
        return dlogpdf_dv

    def dlogpdf_dlink_dv(self, inv_link_f, y, Y_metadata=None):
@ -237,7 +253,7 @@ class StudentT(Likelihood):
        e2 = np.square(e)
        df = float(self.v[:])
        s2 = float(self.sigma2[:])
-        dlogpdf_df_dv = e*(e2 - self.sigma2)/(e2 + s2*df)**2
+        dlogpdf_df_dv = e * (e2 - self.sigma2) / (e2 + s2 * df) ** 2
        return dlogpdf_df_dv

    def d2logpdf_dlink2_dv(self, inv_link_f, y, Y_metadata=None):
@ -245,8 +261,10 @@ class StudentT(Likelihood):
        e2 = np.square(e)
        df = float(self.v[:])
        s2 = float(self.sigma2[:])
-        e2_s2v = e**2 + s2*df
-        d2logpdf_df2_dv = (-s2*(df+1) + e2 - s2*df)/e2_s2v**2 - 2*s2*(df+1)*(e2 - s2*df)/e2_s2v**3
+        e2_s2v = e**2 + s2 * df
+        d2logpdf_df2_dv = (-s2 * (df + 1) + e2 - s2 * df) / e2_s2v**2 - 2 * s2 * (
+            df + 1
+        ) * (e2 - s2 * df) / e2_s2v**3
        return d2logpdf_df2_dv

    def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
@ -266,19 +284,23 @@ class StudentT(Likelihood):

    def predictive_mean(self, mu, sigma, Y_metadata=None):
        # The comment here confuses mean and median.
-        return self.gp_link.transf(mu) # only true if link is monotonic, which it is.
+        return self.gp_link.transf(mu)  # only true if link is monotonic, which it is.

-    def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
-        if self.deg_free<=2.:
-            return np.empty(mu.shape)*np.nan # does not exist for degrees of freedom <= 2.
+    def predictive_variance(self, mu, variance, predictive_mean=None, Y_metadata=None):
+        if self.deg_free <= 2.0:
+            return (
+                np.empty(mu.shape) * np.nan
+            )  # does not exist for degrees of freedom <= 2.
        else:
-            return super(StudentT, self).predictive_variance(mu, variance, predictive_mean, Y_metadata)
+            return super(StudentT, self).predictive_variance(
+                mu, variance, predictive_mean, Y_metadata
+            )

    def conditional_mean(self, gp):
        return self.gp_link.transf(gp)

    def conditional_variance(self, gp):
-        return self.deg_free/(self.deg_free - 2.)
+        return self.deg_free / (self.deg_free - 2.0)

    def samples(self, gp, Y_metadata=None):
        """
@ -288,11 +310,10 @@ class StudentT(Likelihood):
        """
        orig_shape = gp.shape
        gp = gp.flatten()
-        #FIXME: Very slow as we are computing a new random variable per input!
-        #Can't get it to sample all at the same time
-        #student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
-        dfs = np.ones_like(gp)*self.v
-        scales = np.ones_like(gp)*np.sqrt(self.sigma2)
-        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp),
-                                        scale=scales)
+        # NOTE: the per-input sampling kept below for reference was very slow;
+        # the vectorized call further down draws all samples in a single rvs call.
+        # student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
+        dfs = np.ones_like(gp) * self.v
+        scales = np.ones_like(gp) * np.sqrt(self.sigma2)
+        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp), scale=scales)
        return student_t_samples.reshape(orig_shape)
--- a/GPy/models/sparse_gp_coregionalized_regression.py
+++ b/GPy/models/sparse_gp_coregionalized_regression.py
@ -7,6 +7,7 @@ from ..inference.latent_function_inference import VarDTC
 from .. import kern
 from .. import util

+
 class SparseGPCoregionalizedRegression(SparseGP):
    """
    Sparse Gaussian Process model for heteroscedastic multioutput regression
@ -34,34 +35,65 @@ class SparseGPCoregionalizedRegression(SparseGP):
    :type kernel_name: string
    """

-    def __init__(self, X_list, Y_list, Z_list=[], kernel=None, likelihoods_list=None, num_inducing=10, X_variance=None, name='SGPCR',W_rank=1,kernel_name='coreg'):
-
-        #Input and Output
-        X,Y,self.output_index = util.multioutput.build_XY(X_list,Y_list)
+    def __init__(
+        self,
+        X_list,
+        Y_list,
+        Z_list=[],
+        kernel=None,
+        likelihoods_list=None,
+        num_inducing=10,
+        X_variance=None,
+        name="SGPCR",
+        W_rank=1,
+        kernel_name="coreg",
+    ):
+        # Input and Output
+        X, Y, self.output_index = util.multioutput.build_XY(X_list, Y_list)
        Ny = len(Y_list)

-        #Kernel
+        # Kernel
        if kernel is None:
-            kernel = kern.RBF(X.shape[1]-1)
-            
-            kernel = util.multioutput.ICM(input_dim=X.shape[1]-1, num_outputs=Ny, kernel=kernel, W_rank=W_rank, name=kernel_name)
+            kernel = kern.RBF(X.shape[1] - 1)

-        #Likelihood
-        likelihood = util.multioutput.build_likelihood(Y_list,self.output_index,likelihoods_list)
+            kernel = util.multioutput.ICM(
+                input_dim=X.shape[1] - 1,
+                num_outputs=Ny,
+                kernel=kernel,
+                W_rank=W_rank,
+                name=kernel_name,
+            )

-        #Inducing inputs list
+        # Likelihood
+        likelihood = util.multioutput.build_likelihood(
+            Y_list, self.output_index, likelihoods_list
+        )
+
+        # Inducing inputs list
        if len(Z_list):
-            assert len(Z_list) == Ny, 'Number of outputs do not match length of inducing inputs list.'
+            assert (
+                len(Z_list) == Ny
+            ), "Number of outputs do not match length of inducing inputs list."
        else:
-            if isinstance(num_inducing,np.int):
+            if isinstance(num_inducing, int):
                num_inducing = [num_inducing] * Ny
            num_inducing = np.asarray(num_inducing)
-            assert num_inducing.size == Ny, 'Number of outputs do not match length of inducing inputs list.'
-            for ni,Xi in zip(num_inducing,X_list):
+            assert (
+                num_inducing.size == Ny
+            ), "Number of outputs do not match length of inducing inputs list."
+            for ni, Xi in zip(num_inducing, X_list):
                i = np.random.permutation(Xi.shape[0])[:ni]
                Z_list.append(Xi[i].copy())

        Z, _, Iz = util.multioutput.build_XY(Z_list)

-        super(SparseGPCoregionalizedRegression, self).__init__(X, Y, Z, kernel, likelihood, inference_method=VarDTC(), Y_metadata={'output_index':self.output_index})
-        self['.*inducing'][:,-1].fix()
+        super(SparseGPCoregionalizedRegression, self).__init__(
+            X,
+            Y,
+            Z,
+            kernel,
+            likelihood,
+            inference_method=VarDTC(),
+            Y_metadata={"output_index": self.output_index},
+        )
+        self[".*inducing"][:, -1].fix()
--- a/GPy/models/ss_mrd.py
+++ b/GPy/models/ss_mrd.py
@ -5,52 +5,110 @ The Manifold Relevance Determination model with the spike-and-slab prior
 import numpy as np
 from ..core import Model
 from .ss_gplvm import SSGPLVM
-from GPy.core.parameterization.variational import SpikeAndSlabPrior,NormalPosterior,VariationalPrior
+from GPy.core.parameterization.variational import (
+    SpikeAndSlabPrior,
+    NormalPosterior,
+    VariationalPrior,
+)
 from ..util.misc import param_to_array
 from ..kern import RBF
 from ..core import Param
 from numpy.linalg.linalg import LinAlgError

+
 class SSMRD(Model):
-    
-    def __init__(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx = 'PCA_concat', initz = 'permute', 
-                 num_inducing=10, Zs=None, kernels=None, inference_methods=None, likelihoods=None, group_spike=True,
-                 pi=0.5, name='ss_mrd', Ynames=None, mpi_comm=None, IBP=False, alpha=2., taus=None, ):
+    def __init__(
+        self,
+        Ylist,
+        input_dim,
+        X=None,
+        X_variance=None,
+        Gammas=None,
+        initx="PCA_concat",
+        initz="permute",
+        num_inducing=10,
+        Zs=None,
+        kernels=None,
+        inference_methods=None,
+        likelihoods=None,
+        group_spike=True,
+        pi=0.5,
+        name="ss_mrd",
+        Ynames=None,
+        mpi_comm=None,
+        IBP=False,
+        alpha=2.0,
+        taus=None,
+    ):
        super(SSMRD, self).__init__(name)
        self.mpi_comm = mpi_comm
        self._PROPAGATE_ = False
-        
+
        # initialize X for individual models
-        X, X_variance, Gammas, fracs = self._init_X(Ylist, input_dim, X, X_variance, Gammas, initx)
+        X, X_variance, Gammas, fracs = self._init_X(
+            Ylist, input_dim, X, X_variance, Gammas, initx
+        )
        self.X = NormalPosterior(means=X, variances=X_variance)
-        
+
        if kernels is None:
-            kernels = [RBF(input_dim, lengthscale=1./fracs, ARD=True) for i in range(len(Ylist))]
+            kernels = [
+                RBF(input_dim, lengthscale=1.0 / fracs, ARD=True)
+                for i in range(len(Ylist))
+            ]
        if Zs is None:
-            Zs = [None]* len(Ylist)
+            Zs = [None] * len(Ylist)
        if likelihoods is None:
-            likelihoods = [None]* len(Ylist)
+            likelihoods = [None] * len(Ylist)
        if inference_methods is None:
-            inference_methods = [None]* len(Ylist)
-        
+            inference_methods = [None] * len(Ylist)
+
        if IBP:
-            self.var_priors = [IBPPrior_SSMRD(len(Ylist),input_dim,alpha=alpha) for i in range(len(Ylist))]
+            self.var_priors = [
+                IBPPrior_SSMRD(len(Ylist), input_dim, alpha=alpha)
+                for i in range(len(Ylist))
+            ]
        else:
-            self.var_priors = [SpikeAndSlabPrior_SSMRD(nModels=len(Ylist),pi=pi,learnPi=False, group_spike=group_spike) for i in range(len(Ylist))]
-        self.models = [SSGPLVM(y, input_dim, X=X.copy(), X_variance=X_variance.copy(), Gamma=Gammas[i], num_inducing=num_inducing,Z=Zs[i], learnPi=False, group_spike=group_spike,
-                               kernel=kernels[i],inference_method=inference_methods[i],likelihood=likelihoods[i], variational_prior=self.var_priors[i], IBP=IBP, tau=None if taus is None else taus[i],
-                               name='model_'+str(i), mpi_comm=mpi_comm, sharedX=True) for i,y in enumerate(Ylist)]
-        self.link_parameters(*(self.models+[self.X]))
-        
+            self.var_priors = [
+                SpikeAndSlabPrior_SSMRD(
+                    nModels=len(Ylist), pi=pi, learnPi=False, group_spike=group_spike
+                )
+                for i in range(len(Ylist))
+            ]
+        self.models = [
+            SSGPLVM(
+                y,
+                input_dim,
+                X=X.copy(),
+                X_variance=X_variance.copy(),
+                Gamma=Gammas[i],
+                num_inducing=num_inducing,
+                Z=Zs[i],
+                learnPi=False,
+                group_spike=group_spike,
+                kernel=kernels[i],
+                inference_method=inference_methods[i],
+                likelihood=likelihoods[i],
+                variational_prior=self.var_priors[i],
+                IBP=IBP,
+                tau=None if taus is None else taus[i],
+                name="model_" + str(i),
+                mpi_comm=mpi_comm,
+                sharedX=True,
+            )
+            for i, y in enumerate(Ylist)
+        ]
+        self.link_parameters(*(self.models + [self.X]))
+
    def _propogate_X_val(self):
-        if self._PROPAGATE_: return
+        if self._PROPAGATE_:
+            return
        for m in self.models:
            m.X.mean.values[:] = self.X.mean.values
            m.X.variance.values[:] = self.X.variance.values
        varp_list = [m.X for m in self.models]
        [vp._update_inernal(varp_list) for vp in self.var_priors]
-        self._PROPAGATE_=True
-    
+        self._PROPAGATE_ = True
+
    def _collate_X_gradient(self):
        self._PROPAGATE_ = False
        self.X.mean.gradient[:] = 0
@ -58,86 +116,92 @@ class SSMRD(Model):
        for m in self.models:
            self.X.mean.gradient += m.X.mean.gradient
            self.X.variance.gradient += m.X.variance.gradient
-        
+
    def parameters_changed(self):
        super(SSMRD, self).parameters_changed()
        [m.parameters_changed() for m in self.models]
-        self._log_marginal_likelihood = sum([m._log_marginal_likelihood for m in self.models])
+        self._log_marginal_likelihood = sum(
+            [m._log_marginal_likelihood for m in self.models]
+        )
        self._collate_X_gradient()

    def log_likelihood(self):
        return self._log_marginal_likelihood
-    
-    def _init_X(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx='PCA_concat'):
-        
+
+    def _init_X(
+        self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx="PCA_concat"
+    ):
        # Divide latent dimensions
-        idx = np.empty((input_dim,),dtype=np.int)
-        residue = (input_dim)%(len(Ylist))
+        idx = np.empty((input_dim,), dtype=int)
+        residue = (input_dim) % (len(Ylist))
        for i in range(len(Ylist)):
            if i < residue:
-                size = input_dim/len(Ylist)+1
-                idx[i*size:(i+1)*size] = i
+                size = input_dim // len(Ylist) + 1
+                idx[i * size : (i + 1) * size] = i
            else:
-                size = input_dim/len(Ylist)
-                idx[i*size+residue:(i+1)*size+residue] = i
-        
+                size = input_dim // len(Ylist)
+                idx[i * size + residue : (i + 1) * size + residue] = i
+
        if X is None:
-            if initx == 'PCA_concat':
-                X = np.empty((Ylist[0].shape[0],input_dim))
+            if initx == "PCA_concat":
+                X = np.empty((Ylist[0].shape[0], input_dim))
                fracs = np.empty((input_dim,))
                from ..util.initialization import initialize_latent
+
                for i in range(len(Ylist)):
                    Y = Ylist[i]
-                    dim = (idx==i).sum()
-                    if dim>0:
-                        x, fr = initialize_latent('PCA', dim, Y)
-                        X[:,idx==i] = x
-                        fracs[idx==i] = fr
-            elif initx=='PCA_joint':
+                    dim = (idx == i).sum()
+                    if dim > 0:
+                        x, fr = initialize_latent("PCA", dim, Y)
+                        X[:, idx == i] = x
+                        fracs[idx == i] = fr
+            elif initx == "PCA_joint":
                y = np.hstack(Ylist)
                from ..util.initialization import initialize_latent
-                X, fracs = initialize_latent('PCA', input_dim, y)
+
+                X, fracs = initialize_latent("PCA", input_dim, y)
            else:
                X = np.random.randn(Ylist[0].shape[0], input_dim)
                fracs = np.ones(input_dim)
        else:
            fracs = np.ones(input_dim)
-            
-    
-        if X_variance is None: # The variance of the variational approximation (S)
-            X_variance = np.random.uniform(0,.1,X.shape)
-            
+
+        if X_variance is None:  # The variance of the variational approximation (S)
+            X_variance = np.random.uniform(0, 0.1, X.shape)
+
        if Gammas is None:
            Gammas = []
            for x in X:
-                gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+                gamma = np.empty_like(
+                    X
+                )  # The posterior probabilities of the binary variable in the variational approximation
                gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
-                gamma[gamma>1.-1e-9] = 1.-1e-9
-                gamma[gamma<1e-9] = 1e-9
+                gamma[gamma > 1.0 - 1e-9] = 1.0 - 1e-9
+                gamma[gamma < 1e-9] = 1e-9
                Gammas.append(gamma)
        return X, X_variance, Gammas, fracs

    @Model.optimizer_array.setter
    def optimizer_array(self, p):
        if self.mpi_comm != None:
-            if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0:
-                self.mpi_comm.Bcast(np.int32(1),root=0)
-            self.mpi_comm.Bcast(p, root=0)        
-        Model.optimizer_array.fset(self,p)
-        
+            if self._IN_OPTIMIZATION_ and self.mpi_comm.rank == 0:
+                self.mpi_comm.Bcast(np.int32(1), root=0)
+            self.mpi_comm.Bcast(p, root=0)
+        Model.optimizer_array.fset(self, p)
+
    def optimize(self, optimizer=None, start=None, **kwargs):
        self._IN_OPTIMIZATION_ = True
-        if self.mpi_comm==None:
-            super(SSMRD, self).optimize(optimizer,start,**kwargs)
-        elif self.mpi_comm.rank==0:
-            super(SSMRD, self).optimize(optimizer,start,**kwargs)
-            self.mpi_comm.Bcast(np.int32(-1),root=0)
-        elif self.mpi_comm.rank>0:
+        if self.mpi_comm == None:
+            super(SSMRD, self).optimize(optimizer, start, **kwargs)
+        elif self.mpi_comm.rank == 0:
+            super(SSMRD, self).optimize(optimizer, start, **kwargs)
+            self.mpi_comm.Bcast(np.int32(-1), root=0)
+        elif self.mpi_comm.rank > 0:
            x = self.optimizer_array.copy()
-            flag = np.empty(1,dtype=np.int32)
+            flag = np.empty(1, dtype=np.int32)
            while True:
-                self.mpi_comm.Bcast(flag,root=0)
-                if flag==1:
+                self.mpi_comm.Bcast(flag, root=0)
+                if flag == 1:
                    try:
                        self.optimizer_array = x
                        self._fail_count = 0
@ -145,29 +209,51 @@ class SSMRD(Model):
                        if self._fail_count >= self._allowed_failures:
                            raise
                        self._fail_count += 1
-                elif flag==-1:
+                elif flag == -1:
                    break
                else:
                    self._IN_OPTIMIZATION_ = False
                    raise Exception("Unrecognizable flag for synchronization!")
        self._IN_OPTIMIZATION_ = False
-        
+

 class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
-    def __init__(self, nModels, pi=0.5, learnPi=False, group_spike=True, variance = 1.0, name='SSMRDPrior', **kw):
+    def __init__(
+        self,
+        nModels,
+        pi=0.5,
+        learnPi=False,
+        group_spike=True,
+        variance=1.0,
+        name="SSMRDPrior",
+        **kw
+    ):
        self.nModels = nModels
        self._b_prob_all = 0.5
-        super(SpikeAndSlabPrior_SSMRD, self).__init__(pi=pi,learnPi=learnPi,group_spike=group_spike,variance=variance, name=name, **kw)
-    
+        super(SpikeAndSlabPrior_SSMRD, self).__init__(
+            pi=pi,
+            learnPi=learnPi,
+            group_spike=group_spike,
+            variance=variance,
+            name=name,
+            **kw
+        )
+
    def _update_inernal(self, varp_list):
        """Make an update of the internal status by gathering the variational posteriors for all the individual models."""
        # The probability for the binary variable for the same latent dimension of any of the models is on.
        if self.group_spike:
-            self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
-            [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+            self._b_prob_all = 1.0 - param_to_array(varp_list[0].gamma_group)
+            [
+                np.multiply(self._b_prob_all, 1.0 - vp.gamma_group, self._b_prob_all)
+                for vp in varp_list[1:]
+            ]
        else:
-            self._b_prob_all = 1.-param_to_array(varp_list[0].binary_prob)
-            [np.multiply(self._b_prob_all, 1.-vp.binary_prob, self._b_prob_all) for vp in varp_list[1:]]            
+            self._b_prob_all = 1.0 - param_to_array(varp_list[0].binary_prob)
+            [
+                np.multiply(self._b_prob_all, 1.0 - vp.binary_prob, self._b_prob_all)
+                for vp in varp_list[1:]
+            ]

    def KL_divergence(self, variational_posterior):
        mu = variational_posterior.mean
@ -176,16 +262,20 @@ class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
            gamma = variational_posterior.binary_prob[0]
        else:
            gamma = variational_posterior.binary_prob
-        if len(self.pi.shape)==2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+        if len(self.pi.shape) == 2:
+            idx = np.unique(gamma._raveled_index() // gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi

-        var_mean = np.square(mu)/self.variance
-        var_S = (S/self.variance - np.log(S))
-        var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
-        return var_gamma +((1.-self._b_prob_all)*(np.log(self.variance)-1. +var_mean + var_S)).sum()/(2.*self.nModels)
+        var_mean = np.square(mu) / self.variance
+        var_S = S / self.variance - np.log(S)
+        var_gamma = (gamma * np.log(gamma / pi)).sum() + (
+            (1 - gamma) * np.log((1 - gamma) / (1 - pi))
+        ).sum()
+        return var_gamma + (
+            (1.0 - self._b_prob_all) * (np.log(self.variance) - 1.0 + var_mean + var_S)
+        ).sum() / (2.0 * self.nModels)

    def update_gradients_KL(self, variational_posterior):
        mu = variational_posterior.mean
@ -195,63 +285,141 @@ class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
            gamma = variational_posterior.binary_prob.values[0]
        else:
            gamma = variational_posterior.binary_prob.values
-        if len(self.pi.shape)==2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+        if len(self.pi.shape) == 2:
+            idx = np.unique(gamma._raveled_index() // gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi

        if self.group_spike:
-            tmp = self._b_prob_all/(1.-gamma)
-            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))/N +tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+            tmp = self._b_prob_all / (1.0 - gamma)
+            variational_posterior.binary_prob.gradient -= (
+                np.log((1 - pi) / pi * gamma / (1.0 - gamma)) / N
+                + tmp
+                * (
+                    (np.square(mu) + S) / self.variance
+                    - np.log(S)
+                    + np.log(self.variance)
+                    - 1.0
+                )
+                / 2.0
+            )
        else:
-            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
-        mu.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
-        S.gradient -= (1./self.variance - 1./S) * (1.-self._b_prob_all) /(2.*self.nModels)
+            variational_posterior.binary_prob.gradient -= (
+                np.log((1 - pi) / pi * gamma / (1.0 - gamma))
+                + (
+                    (np.square(mu) + S) / self.variance
+                    - np.log(S)
+                    + np.log(self.variance)
+                    - 1.0
+                )
+                / 2.0
+            )
+        mu.gradient -= (1.0 - self._b_prob_all) * mu / (self.variance * self.nModels)
+        S.gradient -= (
+            (1.0 / self.variance - 1.0 / S)
+            * (1.0 - self._b_prob_all)
+            / (2.0 * self.nModels)
+        )
        if self.learnPi:
-            raise 'Not Supported!'
+            raise NotImplementedError("Not Supported!")
+

 class IBPPrior_SSMRD(VariationalPrior):
-    def __init__(self, nModels, input_dim, alpha =2., tau=None, name='IBPPrior', **kw):
+    def __init__(self, nModels, input_dim, alpha=2.0, tau=None, name="IBPPrior", **kw):
        super(IBPPrior_SSMRD, self).__init__(name=name, **kw)
-        from paramz.transformations import Logexp, __fixed__  
+        from paramz.transformations import Logexp, __fixed__
+
        self.nModels = nModels
        self._b_prob_all = 0.5
        self.input_dim = input_dim
-        self.variance = 1.
-        self.alpha = Param('alpha', alpha, __fixed__)
+        self.variance = 1.0
+        self.alpha = Param("alpha", alpha, __fixed__)
        self.link_parameter(self.alpha)
-        
+
    def _update_inernal(self, varp_list):
        """Make an update of the internal status by gathering the variational posteriors for all the individual models."""
        # The probability for the binary variable for the same latent dimension of any of the models is on.
-        self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
-        [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+        self._b_prob_all = 1.0 - param_to_array(varp_list[0].gamma_group)
+        [
+            np.multiply(self._b_prob_all, 1.0 - vp.gamma_group, self._b_prob_all)
+            for vp in varp_list[1:]
+        ]

    def KL_divergence(self, variational_posterior):
-        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
-            
-        var_mean = np.square(mu)/self.variance
-        var_S = (S/self.variance - np.log(S))
-        part1 = ((1.-self._b_prob_all)* (np.log(self.variance)-1. +var_mean + var_S)).sum()/(2.*self.nModels)
-        
-        ad = self.alpha/self.input_dim
-        from scipy.special import betaln,digamma
-        part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + (betaln(ad,1.)*self.input_dim -betaln(tau[:,0], tau[:,1]).sum())/self.nModels \
-                 + (( (tau[:,0]-ad)/self.nModels -gamma)*digamma(tau[:,0])).sum() + \
-                (((tau[:,1]-1.)/self.nModels+gamma-1.)*digamma(tau[:,1])).sum() + (((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*digamma(tau.sum(axis=1))).sum()
-        return part1+part2
+        mu, S, gamma, tau = (
+            variational_posterior.mean.values,
+            variational_posterior.variance.values,
+            variational_posterior.gamma_group.values,
+            variational_posterior.tau.values,
+        )
+
+        var_mean = np.square(mu) / self.variance
+        var_S = S / self.variance - np.log(S)
+        part1 = (
+            (1.0 - self._b_prob_all) * (np.log(self.variance) - 1.0 + var_mean + var_S)
+        ).sum() / (2.0 * self.nModels)
+
+        ad = self.alpha / self.input_dim
+        from scipy.special import betaln, digamma
+
+        part2 = (
+            (gamma * np.log(gamma)).sum()
+            + ((1.0 - gamma) * np.log(1.0 - gamma)).sum()
+            + (betaln(ad, 1.0) * self.input_dim - betaln(tau[:, 0], tau[:, 1]).sum())
+            / self.nModels
+            + (((tau[:, 0] - ad) / self.nModels - gamma) * digamma(tau[:, 0])).sum()
+            + (
+                ((tau[:, 1] - 1.0) / self.nModels + gamma - 1.0) * digamma(tau[:, 1])
+            ).sum()
+            + (
+                ((1.0 + ad - tau[:, 0] - tau[:, 1]) / self.nModels + 1.0)
+                * digamma(tau.sum(axis=1))
+            ).sum()
+        )
+        return part1 + part2

    def update_gradients_KL(self, variational_posterior):
-        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+        mu, S, gamma, tau = (
+            variational_posterior.mean.values,
+            variational_posterior.variance.values,
+            variational_posterior.gamma_group.values,
+            variational_posterior.tau.values,
+        )

-        variational_posterior.mean.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
-        variational_posterior.variance.gradient -= (1./self.variance - 1./S) * (1.-self._b_prob_all) /(2.*self.nModels)
-        from scipy.special import digamma,polygamma
-        tmp = self._b_prob_all/(1.-gamma)
-        dgamma = (np.log(gamma/(1.-gamma))+ digamma(tau[:,1])-digamma(tau[:,0]))/variational_posterior.num_data
-        variational_posterior.binary_prob.gradient -= dgamma+tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
-        ad = self.alpha/self.input_dim
-        common = ((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*polygamma(1,tau.sum(axis=1))
-        variational_posterior.tau.gradient[:,0] = -(((tau[:,0]-ad)/self.nModels -gamma)*polygamma(1,tau[:,0])+common)
-        variational_posterior.tau.gradient[:,1] = -(((tau[:,1]-1.)/self.nModels+gamma-1.)*polygamma(1,tau[:,1])+common)
+        variational_posterior.mean.gradient -= (
+            (1.0 - self._b_prob_all) * mu / (self.variance * self.nModels)
+        )
+        variational_posterior.variance.gradient -= (
+            (1.0 / self.variance - 1.0 / S)
+            * (1.0 - self._b_prob_all)
+            / (2.0 * self.nModels)
+        )
+        from scipy.special import digamma, polygamma
+
+        tmp = self._b_prob_all / (1.0 - gamma)
+        dgamma = (
+            np.log(gamma / (1.0 - gamma)) + digamma(tau[:, 1]) - digamma(tau[:, 0])
+        ) / variational_posterior.num_data
+        variational_posterior.binary_prob.gradient -= (
+            dgamma
+            + tmp
+            * (
+                (np.square(mu) + S) / self.variance
+                - np.log(S)
+                + np.log(self.variance)
+                - 1.0
+            )
+            / 2.0
+        )
+        ad = self.alpha / self.input_dim
+        common = ((1.0 + ad - tau[:, 0] - tau[:, 1]) / self.nModels + 1.0) * polygamma(
+            1, tau.sum(axis=1)
+        )
+        variational_posterior.tau.gradient[:, 0] = -(
+            ((tau[:, 0] - ad) / self.nModels - gamma) * polygamma(1, tau[:, 0]) + common
+        )
+        variational_posterior.tau.gradient[:, 1] = -(
+            ((tau[:, 1] - 1.0) / self.nModels + gamma - 1.0) * polygamma(1, tau[:, 1])
+            + common
+        )
--- a/GPy/models/state_space_main.py
+++ b/GPy/models/state_space_main.py
--- a/GPy/old_tests/bcgplvm_tests.py
+++ b/GPy/old_tests/bcgplvm_tests.py
@ -17,7 +17,7 @@ class BCGPLVMTests(unittest.TestCase):
        mapping = GPy.mappings.Kernel(output_dim=input_dim, X=Y, kernel=bk)
        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
        
    def test_linear_backconstraint(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
@ -30,7 +30,7 @@ class BCGPLVMTests(unittest.TestCase):
        mapping = GPy.mappings.Linear(output_dim=input_dim, input_dim=output_dim)
        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
        
    def test_mlp_backconstraint(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
@ -43,7 +43,7 @@ class BCGPLVMTests(unittest.TestCase):
        mapping = GPy.mappings.MLP(output_dim=input_dim, input_dim=output_dim, hidden_dim=[5, 4, 7])
        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

 if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
--- a/GPy/old_tests/gp_transformation_tests.py
+++ b/GPy/old_tests/gp_transformation_tests.py
@ -1,4 +1,3 @@
-from nose.tools import with_setup
 from GPy.models import GradientChecker
 from GPy.likelihoods.noise_models import gp_transformations
 import inspect
--- a/GPy/old_tests/gplvm_tests.py
+++ b/GPy/old_tests/gplvm_tests.py
@ -15,7 +15,7 @@ class GPLVMTests(unittest.TestCase):
        k = GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_linear_kern(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
@ -26,7 +26,7 @@ class GPLVMTests(unittest.TestCase):
        k = GPy.kern.Linear(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_rbf_kern(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
@ -37,7 +37,7 @@ class GPLVMTests(unittest.TestCase):
        k = GPy.kern.RBF(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

 if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
--- a/GPy/old_tests/psi_stat_gradient_tests.py
+++ b/GPy/old_tests/psi_stat_gradient_tests.py
@ -1,8 +1,8 @@
-'''
+"""
 Created on 22 Apr 2013

@author: maxz
-'''
+"""
 import unittest
 import numpy

@ -13,42 +13,66 @@ from GPy.core.parameterization.param import Param
 from GPy.core.parameterization.transformations import Logexp
 from GPy.core.parameterization.variational import NormalPosterior

+
 class PsiStatModel(Model):
    def __init__(self, which, X, X_variance, Z, num_inducing, kernel):
-        super(PsiStatModel, self).__init__(name='psi stat test')
+        super(PsiStatModel, self).__init__(name="psi stat test")
        self.which = which
        self.X = Param("X", X)
-        self.X_variance = Param('X_variance', X_variance, Logexp())
+        self.X_variance = Param("X_variance", X_variance, Logexp())
        self.q = NormalPosterior(self.X, self.X_variance)
        self.Z = Param("Z", Z)
        self.N, self.input_dim = X.shape
        self.num_inducing, input_dim = Z.shape
-        assert self.input_dim == input_dim, "shape missmatch: Z:{!s} X:{!s}".format(Z.shape, X.shape)
+        assert self.input_dim == input_dim, "shape missmatch: Z:{!s} X:{!s}".format(
+            Z.shape, X.shape
+        )
        self.kern = kernel
        self.psi_ = self.kern.__getattribute__(self.which)(self.Z, self.q)
        self.add_parameters(self.q, self.Z, self.kern)

    def log_likelihood(self):
-        return self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance).sum()
+        return self.kern.__getattribute__(self.which)(
+            self.Z, self.X, self.X_variance
+        ).sum()

    def parameters_changed(self):
-        psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(numpy.ones_like(self.psi_), self.Z, self.q)
+        psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(
+            numpy.ones_like(self.psi_), self.Z, self.q
+        )
        self.X.gradient = psimu
        self.X_variance.gradient = psiS
-        #psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim)
-        try: psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.q)
-        except AttributeError: psiZ = numpy.zeros_like(self.Z)
+        # psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim)
+        try:
+            psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(
+                numpy.ones_like(self.psi_), self.Z, self.q
+            )
+        except AttributeError:
+            psiZ = numpy.zeros_like(self.Z)
        self.Z.gradient = psiZ
-        #psiZ = numpy.ones(self.num_inducing * self.input_dim)
-        N,M = self.X.shape[0], self.Z.shape[0]
-        dL_dpsi0, dL_dpsi1, dL_dpsi2 = numpy.zeros([N]), numpy.zeros([N,M]), numpy.zeros([N,M,M])
-        if self.which == 'psi0': dL_dpsi0 += 1
-        if self.which == 'psi1': dL_dpsi1 += 1
-        if self.which == 'psi2': dL_dpsi2 += 1
-        self.kern.update_gradients_variational(numpy.zeros([1,1]),
-                                               dL_dpsi0,
-                                               dL_dpsi1,
-                                               dL_dpsi2, self.X, self.X_variance, self.Z)
+        # psiZ = numpy.ones(self.num_inducing * self.input_dim)
+        N, M = self.X.shape[0], self.Z.shape[0]
+        dL_dpsi0, dL_dpsi1, dL_dpsi2 = (
+            numpy.zeros([N]),
+            numpy.zeros([N, M]),
+            numpy.zeros([N, M, M]),
+        )
+        if self.which == "psi0":
+            dL_dpsi0 += 1
+        if self.which == "psi1":
+            dL_dpsi1 += 1
+        if self.which == "psi2":
+            dL_dpsi2 += 1
+        self.kern.update_gradients_variational(
+            numpy.zeros([1, 1]),
+            dL_dpsi0,
+            dL_dpsi1,
+            dL_dpsi2,
+            self.X,
+            self.X_variance,
+            self.Z,
+        )
+

 class DPsiStatTest(unittest.TestCase):
    input_dim = 5
@ -56,128 +80,206 @@ class DPsiStatTest(unittest.TestCase):
    num_inducing = 10
    input_dim = 20
    X = numpy.random.randn(N, input_dim)
-    X_var = .5 * numpy.ones_like(X) + .4 * numpy.clip(numpy.random.randn(*X.shape), 0, 1)
+    X_var = 0.5 * numpy.ones_like(X) + 0.4 * numpy.clip(
+        numpy.random.randn(*X.shape), 0, 1
+    )
    Z = numpy.random.permutation(X)[:num_inducing]
    Y = X.dot(numpy.random.randn(input_dim, input_dim))
-#     kernels = [GPy.kern.Linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)), GPy.kern.RBF(input_dim, ARD=True), GPy.kern.Bias(input_dim)]
+    #     kernels = [GPy.kern.Linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)), GPy.kern.RBF(input_dim, ARD=True), GPy.kern.Bias(input_dim)]

    kernels = [
-               GPy.kern.Linear(input_dim),
-               GPy.kern.RBF(input_dim),
-               #GPy.kern.Bias(input_dim),
-               #GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim),
-               #GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim)
-               ]
+        GPy.kern.Linear(input_dim),
+        GPy.kern.RBF(input_dim),
+        # GPy.kern.Bias(input_dim),
+        # GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim),
+        # GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim)
+    ]

    def testPsi0(self):
        for k in self.kernels:
-            m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,\
-                             num_inducing=self.num_inducing, kernel=k)
+            m = PsiStatModel(
+                "psi0",
+                X=self.X,
+                X_variance=self.X_var,
+                Z=self.Z,
+                num_inducing=self.num_inducing,
+                kernel=k,
+            )
            m.randomize()
-            assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k._parameters_)))
+            assert m.checkgrad(), "{} x psi0".format(
+                "+".join(map(lambda x: x.name, k._parameters_))
+            )

    def testPsi1(self):
        for k in self.kernels:
-            m = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     num_inducing=self.num_inducing, kernel=k)
+            m = PsiStatModel(
+                "psi1",
+                X=self.X,
+                X_variance=self.X_var,
+                Z=self.Z,
+                num_inducing=self.num_inducing,
+                kernel=k,
+            )
            m.randomize()
-            assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k._parameters_)))
+            assert m.checkgrad(), "{} x psi1".format(
+                "+".join(map(lambda x: x.name, k._parameters_))
+            )

    def testPsi2_lin(self):
        k = self.kernels[0]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                 num_inducing=self.num_inducing, kernel=k)
+        m = PsiStatModel(
+            "psi2",
+            X=self.X,
+            X_variance=self.X_var,
+            Z=self.Z,
+            num_inducing=self.num_inducing,
+            kernel=k,
+        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
+            "+".join(map(lambda x: x.name, k._parameters_))
+        )
+
    def testPsi2_lin_bia(self):
        k = self.kernels[3]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     num_inducing=self.num_inducing, kernel=k)
+        m = PsiStatModel(
+            "psi2",
+            X=self.X,
+            X_variance=self.X_var,
+            Z=self.Z,
+            num_inducing=self.num_inducing,
+            kernel=k,
+        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
+            "+".join(map(lambda x: x.name, k._parameters_))
+        )
+
    def testPsi2_rbf(self):
        k = self.kernels[1]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     num_inducing=self.num_inducing, kernel=k)
+        m = PsiStatModel(
+            "psi2",
+            X=self.X,
+            X_variance=self.X_var,
+            Z=self.Z,
+            num_inducing=self.num_inducing,
+            kernel=k,
+        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
+            "+".join(map(lambda x: x.name, k._parameters_))
+        )
+
    def testPsi2_rbf_bia(self):
        k = self.kernels[-1]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     num_inducing=self.num_inducing, kernel=k)
+        m = PsiStatModel(
+            "psi2",
+            X=self.X,
+            X_variance=self.X_var,
+            Z=self.Z,
+            num_inducing=self.num_inducing,
+            kernel=k,
+        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
+            "+".join(map(lambda x: x.name, k._parameters_))
+        )
+
    def testPsi2_bia(self):
        k = self.kernels[2]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     num_inducing=self.num_inducing, kernel=k)
+        m = PsiStatModel(
+            "psi2",
+            X=self.X,
+            X_variance=self.X_var,
+            Z=self.Z,
+            num_inducing=self.num_inducing,
+            kernel=k,
+        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
+            "+".join(map(lambda x: x.name, k._parameters_))
+        )


 if __name__ == "__main__":
    import sys
-    interactive = 'i' in sys.argv
+
+    interactive = "i" in sys.argv
    if interactive:
-#         N, num_inducing, input_dim, input_dim = 30, 5, 4, 30
-#         X = numpy.random.rand(N, input_dim)
-#         k = GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
-#         K = k.K(X)
-#         Y = numpy.random.multivariate_normal(numpy.zeros(N), K, input_dim).T
-#         Y -= Y.mean(axis=0)
-#         k = GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
-#         m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
-#         m.randomize()
-# #         self.assertTrue(m.checkgrad())
+        #         N, num_inducing, input_dim, input_dim = 30, 5, 4, 30
+        #         X = numpy.random.rand(N, input_dim)
+        #         k = GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
+        #         K = k.K(X)
+        #         Y = numpy.random.multivariate_normal(numpy.zeros(N), K, input_dim).T
+        #         Y -= Y.mean(axis=0)
+        #         k = GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
+        #         m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
+        #         m.randomize()
+        # #         assert m.checkgrad()
        numpy.random.seed(0)
        input_dim = 3
        N = 3
        num_inducing = 2
        D = 15
        X = numpy.random.randn(N, input_dim)
-        X_var = .5 * numpy.ones_like(X) + .1 * numpy.clip(numpy.random.randn(*X.shape), 0, 1)
+        X_var = 0.5 * numpy.ones_like(X) + 0.1 * numpy.clip(
+            numpy.random.randn(*X.shape), 0, 1
+        )
        Z = numpy.random.permutation(X)[:num_inducing]
        Y = X.dot(numpy.random.randn(input_dim, D))
-#         kernel = GPy.kern.Bias(input_dim)
-#
-#         kernels = [GPy.kern.Linear(input_dim), GPy.kern.RBF(input_dim), GPy.kern.Bias(input_dim),
-#                GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim),
-#                GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim)]
+        #         kernel = GPy.kern.Bias(input_dim)
+        #
+        #         kernels = [GPy.kern.Linear(input_dim), GPy.kern.RBF(input_dim), GPy.kern.Bias(input_dim),
+        #                GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim),
+        #                GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim)]

-#         for k in kernels:
-#             m = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
-#                      num_inducing=num_inducing, kernel=k)
-#             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
-#
-        m0 = PsiStatModel('psi0', X=X, X_variance=X_var, Z=Z,
-                         num_inducing=num_inducing, kernel=GPy.kern.RBF(input_dim)+GPy.kern.Bias(input_dim))
-#         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=kernel)
-#         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=kernel)
-#         m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=GPy.kern.RBF(input_dim))
-#         m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=GPy.kern.Linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
+        #         for k in kernels:
+        #             m = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
+        #                      num_inducing=num_inducing, kernel=k)
+        #             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
+        #
+        m0 = PsiStatModel(
+            "psi0",
+            X=X,
+            X_variance=X_var,
+            Z=Z,
+            num_inducing=num_inducing,
+            kernel=GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim),
+        )
+        #         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
+        #                          num_inducing=num_inducing, kernel=kernel)
+        #         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
+        #                          num_inducing=num_inducing, kernel=kernel)
+        #         m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+        #                          num_inducing=num_inducing, kernel=GPy.kern.RBF(input_dim))
+        #         m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+        #                          num_inducing=num_inducing, kernel=GPy.kern.Linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
        # + GPy.kern.Bias(input_dim))
-#         m = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing,
-#                          kernel=(
-#             GPy.kern.RBF(input_dim, ARD=1)
-#             +GPy.kern.Linear(input_dim, ARD=1)
-#             +GPy.kern.Bias(input_dim))
-#                          )
-#         m.ensure_default_constraints()
-        m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-                         num_inducing=num_inducing, kernel=(
-            GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1)
-            #+GPy.kern.Linear(input_dim, numpy.random.rand(input_dim), ARD=1)
-            #+GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1)
-            #+GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(), ARD=0)
-            +GPy.kern.Bias(input_dim)
-            +GPy.kern.White(input_dim)
-            )
-            )
-        #m2.ensure_default_constraints()
+        #         m = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+        #                          num_inducing=num_inducing,
+        #                          kernel=(
+        #             GPy.kern.RBF(input_dim, ARD=1)
+        #             +GPy.kern.Linear(input_dim, ARD=1)
+        #             +GPy.kern.Bias(input_dim))
+        #                          )
+        #         m.ensure_default_constraints()
+        m2 = PsiStatModel(
+            "psi2",
+            X=X,
+            X_variance=X_var,
+            Z=Z,
+            num_inducing=num_inducing,
+            kernel=(
+                GPy.kern.RBF(
+                    input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1
+                )
+                # +GPy.kern.Linear(input_dim, numpy.random.rand(input_dim), ARD=1)
+                # +GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1)
+                # +GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(), ARD=0)
+                + GPy.kern.Bias(input_dim)
+                + GPy.kern.White(input_dim)
+            ),
+        )
+        # m2.ensure_default_constraints()
    else:
        unittest.main()
--- a/GPy/old_tests/sparse_gplvm_tests.py
+++ b/GPy/old_tests/sparse_gplvm_tests.py
@ -16,7 +16,7 @@ class sparse_GPLVMTests(unittest.TestCase):
        k = GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_linear_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
@ -27,7 +27,7 @@ class sparse_GPLVMTests(unittest.TestCase):
        k = GPy.kern.Linear(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_rbf_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
@ -38,7 +38,7 @@ class sparse_GPLVMTests(unittest.TestCase):
        k = GPy.kern.RBF(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

 if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
--- a/GPy/plotting/matplot_dep/base_plots.py
+++ b/GPy/plotting/matplot_dep/base_plots.py
@ -5,6 +5,7 @@ import numpy as np

 from .util import align_subplot_array, align_subplots

+
 def ax_default(fignum, ax):
    if ax is None:
        fig = plt.figure(fignum)
@ -13,11 +14,23 @@ def ax_default(fignum, ax):
        fig = ax.figure
    return fig, ax

-def meanplot(x, mu, color='#3300FF', ax=None, fignum=None, linewidth=2,**kw):
-    _, axes = ax_default(fignum, ax)
-    return axes.plot(x,mu,color=color,linewidth=linewidth,**kw)

-def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, fignum=None, **kwargs):
+def meanplot(x, mu, color="#3300FF", ax=None, fignum=None, linewidth=2, **kw):
+    _, axes = ax_default(fignum, ax)
+    return axes.plot(x, mu, color=color, linewidth=linewidth, **kw)
+
+
+def gpplot(
+    x,
+    mu,
+    lower,
+    upper,
+    edgecol="#3300FF",
+    fillcol="#33CCFF",
+    ax=None,
+    fignum=None,
+    **kwargs
+):
    _, axes = ax_default(fignum, ax)

    mu = mu.flatten()
@ -27,51 +40,62 @@ def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, f

    plots = []

-    #here's the mean
+    # here's the mean
    plots.append(meanplot(x, mu, edgecol, axes))

-    #here's the box
-    kwargs['linewidth']=0.5
-    if not 'alpha' in kwargs.keys():
-        kwargs['alpha'] = 0.3
-    plots.append(axes.fill(np.hstack((x,x[::-1])),np.hstack((upper,lower[::-1])),color=fillcol,**kwargs))
+    # here's the box
+    kwargs["linewidth"] = 0.5
+    if not "alpha" in kwargs.keys():
+        kwargs["alpha"] = 0.3
+    plots.append(
+        axes.fill(
+            np.hstack((x, x[::-1])),
+            np.hstack((upper, lower[::-1])),
+            color=fillcol,
+            **kwargs
+        )
+    )

-    #this is the edge:
-    plots.append(meanplot(x, upper,color=edgecol, linewidth=0.2, ax=axes))
-    plots.append(meanplot(x, lower,color=edgecol, linewidth=0.2, ax=axes))
+    # this is the edge:
+    plots.append(meanplot(x, upper, color=edgecol, linewidth=0.2, ax=axes))
+    plots.append(meanplot(x, lower, color=edgecol, linewidth=0.2, ax=axes))

    return plots

+
 def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
    _, ax = ax_default(fignum, ax)

    plots = []

-    #here's the box
-    if 'linewidth' not in kwargs:
-        kwargs['linewidth'] = 0.5
-    if not 'alpha' in kwargs.keys():
-        kwargs['alpha'] = 1./(len(percentiles))
+    # here's the box
+    if "linewidth" not in kwargs:
+        kwargs["linewidth"] = 0.5
+    if not "alpha" in kwargs.keys():
+        kwargs["alpha"] = 1.0 / (len(percentiles))

    # pop where from kwargs
-    where = kwargs.pop('where') if 'where' in kwargs else None
+    where = kwargs.pop("where") if "where" in kwargs else None
    # pop interpolate, which we actually do not do here!
-    if 'interpolate' in kwargs: kwargs.pop('interpolate')
+    if "interpolate" in kwargs:
+        kwargs.pop("interpolate")

    def pairwise(inlist):
        l = len(inlist)
-        for i in range(int(np.ceil(l/2.))):
-            yield inlist[:][i], inlist[:][(l-1)-i]
+        for i in range(int(np.ceil(l / 2.0))):
+            yield inlist[:][i], inlist[:][(l - 1) - i]

    polycol = []
    for y1, y2 in pairwise(percentiles):
        import matplotlib.mlab as mlab
+
        # Handle united data, such as dates
        ax._process_unit_info(xdata=x, ydata=y1)
        ax._process_unit_info(ydata=y2)

        # Convert the arrays so we can work with them
        from numpy import ma
+
        x = ma.masked_invalid(ax.convert_xunits(x))
        y1 = ma.masked_invalid(ax.convert_yunits(y1))
        y2 = ma.masked_invalid(ax.convert_yunits(y2))
@ -103,7 +127,7 @@ def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
                continue

            N = len(xslice)
-            X = np.zeros((2 * N + 2, 2), np.float)
+            X = np.zeros((2 * N + 2, 2), float)

            # the purpose of the next two lines is for when y2 is a
            # scalar like 0 and we want the fill to go all the way
@ -114,19 +138,21 @@ def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
            X[0] = start
            X[N + 1] = end

-            X[1:N + 1, 0] = xslice
-            X[1:N + 1, 1] = y1slice
-            X[N + 2:, 0] = xslice[::-1]
-            X[N + 2:, 1] = y2slice[::-1]
+            X[1 : N + 1, 0] = xslice
+            X[1 : N + 1, 1] = y1slice
+            X[N + 2 :, 0] = xslice[::-1]
+            X[N + 2 :, 1] = y2slice[::-1]

            polys.append(X)
        polycol.extend(polys)
    from matplotlib.collections import PolyCollection
+
    plots.append(PolyCollection(polycol, **kwargs))
    ax.add_collection(plots[-1], autolim=True)
    ax.autoscale_view()
    return plots

+
 def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
    _, axes = ax_default(fignum, ax)

@ -138,17 +164,19 @@ def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
    plots = []

    if edgecol is None:
-        edgecol='#3300FF'
+        edgecol = "#3300FF"

-    if not 'alpha' in kwargs.keys():
-        kwargs['alpha'] = 1.
+    if not "alpha" in kwargs.keys():
+        kwargs["alpha"] = 1.0

+    if not "lw" in kwargs.keys():
+        kwargs["lw"] = 1.0

-    if not 'lw' in kwargs.keys():
-        kwargs['lw'] = 1.
-
-
-    plots.append(axes.errorbar(x,mu,yerr=np.vstack([mu-lower,upper-mu]),color=edgecol,**kwargs))
+    plots.append(
+        axes.errorbar(
+            x, mu, yerr=np.vstack([mu - lower, upper - mu]), color=edgecol, **kwargs
+        )
+    )
    plots[-1][0].remove()
    return plots

@ -156,53 +184,60 @@ def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
 def removeRightTicks(ax=None):
    ax = ax or plt.gca()
    for i, line in enumerate(ax.get_yticklines()):
-        if i%2 == 1:   # odd indices
+        if i % 2 == 1:  # odd indices
            line.set_visible(False)

+
 def removeUpperTicks(ax=None):
    ax = ax or plt.gca()
    for i, line in enumerate(ax.get_xticklines()):
-        if i%2 == 1:   # odd indices
+        if i % 2 == 1:  # odd indices
            line.set_visible(False)

-def fewerXticks(ax=None,divideby=2):
+
+def fewerXticks(ax=None, divideby=2):
    ax = ax or plt.gca()
    ax.set_xticks(ax.get_xticks()[::divideby])

-def x_frame1D(X,plot_limits=None,resolution=None):
+
+def x_frame1D(X, plot_limits=None, resolution=None):
    """
    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
    """
-    assert X.shape[1] ==1, "x_frame1D is defined for one-dimensional inputs"
+    assert X.shape[1] == 1, "x_frame1D is defined for one-dimensional inputs"
    if plot_limits is None:
        from ...core.parameterization.variational import VariationalPosterior
+
        if isinstance(X, VariationalPosterior):
-            xmin,xmax = X.mean.min(0),X.mean.max(0)
+            xmin, xmax = X.mean.min(0), X.mean.max(0)
        else:
-            xmin,xmax = X.min(0),X.max(0)
-        xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
-    elif len(plot_limits)==2:
+            xmin, xmax = X.min(0), X.max(0)
+        xmin, xmax = xmin - 0.2 * (xmax - xmin), xmax + 0.2 * (xmax - xmin)
+    elif len(plot_limits) == 2:
        xmin, xmax = plot_limits
    else:
        raise ValueError("Bad limits for plotting")

-    Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
+    Xnew = np.linspace(xmin, xmax, resolution or 200)[:, None]
    return Xnew, xmin, xmax

-def x_frame2D(X,plot_limits=None,resolution=None):
+
+def x_frame2D(X, plot_limits=None, resolution=None):
    """
    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
    """
-    assert X.shape[1] ==2, "x_frame2D is defined for two-dimensional inputs"
+    assert X.shape[1] == 2, "x_frame2D is defined for two-dimensional inputs"
    if plot_limits is None:
-        xmin,xmax = X.min(0),X.max(0)
-        xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
-    elif len(plot_limits)==2:
+        xmin, xmax = X.min(0), X.max(0)
+        xmin, xmax = xmin - 0.2 * (xmax - xmin), xmax + 0.2 * (xmax - xmin)
+    elif len(plot_limits) == 2:
        xmin, xmax = plot_limits
    else:
        raise ValueError("Bad limits for plotting")

    resolution = resolution or 50
-    xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
-    Xnew = np.vstack((xx.flatten(),yy.flatten())).T
+    xx, yy = np.mgrid[
+        xmin[0] : xmax[0] : 1j * resolution, xmin[1] : xmax[1] : 1j * resolution
+    ]
+    Xnew = np.vstack((xx.flatten(), yy.flatten())).T
    return Xnew, xx, yy, xmin, xmax
--- a/GPy/plotting/matplot_dep/plot_definitions.py
+++ b/GPy/plotting/matplot_dep/plot_definitions.py
@ -1,4 +1,4 @@
-#===============================================================================
+# ===============================================================================
 # Copyright (c) 2015, Max Zwiessele
 # All rights reserved.
 #
@ -26,7 +26,7 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
+# ===============================================================================
 import numpy as np
 from matplotlib import pyplot as plt
 from ..abstract_plotting_library import AbstractPlottingLibrary
@ -37,6 +37,7 @@ from .controllers import ImshowController, ImAnnotateController
 import itertools
 from .util import legend_ontop

+
 class MatplotlibPlots(AbstractPlottingLibrary):
    def __init__(self):
        super(MatplotlibPlots, self).__init__()
@ -49,54 +50,86 @@ class MatplotlibPlots(AbstractPlottingLibrary):
        fig.gridspec = plt.GridSpec(rows, cols, **gridspec_kwargs)
        return fig

-    def new_canvas(self, figure=None, row=1, col=1, projection='2d', xlabel=None, ylabel=None, zlabel=None, title=None, xlim=None, ylim=None, zlim=None, **kwargs):
-        if projection == '3d':
+    def new_canvas(
+        self,
+        figure=None,
+        row=1,
+        col=1,
+        projection="2d",
+        xlabel=None,
+        ylabel=None,
+        zlabel=None,
+        title=None,
+        xlim=None,
+        ylim=None,
+        zlim=None,
+        **kwargs
+    ):
+        if projection == "3d":
            from mpl_toolkits.mplot3d import Axes3D
-        elif projection == '2d':
+        elif projection == "2d":
            projection = None
-        if 'ax' in kwargs:
-            ax = kwargs.pop('ax')
+        if "ax" in kwargs:
+            ax = kwargs.pop("ax")
        else:
            if figure is not None:
                fig = figure
-            elif 'num' in kwargs and 'figsize' in kwargs:
-                fig = self.figure(num=kwargs.pop('num'), figsize=kwargs.pop('figsize'))
-            elif 'num' in kwargs:
-                fig = self.figure(num=kwargs.pop('num'))
-            elif 'figsize' in kwargs:
-                fig = self.figure(figsize=kwargs.pop('figsize'))
+            elif "num" in kwargs and "figsize" in kwargs:
+                fig = self.figure(num=kwargs.pop("num"), figsize=kwargs.pop("figsize"))
+            elif "num" in kwargs:
+                fig = self.figure(num=kwargs.pop("num"))
+            elif "figsize" in kwargs:
+                fig = self.figure(figsize=kwargs.pop("figsize"))
            else:
                fig = self.figure()

-            #if hasattr(fig, 'rows') and hasattr(fig, 'cols'):
-            ax = fig.add_subplot(fig.gridspec[row-1, col-1], projection=projection)
+            # if hasattr(fig, 'rows') and hasattr(fig, 'cols'):
+            ax = fig.add_subplot(fig.gridspec[row - 1, col - 1], projection=projection)

-        if xlim is not None: ax.set_xlim(xlim)
-        if ylim is not None: ax.set_ylim(ylim)
-        if xlabel is not None: ax.set_xlabel(xlabel)
-        if ylabel is not None: ax.set_ylabel(ylabel)
-        if title is not None: ax.set_title(title)
-        if projection == '3d':
-            if zlim is not None: ax.set_zlim(zlim)
-            if zlabel is not None: ax.set_zlabel(zlabel)
+        if xlim is not None:
+            ax.set_xlim(xlim)
+        if ylim is not None:
+            ax.set_ylim(ylim)
+        if xlabel is not None:
+            ax.set_xlabel(xlabel)
+        if ylabel is not None:
+            ax.set_ylabel(ylabel)
+        if title is not None:
+            ax.set_title(title)
+        if projection == "3d":
+            if zlim is not None:
+                ax.set_zlim(zlim)
+            if zlabel is not None:
+                ax.set_zlabel(zlabel)
        return ax, kwargs

    def add_to_canvas(self, ax, plots, legend=False, title=None, **kwargs):
-        #ax.autoscale_view()
-        fontdict=dict(family='sans-serif', weight='light', size=9)
+        # ax.autoscale_view()
+        fontdict = dict(family="sans-serif", weight="light", size=9)
        if legend is True:
            ax.legend(*ax.get_legend_handles_labels())
        elif legend >= 1:
-            #ax.legend(prop=fontdict)
+            # ax.legend(prop=fontdict)
            legend_ontop(ax, ncol=legend, fontdict=fontdict)
-        if title is not None: ax.figure.suptitle(title)
+        if title is not None:
+            ax.figure.suptitle(title)
        return plots

    def show_canvas(self, ax, **kwargs):
        ax.figure.canvas.draw()
        return ax.figure

-    def scatter(self, ax, X, Y, Z=None, color=Tango.colorsHex['mediumBlue'], label=None, marker='o', **kwargs):
+    def scatter(
+        self,
+        ax,
+        X,
+        Y,
+        Z=None,
+        color=Tango.colorsHex["mediumBlue"],
+        label=None,
+        marker="o",
+        **kwargs
+    ):
        if Z is not None:
            return ax.scatter(X, Y, c=color, zs=Z, label=label, marker=marker, **kwargs)
        return ax.scatter(X, Y, c=color, label=label, marker=marker, **kwargs)
@ -106,129 +139,258 @@ class MatplotlibPlots(AbstractPlottingLibrary):
            return ax.plot(X, Y, color=color, zs=Z, label=label, **kwargs)
        return ax.plot(X, Y, color=color, label=label, **kwargs)

-    def plot_axis_lines(self, ax, X, color=Tango.colorsHex['darkRed'], label=None, **kwargs):
+    def plot_axis_lines(
+        self, ax, X, color=Tango.colorsHex["darkRed"], label=None, **kwargs
+    ):
        from matplotlib import transforms
        from matplotlib.path import Path
-        if 'marker' not in kwargs:
-            kwargs['marker'] = Path([[-.2,0.],    [-.2,.5],    [0.,1.],    [.2,.5],     [.2,0.],     [-.2,0.]],
-                                    [Path.MOVETO, Path.LINETO, Path.LINETO, Path.LINETO, Path.LINETO, Path.CLOSEPOLY])
-        if 'transform' not in kwargs:
+
+        if "marker" not in kwargs:
+            kwargs["marker"] = Path(
+                [
+                    [-0.2, 0.0],
+                    [-0.2, 0.5],
+                    [0.0, 1.0],
+                    [0.2, 0.5],
+                    [0.2, 0.0],
+                    [-0.2, 0.0],
+                ],
+                [
+                    Path.MOVETO,
+                    Path.LINETO,
+                    Path.LINETO,
+                    Path.LINETO,
+                    Path.LINETO,
+                    Path.CLOSEPOLY,
+                ],
+            )
+        if "transform" not in kwargs:
            if X.shape[1] == 1:
-                kwargs['transform'] = transforms.blended_transform_factory(ax.transData, ax.transAxes)
+                kwargs["transform"] = transforms.blended_transform_factory(
+                    ax.transData, ax.transAxes
+                )
        if X.shape[1] == 2:
-            return ax.scatter(X[:,0], X[:,1], ax.get_zlim()[0], c=color, label=label, **kwargs)
+            return ax.scatter(
+                X[:, 0], X[:, 1], ax.get_zlim()[0], c=color, label=label, **kwargs
+            )
        return ax.scatter(X, np.zeros_like(X), c=color, label=label, **kwargs)

-    def barplot(self, ax, x, height, width=0.8, bottom=0, color=Tango.colorsHex['mediumBlue'], label=None, **kwargs):
-        if 'align' not in kwargs:
-            kwargs['align'] = 'center'
-        return ax.bar(x=x, height=height, width=width,
-               bottom=bottom, label=label, color=color,
-               **kwargs)
+    def barplot(
+        self,
+        ax,
+        x,
+        height,
+        width=0.8,
+        bottom=0,
+        color=Tango.colorsHex["mediumBlue"],
+        label=None,
+        **kwargs
+    ):
+        if "align" not in kwargs:
+            kwargs["align"] = "center"
+        return ax.bar(
+            x=x,
+            height=height,
+            width=width,
+            bottom=bottom,
+            label=label,
+            color=color,
+            **kwargs
+        )

-    def xerrorbar(self, ax, X, Y, error, color=Tango.colorsHex['darkRed'], label=None, **kwargs):
-        if not('linestyle' in kwargs or 'ls' in kwargs):
-            kwargs['ls'] = 'none'
-        #if Z is not None:
+    def xerrorbar(
+        self, ax, X, Y, error, color=Tango.colorsHex["darkRed"], label=None, **kwargs
+    ):
+        if not ("linestyle" in kwargs or "ls" in kwargs):
+            kwargs["ls"] = "none"
+        # if Z is not None:
        #    return ax.errorbar(X, Y, Z, xerr=error, ecolor=color, label=label, **kwargs)
        return ax.errorbar(X, Y, xerr=error, ecolor=color, label=label, **kwargs)

-    def yerrorbar(self, ax, X, Y, error, color=Tango.colorsHex['darkRed'], label=None, **kwargs):
-        if not('linestyle' in kwargs or 'ls' in kwargs):
-            kwargs['ls'] = 'none'
-        #if Z is not None:
+    def yerrorbar(
+        self, ax, X, Y, error, color=Tango.colorsHex["darkRed"], label=None, **kwargs
+    ):
+        if not ("linestyle" in kwargs or "ls" in kwargs):
+            kwargs["ls"] = "none"
+        # if Z is not None:
        #    return ax.errorbar(X, Y, Z, yerr=error, ecolor=color, label=label, **kwargs)
        return ax.errorbar(X, Y, yerr=error, ecolor=color, label=label, **kwargs)

-    def imshow(self, ax, X, extent=None, label=None, vmin=None, vmax=None, **imshow_kwargs):
-        if 'origin' not in imshow_kwargs:
-            imshow_kwargs['origin'] = 'lower'
-        #xmin, xmax, ymin, ymax = extent
-        #xoffset, yoffset = (xmax - xmin) / (2. * X.shape[0]), (ymax - ymin) / (2. * X.shape[1])
-        #xmin, xmax, ymin, ymax = extent = xmin-xoffset, xmax+xoffset, ymin-yoffset, ymax+yoffset
-        return ax.imshow(X, label=label, extent=extent, vmin=vmin, vmax=vmax, **imshow_kwargs)
+    def imshow(
+        self, ax, X, extent=None, label=None, vmin=None, vmax=None, **imshow_kwargs
+    ):
+        if "origin" not in imshow_kwargs:
+            imshow_kwargs["origin"] = "lower"
+        # xmin, xmax, ymin, ymax = extent
+        # xoffset, yoffset = (xmax - xmin) / (2. * X.shape[0]), (ymax - ymin) / (2. * X.shape[1])
+        # xmin, xmax, ymin, ymax = extent = xmin-xoffset, xmax+xoffset, ymin-yoffset, ymax+yoffset
+        return ax.imshow(
+            X, label=label, extent=extent, vmin=vmin, vmax=vmax, **imshow_kwargs
+        )

-    def imshow_interact(self, ax, plot_function, extent, label=None, resolution=None, vmin=None, vmax=None, **imshow_kwargs):
-        if imshow_kwargs is None: imshow_kwargs = {}
-        if 'origin' not in imshow_kwargs:
-            imshow_kwargs['origin'] = 'lower'
-        return ImshowController(ax, plot_function, extent, resolution=resolution, vmin=vmin, vmax=vmax, **imshow_kwargs)
+    def imshow_interact(
+        self,
+        ax,
+        plot_function,
+        extent,
+        label=None,
+        resolution=None,
+        vmin=None,
+        vmax=None,
+        **imshow_kwargs
+    ):
+        if imshow_kwargs is None:
+            imshow_kwargs = {}
+        if "origin" not in imshow_kwargs:
+            imshow_kwargs["origin"] = "lower"
+        return ImshowController(
+            ax,
+            plot_function,
+            extent,
+            resolution=resolution,
+            vmin=vmin,
+            vmax=vmax,
+            **imshow_kwargs
+        )

-    def annotation_heatmap(self, ax, X, annotation, extent=None, label=None, imshow_kwargs=None, **annotation_kwargs):
-        if imshow_kwargs is None: imshow_kwargs = {}
-        if 'origin' not in imshow_kwargs:
-            imshow_kwargs['origin'] = 'lower'
-        if ('ha' not in annotation_kwargs) and ('horizontalalignment' not in annotation_kwargs):
-            annotation_kwargs['ha'] = 'center'
-        if ('va' not in annotation_kwargs) and ('verticalalignment' not in annotation_kwargs):
-            annotation_kwargs['va'] = 'center'
+    def annotation_heatmap(
+        self,
+        ax,
+        X,
+        annotation,
+        extent=None,
+        label=None,
+        imshow_kwargs=None,
+        **annotation_kwargs
+    ):
+        if imshow_kwargs is None:
+            imshow_kwargs = {}
+        if "origin" not in imshow_kwargs:
+            imshow_kwargs["origin"] = "lower"
+        if ("ha" not in annotation_kwargs) and (
+            "horizontalalignment" not in annotation_kwargs
+        ):
+            annotation_kwargs["ha"] = "center"
+        if ("va" not in annotation_kwargs) and (
+            "verticalalignment" not in annotation_kwargs
+        ):
+            annotation_kwargs["va"] = "center"
        imshow = self.imshow(ax, X, extent, label, **imshow_kwargs)
        if extent is None:
            extent = (0, X.shape[0], 0, X.shape[1])
        xmin, xmax, ymin, ymax = extent
-        xoffset, yoffset = (xmax - xmin) / (2. * X.shape[0]), (ymax - ymin) / (2. * X.shape[1])
+        xoffset, yoffset = (xmax - xmin) / (2.0 * X.shape[0]), (ymax - ymin) / (
+            2.0 * X.shape[1]
+        )
        xlin = np.linspace(xmin, xmax, X.shape[0], endpoint=False)
        ylin = np.linspace(ymin, ymax, X.shape[1], endpoint=False)
        annotations = []
        for [i, x], [j, y] in itertools.product(enumerate(xlin), enumerate(ylin)):
-            annotations.append(ax.text(x+xoffset, y+yoffset, "{}".format(annotation[j, i]), **annotation_kwargs))
+            annotations.append(
+                ax.text(
+                    x + xoffset,
+                    y + yoffset,
+                    "{}".format(annotation[j, i]),
+                    **annotation_kwargs
+                )
+            )
        return imshow, annotations

-    def annotation_heatmap_interact(self, ax, plot_function, extent, label=None, resolution=15, imshow_kwargs=None, **annotation_kwargs):
-        if imshow_kwargs is None: imshow_kwargs = {}
-        if 'origin' not in imshow_kwargs:
-            imshow_kwargs['origin'] = 'lower'
-        return ImAnnotateController(ax, plot_function, extent, resolution=resolution, imshow_kwargs=imshow_kwargs or {}, **annotation_kwargs)
+    def annotation_heatmap_interact(
+        self,
+        ax,
+        plot_function,
+        extent,
+        label=None,
+        resolution=15,
+        imshow_kwargs=None,
+        **annotation_kwargs
+    ):
+        if imshow_kwargs is None:
+            imshow_kwargs = {}
+        if "origin" not in imshow_kwargs:
+            imshow_kwargs["origin"] = "lower"
+        return ImAnnotateController(
+            ax,
+            plot_function,
+            extent,
+            resolution=resolution,
+            imshow_kwargs=imshow_kwargs or {},
+            **annotation_kwargs
+        )

    def contour(self, ax, X, Y, C, levels=20, label=None, **kwargs):
-        return ax.contour(X, Y, C, levels=np.linspace(C.min(), C.max(), levels), label=label, **kwargs)
+        return ax.contour(
+            X, Y, C, levels=np.linspace(C.min(), C.max(), levels), label=label, **kwargs
+        )

    def surface(self, ax, X, Y, Z, color=None, label=None, **kwargs):
        return ax.plot_surface(X, Y, Z, label=label, **kwargs)

-    def fill_between(self, ax, X, lower, upper, color=Tango.colorsHex['mediumBlue'], label=None, **kwargs):
+    def fill_between(
+        self,
+        ax,
+        X,
+        lower,
+        upper,
+        color=Tango.colorsHex["mediumBlue"],
+        label=None,
+        **kwargs
+    ):
        return ax.fill_between(X, lower, upper, facecolor=color, label=label, **kwargs)

-    def fill_gradient(self, canvas, X, percentiles, color=Tango.colorsHex['mediumBlue'], label=None, **kwargs):
+    def fill_gradient(
+        self,
+        canvas,
+        X,
+        percentiles,
+        color=Tango.colorsHex["mediumBlue"],
+        label=None,
+        **kwargs
+    ):
        ax = canvas
        plots = []

-        if 'edgecolors' not in kwargs:
-            kwargs['edgecolors'] = 'none'
+        if "edgecolors" not in kwargs:
+            kwargs["edgecolors"] = "none"

-        if 'facecolors' in kwargs:
-            color = kwargs.pop('facecolors')
+        if "facecolors" in kwargs:
+            color = kwargs.pop("facecolors")

-        if 'array' in kwargs:
-            array = kwargs.pop('array')
+        if "array" in kwargs:
+            array = kwargs.pop("array")
        else:
-            array = 1.-np.abs(np.linspace(-.97, .97, len(percentiles)-1))
+            array = 1.0 - np.abs(np.linspace(-0.97, 0.97, len(percentiles) - 1))

-        if 'alpha' in kwargs:
-            alpha = kwargs.pop('alpha')
+        if "alpha" in kwargs:
+            alpha = kwargs.pop("alpha")
        else:
-            alpha = .8
+            alpha = 0.8

-        if 'cmap' in kwargs:
-            cmap = kwargs.pop('cmap')
+        if "cmap" in kwargs:
+            cmap = kwargs.pop("cmap")
        else:
-            cmap = LinearSegmentedColormap.from_list('WhToColor', (color, color), N=array.size)
+            cmap = LinearSegmentedColormap.from_list(
+                "WhToColor", (color, color), N=array.size
+            )
        cmap._init()
-        cmap._lut[:-3, -1] = alpha*array
+        cmap._lut[:-3, -1] = alpha * array

-        kwargs['facecolors'] = [cmap(i) for i in np.linspace(0,1,cmap.N)]
+        kwargs["facecolors"] = [cmap(i) for i in np.linspace(0, 1, cmap.N)]

        # pop where from kwargs
-        where = kwargs.pop('where') if 'where' in kwargs else None
+        where = kwargs.pop("where") if "where" in kwargs else None
        # pop interpolate, which we actually do not do here!
-        if 'interpolate' in kwargs: kwargs.pop('interpolate')
+        if "interpolate" in kwargs:
+            kwargs.pop("interpolate")

        def pairwise(iterable):
            "s -> (s0,s1), (s1,s2), (s2, s3), ..."
            from itertools import tee
-            #try:
+
+            # try:
            #    from itertools import izip as zip
-            #except ImportError:
+            # except ImportError:
            #    pass
            a, b = tee(iterable)
            next(b, None)
@ -245,6 +407,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
            ax._process_unit_info(ydata=y2)
            # Convert the arrays so we can work with them
            from numpy import ma
+
            x = ma.masked_invalid(ax.convert_xunits(X))
            y1 = ma.masked_invalid(ax.convert_yunits(y1))
            y2 = ma.masked_invalid(ax.convert_yunits(y2))
@ -263,6 +426,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                raise ValueError("Argument dimensions are incompatible")

            from functools import reduce
+
            mask = reduce(ma.mask_or, [ma.getmask(a) for a in (x, y1, y2)])
            if mask is not ma.nomask:
                where &= ~mask
@ -277,7 +441,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                    continue

                N = len(xslice)
-                p = np.zeros((2 * N + 2, 2), np.float)
+                p = np.zeros((2 * N + 2, 2), float)

                # the purpose of the next two lines is for when y2 is a
                # scalar like 0 and we want the fill to go all the way
@ -288,16 +452,17 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                p[0] = start
                p[N + 1] = end

-                p[1:N + 1, 0] = xslice
-                p[1:N + 1, 1] = y1slice
-                p[N + 2:, 0] = xslice[::-1]
-                p[N + 2:, 1] = y2slice[::-1]
+                p[1 : N + 1, 0] = xslice
+                p[1 : N + 1, 1] = y1slice
+                p[N + 2 :, 0] = xslice[::-1]
+                p[N + 2 :, 1] = y2slice[::-1]

                polys.append(p)
            polycol.extend(polys)
        from matplotlib.collections import PolyCollection
-        if 'zorder' not in kwargs:
-            kwargs['zorder'] = 0
+
+        if "zorder" not in kwargs:
+            kwargs["zorder"] = 0
        plots.append(PolyCollection(polycol, label=label, **kwargs))
        ax.add_collection(plots[-1], autolim=True)
        ax.autoscale_view()
--- a/GPy/testing/init.py
+++ b/GPy/testing/init.py
@ -1,9 +0,0 @@
-# Copyright (c) 2014, Max Zwiessele, GPy Authors
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-import unittest
-import sys
-
-def deepTest(reason):
-    if reason:
-        return lambda x:x
-    return unittest.skip("Not deep scanning, enable deepscan by adding 'deep' argument to unittest call")
--- a/GPy/testing/cython_tests.py
+++ b/GPy/testing/cython_tests.py
@ -1,81 +0,0 @@
-import numpy as np
-import scipy as sp
-from GPy.util import choleskies
-import GPy
-import unittest
-
-from ..util.config import config
-
-try:
-    from ..util import choleskies_cython
-    choleskies_cython_working = config.getboolean('cython', 'working')
-except ImportError:
-    choleskies_cython_working = False
-
-try:
-    from ..kern.src import stationary_cython
-    stationary_cython_working = config.getboolean('cython', 'working')
-except ImportError:
-    stationary_cython_working = False
-
-"""
-These tests make sure that the pure python and cython codes work the same
-"""
-
-@unittest.skipIf(not choleskies_cython_working,"Cython cholesky module has not been built on this machine")
-class CythonTestChols(np.testing.TestCase):
-    def setUp(self):
-        self.flat = np.random.randn(45,5)
-        self.triang = np.array([np.eye(20) for i in range(3)])
-    def test_flat_to_triang(self):
-        L1 = choleskies._flat_to_triang_pure(self.flat)
-        L2 = choleskies._flat_to_triang_cython(self.flat)
-        np.testing.assert_allclose(L1, L2)
-    def test_triang_to_flat(self):
-        A1 = choleskies._triang_to_flat_pure(self.triang)
-        A2 = choleskies._triang_to_flat_cython(self.triang)
-        np.testing.assert_allclose(A1, A2)
-
-@unittest.skipIf(not stationary_cython_working,"Cython stationary module has not been built on this machine")
-class test_stationary(np.testing.TestCase):
-    def setUp(self):
-        self.k = GPy.kern.RBF(10)
-        self.X = np.random.randn(300,10)
-        self.Z = np.random.randn(20,10)
-        self.dKxx = np.random.randn(300,300)
-        self.dKzz = np.random.randn(20,20)
-        self.dKxz = np.random.randn(300,20)
-
-    def test_square_gradX(self):
-        g1 = self.k._gradients_X_cython(self.dKxx, self.X)
-        g2 = self.k._gradients_X_pure(self.dKxx, self.X)
-        np.testing.assert_allclose(g1, g2)
-
-    def test_rect_gradx(self):
-        g1 = self.k._gradients_X_cython(self.dKxz, self.X, self.Z)
-        g2 = self.k._gradients_X_pure(self.dKxz, self.X, self.Z)
-        np.testing.assert_allclose(g1, g2)
-
-    def test_square_lengthscales(self):
-        g1 = self.k._lengthscale_grads_pure(self.dKxx, self.X, self.X)
-        g2 = self.k._lengthscale_grads_cython(self.dKxx, self.X, self.X)
-        np.testing.assert_allclose(g1, g2)
-
-    def test_rect_lengthscales(self):
-        g1 = self.k._lengthscale_grads_pure(self.dKxz, self.X, self.Z)
-        g2 = self.k._lengthscale_grads_cython(self.dKxz, self.X, self.Z)
-        np.testing.assert_allclose(g1, g2)
-
-@unittest.skipIf(not choleskies_cython_working,"Cython cholesky module has not been built on this machine")
-class test_choleskies_backprop(np.testing.TestCase):
-    def setUp(self):
-        a =np.random.randn(10,12)
-        A = a.dot(a.T)
-        self.L = GPy.util.linalg.jitchol(A)
-        self.dL = np.random.randn(10,10)
-    def test(self):
-        r1 = choleskies._backprop_gradient_pure(self.dL, self.L)
-        r2 = choleskies_cython.backprop_gradient(self.dL, self.L)
-        r3 = choleskies_cython.backprop_gradient_par_c(self.dL, self.L)
-        np.testing.assert_allclose(r1, r2)
-        np.testing.assert_allclose(r1, r3)
--- a/GPy/testing/deactivated/test_examples.py
+++ b/GPy/testing/deactivated/test_examples.py
@ -1,61 +1,65 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import unittest
-import numpy as np
 import GPy
 import inspect
 import pkgutil
 import os
-import random
-from nose.tools import nottest
-import sys
-import itertools

-class ExamplesTests(unittest.TestCase):
-    def _checkgrad(self, Model):
-        self.assertTrue(Model.checkgrad())

-    def _model_instance(self, Model):
-        self.assertTrue(isinstance(Model, GPy.models))
+def check_grad(Model):
+    assert Model.checkgrad(), "Gradient check failed!"
+
+
+def check_model_instance(Model):  # GPy.models is a package, so test against the Model base class
+    assert isinstance(Model, GPy.core.model.Model), "Wrong type!"
+

 def model_checkgrads(model):
    model.randomize()
-    #NOTE: Step as 1e-4, this should be acceptable for more peaky models
+    # NOTE: Step as 1e-4, this should be acceptable for more peaky models
    return model.checkgrad(step=1e-4)

+
 def model_instance(model):
    return isinstance(model, GPy.core.model.Model)

+
 def flatten_nested(lst):
    result = []
    for element in lst:
-        if hasattr(element, '__iter__'):
+        if hasattr(element, "__iter__"):
            result.extend(flatten_nested(element))
        else:
            result.append(element)
    return result

-@nottest
+
 def test_models():
-    optimize=False
-    plot=True
+    optimize = False
+    plot = True
    examples_path = os.path.dirname(GPy.examples.__file__)
    # Load modules
    failing_models = {}
-    for loader, module_name, is_pkg in pkgutil.iter_modules([examples_path]):
+    for loader, module_name, _is_pkg in pkgutil.iter_modules([examples_path]):
        # Load examples
        module_examples = loader.find_module(module_name).load_module(module_name)
        print("MODULE", module_examples)
        print("Before")
        print(inspect.getmembers(module_examples, predicate=inspect.isfunction))
-        functions = [ func for func in inspect.getmembers(module_examples, predicate=inspect.isfunction) if func[0].startswith('_') is False ][::-1]
+        functions = [
+            func
+            for func in inspect.getmembers(
+                module_examples, predicate=inspect.isfunction
+            )
+            if not func[0].startswith("_")  # keep only public example functions
+        ][::-1]
        print("After")
        print(functions)
        for example in functions:
-            if example[0] in ['epomeo_gpx']:
-                #These are the edge cases that we might want to handle specially
-                if example[0] == 'epomeo_gpx' and not GPy.util.datasets.gpxpy_available:
+            if example[0] in ["epomeo_gpx"]:
+                # These are the edge cases that we might want to handle specially
+                if example[0] == "epomeo_gpx" and not GPy.util.datasets.gpxpy_available:
                    print("Skipping as gpxpy is not available to parse GPS")
                    continue

@ -63,14 +67,14 @@ def test_models():
            # Generate model

            try:
-                models = [ example[1](optimize=optimize, plot=plot) ]
-                #If more than one model returned, flatten them
+                models = [example[1](optimize=optimize, plot=plot)]
+                # If more than one model returned, flatten them
                models = flatten_nested(models)
            except Exception as e:
                failing_models[example[0]] = "Cannot make model: \n{e}".format(e=e)
            else:
                print(models)
-                model_checkgrads.description = 'test_checkgrads_%s' % example[0]
+                model_checkgrads.description = "test_checkgrads_%s" % example[0]
                try:
                    for model in models:
                        if not model_checkgrads(model):
@ -78,7 +82,7 @@ def test_models():
                except Exception as e:
                    failing_models[model_checkgrads.description] = e

-                model_instance.description = 'test_instance_%s' % example[0]
+                model_instance.description = "test_instance_%s" % example[0]
                try:
                    for model in models:
                        if not model_instance(model):
@ -86,8 +90,8 @@ def test_models():
                except Exception as e:
                    failing_models[model_instance.description] = e

-            #yield model_checkgrads, model
-            #yield model_instance, model
+            # yield model_checkgrads, model
+            # yield model_instance, model

        print("Finished checking module {m}".format(m=module_name))
        if len(failing_models.keys()) > 0:
@ -97,9 +101,3 @@ def test_models():
    if len(failing_models.keys()) > 0:
        print(failing_models)
        raise Exception(failing_models)
-
-
-if __name__ == "__main__":
-    print("Running unit tests, please be (very) patient...")
-    # unittest.main()
-    test_models()
--- a/GPy/testing/deactivated/test_mpi.py
+++ b/GPy/testing/deactivated/test_mpi.py
@ -1,16 +1,12 @@
 # Copyright (c) 2013-2014, Zhenwen Dai
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import unittest
 import numpy as np
-import GPy

 try:
-    from mpi4py import MPI
    import subprocess

-    class MPITests(unittest.TestCase):
-            
+    class TestMPI:
        def test_BayesianGPLVM_MPI(self):
            code = """
 import numpy as np
@ -33,17 +29,20 @@ if comm.rank==0:
    m._trigger_params_changed()
    print float(m.objective_function())
            """
-            with open('mpi_test__.py','w') as f:
+            with open("mpi_test__.py", "w") as f:
                f.write(code)
                f.close()
-            p = subprocess.Popen('mpirun -n 4 python mpi_test__.py',stdout=subprocess.PIPE,shell=True)
-            (stdout, stderr) = p.communicate()
-            L1 =  float(stdout.splitlines()[-2])
-            L2 =  float(stdout.splitlines()[-1])
-            self.assertTrue(np.allclose(L1,L2))
+            p = subprocess.Popen(
+                "mpirun -n 4 python mpi_test__.py", stdout=subprocess.PIPE, shell=True
+            )
+            (stdout, _stderr) = p.communicate()
+            L1 = float(stdout.splitlines()[-2])
+            L2 = float(stdout.splitlines()[-1])
+            assert np.allclose(L1, L2)  # plain assert: TestMPI is not a unittest.TestCase
            import os
-            os.remove('mpi_test__.py')
-            
+
+            os.remove("mpi_test__.py")
+
        def test_SparseGPRegression_MPI(self):
            code = """
 import numpy as np
@ -66,27 +65,19 @@ if comm.rank==0:
    m._trigger_params_changed()
    print float(m.objective_function())
            """
-            with open('mpi_test__.py','w') as f:
+            with open("mpi_test__.py", "w") as f:
                f.write(code)
                f.close()
-            p = subprocess.Popen('mpirun -n 4 python mpi_test__.py',stdout=subprocess.PIPE,shell=True)
+            p = subprocess.Popen(
+                "mpirun -n 4 python mpi_test__.py", stdout=subprocess.PIPE, shell=True
+            )
            (stdout, stderr) = p.communicate()
-            L1 =  float(stdout.splitlines()[-2])
-            L2 =  float(stdout.splitlines()[-1])
-            self.assertTrue(np.allclose(L1,L2))
+            L1 = float(stdout.splitlines()[-2])
+            L2 = float(stdout.splitlines()[-1])
+            assert np.allclose(L1, L2)
            import os
-            os.remove('mpi_test__.py')

+            os.remove("mpi_test__.py")

 except:
    pass
-
-
-
-if __name__ == "__main__":
-    print("Running unit tests, please be (very) patient...")
-    try:
-        import mpi4py
-        unittest.main()
-    except:
-        pass
--- a/GPy/testing/fitc.py
+++ b/GPy/testing/fitc.py
@ -1,34 +1,38 @@
 # Copyright (c) 2014, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import unittest
 import numpy as np
 import GPy

-class FITCtest(unittest.TestCase):
-    def setUp(self):
+
+class FITCtest:
+    def setup(self):
        ######################################
        # # 1 dimensional example

        N = 20
        # sample inputs and outputs
-        self.X1D = np.random.uniform(-3., 3., (N, 1))
+        self.X1D = np.random.uniform(-3.0, 3.0, (N, 1))
        self.Y1D = np.sin(self.X1D) + np.random.randn(N, 1) * 0.05

        ######################################
        # # 2 dimensional example

        # sample inputs and outputs
-        self.X2D = np.random.uniform(-3., 3., (N, 2))
-        self.Y2D = np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2]) + np.random.randn(N, 1) * 0.05
+        self.X2D = np.random.uniform(-3.0, 3.0, (N, 2))
+        self.Y2D = (
+            np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2])
+            + np.random.randn(N, 1) * 0.05
+        )

    def test_fitc_1d(self):
+        self.setup()
        m = GPy.models.SparseGPRegression(self.X1D, self.Y1D)
-        m.inference_method=GPy.inference.latent_function_inference.FITC()
-        self.assertTrue(m.checkgrad())
+        m.inference_method = GPy.inference.latent_function_inference.FITC()
+        assert m.checkgrad(), "Gradient check failed!"

    def test_fitc_2d(self):
+        self.setup()
        m = GPy.models.SparseGPRegression(self.X2D, self.Y2D)
-        m.inference_method=GPy.inference.latent_function_inference.FITC()
-        self.assertTrue(m.checkgrad())
-
+        m.inference_method = GPy.inference.latent_function_inference.FITC()
+        assert m.checkgrad(), "Gradient check failed!"
--- a/GPy/testing/gpy_kernels_state_space_tests.py
+++ b/GPy/testing/gpy_kernels_state_space_tests.py
@ -1,454 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (c) 2015, Alex Grigorevskiy
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-"""
-Testing state space related functions.
-"""
-import unittest
-import numpy as np
-import GPy
-import GPy.models.state_space_model as SS_model
-from .state_space_main_tests import generate_x_points, generate_sine_data, \
-    generate_linear_data, generate_brownian_data, generate_linear_plus_sin
-from nose import SkipTest
-
-#from state_space_main_tests import generate_x_points, generate_sine_data, \
-#    generate_linear_data, generate_brownian_data, generate_linear_plus_sin
-
-class StateSpaceKernelsTests(np.testing.TestCase):
-    def setUp(self):
-        pass
-
-    def run_for_model(self, X, Y, ss_kernel, kalman_filter_type = 'regular',
-                      use_cython=False, check_gradients=True,
-                      optimize=True, optimize_max_iters=250, predict_X=None,
-                      compare_with_GP=True, gp_kernel=None,
-                      mean_compare_decimal=10, var_compare_decimal=7):
-
-        m1  = SS_model.StateSpace(X,Y, ss_kernel,
-                                kalman_filter_type=kalman_filter_type,
-                                use_cython=use_cython)
-
-        m1.likelihood[:] = Y.var()/100.
-
-        if check_gradients:
-            self.assertTrue(m1.checkgrad())
-
-        if 1:#optimize:
-            m1.optimize(optimizer='lbfgsb', max_iters=1)
-
-        if compare_with_GP and (predict_X is None):
-            predict_X = X
-
-        self.assertTrue(compare_with_GP)
-        if compare_with_GP:
-            m2  = GPy.models.GPRegression(X,Y, gp_kernel)
-
-            m2[:] = m1[:]
-
-            if (predict_X is not None):
-                x_pred_reg_1 = m1.predict(predict_X)
-                x_quant_reg_1 = m1.predict_quantiles(predict_X)
-
-            x_pred_reg_2 = m2.predict(predict_X)
-            x_quant_reg_2 = m2.predict_quantiles(predict_X)
-
-            np.testing.assert_array_almost_equal(x_pred_reg_1[0], x_pred_reg_2[0], mean_compare_decimal)
-            np.testing.assert_array_almost_equal(x_pred_reg_1[1], x_pred_reg_2[1], var_compare_decimal)
-            np.testing.assert_array_almost_equal(x_quant_reg_1[0], x_quant_reg_2[0], mean_compare_decimal)
-            np.testing.assert_array_almost_equal(x_quant_reg_1[1], x_quant_reg_2[1], mean_compare_decimal)
-            np.testing.assert_array_almost_equal(m1.gradient, m2.gradient, var_compare_decimal)
-            np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), var_compare_decimal)
-
-
-    def test_Matern32_kernel(self,):
-        np.random.seed(234) # seed the random number generator
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
-                        plot = False, points_num=50, x_interval = (0, 20), random=True)
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_Matern32(1,active_dims=[0,])
-        gp_kernel = GPy.kern.Matern32(1,active_dims=[0,])
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                           predict_X=X,
-                           compare_with_GP=True,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=5, var_compare_decimal=5)
-
-    def test_Matern52_kernel(self,):
-        np.random.seed(234) # seed the random number generator
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
-                        plot = False, points_num=50, x_interval = (0, 20), random=True)
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_Matern52(1,active_dims=[0,])
-        gp_kernel = GPy.kern.Matern52(1,active_dims=[0,])
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                           optimize = True, predict_X=X,
-                           compare_with_GP=True, gp_kernel=gp_kernel,
-                           mean_compare_decimal=5, var_compare_decimal=5)
-
-    def test_RBF_kernel(self,):
-        #import pdb;pdb.set_trace()
-        
-        np.random.seed(234) # seed the random number generator
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
-                        plot = False, points_num=50, x_interval = (0, 20), random=True)
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_RBF(1, 110., 1.5, active_dims=[0,], balance=True, approx_order=10)
-        gp_kernel = GPy.kern.RBF(1, 110., 1.5, active_dims=[0,])
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                           predict_X=X,
-                           gp_kernel=gp_kernel,
-                           optimize_max_iters=1000,
-                           mean_compare_decimal=2, var_compare_decimal=1)
-
-    def test_periodic_kernel(self,):
-        np.random.seed(322) # seed the random number generator
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
-                        plot = False, points_num=50, x_interval = (0, 20), random=True)
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_StdPeriodic(1,active_dims=[0,])
-        ss_kernel.lengthscale.constrain_bounded(0.27, 1000)
-        ss_kernel.period.constrain_bounded(0.17, 100)
-
-        gp_kernel = GPy.kern.StdPeriodic(1,active_dims=[0,])
-        gp_kernel.lengthscale.constrain_bounded(0.27, 1000)
-        gp_kernel.period.constrain_bounded(0.17, 100)
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                           predict_X=X,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=3, var_compare_decimal=3)
-
-    def test_quasi_periodic_kernel(self,):
-        np.random.seed(329) # seed the random number generator
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
-                        plot = False, points_num=50, x_interval = (0, 20), random=True)
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_Matern32(1)*GPy.kern.sde_StdPeriodic(1,active_dims=[0,])
-        ss_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        ss_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        gp_kernel = GPy.kern.Matern32(1)*GPy.kern.StdPeriodic(1,active_dims=[0,])
-        gp_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        gp_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                            predict_X=X,
-                            gp_kernel=gp_kernel,
-                            mean_compare_decimal=1, var_compare_decimal=2)
-
-    def test_linear_kernel(self,):
-
-        np.random.seed(234) # seed the random number generator
-        (X,Y) = generate_linear_data(x_points=None, tangent=2.0, add_term=20.0, noise_var=2.0,
-                    plot = False, points_num=50, x_interval = (0, 20), random=True)
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_Linear(1,X,active_dims=[0,]) + GPy.kern.sde_Bias(1, active_dims=[0,])
-        gp_kernel = GPy.kern.Linear(1, active_dims=[0,]) + GPy.kern.Bias(1, active_dims=[0,])
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients= False,
-                           predict_X=X,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=5, var_compare_decimal=5)
-
-    def test_brownian_kernel(self,):
-        np.random.seed(234) # seed the random number generator
-        (X,Y) = generate_brownian_data(x_points=None, kernel_var=2.0, noise_var = 0.1,
-                    plot = False, points_num=50, x_interval = (0, 20), random=True)
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_Brownian()
-        gp_kernel = GPy.kern.Brownian()
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                            predict_X=X,
-                            gp_kernel=gp_kernel,
-                            mean_compare_decimal=4, var_compare_decimal=4)
-
-    def test_exponential_kernel(self,):
-        np.random.seed(12345) # seed the random number generator
-        (X,Y) = generate_linear_data(x_points=None, tangent=1.0, add_term=20.0, noise_var=2.0,
-                    plot = False, points_num=10, x_interval = (0, 20), random=True)
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_Exponential(1, Y.var(), X.ptp()/2., active_dims=[0,])
-        gp_kernel = GPy.kern.Exponential(1, Y.var(), X.ptp()/2., active_dims=[0,])
-
-        Y -= Y.mean()
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                      predict_X=X,
-                      gp_kernel=gp_kernel,
-                      optimize_max_iters=1000,
-                      mean_compare_decimal=2, var_compare_decimal=2)
-
-    def test_kernel_addition_svd(self,):
-        #np.random.seed(329) # seed the random number generator
-        np.random.seed(42)
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
-                        plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        (X1,Y1) = generate_linear_data(x_points=X, tangent=1.0, add_term=20.0, noise_var=0.0,
-                    plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        # Sine data <-
-        Y = Y + Y1
-        Y -= Y.mean()
-    
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        def get_new_kernels():
-            ss_kernel = GPy.kern.sde_Linear(1, X, variances=1) + GPy.kern.sde_StdPeriodic(1, period=5.0, variance=300, lengthscale=3, active_dims=[0,])
-            #ss_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-            #ss_kernel.std_periodic.period.constrain_bounded(3, 8)
-
-            gp_kernel = GPy.kern.Linear(1, variances=1) + GPy.kern.StdPeriodic(1, period=5.0, variance=300, lengthscale=3, active_dims=[0,])
-            #gp_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-            #gp_kernel.std_periodic.period.constrain_bounded(3, 8)
-
-            return ss_kernel, gp_kernel
-
-        # Cython is available only with svd.
-        ss_kernel, gp_kernel = get_new_kernels()
-        self.run_for_model(X, Y, ss_kernel, kalman_filter_type = 'svd',
-                           use_cython=True, optimize_max_iters=10, check_gradients=False,
-                           predict_X=X,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=3, var_compare_decimal=3)
-
-        ss_kernel, gp_kernel = get_new_kernels()
-        self.run_for_model(X, Y, ss_kernel, kalman_filter_type = 'svd',
-                           use_cython=False, optimize_max_iters=10, check_gradients=False,
-                           predict_X=X,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=3, var_compare_decimal=3)
-
-    def test_kernel_addition_regular(self,):
-        #np.random.seed(329) # seed the random number generator
-        np.random.seed(42)
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
-                        plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        (X1,Y1) = generate_linear_data(x_points=X, tangent=1.0, add_term=20.0, noise_var=0.0,
-                    plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        # Sine data <-
-        Y = Y + Y1
-        Y -= Y.mean()
-    
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        def get_new_kernels():
-            ss_kernel = GPy.kern.sde_Linear(1, X, variances=1) + GPy.kern.sde_StdPeriodic(1, period=5.0, variance=300, lengthscale=3, active_dims=[0,])
-            #ss_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-            #ss_kernel.std_periodic.period.constrain_bounded(3, 8)
-
-            gp_kernel = GPy.kern.Linear(1, variances=1) + GPy.kern.StdPeriodic(1, period=5.0, variance=300, lengthscale=3, active_dims=[0,])
-            #gp_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-            #gp_kernel.std_periodic.period.constrain_bounded(3, 8)
-
-            return ss_kernel, gp_kernel
-
-        ss_kernel, gp_kernel = get_new_kernels()
-        try:
-            self.run_for_model(X, Y, ss_kernel, kalman_filter_type = 'regular',
-                               use_cython=False, optimize_max_iters=10, check_gradients=True,
-                               predict_X=X,
-                               gp_kernel=gp_kernel,
-                               mean_compare_decimal=2, var_compare_decimal=2)
-        except AssertionError:
-            raise SkipTest("Skipping Regular kalman filter for kernel addition, because it is not stable (normal situation) for this data.")
-
-
-    def test_kernel_multiplication(self,):
-        np.random.seed(329) # seed the random number generator
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
-                        plot = False, points_num=50, x_interval = (0, 20), random=True)
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        def get_new_kernels():
-            ss_kernel = GPy.kern.sde_Matern32(1)*GPy.kern.sde_Matern52(1)
-            gp_kernel = GPy.kern.Matern32(1)*GPy.kern.sde_Matern52(1)
-
-            return ss_kernel, gp_kernel
-
-        ss_kernel, gp_kernel = get_new_kernels()
-
-        #import ipdb;ipdb.set_trace()
-        self.run_for_model(X, Y, ss_kernel, kalman_filter_type = 'svd',
-                           use_cython=True, optimize_max_iters=10, check_gradients=True,
-                            predict_X=X,
-                            gp_kernel=gp_kernel,
-                            mean_compare_decimal=2, var_compare_decimal=2)
-
-        ss_kernel, gp_kernel = get_new_kernels()
-        self.run_for_model(X, Y, ss_kernel, kalman_filter_type = 'regular',
-                           use_cython=False, optimize_max_iters=10, check_gradients=True,
-                            predict_X=X,
-                            gp_kernel=gp_kernel,
-                            mean_compare_decimal=2, var_compare_decimal=2)
-
-        ss_kernel, gp_kernel = get_new_kernels()
-        self.run_for_model(X, Y, ss_kernel, kalman_filter_type = 'svd',
-                           use_cython=False, optimize_max_iters=10, check_gradients=True,
-                            predict_X=X,
-                            gp_kernel=gp_kernel,
-                            mean_compare_decimal=2, var_compare_decimal=2)
-
-    def test_forecast_regular(self,):
-        # Generate data ->
-        np.random.seed(339) # seed the random number generator
-        #import pdb; pdb.set_trace()
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
-                        plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        (X1,Y1) = generate_linear_data(x_points=X, tangent=1.0, add_term=20.0, noise_var=0.0,
-                    plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        Y = Y + Y1
-
-        X_train = X[X <= 20]
-        Y_train = Y[X <= 20]
-        X_test = X[X > 20]
-        Y_test = Y[X > 20]
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-        X_train.shape = (X_train.shape[0],1); Y_train.shape = (Y_train.shape[0],1)
-        X_test.shape = (X_test.shape[0],1); Y_test.shape = (Y_test.shape[0],1)
-        # Generate data <-
-
-        #import pdb; pdb.set_trace()
-
-        periodic_kernel = GPy.kern.StdPeriodic(1,active_dims=[0,])
-        gp_kernel = GPy.kern.Linear(1, active_dims=[0,]) + GPy.kern.Bias(1, active_dims=[0,]) + periodic_kernel
-        gp_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        gp_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        periodic_kernel = GPy.kern.sde_StdPeriodic(1,active_dims=[0,])
-        ss_kernel = GPy.kern.sde_Linear(1,X,active_dims=[0,]) + \
-            GPy.kern.sde_Bias(1, active_dims=[0,]) + periodic_kernel
-
-        ss_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        ss_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        self.run_for_model(X_train, Y_train, ss_kernel, kalman_filter_type = 'regular',
-                           use_cython=False, optimize_max_iters=30, check_gradients=True,
-                           predict_X=X_test,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=2, var_compare_decimal=2)
-
-    def test_forecast_svd(self,):
-        # Generate data ->
-        np.random.seed(339) # seed the random number generator
-        #import pdb; pdb.set_trace()
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
-                        plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        (X1,Y1) = generate_linear_data(x_points=X, tangent=1.0, add_term=20.0, noise_var=0.0,
-                    plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        Y = Y + Y1
-
-        X_train = X[X <= 20]
-        Y_train = Y[X <= 20]
-        X_test = X[X > 20]
-        Y_test = Y[X > 20]
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-        X_train.shape = (X_train.shape[0],1); Y_train.shape = (Y_train.shape[0],1)
-        X_test.shape = (X_test.shape[0],1); Y_test.shape = (Y_test.shape[0],1)
-        # Generate data <-
-
-        #import pdb; pdb.set_trace()
-
-        periodic_kernel = GPy.kern.StdPeriodic(1,active_dims=[0,])
-        gp_kernel = GPy.kern.Linear(1, active_dims=[0,]) + GPy.kern.Bias(1, active_dims=[0,]) + periodic_kernel
-        gp_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        gp_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        periodic_kernel = GPy.kern.sde_StdPeriodic(1,active_dims=[0,])
-        ss_kernel = GPy.kern.sde_Linear(1,X,active_dims=[0,]) + \
-            GPy.kern.sde_Bias(1, active_dims=[0,]) + periodic_kernel
-
-        ss_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        ss_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        self.run_for_model(X_train, Y_train, ss_kernel, kalman_filter_type = 'svd',
-                           use_cython=False, optimize_max_iters=30, check_gradients=False,
-                           predict_X=X_test,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=2, var_compare_decimal=2)
-
-    def test_forecast_svd_cython(self,):
-        # Generate data ->
-        np.random.seed(339) # seed the random number generator
-        #import pdb; pdb.set_trace()
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
-                        plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        (X1,Y1) = generate_linear_data(x_points=X, tangent=1.0, add_term=20.0, noise_var=0.0,
-                    plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        Y = Y + Y1
-
-        X_train = X[X <= 20]
-        Y_train = Y[X <= 20]
-        X_test = X[X > 20]
-        Y_test = Y[X > 20]
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-        X_train.shape = (X_train.shape[0],1); Y_train.shape = (Y_train.shape[0],1)
-        X_test.shape = (X_test.shape[0],1); Y_test.shape = (Y_test.shape[0],1)
-        # Generate data <-
-
-        #import pdb; pdb.set_trace()
-
-        periodic_kernel = GPy.kern.StdPeriodic(1,active_dims=[0,])
-        gp_kernel = GPy.kern.Linear(1, active_dims=[0,]) + GPy.kern.Bias(1, active_dims=[0,]) + periodic_kernel
-        gp_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        gp_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        periodic_kernel = GPy.kern.sde_StdPeriodic(1,active_dims=[0,])
-        ss_kernel = GPy.kern.sde_Linear(1,X,active_dims=[0,]) + \
-            GPy.kern.sde_Bias(1, active_dims=[0,]) + periodic_kernel
-
-        ss_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        ss_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        self.run_for_model(X_train, Y_train, ss_kernel, kalman_filter_type = 'svd',
-                           use_cython=True, optimize_max_iters=30, check_gradients=False,
-                           predict_X=X_test,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=2, var_compare_decimal=2)
-
-if __name__ == "__main__":
-    print("Running state-space inference tests...")
-    unittest.main()
-
-    #tt = StateSpaceKernelsTests('test_RBF_kernel')
-    #import pdb; pdb.set_trace()
-    #tt.test_Matern32_kernel()
-    #tt.test_Matern52_kernel()
-    #tt.test_RBF_kernel()
-    #tt.test_periodic_kernel()
-    #tt.test_quasi_periodic_kernel()
-    #tt.test_linear_kernel()
-    #tt.test_brownian_kernel()
-    #tt.test_exponential_kernel()
-    #tt.test_kernel_addition()
-    #tt.test_kernel_multiplication()
-    #tt.test_forecast()
-
--- a/GPy/testing/inference_tests.py
+++ b/GPy/testing/inference_tests.py
@ -1,179 +0,0 @@
-# Copyright (c) 2014, Max Zwiessele
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-"""
-The test cases for various inference algorithms
-"""
-
-import unittest
-import numpy as np
-import GPy
-#np.seterr(invalid='raise')
-
-class InferenceXTestCase(unittest.TestCase):
-
-    def genData(self):
-        np.random.seed(1111)
-        Ylist = GPy.examples.dimensionality_reduction._simulate_matern(5, 1, 1, 10, 3, False)[0]
-        return Ylist[0]
-
-    def test_inferenceX_BGPLVM_Linear(self):
-        Ys = self.genData()
-        m = GPy.models.BayesianGPLVM(Ys,3,kernel=GPy.kern.Linear(3,ARD=True))
-        m.optimize()
-        x, mi = m.infer_newX(m.Y, optimize=True)
-        np.testing.assert_array_almost_equal(m.X.mean, mi.X.mean, decimal=2)
-        np.testing.assert_array_almost_equal(m.X.variance, mi.X.variance, decimal=2)
-
-    def test_inferenceX_BGPLVM_RBF(self):
-        Ys = self.genData()
-        m = GPy.models.BayesianGPLVM(Ys,3,kernel=GPy.kern.RBF(3,ARD=True))
-        import warnings
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            m.optimize()
-        x, mi = m.infer_newX(m.Y, optimize=True)
-        np.testing.assert_array_almost_equal(m.X.mean, mi.X.mean, decimal=2)
-        np.testing.assert_array_almost_equal(m.X.variance, mi.X.variance, decimal=2)
-
-    def test_inferenceX_GPLVM_Linear(self):
-        Ys = self.genData()
-        m = GPy.models.GPLVM(Ys,3,kernel=GPy.kern.Linear(3,ARD=True))
-        m.optimize()
-        x, mi = m.infer_newX(m.Y, optimize=True)
-        np.testing.assert_array_almost_equal(m.X, mi.X, decimal=2)
-
-    def test_inferenceX_GPLVM_RBF(self):
-        Ys = self.genData()
-        m = GPy.models.GPLVM(Ys,3,kernel=GPy.kern.RBF(3,ARD=True))
-        m.optimize()
-        x, mi = m.infer_newX(m.Y, optimize=True)
-        np.testing.assert_array_almost_equal(m.X, mi.X, decimal=2)
-
-class InferenceGPEP(unittest.TestCase):
-
-    def genData(self):
-        np.random.seed(1)
-        k = GPy.kern.RBF(1, variance=7., lengthscale=0.2)
-        X = np.random.rand(200,1)
-        f = np.random.multivariate_normal(np.zeros(200), k.K(X) + 1e-5 * np.eye(X.shape[0]))
-        lik = GPy.likelihoods.Bernoulli()
-        p = lik.gp_link.transf(f) # squash the latent function
-        Y = lik.samples(f).reshape(-1,1)
-        return X, Y
-
-    def genNoisyData(self):
-        np.random.seed(1)
-        X = np.random.rand(100,1)
-        self.real_std = 0.1
-        noise = np.random.randn(*X[:, 0].shape)*self.real_std
-        Y = (np.sin(X[:, 0]*2*np.pi) + noise)[:, None]
-        self.f = np.random.rand(X.shape[0],1)
-        Y_extra_noisy = Y.copy()
-        Y_extra_noisy[50] += 4.
-        # Y_extra_noisy[80:83] -= 2.
-        return X, Y, Y_extra_noisy
-
-    def test_inference_EP(self):
-        from paramz import ObsAr
-        X, Y = self.genData()
-        lik = GPy.likelihoods.Bernoulli()
-        k = GPy.kern.RBF(1, variance=7., lengthscale=0.2)
-        inf = GPy.inference.latent_function_inference.expectation_propagation.EP(max_iters=30, delta=0.5)
-        self.model = GPy.core.GP(X=X,
-                        Y=Y,
-                        kernel=k,
-                        inference_method=inf,
-                        likelihood=lik)
-        K = self.model.kern.K(X)
-        mean_prior = np.zeros(K.shape[0])
-        post_params, ga_approx, cav_params, log_Z_tilde = self.model.inference_method.expectation_propagation(mean_prior, K, ObsAr(Y), lik, None)
-
-        mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
-        p, m, d = self.model.inference_method._inference(Y, mean_prior, K, ga_approx, cav_params, lik, Y_metadata=None,  Z_tilde=log_Z_tilde)
-        p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, inf).inference(k, X,lik ,mu_tilde[:,None], mean_function=None, variance=1./ga_approx.tau, K=K, Z_tilde=log_Z_tilde + np.sum(- 0.5*np.log(ga_approx.tau) + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)))
-
-        assert (np.sum(np.array([m - m0,
-                    np.sum(d['dL_dK'] - d0['dL_dK']),
-                    np.sum(d['dL_dthetaL'] - d0['dL_dthetaL']),
-                    np.sum(d['dL_dm'] - d0['dL_dm']),
-                    np.sum(p._woodbury_vector - p0._woodbury_vector),
-                    np.sum(p.woodbury_inv - p0.woodbury_inv)])) < 1e6)
-
-    # NOTE: adding a test like above for parameterized likelihood- the above test is
-    # only for probit likelihood which does not have any tunable hyperparameter which is why
-    # the term in dictionary of gradients: dL_dthetaL will always be zero. So here we repeat tests for
-    # student-t likelihood and heterodescastic gaussian noise case. This test simply checks if the posterior
-    # and gradients of log marginal are roughly the same for inference through EP and exact gaussian inference using
-    # the gaussian approximation for the individual likelihood site terms. For probit likelihood, it is possible to
-    # calculate moments analytically, but for other likelihoods, we will need to use numerical quadrature techniques,
-    # and it is possible that any error might creep up because of quadrature implementation.
-    def test_inference_EP_non_classification(self):
-        from paramz import ObsAr
-        X, Y, Y_extra_noisy = self.genNoisyData()
-        deg_freedom = 5.
-        init_noise_var = 0.08
-        lik_studentT = GPy.likelihoods.StudentT(deg_free=deg_freedom, sigma2=init_noise_var)
-        # like_gaussian_noise = GPy.likelihoods.MixedNoise()
-        k = GPy.kern.RBF(1, variance=2., lengthscale=1.1)
-        ep_inf_alt = GPy.inference.latent_function_inference.expectation_propagation.EP(max_iters=4, delta=0.5)
-        # ep_inf_nested = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode='nested', max_iters=100, delta=0.5)
-        m = GPy.core.GP(X=X,Y=Y_extra_noisy,kernel=k,likelihood=lik_studentT,inference_method=ep_inf_alt)
-        K = m.kern.K(X)
-        mean_prior = np.zeros(K.shape[0])
-        post_params, ga_approx, cav_params, log_Z_tilde = m.inference_method.expectation_propagation(mean_prior, K, ObsAr(Y_extra_noisy), lik_studentT, None)
-
-        mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
-        p, m, d = m.inference_method._inference(Y_extra_noisy, mean_prior, K, ga_approx, cav_params, lik_studentT, Y_metadata=None,  Z_tilde=log_Z_tilde)
-        p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, ep_inf_alt).inference(k, X,lik_studentT ,mu_tilde[:,None], mean_function=None, variance=1./ga_approx.tau, K=K, Z_tilde=log_Z_tilde + np.sum(- 0.5*np.log(ga_approx.tau) + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)))
-
-        assert (np.sum(np.array([m - m0,
-                    np.sum(d['dL_dK'] - d0['dL_dK']),
-                    np.sum(d['dL_dthetaL'] - d0['dL_dthetaL']),
-                    np.sum(d['dL_dm'] - d0['dL_dm']),
-                    np.sum(p._woodbury_vector - p0._woodbury_vector),
-                    np.sum(p.woodbury_inv - p0.woodbury_inv)])) < 1e6)
-
-class VarDtcTest(unittest.TestCase):
-
-    def test_var_dtc_inference_with_mean(self):
-        """ Check dL_dm in var_dtc is calculated correctly"""
-        np.random.seed(1)
-        x = np.linspace(0.,2*np.pi,100)[:,None]
-        y = -np.cos(x)+np.random.randn(*x.shape)*0.3+1
-        m = GPy.models.SparseGPRegression(x,y, mean_function=GPy.mappings.Linear(input_dim=1, output_dim=1))
-        self.assertTrue(m.checkgrad())
-
-
-class HMCSamplerTest(unittest.TestCase):
-
-    def test_sampling(self):
-        np.random.seed(1)
-        x = np.linspace(0.,2*np.pi,100)[:,None]
-        y = -np.cos(x)+np.random.randn(*x.shape)*0.3+1
-
-        m = GPy.models.GPRegression(x,y)
-        m.kern.lengthscale.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
-        m.kern.variance.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
-        m.likelihood.variance.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
-
-        hmc = GPy.inference.mcmc.HMC(m,stepsize=1e-2)
-        s = hmc.sample(num_samples=3)
-
-class MCMCSamplerTest(unittest.TestCase):
-
-    def test_sampling(self):
-        np.random.seed(1)
-        x = np.linspace(0.,2*np.pi,100)[:,None]
-        y = -np.cos(x)+np.random.randn(*x.shape)*0.3+1
-
-        m = GPy.models.GPRegression(x,y)
-        m.kern.lengthscale.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
-        m.kern.variance.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
-        m.likelihood.variance.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
-
-        mcmc = GPy.inference.mcmc.Metropolis_Hastings(m)
-        mcmc.sample(Ntotal=100, Nburn=10)
-
-if __name__ == "__main__":
-    unittest.main()
--- a/GPy/testing/link_function_tests.py
+++ b/GPy/testing/link_function_tests.py
@ -1,148 +0,0 @@
-import numpy as np
-import scipy
-from scipy.special import cbrt
-from GPy.models import GradientChecker
-import random
-_lim_val = np.finfo(np.float64).max
-_lim_val_exp = np.log(_lim_val)
-_lim_val_square = np.sqrt(_lim_val)
-_lim_val_cube = cbrt(_lim_val)
-from GPy.likelihoods.link_functions import Identity, Probit, Cloglog, Log, Log_ex_1, Reciprocal, Heaviside, ScaledProbit
-
-class LinkFunctionTests(np.testing.TestCase):
-    def setUp(self):
-        self.small_f = np.array([[-1e-4]])
-        self.zero_f = np.array([[1e-4]])
-        self.mid_f = np.array([[5.0]])
-        self.large_f = np.array([[1e4]])
-        self.f_lower_lim = np.array(-np.inf)
-        self.f_upper_lim = np.array(np.inf)
-
-    def check_gradient(self, link_func, lim_of_inf, test_lim=False):
-        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.mid_f)
-        self.assertTrue(grad.checkgrad(verbose=True))
-        grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=self.mid_f)
-        self.assertTrue(grad2.checkgrad(verbose=True))
-        grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=self.mid_f)
-        self.assertTrue(grad3.checkgrad(verbose=True))
-
-        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.small_f)
-        self.assertTrue(grad.checkgrad(verbose=True))
-        grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=self.small_f)
-        self.assertTrue(grad2.checkgrad(verbose=True))
-        grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=self.small_f)
-        self.assertTrue(grad3.checkgrad(verbose=True))
-
-        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.zero_f)
-        self.assertTrue(grad.checkgrad(verbose=True))
-        grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=self.zero_f)
-        self.assertTrue(grad2.checkgrad(verbose=True))
-        grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=self.zero_f)
-        self.assertTrue(grad3.checkgrad(verbose=True))
-
-        #Do a limit test if the large f value is too large
-        large_f = np.clip(self.large_f, -np.inf, lim_of_inf-1e-3)
-        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=large_f)
-        self.assertTrue(grad.checkgrad(verbose=True))
-        grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=large_f)
-        self.assertTrue(grad2.checkgrad(verbose=True))
-        grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=large_f)
-        self.assertTrue(grad3.checkgrad(verbose=True))
-
-        if test_lim:
-            print("Testing limits")
-            #Remove some otherwise we are too close to the limit for gradcheck to work effectively
-            lim_of_inf = lim_of_inf - 1e-4
-            grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=lim_of_inf)
-            self.assertTrue(grad.checkgrad(verbose=True))
-            grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=lim_of_inf)
-            self.assertTrue(grad2.checkgrad(verbose=True))
-            grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=lim_of_inf)
-            self.assertTrue(grad3.checkgrad(verbose=True))
-
-    def check_overflow(self, link_func, lim_of_inf):
-        #Check that it does something sensible beyond this limit,
-        #note this is not checking the value is correct, just that it isn't nan
-        beyond_lim_of_inf = lim_of_inf + 100.0
-        self.assertFalse(np.isinf(link_func.transf(beyond_lim_of_inf)))
-        self.assertFalse(np.isinf(link_func.dtransf_df(beyond_lim_of_inf)))
-        self.assertFalse(np.isinf(link_func.d2transf_df2(beyond_lim_of_inf)))
-
-        self.assertFalse(np.isnan(link_func.transf(beyond_lim_of_inf)))
-        self.assertFalse(np.isnan(link_func.dtransf_df(beyond_lim_of_inf)))
-        self.assertFalse(np.isnan(link_func.d2transf_df2(beyond_lim_of_inf)))
-
-    def test_log_overflow(self):
-        link = Log()
-        lim_of_inf = _lim_val_exp
-
-        np.testing.assert_almost_equal(np.exp(self.mid_f), link.transf(self.mid_f))
-        assert np.isinf(np.exp(np.log(self.f_upper_lim)))
-        #Check the clipping works
-        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
-        self.assertTrue(np.isfinite(link.transf(self.f_upper_lim)))
-        self.check_overflow(link, lim_of_inf)
-
-        #Check that it would otherwise fail
-        beyond_lim_of_inf = lim_of_inf + 10.0
-        old_err_state = np.seterr(over='ignore')
-        self.assertTrue(np.isinf(np.exp(beyond_lim_of_inf)))
-        np.seterr(**old_err_state)
-
-    def test_log_ex_1_overflow(self):
-        link = Log_ex_1()
-        lim_of_inf = _lim_val_exp
-
-        np.testing.assert_almost_equal(scipy.special.log1p(np.exp(self.mid_f)), link.transf(self.mid_f))
-        assert np.isinf(scipy.special.log1p(np.exp(np.log(self.f_upper_lim))))
-        #Check the clipping works
-        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
-        #Need to look at most significant figures here rather than the decimals
-        np.testing.assert_approx_equal(link.transf(self.f_upper_lim), scipy.special.log1p(_lim_val), significant=5)
-        self.check_overflow(link, lim_of_inf)
-
-        #Check that it would otherwise fail
-        beyond_lim_of_inf = lim_of_inf + 10.0
-        old_err_state = np.seterr(over='ignore')
-        self.assertTrue(np.isinf(scipy.special.log1p(np.exp(beyond_lim_of_inf))))
-        np.seterr(**old_err_state)
-
-
-    def test_log_gradients(self):
-        # transf dtransf_df d2transf_df2 d3transf_df3
-        link = Log()
-        lim_of_inf = _lim_val_exp
-        self.check_gradient(link, lim_of_inf, test_lim=True)
-
-    def test_identity_gradients(self):
-        link = Identity()
-        lim_of_inf = _lim_val
-        #FIXME: Should be able to think of a way to test the limits of this
-        self.check_gradient(link, lim_of_inf, test_lim=False)
-
-    def test_probit_gradients(self):
-        link = Probit()
-        lim_of_inf = _lim_val
-        self.check_gradient(link, lim_of_inf, test_lim=True)
-        
-    def test_scaledprobit_gradients(self):
-        link = ScaledProbit(nu=random.random())
-        lim_of_inf = _lim_val
-        self.check_gradient(link, lim_of_inf, test_lim=True)
-
-    def test_Cloglog_gradients(self):
-        link = Cloglog()
-        lim_of_inf = _lim_val_exp
-        self.check_gradient(link, lim_of_inf, test_lim=True)
-
-    def test_Log_ex_1_gradients(self):
-        link = Log_ex_1()
-        lim_of_inf = _lim_val_exp
-        self.check_gradient(link, lim_of_inf, test_lim=True)
-        self.check_overflow(link, lim_of_inf)
-
-    def test_reciprocal_gradients(self):
-        link = Reciprocal()
-        lim_of_inf = _lim_val
-        #Does not work with much smaller values, and values closer to zero than 1e-5
-        self.check_gradient(link, lim_of_inf, test_lim=True)
--- a/GPy/testing/meanfunc_tests.py
+++ b/GPy/testing/meanfunc_tests.py
@ -1,95 +0,0 @@
-# Copyright (c) 2015, James Hensman
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import unittest
-import numpy as np
-import GPy
-
-class MFtests(unittest.TestCase):
-    def test_simple_mean_function(self):
-        """
-        The simplest possible mean function. No parameters, just a simple Sinusoid.
-        """
-        #create  simple mean function
-        mf = GPy.core.Mapping(1,1)
-        mf.f = np.sin
-        mf.update_gradients = lambda a,b: None
-
-        X = np.linspace(0,10,50).reshape(-1,1)
-        Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
-
-        k =GPy.kern.RBF(1)
-        lik = GPy.likelihoods.Gaussian()
-        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
-        self.assertTrue(m.checkgrad())
-
-    def test_parametric_mean_function(self):
-        """
-        A linear mean function with parameters that we'll learn alongside the kernel
-        """
-
-        X = np.linspace(-1,10,50).reshape(-1,1)
-        
-        Y = 3-np.abs((X-6))
-        Y += .5*np.cos(3*X) + 0.3*np.random.randn(*X.shape) 
-
-        mf = GPy.mappings.PiecewiseLinear(1, 1, [-1,1], [9,2])
-
-        k =GPy.kern.RBF(1)
-        lik = GPy.likelihoods.Gaussian()
-        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
-        self.assertTrue(m.checkgrad())
-
-    def test_parametric_mean_function_composition(self):
-        """
-        A linear mean function with parameters that we'll learn alongside the kernel
-        """
-
-        X = np.linspace(0,10,50).reshape(-1,1)
-        Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X
-
-        mf = GPy.mappings.Compound(GPy.mappings.Linear(1,1), 
-                                   GPy.mappings.Kernel(1, 1, np.random.normal(0,1,(1,1)), 
-                                                       GPy.kern.RBF(1))
-                                   )
-
-        k =GPy.kern.RBF(1)
-        lik = GPy.likelihoods.Gaussian()
-        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
-        self.assertTrue(m.checkgrad())
-
-    def test_parametric_mean_function_additive(self):
-        """
-        A linear mean function with parameters that we'll learn alongside the kernel
-        """
-
-        X = np.linspace(0,10,50).reshape(-1,1)
-        Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X
-
-        mf = GPy.mappings.Additive(GPy.mappings.Constant(1,1,3),
-               GPy.mappings.Additive(GPy.mappings.MLP(1,1),
-                     GPy.mappings.Identity(1,1)
-                           )
-                        )
-
-        k =GPy.kern.RBF(1)
-        lik = GPy.likelihoods.Gaussian()
-        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
-        self.assertTrue(m.checkgrad())
-
-    def test_svgp_mean_function(self):
-
-        # an instance of the SVIGOP with a men function
-        X = np.linspace(0,10,500).reshape(-1,1)
-        Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
-        Y = np.where(Y>0, 1,0) # make aclassificatino problem
-
-        mf = GPy.mappings.Linear(1,1)
-        Z = np.linspace(0,10,50).reshape(-1,1)
-        lik = GPy.likelihoods.Bernoulli()
-        k =GPy.kern.RBF(1) + GPy.kern.White(1, 1e-4)
-        m = GPy.core.SVGP(X, Y,Z=Z, kernel=k, likelihood=lik, mean_function=mf)
-        self.assertTrue(m.checkgrad())
-
-
-
--- a/GPy/testing/minibatch_tests.py
+++ b/GPy/testing/minibatch_tests.py
@ -1,230 +0,0 @@
-'''
-Created on 4 Sep 2015
-
-@author: maxz
-'''
-import unittest
-import numpy as np
-import GPy
-
-class BGPLVMTest(unittest.TestCase):
-
-
-    def setUp(self):
-        np.random.seed(12345)
-        X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13))
-        Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1]))
-        self.inan = np.random.binomial(1, .1, Y.shape).astype(bool)
-        self.X, self.W, self.Y = X,W,Y
-        self.Q = 3
-        self.m_full = GPy.models.BayesianGPLVM(Y, self.Q)
-
-    def test_lik_comparisons_m1_s0(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False)
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_predict_missing_data(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-
-        self.assertRaises(NotImplementedError, m.predict, m.X, full_cov=True)
-
-        mu1, var1 = m.predict(m.X, full_cov=False)
-        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
-        np.testing.assert_allclose(mu1, mu2)
-        np.testing.assert_allclose(var1, var2)
-
-        mu1, var1 = m.predict(m.X.mean, full_cov=True)
-        mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=True)
-        np.testing.assert_allclose(mu1, mu2)
-        np.testing.assert_allclose(var1[:,:,0], var2)
-
-        mu1, var1 = m.predict(m.X.mean, full_cov=False)
-        mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=False)
-        np.testing.assert_allclose(mu1, mu2)
-        np.testing.assert_allclose(var1[:,[0]], var2)
-
-    def test_lik_comparisons_m0_s0(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=self.m_full.X.variance.values, missing_data=False, stochastic=False)
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_lik_comparisons_m1_s1(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_lik_comparisons_m0_s1(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_gradients_missingdata(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False, batchsize=self.Y.shape[1])
-        assert(m.checkgrad())
-
-    def test_gradients_missingdata_stochastics(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=1)
-        assert(m.checkgrad())
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=4)
-        assert(m.checkgrad())
-
-    def test_gradients_stochastics(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=1)
-        assert(m.checkgrad())
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=4)
-        assert(m.checkgrad())
-
-    def test_predict(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-class SparseGPMinibatchTest(unittest.TestCase):
-
-
-    def setUp(self):
-        np.random.seed(12345)
-        X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13))
-        Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1]))
-        self.inan = np.random.binomial(1, .1, Y.shape).astype(bool)
-        self.X, self.W, self.Y = X,W,Y
-        self.Q = 3
-        self.m_full = GPy.models.SparseGPLVM(Y, self.Q, kernel=GPy.kern.RBF(self.Q, ARD=True))
-
-    def test_lik_comparisons_m1_s0(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=False)
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_sparsegp_init(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        try:
-            np.random.seed(1234)
-            Z = self.X[np.random.choice(self.X.shape[0], replace=False, size=10)].copy()
-            Q = Z.shape[1]
-            m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=True, stochastic=False)
-            assert(m.checkgrad())
-            m.optimize('adadelta', max_iters=10)
-            assert(m.checkgrad())
-    
-            m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=True, stochastic=True)
-            assert(m.checkgrad())
-            m.optimize('rprop', max_iters=10)
-            assert(m.checkgrad())
-            
-            m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=False, stochastic=False)
-            assert(m.checkgrad())
-            m.optimize('rprop', max_iters=10)
-            assert(m.checkgrad())
-            
-            m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=False, stochastic=True)
-            assert(m.checkgrad())
-            m.optimize('adadelta', max_iters=10)
-            assert(m.checkgrad())
-        except ImportError:
-            from nose import SkipTest
-            raise SkipTest('climin not installed, skipping stochastic gradients')
-
-    def test_predict_missing_data(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-
-        mu1, var1 = m.predict(m.X, full_cov=False)
-        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
-        np.testing.assert_allclose(mu1, mu2)
-        for i in range(var1.shape[1]):
-            np.testing.assert_allclose(var1[:,[i]], var2)
-
-        mu1, var1 = m.predict(m.X, full_cov=True)
-        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=True)
-        np.testing.assert_allclose(mu1, mu2)
-        for i in range(var1.shape[2]):
-            np.testing.assert_allclose(var1[:,:,i], var2)
-            
-    def test_lik_comparisons_m0_s0(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=False)
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_lik_comparisons_m1_s1(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_lik_comparisons_m0_s1(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_gradients_missingdata(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=False, batchsize=self.Y.shape[1])
-        assert(m.checkgrad())
-
-    def test_gradients_missingdata_stochastics(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=1)
-        assert(m.checkgrad())
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=4)
-        assert(m.checkgrad())
-
-    def test_gradients_stochastics(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=True, batchsize=1)
-        assert(m.checkgrad())
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=True, batchsize=4)
-        assert(m.checkgrad())
-
-    def test_predict(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-
-if __name__ == "__main__":
-    #import sys;sys.argv = ['', 'Test.testName']
-    unittest.main()
--- a/GPy/testing/mpi_test__.py
+++ b/GPy/testing/mpi_test__.py
@ -0,0 +1,21 @@
+# MPI smoke script: fit a sparse GP regression with an MPI communicator attached.
+import numpy as np
+import GPy
+from mpi4py import MPI
+np.random.seed(123456)
+comm = MPI.COMM_WORLD
+N = 100
+x = np.linspace(-6., 6., N)
+y = np.sin(x) + np.random.randn(N) * 0.05
+comm.Bcast(y)  # overwrite y on every rank with the root's (rank 0) copy
+data = np.vstack([x,y])
+#infr = GPy.inference.latent_function_inference.VarDTC_minibatch(mpi_comm=comm)
+m = GPy.models.SparseGPRegression(data[:1].T,data[1:2].T,mpi_comm=comm)
+m.optimize(max_iters=10)
+if comm.rank==0:
+    # Print the objective with MPI, detach the communicator, then print it again.
+    print(float(m.objective_function()))
+    m.inference_method.mpi_comm=None
+    m.mpi_comm=None
+    m._trigger_params_changed()
+    print(float(m.objective_function()))
--- a/GPy/testing/pickle_tests.py
+++ b/GPy/testing/pickle_tests.py
@ -1,130 +0,0 @@
-'''
-Created on 13 Mar 2014
-
-@author: maxz
-'''
-import unittest, itertools
-#import cPickle as pickle
-import pickle
-import numpy as np
-import tempfile
-from GPy.examples.dimensionality_reduction import mrd_simulation
-from GPy.core.parameterization.variational import NormalPosterior
-from GPy.models.gp_regression import GPRegression
-import GPy
-from nose import SkipTest
-
-def toy_model():
-    X = np.linspace(0,1,50)[:, None]
-    Y = np.sin(X)
-    m = GPRegression(X=X, Y=Y)
-    return m
-
-class ListDictTestCase(unittest.TestCase):
-    def assertListDictEquals(self, d1, d2, msg=None):
-        #py3 fix
-        #for k,v in d1.iteritems():
-        for k,v in d1.items():
-            self.assertListEqual(list(v), list(d2[k]), msg)
-    def assertArrayListEquals(self, l1, l2):
-        for a1, a2 in zip(l1,l2):
-            np.testing.assert_array_equal(a1, a2)
-
-class Test(ListDictTestCase):
-    @SkipTest
-    def test_load_pickle(self):
-        import os
-        m = GPy.load(os.path.join(os.path.abspath(os.path.split(__file__)[0]), 'pickle_test.pickle'))
-        self.assertTrue(m.checkgrad())
-        self.assertEqual(m.log_likelihood(), -4.7351019830022087)
-
-    def test_model(self):
-        par = toy_model()
-        pcopy = par.copy()
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
-        self.assertTrue(pcopy.checkgrad())
-        self.assert_(np.any(pcopy.gradient!=0.0))
-        with tempfile.TemporaryFile('w+b') as f:
-            par.pickle(f)
-            f.seek(0)
-            pcopy = pickle.load(f)
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assert_(pcopy.checkgrad())
-
-    def test_modelrecreation(self):
-        par = toy_model()
-        pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
-        np.testing.assert_allclose(par.param_array, pcopy.param_array)
-        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
-        self.assertTrue(pcopy.checkgrad())
-        self.assert_(np.any(pcopy.gradient!=0.0))
-        np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)
-        par.randomize()
-        with tempfile.TemporaryFile('w+b') as f:
-            par.pickle(f)
-            f.seek(0)
-            pcopy = pickle.load(f)
-        np.testing.assert_allclose(par.param_array, pcopy.param_array)
-        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full, atol=1e-6)
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assert_(pcopy.checkgrad())
-
-    def test_posterior(self):
-        X = np.random.randn(3,5)
-        Xv = np.random.rand(*X.shape)
-        par = NormalPosterior(X,Xv)
-        par.gradient = 10
-        pcopy = par.copy()
-        pcopy.gradient = 10
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
-        with tempfile.TemporaryFile('w+b') as f:
-            par.pickle(f)
-            f.seek(0)
-            pcopy = pickle.load(f)
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        pcopy.gradient = 10
-        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
-        np.testing.assert_allclose(pcopy.mean.gradient_full, 10)
-        self.assertSequenceEqual(str(par), str(pcopy))
-
-    def test_model_concat(self):
-        par = mrd_simulation(optimize=0, plot=0, plot_sim=0)
-        par.randomize()
-        pcopy = par.copy()
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
-        self.assertTrue(par.checkgrad())
-        self.assertTrue(pcopy.checkgrad())
-        self.assert_(np.any(pcopy.gradient!=0.0))
-        with tempfile.TemporaryFile('w+b') as f:
-            par.pickle(f)
-            f.seek(0)
-            pcopy = pickle.load(f)
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assert_(pcopy.checkgrad())
-
-    def _callback(self, what, which):
-        what.count += 1
-
-
-if __name__ == "__main__":
-    #import sys;sys.argv = ['', 'Test.test_parameter_index_operations']
-    unittest.main()
--- a/GPy/testing/plotting_tests.py
+++ b/GPy/testing/plotting_tests.py
@ -1,509 +0,0 @@
-#===============================================================================
-# Copyright (c) 2015, Max Zwiessele
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of GPy nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
-
-
-#===============================================================================
-# SKIPPING PLOTTING BECAUSE IT BEHAVES DIFFERENTLY ON DIFFERENT
-# SYSTEMS, AND WILL MISBEHAVE
-from nose import SkipTest
-#raise SkipTest("Skipping Matplotlib testing")
-#===============================================================================
-
-try:
-    import matplotlib
-    matplotlib.use('agg')
-except ImportError:
-    # matplotlib not installed
-    from nose import SkipTest
-    raise SkipTest("Error importing matplotlib")
-
-from unittest.case import TestCase
-
-import numpy as np
-import GPy, os
-import logging
-
-from GPy.util.config import config
-from GPy.plotting import change_plotting_library, plotting_library
-
-class ConfigTest(TestCase):
-    def tearDown(self):
-        change_plotting_library('matplotlib')
-
-    def test_change_plotting(self):
-        self.assertRaises(ValueError, change_plotting_library, 'not+in9names')
-        change_plotting_library('none')
-        self.assertRaises(RuntimeError, plotting_library)
-
-change_plotting_library('matplotlib')
-if config.get('plotting', 'library') != 'matplotlib':
-    raise SkipTest("Matplotlib not installed, not testing plots")
-
-try:
-    from matplotlib import cbook, pyplot as plt
-    from matplotlib.testing.compare import compare_images
-except ImportError:
-    raise SkipTest("Matplotlib not installed, not testing plots")
-
-extensions = ['npz']
-
-basedir = os.path.dirname(os.path.relpath(os.path.abspath(__file__)))
-
-def _image_directories():
-    """
-    Compute the baseline and result image directories for testing *func*.
-    Create the result directory if it doesn't exist.
-    """
-    #module_name = __init__.__module__
-    #mods = module_name.split('.')
-    #basedir = os.path.join(*mods)
-    result_dir = os.path.join(basedir, 'testresult','.')
-    baseline_dir = os.path.join(basedir, 'baseline','.')
-    if not os.path.exists(result_dir):
-        os.makedirs(result_dir)
-    return baseline_dir, result_dir
-
-baseline_dir, result_dir = _image_directories()
-if not os.path.exists(baseline_dir):
-    raise SkipTest("Not installed from source, baseline not available. Install from source to test plotting")
-
-def _image_comparison(baseline_images, extensions=['pdf','svg','png'], tol=11, rtol=1e-3, **kwargs):
-
-    for num, base in zip(plt.get_fignums(), baseline_images):
-        for ext in extensions:
-            fig = plt.figure(num)
-            try:
-                fig.canvas.draw()
-            except Exception as e:
-                logging.error(base)
-                #raise SkipTest(e)
-            #fig.axes[0].set_axis_off()
-            #fig.set_frameon(False)
-            if ext in ['npz']:
-                figdict = flatten_axis(fig)
-                np.savez_compressed(os.path.join(result_dir, "{}.{}".format(base, ext)), **figdict)
-                try:
-                    fig.savefig(os.path.join(result_dir, "{}.{}".format(base, 'png')),
-                                transparent=True,
-                                edgecolor='none',
-                                facecolor='none',
-                                #bbox='tight'
-                                )
-                except:
-                    logging.error(base)
-                    # raise
-            else:
-                fig.savefig(os.path.join(result_dir, "{}.{}".format(base, ext)),
-                            transparent=True,
-                            edgecolor='none',
-                            facecolor='none',
-                            #bbox='tight'
-                            )
-    for num, base in zip(plt.get_fignums(), baseline_images):
-        for ext in extensions:
-            #plt.close(num)
-            actual = os.path.join(result_dir, "{}.{}".format(base, ext))
-            expected = os.path.join(baseline_dir, "{}.{}".format(base, ext))
-            if ext == 'npz':
-                def do_test():
-                    if not os.path.exists(expected):
-                        import shutil
-                        shutil.copy2(actual, expected)
-                        #shutil.copy2(os.path.join(result_dir, "{}.{}".format(base, 'png')), os.path.join(baseline_dir, "{}.{}".format(base, 'png')))
-                        raise IOError("Baseline file {} not found, copying result {}".format(expected, actual))
-                    else:
-                        exp_dict = dict(np.load(expected).items())
-                        act_dict = dict(np.load(actual).items())
-                        for name in act_dict:
-                            if name in exp_dict:
-                                try:
-                                    np.testing.assert_allclose(exp_dict[name], act_dict[name], err_msg="Mismatch in {}.{}".format(base, name), rtol=rtol, **kwargs)
-                                except AssertionError as e:
-                                    raise SkipTest(e)
-            else:
-                def do_test():
-                    err = compare_images(expected, actual, tol, in_decorator=True)
-                    if err:
-                        raise SkipTest("Error between {} and {} is {:.5f}, which is bigger then the tolerance of {:.5f}".format(actual, expected, err['rms'], tol))
-            yield do_test
-    plt.close('all')
-
-def flatten_axis(ax, prevname=''):
-    import inspect
-    members = inspect.getmembers(ax)
-
-    arrays = {}
-
-    def _flatten(l, pre):
-        arr = {}
-        if isinstance(l, np.ndarray):
-            if l.size:
-                arr[pre] = np.asarray(l)
-        elif isinstance(l, dict):
-            for _n in l:
-                _tmp = _flatten(l, pre+"."+_n+".")
-                for _nt in _tmp.keys():
-                    arrays[_nt] = _tmp[_nt]
-        elif isinstance(l, list) and len(l)>0:
-            for i in range(len(l)):
-                _tmp = _flatten(l[i], pre+"[{}]".format(i))
-                for _n in _tmp:
-                    arr["{}".format(_n)] = _tmp[_n]
-        else:
-            return flatten_axis(l, pre+'.')
-        return arr
-
-
-    for name, l in members:
-        if isinstance(l, np.ndarray):
-            arrays[prevname+name] = np.asarray(l)
-        elif isinstance(l, list) and len(l)>0:
-            for i in range(len(l)):
-                _tmp = _flatten(l[i], prevname+name+"[{}]".format(i))
-                for _n in _tmp:
-                    arrays["{}".format(_n)] = _tmp[_n]
-
-    return arrays
-
-def _a(x,y,decimal):
-    np.testing.assert_array_almost_equal(x, y, decimal)
-
-def compare_axis_dicts(x, y, decimal=6):
-    try:
-        assert(len(x)==len(y))
-        for name in x:
-            _a(x[name], y[name], decimal)
-    except AssertionError as e:
-        raise SkipTest(e.message)
-
-def test_figure():
-    np.random.seed(1239847)
-    from GPy.plotting import plotting_library as pl
-    #import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    import warnings
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-
-        ax, _ = pl().new_canvas(num="imshow_interact")
-        def test_func(x):
-            return x[:, 0].reshape(3,3)
-        pl().imshow_interact(ax, test_func, extent=(-1,1,-1,1), resolution=3)
-
-        ax, _ = pl().new_canvas()
-        def test_func_2(x):
-            y = x[:, 0].reshape(3,3)
-            anno = np.argmax(x, axis=1).reshape(3,3)
-            return y, anno
-
-        pl().annotation_heatmap_interact(ax, test_func_2, extent=(-1,1,-1,1), resolution=3)
-        pl().annotation_heatmap_interact(ax, test_func_2, extent=(-1,1,-1,1), resolution=3, imshow_kwargs=dict(interpolation='nearest'))
-
-        ax, _ = pl().new_canvas(figsize=(4,3))
-        x = np.linspace(0,1,100)
-        y = [0,1,2]
-        array = np.array([.4,.5])
-        cmap = matplotlib.colors.LinearSegmentedColormap.from_list('WhToColor', ('r', 'b'), N=array.size)
-
-        pl().fill_gradient(ax, x, y, facecolors=['r', 'g'], array=array, cmap=cmap)
-
-        ax, _ = pl().new_canvas(num="3d_plot", figsize=(4,3), projection='3d', xlabel='x', ylabel='y', zlabel='z', title='awsome title', xlim=(-1,1), ylim=(-1,1), zlim=(-3,3))
-        z = 2-np.abs(np.linspace(-2,2,(100)))+1
-        x, y = z*np.sin(np.linspace(-2*np.pi,2*np.pi,(100))), z*np.cos(np.linspace(-np.pi,np.pi,(100)))
-
-        pl().plot(ax, x, y, z, linewidth=2)
-
-        for do_test in _image_comparison(
-                baseline_images=['coverage_{}'.format(sub) for sub in ["imshow_interact",'annotation_interact','gradient','3d_plot',]],
-                extensions=extensions):
-            yield (do_test, )
-
-
-def test_kernel():
-    np.random.seed(1239847)
-    #import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    import warnings
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        k = GPy.kern.RBF(5, ARD=True) * GPy.kern.Linear(3, active_dims=[0,2,4], ARD=True) + GPy.kern.Bias(2)
-        k.randomize()
-        k2 = GPy.kern.RBF(5, ARD=True) * GPy.kern.Linear(3, active_dims=[0,2,4], ARD=True) + GPy.kern.Bias(2) + GPy.kern.White(4)
-        k2[:-1] = k[:]
-        k2.plot_ARD(['rbf', 'linear', 'bias'], legend=True)
-        k2.plot_covariance(visible_dims=[0, 3], plot_limits=(-1,3))
-        k2.plot_covariance(visible_dims=[2], plot_limits=(-1, 3))
-        k2.plot_covariance(visible_dims=[2, 4], plot_limits=((-1, 0), (5, 3)), projection='3d', rstride=10, cstride=10)
-        k2.plot_covariance(visible_dims=[1, 4])
-        for do_test in _image_comparison(
-                baseline_images=['kern_{}'.format(sub) for sub in ["ARD", 'cov_2d', 'cov_1d', 'cov_3d', 'cov_no_lim']],
-                extensions=extensions):
-            yield (do_test, )
-
-def test_plot():
-    np.random.seed(111)
-    import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    import warnings
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        X = np.random.uniform(-2, 2, (40, 1))
-        f = .2 * np.sin(1.3*X) + 1.3*np.cos(2*X)
-        Y = f+np.random.normal(0, .1, f.shape)
-        m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X)*[0.06])
-        #m.optimize()
-        m.plot_data()
-        m.plot_mean()
-        m.plot_confidence()
-        m.plot_density()
-        m.plot_errorbars_trainset()
-        m.plot_samples()
-        m.plot_data_error()
-    for do_test in _image_comparison(baseline_images=['gp_{}'.format(sub) for sub in ["data", "mean", 'conf',
-                                                                                      'density',
-                                                                                      'out_error',
-                                                                                      'samples', 'in_error']], extensions=extensions):
-        yield (do_test, )
-
-def test_twod():
-    np.random.seed(11111)
-    import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    X = np.random.uniform(-2, 2, (40, 2))
-    f = .2 * np.sin(1.3*X[:,[0]]) + 1.3*np.cos(2*X[:,[1]])
-    Y = f+np.random.normal(0, .1, f.shape)
-    m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X)*[0.01, 0.2])
-    #m.optimize()
-    m.plot_data()
-    m.plot_mean()
-    m.plot_inducing(legend=False, marker='s')
-    #m.plot_errorbars_trainset()
-    m.plot_data_error()
-    for do_test in _image_comparison(baseline_images=['gp_2d_{}'.format(sub) for sub in ["data", "mean",
-                                                                                         'inducing',
-                                                                                         #'out_error',
-                                                                                         'in_error',
-                                                                                         ]], extensions=extensions):
-        yield (do_test, )
-
-def test_threed():
-    np.random.seed(11111)
-    import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    X = np.random.uniform(-2, 2, (40, 2))
-    f = .2 * np.sin(1.3*X[:,[0]]) + 1.3*np.cos(2*X[:,[1]])
-    Y = f+np.random.normal(0, .1, f.shape)
-    m = GPy.models.SparseGPRegression(X, Y)
-    m.likelihood.variance = .1
-    #m.optimize()
-    m.plot_samples(projection='3d', samples=1)
-    m.plot_samples(projection='3d', plot_raw=False, samples=1)
-    plt.close('all')
-    m.plot_data(projection='3d')
-    m.plot_mean(projection='3d', rstride=10, cstride=10)
-    m.plot_inducing(projection='3d')
-    #m.plot_errorbars_trainset(projection='3d')
-    for do_test in _image_comparison(baseline_images=[
-        'gp_3d_{}'.format(sub) for sub in ["data", "mean", 'inducing',
-    ]], extensions=extensions):
-        yield (do_test, )
-
-def test_sparse():
-    np.random.seed(11111)
-    import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    X = np.random.uniform(-2, 2, (40, 1))
-    f = .2 * np.sin(1.3*X) + 1.3*np.cos(2*X)
-    Y = f+np.random.normal(0, .1, f.shape)
-    m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X)*0.1)
-    #m.optimize()
-    #m.plot_inducing()
-    _, ax = plt.subplots()
-    m.plot_data(ax=ax)
-    m.plot_data_error(ax=ax)
-    for do_test in _image_comparison(baseline_images=['sparse_gp_{}'.format(sub) for sub in ['data_error']], extensions=extensions):
-        yield (do_test, )
-
-def test_classification():
-    np.random.seed(11111)
-    import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    X = np.random.uniform(-2, 2, (40, 1))
-    f = .2 * np.sin(1.3*X) + 1.3*np.cos(2*X)
-    Y = f+np.random.normal(0, .1, f.shape)
-    m = GPy.models.GPClassification(X, Y>Y.mean())
-    #m.optimize()
-    _, ax = plt.subplots()
-    m.plot(plot_raw=False, apply_link=False, ax=ax, samples=3)
-    m.plot_errorbars_trainset(plot_raw=False, apply_link=False, ax=ax)
-    _, ax = plt.subplots()
-    m.plot(plot_raw=True, apply_link=False, ax=ax, samples=3)
-    m.plot_errorbars_trainset(plot_raw=True, apply_link=False, ax=ax)
-    _, ax = plt.subplots()
-    m.plot(plot_raw=True, apply_link=True, ax=ax, samples=3)
-    m.plot_errorbars_trainset(plot_raw=True, apply_link=True, ax=ax)
-    for do_test in _image_comparison(baseline_images=['gp_class_{}'.format(sub) for sub in ["likelihood", "raw", 'raw_link']], extensions=extensions):
-        yield (do_test, )
-
-
-def test_sparse_classification():
-    np.random.seed(11111)
-    import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    X = np.random.uniform(-2, 2, (40, 1))
-    f = .2 * np.sin(1.3*X) + 1.3*np.cos(2*X)
-    Y = f+np.random.normal(0, .1, f.shape)
-    m = GPy.models.SparseGPClassification(X, Y>Y.mean())
-    #m.optimize()
-    m.plot(plot_raw=False, apply_link=False, samples_likelihood=3)
-    np.random.seed(111)
-    m.plot(plot_raw=True, apply_link=False, samples=3)
-    np.random.seed(111)
-    m.plot(plot_raw=True, apply_link=True, samples=3)
-    for do_test in _image_comparison(baseline_images=['sparse_gp_class_{}'.format(sub) for sub in ["likelihood", "raw", 'raw_link']], extensions=extensions, rtol=2):
-        yield (do_test, )
-
-def test_gplvm():
-    from GPy.models import GPLVM
-    np.random.seed(12345)
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    #Q = 3
-    # Define dataset
-    #N = 60
-    #k1 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,10,10,0.1,0.1]), ARD=True)
-    #k2 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,0.1,10,0.1,10]), ARD=True)
-    #k3 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[0.1,0.1,10,10,10]), ARD=True)
-    #X = np.random.normal(0, 1, (N, 5))
-    #A = np.random.multivariate_normal(np.zeros(N), k1.K(X), Q).T
-    #B = np.random.multivariate_normal(np.zeros(N), k2.K(X), Q).T
-    #C = np.random.multivariate_normal(np.zeros(N), k3.K(X), Q).T
-    #Y = np.vstack((A,B,C))
-    #labels = np.hstack((np.zeros(A.shape[0]), np.ones(B.shape[0]), np.ones(C.shape[0])*2))
-
-    #k = RBF(Q, ARD=True, lengthscale=2)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
-    pars = np.load(os.path.join(basedir, 'b-gplvm-save.npz'))
-    Y = pars['Y']
-    Q = pars['Q']
-    labels = pars['labels']
-
-    import warnings
-    with warnings.catch_warnings(record=True) as w:
-        warnings.simplefilter('always')  # always print
-        m = GPLVM(Y, Q, initialize=False)
-    m.update_model(False)
-    m.initialize_parameter()
-    m[:] = pars['gplvm_p']
-    m.update_model(True)
-
-    #m.optimize(messages=0)
-    np.random.seed(111)
-    m.plot_latent(labels=labels)
-    np.random.seed(111)
-    m.plot_scatter(projection='3d', labels=labels)
-    np.random.seed(111)
-    m.plot_magnification(labels=labels)
-    m.plot_steepest_gradient_map(resolution=10, data_labels=labels)
-    for do_test in _image_comparison(baseline_images=['gplvm_{}'.format(sub) for sub in ["latent", "latent_3d", "magnification", 'gradient']],
-                                     extensions=extensions,
-                                     tol=12):
-        yield (do_test, )
-
-def test_bayesian_gplvm():
-    from ..models import BayesianGPLVM
-    np.random.seed(12345)
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    #Q = 3
-    # Define dataset
-    #N = 10
-    #k1 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,10,10,0.1,0.1]), ARD=True)
-    #k2 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,0.1,10,0.1,10]), ARD=True)
-    #k3 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[0.1,0.1,10,10,10]), ARD=True)
-    #X = np.random.normal(0, 1, (N, 5))
-    #A = np.random.multivariate_normal(np.zeros(N), k1.K(X), Q).T
-    #B = np.random.multivariate_normal(np.zeros(N), k2.K(X), Q).T
-    #C = np.random.multivariate_normal(np.zeros(N), k3.K(X), Q).T
-
-    #Y = np.vstack((A,B,C))
-    #labels = np.hstack((np.zeros(A.shape[0]), np.ones(B.shape[0]), np.ones(C.shape[0])*2))
-
-    #k = RBF(Q, ARD=True, lengthscale=2)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
-    pars = np.load(os.path.join(basedir, 'b-gplvm-save.npz'))
-    Y = pars['Y']
-    Q = pars['Q']
-    labels = pars['labels']
-
-    import warnings
-    with warnings.catch_warnings(record=True) as w:
-        warnings.simplefilter('always')  # always print
-        m = BayesianGPLVM(Y, Q, initialize=False)
-    m.update_model(False)
-    m.initialize_parameter()
-    m[:] = pars['bgplvm_p']
-    m.update_model(True)
-
-    #m.optimize(messages=0)
-    np.random.seed(111)
-    m.plot_inducing(projection='2d')
-    np.random.seed(111)
-    m.plot_inducing(projection='3d')
-    np.random.seed(111)
-    m.plot_latent(projection='2d', labels=labels)
-    np.random.seed(111)
-    m.plot_scatter(projection='3d', labels=labels)
-    np.random.seed(111)
-    m.plot_magnification(labels=labels)
-    np.random.seed(111)
-    m.plot_steepest_gradient_map(resolution=10, data_labels=labels)
-    for do_test in _image_comparison(baseline_images=['bayesian_gplvm_{}'.format(sub) for sub in ["inducing", "inducing_3d", "latent", "latent_3d", "magnification", 'gradient']], extensions=extensions):
-        yield (do_test, )
-
-if __name__ == '__main__':
-    import nose
-    nose.main(defaultTest='./plotting_tests.py')
--- a/GPy/testing/run_coverage.sh
+++ b/GPy/testing/run_coverage.sh
@ -1 +1 @@
-nosetests . --with-coverage --logging-level=INFO --cover-html --cover-html-dir=coverage --cover-package=GPy --cover-erase
+pytest .
--- a/GPy/testing/rv_transformation_tests.py
+++ b/GPy/testing/rv_transformation_tests.py
@ -1,117 +0,0 @@
-# Written by Ilias Bilionis
-"""
-Test if hyperparameters in models are properly transformed.
-"""
-
-
-import unittest
-import numpy as np
-import scipy.stats as st
-import GPy
-
-
-class TestModel(GPy.core.Model):
-    """
-    A simple GPy model with one parameter.
-    """
-    def __init__(self, theta=1.):
-        super(TestModel, self).__init__('test_model')
-        theta = GPy.core.Param('theta', theta)
-        self.link_parameter(theta)
-
-    def log_likelihood(self):
-        return 0.
-
-
-class RVTransformationTestCase(unittest.TestCase):
-
-    def _test_trans(self, trans):
-        m = TestModel()
-        prior = GPy.priors.LogGaussian(.5, 0.1)
-        m.theta.set_prior(prior)
-        m.theta.unconstrain()
-        m.theta.constrain(trans)
-        # The PDF of the transformed variables
-        p_phi = lambda phi : np.exp(-m._objective_grads(phi)[0])
-        # To the empirical PDF of:
-        theta_s = prior.rvs(1e5)
-        phi_s = trans.finv(theta_s)
-        # which is essentially a kernel density estimation
-        kde = st.gaussian_kde(phi_s)
-        # We will compare the PDF here:
-        phi = np.linspace(phi_s.min(), phi_s.max(), 100)
-        # The transformed PDF of phi should be this:
-        pdf_phi = np.array([p_phi(p) for p in phi])
-        # UNCOMMENT TO SEE GRAPHICAL COMPARISON
-        #import matplotlib.pyplot as plt
-        #fig, ax = plt.subplots()
-        #ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Histogram')
-        #ax.plot(phi, kde(phi), '--', linewidth=2, label='Kernel Density Estimation')
-        #ax.plot(phi, pdf_phi, ':', linewidth=2, label='Transformed PDF')
-        #ax.set_xlabel(r'transformed $\theta$', fontsize=16)
-        #ax.set_ylabel('PDF', fontsize=16)
-        #plt.legend(loc='best')
-        #plt.show(block=True)
-        # END OF PLOT
-        # The following test cannot be very accurate
-        self.assertTrue(np.linalg.norm(pdf_phi - kde(phi)) / np.linalg.norm(kde(phi)) <= 1e-1)
-
-    def _test_grad(self, trans):
-        np.random.seed(1234)
-        m = TestModel(np.random.uniform(.5, 1.5, 20))
-        prior = GPy.priors.LogGaussian(.5, 0.1)
-        m.theta.set_prior(prior)
-        m.theta.constrain(trans)
-        m.randomize()
-        print(m)
-        self.assertTrue(m.checkgrad(1))
-
-    def test_Logexp(self):
-        self._test_trans(GPy.constraints.Logexp())
-
-    @unittest.skip("Gradient not checking right, @jameshensman what is going on here?")
-    def test_Logexp_grad(self):        
-        self._test_grad(GPy.constraints.Logexp())
-        
-    def test_Exponent(self):
-        self._test_trans(GPy.constraints.Exponent())
-    
-    @unittest.skip("Gradient not checking right, @jameshensman what is going on here?")
-    def test_Exponent_grad(self):
-        self._test_grad(GPy.constraints.Exponent())
-
-
-if __name__ == '__main__':
-    unittest.main()
-    quit()
-    m = TestModel()
-    prior = GPy.priors.LogGaussian(0., .9)
-    m.theta.set_prior(prior)
-
-    # The following should return the PDF in terms of the transformed quantities
-    p_phi = lambda phi : np.exp(-m._objective_grads(phi)[0])
-
-    # Let's look at the transformation phi = log(exp(theta - 1))
-    trans = GPy.constraints.Exponent()
-    m.theta.constrain(trans)
-    # Plot the transformed probability density
-    phi = np.linspace(-8, 8, 100)
-    fig, ax = plt.subplots()
-    # Let's draw some samples of theta and transform them so that we see
-    # which one is right
-    theta_s = prior.rvs(10000)
-    # Transform it to the new variables
-    phi_s = trans.finv(theta_s)
-    # And draw their histogram
-    ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Empirical')
-    # This is to be compared to the PDF of the model expressed in terms of these new
-    # variables
-    ax.plot(phi, [p_phi(p) for p in phi], label='Transformed PDF', linewidth=2)
-    ax.set_xlim(-3, 10)
-    ax.set_xlabel(r'transformed $\theta$', fontsize=16)
-    ax.set_ylabel('PDF', fontsize=16)
-    plt.legend(loc='best')
-    # Now let's test the gradients
-    m.checkgrad(verbose=True)
-    # And show the plot
-    plt.show(block=True)
--- a/GPy/testing/serialization_tests.py
+++ b/GPy/testing/serialization_tests.py
@ -1,279 +0,0 @@
-'''
-Created on 20 April 2017
-
-@author: pgmoren
-'''
-import unittest, itertools
-#import cPickle as pickle
-import pickle
-import numpy as np
-import tempfile
-import GPy
-from nose import SkipTest
-import numpy as np
-import os
-fixed_seed = 11
-
-
-class Test(unittest.TestCase):
-    def test_serialize_deserialize_kernels(self):
-        k1 = GPy.kern.RBF(2, variance=1.0, lengthscale=[1.0,1.0], ARD=True)
-        k2 = GPy.kern.RatQuad(2, variance=2.0, lengthscale=1.0, power=2.0, active_dims = [0,1])
-        k3 = GPy.kern.Bias(2, variance=2.0, active_dims = [1,0])
-        k4 = GPy.kern.StdPeriodic(2, variance=2.0, lengthscale=1.0, period=1.0, active_dims = [1,1])
-        k5 = GPy.kern.Linear(2, variances=[2.0, 1.0], ARD=True, active_dims = [1,1])
-        k6 = GPy.kern.Exponential(2, variance=1., lengthscale=2)
-        k7 = GPy.kern.Matern32(2, variance=1.0, lengthscale=[1.0,3.0], ARD=True, active_dims = [1,1])
-        k8 = GPy.kern.Matern52(2, variance=2.0, lengthscale=[2.0,1.0], ARD=True, active_dims = [1,0])
-        k9 = GPy.kern.ExpQuad(2, variance=3.0, lengthscale=[1.0,2.0], ARD=True, active_dims = [0,1])
-        k10 = GPy.kern.OU(2, variance=2.0, lengthscale=[2.0, 1.0], ARD=True, active_dims=[1, 0])
-        k11 = k1 + k1.copy() + k2 + k3 + k4 + k5 + k6
-        k12 = k1 * k2 * k2.copy() * k3 * k4 * k5
-        k13 = (k1 + k2) * (k3 + k4 + k5)
-        k14 = ((k1 + k2) * k3) + k4 + k5 * k7
-        k15 = ((k1 + k2) * k3) + k4 * k5 + k8 * k10
-        k16 = ((k1 * k2) * k3) + k4 * k5 + k8 + k9
-
-        k_list = [k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15,k16]
-
-        for kk in k_list:
-            kk_dict = kk.to_dict()
-            kk_r = GPy.kern.Kern.from_dict(kk_dict)
-            assert type(kk) == type(kk_r)
-            np.testing.assert_array_equal(kk[:], kk_r[:])
-            np.testing.assert_array_equal(np.array(kk.active_dims), np.array(kk_r.active_dims))
-
-    def test_serialize_deserialize_mappings(self):
-        m1 = GPy.mappings.Identity(3,2)
-        m2 = GPy.mappings.Constant(3,2,1)
-        m2_r = GPy.core.mapping.Mapping.from_dict(m2.to_dict())
-        np.testing.assert_array_equal(m2.C.values[:], m2_r.C.values[:])
-        m3 = GPy.mappings.Linear(3,2)
-        m3_r = GPy.core.mapping.Mapping.from_dict(m3.to_dict())
-        assert np.all(m3.A == m3_r.A)
-
-        m_list = [m1, m2, m3]
-        for mm in m_list:
-            mm_dict = mm.to_dict()
-            mm_r = GPy.core.mapping.Mapping.from_dict(mm_dict)
-            assert type(mm) == type(mm_r)
-            assert type(mm.input_dim) == type(mm_r.input_dim)
-            assert type(mm.output_dim) == type(mm_r.output_dim)
-
-    def test_serialize_deserialize_likelihoods(self):
-        l1 = GPy.likelihoods.Gaussian(GPy.likelihoods.link_functions.Identity(),variance=3.0)
-        l1_r = GPy.likelihoods.likelihood.Likelihood.from_dict(l1.to_dict())
-        l2 = GPy.likelihoods.Bernoulli(GPy.likelihoods.link_functions.Probit())
-        l2_r = GPy.likelihoods.likelihood.Likelihood.from_dict(l2.to_dict())
-        assert type(l1) == type(l1_r)
-        assert np.all(l1.variance == l1_r.variance)
-        assert type(l2) == type(l2_r)
-
-    def test_serialize_deserialize_normalizers(self):
-        n1 = GPy.util.normalizer.Standardize()
-        n1.scale_by(np.random.rand(10))
-        n1_r = GPy.util.normalizer._Norm.from_dict((n1.to_dict()))
-        assert type(n1) == type(n1_r)
-        assert np.all(n1.mean == n1_r.mean)
-        assert np.all(n1.std == n1_r.std)
-
-    def test_serialize_deserialize_link_functions(self):
-        l1 = GPy.likelihoods.link_functions.Identity()
-        l2 = GPy.likelihoods.link_functions.Probit()
-        l_list = [l1, l2]
-        for ll in l_list:
-            ll_dict = ll.to_dict()
-            ll_r = GPy.likelihoods.link_functions.GPTransformation.from_dict(ll_dict)
-            assert type(ll) == type(ll_r)
-
-    def test_serialize_deserialize_inference_methods(self):
-
-        e1 = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode="nested")
-        e1.ga_approx_old = GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10))
-        e1._ep_approximation = []
-        e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.posteriorParams(np.random.rand(10),np.random.rand(100).reshape((10,10))))
-        e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10)))
-        e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.cavityParams(10))
-        e1._ep_approximation[-1].v = np.random.rand(10)
-        e1._ep_approximation[-1].tau = np.random.rand(10)
-        e1._ep_approximation.append(np.random.rand(10))
-        e1_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e1.to_dict())
-
-        assert type(e1) == type(e1_r)
-        assert e1.epsilon==e1_r.epsilon
-        assert e1.eta==e1_r.eta
-        assert e1.delta==e1_r.delta
-        assert e1.always_reset==e1_r.always_reset
-        assert e1.max_iters==e1_r.max_iters
-        assert e1.ep_mode==e1_r.ep_mode
-        assert e1.parallel_updates==e1_r.parallel_updates
-
-        np.testing.assert_array_equal(e1.ga_approx_old.tau[:], e1_r.ga_approx_old.tau[:])
-        np.testing.assert_array_equal(e1.ga_approx_old.v[:], e1_r.ga_approx_old.v[:])
-        np.testing.assert_array_equal(e1._ep_approximation[0].mu[:], e1_r._ep_approximation[0].mu[:])
-        np.testing.assert_array_equal(e1._ep_approximation[0].Sigma[:], e1_r._ep_approximation[0].Sigma[:])
-        np.testing.assert_array_equal(e1._ep_approximation[1].tau[:], e1_r._ep_approximation[1].tau[:])
-        np.testing.assert_array_equal(e1._ep_approximation[1].v[:], e1_r._ep_approximation[1].v[:])
-        np.testing.assert_array_equal(e1._ep_approximation[2].tau[:], e1_r._ep_approximation[2].tau[:])
-        np.testing.assert_array_equal(e1._ep_approximation[2].v[:], e1_r._ep_approximation[2].v[:])
-        np.testing.assert_array_equal(e1._ep_approximation[3][:], e1_r._ep_approximation[3][:])
-
-        e2 = GPy.inference.latent_function_inference.expectation_propagation.EPDTC(ep_mode="nested")
-        e2.ga_approx_old = GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10))
-        e2._ep_approximation = []
-        e2._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.posteriorParamsDTC(np.random.rand(10),np.random.rand(10)))
-        e2._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10)))
-        e2._ep_approximation.append(100.0)
-        e2_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e2.to_dict())
-
-        assert type(e2) == type(e2_r)
-        assert e2.epsilon==e2_r.epsilon
-        assert e2.eta==e2_r.eta
-        assert e2.delta==e2_r.delta
-        assert e2.always_reset==e2_r.always_reset
-        assert e2.max_iters==e2_r.max_iters
-        assert e2.ep_mode==e2_r.ep_mode
-        assert e2.parallel_updates==e2_r.parallel_updates
-
-        np.testing.assert_array_equal(e2.ga_approx_old.tau[:], e2_r.ga_approx_old.tau[:])
-        np.testing.assert_array_equal(e2.ga_approx_old.v[:], e2_r.ga_approx_old.v[:])
-        np.testing.assert_array_equal(e2._ep_approximation[0].mu[:], e2_r._ep_approximation[0].mu[:])
-        np.testing.assert_array_equal(e2._ep_approximation[0].Sigma_diag[:], e2_r._ep_approximation[0].Sigma_diag[:])
-        np.testing.assert_array_equal(e2._ep_approximation[1].tau[:], e2_r._ep_approximation[1].tau[:])
-        np.testing.assert_array_equal(e2._ep_approximation[1].v[:], e2_r._ep_approximation[1].v[:])
-        assert(e2._ep_approximation[2] == e2_r._ep_approximation[2])
-
-        e3 = GPy.inference.latent_function_inference.exact_gaussian_inference.ExactGaussianInference()
-        e3_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e3.to_dict())
-
-        assert type(e3) == type(e3_r)
-
-
-    def test_serialize_deserialize_GP(self):
-        np.random.seed(fixed_seed)
-        N = 20
-        Nhalf = int(N/2)
-        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
-        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
-        kernel = GPy.kern.RBF(1)
-        likelihood = GPy.likelihoods.Bernoulli()
-        inference_method=GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode="nested")
-        mean_function=None
-
-        m = GPy.core.GP(X=X, Y=Y,  kernel=kernel, likelihood=likelihood, inference_method=inference_method, mean_function=mean_function, normalizer=True, name='gp_classification')
-        m.optimize()
-        m.save_model("temp_test_gp_with_data.json", compress=True, save_data=True)
-        m.save_model("temp_test_gp_without_data.json", compress=True, save_data=False)
-        m1_r = GPy.core.GP.load_model("temp_test_gp_with_data.json.zip")
-        m2_r = GPy.core.GP.load_model("temp_test_gp_without_data.json.zip", (X,Y))
-        os.remove("temp_test_gp_with_data.json.zip")
-        os.remove("temp_test_gp_without_data.json.zip")
-        var = m.predict(X)[0]
-        var1_r = m1_r.predict(X)[0]
-        var2_r = m2_r.predict(X)[0]
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var2_r).flatten())
-
-    def test_serialize_deserialize_SparseGP(self):
-        np.random.seed(fixed_seed)
-        N = 20
-        Nhalf = int(N/2)
-        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
-        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
-        kernel = GPy.kern.RBF(1)
-        likelihood = GPy.likelihoods.Bernoulli()
-        inference_method=GPy.inference.latent_function_inference.expectation_propagation.EPDTC(ep_mode="nested")
-        mean_function=None
-
-        sm = GPy.core.SparseGP(X=X, Y=Y, Z=X[0:20,:], kernel=kernel, likelihood=likelihood, inference_method=inference_method, mean_function=mean_function, normalizer=True, name='sparse_gp_classification')
-        sm.optimize()
-        sm.save_model("temp_test_gp_with_data.json", compress=True, save_data=True)
-        sm.save_model("temp_test_gp_without_data.json", compress=True, save_data=False)
-        sm1_r = GPy.core.GP.load_model("temp_test_gp_with_data.json.zip")
-        sm2_r = GPy.core.GP.load_model("temp_test_gp_without_data.json.zip", (X,Y))
-        os.remove("temp_test_gp_with_data.json.zip")
-        os.remove("temp_test_gp_without_data.json.zip")
-        var = sm.predict(X)[0]
-        var1_r = sm1_r.predict(X)[0]
-        var2_r = sm2_r.predict(X)[0]
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var2_r).flatten())
-
-    def test_serialize_deserialize_GPRegressor(self):
-        np.random.seed(fixed_seed)
-        N = 50
-        N_new = 50
-        D = 1
-        X = np.random.uniform(-3., 3., (N, 1))
-        Y = np.sin(X) + np.random.randn(N, D) * 0.05
-        X_new = np.random.uniform(-3., 3., (N_new, 1))
-        k = GPy.kern.RBF(input_dim=1, lengthscale=10)
-        m = GPy.models.GPRegression(X,Y,k)
-        m.optimize()
-        m.save_model("temp_test_gp_regressor_with_data.json", compress=True, save_data=True)
-        m.save_model("temp_test_gp_regressor_without_data.json", compress=True, save_data=False)
-        m1_r = GPy.models.GPRegression.load_model("temp_test_gp_regressor_with_data.json.zip")
-        m2_r = GPy.models.GPRegression.load_model("temp_test_gp_regressor_without_data.json.zip", (X,Y))
-        os.remove("temp_test_gp_regressor_with_data.json.zip")
-        os.remove("temp_test_gp_regressor_without_data.json.zip")
-
-        Xp = np.random.uniform(size=(int(1e5),1))
-        Xp[:,0] = Xp[:,0]*15-5
-
-        _, var = m.predict(Xp)
-        _, var1_r = m1_r.predict(Xp)
-        _, var2_r = m2_r.predict(Xp)
-        np.testing.assert_array_equal(var.flatten(), var1_r.flatten())
-        np.testing.assert_array_equal(var.flatten(), var2_r.flatten())
-
-    def test_serialize_deserialize_GPClassification(self):
-        np.random.seed(fixed_seed)
-        N = 50
-        Nhalf = int(N/2)
-        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
-        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
-        kernel = GPy.kern.RBF(1)
-        m = GPy.models.GPClassification(X, Y, kernel=kernel)
-        m.optimize()
-        m.save_model("temp_test_gp_classifier_with_data.json", compress=True, save_data=True)
-        m.save_model("temp_test_gp_classifier_without_data.json", compress=True, save_data=False)
-        m1_r = GPy.models.GPClassification.load_model("temp_test_gp_classifier_with_data.json.zip")
-        self.assertTrue(type(m) == type(m1_r), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m1_r)))
-        m2_r = GPy.models.GPClassification.load_model("temp_test_gp_classifier_without_data.json.zip", (X,Y))
-        self.assertTrue(type(m) == type(m2_r), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m2_r)))
-        os.remove("temp_test_gp_classifier_with_data.json.zip")
-        os.remove("temp_test_gp_classifier_without_data.json.zip")
-
-        var = m.predict(X)[0]
-        var1_r = m1_r.predict(X)[0]
-        var2_r = m2_r.predict(X)[0]
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
-
-    def test_serialize_deserialize_SparseGPClassification(self):
-        np.random.seed(fixed_seed)
-        N = 50
-        Nhalf = int(N/2)
-        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
-        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
-        kernel = GPy.kern.RBF(1)
-        m = GPy.models.SparseGPClassification(X, Y, num_inducing=3, kernel=kernel)
-        m.optimize()
-        m.save_model("temp_test_sparse_gp_classifier_with_data.json", compress=True, save_data=True)
-        m.save_model("temp_test_sparse_gp_classifier_without_data.json", compress=True, save_data=False)
-        m1_r = GPy.models.SparseGPClassification.load_model("temp_test_sparse_gp_classifier_with_data.json.zip")
-        self.assertTrue(type(m) == type(m1_r), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m1_r)))
-        m2_r = GPy.models.SparseGPClassification.load_model("temp_test_sparse_gp_classifier_without_data.json.zip", (X,Y))
-        self.assertTrue(type(m) == type(m2_r), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m2_r)))
-        os.remove("temp_test_sparse_gp_classifier_with_data.json.zip")
-        os.remove("temp_test_sparse_gp_classifier_without_data.json.zip")
-
-        var = m.predict(X)[0]
-        var1_r = m1_r.predict(X)[0]
-        var2_r = m2_r.predict(X)[0]
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
-
-if __name__ == "__main__":
-    #import sys;sys.argv = ['', 'Test.test_parameter_index_operations']
-    unittest.main()
--- a/GPy/testing/state_space_main_tests.py
+++ b/GPy/testing/state_space_main_tests.py
--- a/GPy/testing/svgp_tests.py
+++ b/GPy/testing/svgp_tests.py
@ -1,54 +0,0 @@
-import numpy as np
-import scipy as sp
-import GPy
-
-class SVGP_nonconvex(np.testing.TestCase):
-    """
-    Inference in the SVGP with a student-T likelihood
-    """
-    def setUp(self):
-        X = np.linspace(0,10,100).reshape(-1,1)
-        Z = np.linspace(0,10,10).reshape(-1,1)
-        Y = np.sin(X) + np.random.randn(*X.shape)*0.1
-        Y[50] += 3
-
-        lik = GPy.likelihoods.StudentT(deg_free=2)
-        k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
-        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
-    def test_grad(self):
-        assert self.m.checkgrad(step=1e-4)
-
-class SVGP_classification(np.testing.TestCase):
-    """
-    Inference in the SVGP with a Bernoulli likelihood
-    """
-    def setUp(self):
-        X = np.linspace(0,10,100).reshape(-1,1)
-        Z = np.linspace(0,10,10).reshape(-1,1)
-        Y = np.where((np.sin(X) + np.random.randn(*X.shape)*0.1)>0, 1,0)
-
-        lik = GPy.likelihoods.Bernoulli()
-        k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
-        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
-    def test_grad(self):
-        assert self.m.checkgrad(step=1e-4)
-
-class SVGP_Poisson_with_meanfunction(np.testing.TestCase):
-    """
-    Inference in the SVGP with a Bernoulli likelihood
-    """
-    def setUp(self):
-        X = np.linspace(0,10,100).reshape(-1,1)
-        Z = np.linspace(0,10,10).reshape(-1,1)
-        latent_f = np.exp(0.1*X * 0.05*X**2)
-        Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1,1)
-
-        mf = GPy.mappings.Linear(1,1)
-
-        lik = GPy.likelihoods.Poisson()
-        k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
-        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
-    def test_grad(self):
-        assert self.m.checkgrad(step=1e-4)
-
-
--- a/GPy/testing/test_cython.py
+++ b/GPy/testing/test_cython.py
@ -0,0 +1,118 @@
+import numpy as np
+from GPy.util import choleskies
+import GPy
+import pytest
+
+from ..util.config import config
+
+# Availability flag for the Cython cholesky helpers: it takes the value of
+# the "working" option in the "cython" config section when the extension
+# module imports, and is False when the import raises ImportError.
+try:
+    from ..util import choleskies_cython
+
+    choleskies_cython_working = config.getboolean("cython", "working")
+except ImportError:
+    choleskies_cython_working = False
+
+# Same check for the Cython stationary-kernel extension.
+try:
+    from ..kern.src import stationary_cython
+
+    stationary_cython_working = config.getboolean("cython", "working")
+except ImportError:
+    stationary_cython_working = False
+
+"""
+These tests make sure that the pure python and cython codes work the same
+"""
+
+
+class CythonTestChols:
+    """Check that the pure-Python and Cython cholesky reshaping helpers agree.
+
+    NOTE: this class name does not match pytest's default ``Test*`` collection
+    pattern, so the module also exposes the class as ``TestCythonChols`` (the
+    alias right below the class body). The original name is kept so existing
+    references keep working.
+    """
+
+    def setup(self):
+        """Create the random flat input and the stack of identity matrices."""
+        self.flat = np.random.randn(45, 5)
+        self.triang = np.array([np.eye(20) for i in range(3)])
+
+    @pytest.mark.skipif(
+        not choleskies_cython_working,
+        # ``reason`` has to be a keyword argument: pytest treats additional
+        # positional arguments as further skip conditions, not as the reason.
+        reason="Cython cholesky module has not been built on this machine",
+    )
+    def test_flat_to_triang(self):
+        # Called explicitly, as the sibling test classes do, so the inputs
+        # exist even when pytest does not invoke ``setup`` on its own.
+        self.setup()
+        L1 = choleskies._flat_to_triang_pure(self.flat)
+        L2 = choleskies._flat_to_triang_cython(self.flat)
+        assert np.allclose(L1, L2), "Triang mismatch!"
+
+    @pytest.mark.skipif(
+        not choleskies_cython_working,
+        reason="Cython cholesky module has not been built on this machine",
+    )
+    def test_triang_to_flat(self):
+        self.setup()
+        A1 = choleskies._triang_to_flat_pure(self.triang)
+        A2 = choleskies._triang_to_flat_cython(self.triang)
+        assert np.allclose(A1, A2), "Flat mismatch!"
+
+
+# Alias that matches pytest's default ``Test*`` class-name pattern, so the
+# tests above are actually collected.
+TestCythonChols = CythonTestChols
+
+
+@pytest.mark.skipif(
+    not stationary_cython_working,
+    reason="Cython stationary module has not been built on this machine",
+)
+class TestStationary:
+    """Compare the Cython and pure-Python gradient code of an RBF kernel.
+
+    The skip marker is attached to the class, so it applies to every test
+    method defined here.
+    """
+
+    def setup(self):
+        """Build the kernel, random inputs and random upstream gradients."""
+        n_x, n_z, dim = 300, 20, 10
+        self.k = GPy.kern.RBF(dim)
+        self.X = np.random.randn(n_x, dim)
+        self.Z = np.random.randn(n_z, dim)
+        self.dKxx = np.random.randn(n_x, n_x)
+        self.dKzz = np.random.randn(n_z, n_z)
+        self.dKxz = np.random.randn(n_x, n_z)
+
+    def test_square_gradX(self):
+        """Gradients w.r.t. X must agree when only X is passed."""
+        self.setup()
+        from_cython = self.k._gradients_X_cython(self.dKxx, self.X)
+        from_python = self.k._gradients_X_pure(self.dKxx, self.X)
+        assert np.allclose(from_cython, from_python), "Gradient mismatch on square X!"
+
+    def test_rect_gradx(self):
+        """Gradients w.r.t. X must agree when both X and Z are passed."""
+        self.setup()
+        from_cython = self.k._gradients_X_cython(self.dKxz, self.X, self.Z)
+        from_python = self.k._gradients_X_pure(self.dKxz, self.X, self.Z)
+        assert np.allclose(from_cython, from_python), "Gradient mismatch on rect X!"
+
+    def test_square_lengthscales(self):
+        """Lengthscale gradients must agree when X is passed for both inputs."""
+        self.setup()
+        from_python = self.k._lengthscale_grads_pure(self.dKxx, self.X, self.X)
+        from_cython = self.k._lengthscale_grads_cython(self.dKxx, self.X, self.X)
+        assert np.allclose(
+            from_python, from_cython
+        ), "Gradient mismatch on square lengthscale!"
+
+    def test_rect_lengthscales(self):
+        """Lengthscale gradients must agree when X and Z are passed."""
+        self.setup()
+        from_python = self.k._lengthscale_grads_pure(self.dKxz, self.X, self.Z)
+        from_cython = self.k._lengthscale_grads_cython(self.dKxz, self.X, self.Z)
+        assert np.allclose(
+            from_python, from_cython
+        ), "Gradient mismatch on rect lengthscale!"
+
+
+@pytest.mark.skipif(
+    not choleskies_cython_working,
+    reason="Cython cholesky module has not been built on this machine",
+)
+class TestCholeskiesBackprop:
+    """Compare the pure-Python and Cython cholesky gradient back-propagation."""
+
+    def setup(self):
+        """Run ``jitchol`` on a random ``a a^T`` matrix and draw a random ``dL``."""
+        root = np.random.randn(10, 12)
+        gram = root.dot(root.T)
+        self.L = GPy.util.linalg.jitchol(gram)
+        self.dL = np.random.randn(10, 10)
+
+    def test_backprop(self):
+        """Both Cython variants must reproduce the pure-Python result."""
+        self.setup()
+        reference = choleskies._backprop_gradient_pure(self.dL, self.L)
+        cython_grad = choleskies_cython.backprop_gradient(self.dL, self.L)
+        cython_par_c_grad = choleskies_cython.backprop_gradient_par_c(self.dL, self.L)
+        for candidate in (cython_grad, cython_par_c_grad):
+            assert np.allclose(reference, candidate), "Gradient mismatch!"
--- a/GPy/testing/ep_likelihood_tests.py
+++ b/GPy/testing/ep_likelihood_tests.py
@ -1,17 +1,19 @@
-
+import pytest
 import numpy as np
-import unittest
 import GPy
-from GPy.models import GradientChecker
+

 fixed_seed = 10
-from nose.tools import with_setup, nottest
+
+
+def rmse(Y, Ystar):
+    """Return the root-mean-square error between ``Y`` and ``Ystar``."""
+    squared_error = (Y - Ystar) ** 2
+    return np.sqrt(np.mean(squared_error))


 # this file will contain some high level tests, this is not unit testing, but will give us a higher level estimate
 # if things are going well under the hood.
-class TestObservationModels(unittest.TestCase):
-    def setUp(self):
+class TestObservationModels:
+    def setup(self):
        np.random.seed(fixed_seed)
        self.N = 100
        self.D = 2
@ -22,7 +24,7 @@ class TestObservationModels(unittest.TestCase):
        self.Y = (np.sin(self.X[:, 0] * 2 * np.pi) + noise)[:, None]
        self.num_points = self.X.shape[0]
        self.f = np.random.rand(self.N, 1)
-        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
+        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=int)[:, None]
        # self.binary_Y[self.binary_Y == 0.0] = -1.0
        self.positive_Y = np.exp(self.Y.copy())

@ -31,45 +33,72 @@ class TestObservationModels(unittest.TestCase):
        self.Y_noisy[75] += 1.3

        self.init_var = 0.15
-        self.deg_free = 4.
+        self.deg_free = 4.0
        censored = np.zeros_like(self.Y)
        random_inds = np.random.choice(self.N, int(self.N / 2), replace=True)
        censored[random_inds] = 1
        self.Y_metadata = dict()
-        self.Y_metadata['censored'] = censored
+        self.Y_metadata["censored"] = censored
        self.kernel1 = GPy.kern.RBF(self.X.shape[1]) + GPy.kern.White(self.X.shape[1])

-    def tearDown(self):
+    def tear_down(self):
        self.Y = None
        self.X = None
-        self.binary_Y =None
+        self.binary_Y = None
        self.positive_Y = None
        self.kernel1 = None

-    @with_setup(setUp, tearDown)
-    def testEPClassification(self):
+    def test_epccassification(self):
+        self.setup()
+
        bernoulli = GPy.likelihoods.Bernoulli()
        laplace_inf = GPy.inference.latent_function_inference.Laplace()

-        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
-        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
-        ep_inf_fractional = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.9)
+        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode="alternated")
+        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode="nested")
+        ep_inf_fractional = GPy.inference.latent_function_inference.EP(
+            ep_mode="nested", eta=0.9
+        )

-        m1 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=laplace_inf)
+        m1 = GPy.core.GP(
+            self.X,
+            self.binary_Y.copy(),
+            kernel=self.kernel1.copy(),
+            likelihood=bernoulli.copy(),
+            inference_method=laplace_inf,
+        )
        m1.randomize()

-        m2 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_alt)
+        m2 = GPy.core.GP(
+            self.X,
+            self.binary_Y.copy(),
+            kernel=self.kernel1.copy(),
+            likelihood=bernoulli.copy(),
+            inference_method=ep_inf_alt,
+        )
        m2.randomize()

-        m3 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_nested)
+        m3 = GPy.core.GP(
+            self.X,
+            self.binary_Y.copy(),
+            kernel=self.kernel1.copy(),
+            likelihood=bernoulli.copy(),
+            inference_method=ep_inf_nested,
+        )
        m3.randomize()
        #
-        m4 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_fractional)
+        m4 = GPy.core.GP(
+            self.X,
+            self.binary_Y.copy(),
+            kernel=self.kernel1.copy(),
+            likelihood=bernoulli.copy(),
+            inference_method=ep_inf_fractional,
+        )
        m4.randomize()

-        optimizer = 'bfgs'
+        optimizer = "bfgs"

-        #do gradcheck here ...
+        # do gradcheck here ...
        # self.assertTrue(m1.checkgrad())
        # self.assertTrue(m2.checkgrad())
        # self.assertTrue(m3.checkgrad())
@ -86,35 +115,53 @@ class TestObservationModels(unittest.TestCase):
        probs_mean_ep_nested, probs_var_ep_nested = m3.predict(self.X)

        # for simple single dimension data , marginal likelihood for laplace and EP approximations should not be so far apart.
-        self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(),delta=1)
-        self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), delta=1)
-        self.assertAlmostEqual(m1.log_likelihood(), m4.log_likelihood(), delta=5)
+        # TODO: these were assertAlmostEqual(..., delta=d). np.allclose(..., atol=d) keeps its default
+        #     rtol=1e-5, so it accepts |a - b| <= d + 1e-5 * |b| (slightly looser than the old delta).
+        assert np.allclose(m1.log_likelihood(), m2.log_likelihood(), atol=1.0)
+        assert np.allclose(m1.log_likelihood(), m3.log_likelihood(), atol=1)
+        assert np.allclose(m1.log_likelihood(), m4.log_likelihood(), atol=5.0)

        GPy.util.classification.conf_matrix(probs_mean_lap, self.binary_Y)
        GPy.util.classification.conf_matrix(probs_mean_ep_alt, self.binary_Y)
        GPy.util.classification.conf_matrix(probs_mean_ep_nested, self.binary_Y)

-    @nottest
-    def rmse(self, Y, Ystar):
-        return np.sqrt(np.mean((Y - Ystar) ** 2))
+    @pytest.mark.skip(
+        "Fails as a consequence of fixing the DSYR function. Needs to be reviewed!"
+    )
+    def test_ep_with_studentt(self):
+        self.setup()
+        self.tear_down()

-    @with_setup(setUp, tearDown)
-    @unittest.skip("Fails as a consequence of fixing the DSYR function. Needs to be reviewed!")
-    def test_EP_with_StudentT(self):
-        studentT = GPy.likelihoods.StudentT(deg_free=self.deg_free, sigma2=self.init_var)
+        studentT = GPy.likelihoods.StudentT(
+            deg_free=self.deg_free, sigma2=self.init_var
+        )
        laplace_inf = GPy.inference.latent_function_inference.Laplace()

-        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
-        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
-        ep_inf_frac = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.7)
+        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode="alternated")
+        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode="nested")
+        ep_inf_frac = GPy.inference.latent_function_inference.EP(
+            ep_mode="nested", eta=0.7
+        )

-        m1 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=laplace_inf)
+        m1 = GPy.core.GP(
+            self.X.copy(),
+            self.Y_noisy.copy(),
+            kernel=self.kernel1.copy(),
+            likelihood=studentT.copy(),
+            inference_method=laplace_inf,
+        )
        # optimize
-        m1['.*white'].constrain_fixed(1e-5)
+        m1[".*white"].constrain_fixed(1e-5)
        m1.randomize()

-        m2 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=ep_inf_alt)
-        m2['.*white'].constrain_fixed(1e-5)
+        m2 = GPy.core.GP(
+            self.X.copy(),
+            self.Y_noisy.copy(),
+            kernel=self.kernel1.copy(),
+            likelihood=studentT.copy(),
+            inference_method=ep_inf_alt,
+        )
+        m2[".*white"].constrain_fixed(1e-5)
        # m2.constrain_bounded('.*t_scale2', 0.001, 10)
        m2.randomize()

@ -123,12 +170,14 @@ class TestObservationModels(unittest.TestCase):
        # # m3.constrain_bounded('.*t_scale2', 0.001, 10)
        # m3.randomize()

-        optimizer='bfgs'
-        m1.optimize(optimizer=optimizer,max_iters=400)
+        optimizer = "bfgs"
+        m1.optimize(optimizer=optimizer, max_iters=400)
        m2.optimize(optimizer=optimizer, max_iters=400)
        # m3.optimize(optimizer=optimizer, max_iters=500)

-        self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(),delta=200)
+        # TODO: this was assertAlmostEqual(..., delta=200). np.allclose(..., atol=200) keeps its default
+        #    rtol=1e-5, so it accepts |a - b| <= 200 + 1e-5 * |b| (slightly looser than the old delta).
+        assert np.allclose(m1.log_likelihood(), m2.log_likelihood(), atol=200.0)

        # self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), 3)

@ -140,9 +189,7 @@ class TestObservationModels(unittest.TestCase):
        # rmse_nested = self.rmse(preds_mean_nested, self.Y_noisy)

        if rmse_alt > rmse_lap:
-            self.assertAlmostEqual(rmse_lap, rmse_alt, delta=1.5)
+            # TODO: this was assertAlmostEqual(..., delta=1.5). np.allclose(..., atol=1.5) keeps its default
+            #   rtol=1e-5, so it accepts |a - b| <= 1.5 + 1e-5 * |b| (slightly looser than the old delta).
+            assert np.allclose(rmse_lap, rmse_alt, atol=1.5)
        # m3.optimize(optimizer=optimizer, max_iters=500)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/GPy/testing/gp_tests.py
+++ b/GPy/testing/gp_tests.py
@ -1,36 +1,36 @@
-'''
+"""
 Created on 4 Sep 2015

@author: maxz
-'''
-import unittest
-import numpy as np, GPy
+"""
+import numpy as np
+import GPy
 from GPy.core.parameterization.variational import NormalPosterior

-class Test(unittest.TestCase):

-
-    def setUp(self):
+class TestGP:
+    def setup(self):
        np.random.seed(12345)
        self.N = 20
        self.N_new = 50
        self.D = 1
-        self.X = np.random.uniform(-3., 3., (self.N, 1))
+        self.X = np.random.uniform(-3.0, 3.0, (self.N, 1))
        self.Y = np.sin(self.X) + np.random.randn(self.N, self.D) * 0.05
-        self.X_new = np.random.uniform(-3., 3., (self.N_new, 1))
-
+        self.X_new = np.random.uniform(-3.0, 3.0, (self.N_new, 1))

    def test_setxy_bgplvm(self):
+        self.setup()
+
        k = GPy.kern.RBF(1)
        m = GPy.models.BayesianGPLVM(self.Y, 1, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X
        Xnew = NormalPosterior(m.X.mean[:10].copy(), m.X.variance[:10].copy())
        m.set_XY(Xnew, m.Y[:10].copy())
-        assert(m.checkgrad())
+        assert m.checkgrad()

-        assert(m.num_data == m.X.shape[0])
-        assert(m.input_dim == m.X.shape[1])
+        assert m.num_data == m.X.shape[0]
+        assert m.input_dim == m.X.shape[1]

        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)
@ -38,16 +38,18 @@ class Test(unittest.TestCase):
        np.testing.assert_allclose(var, var2)

    def test_setxy_gplvm(self):
+        self.setup()
+
        k = GPy.kern.RBF(1)
        m = GPy.models.GPLVM(self.Y, 1, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X.copy()
        Xnew = X[:10].copy()
        m.set_XY(Xnew, m.Y[:10].copy())
-        assert(m.checkgrad())
+        assert m.checkgrad()

-        assert(m.num_data == m.X.shape[0])
-        assert(m.input_dim == m.X.shape[1])
+        assert m.num_data == m.X.shape[0]
+        assert m.input_dim == m.X.shape[1]

        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)
@ -55,15 +57,17 @@ class Test(unittest.TestCase):
        np.testing.assert_allclose(var, var2)

    def test_setxy_gp(self):
+        self.setup()
+
        k = GPy.kern.RBF(1)
        m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X.copy()
        m.set_XY(m.X[:10], m.Y[:10])
-        assert(m.checkgrad())
+        assert m.checkgrad()

-        assert(m.num_data == m.X.shape[0])
-        assert(m.input_dim == m.X.shape[1])
+        assert m.num_data == m.X.shape[0]
+        assert m.input_dim == m.X.shape[1]

        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)
@ -73,39 +77,45 @@ class Test(unittest.TestCase):
    def test_mean_function(self):
        from GPy.core.parameterization.param import Param
        from GPy.core.mapping import Mapping
+
+        self.setup()
+
        class Parabola(Mapping):
-            def __init__(self, variance, degree=2, name='parabola'):
+            def __init__(self, variance, degree=2, name="parabola"):
                super(Parabola, self).__init__(1, 1, name)
-                self.variance = Param('variance', np.ones(degree+1) * variance)
+                self.variance = Param("variance", np.ones(degree + 1) * variance)
                self.degree = degree
                self.link_parameter(self.variance)

            def f(self, X):
                p = self.variance[0] * np.ones(X.shape)
-                for i in range(1, self.degree+1):
-                    p += self.variance[i] * X**(i)
+                for i in range(1, self.degree + 1):
+                    p += self.variance[i] * X ** (i)
                return p

            def gradients_X(self, dL_dF, X):
                grad = np.zeros(X.shape)
-                for i in range(1, self.degree+1):
-                    grad += (i) * self.variance[i] * X**(i-1)
+                for i in range(1, self.degree + 1):
+                    grad += (i) * self.variance[i] * X ** (i - 1)
                return grad

            def update_gradients(self, dL_dF, X):
-                for i in range(self.degree+1):
-                    self.variance.gradient[i] = (dL_dF * X**(i)).sum(0)
+                for i in range(self.degree + 1):
+                    self.variance.gradient[i] = (dL_dF * X ** (i)).sum(0)
+
        X = np.linspace(-2, 2, 100)[:, None]
        k = GPy.kern.RBF(1)
        k.randomize()
-        p = Parabola(.3)
+        p = Parabola(0.3)
        p.randomize()
-        Y = p.f(X) + np.random.multivariate_normal(np.zeros(X.shape[0]), k.K(X)+np.eye(X.shape[0])*1e-8)[:,None] + np.random.normal(0, .1, (X.shape[0], 1))
+        Y = (
+            p.f(X)
+            + np.random.multivariate_normal(
+                np.zeros(X.shape[0]), k.K(X) + np.eye(X.shape[0]) * 1e-8
+            )[:, None]
+            + np.random.normal(0, 0.1, (X.shape[0], 1))
+        )
        m = GPy.models.GPRegression(X, Y, mean_function=p)
        m.randomize()
-        assert(m.checkgrad())
+        assert m.checkgrad()
        _ = m.predict(m.X)
-
-if __name__ == "__main__":
-    #import sys;sys.argv = ['', 'Test.testName']
-    unittest.main()
--- a/GPy/testing/test_gpy_kernels_state_space.py
+++ b/GPy/testing/test_gpy_kernels_state_space.py
--- a/GPy/testing/grid_tests.py
+++ b/GPy/testing/grid_tests.py
@ -3,17 +3,28 @@

 # Kurt Cutajar

-import unittest
 import numpy as np
 import GPy

-class GridModelTest(unittest.TestCase):
-    def setUp(self):
+
+class TestGridModel:
+    def setup(self):
        ######################################
        # # 3 dimensional example

        # sample inputs and outputs
-        self.X = np.array([[0,0,0],[0,0,1],[0,1,0],[0,1,1],[1,0,0],[1,0,1],[1,1,0],[1,1,1]])
+        self.X = np.array(
+            [
+                [0, 0, 0],
+                [0, 0, 1],
+                [0, 1, 0],
+                [0, 1, 1],
+                [1, 0, 0],
+                [1, 0, 1],
+                [1, 1, 0],
+                [1, 1, 1],
+            ]
+        )
        self.Y = np.random.randn(8, 1) * 100
        self.dim = self.X.shape[1]

@ -33,10 +44,15 @@ class GridModelTest(unittest.TestCase):
        kernel2 = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
        m2 = GPy.models.GPRegression(self.X, self.Y, kernel2)

-        np.testing.assert_almost_equal(kernel.variance.gradient, kernel2.variance.gradient)
-        np.testing.assert_almost_equal(kernel.lengthscale.gradient, kernel2.lengthscale.gradient)
-        np.testing.assert_almost_equal(m.likelihood.variance.gradient, m2.likelihood.variance.gradient)
-
+        np.testing.assert_almost_equal(
+            kernel.variance.gradient, kernel2.variance.gradient
+        )
+        np.testing.assert_almost_equal(
+            kernel.lengthscale.gradient, kernel2.lengthscale.gradient
+        )
+        np.testing.assert_almost_equal(
+            m.likelihood.variance.gradient, m2.likelihood.variance.gradient
+        )

    def test_prediction_match(self):
        kernel = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
@ -45,7 +61,6 @@ class GridModelTest(unittest.TestCase):
        kernel2 = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
        m2 = GPy.models.GPRegression(self.X, self.Y, kernel2)

-        test = np.array([[0,0,2],[-1,3,-4]])
+        test = np.array([[0, 0, 2], [-1, 3, -4]])

        np.testing.assert_almost_equal(m.predict(test), m2.predict(test))
-
--- a/GPy/testing/test_inference.py
+++ b/GPy/testing/test_inference.py
@ -0,0 +1,275 @@
+# Copyright (c) 2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+"""
+The test cases for various inference algorithms
+"""
+
+import numpy as np
+import GPy
+
+# np.seterr(invalid='raise')
+
+
+class TestInferenceXCase:
+    """Round-trip checks for ``infer_newX`` on (Bayesian) GPLVM models.
+
+    Every test optimizes a model on the same seeded data, calls ``infer_newX``
+    on the model's own outputs and compares the returned latents with the
+    model's latents to two decimal places.
+    """
+
+    def get_data(self):
+        """Return element ``[0][0]`` of the seeded ``_simulate_matern`` example data."""
+        np.random.seed(1111)
+        simulated = GPy.examples.dimensionality_reduction._simulate_matern(
+            5, 1, 1, 10, 3, False
+        )
+        return simulated[0][0]
+
+    def test_inferenceX_BGPLVM_Linear(self):
+        """Bayesian GPLVM with an ARD linear kernel: compare latent means and variances."""
+        outputs = self.get_data()
+        kernel = GPy.kern.Linear(3, ARD=True)
+        model = GPy.models.BayesianGPLVM(outputs, 3, kernel=kernel)
+        model.optimize()
+        _, inferred = model.infer_newX(model.Y, optimize=True)
+        np.testing.assert_array_almost_equal(
+            model.X.mean, inferred.X.mean, decimal=2
+        )
+        np.testing.assert_array_almost_equal(
+            model.X.variance, inferred.X.variance, decimal=2
+        )
+
+    def test_inferenceX_BGPLVM_RBF(self):
+        """Bayesian GPLVM with an ARD RBF kernel: compare latent means and variances."""
+        import warnings
+
+        outputs = self.get_data()
+        kernel = GPy.kern.RBF(3, ARD=True)
+        model = GPy.models.BayesianGPLVM(outputs, 3, kernel=kernel)
+        # Warnings raised while optimizing are silenced for this test only.
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            model.optimize()
+        _, inferred = model.infer_newX(model.Y, optimize=True)
+        np.testing.assert_array_almost_equal(
+            model.X.mean, inferred.X.mean, decimal=2
+        )
+        np.testing.assert_array_almost_equal(
+            model.X.variance, inferred.X.variance, decimal=2
+        )
+
+    def test_inferenceX_GPLVM_Linear(self):
+        """GPLVM with an ARD linear kernel: compare the latent inputs."""
+        outputs = self.get_data()
+        kernel = GPy.kern.Linear(3, ARD=True)
+        model = GPy.models.GPLVM(outputs, 3, kernel=kernel)
+        model.optimize()
+        _, inferred = model.infer_newX(model.Y, optimize=True)
+        np.testing.assert_array_almost_equal(model.X, inferred.X, decimal=2)
+
+    def test_inferenceX_GPLVM_RBF(self):
+        """GPLVM with an ARD RBF kernel: compare the latent inputs."""
+        outputs = self.get_data()
+        kernel = GPy.kern.RBF(3, ARD=True)
+        model = GPy.models.GPLVM(outputs, 3, kernel=kernel)
+        model.optimize()
+        _, inferred = model.infer_newX(model.Y, optimize=True)
+        np.testing.assert_array_almost_equal(model.X, inferred.X, decimal=2)
+
+
+class TestInferenceGPEP:
+    """Compare EP's ``_inference`` step with the parent class's ``inference``.
+
+    Both tests run ``expectation_propagation`` directly, then feed the resulting
+    site approximation (``ga_approx``) to ``EP._inference`` and to
+    ``super(EP, ...).inference`` and compare the two sets of results.
+    """
+
+    def get_data(self):
+        """Return seeded ``(X, Y)`` with 200 inputs and Bernoulli-sampled outputs."""
+        np.random.seed(1)
+        k = GPy.kern.RBF(1, variance=7.0, lengthscale=0.2)
+        X = np.random.rand(200, 1)
+        # Latent function values drawn from the kernel covariance plus 1e-5 on the diagonal
+        f = np.random.multivariate_normal(
+            np.zeros(200), k.K(X) + 1e-5 * np.eye(X.shape[0])
+        )
+        lik = GPy.likelihoods.Bernoulli()
+        # NOTE(review): _p is computed but never used below
+        _p = lik.gp_link.transf(f)  # squash the latent function
+        Y = lik.samples(f).reshape(-1, 1)
+        return X, Y
+
+    def get_noisy_data(self):
+        """Return seeded ``(X, Y, Y_extra_noisy)`` for a noisy sine curve.
+
+        Also stores ``self.real_std`` and ``self.f`` on the instance.
+        ``Y_extra_noisy`` is a copy of ``Y`` with the entry at index 50 raised by 4.
+        """
+        np.random.seed(1)
+        X = np.random.rand(100, 1)
+        self.real_std = 0.1
+        noise = np.random.randn(*X[:, 0].shape) * self.real_std
+        Y = (np.sin(X[:, 0] * 2 * np.pi) + noise)[:, None]
+        self.f = np.random.rand(X.shape[0], 1)
+        Y_extra_noisy = Y.copy()
+        # Single outlier
+        Y_extra_noisy[50] += 4.0
+        # Y_extra_noisy[80:83] -= 2.
+        return X, Y, Y_extra_noisy
+
+    def test_inference_EP(self):
+        """EP with a Bernoulli likelihood: ``_inference`` versus the parent ``inference``."""
+        from paramz import ObsAr
+
+        X, Y = self.get_data()
+        lik = GPy.likelihoods.Bernoulli()
+        k = GPy.kern.RBF(1, variance=7.0, lengthscale=0.2)
+        inf = GPy.inference.latent_function_inference.expectation_propagation.EP(
+            max_iters=30, delta=0.5
+        )
+        self.model = GPy.core.GP(
+            X=X, Y=Y, kernel=k, inference_method=inf, likelihood=lik
+        )
+        K = self.model.kern.K(X)
+        mean_prior = np.zeros(K.shape[0])
+        (
+            post_params,
+            ga_approx,
+            cav_params,
+            log_Z_tilde,
+        ) = self.model.inference_method.expectation_propagation(
+            mean_prior, K, ObsAr(Y), lik, None
+        )
+
+        # v / tau of the site approximation, passed as targets to the parent inference below
+        mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
+        p, m, d = self.model.inference_method._inference(
+            Y,
+            mean_prior,
+            K,
+            ga_approx,
+            cav_params,
+            lik,
+            Y_metadata=None,
+            Z_tilde=log_Z_tilde,
+        )
+        # Parent-class inference with mu_tilde as targets and 1 / tau as the variance
+        p0, m0, d0 = super(
+            GPy.inference.latent_function_inference.expectation_propagation.EP, inf
+        ).inference(
+            k,
+            X,
+            lik,
+            mu_tilde[:, None],
+            mean_function=None,
+            variance=1.0 / ga_approx.tau,
+            K=K,
+            Z_tilde=log_Z_tilde
+            + np.sum(
+                -0.5 * np.log(ga_approx.tau)
+                + 0.5 * (ga_approx.v * ga_approx.v * 1.0 / ga_approx.tau)
+            ),
+        )
+
+        # NOTE(review): the differences are summed with their signs and compared
+        # against 1e6 (one million), so this passes for almost any finite result.
+        # A tolerance such as 1e-6 on absolute differences may have been intended;
+        # confirm before tightening.
+        assert (
+            np.sum(
+                np.array(
+                    [
+                        m - m0,
+                        np.sum(d["dL_dK"] - d0["dL_dK"]),
+                        np.sum(d["dL_dthetaL"] - d0["dL_dthetaL"]),
+                        np.sum(d["dL_dm"] - d0["dL_dm"]),
+                        np.sum(p._woodbury_vector - p0._woodbury_vector),
+                        np.sum(p.woodbury_inv - p0.woodbury_inv),
+                    ]
+                )
+            )
+            < 1e6
+        )
+
+    # NOTE: adding a test like above for parameterized likelihood- the above test is
+    # only for probit likelihood which does not have any tunable hyperparameter which is why
+    # the term in dictionary of gradients: dL_dthetaL will always be zero. So here we repeat tests for
+    # student-t likelihood and heteroscedastic gaussian noise case. This test simply checks if the posterior
+    # and gradients of log marginal are roughly the same for inference through EP and exact gaussian inference using
+    # the gaussian approximation for the individual likelihood site terms. For probit likelihood, it is possible to
+    # calculate moments analytically, but for other likelihoods, we will need to use numerical quadrature techniques,
+    # and it is possible that any error might creep up because of quadrature implementation.
+    def test_inference_EP_non_classification(self):
+        """EP with a Student-t likelihood: ``_inference`` versus the parent ``inference``."""
+        from paramz import ObsAr
+
+        X, _Y, Y_extra_noisy = self.get_noisy_data()
+        deg_freedom = 5.0
+        init_noise_var = 0.08
+        lik_studentT = GPy.likelihoods.StudentT(
+            deg_free=deg_freedom, sigma2=init_noise_var
+        )
+        # like_gaussian_noise = GPy.likelihoods.MixedNoise()
+        k = GPy.kern.RBF(1, variance=2.0, lengthscale=1.1)
+        ep_inf_alt = GPy.inference.latent_function_inference.expectation_propagation.EP(
+            max_iters=4, delta=0.5
+        )
+        # ep_inf_nested = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode='nested', max_iters=100, delta=0.5)
+        m = GPy.core.GP(
+            X=X,
+            Y=Y_extra_noisy,
+            kernel=k,
+            likelihood=lik_studentT,
+            inference_method=ep_inf_alt,
+        )
+        K = m.kern.K(X)
+        mean_prior = np.zeros(K.shape[0])
+        (
+            post_params,
+            ga_approx,
+            cav_params,
+            log_Z_tilde,
+        ) = m.inference_method.expectation_propagation(
+            mean_prior, K, ObsAr(Y_extra_noisy), lik_studentT, None
+        )
+
+        # v / tau of the site approximation, passed as targets to the parent inference below
+        mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
+        # NOTE: `m` is rebound here from the GP model to the second value returned by _inference
+        p, m, d = m.inference_method._inference(
+            Y_extra_noisy,
+            mean_prior,
+            K,
+            ga_approx,
+            cav_params,
+            lik_studentT,
+            Y_metadata=None,
+            Z_tilde=log_Z_tilde,
+        )
+        # Parent-class inference with mu_tilde as targets and 1 / tau as the variance
+        p0, m0, d0 = super(
+            GPy.inference.latent_function_inference.expectation_propagation.EP,
+            ep_inf_alt,
+        ).inference(
+            k,
+            X,
+            lik_studentT,
+            mu_tilde[:, None],
+            mean_function=None,
+            variance=1.0 / ga_approx.tau,
+            K=K,
+            Z_tilde=log_Z_tilde
+            + np.sum(
+                -0.5 * np.log(ga_approx.tau)
+                + 0.5 * (ga_approx.v * ga_approx.v * 1.0 / ga_approx.tau)
+            ),
+        )
+
+        # NOTE(review): the differences are summed with their signs and compared
+        # against 1e6 (one million), so this passes for almost any finite result.
+        # A tolerance such as 1e-6 on absolute differences may have been intended;
+        # confirm before tightening.
+        assert (
+            np.sum(
+                np.array(
+                    [
+                        m - m0,
+                        np.sum(d["dL_dK"] - d0["dL_dK"]),
+                        np.sum(d["dL_dthetaL"] - d0["dL_dthetaL"]),
+                        np.sum(d["dL_dm"] - d0["dL_dm"]),
+                        np.sum(p._woodbury_vector - p0._woodbury_vector),
+                        np.sum(p.woodbury_inv - p0.woodbury_inv),
+                    ]
+                )
+            )
+            < 1e6
+        )
+
+
+class TestVarDtc:
+    """Gradient check for sparse GP regression with a mean function."""
+
+    def test_var_dtc_inference_with_mean(self):
+        """Check dL_dm in var_dtc is calculated correctly"""
+        np.random.seed(1)
+        inputs = np.linspace(0.0, 2 * np.pi, 100)[:, np.newaxis]
+        noise = np.random.randn(*inputs.shape) * 0.3
+        targets = -np.cos(inputs) + noise + 1
+        linear_mean = GPy.mappings.Linear(input_dim=1, output_dim=1)
+        model = GPy.models.SparseGPRegression(
+            inputs, targets, mean_function=linear_mean
+        )
+        assert model.checkgrad()
+
+
+class TestHMCSampler:
+    """Smoke test: the HMC sampler runs on a small GP regression model."""
+
+    def test_sampling(self):
+        """Draw three HMC samples from a GP regression model with Gamma priors."""
+        np.random.seed(1)
+        inputs = np.linspace(0.0, 2 * np.pi, 100)[:, np.newaxis]
+        noise = np.random.randn(*inputs.shape) * 0.3
+        targets = -np.cos(inputs) + noise + 1
+
+        model = GPy.models.GPRegression(inputs, targets)
+        # Same Gamma prior on every hyperparameter, set in the original order
+        for param in (
+            model.kern.lengthscale,
+            model.kern.variance,
+            model.likelihood.variance,
+        ):
+            param.set_prior(GPy.priors.Gamma.from_EV(1.0, 10.0))
+
+        sampler = GPy.inference.mcmc.HMC(model, stepsize=1e-2)
+        # Only checks that sampling completes; the samples are not inspected
+        sampler.sample(num_samples=3)
+
+
+class TestMCMCSampler:
+    """Smoke test: the Metropolis-Hastings sampler runs on a small GP regression model."""
+
+    def test_sampling(self):
+        """Run Metropolis-Hastings (Ntotal=100, Nburn=10) on a GP with Gamma priors."""
+        np.random.seed(1)
+        inputs = np.linspace(0.0, 2 * np.pi, 100)[:, np.newaxis]
+        noise = np.random.randn(*inputs.shape) * 0.3
+        targets = -np.cos(inputs) + noise + 1
+
+        model = GPy.models.GPRegression(inputs, targets)
+        # Same Gamma prior on every hyperparameter, set in the original order
+        for param in (
+            model.kern.lengthscale,
+            model.kern.variance,
+            model.likelihood.variance,
+        ):
+            param.set_prior(GPy.priors.Gamma.from_EV(1.0, 10.0))
+
+        sampler = GPy.inference.mcmc.Metropolis_Hastings(model)
+        sampler.sample(Ntotal=100, Nburn=10)
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
--- a/GPy/testing/test_linalg.py
+++ b/GPy/testing/test_linalg.py
@ -1,18 +1,19 @@
 import numpy as np
 import scipy as sp
-from ..util.linalg import jitchol,trace_dot, ijk_jlk_to_il, ijk_ljk_to_ilk
+from ..util.linalg import jitchol, trace_dot, ijk_jlk_to_il, ijk_ljk_to_ilk

-class LinalgTests(np.testing.TestCase):
-    def setUp(self):
-        #Create PD matrix
-        A = np.random.randn(20,100)
+
+class TestLinalg:
+    def setup(self):
+        # Create PD matrix
+        A = np.random.randn(20, 100)
        self.A = A.dot(A.T)
-        #compute Eigdecomp
+        # compute Eigdecomp
        vals, vectors = np.linalg.eig(self.A)
-        #Set smallest eigenval to be negative with 5 rounds worth of jitter
+        # Set smallest eigenval to be negative with 5 rounds worth of jitter
        vals[vals.argmin()] = 0
-        default_jitter = 1e-6*np.mean(vals)
-        vals[vals.argmin()] = -default_jitter*(10**3.5)
+        default_jitter = 1e-6 * np.mean(vals)
+        vals[vals.argmin()] = -default_jitter * (10**3.5)
        self.A_corrupt = (vectors * vals).dot(vectors.T)

    def test_jitchol_success(self):
@ -20,12 +21,16 @@ class LinalgTests(np.testing.TestCase):
        Expect 5 rounds of jitter to be added and for the recovered matrix to be
        identical to the corrupted matrix apart from the jitter added to the diagonal
        """
+        self.setup()
        L = jitchol(self.A_corrupt, maxtries=5)
        A_new = L.dot(L.T)
        diff = A_new - self.A_corrupt
-        np.testing.assert_allclose(diff, np.eye(A_new.shape[0])*np.diag(diff).mean(), atol=1e-13)
+        np.testing.assert_allclose(
+            diff, np.eye(A_new.shape[0]) * np.diag(diff).mean(), atol=1e-13
+        )

    def test_jitchol_failure(self):
+        self.setup()
        try:
            """
            Expecting an exception to be thrown as we expect it to require
@ -37,24 +42,27 @@ class LinalgTests(np.testing.TestCase):
            return True

    def test_trace_dot(self):
+        self.setup()
        N = 5
-        A = np.random.rand(N,N)
-        B = np.random.rand(N,N)
+        A = np.random.rand(N, N)
+        B = np.random.rand(N, N)
        trace = np.trace(A.dot(B))
-        test_trace = trace_dot(A,B)
-        np.testing.assert_allclose(trace,test_trace,atol=1e-13)
+        test_trace = trace_dot(A, B)
+        np.testing.assert_allclose(trace, test_trace, atol=1e-13)

    def test_einsum_ij_jlk_to_ilk(self):
+        self.setup()
        A = np.random.randn(15, 150, 5)
        B = np.random.randn(150, 50, 5)
-        pure = np.einsum('ijk,jlk->il', A, B)
-        quick = ijk_jlk_to_il(A,B)
+        pure = np.einsum("ijk,jlk->il", A, B)
+        quick = ijk_jlk_to_il(A, B)
        np.testing.assert_allclose(pure, quick)

    def test_einsum_ijk_ljk_to_ilk(self):
+        self.setup()
        A = np.random.randn(150, 20, 5)
        B = np.random.randn(150, 20, 5)
-        #B = A.copy()
-        pure = np.einsum('ijk,ljk->ilk', A, B)
-        quick = ijk_ljk_to_ilk(A,B)
+        # B = A.copy()
+        pure = np.einsum("ijk,ljk->ilk", A, B)
+        quick = ijk_ljk_to_ilk(A, B)
        np.testing.assert_allclose(pure, quick)
--- a/GPy/testing/test_link_function.py
+++ b/GPy/testing/test_link_function.py
@ -0,0 +1,196 @@
+import numpy as np
+import scipy
+from scipy.special import cbrt
+from GPy.models import GradientChecker
+import random
+
+_lim_val = np.finfo(np.float64).max
+_lim_val_exp = np.log(_lim_val)
+_lim_val_square = np.sqrt(_lim_val)
+_lim_val_cube = cbrt(_lim_val)
+from GPy.likelihoods.link_functions import (
+    Identity,
+    Probit,
+    Cloglog,
+    Log,
+    Log_ex_1,
+    Reciprocal,
+    Heaviside,
+    ScaledProbit,
+)
+
+
+class TestLinkFunction:
+    """Gradient and overflow checks for the GPy link functions."""
+
+    def setup(self):
+        """Create the probe points used by the checks.
+
+        Every test calls this explicitly as its first statement.
+        """
+        self.small_f = np.array([[-1e-4]])
+        self.zero_f = np.array([[1e-4]])
+        self.mid_f = np.array([[5.0]])
+        self.large_f = np.array([[1e4]])
+        self.f_lower_lim = np.array(-np.inf)
+        self.f_upper_lim = np.array(np.inf)
+
+    @staticmethod
+    def _check_derivative_chain(link_func, x0):
+        """Assert at ``x0`` that each of the three derivatives matches its predecessor."""
+        derivative_pairs = (
+            (link_func.transf, link_func.dtransf_df),
+            (link_func.dtransf_df, link_func.d2transf_df2),
+            (link_func.d2transf_df2, link_func.d3transf_df3),
+        )
+        for func, dfunc in derivative_pairs:
+            assert GradientChecker(func, dfunc, x0=x0).checkgrad(verbose=True)
+
+    def check_gradient(self, link_func, lim_of_inf, test_lim=False):
+        """Gradient-check ``link_func`` and its derivatives at several probe points.
+
+        :param link_func: link function exposing ``transf``, ``dtransf_df``,
+            ``d2transf_df2`` and ``d3transf_df3``
+        :param lim_of_inf: upper input limit; the large probe point is clipped
+            just below it
+        :param test_lim: when True, also check just below ``lim_of_inf`` itself
+        """
+        # Do a limit test if the large f value is too large
+        large_f = np.clip(self.large_f, -np.inf, lim_of_inf - 1e-3)
+        for x0 in (self.mid_f, self.small_f, self.zero_f, large_f):
+            self._check_derivative_chain(link_func, x0)
+
+        if test_lim:
+            print("Testing limits")
+            # Remove some otherwise we are too close to the limit for gradcheck to work effectively
+            self._check_derivative_chain(link_func, lim_of_inf - 1e-4)
+
+    def check_overflow(self, link_func, lim_of_inf):
+        """Assert the link and its first two derivatives stay finite 100 past ``lim_of_inf``."""
+        # Check that it does something sensible beyond this limit,
+        # note this is not checking the value is correct, just that it isn't inf or nan
+        beyond_lim_of_inf = lim_of_inf + 100.0
+        assert not np.isinf(link_func.transf(beyond_lim_of_inf))
+        assert not np.isinf(link_func.dtransf_df(beyond_lim_of_inf))
+        assert not np.isinf(link_func.d2transf_df2(beyond_lim_of_inf))
+
+        assert not np.isnan(link_func.transf(beyond_lim_of_inf))
+        assert not np.isnan(link_func.dtransf_df(beyond_lim_of_inf))
+        assert not np.isnan(link_func.d2transf_df2(beyond_lim_of_inf))
+
+    def test_log_overflow(self):
+        """The Log link matches ``np.exp`` mid-range and stays finite at the limits."""
+        self.setup()
+
+        link = Log()
+        lim_of_inf = _lim_val_exp
+
+        np.testing.assert_almost_equal(np.exp(self.mid_f), link.transf(self.mid_f))
+        assert np.isinf(np.exp(np.log(self.f_upper_lim)))
+        # Check the clipping works
+        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
+        assert np.isfinite(link.transf(self.f_upper_lim))
+        self.check_overflow(link, lim_of_inf)
+
+        # Check that it would otherwise fail
+        beyond_lim_of_inf = lim_of_inf + 10.0
+        # errstate restores the previous floating-point error handling even if the assertion fails
+        with np.errstate(over="ignore"):
+            assert np.isinf(np.exp(beyond_lim_of_inf))
+
+    def test_log_ex_1_overflow(self):
+        """The Log_ex_1 link matches ``log1p(exp(f))`` mid-range and stays finite at the limits."""
+        self.setup()
+
+        link = Log_ex_1()
+        lim_of_inf = _lim_val_exp
+
+        np.testing.assert_almost_equal(
+            scipy.special.log1p(np.exp(self.mid_f)), link.transf(self.mid_f)
+        )
+        assert np.isinf(scipy.special.log1p(np.exp(np.log(self.f_upper_lim))))
+        # Check the clipping works
+        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
+        # Need to look at most significant figures here rather than the decimals
+        np.testing.assert_approx_equal(
+            link.transf(self.f_upper_lim), scipy.special.log1p(_lim_val), significant=5
+        )
+        self.check_overflow(link, lim_of_inf)
+
+        # Check that it would otherwise fail
+        beyond_lim_of_inf = lim_of_inf + 10.0
+        # errstate restores the previous floating-point error handling even if the assertion fails
+        with np.errstate(over="ignore"):
+            assert np.isinf(scipy.special.log1p(np.exp(beyond_lim_of_inf)))
+
+    def test_log_gradients(self):
+        """Gradient-check the Log link, including just below its limit."""
+        # transf dtransf_df d2transf_df2 d3transf_df3
+        self.setup()
+
+        link = Log()
+        lim_of_inf = _lim_val_exp
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+
+    def test_identity_gradients(self):
+        """Gradient-check the Identity link (limit check disabled)."""
+        self.setup()
+        link = Identity()
+        lim_of_inf = _lim_val
+        # FIXME: Should be able to think of a way to test the limits of this
+        self.check_gradient(link, lim_of_inf, test_lim=False)
+
+    def test_probit_gradients(self):
+        """Gradient-check the Probit link, including just below its limit."""
+        self.setup()
+        link = Probit()
+        lim_of_inf = _lim_val
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+
+    def test_scaledprobit_gradients(self):
+        """Gradient-check the ScaledProbit link for a random ``nu`` in [0, 1)."""
+        self.setup()
+        # NOTE(review): nu is drawn without a seed, so this test is not reproducible run-to-run
+        link = ScaledProbit(nu=random.random())
+        lim_of_inf = _lim_val
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+
+    def test_Cloglog_gradients(self):
+        """Gradient-check the Cloglog link, including just below its limit."""
+        self.setup()
+        link = Cloglog()
+        lim_of_inf = _lim_val_exp
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+
+    def test_Log_ex_1_gradients(self):
+        """Gradient-check the Log_ex_1 link and check it stays finite past its limit."""
+        self.setup()
+        link = Log_ex_1()
+        lim_of_inf = _lim_val_exp
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+        self.check_overflow(link, lim_of_inf)
+
+    def test_reciprocal_gradients(self):
+        """Gradient-check the Reciprocal link, including just below its limit."""
+        self.setup()
+        link = Reciprocal()
+        lim_of_inf = _lim_val
+        # Does not work with much smaller values, and values closer to zero than 1e-5
+        self.check_gradient(link, lim_of_inf, test_lim=True)
--- a/GPy/testing/mapping_tests.py
+++ b/GPy/testing/mapping_tests.py
@ -1,10 +1,10 @@
 # Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import unittest
 import numpy as np
 import GPy

+
 class MappingGradChecker(GPy.core.Model):
    """
    This class has everything we need to check the gradient of a mapping. It
@ -12,63 +12,60 @@ class MappingGradChecker(GPy.core.Model):
    mapping. the gradients are checked against the parameters of the mapping
    and the input.
    """
-    def __init__(self, mapping, X, name='map_grad_check'):
+
+    def __init__(self, mapping, X, name="map_grad_check"):
        super(MappingGradChecker, self).__init__(name)
        self.mapping = mapping
        self.link_parameter(self.mapping)
-        self.X = GPy.core.Param('X',X)
+        self.X = GPy.core.Param("X", X)
        self.link_parameter(self.X)
        self.dL_dY = np.random.randn(self.X.shape[0], self.mapping.output_dim)
+
    def log_likelihood(self):
        return np.sum(self.mapping.f(self.X) * self.dL_dY)
+
    def parameters_changed(self):
        self.X.gradient = self.mapping.gradients_X(self.dL_dY, self.X)
        self.mapping.update_gradients(self.dL_dY, self.X)


-class MappingTests(unittest.TestCase):
-
+class TestMapping:
+    """
+    Gradient checks for GPy mappings.
+
+    Every test wraps a mapping and a random input matrix in a
+    MappingGradChecker and asserts that its checkgrad() call succeeds.
+    """
+
    def test_kernelmapping(self):
-        X = np.random.randn(100,3)
-        Z = np.random.randn(10,3)
+        X = np.random.randn(100, 3)
+        Z = np.random.randn(10, 3)
        mapping = GPy.mappings.Kernel(3, 2, Z, GPy.kern.RBF(3))
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        assert MappingGradChecker(mapping, X).checkgrad()

    def test_linearmapping(self):
        mapping = GPy.mappings.Linear(3, 2)
-        X = np.random.randn(100,3)
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        X = np.random.randn(100, 3)
+        assert MappingGradChecker(mapping, X).checkgrad()

    def test_mlpmapping(self):
        mapping = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
-        X = np.random.randn(100,3)
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        X = np.random.randn(100, 3)
+        assert MappingGradChecker(mapping, X).checkgrad()

    def test_mlpextmapping(self):
+        # The only test in this class that seeds numpy's RNG; the same X is
+        # reused for each of the three activations.
        np.random.seed(42)
-        X = np.random.randn(100,3)
-        for activation in ['tanh', 'relu', 'sigmoid']:
-            mapping = GPy.mappings.MLPext(input_dim=3, hidden_dims=[5,5], output_dim=2, activation=activation)
-            self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        X = np.random.randn(100, 3)
+        for activation in ["tanh", "relu", "sigmoid"]:
+            mapping = GPy.mappings.MLPext(
+                input_dim=3, hidden_dims=[5, 5], output_dim=2, activation=activation
+            )
+            assert MappingGradChecker(mapping, X).checkgrad()

    def test_addmapping(self):
        m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
        m2 = GPy.mappings.Linear(input_dim=3, output_dim=2)
        mapping = GPy.mappings.Additive(m1, m2)
-        X = np.random.randn(100,3)
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        X = np.random.randn(100, 3)
+        assert MappingGradChecker(mapping, X).checkgrad()

    def test_compoundmapping(self):
        m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
-        Z = np.random.randn(10,2)
+        Z = np.random.randn(10, 2)
        m2 = GPy.mappings.Kernel(2, 4, Z, GPy.kern.RBF(2))
        mapping = GPy.mappings.Compound(m1, m2)
-        X = np.random.randn(100,3)
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
-
-
-
-
-if __name__ == "__main__":
-    print("Running unit tests, please be (very) patient...")
-    unittest.main()
+        X = np.random.randn(100, 3)
+        assert MappingGradChecker(mapping, X).checkgrad()
--- a/GPy/testing/test_meanfunc.py
+++ b/GPy/testing/test_meanfunc.py
@ -0,0 +1,90 @@
+# Copyright (c) 2015, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import GPy
+
+
+class TestMF:
+    """Gradient checks for GP models that are built with a mean function."""
+
+    def test_simple_mean_function(self):
+        """
+        The simplest possible mean function: no parameters, just a sinusoid.
+        """
+        # parameter-free mapping: f is np.sin and the gradient update is a no-op
+        sinusoid = GPy.core.Mapping(1, 1)
+        sinusoid.f = np.sin
+        sinusoid.update_gradients = lambda a, b: None
+
+        inputs = np.linspace(0, 10, 50).reshape(-1, 1)
+        targets = (
+            np.sin(inputs)
+            + 0.5 * np.cos(3 * inputs)
+            + 0.1 * np.random.randn(*inputs.shape)
+        )
+
+        model = GPy.core.GP(
+            inputs,
+            targets,
+            kernel=GPy.kern.RBF(1),
+            likelihood=GPy.likelihoods.Gaussian(),
+            mean_function=sinusoid,
+        )
+        assert model.checkgrad()
+
+    def test_parametric_mean_function(self):
+        """
+        A PiecewiseLinear mean function whose parameters sit alongside the kernel's.
+        """
+        inputs = np.linspace(-1, 10, 50).reshape(-1, 1)
+
+        targets = 3 - np.abs((inputs - 6))
+        targets += 0.5 * np.cos(3 * inputs) + 0.3 * np.random.randn(*inputs.shape)
+
+        piecewise = GPy.mappings.PiecewiseLinear(1, 1, [-1, 1], [9, 2])
+
+        model = GPy.core.GP(
+            inputs,
+            targets,
+            kernel=GPy.kern.RBF(1),
+            likelihood=GPy.likelihoods.Gaussian(),
+            mean_function=piecewise,
+        )
+        assert model.checkgrad()
+
+    def test_parametric_mean_function_composition(self):
+        """
+        A Compound mean function built from a Linear and a Kernel mapping.
+        """
+        inputs = np.linspace(0, 10, 50).reshape(-1, 1)
+        targets = (
+            np.sin(inputs)
+            + 0.5 * np.cos(3 * inputs)
+            + 0.1 * np.random.randn(*inputs.shape)
+            + 3 * inputs
+        )
+
+        linear_part = GPy.mappings.Linear(1, 1)
+        kernel_part = GPy.mappings.Kernel(
+            1, 1, np.random.normal(0, 1, (1, 1)), GPy.kern.RBF(1)
+        )
+        compound = GPy.mappings.Compound(linear_part, kernel_part)
+
+        model = GPy.core.GP(
+            inputs,
+            targets,
+            kernel=GPy.kern.RBF(1),
+            likelihood=GPy.likelihoods.Gaussian(),
+            mean_function=compound,
+        )
+        assert model.checkgrad()
+
+    def test_parametric_mean_function_additive(self):
+        """
+        An Additive mean function: a Constant plus the sum of an MLP and an Identity.
+        """
+        inputs = np.linspace(0, 10, 50).reshape(-1, 1)
+        targets = (
+            np.sin(inputs)
+            + 0.5 * np.cos(3 * inputs)
+            + 0.1 * np.random.randn(*inputs.shape)
+            + 3 * inputs
+        )
+
+        constant_part = GPy.mappings.Constant(1, 1, 3)
+        nested_sum = GPy.mappings.Additive(
+            GPy.mappings.MLP(1, 1), GPy.mappings.Identity(1, 1)
+        )
+        additive = GPy.mappings.Additive(constant_part, nested_sum)
+
+        model = GPy.core.GP(
+            inputs,
+            targets,
+            kernel=GPy.kern.RBF(1),
+            likelihood=GPy.likelihoods.Gaussian(),
+            mean_function=additive,
+        )
+        assert model.checkgrad()
+
+    def test_svgp_mean_function(self):
+        """An SVGP model with a Linear mean function and a Bernoulli likelihood."""
+        inputs = np.linspace(0, 10, 500).reshape(-1, 1)
+        latent = (
+            np.sin(inputs)
+            + 0.5 * np.cos(3 * inputs)
+            + 0.1 * np.random.randn(*inputs.shape)
+        )
+        labels = np.where(latent > 0, 1, 0)  # make a classification problem
+
+        linear_mean = GPy.mappings.Linear(1, 1)
+        inducing = np.linspace(0, 10, 50).reshape(-1, 1)
+        bernoulli = GPy.likelihoods.Bernoulli()
+        kernel = GPy.kern.RBF(1) + GPy.kern.White(1, 1e-4)
+        model = GPy.core.SVGP(
+            inputs,
+            labels,
+            Z=inducing,
+            kernel=kernel,
+            likelihood=bernoulli,
+            mean_function=linear_mean,
+        )
+        assert model.checkgrad()
--- a/GPy/testing/test_minibatch.py
+++ b/GPy/testing/test_minibatch.py
@ -0,0 +1,416 @@
+"""
+Created on 4 Sep 2015
+
+@author: maxz
+"""
+import pytest
+import numpy as np
+import GPy
+
+try:
+    import climin
+except ImportError:
+    climin = None
+
+
+class TestBGPLVM:
+    """
+    Compare BayesianGPLVMMiniBatch against a full BayesianGPLVM built on the
+    same data, and check the minibatch model's gradients.
+    """
+
+    def setup(self):
+        """Seed numpy's RNG, draw the data set and build the reference model."""
+        np.random.seed(12345)
+        X = np.random.normal(0, 1, (100, 6))
+        W = np.random.normal(0, 1, (6, 13))
+        Y = X.dot(W) + np.random.normal(0, 0.1, (X.shape[0], W.shape[1]))
+        self.inan = np.random.binomial(1, 0.1, Y.shape).astype(bool)
+        self.X, self.W, self.Y = X, W, Y
+        self.Q = 3
+        self.m_full = GPy.models.BayesianGPLVM(Y, self.Q)
+
+    def _minibatch(self, **options):
+        """Build a BayesianGPLVMMiniBatch on the shared data with `options`."""
+        return GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y, self.Q, **options
+        )
+
+    def _check_against_full(self, m):
+        """Copy the full model's parameters into `m`, then compare the log
+        likelihood (7 decimals) and the gradient with the full model's."""
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+
+    def test_lik_comparisons_m1_s0(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        self.setup()
+        m = self._minibatch(missing_data=True, stochastic=False)
+        self._check_against_full(m)
+        assert m.checkgrad()
+
+    def test_predict_missing_data(self):
+        self.setup()
+        m = self._minibatch(
+            missing_data=True, stochastic=True, batchsize=self.Y.shape[1]
+        )
+        self._check_against_full(m)
+
+        with pytest.raises(NotImplementedError):
+            m.predict(m.X, full_cov=True)
+
+        full = self.m_full
+
+        mean_mb, var_mb = m.predict(m.X, full_cov=False)
+        mean_full, var_full = full.predict(full.X, full_cov=False)
+        np.testing.assert_allclose(mean_mb, mean_full)
+        np.testing.assert_allclose(var_mb, var_full)
+
+        mean_mb, var_mb = m.predict(m.X.mean, full_cov=True)
+        mean_full, var_full = full.predict(full.X.mean, full_cov=True)
+        np.testing.assert_allclose(mean_mb, mean_full)
+        np.testing.assert_allclose(var_mb[:, :, 0], var_full)
+
+        mean_mb, var_mb = m.predict(m.X.mean, full_cov=False)
+        mean_full, var_full = full.predict(full.X.mean, full_cov=False)
+        np.testing.assert_allclose(mean_mb, mean_full)
+        np.testing.assert_allclose(var_mb[:, [0]], var_full)
+
+    def test_lik_comparisons_m0_s0(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        self.setup()
+        m = self._minibatch(
+            X_variance=self.m_full.X.variance.values,
+            missing_data=False,
+            stochastic=False,
+        )
+        self._check_against_full(m)
+        assert m.checkgrad()
+
+    def test_lik_comparisons_m1_s1(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        self.setup()
+        m = self._minibatch(
+            missing_data=True, stochastic=True, batchsize=self.Y.shape[1]
+        )
+        self._check_against_full(m)
+        assert m.checkgrad()
+
+    def test_lik_comparisons_m0_s1(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        self.setup()
+        m = self._minibatch(
+            missing_data=False, stochastic=True, batchsize=self.Y.shape[1]
+        )
+        self._check_against_full(m)
+        assert m.checkgrad()
+
+    def test_gradients_missingdata(self):
+        self.setup()
+        m = self._minibatch(
+            missing_data=True, stochastic=False, batchsize=self.Y.shape[1]
+        )
+        assert m.checkgrad()
+
+    def test_gradients_missingdata_stochastics(self):
+        self.setup()
+        for batchsize in (1, 4):
+            m = self._minibatch(
+                missing_data=True, stochastic=True, batchsize=batchsize
+            )
+            assert m.checkgrad()
+
+    def test_gradients_stochastics(self):
+        self.setup()
+        for batchsize in (1, 4):
+            m = self._minibatch(
+                missing_data=False, stochastic=True, batchsize=batchsize
+            )
+            assert m.checkgrad()
+
+    def test_predict(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        self.setup()
+        m = self._minibatch(
+            missing_data=True, stochastic=True, batchsize=self.Y.shape[1]
+        )
+        self._check_against_full(m)
+        assert m.checkgrad()
+
+
+class TestSparseGPMinibatch:
+    """
+    Compare BayesianGPLVMMiniBatch (built with X_variance=False) against a
+    SparseGPLVM on the same data, and gradient-check SparseGPMiniBatch.
+    """
+
+    def setup(self):
+        """Seed numpy's RNG, draw the data set and build the reference model."""
+        np.random.seed(12345)
+        X = np.random.normal(0, 1, (100, 6))
+        W = np.random.normal(0, 1, (6, 13))
+        Y = X.dot(W) + np.random.normal(0, 0.1, (X.shape[0], W.shape[1]))
+        self.inan = np.random.binomial(1, 0.1, Y.shape).astype(bool)
+        self.X, self.W, self.Y = X, W, Y
+        self.Q = 3
+        self.m_full = GPy.models.SparseGPLVM(
+            Y, self.Q, kernel=GPy.kern.RBF(self.Q, ARD=True)
+        )
+
+    def _minibatch(self, **options):
+        """Build a BayesianGPLVMMiniBatch with X_variance=False plus `options`."""
+        return GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y, self.Q, X_variance=False, **options
+        )
+
+    def _check_against_full(self, m):
+        """Copy the full model's parameters into `m`, then compare the log
+        likelihood (7 decimals) and the gradient with the full model's."""
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+
+    def test_lik_comparisons_m1_s0(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        self.setup()
+        m = self._minibatch(missing_data=True, stochastic=False)
+        self._check_against_full(m)
+        assert m.checkgrad()
+
+    @pytest.mark.skipif(climin is None, reason="climin not installed")
+    def test_sparsegp_init(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        self.setup()
+        np.random.seed(1234)
+        Z = self.X[np.random.choice(self.X.shape[0], replace=False, size=10)].copy()
+        Q = Z.shape[1]
+        # (missing_data, stochastic, optimizer) for each model that is built
+        configurations = (
+            (True, False, "adadelta"),
+            (True, True, "rprop"),
+            (False, False, "rprop"),
+            (False, True, "adadelta"),
+        )
+        for missing_data, stochastic, optimizer in configurations:
+            m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(
+                self.X,
+                self.Y,
+                Z,
+                GPy.kern.RBF(Q) + GPy.kern.Matern32(Q) + GPy.kern.Bias(Q),
+                GPy.likelihoods.Gaussian(),
+                missing_data=missing_data,
+                stochastic=stochastic,
+            )
+            assert m.checkgrad()
+            m.optimize(optimizer, max_iters=10)
+            assert m.checkgrad()
+
+    def test_predict_missing_data(self):
+        self.setup()
+        m = self._minibatch(
+            missing_data=True, stochastic=True, batchsize=self.Y.shape[1]
+        )
+        self._check_against_full(m)
+
+        full = self.m_full
+
+        mean_mb, var_mb = m.predict(m.X, full_cov=False)
+        mean_full, var_full = full.predict(full.X, full_cov=False)
+        np.testing.assert_allclose(mean_mb, mean_full)
+        for column in range(var_mb.shape[1]):
+            np.testing.assert_allclose(var_mb[:, [column]], var_full)
+
+        mean_mb, var_mb = m.predict(m.X, full_cov=True)
+        mean_full, var_full = full.predict(full.X, full_cov=True)
+        np.testing.assert_allclose(mean_mb, mean_full)
+        for output in range(var_mb.shape[2]):
+            np.testing.assert_allclose(var_mb[:, :, output], var_full)
+
+    def test_lik_comparisons_m0_s0(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        self.setup()
+        m = self._minibatch(missing_data=False, stochastic=False)
+        self._check_against_full(m)
+        assert m.checkgrad()
+
+    def test_lik_comparisons_m1_s1(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        self.setup()
+        m = self._minibatch(
+            missing_data=True, stochastic=True, batchsize=self.Y.shape[1]
+        )
+        self._check_against_full(m)
+        assert m.checkgrad()
+
+    def test_lik_comparisons_m0_s1(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        self.setup()
+        m = self._minibatch(
+            missing_data=False, stochastic=True, batchsize=self.Y.shape[1]
+        )
+        self._check_against_full(m)
+        assert m.checkgrad()
+
+    def test_gradients_missingdata(self):
+        self.setup()
+        m = self._minibatch(
+            missing_data=True, stochastic=False, batchsize=self.Y.shape[1]
+        )
+        assert m.checkgrad()
+
+    def test_gradients_missingdata_stochastics(self):
+        self.setup()
+        for batchsize in (1, 4):
+            m = self._minibatch(
+                missing_data=True, stochastic=True, batchsize=batchsize
+            )
+            assert m.checkgrad()
+
+    def test_gradients_stochastics(self):
+        self.setup()
+        for batchsize in (1, 4):
+            m = self._minibatch(
+                missing_data=False, stochastic=True, batchsize=batchsize
+            )
+            assert m.checkgrad()
+
+    def test_predict(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        self.setup()
+        m = self._minibatch(
+            missing_data=True, stochastic=True, batchsize=self.Y.shape[1]
+        )
+        self._check_against_full(m)
+        assert m.checkgrad()
--- a/GPy/testing/misc_tests.py
+++ b/GPy/testing/misc_tests.py
@ -1,27 +1,28 @@
-from __future__ import print_function
 import numpy as np
-import scipy as sp
 import GPy
 import warnings

-class MiscTests(np.testing.TestCase):
+
+class TestMisc:
    """
    Testing some utilities of misc
    """
-    def setUp(self):
+
+    def setup(self):
+        # Store the largest finite float64 value and its natural logarithm;
+        # the tests use them as the overflow boundary for np.exp.
        self._lim_val = np.finfo(np.float64).max
        self._lim_val_exp = np.log(self._lim_val)

    def test_safe_exp_upper(self):
+        # Check that np.exp overflows to inf just above the stored limit while
+        # GPy.util.misc.safe_exp still returns a finite value there.
+        self.setup()
        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter('always')  # always print
+            warnings.simplefilter("always")  # record every warning, even repeated ones
            assert np.isfinite(np.exp(self._lim_val_exp))
            assert np.isinf(np.exp(self._lim_val_exp + 1))
            assert np.isfinite(GPy.util.misc.safe_exp(self._lim_val_exp + 1))

            print(w)
            print(len(w))
-            assert len(w)<=1 # should have one overflow warning
+            assert len(w) <= 1  # at most one warning (the overflow) may have been recorded

    def test_safe_exp_lower(self):
        assert GPy.util.misc.safe_exp(1e-10) < np.inf
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
--- a/GPy/testing/test_mpi.py
+++ b/GPy/testing/test_mpi.py
@ -0,0 +1,83 @@
+# Copyright (c) 2013-2014, Zhenwen Dai
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+
+try:
+    import subprocess
+
+    class TestMPI:
+        """
+        Run small GPy models under `mpirun -n 4` and check that the objective
+        printed with the MPI communicator attached matches the one printed
+        after the communicator has been removed.
+
+        Each test writes a throw-away script `mpi_test__.py` into the current
+        working directory, launches it through the shell and parses the last
+        two lines of its stdout as floats. The tests need `mpirun` on the PATH
+        and `mpi4py` importable by the `python` that the shell resolves.
+        """
+
+        def test_BayesianGPLVM_MPI(self):
+            """BayesianGPLVM objective with and without the MPI communicator."""
+            # The script runs under a Python 3 interpreter, so it must use the
+            # print() function; a Python 2 print statement is a SyntaxError and
+            # would leave stdout empty.
+            code = """
import numpy as np
import GPy
from mpi4py import MPI
np.random.seed(123456)
comm = MPI.COMM_WORLD
N = 100
x = np.linspace(-6., 6., N)
y = np.sin(x) + np.random.randn(N) * 0.05
comm.Bcast(y)
data = np.vstack([x,y])
infr = GPy.inference.latent_function_inference.VarDTC_minibatch(mpi_comm=comm)
m = GPy.models.BayesianGPLVM(data.T,1,mpi_comm=comm)
m.optimize(max_iters=10)
if comm.rank==0:
    print(float(m.objective_function()))
    m.inference_method.mpi_comm=None
    m.mpi_comm=None
    m._trigger_params_changed()
    print(float(m.objective_function()))
+            """
+            # The with-statement closes the file; no explicit close() needed.
+            with open("mpi_test__.py", "w") as f:
+                f.write(code)
+            try:
+                p = subprocess.Popen(
+                    "mpirun -n 4 python mpi_test__.py",
+                    stdout=subprocess.PIPE,
+                    shell=True,
+                )
+                (stdout, _stderr) = p.communicate()
+                L1 = float(stdout.splitlines()[-2])
+                L2 = float(stdout.splitlines()[-1])
+                # Plain assert: this class is not a unittest.TestCase, so
+                # self.assertTrue does not exist.
+                assert np.allclose(L1, L2)
+            finally:
+                # Remove the temporary script even when the run or the
+                # comparison fails.
+                import os
+
+                os.remove("mpi_test__.py")
+
+        def test_SparseGPRegression_MPI(self):
+            """SparseGPRegression objective with and without the MPI communicator."""
+            code = """
import numpy as np
import GPy
from mpi4py import MPI
np.random.seed(123456)
comm = MPI.COMM_WORLD
N = 100
x = np.linspace(-6., 6., N)
y = np.sin(x) + np.random.randn(N) * 0.05
comm.Bcast(y)
data = np.vstack([x,y])
#infr = GPy.inference.latent_function_inference.VarDTC_minibatch(mpi_comm=comm)
m = GPy.models.SparseGPRegression(data[:1].T,data[1:2].T,mpi_comm=comm)
m.optimize(max_iters=10)
if comm.rank==0:
    print(float(m.objective_function()))
    m.inference_method.mpi_comm=None
    m.mpi_comm=None
    m._trigger_params_changed()
    print(float(m.objective_function()))
+            """
+            with open("mpi_test__.py", "w") as f:
+                f.write(code)
+            try:
+                p = subprocess.Popen(
+                    "mpirun -n 4 python mpi_test__.py",
+                    stdout=subprocess.PIPE,
+                    shell=True,
+                )
+                (stdout, _stderr) = p.communicate()
+                L1 = float(stdout.splitlines()[-2])
+                L2 = float(stdout.splitlines()[-1])
+                assert np.allclose(L1, L2)
+            finally:
+                import os
+
+                os.remove("mpi_test__.py")
+
+except ImportError:
+    # Only a failed import should disable these tests; any other error while
+    # defining the class is a real bug and must surface.
+    pass
--- a/GPy/testing/pep_tests.py
+++ b/GPy/testing/pep_tests.py
@ -1,94 +1,98 @@
 # Copyright (c) 2014, James Hensman, 2016, Thang Bui
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import unittest
 import numpy as np
 import GPy

-class PEPgradienttest(unittest.TestCase):
-    def setUp(self):
+
+class TestPEPgradient:
+    """
+    Gradient checks for PEP inference in SparseGPRegression, plus checks that
+    its log marginal likelihood agrees with VarDTC for a very small alpha
+    (1e-5) and with FITC for alpha=1, within a relative tolerance.
+    """
+
+    def setup(self):
        ######################################
        # # 1 dimensional example
        np.random.seed(10)

        N = 20
        # sample inputs and outputs
-        self.X1D = np.random.uniform(-3., 3., (N, 1))
+        self.X1D = np.random.uniform(-3.0, 3.0, (N, 1))
        self.Y1D = np.sin(self.X1D) + np.random.randn(N, 1) * 0.05

        ######################################
        # # 2 dimensional example

        # sample inputs and outputs
-        self.X2D = np.random.uniform(-3., 3., (N, 2))
-        self.Y2D = np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2]) + np.random.randn(N, 1) * 0.05
+        self.X2D = np.random.uniform(-3.0, 3.0, (N, 2))
+        self.Y2D = (
+            np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2])
+            + np.random.randn(N, 1) * 0.05
+        )

        #######################################
        # # more datapoints, check in alpha limits, the log marginal likelihood
        # # is consistent with FITC and VFE/Var_DTC
        M = 5
        np.random.seed(42)
-        self.X1 = np.c_[np.linspace(-1., 1., N)]
+        self.X1 = np.c_[np.linspace(-1.0, 1.0, N)]
        self.Y1 = np.sin(self.X1) + np.random.randn(N, 1) * 0.05
        self.kernel = GPy.kern.RBF(input_dim=1, lengthscale=0.5, variance=1)
        self.Z = np.random.uniform(-1, 1, (M, 1))
        self.lik_noise_var = 0.01

    def test_pep_1d_gradients(self):
+        self.setup()
        m = GPy.models.SparseGPRegression(self.X1D, self.Y1D)
-        m.inference_method = GPy.inference.latent_function_inference.PEP(alpha=np.random.rand())
-        self.assertTrue(m.checkgrad())
+        m.inference_method = GPy.inference.latent_function_inference.PEP(
+            alpha=np.random.rand()
+        )
+        assert m.checkgrad()

    def test_pep_2d_gradients(self):
+        self.setup()
        m = GPy.models.SparseGPRegression(self.X2D, self.Y2D)
-        m.inference_method = GPy.inference.latent_function_inference.PEP(alpha=np.random.rand())
-        self.assertTrue(m.checkgrad())
+        m.inference_method = GPy.inference.latent_function_inference.PEP(
+            alpha=np.random.rand()
+        )
+        assert m.checkgrad()

    def test_pep_vfe_consistency(self):
+        self.setup()
        vfe_model = GPy.models.SparseGPRegression(
-            self.X1, 
-            self.Y1, 
-            kernel=self.kernel, 
-            Z=self.Z
+            self.X1, self.Y1, kernel=self.kernel, Z=self.Z
        )
        vfe_model.inference_method = GPy.inference.latent_function_inference.VarDTC()
        vfe_model.Gaussian_noise.variance = self.lik_noise_var
        vfe_lml = vfe_model.log_likelihood()

        pep_model = GPy.models.SparseGPRegression(
-            self.X1, 
-            self.Y1, 
-            kernel=self.kernel, 
-            Z=self.Z
+            self.X1, self.Y1, kernel=self.kernel, Z=self.Z
+        )
+        pep_model.inference_method = GPy.inference.latent_function_inference.PEP(
+            alpha=1e-5
        )
-        pep_model.inference_method = GPy.inference.latent_function_inference.PEP(alpha=1e-5)
        pep_model.Gaussian_noise.variance = self.lik_noise_var
        pep_lml = pep_model.log_likelihood()

-        self.assertAlmostEqual(vfe_lml[0, 0], pep_lml[0], delta=abs(0.01*pep_lml[0]))
+        # Allow a 1% deviation relative to the PEP value, as the removed
+        # `delta=` check did. assert_almost_equal's `decimal` is a number of
+        # decimal places (tolerance 1.5 * 10**-decimal), not a tolerance.
+        np.testing.assert_allclose(vfe_lml[0, 0], pep_lml[0], rtol=0.01)

    def test_pep_fitc_consistency(self):
+        self.setup()
        fitc_model = GPy.models.SparseGPRegression(
-            self.X1D, 
-            self.Y1D, 
-            kernel=self.kernel, 
-            Z=self.Z
+            self.X1D, self.Y1D, kernel=self.kernel, Z=self.Z
        )
        fitc_model.inference_method = GPy.inference.latent_function_inference.FITC()
        fitc_model.Gaussian_noise.variance = self.lik_noise_var
        fitc_lml = fitc_model.log_likelihood()

        pep_model = GPy.models.SparseGPRegression(
-            self.X1D, 
-            self.Y1D, 
-            kernel=self.kernel, 
-            Z=self.Z
+            self.X1D, self.Y1D, kernel=self.kernel, Z=self.Z
+        )
+        pep_model.inference_method = GPy.inference.latent_function_inference.PEP(
+            alpha=1
        )
-        pep_model.inference_method = GPy.inference.latent_function_inference.PEP(alpha=1)
        pep_model.Gaussian_noise.variance = self.lik_noise_var
        pep_lml = pep_model.log_likelihood()

-        self.assertAlmostEqual(fitc_lml, pep_lml[0], delta=abs(0.001*pep_lml[0]))
-
-
-
+        # Allow a 0.1% deviation relative to the PEP value, as the removed
+        # `delta=` check did.
+        np.testing.assert_allclose(fitc_lml, pep_lml[0], rtol=0.001)
--- a/GPy/testing/test_pickle.py
+++ b/GPy/testing/test_pickle.py
@ -0,0 +1,133 @@
+"""
+Created on 13 Mar 2014
+
+@author: maxz
+"""
+# import cPickle as pickle
+import pickle
+import pytest
+import numpy as np
+import tempfile
+from GPy.examples.dimensionality_reduction import mrd_simulation
+from GPy.core.parameterization.variational import NormalPosterior
+from GPy.models.gp_regression import GPRegression
+import GPy
+
+
+def toy_model():
+    """Build a GPRegression model on 50 evenly spaced inputs in [0, 1] with targets sin(x)."""
+    inputs = np.linspace(0, 1, 50)[:, None]
+    return GPRegression(X=inputs, Y=np.sin(inputs))
+
+
+class ListDictTestCase:
+    """Mixin with comparison helpers for dicts of sequences and lists of arrays."""
+
+    def assertListDictEquals(self, d1, d2, msg=None):
+        """Assert that each value of ``d1`` equals, as a list, the value under the same key in ``d2``.
+
+        Keys that exist only in ``d2`` are not checked; a key of ``d1`` missing
+        from ``d2`` raises KeyError.
+        """
+        # Plain ``assert``: this class is not a unittest.TestCase, so
+        # ``self.assertListEqual`` is not available here.
+        for k, v in d1.items():
+            assert list(v) == list(d2[k]), msg
+
+    def assertArrayListEquals(self, l1, l2):
+        """Assert that the arrays of ``l1`` and ``l2`` are pairwise equal.
+
+        The lists are zipped, so surplus entries of the longer list are ignored.
+        """
+        for a1, a2 in zip(l1, l2):
+            np.testing.assert_array_equal(a1, a2)
+
+
+class TestPickleSupport(ListDictTestCase):
+    @pytest.mark.skip(reason="")  # why is this test skipped?
+    def test_load_pickle(self):
+        """Load the stored ``pickle_test.pickle`` model and check gradients and log likelihood."""
+        import os
+
+        m = GPy.load(
+            os.path.join(
+                os.path.abspath(os.path.split(__file__)[0]), "pickle_test.pickle"
+            )
+        )
+        assert m.checkgrad()
+        # ``assert value, expected`` would treat the expected number as the
+        # assertion message and compare nothing, so compare explicitly.
+        np.testing.assert_allclose(m.log_likelihood(), -4.7351019830022087)
+
+    def test_model(self):
+        """A copy and a pickle round trip of the toy model keep parameters, gradients and printed form."""
+        par = toy_model()
+        pcopy = par.copy()
+        assert par.param_array.tolist() == pcopy.param_array.tolist()
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
+        assert str(par) == str(pcopy)
+        assert np.all(par.param_array == pcopy.param_array)
+        assert np.all(par.gradient_full == pcopy.gradient_full)
+        assert pcopy.checkgrad()
+        assert np.any(pcopy.gradient != 0.0)
+        # pickle into a temporary file, rewind, and load the model back
+        with tempfile.TemporaryFile("w+b") as f:
+            par.pickle(f)
+            f.seek(0)
+            pcopy = pickle.load(f)
+        assert par.param_array.tolist() == pcopy.param_array.tolist()
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
+        assert str(par) == str(pcopy)
+        assert pcopy.checkgrad()
+
+    def test_modelrecreation(self):
+        """A model rebuilt from copies of X, Y and the kernel matches the original; so does a pickle round trip after randomize()."""
+        par = toy_model()
+        pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
+        np.testing.assert_allclose(par.param_array, pcopy.param_array)
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
+        assert str(par) == str(pcopy)
+        assert np.all(par.param_array == pcopy.param_array)
+        assert np.all(par.gradient_full == pcopy.gradient_full)
+        assert pcopy.checkgrad()
+        assert np.any(pcopy.gradient != 0.0)
+        np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)
+        # change the parameters before pickling so the round trip is not trivially equal to the defaults
+        par.randomize()
+        with tempfile.TemporaryFile("w+b") as f:
+            par.pickle(f)
+            f.seek(0)
+            pcopy = pickle.load(f)
+        np.testing.assert_allclose(par.param_array, pcopy.param_array)
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full, atol=1e-6)
+        assert str(par) == str(pcopy)
+        assert pcopy.checkgrad()
+
+    def test_posterior(self):
+        """A NormalPosterior keeps its parameters and printed form through copy() and a pickle round trip."""
+        X = np.random.randn(3, 5)
+        Xv = np.random.rand(*X.shape)
+        par = NormalPosterior(X, Xv)
+        par.gradient = 10
+        pcopy = par.copy()
+        # the gradient is assigned on the copy as well before the two are compared
+        pcopy.gradient = 10
+        assert par.param_array.tolist() == pcopy.param_array.tolist()
+        assert par.gradient_full.tolist() == pcopy.gradient_full.tolist()
+        assert str(par) == str(pcopy)
+        assert np.all(par.param_array == pcopy.param_array)
+        assert np.all(par.gradient_full == pcopy.gradient_full)
+        with tempfile.TemporaryFile("w+b") as f:
+            par.pickle(f)
+            f.seek(0)
+            pcopy = pickle.load(f)
+        assert par.param_array.tolist() == pcopy.param_array.tolist()
+        # the gradient is assigned again on the unpickled object before comparing
+        pcopy.gradient = 10
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
+        np.testing.assert_allclose(pcopy.mean.gradient_full, 10)
+        assert str(par) == str(pcopy)
+
+    def test_model_concat(self):
+        """The model returned by mrd_simulation keeps parameters, gradients and printed form through copy() and pickle."""
+        # build the example model without optimizing or plotting
+        par = mrd_simulation(optimize=0, plot=0, plot_sim=0)
+        par.randomize()
+        pcopy = par.copy()
+        assert par.param_array.tolist() == pcopy.param_array.tolist()
+        assert par.gradient_full.tolist() == pcopy.gradient_full.tolist()
+        assert str(par) == str(pcopy)
+        assert np.all(par.param_array == pcopy.param_array)
+        assert np.all(par.gradient_full == pcopy.gradient_full)
+        assert par.checkgrad()
+        assert pcopy.checkgrad()
+        assert np.any(pcopy.gradient != 0.0)
+        with tempfile.TemporaryFile("w+b") as f:
+            par.pickle(f)
+            f.seek(0)
+            pcopy = pickle.load(f)
+        assert par.param_array.tolist() == pcopy.param_array.tolist()
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
+        assert str(par) == str(pcopy)
+        assert pcopy.checkgrad()
+
+    def _callback(self, what, which):
+        """Increment ``what.count`` by one; ``which`` is not used."""
+        what.count += 1
--- a/GPy/testing/test_plotting.py
+++ b/GPy/testing/test_plotting.py
@ -0,0 +1,703 @@
+# ===============================================================================
+# Copyright (c) 2015, Max Zwiessele
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of GPy nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ===============================================================================
+
+
+# ===============================================================================
+# SKIPPING PLOTTING BECAUSE IT BEHAVES DIFFERENTLY ON DIFFERENT
+# SYSTEMS, AND WILL MISBEHAVE
+
+# raise SkipTest("Skipping Matplotlib testing")
+# ===============================================================================
+
+try:
+    import matplotlib
+    from matplotlib import pyplot as plt
+    from matplotlib.testing.compare import compare_images
+
+    matplotlib.use("agg")
+except ImportError:
+    # matplotlib not installed
+    matplotlib = None
+
+import pytest
+import numpy as np
+import GPy, os
+import logging
+
+from GPy.util.config import config
+from GPy.plotting import change_plotting_library, plotting_library
+
+
+class TestConfig:
+    """Checks for switching the GPy plotting library."""
+
+    def teardown(self):
+        """Switch the plotting library back to matplotlib."""
+        change_plotting_library("matplotlib")
+
+    @pytest.mark.skipif(matplotlib is None, reason="Matplotlib not installed")
+    def test_change_plotting(self):
+        """An unknown library name raises ValueError; with "none" selected, plotting_library() raises RuntimeError."""
+        try:
+            with pytest.raises(ValueError):
+                change_plotting_library("not+in9names")
+            change_plotting_library("none")
+            with pytest.raises(RuntimeError):
+                plotting_library()
+        finally:
+            # Restore the library even when a check above fails, so that a
+            # failure here cannot leave "none" selected for later tests; the
+            # nose-style ``teardown`` hook is not reliably run by pytest.
+            self.teardown()
+
+
+change_plotting_library("matplotlib")
+
+extensions = ["npz"]
+
+basedir = os.path.dirname(os.path.relpath(os.path.abspath(__file__)))
+
+
+def _image_directories():
+    """
+    Compute the baseline and result image directories for the plotting tests.
+
+    Both paths are built below the module-level ``basedir``. The result
+    directory is created if it doesn't exist; the baseline directory is not.
+
+    :returns: tuple ``(baseline_dir, result_dir)``
+    """
+    result_dir = os.path.join(basedir, "testresult", ".")
+    baseline_dir = os.path.join(basedir, "baseline", ".")
+    # exist_ok avoids the check-then-create race of a separate exists() test
+    os.makedirs(result_dir, exist_ok=True)
+    return baseline_dir, result_dir
+
+
+baseline_dir, result_dir = _image_directories()
+if not os.path.exists(baseline_dir):
+    baseline_dir = None
+
+
+def _image_comparison(
+    baseline_images, extensions=["pdf", "svg", "png"], tol=11, rtol=1e-3, **kwargs
+):
+    """
+    Save all open matplotlib figures and yield one check per figure and extension.
+
+    Each open figure is paired, in ``plt.get_fignums()`` order, with a name
+    from ``baseline_images`` and written to ``result_dir``. The generator then
+    yields zero-argument callables that compare each result file against the
+    file of the same name in ``baseline_dir``. All figures are closed once the
+    generator is exhausted.
+
+    :param baseline_images: base file names (without extension), one per figure
+    :param extensions: file formats to write and compare; ``"npz"`` stores the
+        arrays returned by ``flatten_axis`` (a png is saved as well, best
+        effort), any other value is passed to ``fig.savefig`` and compared
+        with ``compare_images``. The default list is never mutated.
+    :param tol: tolerance handed to ``compare_images``
+    :param rtol: relative tolerance for the ``"npz"`` array comparison
+    :param kwargs: extra keyword arguments for ``np.testing.assert_allclose``
+    :returns: generator of callables; a callable skips the running test on a
+        mismatch. For ``"npz"`` a missing baseline file raises IOError after
+        the result has been copied in place as the new baseline.
+    """
+    for num, base in zip(plt.get_fignums(), baseline_images):
+        for ext in extensions:
+            fig = plt.figure(num)
+            try:
+                fig.canvas.draw()
+            except Exception:
+                # best effort: a figure that cannot be drawn is logged, not fatal
+                logging.error(base)
+            if ext in ["npz"]:
+                figdict = flatten_axis(fig)
+                np.savez_compressed(
+                    os.path.join(result_dir, "{}.{}".format(base, ext)), **figdict
+                )
+                try:
+                    # the png is only a visual aid next to the npz data
+                    fig.savefig(
+                        os.path.join(result_dir, "{}.{}".format(base, "png")),
+                        transparent=True,
+                        edgecolor="none",
+                        facecolor="none",
+                    )
+                except Exception:
+                    logging.error(base)
+            else:
+                fig.savefig(
+                    os.path.join(result_dir, "{}.{}".format(base, ext)),
+                    transparent=True,
+                    edgecolor="none",
+                    facecolor="none",
+                )
+    for num, base in zip(plt.get_fignums(), baseline_images):
+        for ext in extensions:
+            actual = os.path.join(result_dir, "{}.{}".format(base, ext))
+            expected = os.path.join(baseline_dir, "{}.{}".format(base, ext))
+            if ext == "npz":
+
+                # Default arguments bind the current loop values, so the
+                # callable stays correct even if it is run after the loop
+                # has moved on.
+                def do_test(base=base, actual=actual, expected=expected):
+                    if not os.path.exists(expected):
+                        import shutil
+
+                        shutil.copy2(actual, expected)
+                        raise IOError(
+                            "Baseline file {} not found, copying result {}".format(
+                                expected, actual
+                            )
+                        )
+                    # context managers close the underlying npz archives
+                    with np.load(expected) as exp_npz, np.load(actual) as act_npz:
+                        exp_dict = dict(exp_npz.items())
+                        act_dict = dict(act_npz.items())
+                    for name in act_dict:
+                        if name in exp_dict:
+                            try:
+                                np.testing.assert_allclose(
+                                    exp_dict[name],
+                                    act_dict[name],
+                                    err_msg="Mismatch in {}.{}".format(base, name),
+                                    rtol=rtol,
+                                    **kwargs
+                                )
+                            except AssertionError as e:
+                                # pytest.skip is a function, not a context
+                                # manager: call it with the mismatch report
+                                pytest.skip(str(e))
+
+            else:
+
+                def do_test(actual=actual, expected=expected):
+                    err = compare_images(expected, actual, tol, in_decorator=True)
+                    if err:
+                        pytest.skip(
+                            "Error between {} and {} is {:.5f}, which is bigger then the tolerance of {:.5f}".format(
+                                actual, expected, err["rms"], tol
+                            )
+                        )
+
+            yield do_test
+    plt.close("all")
+
+
+def flatten_axis(ax, prevname=""):
+    """
+    Collect the numpy arrays reachable from the members of ``ax`` into a flat dict.
+
+    Only members that are arrays or non-empty lists are followed. List items
+    are visited recursively: arrays are stored, dicts and nested lists are
+    descended into, and any other object is flattened through its own members.
+
+    :param ax: object to inspect (the plotting tests pass a matplotlib figure)
+    :param prevname: prefix put in front of every key
+    :returns: dict mapping a dotted/indexed path such as ``"lines[0].data"``
+        to the array found there
+    """
+    import inspect
+
+    members = inspect.getmembers(ax)
+
+    arrays = {}
+
+    def _flatten(l, pre):
+        arr = {}
+        if isinstance(l, np.ndarray):
+            # empty arrays nested in lists are dropped
+            if l.size:
+                arr[pre] = np.asarray(l)
+        elif isinstance(l, dict):
+            for _n in l:
+                # Descend into the value; recursing on the dict itself, as the
+                # code did before, never terminates.
+                arr.update(_flatten(l[_n], "{}.{}".format(pre, _n)))
+        elif isinstance(l, list) and len(l) > 0:
+            for i, item in enumerate(l):
+                arr.update(_flatten(item, pre + "[{}]".format(i)))
+        else:
+            return flatten_axis(l, pre + ".")
+        return arr
+
+    for name, l in members:
+        if isinstance(l, np.ndarray):
+            arrays[prevname + name] = np.asarray(l)
+        elif isinstance(l, list) and len(l) > 0:
+            for i, item in enumerate(l):
+                arrays.update(_flatten(item, prevname + name + "[{}]".format(i)))
+
+    return arrays
+
+
+def _a(x, y, decimal):
+    """Assert that ``x`` and ``y`` agree element-wise to ``decimal`` decimal places."""
+    np.testing.assert_array_almost_equal(x, y, decimal=decimal)
+
+
+def compare_axis_dicts(x, y, decimal=6):
+    """
+    Compare two dicts of arrays and print, rather than raise, any mismatch.
+
+    :param x: dict of arrays; its keys drive the comparison
+    :param y: dict of arrays looked up with the keys of ``x``
+    :param decimal: number of decimal places passed on to ``_a``
+    """
+    try:
+        assert len(x) == len(y)
+        for name in x:
+            _a(x[name], y[name], decimal)
+    except AssertionError as e:
+        # Python 3 exceptions have no ``.message`` attribute; printing the
+        # exception itself shows the report.
+        print(e)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_figure():
+    np.random.seed(1239847)
+    from GPy.plotting import plotting_library as pl
+
+    # import matplotlib
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+
+        ax, _ = pl().new_canvas(num="imshow_interact")
+
+        def test_func(x):
+            return x[:, 0].reshape(3, 3)
+
+        pl().imshow_interact(ax, test_func, extent=(-1, 1, -1, 1), resolution=3)
+
+        ax, _ = pl().new_canvas()
+
+        def test_func_2(x):
+            y = x[:, 0].reshape(3, 3)
+            anno = np.argmax(x, axis=1).reshape(3, 3)
+            return y, anno
+
+        pl().annotation_heatmap_interact(
+            ax, test_func_2, extent=(-1, 1, -1, 1), resolution=3
+        )
+        pl().annotation_heatmap_interact(
+            ax,
+            test_func_2,
+            extent=(-1, 1, -1, 1),
+            resolution=3,
+            imshow_kwargs=dict(interpolation="nearest"),
+        )
+
+        ax, _ = pl().new_canvas(figsize=(4, 3))
+        x = np.linspace(0, 1, 100)
+        y = [0, 1, 2]
+        array = np.array([0.4, 0.5])
+        cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
+            "WhToColor", ("r", "b"), N=array.size
+        )
+
+        pl().fill_gradient(ax, x, y, facecolors=["r", "g"], array=array, cmap=cmap)
+
+        ax, _ = pl().new_canvas(
+            num="3d_plot",
+            figsize=(4, 3),
+            projection="3d",
+            xlabel="x",
+            ylabel="y",
+            zlabel="z",
+            title="awsome title",
+            xlim=(-1, 1),
+            ylim=(-1, 1),
+            zlim=(-3, 3),
+        )
+        z = 2 - np.abs(np.linspace(-2, 2, (100))) + 1
+        x, y = z * np.sin(np.linspace(-2 * np.pi, 2 * np.pi, (100))), z * np.cos(
+            np.linspace(-np.pi, np.pi, (100))
+        )
+
+        pl().plot(ax, x, y, z, linewidth=2)
+
+        for do_test in _image_comparison(
+            baseline_images=[
+                "coverage_{}".format(sub)
+                for sub in [
+                    "imshow_interact",
+                    "annotation_interact",
+                    "gradient",
+                    "3d_plot",
+                ]
+            ],
+            extensions=extensions,
+        ):
+            yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_kernel():
+    np.random.seed(1239847)
+    # import matplotlib
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        k = GPy.kern.RBF(5, ARD=True) * GPy.kern.Linear(
+            3, active_dims=[0, 2, 4], ARD=True
+        ) + GPy.kern.Bias(2)
+        k.randomize()
+        k2 = (
+            GPy.kern.RBF(5, ARD=True)
+            * GPy.kern.Linear(3, active_dims=[0, 2, 4], ARD=True)
+            + GPy.kern.Bias(2)
+            + GPy.kern.White(4)
+        )
+        k2[:-1] = k[:]
+        k2.plot_ARD(["rbf", "linear", "bias"], legend=True)
+        k2.plot_covariance(visible_dims=[0, 3], plot_limits=(-1, 3))
+        k2.plot_covariance(visible_dims=[2], plot_limits=(-1, 3))
+        k2.plot_covariance(
+            visible_dims=[2, 4],
+            plot_limits=((-1, 0), (5, 3)),
+            projection="3d",
+            rstride=10,
+            cstride=10,
+        )
+        k2.plot_covariance(visible_dims=[1, 4])
+        for do_test in _image_comparison(
+            baseline_images=[
+                "kern_{}".format(sub)
+                for sub in ["ARD", "cov_2d", "cov_1d", "cov_3d", "cov_no_lim"]
+            ],
+            extensions=extensions,
+        ):
+            yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_plot():
+    np.random.seed(111)
+    import matplotlib
+
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        X = np.random.uniform(-2, 2, (40, 1))
+        f = 0.2 * np.sin(1.3 * X) + 1.3 * np.cos(2 * X)
+        Y = f + np.random.normal(0, 0.1, f.shape)
+        m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X) * [0.06])
+        # m.optimize()
+        m.plot_data()
+        m.plot_mean()
+        m.plot_confidence()
+        m.plot_density()
+        m.plot_errorbars_trainset()
+        m.plot_samples()
+        m.plot_data_error()
+    for do_test in _image_comparison(
+        baseline_images=[
+            "gp_{}".format(sub)
+            for sub in [
+                "data",
+                "mean",
+                "conf",
+                "density",
+                "out_error",
+                "samples",
+                "in_error",
+            ]
+        ],
+        extensions=extensions,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_twod():
+    np.random.seed(11111)
+    import matplotlib
+
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    X = np.random.uniform(-2, 2, (40, 2))
+    f = 0.2 * np.sin(1.3 * X[:, [0]]) + 1.3 * np.cos(2 * X[:, [1]])
+    Y = f + np.random.normal(0, 0.1, f.shape)
+    m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X) * [0.01, 0.2])
+    # m.optimize()
+    m.plot_data()
+    m.plot_mean()
+    m.plot_inducing(legend=False, marker="s")
+    # m.plot_errorbars_trainset()
+    m.plot_data_error()
+    for do_test in _image_comparison(
+        baseline_images=[
+            "gp_2d_{}".format(sub)
+            for sub in [
+                "data",
+                "mean",
+                "inducing",
+                #'out_error',
+                "in_error",
+            ]
+        ],
+        extensions=extensions,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_threed():
+    np.random.seed(11111)
+    import matplotlib
+
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    X = np.random.uniform(-2, 2, (40, 2))
+    f = 0.2 * np.sin(1.3 * X[:, [0]]) + 1.3 * np.cos(2 * X[:, [1]])
+    Y = f + np.random.normal(0, 0.1, f.shape)
+    m = GPy.models.SparseGPRegression(X, Y)
+    m.likelihood.variance = 0.1
+    # m.optimize()
+    m.plot_samples(projection="3d", samples=1)
+    m.plot_samples(projection="3d", plot_raw=False, samples=1)
+    plt.close("all")
+    m.plot_data(projection="3d")
+    m.plot_mean(projection="3d", rstride=10, cstride=10)
+    m.plot_inducing(projection="3d")
+    # m.plot_errorbars_trainset(projection='3d')
+    for do_test in _image_comparison(
+        baseline_images=[
+            "gp_3d_{}".format(sub)
+            for sub in [
+                "data",
+                "mean",
+                "inducing",
+            ]
+        ],
+        extensions=extensions,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_sparse():
+    np.random.seed(11111)
+    import matplotlib
+
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    X = np.random.uniform(-2, 2, (40, 1))
+    f = 0.2 * np.sin(1.3 * X) + 1.3 * np.cos(2 * X)
+    Y = f + np.random.normal(0, 0.1, f.shape)
+    m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X) * 0.1)
+    # m.optimize()
+    # m.plot_inducing()
+    _, ax = plt.subplots()
+    m.plot_data(ax=ax)
+    m.plot_data_error(ax=ax)
+    for do_test in _image_comparison(
+        baseline_images=["sparse_gp_{}".format(sub) for sub in ["data_error"]],
+        extensions=extensions,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_classification():
+    np.random.seed(11111)
+    import matplotlib
+
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    X = np.random.uniform(-2, 2, (40, 1))
+    f = 0.2 * np.sin(1.3 * X) + 1.3 * np.cos(2 * X)
+    Y = f + np.random.normal(0, 0.1, f.shape)
+    m = GPy.models.GPClassification(X, Y > Y.mean())
+    # m.optimize()
+    _, ax = plt.subplots()
+    m.plot(plot_raw=False, apply_link=False, ax=ax, samples=3)
+    m.plot_errorbars_trainset(plot_raw=False, apply_link=False, ax=ax)
+    _, ax = plt.subplots()
+    m.plot(plot_raw=True, apply_link=False, ax=ax, samples=3)
+    m.plot_errorbars_trainset(plot_raw=True, apply_link=False, ax=ax)
+    _, ax = plt.subplots()
+    m.plot(plot_raw=True, apply_link=True, ax=ax, samples=3)
+    m.plot_errorbars_trainset(plot_raw=True, apply_link=True, ax=ax)
+    for do_test in _image_comparison(
+        baseline_images=[
+            "gp_class_{}".format(sub) for sub in ["likelihood", "raw", "raw_link"]
+        ],
+        extensions=extensions,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_sparse_classification():
+    np.random.seed(11111)
+    import matplotlib
+
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    X = np.random.uniform(-2, 2, (40, 1))
+    f = 0.2 * np.sin(1.3 * X) + 1.3 * np.cos(2 * X)
+    Y = f + np.random.normal(0, 0.1, f.shape)
+    m = GPy.models.SparseGPClassification(X, Y > Y.mean())
+    # m.optimize()
+    m.plot(plot_raw=False, apply_link=False, samples_likelihood=3)
+    np.random.seed(111)
+    m.plot(plot_raw=True, apply_link=False, samples=3)
+    np.random.seed(111)
+    m.plot(plot_raw=True, apply_link=True, samples=3)
+    for do_test in _image_comparison(
+        baseline_images=[
+            "sparse_gp_class_{}".format(sub)
+            for sub in ["likelihood", "raw", "raw_link"]
+        ],
+        extensions=extensions,
+        rtol=2,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_gplvm():
+    from GPy.models import GPLVM
+
+    np.random.seed(12345)
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    # Q = 3
+    # Define dataset
+    # N = 60
+    # k1 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,10,10,0.1,0.1]), ARD=True)
+    # k2 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,0.1,10,0.1,10]), ARD=True)
+    # k3 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[0.1,0.1,10,10,10]), ARD=True)
+    # X = np.random.normal(0, 1, (N, 5))
+    # A = np.random.multivariate_normal(np.zeros(N), k1.K(X), Q).T
+    # B = np.random.multivariate_normal(np.zeros(N), k2.K(X), Q).T
+    # C = np.random.multivariate_normal(np.zeros(N), k3.K(X), Q).T
+    # Y = np.vstack((A,B,C))
+    # labels = np.hstack((np.zeros(A.shape[0]), np.ones(B.shape[0]), np.ones(C.shape[0])*2))
+
+    # k = RBF(Q, ARD=True, lengthscale=2)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
+    pars = np.load(os.path.join(basedir, "b-gplvm-save.npz"))
+    Y = pars["Y"]
+    Q = pars["Q"]
+    labels = pars["labels"]
+
+    import warnings
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")  # always print
+        m = GPLVM(Y, Q, initialize=False)
+    m.update_model(False)
+    m.initialize_parameter()
+    m[:] = pars["gplvm_p"]
+    m.update_model(True)
+
+    # m.optimize(messages=0)
+    np.random.seed(111)
+    m.plot_latent(labels=labels)
+    np.random.seed(111)
+    m.plot_scatter(projection="3d", labels=labels)
+    np.random.seed(111)
+    m.plot_magnification(labels=labels)
+    m.plot_steepest_gradient_map(resolution=10, data_labels=labels)
+    for do_test in _image_comparison(
+        baseline_images=[
+            "gplvm_{}".format(sub)
+            for sub in ["latent", "latent_3d", "magnification", "gradient"]
+        ],
+        extensions=extensions,
+        tol=12,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None,
+    reason="Matplotlib not installed or baseline images missing",
+)
+def test_bayesian_gplvm():
+    """Plot a BayesianGPLVM restored from ``b-gplvm-save.npz`` and yield the image comparisons."""
+    # absolute import, consistent with test_gplvm; a relative import only
+    # works when this module is imported as part of the GPy package
+    from GPy.models import BayesianGPLVM
+
+    np.random.seed(12345)
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    # Q = 3
+    # Define dataset
+    # N = 10
+    # k1 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,10,10,0.1,0.1]), ARD=True)
+    # k2 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,0.1,10,0.1,10]), ARD=True)
+    # k3 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[0.1,0.1,10,10,10]), ARD=True)
+    # X = np.random.normal(0, 1, (N, 5))
+    # A = np.random.multivariate_normal(np.zeros(N), k1.K(X), Q).T
+    # B = np.random.multivariate_normal(np.zeros(N), k2.K(X), Q).T
+    # C = np.random.multivariate_normal(np.zeros(N), k3.K(X), Q).T
+
+    # Y = np.vstack((A,B,C))
+    # labels = np.hstack((np.zeros(A.shape[0]), np.ones(B.shape[0]), np.ones(C.shape[0])*2))
+
+    # k = RBF(Q, ARD=True, lengthscale=2)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
+    pars = np.load(os.path.join(basedir, "b-gplvm-save.npz"))
+    Y = pars["Y"]
+    Q = pars["Q"]
+    labels = pars["labels"]
+
+    import warnings
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")  # always print
+        m = BayesianGPLVM(Y, Q, initialize=False)
+    # load the stored parameters with model updates switched off
+    m.update_model(False)
+    m.initialize_parameter()
+    m[:] = pars["bgplvm_p"]
+    m.update_model(True)
+
+    # m.optimize(messages=0)
+    # reseed before every plot call
+    np.random.seed(111)
+    m.plot_inducing(projection="2d")
+    np.random.seed(111)
+    m.plot_inducing(projection="3d")
+    np.random.seed(111)
+    m.plot_latent(projection="2d", labels=labels)
+    np.random.seed(111)
+    m.plot_scatter(projection="3d", labels=labels)
+    np.random.seed(111)
+    m.plot_magnification(labels=labels)
+    np.random.seed(111)
+    m.plot_steepest_gradient_map(resolution=10, data_labels=labels)
+    for do_test in _image_comparison(
+        baseline_images=[
+            "bayesian_gplvm_{}".format(sub)
+            for sub in [
+                "inducing",
+                "inducing_3d",
+                "latent",
+                "latent_3d",
+                "magnification",
+                "gradient",
+            ]
+        ],
+        extensions=extensions,
+    ):
+        yield (do_test,)
--- a/GPy/testing/prior_tests.py
+++ b/GPy/testing/prior_tests.py
@ -1,138 +1,142 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import unittest
+import pytest
 import numpy as np
 import GPy

-class PriorTests(unittest.TestCase):
+
+class TestPrior:
    def test_studentT(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        studentT = GPy.priors.StudentT(1, 2, 4)
-        
+
        m = GPy.models.SparseGPRegression(X, y)
        m.Z.set_prior(studentT)

        # setting a StudentT prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, studentT)
-        
+
+        with pytest.raises(AssertionError):
+            m.rbf.set_prior(studentT)
+
        # The gradients need to be checked
-        self.assertTrue(m.checkgrad())
-        
+        assert m.checkgrad()
+
        # Check the singleton pattern:
-        self.assertIs(studentT, GPy.priors.StudentT(1,2,4))
-        self.assertIsNot(studentT, GPy.priors.StudentT(2,2,4))
-    
+        assert studentT is GPy.priors.StudentT(1, 2, 4)
+        assert studentT is not GPy.priors.StudentT(2, 2, 4)
+
    def test_lognormal(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        lognormal = GPy.priors.LogGaussian(1, 2)
        m.rbf.set_prior(lognormal)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_Gamma(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        Gamma = GPy.priors.Gamma(1, 1)
        m.rbf.set_prior(Gamma)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_InverseGamma(self):
        # Test that this prior object can be instantiated and performs its basic functions
        # in integration.
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        InverseGamma = GPy.priors.InverseGamma(1, 1)
        m.rbf.set_prior(InverseGamma)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_incompatibility(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        gaussian = GPy.priors.Gaussian(1, 1)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, gaussian)
+        with pytest.raises(AssertionError):
+            m.rbf.set_prior(gaussian)

    def test_set_prior(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)

        gaussian = GPy.priors.Gaussian(1, 1)
-        #m.rbf.set_prior(gaussian)
+        # m.rbf.set_prior(gaussian)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, gaussian)
+        with pytest.raises(AssertionError):
+            m.rbf.set_prior(gaussian)

    def test_uniform(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.SparseGPRegression(X, y)
        uniform = GPy.priors.Uniform(0, 2)
        m.rbf.set_prior(uniform)
        m.randomize()
-        self.assertTrue(m.checkgrad())
-        
+        assert m.checkgrad()
+
        m.Z.set_prior(uniform)
        m.randomize()
-        self.assertTrue(m.checkgrad())
-        
+        assert m.checkgrad()
+
        m.Z.unconstrain()
        uniform = GPy.priors.Uniform(-1, 10)
        m.Z.set_prior(uniform)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

        m.Z.constrain_negative()
        uniform = GPy.priors.Uniform(-1, 0)
        m.Z.set_prior(uniform)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_set_gaussian_for_reals(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.SparseGPRegression(X, y)

@ -140,16 +144,15 @@ class PriorTests(unittest.TestCase):
        m.Z.set_prior(gaussian)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        #self.assertRaises(AssertionError, m.Z.set_prior, gaussian)
-        self.assertTrue(m.checkgrad())
-
+        # self.assertRaises(AssertionError, m.Z.set_prior, gaussian)
+        assert m.checkgrad()

    def test_fixed_domain_check(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)

@ -157,14 +160,15 @@ class PriorTests(unittest.TestCase):
        gaussian = GPy.priors.Gaussian(1, 1)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, gaussian)
+        with pytest.raises(AssertionError):
+            m.rbf.set_prior(gaussian)

    def test_fixed_domain_check1(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)

@ -172,8 +176,5 @@ class PriorTests(unittest.TestCase):
        gaussian = GPy.priors.Gaussian(1, 1)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, gaussian)
-
-if __name__ == "__main__":
-    print("Running unit tests, please be (very) patient...")
-    unittest.main()
+        with pytest.raises(AssertionError):
+            m.rbf.set_prior(gaussian)
--- a/GPy/testing/quadrature_tests.py
+++ b/GPy/testing/quadrature_tests.py
@ -1,23 +1,19 @@
 from __future__ import print_function, division
 import numpy as np
-import GPy
-import warnings
-from  ..util.quad_integrate import quadgk_int, quadvgk
+from ..util.quad_integrate import quadgk_int, quadvgk


-
-class QuadTests(np.testing.TestCase):
+class TestQuad:
    """
    test file for checking implementation of gaussian-kronrod quadrature.
    we will take a function which can be integrated analytically and check if quadgk result is similar or not!
    through this file we can test how numerically accurate quadrature implementation in native numpy or manual code is.
    """
-    def setUp(self):
-        pass

    def test_infinite_quad(self):
        def f(x):
-            return np.exp(-0.5*x**2)*np.power(x,np.arange(3)[:,None])
+            return np.exp(-0.5 * x**2) * np.power(x, np.arange(3)[:, None])
+
        quad_int_val = quadgk_int(f)
        real_val = np.sqrt(np.pi * 2)
        np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
@ -25,15 +21,18 @@ class QuadTests(np.testing.TestCase):
    def test_finite_quad(self):
        def f2(x):
            return x**2
-        quad_int_val = quadvgk(f2, 1.,2.)
-        real_val = 7/3.
+
+        quad_int_val = quadvgk(f2, 1.0, 2.0)
+        real_val = 7 / 3.0
        np.testing.assert_almost_equal(real_val, quad_int_val, decimal=5)

-if __name__ == '__main__':
+
+if __name__ == "__main__":
+
    def f(x):
-        return np.exp(-0.5 * x ** 2) * np.power(x, np.arange(3)[:, None])
+        return np.exp(-0.5 * x**2) * np.power(x, np.arange(3)[:, None])

    quad_int_val = quadgk_int(f)
-    real_val = np.sqrt(np.pi*2)
+    real_val = np.sqrt(np.pi * 2)
    np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
    print(quadgk_int(f))
--- a/GPy/testing/test_rv_transformation.py
+++ b/GPy/testing/test_rv_transformation.py
@ -0,0 +1,84 @@
+# Written by Ilias Bilionis
+"""
+Test if hyperparameters in models are properly transformed.
+"""
+
+import pytest
+import numpy as np
+import scipy.stats as st
+import GPy
+
+
+class TestModel(GPy.core.Model):
+    """
+    A simple GPy model with one parameter.
+
+    This is a helper for the tests in this module, not a test case itself.
+    Its log-likelihood is the constant 0.0.
+    """
+
+    # The class name starts with "Test", so pytest would try to collect it and
+    # warn because it defines __init__; opt out of collection explicitly.
+    __test__ = False
+
+    def __init__(self, theta=1.0):
+        """Create the model and link a single parameter named ``theta``.
+
+        :param theta: initial value handed to ``GPy.core.Param``.
+        """
+        super(TestModel, self).__init__("test_model")
+        self.link_parameter(GPy.core.Param("theta", theta))
+
+    def log_likelihood(self):
+        """Return a constant log-likelihood of 0.0."""
+        return 0.0
+
+class TestRVTransformation:
+    """Check priors and gradients of a parameter under GPy transformations."""
+
+    def _test_trans(self, trans):
+        """Compare the model's density over the transformed parameter with samples.
+
+        A ``LogGaussian(0.5, 0.1)`` prior is placed on ``theta`` and ``theta``
+        is constrained with ``trans``. ``exp(-objective)`` evaluated on a grid
+        of transformed values is compared against a kernel density estimate of
+        prior samples mapped through ``trans.finv``.
+
+        :param trans: the GPy transformation under test.
+        """
+        # Seed as _test_grad does, so the sampled comparison is reproducible
+        # (assumes prior.rvs draws from NumPy's global RNG).
+        np.random.seed(1234)
+        m = TestModel()
+        prior = GPy.priors.LogGaussian(0.5, 0.1)
+        m.theta.set_prior(prior)
+        m.theta.unconstrain()
+        m.theta.constrain(trans)
+
+        def p_phi(phi):
+            # The PDF of the transformed variables
+            return np.exp(-m._objective_grads(phi)[0])
+
+        # To the empirical PDF of:
+        theta_s = prior.rvs(int(1e5))
+        phi_s = trans.finv(theta_s)
+        # which is essentially a kernel density estimation
+        kde = st.gaussian_kde(phi_s)
+        # We will compare the PDF here:
+        phi = np.linspace(phi_s.min(), phi_s.max(), 100)
+        # The transformed PDF of phi should be this:
+        pdf_phi = np.array([p_phi(p) for p in phi])
+        # UNCOMMENT TO SEE GRAPHICAL COMPARISON
+        # import matplotlib.pyplot as plt
+        # fig, ax = plt.subplots()
+        # ax.hist(phi_s, density=True, bins=100, alpha=0.25, label='Histogram')
+        # ax.plot(phi, kde(phi), '--', linewidth=2, label='Kernel Density Estimation')
+        # ax.plot(phi, pdf_phi, ':', linewidth=2, label='Transformed PDF')
+        # ax.set_xlabel(r'transformed $\theta$', fontsize=16)
+        # ax.set_ylabel('PDF', fontsize=16)
+        # plt.legend(loc='best')
+        # plt.show(block=True)
+        # END OF PLOT
+        # The following test cannot be very accurate
+        kde_phi = kde(phi)
+        assert np.linalg.norm(pdf_phi - kde_phi) / np.linalg.norm(kde_phi) <= 1e-1
+
+    def _test_grad(self, trans):
+        """Run the model's gradient check with ``theta`` constrained by ``trans``.
+
+        :param trans: the GPy transformation under test.
+        """
+        np.random.seed(1234)
+        m = TestModel(np.random.uniform(0.5, 1.5, 20))
+        prior = GPy.priors.LogGaussian(0.5, 0.1)
+        m.theta.set_prior(prior)
+        m.theta.constrain(trans)
+        m.randomize()
+        print(m)
+        assert m.checkgrad(1)
+
+    def test_Logexp(self):
+        """Density comparison under the Logexp transformation."""
+        self._test_trans(GPy.constraints.Logexp())
+
+    @pytest.mark.skip(
+        reason="Gradient not checking right, @jameshensman what is going on here?"
+    )
+    def test_Logexp_grad(self):
+        """Gradient check under the Logexp transformation (currently skipped)."""
+        self._test_grad(GPy.constraints.Logexp())
+
+    def test_Exponent(self):
+        """Density comparison under the Exponent transformation."""
+        self._test_trans(GPy.constraints.Exponent())
+
+    @pytest.mark.skip(
+        reason="Gradient not checking right, @jameshensman what is going on here?"
+    )
+    def test_Exponent_grad(self):
+        """Gradient check under the Exponent transformation (currently skipped)."""
+        self._test_grad(GPy.constraints.Exponent())
--- a/GPy/testing/test_serialization.py
+++ b/GPy/testing/test_serialization.py
@ -0,0 +1,440 @@
+"""
+Created on 20 April 2017
+
+@author: pgmoren
+"""
+import numpy as np
+import GPy
+import os
+
+fixed_seed = 11
+
+
+class TestSerialization:
+    def test_serialize_deserialize_kernels(self):
+        """Round-trip single and composite kernels through to_dict/from_dict.
+
+        Every restored kernel must have the same type, parameter values and
+        active dimensions as the kernel it was serialized from.
+        """
+        rbf = GPy.kern.RBF(2, variance=1.0, lengthscale=[1.0, 1.0], ARD=True)
+        rat_quad = GPy.kern.RatQuad(
+            2, variance=2.0, lengthscale=1.0, power=2.0, active_dims=[0, 1]
+        )
+        bias = GPy.kern.Bias(2, variance=2.0, active_dims=[1, 0])
+        periodic = GPy.kern.StdPeriodic(
+            2, variance=2.0, lengthscale=1.0, period=1.0, active_dims=[1, 1]
+        )
+        linear = GPy.kern.Linear(
+            2, variances=[2.0, 1.0], ARD=True, active_dims=[1, 1]
+        )
+        exponential = GPy.kern.Exponential(2, variance=1.0, lengthscale=2)
+        matern32 = GPy.kern.Matern32(
+            2, variance=1.0, lengthscale=[1.0, 3.0], ARD=True, active_dims=[1, 1]
+        )
+        matern52 = GPy.kern.Matern52(
+            2, variance=2.0, lengthscale=[2.0, 1.0], ARD=True, active_dims=[1, 0]
+        )
+        exp_quad = GPy.kern.ExpQuad(
+            2, variance=3.0, lengthscale=[1.0, 2.0], ARD=True, active_dims=[0, 1]
+        )
+        ou = GPy.kern.OU(
+            2, variance=2.0, lengthscale=[2.0, 1.0], ARD=True, active_dims=[1, 0]
+        )
+
+        # Single kernels first, then sums, products and nested combinations.
+        kernels = [
+            rbf,
+            rat_quad,
+            bias,
+            periodic,
+            linear,
+            exponential,
+            matern32,
+            matern52,
+            exp_quad,
+            ou,
+            rbf + rbf.copy() + rat_quad + bias + periodic + linear + exponential,
+            rbf * rat_quad * rat_quad.copy() * bias * periodic * linear,
+            (rbf + rat_quad) * (bias + periodic + linear),
+            ((rbf + rat_quad) * bias) + periodic + linear * matern32,
+            ((rbf + rat_quad) * bias) + periodic * linear + matern52 * ou,
+            ((rbf * rat_quad) * bias) + periodic * linear + matern52 + exp_quad,
+        ]
+
+        for original in kernels:
+            restored = GPy.kern.Kern.from_dict(original.to_dict())
+            assert type(original) == type(restored)
+            np.testing.assert_array_equal(original[:], restored[:])
+            np.testing.assert_array_equal(
+                np.array(original.active_dims), np.array(restored.active_dims)
+            )
+
+    def test_serialize_deserialize_mappings(self):
+        """Round-trip Identity, Constant and Linear mappings through dicts.
+
+        Parameter values are compared for Constant (``C``) and Linear (``A``);
+        for all three the restored type and the types of ``input_dim`` and
+        ``output_dim`` must match the original.
+        """
+        restore = GPy.core.mapping.Mapping.from_dict
+
+        identity = GPy.mappings.Identity(3, 2)
+
+        constant = GPy.mappings.Constant(3, 2, 1)
+        constant_r = restore(constant.to_dict())
+        np.testing.assert_array_equal(constant.C.values[:], constant_r.C.values[:])
+
+        linear = GPy.mappings.Linear(3, 2)
+        linear_r = restore(linear.to_dict())
+        assert np.all(linear.A == linear_r.A)
+
+        for original in (identity, constant, linear):
+            restored = restore(original.to_dict())
+            assert type(original) == type(restored)
+            assert type(original.input_dim) == type(restored.input_dim)
+            assert type(original.output_dim) == type(restored.output_dim)
+
+    def test_serialize_deserialize_likelihoods(self):
+        """Round-trip a Gaussian and a Bernoulli likelihood through dicts.
+
+        Both restored objects must keep their type; the Gaussian must also
+        keep its variance.
+        """
+        links = GPy.likelihoods.link_functions
+        restore = GPy.likelihoods.likelihood.Likelihood.from_dict
+
+        gaussian = GPy.likelihoods.Gaussian(links.Identity(), variance=3.0)
+        gaussian_r = restore(gaussian.to_dict())
+        bernoulli = GPy.likelihoods.Bernoulli(links.Probit())
+        bernoulli_r = restore(bernoulli.to_dict())
+
+        assert type(gaussian) == type(gaussian_r)
+        assert np.all(gaussian.variance == gaussian_r.variance)
+        assert type(bernoulli) == type(bernoulli_r)
+
+    def test_serialize_deserialize_normalizers(self):
+        """Round-trip a Standardize normalizer and compare type, mean and std."""
+        normalizer = GPy.util.normalizer.Standardize()
+        # Scale by random data so mean/std hold data-dependent values.
+        normalizer.scale_by(np.random.rand(10))
+        state = normalizer.to_dict()
+        restored = GPy.util.normalizer._Norm.from_dict(state)
+        assert type(normalizer) == type(restored)
+        assert np.all(normalizer.mean == restored.mean)
+        assert np.all(normalizer.std == restored.std)
+
+    def test_serialize_deserialize_link_functions(self):
+        """Round-trip the Identity and Probit link functions; types must match."""
+        links = GPy.likelihoods.link_functions
+        for original in (links.Identity(), links.Probit()):
+            restored = links.GPTransformation.from_dict(original.to_dict())
+            assert type(original) == type(restored)
+
+    def test_serialize_deserialize_inference_methods(self):
+        """Round-trip EP, EPDTC and exact Gaussian inference objects via dicts.
+
+        The two EP variants are given hand-filled approximation state before
+        serialization, so the test also checks that this state is restored
+        unchanged alongside the scalar settings.
+        """
+        lfi = GPy.inference.latent_function_inference
+        ep = lfi.expectation_propagation
+        restore = lfi.LatentFunctionInference.from_dict
+        # Scalar settings that must survive the round trip for both EP variants.
+        settings = (
+            "epsilon",
+            "eta",
+            "delta",
+            "always_reset",
+            "max_iters",
+            "ep_mode",
+            "parallel_updates",
+        )
+
+        # --- EP ---
+        e1 = ep.EP(ep_mode="nested")
+        e1.ga_approx_old = ep.gaussianApproximation(
+            np.random.rand(10), np.random.rand(10)
+        )
+        e1._ep_approximation = [
+            ep.posteriorParams(
+                np.random.rand(10), np.random.rand(100).reshape((10, 10))
+            ),
+            ep.gaussianApproximation(np.random.rand(10), np.random.rand(10)),
+            ep.cavityParams(10),
+        ]
+        cavity = e1._ep_approximation[-1]
+        cavity.v = np.random.rand(10)
+        cavity.tau = np.random.rand(10)
+        e1._ep_approximation.append(np.random.rand(10))
+        e1_r = restore(e1.to_dict())
+
+        assert type(e1) == type(e1_r)
+        for name in settings:
+            assert getattr(e1, name) == getattr(e1_r, name)
+
+        old, old_r = e1.ga_approx_old, e1_r.ga_approx_old
+        np.testing.assert_array_equal(old.tau[:], old_r.tau[:])
+        np.testing.assert_array_equal(old.v[:], old_r.v[:])
+        approx, approx_r = e1._ep_approximation, e1_r._ep_approximation
+        np.testing.assert_array_equal(approx[0].mu[:], approx_r[0].mu[:])
+        np.testing.assert_array_equal(approx[0].Sigma[:], approx_r[0].Sigma[:])
+        np.testing.assert_array_equal(approx[1].tau[:], approx_r[1].tau[:])
+        np.testing.assert_array_equal(approx[1].v[:], approx_r[1].v[:])
+        np.testing.assert_array_equal(approx[2].tau[:], approx_r[2].tau[:])
+        np.testing.assert_array_equal(approx[2].v[:], approx_r[2].v[:])
+        np.testing.assert_array_equal(approx[3][:], approx_r[3][:])
+
+        # --- EPDTC ---
+        e2 = ep.EPDTC(ep_mode="nested")
+        e2.ga_approx_old = ep.gaussianApproximation(
+            np.random.rand(10), np.random.rand(10)
+        )
+        e2._ep_approximation = [
+            ep.posteriorParamsDTC(np.random.rand(10), np.random.rand(10)),
+            ep.gaussianApproximation(np.random.rand(10), np.random.rand(10)),
+            100.0,
+        ]
+        e2_r = restore(e2.to_dict())
+
+        assert type(e2) == type(e2_r)
+        for name in settings:
+            assert getattr(e2, name) == getattr(e2_r, name)
+
+        old, old_r = e2.ga_approx_old, e2_r.ga_approx_old
+        np.testing.assert_array_equal(old.tau[:], old_r.tau[:])
+        np.testing.assert_array_equal(old.v[:], old_r.v[:])
+        approx, approx_r = e2._ep_approximation, e2_r._ep_approximation
+        np.testing.assert_array_equal(approx[0].mu[:], approx_r[0].mu[:])
+        np.testing.assert_array_equal(
+            approx[0].Sigma_diag[:], approx_r[0].Sigma_diag[:]
+        )
+        np.testing.assert_array_equal(approx[1].tau[:], approx_r[1].tau[:])
+        np.testing.assert_array_equal(approx[1].v[:], approx_r[1].v[:])
+        assert approx[2] == approx_r[2]
+
+        # --- exact Gaussian inference: only the type is checked ---
+        e3 = lfi.exact_gaussian_inference.ExactGaussianInference()
+        e3_r = restore(e3.to_dict())
+
+        assert type(e3) == type(e3_r)
+
+    def test_serialize_deserialize_GP(self):
+        """Save and reload a GP classifier, with and without its training data.
+
+        Both reloaded models must reproduce the first output of ``predict``
+        of the fitted model exactly.
+        """
+        import tempfile
+
+        np.random.seed(fixed_seed)
+        N = 20
+        Nhalf = N // 2
+        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[
+            :, None
+        ]
+        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
+        kernel = GPy.kern.RBF(1)
+        likelihood = GPy.likelihoods.Bernoulli()
+        inference_method = (
+            GPy.inference.latent_function_inference.expectation_propagation.EP(
+                ep_mode="nested"
+            )
+        )
+        mean_function = None
+
+        m = GPy.core.GP(
+            X=X,
+            Y=Y,
+            kernel=kernel,
+            likelihood=likelihood,
+            inference_method=inference_method,
+            mean_function=mean_function,
+            normalizer=True,
+            name="gp_classification",
+        )
+        m.optimize()
+
+        # Use a private temporary directory: the files are removed even when
+        # loading fails, and the file names (also used by the SparseGP test)
+        # cannot collide in the working directory.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with_data = os.path.join(tmpdir, "temp_test_gp_with_data.json")
+            without_data = os.path.join(tmpdir, "temp_test_gp_without_data.json")
+            m.save_model(with_data, compress=True, save_data=True)
+            m.save_model(without_data, compress=True, save_data=False)
+            # With compress=True the model is stored as "<name>.zip".
+            m1_r = GPy.core.GP.load_model(with_data + ".zip")
+            m2_r = GPy.core.GP.load_model(without_data + ".zip", (X, Y))
+
+        pred = m.predict(X)[0]
+        pred1_r = m1_r.predict(X)[0]
+        pred2_r = m2_r.predict(X)[0]
+        np.testing.assert_array_equal(
+            np.array(pred).flatten(), np.array(pred1_r).flatten()
+        )
+        np.testing.assert_array_equal(
+            np.array(pred).flatten(), np.array(pred2_r).flatten()
+        )
+
+    def test_serialize_deserialize_SparseGP(self):
+        """Save and reload a sparse GP classifier, with and without its data.
+
+        Both reloaded models must reproduce the first output of ``predict``
+        of the fitted model exactly.
+        """
+        import tempfile
+
+        np.random.seed(fixed_seed)
+        N = 20
+        Nhalf = N // 2
+        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[
+            :, None
+        ]
+        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
+        kernel = GPy.kern.RBF(1)
+        likelihood = GPy.likelihoods.Bernoulli()
+        inference_method = (
+            GPy.inference.latent_function_inference.expectation_propagation.EPDTC(
+                ep_mode="nested"
+            )
+        )
+        mean_function = None
+
+        sm = GPy.core.SparseGP(
+            X=X,
+            Y=Y,
+            Z=X[0:20, :],
+            kernel=kernel,
+            likelihood=likelihood,
+            inference_method=inference_method,
+            mean_function=mean_function,
+            normalizer=True,
+            name="sparse_gp_classification",
+        )
+        sm.optimize()
+
+        # Use a private temporary directory: the files are removed even when
+        # loading fails, and the file names (also used by the GP test) cannot
+        # collide in the working directory.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with_data = os.path.join(tmpdir, "temp_test_gp_with_data.json")
+            without_data = os.path.join(tmpdir, "temp_test_gp_without_data.json")
+            sm.save_model(with_data, compress=True, save_data=True)
+            sm.save_model(without_data, compress=True, save_data=False)
+            # With compress=True the model is stored as "<name>.zip".
+            sm1_r = GPy.core.GP.load_model(with_data + ".zip")
+            sm2_r = GPy.core.GP.load_model(without_data + ".zip", (X, Y))
+
+        pred = sm.predict(X)[0]
+        pred1_r = sm1_r.predict(X)[0]
+        pred2_r = sm2_r.predict(X)[0]
+        np.testing.assert_array_equal(
+            np.array(pred).flatten(), np.array(pred1_r).flatten()
+        )
+        np.testing.assert_array_equal(
+            np.array(pred).flatten(), np.array(pred2_r).flatten()
+        )
+
+    def test_serialize_deserialize_GPRegressor(self):
+        """Save and reload a GPRegression model, with and without its data.
+
+        Both reloaded models must return exactly the same second output of
+        ``predict`` as the fitted model on a large set of random inputs.
+        """
+        import tempfile
+
+        np.random.seed(fixed_seed)
+        N = 50
+        N_new = 50
+        D = 1
+        X = np.random.uniform(-3.0, 3.0, (N, 1))
+        Y = np.sin(X) + np.random.randn(N, D) * 0.05
+        # Not used below, but kept: drawing it advances the seeded RNG stream
+        # that Xp is later sampled from.
+        X_new = np.random.uniform(-3.0, 3.0, (N_new, 1))
+        k = GPy.kern.RBF(input_dim=1, lengthscale=10)
+        m = GPy.models.GPRegression(X, Y, k)
+        m.optimize()
+
+        # Use a private temporary directory so the files are removed even when
+        # loading fails and nothing is left behind in the working directory.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with_data = os.path.join(tmpdir, "temp_test_gp_regressor_with_data.json")
+            without_data = os.path.join(
+                tmpdir, "temp_test_gp_regressor_without_data.json"
+            )
+            m.save_model(with_data, compress=True, save_data=True)
+            m.save_model(without_data, compress=True, save_data=False)
+            # With compress=True the model is stored as "<name>.zip".
+            m1_r = GPy.models.GPRegression.load_model(with_data + ".zip")
+            m2_r = GPy.models.GPRegression.load_model(without_data + ".zip", (X, Y))
+
+        Xp = np.random.uniform(size=(int(1e5), 1))
+        Xp[:, 0] = Xp[:, 0] * 15 - 5
+
+        _, var = m.predict(Xp)
+        _, var1_r = m1_r.predict(Xp)
+        _, var2_r = m2_r.predict(Xp)
+        np.testing.assert_array_equal(var.flatten(), var1_r.flatten())
+        np.testing.assert_array_equal(var.flatten(), var2_r.flatten())
+
+    def test_serialize_deserialize_GPClassification(self):
+        """Save and reload a GPClassification model, with and without its data.
+
+        Both reloaded models must have the original model's type and reproduce
+        the first output of ``predict`` of the fitted model exactly.
+        """
+        import tempfile
+
+        np.random.seed(fixed_seed)
+        N = 50
+        Nhalf = N // 2
+        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[
+            :, None
+        ]
+        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
+        kernel = GPy.kern.RBF(1)
+        m = GPy.models.GPClassification(X, Y, kernel=kernel)
+        m.optimize()
+
+        # Use a private temporary directory so the files are removed even when
+        # loading or a type assertion fails.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with_data = os.path.join(tmpdir, "temp_test_gp_classifier_with_data.json")
+            without_data = os.path.join(
+                tmpdir, "temp_test_gp_classifier_without_data.json"
+            )
+            m.save_model(with_data, compress=True, save_data=True)
+            m.save_model(without_data, compress=True, save_data=False)
+            # With compress=True the model is stored as "<name>.zip".
+            m1_r = GPy.models.GPClassification.load_model(with_data + ".zip")
+            m2_r = GPy.models.GPClassification.load_model(
+                without_data + ".zip", (X, Y)
+            )
+
+        assert type(m) == type(
+            m1_r
+        ), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m1_r))
+        assert type(m) == type(
+            m2_r
+        ), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m2_r))
+
+        pred = m.predict(X)[0]
+        pred1_r = m1_r.predict(X)[0]
+        pred2_r = m2_r.predict(X)[0]
+        np.testing.assert_array_equal(
+            np.array(pred).flatten(), np.array(pred1_r).flatten()
+        )
+        # The model loaded without data must be checked too; the previous
+        # version compared against the first reloaded model twice.
+        np.testing.assert_array_equal(
+            np.array(pred).flatten(), np.array(pred2_r).flatten()
+        )
+
+    def test_serialize_deserialize_SparseGPClassification(self):
+        """Save and reload a SparseGPClassification model, with and without data.
+
+        Both reloaded models must have the original model's type and reproduce
+        the first output of ``predict`` of the fitted model exactly.
+        """
+        import tempfile
+
+        np.random.seed(fixed_seed)
+        N = 50
+        Nhalf = N // 2
+        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[
+            :, None
+        ]
+        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
+        kernel = GPy.kern.RBF(1)
+        m = GPy.models.SparseGPClassification(X, Y, num_inducing=3, kernel=kernel)
+        m.optimize()
+
+        # Use a private temporary directory so the files are removed even when
+        # loading or a type assertion fails.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with_data = os.path.join(
+                tmpdir, "temp_test_sparse_gp_classifier_with_data.json"
+            )
+            without_data = os.path.join(
+                tmpdir, "temp_test_sparse_gp_classifier_without_data.json"
+            )
+            m.save_model(with_data, compress=True, save_data=True)
+            m.save_model(without_data, compress=True, save_data=False)
+            # With compress=True the model is stored as "<name>.zip".
+            m1_r = GPy.models.SparseGPClassification.load_model(with_data + ".zip")
+            m2_r = GPy.models.SparseGPClassification.load_model(
+                without_data + ".zip", (X, Y)
+            )
+
+        assert type(m) == type(
+            m1_r
+        ), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m1_r))
+        assert type(m) == type(
+            m2_r
+        ), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m2_r))
+
+        pred = m.predict(X)[0]
+        pred1_r = m1_r.predict(X)[0]
+        pred2_r = m2_r.predict(X)[0]
+        np.testing.assert_array_equal(
+            np.array(pred).flatten(), np.array(pred1_r).flatten()
+        )
+        # The model loaded without data must be checked too; the previous
+        # version compared against the first reloaded model twice.
+        np.testing.assert_array_equal(
+            np.array(pred).flatten(), np.array(pred2_r).flatten()
+        )
--- a/GPy/testing/test_svgp.py
+++ b/GPy/testing/test_svgp.py
@ -0,0 +1,63 @@
+import numpy as np
+import GPy
+
+
+class TestSVGP_nonconvex:
+    """
+    Gradient check for an SVGP with a Student-T likelihood on a noisy sine with one outlier
+    """
+
+    def setup(self):
+        X = np.linspace(0, 10, 100).reshape(-1, 1)  # 100 evenly spaced inputs as a column
+        Z = np.linspace(0, 10, 10).reshape(-1, 1)  # 10 evenly spaced points passed to SVGP as Z
+        Y = np.sin(X) + np.random.randn(*X.shape) * 0.1  # noisy sine; the RNG is not seeded here
+        Y[50] += 3  # shift a single observation to act as an outlier
+
+        lik = GPy.likelihoods.StudentT(deg_free=2)
+        k = GPy.kern.RBF(1, lengthscale=5.0) + GPy.kern.White(1, 1e-6)
+        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
+
+    def test_grad(self):
+        self.setup()  # called explicitly; the test does not rely on a runner hook
+        assert self.m.checkgrad(step=1e-4)  # checkgrad must report success
+
+
+class TestSVGP_classification:
+    """
+    Gradient check for an SVGP with a Bernoulli likelihood on thresholded noisy-sine labels
+    """
+
+    def setup(self):
+        X = np.linspace(0, 10, 100).reshape(-1, 1)  # 100 evenly spaced inputs as a column
+        Z = np.linspace(0, 10, 10).reshape(-1, 1)  # 10 evenly spaced points passed to SVGP as Z
+        Y = np.where((np.sin(X) + np.random.randn(*X.shape) * 0.1) > 0, 1, 0)  # 0/1 labels
+
+        lik = GPy.likelihoods.Bernoulli()
+        k = GPy.kern.RBF(1, lengthscale=5.0) + GPy.kern.White(1, 1e-6)
+        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
+
+    def test_grad(self):
+        self.setup()  # called explicitly; the test does not rely on a runner hook
+        assert self.m.checkgrad(step=1e-4)  # checkgrad must report success
+
+
+class TestSVGP_Poisson_with_meanfunction:
+    """
+    Inference in the SVGP with a Poisson likelihood and a linear mean function
+    """
+
+    def setup(self):
+        X = np.linspace(0, 10, 100).reshape(-1, 1)  # 100 evenly spaced inputs as a column
+        Z = np.linspace(0, 10, 10).reshape(-1, 1)  # 10 evenly spaced points passed to SVGP as Z
+        latent_f = np.exp(0.1 * X * 0.05 * X**2)  # positive rate; NOTE(review): product, not sum - confirm intent
+        Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1, 1)
+
+        mf = GPy.mappings.Linear(1, 1)
+
+        lik = GPy.likelihoods.Poisson()
+        k = GPy.kern.RBF(1, lengthscale=5.0) + GPy.kern.White(1, 1e-6)
+        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
+
+    def test_grad(self):
+        self.setup()  # called explicitly; the test does not rely on a runner hook
+        assert self.m.checkgrad(step=1e-4)  # checkgrad must report success
--- a/GPy/testing/tp_tests.py
+++ b/GPy/testing/tp_tests.py
@ -1,29 +1,30 @@
-'''
+"""
 Created on 14 Jul 2017, based on gp_tests

@author: javdrher
-'''
-import unittest
-import numpy as np, GPy
+"""
+import numpy as np
+import GPy


-class Test(unittest.TestCase):
-    def setUp(self):
+class TestTP:
+    def setup(self):
        np.random.seed(12345)
        self.N = 20
        self.N_new = 50
        self.D = 1
-        self.X = np.random.uniform(-3., 3., (self.N, 1))
+        self.X = np.random.uniform(-3.0, 3.0, (self.N, 1))
        self.Y = np.sin(self.X) + np.random.randn(self.N, self.D) * 0.05
-        self.X_new = np.random.uniform(-3., 3., (self.N_new, 1))
+        self.X_new = np.random.uniform(-3.0, 3.0, (self.N_new, 1))

    def test_setxy_gp(self):
+        self.setup()
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        m = GPy.models.TPRegression(self.X, self.Y, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X.copy()
        m.set_XY(m.X[:10], m.Y[:10])
-        assert (m.checkgrad(tolerance=1e-2))
+        assert m.checkgrad(tolerance=1e-2)
        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)
        np.testing.assert_allclose(mu, mu2)
@ -33,10 +34,12 @@ class Test(unittest.TestCase):
        from GPy.core.parameterization.param import Param
        from GPy.core.mapping import Mapping

+        self.setup()
+
        class Parabola(Mapping):
-            def __init__(self, variance, degree=2, name='parabola'):
+            def __init__(self, variance, degree=2, name="parabola"):
                super(Parabola, self).__init__(1, 1, name)
-                self.variance = Param('variance', np.ones(degree + 1) * variance)
+                self.variance = Param("variance", np.ones(degree + 1) * variance)
                self.degree = degree
                self.link_parameter(self.variance)

@ -59,21 +62,28 @@ class Test(unittest.TestCase):
        X = np.linspace(-2, 2, 100)[:, None]
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        k.randomize()
-        p = Parabola(.3)
+        p = Parabola(0.3)
        p.randomize()
-        Y = p.f(X) + np.random.multivariate_normal(np.zeros(X.shape[0]), k.K(X) + np.eye(X.shape[0]) * 1e-8)[:,
-                     None] + np.random.normal(0, .1, (X.shape[0], 1))
+        Y = (
+            p.f(X)
+            + np.random.multivariate_normal(
+                np.zeros(X.shape[0]), k.K(X) + np.eye(X.shape[0]) * 1e-8
+            )[:, None]
+            + np.random.normal(0, 0.1, (X.shape[0], 1))
+        )
        m = GPy.models.TPRegression(X, Y, kernel=k, mean_function=p)
-        assert (m.checkgrad(tolerance=2e-1))
+        assert m.checkgrad(tolerance=2e-1)
        _ = m.predict(m.X)

    def test_normalizer(self):
+        self.setup()
+
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        Y = self.Y
        mu, std = Y.mean(0), Y.std(0)
        m = GPy.models.TPRegression(self.X, Y, kernel=k, normalizer=True)
        m.optimize()
-        assert (m.checkgrad())
+        assert m.checkgrad()
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        m2 = GPy.models.TPRegression(self.X, (Y - mu) / std, kernel=k, normalizer=False)
        m2[:] = m[:]
@ -81,13 +91,13 @@ class Test(unittest.TestCase):
        mu1, var1 = m.predict(m.X, full_cov=True)
        mu2, var2 = m2.predict(m2.X, full_cov=True)
        np.testing.assert_allclose(mu1, (mu2 * std) + mu)
-        np.testing.assert_allclose(var1, var2 * std ** 2)
+        np.testing.assert_allclose(var1, var2 * std**2)

        mu1, var1 = m.predict(m.X, full_cov=False)
        mu2, var2 = m2.predict(m2.X, full_cov=False)

        np.testing.assert_allclose(mu1, (mu2 * std) + mu)
-        np.testing.assert_allclose(var1, var2 * std ** 2)
+        np.testing.assert_allclose(var1, var2 * std**2)

        q50n = m.predict_quantiles(m.X, (50,))
        q50 = m2.predict_quantiles(m2.X, (50,))
@ -102,10 +112,15 @@ class Test(unittest.TestCase):
        q95 = m2.predict_quantiles(self.X[[c]], qs)
        mu, var = m2.predict(self.X[[c]])
        from scipy.stats import t
-        np.testing.assert_allclose((mu + (t.ppf(qs / 100., m2.nu + m2.num_data) * np.sqrt(var))).flatten(),
-                                   np.array(q95).flatten())
+
+        np.testing.assert_allclose(
+            (mu + (t.ppf(qs / 100.0, m2.nu + m2.num_data) * np.sqrt(var))).flatten(),
+            np.array(q95).flatten(),
+        )

    def test_predict_equivalence(self):
+        self.setup()
+
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        m = GPy.models.TPRegression(self.X, self.Y, kernel=k)
        m.optimize()
@ -124,10 +139,12 @@ class Test(unittest.TestCase):
        mu3, var3 = m2._raw_predict(m.X)
        np.testing.assert_allclose(mu1, mu2)
        np.testing.assert_allclose(var1, var2)
-        self.assertFalse(np.allclose(mu1, mu3))
-        self.assertFalse(np.allclose(var1, var3))
+        assert not np.allclose(mu1, mu3)
+        assert not np.allclose(var1, var3)

    def test_gp_equivalence(self):
+        self.setup()
+
        k = GPy.kern.RBF(1)
        m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
        m.optimize()
@ -139,7 +156,3 @@ class Test(unittest.TestCase):
        mu2, var2 = m2.predict(self.X)
        np.testing.assert_allclose(mu1, mu2)
        np.testing.assert_allclose(var1, var2)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/GPy/testing/test_util.py
+++ b/GPy/testing/test_util.py
@ -0,0 +1,284 @@
+# ===============================================================================
+# Copyright (c) 2016, Max Zwiessele, Alan Saul
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of GPy.testing.util_tests nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ===============================================================================
+
+import numpy as np
+import GPy
+
+
+class TestUtil:  # "Test" prefix is required for pytest's default class collection
+    def test_checkFinite(self):
+        from GPy.util.debug import checkFinite
+
+        array = np.random.normal(0, 1, 100).reshape(25, 4)
+        assert checkFinite(array, name="test")
+
+        array[np.random.binomial(1, 0.3, array.shape).astype(bool)] = np.nan  # inject NaNs at random
+        assert not checkFinite(array)
+
+    def test_checkFullRank(self):
+        from GPy.util.debug import checkFullRank
+        from GPy.util.linalg import tdot
+
+        array = np.random.normal(0, 1, 100).reshape(25, 4)
+        assert not checkFullRank(tdot(array), name="test")
+
+        array = np.random.normal(0, 1, (25, 25))
+        assert checkFullRank(tdot(array))
+
+    def test_fixed_inputs_median(self):
+        """test fixed_inputs convenience function"""
+        from GPy.plotting.matplot_dep.util import fixed_inputs
+        import GPy
+
+        X = np.random.randn(10, 3)
+        Y = np.sin(X) + np.random.randn(10, 3) * 1e-3
+        m = GPy.models.GPRegression(X, Y)
+        fixed = fixed_inputs(m, [1], fix_routine="median", as_list=True, X_all=False)
+        assert (0, np.median(X[:, 0])) in fixed
+        assert (2, np.median(X[:, 2])) in fixed
+        assert (
+            len([t for t in fixed if t[0] == 1]) == 0
+        )  # Unfixed input should not be in fixed
+
+    def test_fixed_inputs_mean(self):
+        from GPy.plotting.matplot_dep.util import fixed_inputs
+        import GPy
+
+        X = np.random.randn(10, 3)
+        Y = np.sin(X) + np.random.randn(10, 3) * 1e-3
+        m = GPy.models.GPRegression(X, Y)
+        fixed = fixed_inputs(m, [1], fix_routine="mean", as_list=True, X_all=False)
+        assert (0, np.mean(X[:, 0])) in fixed
+        assert (2, np.mean(X[:, 2])) in fixed
+        assert (
+            len([t for t in fixed if t[0] == 1]) == 0
+        )  # Unfixed input should not be in fixed
+
+    def test_fixed_inputs_zero(self):
+        from GPy.plotting.matplot_dep.util import fixed_inputs
+        import GPy
+
+        X = np.random.randn(10, 3)
+        Y = np.sin(X) + np.random.randn(10, 3) * 1e-3
+        m = GPy.models.GPRegression(X, Y)
+        fixed = fixed_inputs(m, [1], fix_routine="zero", as_list=True, X_all=False)
+        assert (0, 0.0) in fixed
+        assert (2, 0.0) in fixed
+        assert (
+            len([t for t in fixed if t[0] == 1]) == 0
+        )  # Unfixed input should not be in fixed
+
+    def test_fixed_inputs_uncertain(self):
+        from GPy.plotting.matplot_dep.util import fixed_inputs
+        import GPy
+        from GPy.core.parameterization.variational import NormalPosterior
+
+        X_mu = np.random.randn(10, 3)
+        X_var = np.random.randn(10, 3)
+        X = NormalPosterior(X_mu, X_var)
+        Y = np.sin(X_mu) + np.random.randn(10, 3) * 1e-3
+        m = GPy.models.BayesianGPLVM(Y, X=X_mu, X_variance=X_var, input_dim=3)
+        fixed = fixed_inputs(m, [1], fix_routine="median", as_list=True, X_all=False)
+        assert (0, np.median(X.mean.values[:, 0])) in fixed
+        assert (2, np.median(X.mean.values[:, 2])) in fixed
+        assert (
+            len([t for t in fixed if t[0] == 1]) == 0
+        )  # Unfixed input should not be in fixed
+
+    def test_DSYR(self):
+        from GPy.util.linalg import DSYR, DSYR_numpy
+
+        A = np.arange(9.0).reshape(3, 3)
+        A = np.dot(A.T, A)
+        b = np.ones(3, dtype=float)
+        alpha = 1.0
+        DSYR(A, b, alpha)
+        R = np.array([[46, 55, 64], [55, 67, 79], [64, 79, 94]])  # A.T A + alpha * b b^T
+        assert abs(np.sum(A - R)) < 1e-12
+
+    def test_subarray(self):
+        import GPy
+
+        X = np.zeros((3, 6), dtype=bool)
+        X[[1, 1, 1], [0, 4, 5]] = 1
+        X[1:, [2, 3]] = 1
+        d = GPy.util.subarray_and_sorting.common_subarrays(X, axis=1)
+        assert len(d) == 3
+        X[:, d[tuple(X[:, 0])]]
+        assert d[tuple(X[:, 4])] == d[tuple(X[:, 0])] == [0, 4, 5]
+        assert d[tuple(X[:, 1])] == [1]
+
+    def test_offset_cluster(self):
+        # Tests the GPy.util.cluster_with_offset.cluster utility with a small
+        # test data set. Not using random noise just in case it occasionally
+        # causes it not to cluster correctly.
+        # groundtruth cluster identifiers are: [0,1,1,0]
+
+        # data contains a list of the four sets of time series (3 per data point)
+
+        data = [
+            np.array(
+                [
+                    [2.18094245, 1.96529789, 2.00265523, 2.18218742, 2.06795428],
+                    [1.62254829, 1.75748448, 1.83879347, 1.87531326, 1.52503496],
+                    [1.54589609, 1.61607914, 2.00463192, 1.48771394, 1.63339218],
+                ]
+            ),
+            np.array(
+                [
+                    [2.86766106, 2.97953437, 2.91958876, 2.92510506, 3.03239241],
+                    [2.57368423, 2.59954886, 3.10000395, 2.75806125, 2.89865704],
+                    [2.58916318, 2.53698259, 2.63858411, 2.63102504, 2.51853901],
+                ]
+            ),
+            np.array(
+                [
+                    [2.77834168, 2.9618564, 2.88482141, 3.24259745, 2.9716821],
+                    [2.60675576, 2.67095624, 2.94824436, 2.80520631, 2.87247516],
+                    [2.49543562, 2.5492281, 2.6505866, 2.65015308, 2.59738616],
+                ]
+            ),
+            np.array(
+                [
+                    [1.76783086, 2.21666738, 2.07939706, 1.9268263, 2.23360121],
+                    [1.94305547, 1.94648592, 2.1278921, 2.09481457, 2.08575238],
+                    [1.69336013, 1.72285186, 1.6339506, 1.61212022, 1.39198698],
+                ]
+            ),
+        ]
+
+        # inputs contains their associated X values
+
+        inputs = [
+            np.array([[0.0], [0.68040097], [1.20316795], [1.798749], [2.14891733]]),
+            np.array([[0.0], [0.51910637], [0.98259352], [1.57442965], [1.82515098]]),
+            np.array([[0.0], [0.66645478], [1.59464591], [1.69769551], [1.80932752]]),
+            np.array([[0.0], [0.87512108], [1.71881079], [2.67162871], [3.23761907]]),
+        ]
+
+        # try doing the clustering
+        active = GPy.util.cluster_with_offset.cluster(data, inputs)
+        # check to see that the clustering has correctly clustered the time series.
+        clusters = set([frozenset(cluster) for cluster in active])
+        assert set([1, 2]) in clusters, "Offset Clustering algorithm failed"
+        assert set([0, 3]) in clusters, "Offset Clustering algoirthm failed"
+
+
+class TestUnivariateGaussian:
+    def setup(self):
+        self.zz = [-5.0, -0.8, 0.0, 0.5, 2.0, 10.0]  # evaluation points shared by all tests
+
+    def test_logPdfNormal(self):
+        from GPy.util.univariate_Gaussian import logPdfNormal
+
+        self.setup()
+
+        expected = [
+            -13.4189385332,
+            -1.2389385332,
+            -0.918938533205,
+            -1.0439385332,
+            -2.9189385332,
+            -50.9189385332,
+        ]
+        total_err = 0.0
+        for z, ref in zip(self.zz, expected):
+            total_err += abs(logPdfNormal(z) - ref)
+        assert total_err < 1e-10
+
+    def test_cdfNormal(self):
+        from GPy.util.univariate_Gaussian import cdfNormal
+
+        self.setup()
+
+        expected = [
+            2.86651571879e-07,
+            0.211855398583,
+            0.5,
+            0.691462461274,
+            0.977249868052,
+            1.0,
+        ]
+        total_err = 0.0
+        for z, ref in zip(self.zz, expected):
+            total_err += abs(cdfNormal(z) - ref)
+        assert total_err < 1e-10
+
+    def test_logCdfNormal(self):
+        from GPy.util.univariate_Gaussian import logCdfNormal
+
+        self.setup()
+
+        expected = [
+            -15.064998394,
+            -1.55185131919,
+            -0.69314718056,
+            -0.368946415289,
+            -0.023012909329,
+            0.0,
+        ]
+        total_err = 0.0
+        for z, ref in zip(self.zz, expected):
+            total_err += abs(logCdfNormal(z) - ref)
+        assert total_err < 1e-10
+
+    def test_derivLogCdfNormal(self):
+        from GPy.util.univariate_Gaussian import derivLogCdfNormal
+
+        self.setup()
+
+        expected = [
+            5.18650396941,
+            1.3674022693,
+            0.79788456081,
+            0.50916043387,
+            0.0552478626962,
+            0.0,
+        ]
+        total_err = 0.0
+        for z, ref in zip(self.zz, expected):
+            total_err += abs(derivLogCdfNormal(z) - ref)
+        assert total_err < 1e-8  # looser tolerance than the other three checks
+
+
+class TestStandardize:
+    def setup(self):
+        self.normalizer = GPy.util.normalizer.Standardize()
+        y = np.stack([np.random.randn(10), 2 * np.random.randn(10)], axis=1)  # shape (10, 2)
+        self.normalizer.scale_by(y)  # presumably fits the scaling statistics on y - confirm
+
+    def test_inverse_covariance(self):
+        """
+        inverse_covariance of a (100, 100) input must have shape (100, 100, 2) after scale_by on 2 columns
+        """
+        self.setup()  # called explicitly; the test does not rely on a runner hook
+        covariance = np.random.rand(100, 100)
+        output = self.normalizer.inverse_covariance(covariance)
+        assert output.shape == (100, 100, 2)
--- a/GPy/testing/variational_tests.py
+++ b/GPy/testing/variational_tests.py
@ -1,4 +1,4 @@
-'''
+"""
 Copyright (c) 2015, Max Zwiessele
 All rights reserved.

@ -26,38 +26,35 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-'''
-import unittest
+"""
 import GPy, numpy as np

-class KLGrad(GPy.core.Model):
-            def __init__(self, Xvar, kl):   
-                super(KLGrad, self).__init__(name="klgrad")     
-                self.kl = kl
-                self.link_parameter(Xvar)
-                self.Xvar = Xvar
-                self._obj = 0
-            def parameters_changed(self):
-                self.Xvar.gradient[:] = 0
-                self.kl.update_gradients_KL(self.Xvar)
-                self._obj = self.kl.KL_divergence(self.Xvar)
-            def objective_function(self):
-                return self._obj
-        
-class Test(unittest.TestCase):

-    def setUp(self):
+class KLGrad(GPy.core.Model):  # minimal model whose objective is kl.KL_divergence(Xvar)
+    def __init__(self, Xvar, kl):
+        super(KLGrad, self).__init__(name="klgrad")
+        self.kl = kl  # object providing KL_divergence and update_gradients_KL
+        self.link_parameter(Xvar)  # register Xvar as a parameter of this model
+        self.Xvar = Xvar
+        self._obj = 0  # cached objective value, refreshed in parameters_changed
+
+    def parameters_changed(self):
+        self.Xvar.gradient[:] = 0  # reset before update_gradients_KL writes the KL gradients
+        self.kl.update_gradients_KL(self.Xvar)
+        self._obj = self.kl.KL_divergence(self.Xvar)
+
+    def objective_function(self):
+        return self._obj  # value cached by the last parameters_changed call
+
+
+class TestVariational:
+    def setup(self):
        np.random.seed(12345)
        self.Xvar = GPy.core.parameterization.variational.NormalPosterior(
-            np.random.uniform(0,1,(10,3)), 
-            np.random.uniform(1e-5,.01, (10,3))
-            )
+            np.random.uniform(0, 1, (10, 3)), np.random.uniform(1e-5, 0.01, (10, 3))
+        )

-
-    def testNormal(self):
+    def test_normal(self):
+        self.setup()
        klgrad = KLGrad(self.Xvar, GPy.core.parameterization.variational.NormalPrior())
        np.testing.assert_(klgrad.checkgrad())
-
-if __name__ == "__main__":
-    #import sys;sys.argv = ['', 'Test.testNormal']
-    unittest.main()
--- a/GPy/testing/todo.md
+++ b/GPy/testing/todo.md
@ -0,0 +1,14 @@
+As of now, I have been through all of the tests once and the basic migration is done.
+
+Now, fix the below things and todos before starting to get the tests running using pytest
+
+
+ update test script names according to pytest conversion
+ check for TODOs
+ + there are many associated with "iscloseto" functions from np.testing. Will have to figure out how these
+ + some tests are not that clear to me tbh
+ check nomenclature of test files and test classes and test functions
+ chatgpt says that I should replace delta with the decimal but a delta of 1e-4 should be decimal=4. Not sure about this yet  but that is something I need to fix later on
+--> this gives more content to it: https://docs.python.org/3/library/unittest.html#unittest.TestCase.assertAlmostEqual
+I need to write a custom function that behaves accordingly as in some cases, np.testing.assert_almost_equal won't be applicable, https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html
+or how about this: `np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)`
--- a/GPy/testing/util_tests.py
+++ b/GPy/testing/util_tests.py
@ -1,242 +0,0 @@
-#===============================================================================
-# Copyright (c) 2016, Max Zwiessele, Alan Saul
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of GPy.testing.util_tests nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
-
-import unittest
-import numpy as np
-import GPy
-
-class TestDebug(unittest.TestCase):
-    def test_checkFinite(self):
-        from GPy.util.debug import checkFinite
-        array = np.random.normal(0, 1, 100).reshape(25,4)
-        self.assertTrue(checkFinite(array, name='test'))
-
-        array[np.random.binomial(1, .3, array.shape).astype(bool)] = np.nan
-        self.assertFalse(checkFinite(array))
-
-    def test_checkFullRank(self):
-        from GPy.util.debug import checkFullRank
-        from GPy.util.linalg import tdot
-        array = np.random.normal(0, 1, 100).reshape(25,4)
-        self.assertFalse(checkFullRank(tdot(array), name='test'))
-
-        array = np.random.normal(0, 1, (25,25))
-        self.assertTrue(checkFullRank(tdot(array)))
-
-    def test_fixed_inputs_median(self):
-        """ test fixed_inputs convenience function """
-        from GPy.plotting.matplot_dep.util import fixed_inputs
-        import GPy
-        X = np.random.randn(10, 3)
-        Y = np.sin(X) + np.random.randn(10, 3)*1e-3
-        m = GPy.models.GPRegression(X, Y)
-        fixed = fixed_inputs(m, [1], fix_routine='median', as_list=True, X_all=False)
-        self.assertTrue((0, np.median(X[:,0])) in fixed)
-        self.assertTrue((2, np.median(X[:,2])) in fixed)
-        self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
-
-    def test_fixed_inputs_mean(self):
-        from GPy.plotting.matplot_dep.util import fixed_inputs
-        import GPy
-        X = np.random.randn(10, 3)
-        Y = np.sin(X) + np.random.randn(10, 3)*1e-3
-        m = GPy.models.GPRegression(X, Y)
-        fixed = fixed_inputs(m, [1], fix_routine='mean', as_list=True, X_all=False)
-        self.assertTrue((0, np.mean(X[:,0])) in fixed)
-        self.assertTrue((2, np.mean(X[:,2])) in fixed)
-        self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
-
-    def test_fixed_inputs_zero(self):
-        from GPy.plotting.matplot_dep.util import fixed_inputs
-        import GPy
-        X = np.random.randn(10, 3)
-        Y = np.sin(X) + np.random.randn(10, 3)*1e-3
-        m = GPy.models.GPRegression(X, Y)
-        fixed = fixed_inputs(m, [1], fix_routine='zero', as_list=True, X_all=False)
-        self.assertTrue((0, 0.0) in fixed)
-        self.assertTrue((2, 0.0) in fixed)
-        self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
-
-    def test_fixed_inputs_uncertain(self):
-        from GPy.plotting.matplot_dep.util import fixed_inputs
-        import GPy
-        from GPy.core.parameterization.variational import NormalPosterior
-        X_mu = np.random.randn(10, 3)
-        X_var = np.random.randn(10, 3)
-        X = NormalPosterior(X_mu, X_var)
-        Y = np.sin(X_mu) + np.random.randn(10, 3)*1e-3
-        m = GPy.models.BayesianGPLVM(Y, X=X_mu, X_variance=X_var, input_dim=3)
-        fixed = fixed_inputs(m, [1], fix_routine='median', as_list=True, X_all=False)
-        self.assertTrue((0, np.median(X.mean.values[:,0])) in fixed)
-        self.assertTrue((2, np.median(X.mean.values[:,2])) in fixed)
-        self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
-
-    def test_DSYR(self):
-        from GPy.util.linalg import DSYR, DSYR_numpy
-        A = np.arange(9.0).reshape(3,3)
-        A = np.dot(A.T, A)
-        b = np.ones(3, dtype=float)
-        alpha = 1.0
-        DSYR(A, b, alpha)
-        R = np.array([
-            [46, 55, 64],
-            [55, 67, 79],
-            [64, 79, 94]]
-            )
-        self.assertTrue(abs(np.sum(A - R)) < 1e-12)
-
-    def test_subarray(self):
-        import GPy
-        X = np.zeros((3,6), dtype=bool)
-        X[[1,1,1],[0,4,5]] = 1
-        X[1:,[2,3]] = 1
-        d = GPy.util.subarray_and_sorting.common_subarrays(X,axis=1)
-        self.assertTrue(len(d) == 3)
-        X[:, d[tuple(X[:,0])]]
-        self.assertTrue(d[tuple(X[:,4])] == d[tuple(X[:,0])] == [0, 4, 5])
-        self.assertTrue(d[tuple(X[:,1])] == [1])
-
-    def test_offset_cluster(self):
-        #Tests the GPy.util.cluster_with_offset.cluster utility with a small
-        #test data set. Not using random noise just in case it occasionally
-        #causes it not to cluster correctly.
-        #groundtruth cluster identifiers are: [0,1,1,0]
-
-        #data contains a list of the four sets of time series (3 per data point)
-
-        data = [np.array([[ 2.18094245,  1.96529789,  2.00265523,  2.18218742,  2.06795428],
-                [ 1.62254829,  1.75748448,  1.83879347,  1.87531326,  1.52503496],
-                [ 1.54589609,  1.61607914,  2.00463192,  1.48771394,  1.63339218]]),
-         np.array([[ 2.86766106,  2.97953437,  2.91958876,  2.92510506,  3.03239241],
-                [ 2.57368423,  2.59954886,  3.10000395,  2.75806125,  2.89865704],
-                [ 2.58916318,  2.53698259,  2.63858411,  2.63102504,  2.51853901]]),
-         np.array([[ 2.77834168,  2.9618564 ,  2.88482141,  3.24259745,  2.9716821 ],
-                [ 2.60675576,  2.67095624,  2.94824436,  2.80520631,  2.87247516],
-                [ 2.49543562,  2.5492281 ,  2.6505866 ,  2.65015308,  2.59738616]]),
-         np.array([[ 1.76783086,  2.21666738,  2.07939706,  1.9268263 ,  2.23360121],
-                [ 1.94305547,  1.94648592,  2.1278921 ,  2.09481457,  2.08575238],
-                [ 1.69336013,  1.72285186,  1.6339506 ,  1.61212022,  1.39198698]])]
-
-        #inputs contains their associated X values
-
-        inputs = [np.array([[ 0.        ],
-                [ 0.68040097],
-                [ 1.20316795],
-                [ 1.798749  ],
-                [ 2.14891733]]), np.array([[ 0.        ],
-                [ 0.51910637],
-                [ 0.98259352],
-                [ 1.57442965],
-                [ 1.82515098]]), np.array([[ 0.        ],
-                [ 0.66645478],
-                [ 1.59464591],
-                [ 1.69769551],
-                [ 1.80932752]]), np.array([[ 0.        ],
-                [ 0.87512108],
-                [ 1.71881079],
-                [ 2.67162871],
-                [ 3.23761907]])]
-
-        #try doing the clustering
-        active = GPy.util.cluster_with_offset.cluster(data,inputs)
-        #check to see that the clustering has correctly clustered the time series.
-        clusters = set([frozenset(cluster) for cluster in active])
-        assert set([1,2]) in clusters, "Offset Clustering algorithm failed"
-        assert set([0,3]) in clusters, "Offset Clustering algoirthm failed"
-
-
-class TestUnivariateGaussian(unittest.TestCase):
-    def setUp(self):
-        self.zz = [-5.0, -0.8, 0.0, 0.5, 2.0, 10.0]
-
-    def test_logPdfNormal(self):
-        from GPy.util.univariate_Gaussian import logPdfNormal
-        pySols = [-13.4189385332,
-            -1.2389385332,
-            -0.918938533205,
-            -1.0439385332,
-            -2.9189385332,
-            -50.9189385332]
-        diff = 0.0
-        for i in range(len(pySols)):
-            diff += abs(logPdfNormal(self.zz[i]) - pySols[i])
-        self.assertTrue(diff  < 1e-10)
-
-    def test_cdfNormal(self):
-        from GPy.util.univariate_Gaussian import cdfNormal
-        pySols = [2.86651571879e-07,
-          0.211855398583,
-          0.5,
-          0.691462461274,
-          0.977249868052,
-          1.0]
-        diff = 0.0
-        for i in range(len(pySols)):
-            diff += abs(cdfNormal(self.zz[i]) - pySols[i])
-        self.assertTrue(diff  < 1e-10)
-
-    def test_logCdfNormal(self):
-        from GPy.util.univariate_Gaussian import logCdfNormal
-        pySols = [-15.064998394,
-          -1.55185131919,
-          -0.69314718056,
-          -0.368946415289,
-          -0.023012909329,
-          0.0]
-        diff = 0.0
-        for i in range(len(pySols)):
-            diff += abs(logCdfNormal(self.zz[i]) - pySols[i])
-        self.assertTrue(diff  < 1e-10)
-    def test_derivLogCdfNormal(self):
-        from GPy.util.univariate_Gaussian import derivLogCdfNormal
-        pySols = [5.18650396941,
-          1.3674022693,
-          0.79788456081,
-          0.50916043387,
-          0.0552478626962,
-          0.0]
-        diff = 0.0
-        for i in range(len(pySols)):
-          diff += abs(derivLogCdfNormal(self.zz[i]) - pySols[i])
-        self.assertTrue(diff  < 1e-8)
-
-class TestStandardize(unittest.TestCase):
-    def setUp(self):
-        self.normalizer = GPy.util.normalizer.Standardize()
-        y = np.stack([np.random.randn(10), 2*np.random.randn(10)], axis=1)
-        self.normalizer.scale_by(y)
-    
-    def test_inverse_covariance(self):
-        """
-        Test inverse covariance outputs correct size
-        """
-        covariance = np.random.rand(100, 100)
-        output = self.normalizer.inverse_covariance(covariance)
-        self.assertTrue(output.shape == (100, 100, 2))
--- a/GPy/util/classification.py
+++ b/GPy/util/classification.py
@ -2,7 +2,8 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np

-def conf_matrix(p,labels,names=['1','0'],threshold=.5,show=True):
+
+def conf_matrix(p, labels, names=["1", "0"], threshold=0.5, show=True):
    """
    Returns error rate and true/false positives in a binary classification problem
    - Actual classes are displayed by column.
@ -16,18 +17,18 @@ def conf_matrix(p,labels,names=['1','0'],threshold=.5,show=True):
    :type show: False|True
    """
    assert p.size == labels.size, "Arrays p and labels have different dimensions."
-    decision = np.ones((labels.size,1))
-    decision[p<threshold] = 0
+    decision = np.ones((labels.size, 1))
+    decision[p < threshold] = 0
    diff = decision - labels
    false_0 = diff[diff == -1].size
    false_1 = diff[diff == 1].size
-    true_1 = np.sum(decision[diff ==0])
+    true_1 = np.sum(decision[diff == 0])
    true_0 = labels.size - true_1 - false_0 - false_1
-    error = (false_1 + false_0)/np.float(labels.size)
+    error = (false_1 + false_0) / float(labels.size)
    if show:
-        print(100. - error * 100,'% instances correctly classified')
-        print('%-10s|  %-10s|  %-10s| ' % ('',names[0],names[1]))
-        print('----------|------------|------------|')
-        print('%-10s|  %-10s|  %-10s| ' % (names[0],true_1,false_0))
-        print('%-10s|  %-10s|  %-10s| ' % (names[1],false_1,true_0))
-    return error,true_1, false_1, true_0, false_0
+        print(100.0 - error * 100, "% instances correctly classified")
+        print("%-10s|  %-10s|  %-10s| " % ("", names[0], names[1]))
+        print("----------|------------|------------|")
+        print("%-10s|  %-10s|  %-10s| " % (names[0], true_1, false_0))
+        print("%-10s|  %-10s|  %-10s| " % (names[1], false_1, true_0))
+    return error, true_1, false_1, true_0, false_0
--- a/GPy/util/multioutput.py
+++ b/GPy/util/multioutput.py
@ -2,6 +2,7 @@ import numpy as np
 import warnings
 import GPy

+
 def index_to_slices(index):
    """
    take a numpy array of integers (index) and return a  nested list of slices such that the slices describe the start, stop points for each integer in the index.
@ -16,28 +17,35 @@ def index_to_slices(index):
    returns
    >>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
    """
-    if len(index)==0:
-        return[]
+    if len(index) == 0:
+        return []

-    #contruct the return structure
-    ind = np.asarray(index,dtype=np.int)
-    ret = [[] for i in range(ind.max()+1)]
+    # construct the return structure
+    ind = np.asarray(index, dtype=int)
+    ret = [[] for i in range(ind.max() + 1)]

-    #find the switchpoints
-    ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
-    switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
+    # find the switchpoints
+    ind_ = np.hstack((ind, ind[0] + ind[-1] + 1))
+    switchpoints = np.nonzero(ind_ - np.roll(ind_, +1))[0]

-    [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
+    [
+        ret[ind_i].append(slice(*indexes_i))
+        for ind_i, indexes_i in zip(
+            ind[switchpoints[:-1]], zip(switchpoints, switchpoints[1:])
+        )
+    ]
    return ret

+
 def get_slices(input_list):
    num_outputs = len(input_list)
-    _s = [0] + [ _x.shape[0] for _x in input_list ]
+    _s = [0] + [_x.shape[0] for _x in input_list]
    _s = np.cumsum(_s)
-    slices = [slice(a,b) for a,b in zip(_s[:-1],_s[1:])]
+    slices = [slice(a, b) for a, b in zip(_s[:-1], _s[1:])]
    return slices

-def build_XY(input_list,output_list=None,index=None):
+
+def build_XY(input_list, output_list=None, index=None):
    num_outputs = len(input_list)
    if output_list is not None:
        assert num_outputs == len(output_list)
@ -47,27 +55,35 @@ def build_XY(input_list,output_list=None,index=None):

    if index is not None:
        assert len(index) == num_outputs
-        I = np.hstack( [np.repeat(j,_x.shape[0]) for _x,j in zip(input_list,index)] )
+        I = np.hstack([np.repeat(j, _x.shape[0]) for _x, j in zip(input_list, index)])
    else:
-        I = np.hstack( [np.repeat(j,_x.shape[0]) for _x,j in zip(input_list,range(num_outputs))] )
+        I = np.hstack(
+            [np.repeat(j, _x.shape[0]) for _x, j in zip(input_list, range(num_outputs))]
+        )

    X = np.vstack(input_list)
-    X = np.hstack([X,I[:,None]])
+    X = np.hstack([X, I[:, None]])

-    return X,Y,I[:,None]#slices
+    return X, Y, I[:, None]  # slices

-def build_likelihood(Y_list,noise_index,likelihoods_list=None):
+
+def build_likelihood(Y_list, noise_index, likelihoods_list=None):
    Ny = len(Y_list)
    if likelihoods_list is None:
-       likelihoods_list = [GPy.likelihoods.Gaussian(name="Gaussian_noise_%s" %j) for y,j in zip(Y_list,range(Ny))]
+        likelihoods_list = [
+            GPy.likelihoods.Gaussian(name="Gaussian_noise_%s" % j)
+            for y, j in zip(Y_list, range(Ny))
+        ]
    else:
        assert len(likelihoods_list) == Ny
-    #likelihood = GPy.likelihoods.mixed_noise.MixedNoise(likelihoods_list=likelihoods_list, noise_index=noise_index)
-    likelihood = GPy.likelihoods.mixed_noise.MixedNoise(likelihoods_list=likelihoods_list)
+    # likelihood = GPy.likelihoods.mixed_noise.MixedNoise(likelihoods_list=likelihoods_list, noise_index=noise_index)
+    likelihood = GPy.likelihoods.mixed_noise.MixedNoise(
+        likelihoods_list=likelihoods_list
+    )
    return likelihood


-def ICM(input_dim, num_outputs, kernel, W_rank=1,W=None,kappa=None,name='ICM'):
+def ICM(input_dim, num_outputs, kernel, W_rank=1, W=None, kappa=None, name="ICM"):
    """
    Builds a kernel for an Intrinsic Coregionalization Model

@ -80,13 +96,26 @@ def ICM(input_dim, num_outputs, kernel, W_rank=1,W=None,kappa=None,name='ICM'):
    """
    if kernel.input_dim != input_dim:
        kernel.input_dim = input_dim
-        warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
+        warnings.warn(
+            "kernel's input dimension overwritten to fit input_dim parameter."
+        )

-    K = kernel.prod(GPy.kern.Coregionalize(1, num_outputs, active_dims=[input_dim], rank=W_rank,W=W,kappa=kappa,name='B'),name=name)
+    K = kernel.prod(
+        GPy.kern.Coregionalize(
+            1,
+            num_outputs,
+            active_dims=[input_dim],
+            rank=W_rank,
+            W=W,
+            kappa=kappa,
+            name="B",
+        ),
+        name=name,
+    )
    return K


-def LCM(input_dim, num_outputs, kernels_list, W_rank=1,name='ICM'):
+def LCM(input_dim, num_outputs, kernels_list, W_rank=1, name="ICM"):
    """
    Builds a kernel for an Linear Coregionalization Model

@ -98,15 +127,15 @@ def LCM(input_dim, num_outputs, kernels_list, W_rank=1,name='ICM'):
    :type W_rank: integer
    """
    Nk = len(kernels_list)
-    K = ICM(input_dim,num_outputs,kernels_list[0],W_rank,name='%s%s' %(name,0))
+    K = ICM(input_dim, num_outputs, kernels_list[0], W_rank, name="%s%s" % (name, 0))
    j = 1
    for kernel in kernels_list[1:]:
-        K += ICM(input_dim,num_outputs,kernel,W_rank,name='%s%s' %(name,j))
+        K += ICM(input_dim, num_outputs, kernel, W_rank, name="%s%s" % (name, j))
        j += 1
    return K


-def Private(input_dim, num_outputs, kernel, output, kappa=None,name='X'):
+def Private(input_dim, num_outputs, kernel, output, kappa=None, name="X"):
    """
    Builds a kernel for an Intrinsic Coregionalization Model

@ -117,7 +146,7 @@ def Private(input_dim, num_outputs, kernel, output, kappa=None,name='X'):
    :param W_rank: number tuples of the corregionalization parameters 'W'
    :type W_rank: integer
    """
-    K = ICM(input_dim,num_outputs,kernel,W_rank=1,kappa=kappa,name=name)
+    K = ICM(input_dim, num_outputs, kernel, W_rank=1, kappa=kappa, name=name)
    K.B.W.fix(0)
    _range = range(num_outputs)
    _range.pop(output)
--- a/README.md
+++ b/README.md
@ -129,7 +129,7 @@ If you're having trouble installing GPy via `pip install GPy` here is a probable
    cd GPy
    git checkout devel
    python setup.py build_ext --inplace
-    nosetests GPy/testing
+    pytest .

 ### Direct downloads

@ -171,13 +171,13 @@ print(m_load)

 New way of running tests is using coverage:

-Ensure nose and coverage is installed:
+Ensure pytest and coverage are installed:

-    pip install nose coverage
+    pip install pytest

 Run nosetests from root directory of repository:

-    coverage run travis_tests.py
+    python travis_tests.py

 Create coverage report in htmlcov/

--- a/appveyor.yml
+++ b/appveyor.yml
@ -33,13 +33,12 @@ install:
 # We need wheel installed to build wheels
 - python -m pip install wheel
 # GPy needs paramz
- - python -m pip install paramz
- - python -m pip install nose-show-skipped
 - python -m pip install coverage
 - python -m pip install coveralls
 - python -m pip install codecov
 - python -m pip install twine
- - "python setup.py develop"
+ - python -m pip install pytest
+ - python setup.py develop

 build: off

--- a/benchmarks/regression/evaluation.py
+++ b/benchmarks/regression/evaluation.py
@ -4,18 +4,19 @@
 import abc
 import numpy as np

+
 class Evaluation(object):
    __metaclass__ = abc.ABCMeta
-    
+
    @abc.abstractmethod
    def evaluate(self, gt, pred):
        """Compute a scalar for access the performance"""
        return None

+
 class RMSE(Evaluation):
    "Rooted Mean Square Error"
-    name = 'RMSE'
-    
+    name = "RMSE"
+
    def evaluate(self, gt, pred):
-        return np.sqrt(np.square(gt-pred).astype(np.float).mean())
-    
+        return np.sqrt(np.square(gt - pred).astype(float).mean())
--- a/doc/source/requirements.txt
+++ b/doc/source/requirements.txt
@ -7,4 +7,4 @@ paramz
 cython
 mock
 sympy
-nose
+pytest
--- a/setup.py
+++ b/setup.py
@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

-#===============================================================================
+# ===============================================================================
 # Copyright (c) 2012 - 2014, GPy authors (see AUTHORS.txt).
 # Copyright (c) 2014, James Hensman, Max Zwiessele
 # Copyright (c) 2015, Max Zwiessele
@ -32,7 +32,7 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
+# ===============================================================================

 from __future__ import print_function
 import os
@ -45,22 +45,26 @@ try:
 except NameError:
    ModuleNotFoundError = ImportError

+
 def read(fname):
-    with codecs.open(fname, 'r', 'latin') as f:
+    with codecs.open(fname, "r", "latin") as f:
        return f.read()

+
 def read_to_rst(fname):
    try:
        import pypandoc
-        rstname = "{}.{}".format(os.path.splitext(fname)[0], 'rst')
-        pypandoc.convert(read(fname), 'rst', format='md', outputfile=rstname)
-        with open(rstname, 'r') as f:
+
+        rstname = "{}.{}".format(os.path.splitext(fname)[0], "rst")
+        pypandoc.convert(read(fname), "rst", format="md", outputfile=rstname)
+        with open(rstname, "r") as f:
            rststr = f.read()
        return rststr
-        #return read(rstname)
+        # return read(rstname)
    except ImportError:
        return read(fname)

+
 desc = """

 Please refer to the github homepage for detailed instructions on installation and usage.
@ -68,155 +72,192 @@ Please refer to the github homepage for detailed instructions on installation an
 """

 version_dummy = {}
-exec(read('GPy/__version__.py'), version_dummy)
-__version__ = version_dummy['__version__']
+exec(read("GPy/__version__.py"), version_dummy)
+__version__ = version_dummy["__version__"]
 del version_dummy

-#Mac OS X Clang doesn't support OpenMP at the current time.
-#This detects if we are building on a Mac
+
+# Mac OS X Clang doesn't support OpenMP at the current time.
+# This detects if we are building on a Mac
 def ismac():
-    return sys.platform[:6] == 'darwin'
+    return sys.platform[:6] == "darwin"
+

 if ismac():
-    compile_flags = [ '-O3', ]
+    compile_flags = [
+        "-O3",
+    ]
    link_args = []
 else:
-    compile_flags = [ '-fopenmp', '-O3']
-    link_args = ['-lgomp' ]
+    compile_flags = ["-fopenmp", "-O3"]
+    link_args = ["-lgomp"]

 try:
    # So that we don't need numpy installed to determine it's a dependency.
    import numpy as np

-    ext_mods = [Extension(name='GPy.kern.src.stationary_cython',
-                          sources=['GPy/kern/src/stationary_cython.pyx',
-                                   'GPy/kern/src/stationary_utils.c'],
-                          include_dirs=[np.get_include(), '.'],
-                          extra_compile_args=compile_flags,
-                          extra_link_args=link_args),
-                Extension(name='GPy.util.choleskies_cython',
-                          sources=['GPy/util/choleskies_cython.pyx'],
-                          include_dirs=[np.get_include(), '.'],
-                          extra_link_args=link_args,
-                          extra_compile_args=compile_flags),
-                Extension(name='GPy.util.linalg_cython',
-                          sources=['GPy/util/linalg_cython.pyx'],
-                          include_dirs=[np.get_include(), '.'],
-                          extra_compile_args=compile_flags,
-                          extra_link_args=link_args),
-                Extension(name='GPy.kern.src.coregionalize_cython',
-                          sources=['GPy/kern/src/coregionalize_cython.pyx'],
-                          include_dirs=[np.get_include(), '.'],
-                          extra_compile_args=compile_flags,
-                          extra_link_args=link_args),
-                Extension(name='GPy.models.state_space_cython',
-                          sources=['GPy/models/state_space_cython.pyx'],
-                          include_dirs=[np.get_include(), '.'],
-                          extra_compile_args=compile_flags,
-                          extra_link_args=link_args)]
+    ext_mods = [
+        Extension(
+            name="GPy.kern.src.stationary_cython",
+            sources=[
+                "GPy/kern/src/stationary_cython.pyx",
+                "GPy/kern/src/stationary_utils.c",
+            ],
+            include_dirs=[np.get_include(), "."],
+            extra_compile_args=compile_flags,
+            extra_link_args=link_args,
+        ),
+        Extension(
+            name="GPy.util.choleskies_cython",
+            sources=["GPy/util/choleskies_cython.pyx"],
+            include_dirs=[np.get_include(), "."],
+            extra_link_args=link_args,
+            extra_compile_args=compile_flags,
+        ),
+        Extension(
+            name="GPy.util.linalg_cython",
+            sources=["GPy/util/linalg_cython.pyx"],
+            include_dirs=[np.get_include(), "."],
+            extra_compile_args=compile_flags,
+            extra_link_args=link_args,
+        ),
+        Extension(
+            name="GPy.kern.src.coregionalize_cython",
+            sources=["GPy/kern/src/coregionalize_cython.pyx"],
+            include_dirs=[np.get_include(), "."],
+            extra_compile_args=compile_flags,
+            extra_link_args=link_args,
+        ),
+        Extension(
+            name="GPy.models.state_space_cython",
+            sources=["GPy/models/state_space_cython.pyx"],
+            include_dirs=[np.get_include(), "."],
+            extra_compile_args=compile_flags,
+            extra_link_args=link_args,
+        ),
+    ]
 except ModuleNotFoundError:
    ext_mods = []

-install_requirements = ['numpy>=1.7', 'six', 'paramz>=0.9.0', 'cython>=0.29']
-matplotlib_version = 'matplotlib==3.3.4'
-install_requirements += ['scipy>=1.3.0']
+install_requirements = [
+    "numpy>=1.7",
+    "six",
+    # "paramz @ git+https://github.com/connorfuhrman/paramz/tree/connorfuhrman/np_type_alias_dep.git",
+    "paramz @ git+https://github.com/MartinBubel/paramz.git@fix-numpy-types",
+    "cython>=0.29",
+]
+# 'some-pkg @ git+ssh://git@github.com/someorgname/pkg-repo-name@v1.1#egg=some-pkg',
+matplotlib_version = "matplotlib==3.3.4"
+install_requirements += ["scipy>=1.3.0"]

-setup(name = 'GPy',
-      version = __version__,
-      author = read_to_rst('AUTHORS.txt'),
-      author_email = "gpy.authors@gmail.com",
-      description = ("The Gaussian Process Toolbox"),
-      long_description = desc,
-      license = "BSD 3-clause",
-      keywords = "machine-learning gaussian-processes kernels",
-      url = "https://sheffieldml.github.io/GPy/",
-      download_url='https://github.com/SheffieldML/GPy/archive/refs/heads/devel.zip',
-      ext_modules = ext_mods,
-      packages = ["GPy",
-                  "GPy.core",
-                  "GPy.core.parameterization",
-                  "GPy.kern",
-                  "GPy.kern.src",
-                  "GPy.kern.src.psi_comp",
-                  "GPy.models",
-                  "GPy.inference",
-                  "GPy.inference.optimization",
-                  "GPy.inference.mcmc",
-                  "GPy.inference.latent_function_inference",
-                  "GPy.likelihoods",
-                  "GPy.mappings",
-                  "GPy.examples",
-                  "GPy.testing",
-                  "GPy.util",
-                  "GPy.plotting",
-                  "GPy.plotting.gpy_plot",
-                  "GPy.plotting.matplot_dep",
-                  "GPy.plotting.matplot_dep.controllers",
-                  "GPy.plotting.plotly_dep",
-                  ],
-      package_dir={'GPy': 'GPy'},
-      #package_data = {'GPy': ['defaults.cfg', 'installation.cfg',
-      #                        'util/data_resources.json',
-      #                        'util/football_teams.json',
-      #                        'testing/plotting_tests/baseline/*.png'
-      #                        ]},
-      #data_files=[('GPy/testing/plotting_tests/baseline', 'testing/plotting_tests/baseline/*.png'),
-      #            ('GPy/testing/', 'GPy/testing/pickle_test.pickle'),
-      #             ],
-      include_package_data = True,
-      py_modules = ['GPy.__init__'],
-      test_suite = 'GPy.testing',
-      setup_requires = ['numpy>=1.7'],
-      install_requires = install_requirements,
-      extras_require = {'docs':['sphinx'],
-                        'optional':['mpi4py',
-                                    'ipython>=4.0.0',
-                                    ],
-                        #matplotlib Version see github issue #955
-                        'plotting':[matplotlib_version,
-                                    'plotly >= 1.8.6'],
-                        'notebook':['jupyter_client >= 4.0.6',
-                                    'ipywidgets >= 4.0.3',
-                                    'ipykernel >= 4.1.0',
-                                    'notebook >= 4.0.5',
-                                    ],
-                        },
-      classifiers=['License :: OSI Approved :: BSD License',
-                   'Natural Language :: English',
-                   'Operating System :: MacOS :: MacOS X',
-                   'Operating System :: Microsoft :: Windows',
-                   'Operating System :: POSIX :: Linux',
-                   'Programming Language :: Python :: 3.5',
-                   'Programming Language :: Python :: 3.6',
-                   'Programming Language :: Python :: 3.7',
-                   'Programming Language :: Python :: 3.8',
-                   'Programming Language :: Python :: 3.9',
-                   'Framework :: IPython',
-                   'Intended Audience :: Science/Research',
-                   'Intended Audience :: Developers',
-                   'Topic :: Software Development',
-                   'Topic :: Software Development :: Libraries :: Python Modules',
-
-                   ],
-      project_urls = {"Source Code": "https://github.com/SheffieldML/GPy",
-                      "Bug Tracker": "https://github.com/SheffieldML/GPy/issues",
-                     }
-      )
+setup(
+    name="GPy",
+    version=__version__,
+    author=read_to_rst("AUTHORS.txt"),
+    author_email="gpy.authors@gmail.com",
+    description=("The Gaussian Process Toolbox"),
+    long_description=desc,
+    license="BSD 3-clause",
+    keywords="machine-learning gaussian-processes kernels",
+    url="https://sheffieldml.github.io/GPy/",
+    download_url="https://github.com/SheffieldML/GPy/archive/refs/heads/devel.zip",
+    ext_modules=ext_mods,
+    packages=[
+        "GPy",
+        "GPy.core",
+        "GPy.core.parameterization",
+        "GPy.kern",
+        "GPy.kern.src",
+        "GPy.kern.src.psi_comp",
+        "GPy.models",
+        "GPy.inference",
+        "GPy.inference.optimization",
+        "GPy.inference.mcmc",
+        "GPy.inference.latent_function_inference",
+        "GPy.likelihoods",
+        "GPy.mappings",
+        "GPy.examples",
+        "GPy.testing",
+        "GPy.util",
+        "GPy.plotting",
+        "GPy.plotting.gpy_plot",
+        "GPy.plotting.matplot_dep",
+        "GPy.plotting.matplot_dep.controllers",
+        "GPy.plotting.plotly_dep",
+    ],
+    package_dir={"GPy": "GPy"},
+    # package_data = {'GPy': ['defaults.cfg', 'installation.cfg',
+    #                        'util/data_resources.json',
+    #                        'util/football_teams.json',
+    #                        'testing/plotting_tests/baseline/*.png'
+    #                        ]},
+    # data_files=[('GPy/testing/plotting_tests/baseline', 'testing/plotting_tests/baseline/*.png'),
+    #            ('GPy/testing/', 'GPy/testing/pickle_test.pickle'),
+    #             ],
+    include_package_data=True,
+    py_modules=["GPy.__init__"],
+    test_suite="GPy.testing",
+    setup_requires=["numpy>=1.7"],
+    install_requires=install_requirements,
+    extras_require={
+        "docs": ["sphinx"],
+        "optional": [
+            "mpi4py",
+            "ipython>=4.0.0",
+        ],
+        # matplotlib Version see github issue #955
+        "plotting": [matplotlib_version, "plotly >= 1.8.6"],
+        "notebook": [
+            "jupyter_client >= 4.0.6",
+            "ipywidgets >= 4.0.3",
+            "ipykernel >= 4.1.0",
+            "notebook >= 4.0.5",
+        ],
+        "dev": ["pytest", "matplotlib", "pods"],
+    },
+    classifiers=[
+        "License :: OSI Approved :: BSD License",
+        "Natural Language :: English",
+        "Operating System :: MacOS :: MacOS X",
+        "Operating System :: Microsoft :: Windows",
+        "Operating System :: POSIX :: Linux",
+        "Programming Language :: Python :: 3.5",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Framework :: IPython",
+        "Intended Audience :: Science/Research",
+        "Intended Audience :: Developers",
+        "Topic :: Software Development",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+    ],
+    project_urls={
+        "Source Code": "https://github.com/SheffieldML/GPy",
+        "Bug Tracker": "https://github.com/SheffieldML/GPy/issues",
+    },
+)


 # Check config files and settings:
-local_file = os.path.abspath(os.path.join(os.path.dirname(__file__), 'GPy', 'installation.cfg'))
-home = os.getenv('HOME') or os.getenv('USERPROFILE')
-user_file = os.path.join(home,'.config', 'GPy', 'user.cfg')
+local_file = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "GPy", "installation.cfg")
+)
+home = os.getenv("HOME") or os.getenv("USERPROFILE")
+user_file = os.path.join(home, ".config", "GPy", "user.cfg")

 print("")
 try:
    if not os.path.exists(user_file):
        # Does an old config exist?
-        old_user_file = os.path.join(home,'.gpy_user.cfg')
+        old_user_file = os.path.join(home, ".gpy_user.cfg")
        if os.path.exists(old_user_file):
            # Move it to new location:
-            print("GPy: Found old config file, moving to new location {}".format(user_file))
+            print(
+                "GPy: Found old config file, moving to new location {}".format(
+                    user_file
+                )
+            )
            if not os.path.exists(os.path.dirname(user_file)):
                os.makedirs(os.path.dirname(user_file))
            os.rename(old_user_file, user_file)
@ -225,8 +266,8 @@ try:
            print("GPy: Saving user configuration file to {}".format(user_file))
            if not os.path.exists(os.path.dirname(user_file)):
                os.makedirs(os.path.dirname(user_file))
-            with open(user_file, 'w') as f:
-                with open(local_file, 'r') as l:
+            with open(user_file, "w") as f:
+                with open(local_file, "r") as l:
                    tmp = l.read()
                    f.write(tmp)
    else:
--- a/travis_tests.py
+++ b/travis_tests.py
@ -1,4 +1,4 @@
-#===============================================================================
+# ===============================================================================
 # Copyright (c) 2015, Max Zwiessele
 #
 # All rights reserved.
@ -27,14 +27,12 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
+# ===============================================================================

 #!/usr/bin/env python
+import pytest
 import matplotlib
-matplotlib.use('agg')

-import nose, warnings
-with warnings.catch_warnings():
-    warnings.simplefilter("ignore")
-    nose.main('GPy', defaultTest='GPy/testing', argv=['', '--show-skipped'])
+matplotlib.use("agg")

+pytest.main(["GPy/testing/"])