Merge pull request #1084 from SheffieldML/devel

Release version 1.13.2
2026-07-23 17:01:06 +02:00 · 2024-07-21 17:35:25 +02:00 · 2024-07-21 17:35:25 +02:00 · 7e1cb7adee
commit 7e1cb7adee
parent 6254451513 282fcd4d68
103 changed files with 35568 additions and 21650 deletions
--- a/.appveyor_twine_upload.bat
+++ b/.appveyor_twine_upload.bat
@ -1,5 +0,0 @@
-IF "%APPVEYOR_REPO_BRANCH%"=="deploy" (
-  twine upload --skip-existing dist/*
-) ELSE (
-  ECHO Only deploy on deploy branch
-)
--- a/.github/workflows/test-and-deploy.yml
+++ b/.github/workflows/test-and-deploy.yml
@ -0,0 +1,268 @@
+name: "Test Python Lib"
+on:
+  push:
+    branches:
+      - main
+      - devel
+      - deploy
+  pull_request:
+  release:
+
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  test-windows:
+    strategy:
+      matrix:
+        os: [windows-latest]
+        python: ['3.9', '3.10', '3.11', '3.12']
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Install build dependencies
+        run: |
+          pip install setuptools
+      
+      - name: Install lib
+        run: |
+          python setup.py develop
+      
+      - name: Install test dependencies
+        run: |
+          pip install matplotlib
+          pip install pytest
+
+      - name: pytest
+        run: |
+          pytest GPy/testing
+
+  test-linux:
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python: ['3.9', '3.10', '3.11', '3.12']
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Install build dependencies
+        run: |
+          pip install setuptools
+
+      - name: Install lib
+        run: |
+          python setup.py develop
+
+      - name: Install test dependencies
+        run: |
+          pip install matplotlib
+          pip install pytest
+
+      - name: pytest
+        run: |
+          pytest GPy/testing
+
+  test-macos:
+    strategy:
+      matrix:
+        os: [macos-latest]
+        python: ['3.10', '3.11', '3.12']
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Install dependencies
+        run: |
+          pip install setuptools
+      
+      - name: Install lib
+        run: |
+          python setup.py develop
+
+      - name: Install test dependencies
+        run: |
+          pip install matplotlib
+          pip install pytest
+
+      - name: pytest
+        run: |
+          pytest GPy/testing
+
+  build-windows:
+    if: github.event_name == 'release'
+    strategy:
+      matrix:
+        os: [windows-latest]
+        python: ['3.9', '3.10', '3.11', '3.12']
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Build lib
+        run: |
+          pip install wheel
+          python setup.py develop
+          python setup.py bdist_wheel
+          python setup.py sdist bdist_wheel
+
+      - name: List contents of dist
+        run: ls -R dist
+
+      - name: Archive build artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          name: dist-artifacts-${{ matrix.os }}-${{ matrix.python }}
+          path: dist
+
+  build-macos:
+    if: github.event_name == 'release'
+    strategy:
+      matrix:
+        os: [macos-latest]
+        python: ['3.10', '3.11', '3.12']  # 3.9 triggers scipy issues when installing
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Build lib
+        run: |
+          pip install wheel
+          python setup.py develop
+          python setup.py bdist_wheel
+
+      - name: List contents of dist
+        run: ls -R dist
+
+      - name: Archive build artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          name: dist-artifacts-${{ matrix.os }}-${{ matrix.python }}
+          path: dist/*
+
+  build-linux:
+    if: github.event_name == 'release'
+    strategy:
+        matrix:
+          python: ['cp39-cp39', 'cp310-cp310', 'cp311-cp311', 'cp312-cp312']
+    runs-on: ubuntu-latest
+    container:
+      image: quay.io/pypa/manylinux2014_x86_64
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Compile c headers
+        run: |
+          /opt/python/${{ matrix.python }}/bin/python setup.py develop
+
+      - name: Build wheel files
+        run: |
+          /opt/python/${{ matrix.python }}/bin/python setup.py bdist_wheel
+
+      - name: Install auditwheel  # this should be available?!
+        run: |
+          /opt/python/${{ matrix.python }}/bin/python -m pip install auditwheel
+      
+      - name: Repair wheel files
+        run: |
+          /opt/python/${{ matrix.python }}/bin/python -m auditwheel repair dist/*${{ matrix.python }}-linux_x86_64.whl
+
+      - name: List contents of dist
+        run: ls -R dist
+
+      - name: List contests of wheelhouse
+        run: ls -R wheelhouse
+
+      - name: Move wheelhouse wheel files to dist
+        run: |
+          rm dist/*
+          mv wheelhouse/* dist/
+          rmdir wheelhouse
+
+      - name: List contents of dist
+        run: ls -R dist
+
+      - name: Archive build artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          name: dist-artifacts-manylinux-${{ matrix.python }}
+          path: dist/*
+
+  deploy:
+    runs-on: ubuntu-latest
+    needs: [test-windows, test-linux, test-macos, build-linux, build-windows, build-macos]
+    if: github.event_name == 'release'
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.9'
+
+      - name: Install twine
+        run: |
+          pip install --upgrade pip
+          pip install twine
+    
+      - name: Download all artifacts to a specific directory
+        uses: actions/download-artifact@v3
+        with:
+          path: dist
+
+      - name: Create dist directory
+        run: mkdir -p dist
+
+      - name: Move files from subdirectories
+        run: |
+          for subdirectory in dist/*/; do
+            dir_name=$(basename "$subdirectory")
+            mv "$subdirectory"* dist/
+            rm -r "$subdirectory"
+            echo "Moved files from '$dir_name' to 'dist/'"
+          done
+
+      - name: Inspect wheel files
+        run: |
+          ls -R dist
+
+      - name: Upload to PyPI using twine
+        run: twine upload --skip-existing dist/*
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
--- a/.gitignore
+++ b/.gitignore
@ -55,4 +55,9 @@ iterate.dat
 GPy*.rst

 # vscode
-settings.json
+settings.json
+
+# local dev
+.eggs
+.venv
+.env
--- a/.travis.yml
+++ b/.travis.yml
@ -1,73 +0,0 @@
-sudo: false
-
-osx_image: xcode12.2
-
-os:
- osx
- linux
-
-addons:
-  apt_packages:
-    - pandoc
-
-#cache:
-#  directories:
-#  - "$HOME/download/"
-#  - "$HOME/install/"
-
-env:
-  - PYTHON_VERSION=3.6
-  - PYTHON_VERSION=3.7
-  - PYTHON_VERSION=3.8
-  - PYTHON_VERSION=3.9
-
-before_install:
- wget https://github.com/mzwiessele/travis_scripts/raw/master/download_miniconda.sh
- wget https://github.com/mzwiessele/travis_scripts/raw/master/install_retry.sh
- source download_miniconda.sh
- echo $PATH
-
-install:
- echo $PATH
- source install_retry.sh
- if [[ "$TRAVIS_OS_NAME" == "osx" ]];
-  then
-    conda install --yes pandoc;
-  fi;
- pip install codecov
- pip install coveralls
- pip install pypandoc
- pip install git+git://github.com/BRML/climin.git
- pip install autograd
- pip install nose-show-skipped
- python setup.py develop
-
-script:
-  - coverage run travis_tests.py
-
-after_success:
-  - codecov
-  - coveralls
-
-before_deploy:
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]];
-    then
-      export DIST='sdist bdist_rpm bdist_dumb';
-    elif [[ "$TRAVIS_OS_NAME" == "osx" ]];
-    then
-      export DIST='bdist_wheel';
-    fi;
-
-deploy:
-  provider: pypi
-  user: maxz
-  password:
-    secure: "vMEOlP7DQhFJ7hQAKtKC5hrJXFl5BkUt4nXdosWWiw//Kg8E+PPLg88XPI2gqIosir9wwgtbSBBbbwCxkM6uxRNMpoNR8Ixyv9fmSXp4rLl7bbBY768W7IRXKIBjpuEy2brQjoT+CwDDSzUkckHvuUjJDNRvUv8ab4P/qYO1LG4="
-  on:
-    branch: deploy
-  edge:
-    branch: v1.8.45
-  distributions: $DIST
-  skip_existing: true
-  skip_cleanup: true
-  skip_upload_docs: false
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,29 @@
 # Changelog

+## Unreleased
+
+## v1.13.2 (2024-07-21)
+* update string checks in initialization method for latent variable and put `empirical_samples` init-method on a deprecation path
+
+* update dependencies to `numpy>=1.7.0,<2.0.0`
+
+* update dependencies to `numpy>=1.7.0,<2.0.0`
+
+* update import in `.plotting.matplot_dep.defaults` due to change in matplotlib
+
+* Correct dl_dm term in student t inference #1065
+
+## v1.13.1 (2024-01-14)
+
+* limit `scipy<1.12` as macos and linux jobs install some pre-release version of `scipy==1.12` which breaks tests
+
+## v1.13.0 (2023-12-20)
+
+* update `paramz` depdency to `>=0.9.6`
+
+* limit supported python versions to `">=3.9"` in accordance with numpy
+
+* Change from `nosetest` to `pytest`

 ## v1.9.8 (2019-05-17)

--- a/GPy/init.py
+++ b/GPy/init.py
@ -1,6 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import warnings
+
 warnings.filterwarnings("ignore", category=DeprecationWarning)

 from . import core
@ -18,30 +19,25 @@ from .util import normalizer

 # backwards compatibility
 import sys
-backwards_compatibility = ['lists_and_dicts', 'observable_array', 'index_operations']
+
+backwards_compatibility = ["lists_and_dicts", "observable_array", "index_operations"]
 for bc in backwards_compatibility:
-    sys.modules['GPy.core.parameterization.{!s}'.format(bc)] = getattr(core.parameterization, bc)
+    sys.modules["GPy.core.parameterization.{!s}".format(bc)] = getattr(
+        core.parameterization, bc
+    )

 # Direct imports for convenience:
 from .core import Model
 from .core.parameterization import priors
-from .core.parameterization import Param, Parameterized, ObsAr, transformations as constraints
+from .core.parameterization import (
+    Param,
+    Parameterized,
+    ObsAr,
+    transformations as constraints,
+)

 from .__version__ import __version__

-from numpy.testing import Tester
-
-with warnings.catch_warnings():
-    warnings.simplefilter('ignore')
-    try:
-        #Get rid of nose dependency by only ignoring if you have nose installed
-        from nose.tools import nottest
-        @nottest
-        def tests(verbose=10):
-            Tester(testing).test(verbose=verbose)
-    except:
-        def tests(verbose=10):
-            Tester(testing).test(verbose=verbose)

 def load(file_or_path):
    """
@ -52,10 +48,12 @@ def load(file_or_path):
    # This is the pickling pain when changing _src -> src
    import sys
    import inspect
-    sys.modules['GPy.kern._src'] = kern.src
+
+    sys.modules["GPy.kern._src"] = kern.src
    for name, module in inspect.getmembers(kern.src):
-        if not name.startswith('_'):
-            sys.modules['GPy.kern._src.{}'.format(name)] = module
-    sys.modules['GPy.inference.optimization'] = inference.optimization
+        if not name.startswith("_"):
+            sys.modules["GPy.kern._src.{}".format(name)] = module
+    sys.modules["GPy.inference.optimization"] = inference.optimization
    import paramz
+
    return paramz.load(file_or_path)
--- a/GPy/version.py
+++ b/GPy/version.py
@ -1 +1 @@
-__version__ = "1.12.0"
+__version__ = "1.13.2"
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@ -13,14 +13,15 @@ import weakref
 class Prior(object):
    domain = None
    _instance = None
+
    def __new__(cls, *args, **kwargs):
        if not cls._instance or cls._instance.__class__ is not cls:
-                newfunc = super(Prior, cls).__new__
-                if newfunc is object.__new__:
-                    cls._instance = newfunc(cls)
-                else:
-                    cls._instance = newfunc(cls, *args, **kwargs)
-                return cls._instance
+            newfunc = super(Prior, cls).__new__
+            if newfunc is object.__new__:
+                cls._instance = newfunc(cls)
+            else:
+                cls._instance = newfunc(cls, *args, **kwargs)
+            return cls._instance

    def pdf(self, x):
        return np.exp(self.lnpdf(x))
@ -47,6 +48,7 @@ class Gaussian(Prior):
    .. Note:: Bishop 2006 notation is used throughout the code

    """
+
    domain = _REAL
    _instances = []

@ -82,6 +84,7 @@ class Gaussian(Prior):
    def rvs(self, n):
        return np.random.randn(n) * self.sigma + self.mu

+
 #     def __getstate__(self):
 #         return self.mu, self.sigma
 #
@ -91,6 +94,7 @@ class Gaussian(Prior):
 #         self.sigma2 = np.square(self.sigma)
 #         self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)

+
 class Uniform(Prior):
    _instances = []

@ -132,6 +136,7 @@ class Uniform(Prior):
    def rvs(self, n):
        return np.random.uniform(self.lower, self.upper, size=n)

+
 #     def __getstate__(self):
 #         return self.lower, self.upper
 #
@ -139,6 +144,7 @@ class Uniform(Prior):
 #         self.lower = state[0]
 #         self.upper = state[1]

+
 class LogGaussian(Gaussian):
    """
    Implementation of the univariate *log*-Gaussian probability function, coupled with random variables.
@ -149,6 +155,7 @@ class LogGaussian(Gaussian):
    .. Note:: Bishop 2006 notation is used throughout the code

    """
+
    domain = _POSITIVE
    _instances = []

@ -160,7 +167,7 @@ class LogGaussian(Gaussian):
                    return instance()
        newfunc = super(Prior, cls).__new__
        if newfunc is object.__new__:
-            o = newfunc(cls)  
+            o = newfunc(cls)
        else:
            o = newfunc(cls, mu, sigma)
        cls._instances.append(weakref.ref(o))
@ -176,10 +183,14 @@ class LogGaussian(Gaussian):
        return "lnN({:.2g}, {:.2g})".format(self.mu, self.sigma)

    def lnpdf(self, x):
-        return self.constant - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2 - np.log(x)
+        return (
+            self.constant
+            - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2
+            - np.log(x)
+        )

    def lnpdf_grad(self, x):
-        return -((np.log(x) - self.mu) / self.sigma2 + 1.) / x
+        return -((np.log(x) - self.mu) / self.sigma2 + 1.0) / x

    def rvs(self, n):
        return np.exp(np.random.randn(int(n)) * self.sigma + self.mu)
@ -195,16 +206,15 @@ class MultivariateGaussian(Prior):
    .. Note:: Bishop 2006 notation is used throughout the code

    """
+
    domain = _REAL
    _instances = []

    def __new__(cls, mu=0, var=1):  # Singleton:
        if cls._instances:
-            cls._instances[:] = [instance for instance in cls._instances if
-                                 instance()]
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
            for instance in cls._instances:
-                if np.all(instance().mu == mu) and np.all(
-                        instance().var == var):
+                if np.all(instance().mu == mu) and np.all(instance().var == var):
                    return instance()
        newfunc = super(Prior, cls).__new__
        if newfunc is object.__new__:
@ -217,16 +227,17 @@ class MultivariateGaussian(Prior):
    def __init__(self, mu, var):
        self.mu = np.array(mu).flatten()
        self.var = np.array(var)
-        assert len(self.var.shape) == 2, 'Covariance must be a matrix'
-        assert self.var.shape[0] == self.var.shape[1], \
-            'Covariance must be a square matrix'
+        assert len(self.var.shape) == 2, "Covariance must be a matrix"
+        assert (
+            self.var.shape[0] == self.var.shape[1]
+        ), "Covariance must be a square matrix"
        assert self.var.shape[0] == self.mu.size
        self.input_dim = self.mu.size
        self.inv, _, self.hld, _ = pdinv(self.var)
        self.constant = -0.5 * (self.input_dim * np.log(2 * np.pi) + self.hld)

    def __str__(self):
-        return 'MultiN(' + str(self.mu) + ', ' + str(np.diag(self.var)) + ')'
+        return "MultiN(" + str(self.mu) + ", " + str(np.diag(self.var)) + ")"

    def summary(self):
        raise NotImplementedError
@ -243,7 +254,7 @@ class MultivariateGaussian(Prior):
    def lnpdf_grad(self, x):
        x = np.array(x).flatten()
        d = x - self.mu
-        return - np.dot(self.inv, d)
+        return -np.dot(self.inv, d)

    def rvs(self, n):
        return np.random.multivariate_normal(self.mu, self.var, n)
@ -262,14 +273,16 @@ class MultivariateGaussian(Prior):
    def __setstate__(self, state):
        self.mu = np.array(state[0]).flatten()
        self.var = state[1]
-        assert len(self.var.shape) == 2, 'Covariance must be a matrix'
-        assert self.var.shape[0] == self.var.shape[1], \
-            'Covariance must be a square matrix'
+        assert len(self.var.shape) == 2, "Covariance must be a matrix"
+        assert (
+            self.var.shape[0] == self.var.shape[1]
+        ), "Covariance must be a square matrix"
        assert self.var.shape[0] == self.mu.size
        self.input_dim = self.mu.size
        self.inv, _, self.hld, _ = pdinv(self.var)
        self.constant = -0.5 * (self.input_dim * np.log(2 * np.pi) + self.hld)

+
 def gamma_from_EV(E, V):
    warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning)
    return Gamma.from_EV(E, V)
@ -285,10 +298,11 @@ class Gamma(Prior):
    .. Note:: Bishop 2006 notation is used throughout the code

    """
+
    domain = _POSITIVE
    _instances = []

-    def __new__(cls, a=1, b=.5):  # Singleton:
+    def __new__(cls, a=1, b=0.5):  # Singleton:
        if cls._instances:
            cls._instances[:] = [instance for instance in cls._instances if instance()]
            for instance in cls._instances:
@ -319,24 +333,29 @@ class Gamma(Prior):
        return "Ga({:.2g}, {:.2g})".format(self.a, self.b)

    def summary(self):
-        ret = {"E[x]": self.a / self.b, \
-               "E[ln x]": digamma(self.a) - np.log(self.b), \
-               "var[x]": self.a / self.b / self.b, \
-               "Entropy": gammaln(self.a) - (self.a - 1.) * digamma(self.a) - np.log(self.b) + self.a}
+        ret = {
+            "E[x]": self.a / self.b,
+            "E[ln x]": digamma(self.a) - np.log(self.b),
+            "var[x]": self.a / self.b / self.b,
+            "Entropy": gammaln(self.a)
+            - (self.a - 1.0) * digamma(self.a)
+            - np.log(self.b)
+            + self.a,
+        }
        if self.a > 1:
-            ret['Mode'] = (self.a - 1.) / self.b
+            ret["Mode"] = (self.a - 1.0) / self.b
        else:
-            ret['mode'] = np.nan
+            ret["mode"] = np.nan
        return ret

    def lnpdf(self, x):
        return self.constant + (self.a - 1) * np.log(x) - self.b * x

    def lnpdf_grad(self, x):
-        return (self.a - 1.) / x - self.b
+        return (self.a - 1.0) / x - self.b

    def rvs(self, n):
-        return np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
+        return np.random.gamma(scale=1.0 / self.b, shape=self.a, size=n)

    @staticmethod
    def from_EV(E, V):
@ -359,6 +378,7 @@ class Gamma(Prior):
        self._b = state[1]
        self.constant = -gammaln(self.a) + self.a * np.log(self.b)

+
 class InverseGamma(Gamma):
    """
    Implementation of the inverse-Gamma probability function, coupled with random variables.
@ -369,6 +389,7 @@ class InverseGamma(Gamma):
    .. Note:: Bishop 2006 notation is used throughout the code

    """
+
    domain = _POSITIVE
    _instances = []

@ -386,10 +407,11 @@ class InverseGamma(Gamma):
        return self.constant - (self.a + 1) * np.log(x) - self.b / x

    def lnpdf_grad(self, x):
-        return -(self.a + 1.) / x + self.b / x ** 2
+        return -(self.a + 1.0) / x + self.b / x**2

    def rvs(self, n):
-        return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
+        return 1.0 / np.random.gamma(scale=1.0 / self.b, shape=self.a, size=n)
+

 class DGPLVM_KFDA(Prior):
    """
@ -403,6 +425,7 @@ class DGPLVM_KFDA(Prior):
    .. Note:: Surpassing Human-Level Face paper dgplvm implementation

    """
+
    domain = _REAL
    # _instances = []
    # def __new__(cls, lambdaa, sigma2):  # Singleton:
@ -459,8 +482,8 @@ class DGPLVM_KFDA(Prior):
        lst_ni = []
        lst_ni1 = []
        lst_ni2 = []
-        f1 = (np.where(self.lbl[:, 0] == 1)[0])
-        f2 = (np.where(self.lbl[:, 1] == 1)[0])
+        f1 = np.where(self.lbl[:, 0] == 1)[0]
+        f2 = np.where(self.lbl[:, 1] == 1)[0]
        for idx in f1:
            lst_ni1.append(idx)
        for idx in f2:
@ -474,11 +497,11 @@ class DGPLVM_KFDA(Prior):
        count = 0
        for N_i in lst_ni:
            if N_i == lst_ni[0]:
-                a[count:count + N_i] = (float(1) / N_i) * a[count]
+                a[count : count + N_i] = (float(1) / N_i) * a[count]
                count += N_i
            else:
                if N_i == lst_ni[1]:
-                    a[count: count + N_i] = -(float(1) / N_i) * a[count]
+                    a[count : count + N_i] = -(float(1) / N_i) * a[count]
                    count += N_i
        return a

@ -486,8 +509,12 @@ class DGPLVM_KFDA(Prior):
        A = np.zeros((self.datanum, self.datanum))
        idx = 0
        for N_i in lst_ni:
-            B = float(1) / np.sqrt(N_i) * (np.eye(N_i) - ((float(1) / N_i) * np.ones((N_i, N_i))))
-            A[idx:idx + N_i, idx:idx + N_i] = B
+            B = (
+                float(1)
+                / np.sqrt(N_i)
+                * (np.eye(N_i) - ((float(1) / N_i) * np.ones((N_i, N_i))))
+            )
+            A[idx : idx + N_i, idx : idx + N_i] = B
            idx += N_i
        return A

@ -498,9 +525,11 @@ class DGPLVM_KFDA(Prior):
        a_trans = np.transpose(self.a)
        paran = self.lambdaa * np.eye(x.shape[0]) + self.A.dot(K).dot(self.A)
        inv_part = pdinv(paran)[0]
-        J = a_trans.dot(K).dot(self.a) - a_trans.dot(K).dot(self.A).dot(inv_part).dot(self.A).dot(K).dot(self.a)
-        J_star = (1. / self.lambdaa) * J
-        return (-1. / self.sigma2) * J_star
+        J = a_trans.dot(K).dot(self.a) - a_trans.dot(K).dot(self.A).dot(inv_part).dot(
+            self.A
+        ).dot(K).dot(self.a)
+        J_star = (1.0 / self.lambdaa) * J
+        return (-1.0 / self.sigma2) * J_star

    # Here gradient function
    def lnpdf_grad(self, x):
@ -511,15 +540,15 @@ class DGPLVM_KFDA(Prior):
        b = self.A.dot(inv_part).dot(self.A).dot(K).dot(self.a)
        a_Minus_b = self.a - b
        a_b_trans = np.transpose(a_Minus_b)
-        DJ_star_DK = (1. / self.lambdaa) * (a_Minus_b.dot(a_b_trans))
+        DJ_star_DK = (1.0 / self.lambdaa) * (a_Minus_b.dot(a_b_trans))
        DJ_star_DX = self.kern.gradients_X(DJ_star_DK, x)
-        return (-1. / self.sigma2) * DJ_star_DX
+        return (-1.0 / self.sigma2) * DJ_star_DX

    def rvs(self, n):
        return np.random.rand(n)  # A WRONG implementation

    def __str__(self):
-        return 'DGPLVM_prior'
+        return "DGPLVM_prior"

    def __getstate___(self):
        return self.lbl, self.lambdaa, self.sigma2, self.kern, self.x_shape
@ -547,6 +576,7 @@ class DGPLVM(Prior):
    .. Note:: DGPLVM for Classification paper implementation

    """
+
    domain = _REAL

    def __new__(cls, sigma2, lbl, x_shape):
@ -606,7 +636,7 @@ class DGPLVM(Prior):
        for i in data_idx:
            if len(lst_idx) == 0:
                pass
-                #Do nothing, because it is the first time list is created so is empty
+                # Do nothing, because it is the first time list is created so is empty
            else:
                lst_idx = []
            # Here we put indices of each class in to the list called lst_idx_all
@ -631,9 +661,9 @@ class DGPLVM(Prior):
            N_i = float(len(cls[i]))
            W_WT = np.zeros((self.dim, self.dim))
            for xk in cls[i]:
-                W = (xk - M_i[i])
+                W = xk - M_i[i]
                W_WT += np.outer(W, W)
-            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+            Sw += (N_i / self.datanum) * ((1.0 / N_i) * W_WT)
        return Sw

    # Calculating beta and Bi for Sb
@ -658,7 +688,6 @@ class DGPLVM(Prior):
        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
        return Sig_beta_B_i_all

-
    # Calculating W_j s separately so we can access all the W_j s anytime
    def compute_wj(self, data_idx, M_i):
        W_i = np.zeros((self.datanum, self.dim))
@ -667,7 +696,7 @@ class DGPLVM(Prior):
            for tpl in data_idx[i]:
                xj = tpl[1]
                j = tpl[0]
-                W_i[j] = (xj - M_i[i])
+                W_i[j] = xj - M_i[i]
        return W_i

    # Calculating alpha and Wj for Sw
@ -680,11 +709,11 @@ class DGPLVM(Prior):
                for j in lst_idx_all[i]:
                    if k == j:
                        alpha = 1 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
                    else:
                        alpha = 0 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
-        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
+        Sig_alpha_W_i = (1.0 / self.datanum) * np.transpose(Sig_alpha_W_i)
        return Sig_alpha_W_i

    # This function calculates log of our prior
@ -696,9 +725,9 @@ class DGPLVM(Prior):
        Sb = self.compute_Sb(cls, M_i, M_0)
        Sw = self.compute_Sw(cls, M_i)
        # sb_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))

    # This function calculates derivative of the log of prior function
@ -717,19 +746,20 @@ class DGPLVM(Prior):

        # Calculating inverse of Sb and its transpose and minus
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)

        # Calculating DJ/DXk
        DJ_Dxk = 2 * (
-            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
-                Sig_alpha_W_i))
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
+            + Sb_inv_N_trans.dot(Sig_alpha_W_i)
+        )
        # Calculating derivative of the log of the prior
-        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        DPx_Dx = (-1 / self.sigma2) * DJ_Dxk
        return DPx_Dx.T

    # def frb(self, x):
@ -744,7 +774,7 @@ class DGPLVM(Prior):
        return np.random.rand(n)  # A WRONG implementation

    def __str__(self):
-        return 'DGPLVM_prior_Raq'
+        return "DGPLVM_prior_Raq"


 # ******************************************
@ -752,6 +782,7 @@ class DGPLVM(Prior):
 from . import Parameterized
 from . import Param

+
 class DGPLVM_Lamda(Prior, Parameterized):
    """
    Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
@ -761,6 +792,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
    .. Note:: DGPLVM for Classification paper implementation

    """
+
    domain = _REAL
    # _instances = []
    # def __new__(cls, mu, sigma): # Singleton:
@ -773,7 +805,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
    #     cls._instances.append(weakref.ref(o))
    #     return cls._instances[-1]()

-    def __init__(self, sigma2, lbl, x_shape, lamda, name='DP_prior'):
+    def __init__(self, sigma2, lbl, x_shape, lamda, name="DP_prior"):
        super(DGPLVM_Lamda, self).__init__(name=name)
        self.sigma2 = sigma2
        # self.x = x
@ -783,7 +815,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
        self.datanum = lbl.shape[0]
        self.x_shape = x_shape
        self.dim = x_shape[1]
-        self.lamda = Param('lamda', np.diag(lamda))
+        self.lamda = Param("lamda", np.diag(lamda))
        self.link_parameter(self.lamda)

    def get_class_label(self, y):
@ -831,7 +863,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
        for i in data_idx:
            if len(lst_idx) == 0:
                pass
-                #Do nothing, because it is the first time list is created so is empty
+                # Do nothing, because it is the first time list is created so is empty
            else:
                lst_idx = []
            # Here we put indices of each class in to the list called lst_idx_all
@ -856,9 +888,9 @@ class DGPLVM_Lamda(Prior, Parameterized):
            N_i = float(len(cls[i]))
            W_WT = np.zeros((self.dim, self.dim))
            for xk in cls[i]:
-                W = (xk - M_i[i])
+                W = xk - M_i[i]
                W_WT += np.outer(W, W)
-            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+            Sw += (N_i / self.datanum) * ((1.0 / N_i) * W_WT)
        return Sw

    # Calculating beta and Bi for Sb
@ -883,7 +915,6 @@ class DGPLVM_Lamda(Prior, Parameterized):
        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
        return Sig_beta_B_i_all

-
    # Calculating W_j s separately so we can access all the W_j s anytime
    def compute_wj(self, data_idx, M_i):
        W_i = np.zeros((self.datanum, self.dim))
@ -892,7 +923,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
            for tpl in data_idx[i]:
                xj = tpl[1]
                j = tpl[0]
-                W_i[j] = (xj - M_i[i])
+                W_i[j] = xj - M_i[i]
        return W_i

    # Calculating alpha and Wj for Sw
@ -905,11 +936,11 @@ class DGPLVM_Lamda(Prior, Parameterized):
                for j in lst_idx_all[i]:
                    if k == j:
                        alpha = 1 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
                    else:
                        alpha = 0 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
-        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
+        Sig_alpha_W_i = (1.0 / self.datanum) * np.transpose(Sig_alpha_W_i)
        return Sig_alpha_W_i

    # This function calculates log of our prior
@ -917,7 +948,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
        x = x.reshape(self.x_shape)

        #!!!!!!!!!!!!!!!!!!!!!!!!!!!
-        #self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()
+        # self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()

        xprime = x.dot(np.diagflat(self.lamda))
        x = xprime
@ -928,9 +959,9 @@ class DGPLVM_Lamda(Prior, Parameterized):
        Sb = self.compute_Sb(cls, M_i, M_0)
        Sw = self.compute_Sw(cls, M_i)
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.9)[0]
        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))

    # This function calculates derivative of the log of prior function
@ -952,19 +983,20 @@ class DGPLVM_Lamda(Prior, Parameterized):

        # Calculating inverse of Sb and its transpose and minus
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.9)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)

        # Calculating DJ/DXk
        DJ_Dxk = 2 * (
-            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
-                Sig_alpha_W_i))
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
+            + Sb_inv_N_trans.dot(Sig_alpha_W_i)
+        )
        # Calculating derivative of the log of the prior
-        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        DPx_Dx = (-1 / self.sigma2) * DJ_Dxk

        DPxprim_Dx = np.diagflat(self.lamda).dot(DPx_Dx)

@ -980,7 +1012,6 @@ class DGPLVM_Lamda(Prior, Parameterized):
        # print DPxprim_Dx
        return DPxprim_Dx

-
    # def frb(self, x):
    #     from functools import partial
    #     from GPy.models import GradientChecker
@ -993,10 +1024,12 @@ class DGPLVM_Lamda(Prior, Parameterized):
        return np.random.rand(n)  # A WRONG implementation

    def __str__(self):
-        return 'DGPLVM_prior_Raq_Lamda'
+        return "DGPLVM_prior_Raq_Lamda"
+

 # ******************************************

+
 class DGPLVM_T(Prior):
    """
    Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
@ -1006,6 +1039,7 @@ class DGPLVM_T(Prior):
    .. Note:: DGPLVM for Classification paper implementation

    """
+
    domain = _REAL
    # _instances = []
    # def __new__(cls, mu, sigma): # Singleton:
@ -1028,7 +1062,6 @@ class DGPLVM_T(Prior):
        self.dim = x_shape[1]
        self.vec = vec

-
    def get_class_label(self, y):
        for idx, v in enumerate(y):
            if v == 1:
@ -1075,7 +1108,7 @@ class DGPLVM_T(Prior):
        for i in data_idx:
            if len(lst_idx) == 0:
                pass
-                #Do nothing, because it is the first time list is created so is empty
+                # Do nothing, because it is the first time list is created so is empty
            else:
                lst_idx = []
            # Here we put indices of each class in to the list called lst_idx_all
@ -1100,9 +1133,9 @@ class DGPLVM_T(Prior):
            N_i = float(len(cls[i]))
            W_WT = np.zeros((self.dim, self.dim))
            for xk in cls[i]:
-                W = (xk - M_i[i])
+                W = xk - M_i[i]
                W_WT += np.outer(W, W)
-            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+            Sw += (N_i / self.datanum) * ((1.0 / N_i) * W_WT)
        return Sw

    # Calculating beta and Bi for Sb
@ -1127,7 +1160,6 @@ class DGPLVM_T(Prior):
        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
        return Sig_beta_B_i_all

-
    # Calculating W_j s separately so we can access all the W_j s anytime
    def compute_wj(self, data_idx, M_i):
        W_i = np.zeros((self.datanum, self.dim))
@ -1136,7 +1168,7 @@ class DGPLVM_T(Prior):
            for tpl in data_idx[i]:
                xj = tpl[1]
                j = tpl[0]
-                W_i[j] = (xj - M_i[i])
+                W_i[j] = xj - M_i[i]
        return W_i

    # Calculating alpha and Wj for Sw
@ -1149,11 +1181,11 @@ class DGPLVM_T(Prior):
                for j in lst_idx_all[i]:
                    if k == j:
                        alpha = 1 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
                    else:
                        alpha = 0 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
-        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
+        Sig_alpha_W_i = (1.0 / self.datanum) * np.transpose(Sig_alpha_W_i)
        return Sig_alpha_W_i

    # This function calculates log of our prior
@ -1168,10 +1200,10 @@ class DGPLVM_T(Prior):
        Sb = self.compute_Sb(cls, M_i, M_0)
        Sw = self.compute_Sw(cls, M_i)
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #print 'SB_inv: ', Sb_inv_N
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # print 'SB_inv: ', Sb_inv_N
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))

    # This function calculates derivative of the log of prior function
@ -1193,20 +1225,21 @@ class DGPLVM_T(Prior):

        # Calculating inverse of Sb and its transpose and minus
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #print 'SB_inv: ',Sb_inv_N
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # print 'SB_inv: ',Sb_inv_N
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)

        # Calculating DJ/DXk
        DJ_Dxk = 2 * (
-            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
-                Sig_alpha_W_i))
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
+            + Sb_inv_N_trans.dot(Sig_alpha_W_i)
+        )
        # Calculating derivative of the log of the prior
-        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        DPx_Dx = (-1 / self.sigma2) * DJ_Dxk
        return DPx_Dx.T

    # def frb(self, x):
@ -1221,9 +1254,7 @@ class DGPLVM_T(Prior):
        return np.random.rand(n)  # A WRONG implementation

    def __str__(self):
-        return 'DGPLVM_prior_Raq_TTT'
-
-
+        return "DGPLVM_prior_Raq_TTT"


 class HalfT(Prior):
@ -1234,6 +1265,7 @@ class HalfT(Prior):
    :param nu: degrees of freedom

    """
+
    domain = _POSITIVE
    _instances = []

@ -1250,13 +1282,22 @@ class HalfT(Prior):
    def __init__(self, A, nu):
        self.A = float(A)
        self.nu = float(nu)
-        self.constant = gammaln(.5*(self.nu+1.)) - gammaln(.5*self.nu) - .5*np.log(np.pi*self.A*self.nu)
+        self.constant = (
+            gammaln(0.5 * (self.nu + 1.0))
+            - gammaln(0.5 * self.nu)
+            - 0.5 * np.log(np.pi * self.A * self.nu)
+        )

    def __str__(self):
        return "hT({:.2g}, {:.2g})".format(self.A, self.nu)

    def lnpdf(self, theta):
-        return (theta > 0) * (self.constant - .5*(self.nu + 1) * np.log(1. + (1./self.nu) * (theta/self.A)**2))
+        return (theta > 0) * (
+            self.constant
+            - 0.5
+            * (self.nu + 1)
+            * np.log(1.0 + (1.0 / self.nu) * (theta / self.A) ** 2)
+        )

        # theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
        # lnpdfs = np.zeros_like(theta)
@ -1268,7 +1309,7 @@ class HalfT(Prior):
        # lnpdfs[above_zero] = (+ gammaln((v + 1) * 0.5)
        #     - gammaln(v * 0.5)
        #     - 0.5*np.log(sigma2 * v * np.pi)
-        #     - 0.5*(v + 1)*np.log(1 + (1/np.float(v))*((theta[above_zero][0]**2)/sigma2))
+        #     - 0.5*(v + 1)*np.log(1 + (1/float(v))*((theta[above_zero][0]**2)/sigma2))
        # )
        # return lnpdfs

@ -1278,12 +1319,18 @@ class HalfT(Prior):
        above_zero = theta > 1e-6
        v = self.nu
        sigma2 = self.A
-        grad[above_zero] = -0.5*(v+1)*(2*theta[above_zero])/(v*sigma2 + theta[above_zero][0]**2)
+        grad[above_zero] = (
+            -0.5
+            * (v + 1)
+            * (2 * theta[above_zero])
+            / (v * sigma2 + theta[above_zero][0] ** 2)
+        )
        return grad

    def rvs(self, n):
        # return np.random.randn(n) * self.sigma + self.mu
        from scipy.stats import t
+
        # [np.abs(x) for x in t.rvs(df=4,loc=0,scale=50, size=10000)])
        ret = t.rvs(self.nu, loc=0, scale=self.A, size=n)
        ret[ret < 0] = 0
@ -1298,6 +1345,7 @@ class Exponential(Prior):
    :param l: shape parameter

    """
+
    domain = _POSITIVE
    _instances = []

@ -1318,22 +1366,25 @@ class Exponential(Prior):
        return "Exp({:.2g})".format(self.l)

    def summary(self):
-        ret = {"E[x]": 1. / self.l,
-               "E[ln x]": np.nan,
-               "var[x]": 1. / self.l**2,
-               "Entropy": 1. - np.log(self.l),
-               "Mode": 0.}
+        ret = {
+            "E[x]": 1.0 / self.l,
+            "E[ln x]": np.nan,
+            "var[x]": 1.0 / self.l**2,
+            "Entropy": 1.0 - np.log(self.l),
+            "Mode": 0.0,
+        }
        return ret

    def lnpdf(self, x):
        return np.log(self.l) - self.l * x

    def lnpdf_grad(self, x):
-        return - self.l
+        return -self.l

    def rvs(self, n):
        return np.random.exponential(scale=self.l, size=n)

+
 class StudentT(Prior):
    """
    Implementation of the student t probability function, coupled with random variables.
@ -1345,6 +1396,7 @@ class StudentT(Prior):
    .. Note:: Bishop 2006 notation is used throughout the code

    """
+
    domain = _REAL
    _instances = []

@ -1352,7 +1404,11 @@ class StudentT(Prior):
        if cls._instances:
            cls._instances[:] = [instance for instance in cls._instances if instance()]
            for instance in cls._instances:
-                if instance().mu == mu and instance().sigma == sigma and instance().nu == nu:
+                if (
+                    instance().mu == mu
+                    and instance().sigma == sigma
+                    and instance().nu == nu
+                ):
                    return instance()
        newfunc = super(Prior, cls).__new__
        if newfunc is object.__new__:
@ -1373,13 +1429,18 @@ class StudentT(Prior):

    def lnpdf(self, x):
        from scipy.stats import t
-        return t.logpdf(x,self.nu,self.mu,self.sigma)
+
+        return t.logpdf(x, self.nu, self.mu, self.sigma)

    def lnpdf_grad(self, x):
-        return -(self.nu + 1.)*(x - self.mu)/( self.nu*self.sigma2 + np.square(x - self.mu) )
+        return (
+            -(self.nu + 1.0)
+            * (x - self.mu)
+            / (self.nu * self.sigma2 + np.square(x - self.mu))
+        )

    def rvs(self, n):
        from scipy.stats import t
+
        ret = t.rvs(self.nu, loc=self.mu, scale=self.sigma, size=n)
        return ret
-
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -771,3 +771,117 @@ def multioutput_gp_with_derivative_observations(plot=True):
    mu, var = m.predict_noiseless(Xnew=[xpred, np.empty((0, 1))])

    return m
+
+def multioutput_gp_with_derivative_observations_2D(optimize=True, plot=False):
+    '''
+    This in an example on how to use a MultioutputGP model with gradient
+    observations and multiple single-dimensional kernels of differing types.
+    '''
+
+    period = 3
+    w = 2*np.pi/period # angular frequency
+    bounds = (-period, period)
+
+    # latent function and gradient
+    f = lambda x: (np.exp(-x[:,0]**2) + np.cos(w*x[:,1]))[:,None]
+    df = lambda x: np.array([-2*np.exp(-x[:,0]**2)*x[:,0], -w*np.sin(w*x[:,1])]).T
+
+    # 2D input grid
+    ppa = 25 # points per axis
+    x = np.linspace(*bounds, ppa)
+    xx, yy = np.meshgrid(x, x)
+    grid = np.array([xx.reshape(-1), yy.reshape(-1)]).T
+
+    fgrid = f(grid)
+    dfgrid = df(grid)
+
+    # 10 random training points generated with a space-filling sobol sequence
+    X = np.array([
+        [ 0.50421399,  2.1331483 ],
+        [-2.15717152, -1.70295936],
+        [-1.46704334,  1.37111521],
+        [ 2.79064536, -0.9649018 ],
+        [ 1.60728264,  0.27702713],
+        [-0.30712366, -0.57372129],
+        [-2.6140632 ,  2.49192488],
+        [ 0.89078772, -2.85873686],
+        [ 1.15813136,  0.96910322],
+        [-2.83307021, -1.38155383]
+    ])
+
+    # Note!
+    # This example uses the same inputs for function and gradient observations.
+
+    noise_std = 1e-2
+    # function observations
+    Y = f(X) + np.random.normal(scale=noise_std, size=(len(X), 1))
+    # gradient observations
+    dY = df(X) + np.random.normal(scale=noise_std, size=(len(X), 2))
+
+    # gather inputs and observations into lists
+    X_list = [X, X, X]
+    # once for function observations, and once for each partial derivative
+    # make sure all arrays are of shape (N x dims), where N is # of training points
+    Y_list = [Y, dY[:,0,None], dY[:,1,None]]
+
+    # create a kernel that is the product of two one-dimensional kernels
+    # the first kernel is an RBF kernel
+    kern0 = GPy.kern.RBF(input_dim=1, active_dims=[0])
+    # as the function is periodic in the second dimension, we use a StdP kernel
+    kern1 = GPy.kern.StdPeriodic(input_dim=1, active_dims=[1], period=period)
+    kern1.period.constrain_fixed()
+    # the kernels can be multiplied together into a product kernel
+    kern = kern0 * kern1
+
+    # with gradient observations, we need to define a DiffKern for each dimension
+    # the DiffKern is given the main kernel as a base kernel
+    diffkern0 = GPy.kern.DiffKern(kern, 0)
+    diffkern1 = GPy.kern.DiffKern(kern, 1)
+
+    # gather the main kernel and diffkerns into a list
+    kern_list = [kern, diffkern0, diffkern1]
+
+    # define a likelihood and repeat it in a list
+    likelihood_list = [GPy.likelihoods.Gaussian(variance=noise_std**2)]*3
+
+    # create the MultioutputGP model and optimize
+    model = GPy.models.MultioutputGP(X_list, Y_list, kern_list, likelihood_list)
+    model.likelihood.constrain_fixed()
+    if optimize:
+        model.optimize()
+
+    # make function predictions
+    Xnew, _, ind = GPy.util.multioutput.build_XY([grid], index=[0])
+    Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
+
+    mu, var = model.predict(Xnew, Y_metadata=Y_metadata)
+
+    # make gradient predictions
+    Xnew, _, ind = GPy.util.multioutput.build_XY([grid]*2, index=[1, 2])
+    Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
+
+    mu_d, var_d = model.predict(Xnew, Y_metadata=Y_metadata)
+
+    mu_d = np.array([mu_d[:len(grid)], mu_d[len(grid):]]).T[0]
+    var_d = np.array([var_d[:len(grid)], var_d[len(grid):]]).T[0]
+
+    if plot and MPL_AVAILABLE:
+        fig, axs = plt.subplots(1, 3)
+        for ax in axs: ax.set_box_aspect(1)
+        axs[0].set_title('true f')
+        axs[0].contourf(xx, yy, fgrid.reshape(ppa, ppa), levels=25)
+        axs[1].set_title('true df1')
+        axs[1].contourf(xx, yy, dfgrid[:,0].reshape(ppa, ppa), levels=25)
+        axs[2].set_title('true df2')
+        axs[2].contourf(xx, yy, dfgrid[:,1].reshape(ppa, ppa), levels=25)
+
+        fig, axs = plt.subplots(1, 3)
+        for ax in axs: ax.set_box_aspect(1)
+        axs[0].set_title('pred f')
+        axs[0].contourf(xx, yy, mu.reshape(ppa, ppa), levels=25)
+        axs[1].set_title('pred df1')
+        axs[1].contourf(xx, yy, mu_d[:,0].reshape(ppa, ppa), levels=25)
+        axs[2].set_title('pred df2')
+        axs[2].contourf(xx, yy, mu_d[:,1].reshape(ppa, ppa), levels=25)
+
+    return model
--- a/GPy/inference/latent_function_inference/exact_studentt_inference.py
+++ b/GPy/inference/latent_function_inference/exact_studentt_inference.py
@ -35,15 +35,20 @@ class ExactStudentTInference(LatentFunctionInference):
        # Log marginal
        N = Y.shape[0]
        D = Y.shape[1]
-        log_marginal = 0.5 * (-N * np.log((nu - 2) * np.pi) - W_logdet - (nu + N) * np.log(1 + beta / (nu - 2)))
+        log_marginal = 0.5 * (
+            -N * np.log((nu - 2) * np.pi)
+            - W_logdet
+            - (nu + N) * np.log(1 + beta / (nu - 2))
+        )
        log_marginal += gammaln((nu + N) / 2) - gammaln(nu / 2)

        # Gradients
        dL_dK = 0.5 * ((nu + N) / (nu + beta - 2) * tdot(alpha) - D * Wi)
-        dL_dnu = -N / (nu - 2.) + digamma(0.5 * (nu + N)) - digamma(0.5 * nu)
-        dL_dnu -= np.log(1 + beta / (nu - 2.))
+        dL_dnu = -N / (nu - 2.0) + digamma(0.5 * (nu + N)) - digamma(0.5 * nu)
+        dL_dnu -= np.log(1 + beta / (nu - 2.0))
        dL_dnu += ((nu + N) * beta) / ((nu - 2) * (beta + nu - 2))
        dL_dnu *= 0.5
-        gradients = {'dL_dK': dL_dK, 'dL_dnu': dL_dnu, 'dL_dm': alpha}
+        dL_dm = (nu + N) / (nu + beta - 2) * alpha
+        gradients = {"dL_dK": dL_dK, "dL_dnu": dL_dnu, "dL_dm": dL_dm}

        return posterior, log_marginal, gradients
--- a/GPy/kern/src/coregionalize.py
+++ b/GPy/kern/src/coregionalize.py
@ -5,13 +5,16 @@ from .kern import Kern
 import numpy as np
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
-from ...util.config import config # for assesing whether to use cython
+from ...util.config import config  # for assesing whether to use cython

 try:
    from . import coregionalize_cython
-    use_coregionalize_cython = config.getboolean('cython', 'working')
+
+    use_coregionalize_cython = config.getboolean("cython", "working")
 except ImportError:
-    print('warning in coregionalize: failed to import cython module: falling back to numpy')
+    print(
+        "warning in coregionalize: failed to import cython module: falling back to numpy"
+    )
    use_coregionalize_cython = False


@ -43,22 +46,34 @@ class Coregionalize(Kern):

    .. note: see coregionalization examples in GPy.examples.regression for some usage.
    """
-    def __init__(self, input_dim, output_dim, rank=1, W=None, kappa=None, active_dims=None, name='coregion'):
+
+    def __init__(
+        self,
+        input_dim,
+        output_dim,
+        rank=1,
+        W=None,
+        kappa=None,
+        active_dims=None,
+        name="coregion",
+    ):
        super(Coregionalize, self).__init__(input_dim, active_dims, name=name)
        self.output_dim = output_dim
        self.rank = rank
-        if self.rank>output_dim:
-            print("Warning: Unusual choice of rank, it should normally be less than the output_dim.")
+        if self.rank > output_dim:
+            print(
+                "Warning: Unusual choice of rank, it should normally be less than the output_dim."
+            )
        if W is None:
-            W = 0.5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
+            W = 0.5 * np.random.randn(self.output_dim, self.rank) / np.sqrt(self.rank)
        else:
-            assert W.shape==(self.output_dim, self.rank)
-        self.W = Param('W', W)
+            assert W.shape == (self.output_dim, self.rank)
+        self.W = Param("W", W)
        if kappa is None:
-            kappa = 0.5*np.ones(self.output_dim)
+            kappa = 0.5 * np.ones(self.output_dim)
        else:
-            assert kappa.shape==(self.output_dim, )
-        self.kappa = Param('kappa', kappa, Logexp())
+            assert kappa.shape == (self.output_dim,)
+        self.kappa = Param("kappa", kappa, Logexp())
        self.link_parameters(self.W, self.kappa)

    def parameters_changed(self):
@ -70,63 +85,69 @@ class Coregionalize(Kern):
        else:
            return self._K_numpy(X, X2)

-
    def _K_numpy(self, X, X2=None):
-        index = np.asarray(X, dtype=np.int)
+        index = np.asarray(X, dtype=int)
        if X2 is None:
-            return self.B[index,index.T]
+            return self.B[index, index.T]
        else:
-            index2 = np.asarray(X2, dtype=np.int)
-            return self.B[index,index2.T]
+            index2 = np.asarray(X2, dtype=int)
+            return self.B[index, index2.T]

    def _K_cython(self, X, X2=None):
        if X2 is None:
-            return coregionalize_cython.K_symmetric(self.B, np.asarray(X, dtype=np.int64)[:,0])
-        return coregionalize_cython.K_asymmetric(self.B, np.asarray(X, dtype=np.int64)[:,0], np.asarray(X2, dtype=np.int64)[:,0])
-
+            return coregionalize_cython.K_symmetric(
+                self.B, np.asarray(X, dtype=np.int64)[:, 0]
+            )
+        return coregionalize_cython.K_asymmetric(
+            self.B,
+            np.asarray(X, dtype=np.int64)[:, 0],
+            np.asarray(X2, dtype=np.int64)[:, 0],
+        )

    def Kdiag(self, X):
-        return np.diag(self.B)[np.asarray(X, dtype=np.int).flatten()]
+        return np.diag(self.B)[np.asarray(X, dtype=int).flatten()]

    def update_gradients_full(self, dL_dK, X, X2=None):
-        index = np.asarray(X, dtype=np.int)
+        index = np.asarray(X, dtype=int)
        if X2 is None:
            index2 = index
        else:
-            index2 = np.asarray(X2, dtype=np.int)
+            index2 = np.asarray(X2, dtype=int)

-        #attempt to use cython for a nasty double indexing loop: fall back to numpy
+        # attempt to use cython for a nasty double indexing loop: fall back to numpy
        if use_coregionalize_cython:
            dL_dK_small = self._gradient_reduce_cython(dL_dK, index, index2)
        else:
            dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)

-
        dkappa = np.diag(dL_dK_small).copy()
        dL_dK_small += dL_dK_small.T
-        dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
+        dW = (self.W[:, None, :] * dL_dK_small[:, :, None]).sum(0)

        self.W.gradient = dW
        self.kappa.gradient = dkappa

    def _gradient_reduce_numpy(self, dL_dK, index, index2):
-        index, index2 = index[:,0], index2[:,0]
+        index, index2 = index[:, 0], index2[:, 0]
        dL_dK_small = np.zeros_like(self.B)
        for i in range(self.output_dim):
-            tmp1 = dL_dK[index==i]
+            tmp1 = dL_dK[index == i]
            for j in range(self.output_dim):
-                dL_dK_small[j,i] = tmp1[:,index2==j].sum()
+                dL_dK_small[j, i] = tmp1[:, index2 == j].sum()
        return dL_dK_small

    def _gradient_reduce_cython(self, dL_dK, index, index2):
-        index, index2 = np.int64(index[:,0]), np.int64(index2[:,0])
-        return coregionalize_cython.gradient_reduce(self.B.shape[0], dL_dK, index, index2)
-
+        index, index2 = np.int64(index[:, 0]), np.int64(index2[:, 0])
+        return coregionalize_cython.gradient_reduce(
+            self.B.shape[0], dL_dK, index, index2
+        )

    def update_gradients_diag(self, dL_dKdiag, X):
-        index = np.asarray(X, dtype=np.int).flatten()
-        dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in range(self.output_dim)])
-        self.W.gradient = 2.*self.W*dL_dKdiag_small[:, None]
+        index = np.asarray(X, dtype=int).flatten()
+        dL_dKdiag_small = np.array(
+            [dL_dKdiag[index == i].sum() for i in range(self.output_dim)]
+        )
+        self.W.gradient = 2.0 * self.W * dL_dKdiag_small[:, None]
        self.kappa.gradient = dL_dKdiag_small

    def gradients_X(self, dL_dK, X, X2=None):
@ -154,8 +175,8 @@ class Coregionalize(Kern):

    @staticmethod
    def _build_from_input_dict(kernel_class, input_dict):
-        useGPU = input_dict.pop('useGPU', None)
+        useGPU = input_dict.pop("useGPU", None)
        # W and kappa must be converted back to numpy arrays
-        input_dict['W'] = np.array(input_dict['W'])
-        input_dict['kappa'] = np.array(input_dict['kappa'])
+        input_dict["W"] = np.array(input_dict["W"])
+        input_dict["kappa"] = np.array(input_dict["kappa"])
        return Coregionalize(**input_dict)
--- a/GPy/kern/src/coregionalize_cython.c
+++ b/GPy/kern/src/coregionalize_cython.c
--- a/GPy/kern/src/diff_kern.py
+++ b/GPy/kern/src/diff_kern.py
@ -23,24 +23,42 @@ class DiffKern(Kern):
        self.base_kern.parameters_changed()

    @Cache_this(limit=3, ignore_args=())
-    def K(self, X, X2=None, dimX2 = None): #X in dimension self.dimension
+    def K(self, X, X2=None, dimX2=None): #X in dimension self.dimension
        if X2 is None:
            X2 = X
        if dimX2 is None:
            dimX2 = self.dimension
-        return self.base_kern.dK2_dXdX2(X,X2, self.dimension, dimX2)
- 
+        return self.base_kern.dK2_dXdX2(X, X2, self.dimension, dimX2)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dX(self, X, X2, dimX, dimX2=None):
+        if dimX2 is None:
+            dimX2 = self.dimension
+        return self.base_kern.dK3_dXdXdX2(X, X2, dimX, self.dimension, dimX2)
+
    @Cache_this(limit=3, ignore_args=())
    def Kdiag(self, X):
-        return np.diag(self.base_kern.dK2_dXdX2(X,X, self.dimension, self.dimension))
-    
+        return self.base_kern.dK2_dXdX2diag(X, self.dimension, self.dimension)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dXdiag(self, X, dimX):
+        return self.base_kern.dK3_dXdXdX2diag(X, dimX, self.dimension, self.dimension)
+
    @Cache_this(limit=3, ignore_args=())
    def dK_dX_wrap(self, X, X2): #X in dimension self.dimension
-        return self.base_kern.dK_dX(X,X2, self.dimension)
+        return self.base_kern.dK_dX(X, X2, self.dimension)

    @Cache_this(limit=3, ignore_args=())
    def dK_dX2_wrap(self, X, X2): #X in dimension self.dimension
-        return self.base_kern.dK_dX2(X,X2, self.dimension)
+        return self.base_kern.dK_dX2(X, X2, self.dimension)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dXdX2_wrap(self, X, X2, dimX):
+        return self.base_kern.dK2_dXdX2(X, X2, dimX, self.dimension)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dXdX_wrap(self, X, X2, dimX):
+        return self.base_kern.dK2_dXdX(X, X2, dimX, self.dimension)

    def reset_gradients(self):
        self.base_kern.reset_gradients()
@ -56,33 +74,33 @@ class DiffKern(Kern):
    def update_gradients_full(self, dL_dK, X, X2=None, dimX2=None):
        if dimX2 is None:
            dimX2 = self.dimension
-        gradients = self.base_kern.dgradients2_dXdX2(X,X2,self.dimension,dimX2)
+        gradients = self.base_kern.dgradients2_dXdX2(X, X2, self.dimension, dimX2)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])

    def update_gradients_diag(self, dL_dK_diag, X):
-        gradients = self.base_kern.dgradients2_dXdX2(X,X, self.dimension, self.dimension)
+        gradients = self.base_kern.dgradients2_dXdX2(X, X, self.dimension, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK_diag, gradient, f=np.diag) for gradient in gradients])

    def update_gradients_dK_dX(self, dL_dK, X, X2=None):
        if X2 is None:
            X2 = X
-        gradients = self.base_kern.dgradients_dX(X,X2, self.dimension)
+        gradients = self.base_kern.dgradients_dX(X, X2, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])

    def update_gradients_dK_dX2(self, dL_dK, X, X2=None):
-        gradients = self.base_kern.dgradients_dX2(X,X2, self.dimension)
+        gradients = self.base_kern.dgradients_dX2(X, X2, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])

    def gradients_X(self, dL_dK, X, X2):
-        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,:, self.dimension]
+        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,:,self.dimension]
        return np.sum(tmp, axis=1)
    
    def gradients_X2(self, dL_dK, X, X2):
-        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:, :, self.dimension, :]
+        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,self.dimension,:]
        return np.sum(tmp, axis=1)

-    def _convert_gradients(self, l,g, f = lambda x:x):
+    def _convert_gradients(self, l, g, f=lambda x:x):
        if type(g) is np.ndarray:
            return np.sum(f(l)*f(g))
        else:
-            return np.array([np.sum(f(l)*f(gi)) for gi in g])
+            return np.array([np.sum(f(l)*f(gi)) for gi in g])
--- a/GPy/kern/src/eq_ode1.py
+++ b/GPy/kern/src/eq_ode1.py
--- a/GPy/kern/src/eq_ode2.py
+++ b/GPy/kern/src/eq_ode2.py
--- a/GPy/kern/src/kernel_slice_operations.py
+++ b/GPy/kern/src/kernel_slice_operations.py
@ -22,7 +22,14 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
        put_clean(dct, 'dK_dX', _slice_dK_dX)
        put_clean(dct, 'dK_dX2', _slice_dK_dX)
        put_clean(dct, 'dK2_dXdX2', _slice_dK2_dXdX2)
+        put_clean(dct, 'dK2_dXdX', _slice_dK2_dXdX2)
+        put_clean(dct, 'dK3_dXdXdX2', _slice_dK3_dXdXdX2)
        put_clean(dct, 'Kdiag', _slice_Kdiag)
+        put_clean(dct, 'dK_dXdiag', _slice_dK_dXdiag)
+        put_clean(dct, 'dK_dX2diag', _slice_dK_dXdiag)
+        put_clean(dct, 'dK2_dXdX2diag', _slice_dK2_dXdX2diag)
+        put_clean(dct, 'dK2_dXdXdiag', _slice_dK2_dXdX2diag)
+        put_clean(dct, 'dK3_dXdXdX2diag', _slice_dK3_dXdXdX2diag)
        put_clean(dct, 'phi', _slice_Kdiag)
        put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
        put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
@ -35,9 +42,10 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
        put_clean(dct, 'gradients_XX_diag', _slice_gradients_XX_diag)
        put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)

-        put_clean(dct, 'dgradients_dX',_slice_partial_gradients_list_X)
-        put_clean(dct, 'dgradients_dX2',_slice_partial_gradients_list_X)
-        put_clean(dct, 'dgradients2_dXdX2',_slice_partial_gradients_list_XX)
+        put_clean(dct, 'dgradients', _slice_partial_gradients_list)
+        put_clean(dct, 'dgradients_dX', _slice_partial_gradients_list_X)
+        put_clean(dct, 'dgradients_dX2', _slice_partial_gradients_list_X)
+        put_clean(dct, 'dgradients2_dXdX2', _slice_partial_gradients_list_XX)

        put_clean(dct, 'psi0', _slice_psi)
        put_clean(dct, 'psi1', _slice_psi)
@ -155,6 +163,18 @@ def _slice_dK_dX(f):
        return ret
    return wrap

+def _slice_dK_dXdiag(f):
+    @wraps(f)
+    def wrap(self, X, dim, *a, **kw):
+        with _Slice_wrap(self, X, None) as s:
+            d = s.k._project_dim(dim)
+            if d is None:
+                ret = np.zeros(X.shape[0])
+            else:
+                ret = f(self, s.X, dim, *a, **kw)
+        return ret
+    return wrap
+
 def _slice_dK2_dXdX2(f):
    @wraps(f)
    def wrap(self, X, X2, dimX, dimX2, *a, **kw):
@ -168,6 +188,59 @@ def _slice_dK2_dXdX2(f):
        return ret
    return wrap

+def _slice_dK2_dXdX2diag(f):
+    @wraps(f)
+    def wrap(self, X, dimX, dimX2, *a, **kw):
+        with _Slice_wrap(self, X, None) as s:
+            d = s.k._project_dim(dimX)
+            d2 = s.k._project_dim(dimX2)
+            if (d is None) or (d2 is None):
+                ret = np.zeros(X.shape[0])
+            else:
+                ret = f(self, s.X, d, d2, *a, **kw)
+        return ret
+    return wrap
+
+def _slice_dK3_dXdXdX2(f):
+    @wraps(f)
+    def wrap(self, X, X2, dim, dimX, dimX2, *a, **kw):
+        with _Slice_wrap(self, X, X2) as s:
+            D = s.k._project_dim(dim)
+            d = s.k._project_dim(dimX)
+            d2 = s.k._project_dim(dimX2)
+            if (D is None) or (d is None) or (d2 is None):
+                ret = np.zeros((X.shape[0], X2.shape[0]))
+            else:
+                ret = f(self, s.X, s.X2, D, d, d2, *a, **kw)
+        return ret
+    return wrap
+
+def _slice_dK3_dXdXdX2diag(f):
+    @wraps(f)
+    def wrap(self, X, dim, dimX, dimX2, *a, **kw):
+        with _Slice_wrap(self, X, None) as s:
+            D = s.k._project_dim(dim)
+            d = s.k._project_dim(dimX)
+            d2 = s.k._project_dim(dimX2)
+            if (D is None) or (d is None) or (d2 is None):
+                ret = np.zeros(X.shape[0])
+            else:
+                ret = f(self, s.X, D, d, d2, *a, **kw)
+        return ret
+    return wrap
+
+def _slice_partial_gradients_list(f):
+    @wraps(f)
+    def wrap(self, X, X2):
+        if X2 is None:
+            N, M = X.shape[0], X.shape[0]
+        else:
+            N, M = X.shape[0], X2.shape[0]
+        with _Slice_wrap(self, X, X2, ret_shape=(N, M)) as s:
+            ret = f(self, s.X, s.X2)
+        return ret
+    return wrap
+
 def _slice_partial_gradients_X(f):
    @wraps(f)
    def wrap(self, X, X2, dim):
--- a/GPy/kern/src/multioutput_derivative_kern.py
+++ b/GPy/kern/src/multioutput_derivative_kern.py
@ -7,20 +7,24 @@ import numpy as np
 from functools import partial

 class KernWrapper(Kern):
-    def __init__(self, fk, fug, fg, base_kern):
+    def __init__(self, fk, fdk, fug, fg, base_kern):
        self.fk = fk
+        self.fdk = fdk
        self.fug = fug
        self.fg = fg
        self.base_kern = base_kern
-        super(KernWrapper, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='KernWrapper',useGPU=False)
+        super(KernWrapper, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='KernWrapper', useGPU=False)

    def K(self, X, X2=None):
-        return self.fk(X,X2=X2)
+        return self.fk(X, X2=X2)
+
+    def dK_dX(self, X, X2, dimX):
+        return self.fdk(X, X2, dimX)
    
-    def update_gradients_full(self,dL_dK, X, X2=None):
+    def update_gradients_full(self, dL_dK, X, X2=None):
        return self.fug(dL_dK, X, X2=X2)
    
-    def gradients_X(self,dL_dK, X, X2=None):
+    def gradients_X(self, dL_dK, X, X2=None):
        return self.fg(dL_dK, X, X2=X2)

    @property
@ -57,28 +61,46 @@ class MultioutputDerivativeKern(MultioutputKern):
        #build covariance structure
        covariance = [[None for i in range(nl)] for j in range(nl)]
        linked = []
-        for i in range(0,nl):
-            unique=True
-            for j in range(0,nl):
-                if i==j or (kernels[i] is kernels[j]):
+        for i in range(0, nl):
+            unique = True
+            for j in range(0, nl):
+                if (i == j) or (kernels[i] is kernels[j]):
                    kern = kernels[i]
-                    if i>j:
-                        unique=False
+                    if i > j:
+                        unique = False
                elif cross_covariances.get((i,j)) is not None: #cross covariance is given
                    kern = cross_covariances.get((i,j))
-                elif kernels[i].name == 'DiffKern' and kernels[i].base_kern == kernels[j]: # one is derivative of other
-                    kern = KernWrapper(kernels[i].dK_dX_wrap,kernels[i].update_gradients_dK_dX,kernels[i].gradients_X, kernels[j])
+                elif (kernels[i].name == 'DiffKern') and (kernels[i].base_kern == kernels[j]): # one is derivative of other
+                    kern = KernWrapper(
+                        kernels[i].dK_dX_wrap,
+                        kernels[i].dK2_dXdX_wrap,
+                        kernels[i].update_gradients_dK_dX,
+                        kernels[i].gradients_X,
+                        kernels[j]
+                        )
                    unique=False
-                elif kernels[j].name == 'DiffKern' and kernels[j].base_kern == kernels[i]: # one is derivative of other
-                    kern = KernWrapper(kernels[j].dK_dX2_wrap,kernels[j].update_gradients_dK_dX2,kernels[j].gradients_X2, kernels[i])
-                elif kernels[i].name == 'DiffKern' and kernels[j].name == 'DiffKern' and kernels[i].base_kern == kernels[j].base_kern: #both are partial derivatives
-                    kern = KernWrapper(partial(kernels[i].K, dimX2=kernels[j].dimension), partial(kernels[i].update_gradients_full, dimX2=kernels[j].dimension),None, kernels[i].base_kern)
-                    if i>j:
-                        unique=False
+                elif (kernels[j].name == 'DiffKern') and (kernels[j].base_kern == kernels[i]): # one is derivative of other
+                    kern = KernWrapper(
+                        kernels[j].dK_dX2_wrap,
+                        kernels[j].dK2_dXdX2_wrap,
+                        kernels[j].update_gradients_dK_dX2,
+                        kernels[j].gradients_X2,
+                        kernels[i]
+                        )
+                elif (kernels[i].name == 'DiffKern') and (kernels[j].name == 'DiffKern') and (kernels[i].base_kern == kernels[j].base_kern): #both are partial derivatives
+                    kern = KernWrapper(
+                        partial(kernels[i].K, dimX2=kernels[j].dimension),
+                        partial(kernels[i].dK_dX, dimX2=kernels[j].dimension),
+                        partial(kernels[i].update_gradients_full, dimX2=kernels[j].dimension),
+                        None,
+                        kernels[i].base_kern
+                        )
+                    if i > j:
+                        unique = False
                else:
                    kern = ZeroKern()
                covariance[i][j] = kern
            if unique is True:
                linked.append(i)
        self.covariance = covariance
-        self.link_parameters(*[kernels[i] for i in linked])
+        self.link_parameters(*[kernels[i] for i in linked])
--- a/GPy/kern/src/multioutput_kern.py
+++ b/GPy/kern/src/multioutput_kern.py
@ -85,21 +85,63 @@ class MultioutputKern(CombinationKernel):
        self.link_parameters(*[kernels[i] for i in linked])
        
    @Cache_this(limit=3, ignore_args=())
-    def K(self, X ,X2=None):
+    def K(self, X, X2=None):
        if X2 is None:
            X2 = X
        slices = index_to_slices(X[:,self.index_dim])
        slices2 = index_to_slices(X2[:,self.index_dim])
+
        target =  np.zeros((X.shape[0], X2.shape[0]))
-        [[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j].K(X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))]  
+        for j in range(len(slices2)):
+            for i in range(len(slices)):
+                for l in range(len(slices2[j])):
+                    for k in range(len(slices[i])):
+                        cov_K = self.covariance[i][j].K(X[slices[i][k],:], X2[slices2[j][l],:])
+                        target.__setitem__((slices[i][k], slices2[j][l]), cov_K)
        return target

    @Cache_this(limit=3, ignore_args=())
-    def Kdiag(self,X):
+    def Kdiag(self, X):
        slices = index_to_slices(X[:,self.index_dim])
        kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
        target = np.zeros(X.shape[0])
-        [[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)]
+        for kern, slices_i in zip(kerns, slices):
+            for s in slices_i:
+                np.copyto(target[s], kern.Kdiag(X[s]))
+        return target
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dX(self, X, X2, dimX):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+        """
+        if X2 is None:
+            X2 = X
+        slices = index_to_slices(X[:,self.index_dim])
+        slices2 = index_to_slices(X2[:,self.index_dim])
+
+        target =  np.zeros((X.shape[0], X2.shape[0]))
+        for j in range(len(slices2)):
+            for i in range(len(slices)):
+                for l in range(len(slices2[j])):
+                    for k in range(len(slices[i])):
+                        cov_dK_dX = self.covariance[i][j].dK_dX(X[slices[i][k],:], X2[slices2[j][l],:], dimX)
+                        target.__setitem__((slices[i][k], slices2[j][l]), cov_dK_dX)
+        return target
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dXdiag(self, X, dimX):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+        """
+        slices = index_to_slices(X[:,self.index_dim])
+        kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
+        target = np.zeros(X.shape[0])
+        for kern, slices_i in zip(kerns, slices):
+            for s in slices_i:
+                np.copyto(target[s], kern.dK_dXdiag(X[s], dimX))
        return target
    
    def _update_gradients_full_wrapper(self, kern, dL_dK, X, X2):
@ -115,19 +157,35 @@ class MultioutputKern(CombinationKernel):
    def reset_gradients(self):
        for kern in self.kern: kern.reset_gradients()

-    def update_gradients_full(self,dL_dK, X, X2=None):
-        self.reset_gradients()
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        if X2 is None:
+            X2 = X
        slices = index_to_slices(X[:,self.index_dim])
-        if X2 is not None:
-            slices2 = index_to_slices(X2[:,self.index_dim])
-            [[[[ self._update_gradients_full_wrapper(self.covariance[i][j], dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
-        else:
-            [[[[ self._update_gradients_full_wrapper(self.covariance[i][j], dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], X[slices[j][l],:]) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
-            
+        slices2 = index_to_slices(X2[:,self.index_dim])
+
+        self.reset_gradients()
+        for j in range(len(slices2)):
+            for i in range(len(slices)):
+                for l in range(len(slices2[j])):
+                    for k in range(len(slices[i])):
+                        self._update_gradients_full_wrapper(
+                            self.covariance[i][j],
+                            dL_dK[slices[i][k],slices2[j][l]],
+                            X[slices[i][k],:],
+                            X2[slices2[j][l],:]
+                            )
+
    def update_gradients_diag(self, dL_dKdiag, X):
-        self.reset_gradients()
        slices = index_to_slices(X[:,self.index_dim])
-        [[ self._update_gradients_diag_wrapper(self.covariance[i][i], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))]
+
+        self.reset_gradients()
+        for i in range(len(slices)):
+            for k in range(len(slices[i])):
+                self._update_gradients_diag_wrapper(
+                    self.covariance[i][i],
+                    dL_dKdiag[slices[i][k]],
+                    X[slices[i][k],:]
+                    )
    
    def gradients_X(self,dL_dK, X, X2=None):
        slices = index_to_slices(X[:,self.index_dim])
@ -137,4 +195,4 @@ class MultioutputKern(CombinationKernel):
            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) ) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
        else:
            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], (None if (i==j and k==l) else X[slices[j][l],:] )) ) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
-        return target
+        return target
--- a/GPy/kern/src/prod.py
+++ b/GPy/kern/src/prod.py
@ -70,6 +70,310 @@ class Prod(CombinationKernel):
            which_parts = self.parts
        return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))

+    def reset_gradients(self):
+        for part in self.parts:
+            part.reset_gradients()
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK_dX(self, X, X2, dimX, which_parts=None):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
+        for combination in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination) > 0:
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update = list(set(which_parts) - set(combination))[0]
+            prod_sum += prod*to_update.dK_dX(X, X2, dimX)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK_dXdiag(self, X, dimX, which_parts=None):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+
+        Returns only diagonal elements.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros(X.shape[0])
+        for combination in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination) > 0:
+                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update = list(set(which_parts) - set(combination))[0]
+            prod_sum += prod*to_update.dK_dXdiag(X, dimX)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK_dX2(self, X, X2, dimX2, which_parts=None):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX2 of set X2.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
+        for combination in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination) > 0:
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update = list(set(which_parts) - set(combination))[0]
+            prod_sum += prod*to_update.dK_dX2(X, X2, dimX2)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK2_dXdX2(self, X, X2, dimX, dimX2, which_parts=None):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
+        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination1) > 0:
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination1])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update1 = list(set(which_parts) - set(combination1))[0]
+            prod_sum += prod*to_update1.dK2_dXdX2(X, X2, dimX, dimX2)
+            if len(which_parts) > 1:
+                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
+                    if len(combination2) > 0:
+                        prod = reduce(np.multiply, [p.K(X, X2) for p in combination2])
+                    else:
+                        prod = np.ones(prod_sum.shape)
+                    to_update2 = list(set(combination1) - set(combination2))[0]
+                    prod_sum += prod*to_update1.dK_dX(X, X2, dimX)*to_update2.dK_dX2(X, X2, dimX2)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK2_dXdX2diag(self, X, dimX, dimX2, which_parts=None):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros(X.shape[0])
+        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination1) > 0:
+                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination1])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update1 = list(set(which_parts) - set(combination1))[0]
+            prod_sum += prod*to_update1.dK2_dXdX2diag(X, dimX, dimX2)
+            if len(which_parts) > 1:
+                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
+                    if len(combination2) > 0:
+                        prod = reduce(np.multiply, [p.Kdiag(X) for p in combination2])
+                    else:
+                        prod = np.ones(prod_sum.shape)
+                    to_update2 = list(set(combination1) - set(combination2))[0]
+                    prod_sum += prod*to_update1.dK_dXdiag(X, dimX)*to_update2.dK_dX2diag(X, dimX)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK2_dXdX(self, X, X2, dimX_0, dimX_1, which_parts=None):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX_0 of set X, and
+            dimension dimX_1 of set X.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
+        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination1) > 0:
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination1])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update1 = list(set(which_parts) - set(combination1))[0]
+            prod_sum += prod*to_update1.dK2_dXdX(X, X2, dimX_0, dimX_1)
+            if len(which_parts) > 1:
+                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
+                    if len(combination2) > 0:
+                        prod = reduce(np.multiply, [p.K(X, X2) for p in combination2])
+                    else:
+                        prod = np.ones(prod_sum.shape)
+                    to_update2 = list(set(combination1) - set(combination2))[0]
+                    prod_sum += prod*to_update1.dK_dX(X, X2, dimX_0)*to_update2.dK_dX(X, X2, dimX_1)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK3_dXdXdX2(self, X, X2, dimX_0, dimX_1, dimX2, which_parts=None):
+        """
+        Compute the third derivative of K with respect to:
+            dimension dimX_0 of set X,
+            dimension dimX_1 of set X, and
+            dimension dimX2 of set X2.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
+        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination1) > 0:
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination1])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update1 = list(set(which_parts) - set(combination1))[0]
+            prod_sum += prod*to_update1.dK3_dXdXdX2(X, X2, dimX_0, dimX_1, dimX2)
+            if len(which_parts) > 1:
+                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
+                    if len(combination2) > 0:
+                        prod = reduce(np.multiply, [p.K(X, X2) for p in combination2])
+                    else:
+                        prod = np.ones(prod_sum.shape)
+                    to_update2 = list(set(combination1) - set(combination2))[0]
+                    prod_sum += prod*to_update1.dK2_dXdX2(X, X2, dimX_0, dimX2)*to_update2.dK_dX(X, X2, dimX_1)
+                    prod_sum += prod*to_update1.dK2_dXdX(X, X2, dimX_0, dimX_1)*to_update2.dK_dX2(X, X2, dimX2)
+                    prod_sum += prod*to_update1.dK_dX(X, X2, dimX_0)*to_update2.dK2_dXdX2(X, X2, dimX_1, dimX2)
+                    if len(which_parts) > 2:
+                        for combination3 in itertools.combinations(combination2, len(combination2) - 1):
+                            if len(combination3) > 0:
+                                prod = reduce(np.multiply, [p.K(X, X2) for p in combination3])
+                            else:
+                                prod = np.ones(prod_sum.shape)
+                            to_update3 = list(set(combination2) - set(combination3))[0]
+                            prod_sum += prod*to_update1.dK_dX(X, X2, dimX_0)*to_update2.dK_dX2(X, X2, dimX2)*to_update3.dK_dX(X, X2, dimX_1)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK3_dXdXdX2diag(self, X, dimX_0, dimX_1, dimX2, which_parts=None):
+        """
+        Compute the third derivative of K with respect to:
+            dimension dimX_0 of set X,
+            dimension dimX_1 of set X, and
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements of the covariance matrix.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros(X.shape[0])
+        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination1) > 0:
+                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination1])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update1 = list(set(which_parts) - set(combination1))[0]
+            prod_sum += prod*to_update1.dK3_dXdXdX2diag(X, dimX_0, dimX_1, dimX2)
+            if len(which_parts) > 1:
+                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
+                    if len(combination2) > 0:
+                        prod = reduce(np.multiply, [p.Kdiag(X) for p in combination2])
+                    else:
+                        prod = np.ones(prod_sum.shape)
+                    to_update2 = list(set(combination1) - set(combination2))[0]
+                    prod_sum += prod*to_update1.dK2_dXdX2diag(X, dimX_0, dimX2)*to_update2.dK_dXdiag(X, dimX_1)
+                    prod_sum += prod*to_update1.dK2_dXdXdiag(X, dimX_0, dimX_1)*to_update2.dK_dX2diag(X, dimX2)
+                    prod_sum += prod*to_update1.dK_dXdiag(X, dimX_0)*to_update2.dK2_dXdX2diag(X, dimX_1, dimX2)
+                    if len(which_parts) > 2:
+                        for combination3 in itertools.combinations(combination2, len(combination2) - 1):
+                            if len(combination3) > 0:
+                                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination3])
+                            else:
+                                prod = np.ones(prod_sum.shape)
+                            to_update3 = list(set(combination2) - set(combination3))[0]
+                            prod_sum += prod*to_update1.dK_dXdiag(X, dimX_0)*to_update2.dK_dX2diag(X, dimX2)*to_update3.dK_dXdiag(X, dimX_1)
+        return prod_sum
+
+    def update_gradients_direct(self, *args):
+        for i, (g,p) in enumerate(zip(args, self.parts)):
+            p.update_gradients_direct(*g)
+
+    def dgradients_dX(self, X, X2, dimX, parts=None):
+        """
+        Compute the hyperparameter gradients of:
+            the derivative of K with respect to dimension dimX of set X
+            ("dK_dX").
+        """
+        if parts is None:
+            parts = self.parts
+        gradients = []
+        for part in parts:
+            neq_parts = [p for p in parts if p is not part]
+
+            if len(neq_parts) > 0:
+                K = self.K(X, X2, which_parts=neq_parts)
+                K_dx = self.dK_dX(X, X2, dimX, which_parts=neq_parts)
+            else:
+                K = np.ones((X.shape[0], X2.shape[0]))
+                K_dx = np.zeros((X.shape[0], X2.shape[0]))
+
+            g = part.dgradients(X, X2)
+            g_dx = part.dgradients_dX(X, X2, dimX)
+
+            gradients += [[(g_i*K_dx + g_dx_i*K) for (g_i, g_dx_i) in zip(g, g_dx)]]
+            
+        return gradients
+
+    def dgradients_dX2(self, X, X2, dimX2, parts=None):
+        """
+        Compute the hyperparameter gradients of:
+            the derivative of K with respect to dimension dimX2 of set X2
+            ("dK_dX2").
+        """
+        if parts is None:
+            parts = self.parts
+        gradients = []
+        for part in parts:
+            neq_parts = [p for p in parts if p is not part]
+
+            if len(neq_parts) > 0:
+                K = self.K(X, X2, which_parts=neq_parts)
+                K_dx2 = self.dK_dX2(X, X2, dimX2, which_parts=neq_parts)
+            else:
+                K = np.ones((X.shape[0], X2.shape[0]))
+                K_dx2 = np.zeros((X.shape[0], X2.shape[0]))
+
+            g = part.dgradients(X, X2)
+            g_dx2 = part.dgradients_dX2(X, X2, dimX2)
+
+            gradients += [[(g_i*K_dx2 + g_dx2_i*K) for (g_i, g_dx2_i) in zip(g, g_dx2)]]
+            
+        return gradients
+
+    def dgradients2_dXdX2(self, X, X2, dimX, dimX2, parts=None):
+        """
+        Compute the hyperparameter gradients of:
+            the second derivative of K with respect to:
+                dimension dimX of set X, and
+                dimension dimX2 of set X2
+            ("dK2_dXdX2").
+        """
+        if parts is None:
+            parts = self.parts
+        gradients = []
+        for part in parts:
+            neq_parts = [p for p in parts if p is not part]
+
+            K = self.K(X, X2, which_parts=neq_parts)
+            K_dx = self.dK_dX(X, X2, dimX, which_parts=neq_parts)
+            K_dx2 = self.dK_dX2(X, X2, dimX2, which_parts=neq_parts)
+            K_dxdx2 = self.dK2_dXdX2(X, X2, dimX, dimX2, which_parts=neq_parts)
+
+            g = part.dgradients(X, X2)
+            g_dx = part.dgradients_dX(X, X2, dimX)
+            g_dx2 = part.dgradients_dX2(X, X2, dimX2)
+            g_dxdx2 = part.dgradients2_dXdX2(X, X2, dimX, dimX2)
+
+            gradients += [[(g_i*K_dxdx2 + g_dx_i*K_dx2 + g_dx2_i*K_dx + g_dxdx2_i*K) for (g_i, g_dx_i, g_dx2_i, g_dxdx2_i) in zip(g, g_dx, g_dx2, g_dxdx2)]]
+        return gradients
+
    def update_gradients_full(self, dL_dK, X, X2=None):
        if len(self.parts)==2:
            self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
--- a/GPy/kern/src/rbf.py
+++ b/GPy/kern/src/rbf.py
@ -53,24 +53,126 @@ class RBF(Stationary):

    @Cache_this(limit=3, ignore_args=())
    def dK_dX(self, X, X2, dimX):
-        r = self._scaled_dist(X, X2)
-        K = self.K_of_r(r)
-        dist = X[:,None,dimX]-X2[None,:,dimX]
-        lengthscale2inv = (np.ones((X.shape[1]))/(self.lengthscale**2))[dimX]
-        return -1.*K*dist*lengthscale2inv
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
+        dist = X[:,None,dimX] - X2[None,:,dimX]
+        return -dist*(lengthscaleinv**2)*self._clean_K(X, X2)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dXdiag(self, X, dimX):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+
+        Returns only diagonal elements.
+        """
+        return np.zeros(X.shape[0])
+
    @Cache_this(limit=3, ignore_args=())
    def dK_dX2(self, X, X2, dimX2):
-        return -self.dK_dX(X,X2, dimX2)
-    
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX2 of set X2.
+        """
+        return -self._clean_dK_dX(X, X2, dimX2)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dX2diag(self, X, dimX2):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements.
+        """
+        return np.zeros(X.shape[0])
+
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX2(self, X, X2, dimX, dimX2):
-        r = self._scaled_dist(X, X2)
-        K = self.K_of_r(r)
-        if X2 is None:
-            X2=X
-        dist = X[:,None,:]-X2[None,:,:]
-        lengthscale2inv = np.ones((X.shape[1]))/(self.lengthscale**2)
-        return -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX]*lengthscale2inv[dimX2] + (dimX==dimX2)*K*lengthscale2inv[dimX]
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+
+        term = dist[dimX]*(lengthscaleinv[dimX]**2)
+        term *= dist[dimX2]*(lengthscaleinv[dimX2]**2)
+        if dimX == dimX2:
+            term -= (lengthscaleinv[dimX]**2)
+        return -term*self._clean_K(X, X2)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dXdX2diag(self, X, dimX, dimX2):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements.
+        """
+        if dimX == dimX2:
+            lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
+            return np.ones(X.shape[0])*(lengthscaleinv[dimX]**2)*self.variance
+        else:
+            return np.zeros(X.shape[0])
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dXdX(self, X, X2, dimX_0, dimX_1):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX_0 of set X, and
+            dimension dimX_1 of set X.
+        """
+        return -self._clean_dK2_dXdX2(X, X2, dimX_0, dimX_1)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dXdXdiag(self, X, dimX_0, dimX_1):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX_0 of set X, and
+            dimension dimX_1 of set X.
+
+        Returns only diagonal elements.
+        """
+        return -self._clean_dK2_dXdX2diag(X, dimX_0, dimX_1)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK3_dXdXdX2(self, X, X2, dimX_0, dimX_1, dimX2):
+        """
+        Compute the third derivative of K with respect to:
+            dimension dimX_0 of set X,
+            dimension dimX_1 of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+
+        term = dist[dimX_0]*(lengthscaleinv[dimX_0]**2)
+        term *= dist[dimX_1]*(lengthscaleinv[dimX_1]**2)
+        term *= dist[dimX2]*(lengthscaleinv[dimX2]**2)
+        if dimX_0 == dimX_1:
+            term -= dist[dimX2]*(lengthscaleinv[dimX2]**2)*(lengthscaleinv[dimX_0]**2)
+        if dimX_0 == dimX2:
+            term -= dist[dimX_1]*(lengthscaleinv[dimX_1]**2)*(lengthscaleinv[dimX_0]**2)
+        if dimX_1 == dimX2:
+            term -= dist[dimX_0]*(lengthscaleinv[dimX_0]**2)*(lengthscaleinv[dimX_1]**2)
+        return term*self._clean_K(X, X2)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK3_dXdXdX2diag(self, X, dimX_0, dimX_1, dimX2):
+        """
+        Compute the third derivative of K with respect to:
+            dimension dimX_0 of set X,
+            dimension dimX_1 of set X, and
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements of the covariance matrix.
+        """
+        return np.zeros(X.shape[0])

    def dK_dr(self, r):
        return -r*self.K_of_r(r)
@ -80,73 +182,132 @@ class RBF(Stationary):

    def dK2_drdr_diag(self):
        return -self.variance # as the diagonal of r is always filled with zeros
-    
+
    @Cache_this(limit=3, ignore_args=())
-    def dK_dvariance(self,X,X2):
-        return self.K(X,X2)/self.variance
-    
+    def dK_dvariance(self, X, X2):
+        """
+        Compute the derivative of K with respect to variance.
+        """
+        return self._clean_K(X, X2)/self.variance
+
    @Cache_this(limit=3, ignore_args=())
-    def dK2_dvariancedX(self, X, X2, dim):
-        return self.dK_dX(X,X2, dim)/self.variance
-    
+    def dK_dlengthscale(self, X, X2):
+        """
+        Compute the derivative(s) of K with respect to lengthscale(s).
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+
+        K = self._clean_K(X, X2)
+
+        if self.ARD:
+            g = []
+            for diml in range(self.input_dim):
+                g += [(dist[diml]**2)*(lengthscaleinv[diml]**3)*K]
+        else:
+            g = (lengthscaleinv[0]**3)*np.sum(dist**2, axis=0)*K
+        return g
+
    @Cache_this(limit=3, ignore_args=())
-    def dK2_dvariancedX2(self, X, X2, dim):
-        return self.dK_dX2(X,X2, dim)/self.variance
-    
+    def dK2_dvariancedX(self, X, X2, dimX):
+        """
+        Compute the second derivative of K with respect to:
+            variance, and
+            dimension dimX of set X.
+        """
+        return self._clean_dK_dX(X, X2, dimX)/self.variance
+
    @Cache_this(limit=3, ignore_args=())
-    def dK3_dvariancedXdX2(self, X, X2, dim, dimX2):
-        return self.dK2_dXdX2(X, X2, dim, dimX2)/self.variance
+    def dK2_dvariancedX2(self, X, X2, dimX2):
+        """
+        Compute the second derivative of K with respect to:
+            variance, and
+            dimension dimX2 of set X2.
+        """
+        return -self.dK2_dvariancedX(X, X2, dimX2)

    @Cache_this(limit=3, ignore_args=())
    def dK2_dlengthscaledX(self, X, X2, dimX):
-        r = self._scaled_dist(X, X2)
-        K = self.K_of_r(r)
-        if X2 is None:
-            X2=X
-        dist = X[:,None,:]-X2[None,:,:]
-        lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
+        """
+        Compute the second derivative(s) of K with respect to:
+            lengthscale(s), and
+            dimension dimX of set X.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+
+        dK_dX = self._clean_dK_dX(X, X2, dimX)
+        dK_dl = self.dK_dlengthscale(X, X2)
+
        if self.ARD:
            g = []
-            for diml in range(X.shape[1]):
-                g += [-1.*K*dist[:,:,dimX]*(dist[:,:,diml]**2)*(lengthscaleinv[dimX]**2)*(lengthscaleinv[diml]**3) + 2.*dist[:,:,dimX]*(lengthscaleinv[diml]**3)*K*(dimX == diml)]
+            for diml in range(self.input_dim):
+                term = -dist[dimX]*(lengthscaleinv[dimX]**2)*dK_dl[diml]
+                if diml == dimX:
+                    term -= 2*lengthscaleinv[dimX]*dK_dX
+                g += [term]
        else:
-            g = -1.*K*dist[:,:,dimX]*np.sum(dist**2, axis=2)*(lengthscaleinv[dimX]**5) + 2.*dist[:,:,dimX]*(lengthscaleinv[dimX]**3)*K
+            term = -dist[dimX]*(lengthscaleinv[0]**2)*dK_dl
+            term -= 2*lengthscaleinv[0]*dK_dX
+            g = term
        return g
-    
+
    @Cache_this(limit=3, ignore_args=())
    def dK2_dlengthscaledX2(self, X, X2, dimX2):
-        tmp = self.dK2_dlengthscaledX(X, X2, dimX2)
+        """
+        Compute the second derivative(s) of K with respect to:
+            lengthscale(s), and
+            dimension dimX2 of set X2.
+        """
+        dK2_dlengthscaledX = self.dK2_dlengthscaledX(X, X2, dimX2)
        if self.ARD:
-            return [-1.*g for g in tmp]
+            return [-1.*g for g in dK2_dlengthscaledX]
        else:
-            return -1*tmp
+            return -1*dK2_dlengthscaledX
    
+    @Cache_this(limit=3, ignore_args=())
+    def dK3_dvariancedXdX2(self, X, X2, dimX, dimX2):
+        """
+        Compute the third derivative of K with respect to:
+            variance,
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        return self._clean_dK2_dXdX2(X, X2, dimX, dimX2)/self.variance
+
    @Cache_this(limit=3, ignore_args=())
    def dK3_dlengthscaledXdX2(self, X, X2, dimX, dimX2):
-        r = self._scaled_dist(X, X2)
-        K = self.K_of_r(r)
-        if X2 is None:
-            X2=X
-        dist = X[:,None,:]-X2[None,:,:]
-        lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
-        lengthscale2inv = lengthscaleinv**2
+        """
+        Compute the third derivative(s) of K with respect to:
+            lengthscale(s),
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+
+        K = self._clean_K(X, X2)
+        dK_dX = self._clean_dK_dX(X, X2, dimX)
+        dK_dX2 = self._clean_dK_dX(X, X2, dimX2)
+        dK2_dXdX2 = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
+
        if self.ARD:
            g = []
-            for diml in range(X.shape[1]):
-                tmp = -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*(dist[:,:,diml]**2)*lengthscale2inv[dimX]*lengthscale2inv[dimX2]*(lengthscaleinv[diml]**3)
-                if dimX == dimX2:
-                    tmp += K*lengthscale2inv[dimX]*(lengthscaleinv[diml]**3)*(dist[:,:,diml]**2)
+            for diml in range(self.input_dim):
+                term = (dist[diml]**2)*(lengthscaleinv[diml]**3)*dK2_dXdX2
                if diml == dimX:
-                    tmp += 2.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX2]*(lengthscaleinv[dimX]**3)
+                    term -= 2*dist[dimX]*(lengthscaleinv[dimX]**3)*dK_dX2
                if diml == dimX2:
-                    tmp += 2.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX]*(lengthscaleinv[dimX2]**3)
-                    if dimX == dimX2:
-                        tmp += -2.*K*(lengthscaleinv[dimX]**3)
-                g += [tmp]
+                    term -= 2*dist[dimX2]*(lengthscaleinv[dimX2]**3)*dK_dX
+                if diml == dimX == dimX2:
+                    term -= 2*(lengthscaleinv[dimX]**3)*K
+                g += [term]
        else:
-            g = -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*np.sum(dist**2, axis=2)*(lengthscaleinv[dimX]**7) +4*K*dist[:,:,dimX]*dist[:,:,dimX2]*(lengthscaleinv[dimX]**5)
+            term = np.sum(dist**2, axis=0)*dK2_dXdX2
+            term -= 4*dist[dimX2]*dK_dX
            if dimX == dimX2:
-                g += -2.*K*(lengthscaleinv[dimX]**3) + K*(lengthscaleinv[dimX]**5)*np.sum(dist**2, axis=2)
+                term -= 2*K
+            g = (lengthscaleinv[0]**3)*term
        return g

    def __getstate__(self):
--- a/GPy/kern/src/sde_standard_periodic.py
+++ b/GPy/kern/src/sde_standard_periodic.py
@ -13,213 +13,267 @@ import warnings

 from scipy import special as special

+
 class sde_StdPeriodic(StdPeriodic):
    """
-    
+
    Class provide extra functionality to transfer this covariance function into
    SDE form.
-    
+
    Standard Periodic kernel:

    .. math::

-       k(x,y) = \theta_1 \exp \left[  - \frac{1}{2} {}\sum_{i=1}^{input\_dim}  
+       k(x,y) = \theta_1 \exp \left[  - \frac{1}{2} {}\sum_{i=1}^{input\_dim}
       \left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right] }

    """
+
    # TODO: write comment to the constructor arguments
    def __init__(self, *args, **kwargs):
        """
        Init constructior.
-        
-        Two optinal extra parameters are added in addition to the ones in 
+
+        Two optinal extra parameters are added in addition to the ones in
        StdPeriodic kernel.
-        
+
        :param approx_order: approximation order for the RBF covariance. (Default 7)
        :type approx_order: int
-        
+
        :param balance: Whether to balance this kernel separately. (Defaulf False). Model has a separate parameter for balancing.
        :type balance: bool
        """
-        
-        #import pdb; pdb.set_trace()
-        
-        if 'approx_order' in kwargs:
-            self.approx_order = kwargs.get('approx_order')
-            del kwargs['approx_order']
+
+        # import pdb; pdb.set_trace()
+
+        if "approx_order" in kwargs:
+            self.approx_order = kwargs.get("approx_order")
+            del kwargs["approx_order"]
        else:
            self.approx_order = 7
-        
-        
-        if 'balance' in kwargs:
-            self.balance = bool( kwargs.get('balance') )
-            del kwargs['balance']
+
+        if "balance" in kwargs:
+            self.balance = bool(kwargs.get("balance"))
+            del kwargs["balance"]
        else:
            self.balance = False
-        
+
        super(sde_StdPeriodic, self).__init__(*args, **kwargs)
-        
+
    def sde_update_gradient_full(self, gradients):
        """
        Update gradient in the order in which parameters are represented in the
        kernel
        """
-    
+
        self.variance.gradient = gradients[0]
        self.period.gradient = gradients[1]
        self.lengthscale.gradient = gradients[2]
-        
-    def sde(self): 
-        """ 
+
+    def sde(self):
+        """
        Return the state space representation of the standard periodic covariance.
-        
-        
+
+
        ! Note: one must constrain lengthscale not to drop below 0.2. (independently of approximation order)
        After this Bessel functions of the first becomes NaN. Rescaling
        time variable might help.
-        
+
        ! Note: one must keep period also not very low. Because then
-        the gradients wrt wavelength become ustable. 
+        the gradients wrt wavelength become ustable.
        However this might depend on the data. For test example with
-        300 data points the low limit is 0.15. 
-        """ 
-        
-        #import pdb; pdb.set_trace()
+        300 data points the low limit is 0.15.
+        """
+
+        # import pdb; pdb.set_trace()
        # Params to use: (in that order)
-        #self.variance
-        #self.period
-        #self.lengthscale
+        # self.variance
+        # self.period
+        # self.lengthscale
        if self.approx_order is not None:
            N = int(self.approx_order)
        else:
-            N = 7 # approximation order        
-        
-        p_period = float(self.period)        
-        p_lengthscale = 2*float(self.lengthscale)
-        p_variance = float(self.variance)        
-        
-        w0 = 2*np.pi/p_period # frequency
+            N = 7  # approximation order
+
+        p_period = float(self.period)
+        p_lengthscale = 2 * float(self.lengthscale)
+        p_variance = float(self.variance)
+
+        w0 = 2 * np.pi / p_period  # frequency
        # lengthscale is multiplied by 2 because of different definition of lengthscale
-        
-        [q2,dq2l] = seriescoeff(N, p_lengthscale, p_variance)        
-        
-        dq2l = 2*dq2l  # This is because the lengthscale if multiplied by 2.
-        
+
+        [q2, dq2l] = seriescoeff(N, p_lengthscale, p_variance)
+
+        dq2l = 2 * dq2l  # This is because the lengthscale if multiplied by 2.
+
        eps = 1e-12
-        if np.any( np.isfinite(q2) == False) or np.any( np.abs(q2) > 1.0/eps) or np.any( np.abs(q2) < eps):
-            warnings.warn("sde_Periodic:  Infinite, too small, or too large (eps={0:e}) values in q2 :".format(eps) + q2.__format__("") )
-                                
-        if np.any( np.isfinite(dq2l) == False) or np.any( np.abs(dq2l) > 1.0/eps) or np.any( np.abs(dq2l) < eps):
-            warnings.warn("sde_Periodic:  Infinite, too small, or too large (eps={0:e}) values in dq2l :".format(eps) + q2.__format__("") )
-                 
-                 
-        F    = np.kron(np.diag(range(0,N+1)),np.array( ((0, -w0), (w0, 0)) ) )
-        L    = np.eye(2*(N+1))
-        Qc   = np.zeros((2*(N+1), 2*(N+1)))
-        P_inf = np.kron(np.diag(q2),np.eye(2))
-        H    = np.kron(np.ones((1,N+1)),np.array((1,0)) )
+        if (
+            np.any(np.isfinite(q2) == False)
+            or np.any(np.abs(q2) > 1.0 / eps)
+            or np.any(np.abs(q2) < eps)
+        ):
+            warnings.warn(
+                "sde_Periodic:  Infinite, too small, or too large (eps={0:e}) values in q2 :".format(
+                    eps
+                )
+                + q2.__format__("")
+            )
+
+        if (
+            np.any(np.isfinite(dq2l) == False)
+            or np.any(np.abs(dq2l) > 1.0 / eps)
+            or np.any(np.abs(dq2l) < eps)
+        ):
+            warnings.warn(
+                "sde_Periodic:  Infinite, too small, or too large (eps={0:e}) values in dq2l :".format(
+                    eps
+                )
+                + q2.__format__("")
+            )
+
+        F = np.kron(np.diag(range(0, N + 1)), np.array(((0, -w0), (w0, 0))))
+        L = np.eye(2 * (N + 1))
+        Qc = np.zeros((2 * (N + 1), 2 * (N + 1)))
+        P_inf = np.kron(np.diag(q2), np.eye(2))
+        H = np.kron(np.ones((1, N + 1)), np.array((1, 0)))
        P0 = P_inf.copy()
-        
+
        # Derivatives
        dF = np.empty((F.shape[0], F.shape[1], 3))
        dQc = np.empty((Qc.shape[0], Qc.shape[1], 3))
-        dP_inf = np.empty((P_inf.shape[0], P_inf.shape[1], 3))         
-        
+        dP_inf = np.empty((P_inf.shape[0], P_inf.shape[1], 3))
+
        # Derivatives wrt self.variance
-        dF[:,:,0] = np.zeros(F.shape)
-        dQc[:,:,0] = np.zeros(Qc.shape)
-        dP_inf[:,:,0] = P_inf / p_variance
+        dF[:, :, 0] = np.zeros(F.shape)
+        dQc[:, :, 0] = np.zeros(Qc.shape)
+        dP_inf[:, :, 0] = P_inf / p_variance

        # Derivatives self.period
-        dF[:,:,1] = np.kron(np.diag(range(0,N+1)),np.array( ((0,  w0), (-w0, 0)) ) / p_period );
-        dQc[:,:,1] = np.zeros(Qc.shape)
-        dP_inf[:,:,1] = np.zeros(P_inf.shape)      
-        
-        # Derivatives self.lengthscales        
-        dF[:,:,2] = np.zeros(F.shape)
-        dQc[:,:,2] = np.zeros(Qc.shape)
-        dP_inf[:,:,2] = np.kron(np.diag(dq2l),np.eye(2))
+        dF[:, :, 1] = np.kron(
+            np.diag(range(0, N + 1)), np.array(((0, w0), (-w0, 0))) / p_period
+        )
+        dQc[:, :, 1] = np.zeros(Qc.shape)
+        dP_inf[:, :, 1] = np.zeros(P_inf.shape)
+
+        # Derivatives self.lengthscales
+        dF[:, :, 2] = np.zeros(F.shape)
+        dQc[:, :, 2] = np.zeros(Qc.shape)
+        dP_inf[:, :, 2] = np.kron(np.diag(dq2l), np.eye(2))
        dP0 = dP_inf.copy()
-        
+
        if self.balance:
            # Benefits of this are not very sound.
            import GPy.models.state_space_main as ssm
-            (F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf,dP0) = ssm.balance_ss_model(F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0 )
-            
+
+            (F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0) = ssm.balance_ss_model(
+                F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0
+            )
+
        return (F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0)
-        
-        
-        
-        
-def seriescoeff(m=6,lengthScale=1.0,magnSigma2=1.0, true_covariance=False):
+
+
+def seriescoeff(m=6, lengthScale=1.0, magnSigma2=1.0, true_covariance=False):
    """
-    Calculate the coefficients q_j^2 for the covariance function 
+    Calculate the coefficients q_j^2 for the covariance function
    approximation:
-    
+
        k(\tau) =  \sum_{j=0}^{+\infty} q_j^2 \cos(j\omega_0 \tau)
-    
+
    Reference is:

-    [1] Arno Solin and Simo Särkkä (2014). Explicit link between periodic 
-        covariance functions and state space models. In Proceedings of the 
-        Seventeenth International Conference on Artifcial Intelligence and 
-        Statistics (AISTATS 2014). JMLR: W&CP, volume 33.    
-    
-    Note! Only the infinite approximation (through Bessel function) 
+    [1] Arno Solin and Simo Särkkä (2014). Explicit link between periodic
+        covariance functions and state space models. In Proceedings of the
+        Seventeenth International Conference on Artifcial Intelligence and
+        Statistics (AISTATS 2014). JMLR: W&CP, volume 33.
+
+    Note! Only the infinite approximation (through Bessel function)
          is currently implemented.

    Input:
    ----------------
-    
+
    m: int
        Degree of approximation. Default 6.
    lengthScale: float
        Length scale parameter in the kerenl
    magnSigma2:float
        Multiplier in front of the kernel.
-        
-    
+
+
    Output:
    -----------------
-    
+
    coeffs: array(m+1)
        Covariance series coefficients
-    
+
    coeffs_dl: array(m+1)
        Derivatives of the coefficients with respect to lengthscale.
-    
+
    """
-    
+
    if true_covariance:
-        
-        bb = lambda j,m: (1.0 + np.array((j != 0), dtype=np.float64) ) / (2**(j)) *\
-            sp.special.binom(j, sp.floor( (j-m)/2.0 * np.array(m<=j, dtype=np.float64) ))*\
-            np.array(m<=j, dtype=np.float64) *np.array(sp.mod(j-m,2)==0, dtype=np.float64)
-                
-        M,J = np.meshgrid(range(0,m+1),range(0,m+1))
-        
-        coeffs = bb(J,M) / sp.misc.factorial(J) * sp.exp( -lengthScale**(-2) ) *\
-             (lengthScale**(-2))**J  *magnSigma2
-        
-        coeffs_dl = np.sum( coeffs*lengthScale**(-3)*(2.0-2.0*J*lengthScale**2),0)         
-        
-        coeffs = np.sum(coeffs,0)
-        
+
+        bb = (
+            lambda j, m: (1.0 + np.array((j != 0), dtype=np.float64))
+            / (2 ** (j))
+            * sp.special.binom(
+                j, sp.floor((j - m) / 2.0 * np.array(m <= j, dtype=np.float64))
+            )
+            * np.array(m <= j, dtype=np.float64)
+            * np.array(sp.mod(j - m, 2) == 0, dtype=np.float64)
+        )
+
+        M, J = np.meshgrid(range(0, m + 1), range(0, m + 1))
+
+        coeffs = (
+            bb(J, M)
+            / sp.misc.factorial(J)
+            * np.exp(-(lengthScale ** (-2)))
+            * (lengthScale ** (-2)) ** J
+            * magnSigma2
+        )
+
+        coeffs_dl = np.sum(
+            coeffs * lengthScale ** (-3) * (2.0 - 2.0 * J * lengthScale**2), 0
+        )
+
+        coeffs = np.sum(coeffs, 0)
+
    else:
-        coeffs = 2*magnSigma2*sp.exp( -lengthScale**(-2) ) * special.iv(range(0,m+1),1.0/lengthScale**(2))
-        if np.any( np.isfinite(coeffs) == False):
+        coeffs = (
+            2
+            * magnSigma2
+            * np.exp(-(lengthScale ** (-2)))
+            * special.iv(range(0, m + 1), 1.0 / lengthScale ** (2))
+        )
+        if np.any(np.isfinite(coeffs) == False):
            raise ValueError("sde_standard_periodic: Coefficients are not finite!")
-        #import pdb; pdb.set_trace()
-        coeffs[0] = 0.5*coeffs[0]
-        #print(coeffs)
+        # import pdb; pdb.set_trace()
+        coeffs[0] = 0.5 * coeffs[0]
+        # print(coeffs)
        # Derivatives wrt (lengthScale)
-        coeffs_dl = np.zeros(m+1)
-        coeffs_dl[1:] = magnSigma2*lengthScale**(-3) * sp.exp(-lengthScale**(-2))*\
-        (-4*special.iv(range(0,m),lengthScale**(-2)) + 4*(1+np.arange(1,m+1)*lengthScale**(2))*special.iv(range(1,m+1),lengthScale**(-2)) )    
-            
+        coeffs_dl = np.zeros(m + 1)
+        coeffs_dl[1:] = (
+            magnSigma2
+            * lengthScale ** (-3)
+            * np.exp(-(lengthScale ** (-2)))
+            * (
+                -4 * special.iv(range(0, m), lengthScale ** (-2))
+                + 4
+                * (1 + np.arange(1, m + 1) * lengthScale ** (2))
+                * special.iv(range(1, m + 1), lengthScale ** (-2))
+            )
+        )
+
        # The first element
-        coeffs_dl[0] = magnSigma2*lengthScale**(-3) * np.exp(-lengthScale**(-2))*\
-            (2*special.iv(0,lengthScale**(-2)) - 2*special.iv(1,lengthScale**(-2)) )     
-        
+        coeffs_dl[0] = (
+            magnSigma2
+            * lengthScale ** (-3)
+            * np.exp(-(lengthScale ** (-2)))
+            * (
+                2 * special.iv(0, lengthScale ** (-2))
+                - 2 * special.iv(1, lengthScale ** (-2))
+            )
+        )

    return coeffs.squeeze(), coeffs_dl.squeeze()
--- a/GPy/kern/src/sde_stationary.py
+++ b/GPy/kern/src/sde_stationary.py
@ -11,12 +11,14 @@ from .stationary import RatQuad

 import numpy as np
 import scipy as sp
+
 try:
    from scipy.linalg import solve_continuous_lyapunov as lyap
 except ImportError:
    from scipy.linalg import solve_lyapunov as lyap
 import warnings

+
 class sde_RBF(RBF):
    """

@ -30,37 +32,35 @@ class sde_RBF(RBF):
        k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \\ \\ \\ \\  \text{ where  } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }

    """
+
    def __init__(self, *args, **kwargs):
        """
        Init constructior.
-        
-        Two optinal extra parameters are added in addition to the ones in 
+
+        Two optinal extra parameters are added in addition to the ones in
        RBF kernel.
-        
+
        :param approx_order: approximation order for the RBF covariance. (Default 10)
        :type approx_order: int
-        
+
        :param balance: Whether to balance this kernel separately. (Defaulf True). Model has a separate parameter for balancing.
        :type balance: bool
        """
-        
-        if 'balance' in kwargs:
-            self.balance = bool( kwargs.get('balance') )
-            del kwargs['balance']
+
+        if "balance" in kwargs:
+            self.balance = bool(kwargs.get("balance"))
+            del kwargs["balance"]
        else:
            self.balance = True
-        
-        
-        if 'approx_order' in kwargs:
-            self.approx_order = kwargs.get('approx_order')
-            del kwargs['approx_order']
+
+        if "approx_order" in kwargs:
+            self.approx_order = kwargs.get("approx_order")
+            del kwargs["approx_order"]
        else:
            self.approx_order = 6
-        
-        
-        
+
        super(sde_RBF, self).__init__(*args, **kwargs)
-        
+
    def sde_update_gradient_full(self, gradients):
        """
        Update gradient in the order in which parameters are represented in the
@ -73,86 +73,111 @@ class sde_RBF(RBF):
    def sde(self):
        """
        Return the state space representation of the covariance.
-        
+
        Note! For Sparse GP inference too small or two high values of lengthscale
        lead to instabilities. This is because Qc are too high or too low
        and P_inf are not full rank. This effect depends on approximatio order.
        For N = 10. lengthscale must be in (0.8,8). For other N tests must be conducted.
        N=6: (0.06,31)
        Variance should be within reasonable bounds as well, but its dependence is linear.
-        
+
        The above facts do not take into accout regularization.
        """
-        #import pdb; pdb.set_trace()
+        # import pdb; pdb.set_trace()
        if self.approx_order is not None:
            N = self.approx_order
        else:
-            N = 10# approximation order ( number of terms in exponent series expansion)
-            
+            N = 10  # approximation order ( number of terms in exponent series expansion)
+
        roots_rounding_decimals = 6

        fn = np.math.factorial(N)

-        p_lengthscale = float( self.lengthscale )
+        p_lengthscale = float(self.lengthscale)
        p_variance = float(self.variance)
-        kappa = 1.0/2.0/p_lengthscale**2
+        kappa = 1.0 / 2.0 / p_lengthscale**2
+
+        Qc = np.array(((p_variance * np.sqrt(np.pi / kappa) * fn * (4 * kappa) ** N,),))

-        Qc = np.array( ((p_variance*np.sqrt(np.pi/kappa)*fn*(4*kappa)**N,),) )
-        
        eps = 1e-12
-        if (float(Qc) > 1.0/eps) or (float(Qc) < eps):
-            warnings.warn("""sde_RBF kernel: the noise variance Qc is either very large or very small. 
-                                It influece conditioning of P_inf: {0:e}""".format(float(Qc)) )
+        if (float(Qc) > 1.0 / eps) or (float(Qc) < eps):
+            warnings.warn(
+                """sde_RBF kernel: the noise variance Qc is either very large or very small. 
+                                It influece conditioning of P_inf: {0:e}""".format(
+                    float(Qc)
+                )
+            )

-        pp1 = np.zeros((2*N+1,)) # array of polynomial coefficients from higher power to lower
+        pp1 = np.zeros(
+            (2 * N + 1,)
+        )  # array of polynomial coefficients from higher power to lower

-        for n in range(0, N+1): # (2N+1) - number of polynomial coefficients
-            pp1[2*(N-n)] = fn*(4.0*kappa)**(N-n)/np.math.factorial(n)*(-1)**n
-            
-        pp = sp.poly1d(pp1)
-        roots = sp.roots(pp)
+        for n in range(0, N + 1):  # (2N+1) - number of polynomial coefficients
+            pp1[2 * (N - n)] = (
+                fn * (4.0 * kappa) ** (N - n) / np.math.factorial(n) * (-1) ** n
+            )

-        neg_real_part_roots = roots[np.round(np.real(roots) ,roots_rounding_decimals) < 0]
-        aa = sp.poly1d(neg_real_part_roots, r=True).coeffs
+        pp = np.poly1d(pp1)
+        roots = np.roots(pp)

-        F = np.diag(np.ones((N-1,)),1)
-        F[-1,:] = -aa[-1:0:-1]
+        neg_real_part_roots = roots[
+            np.round(np.real(roots), roots_rounding_decimals) < 0
+        ]
+        aa = np.poly1d(neg_real_part_roots, r=True).coeffs

-        L= np.zeros((N,1))
-        L[N-1,0] = 1
+        F = np.diag(np.ones((N - 1,)), 1)
+        F[-1, :] = -aa[-1:0:-1]

-        H = np.zeros((1,N))
-        H[0,0] = 1
+        L = np.zeros((N, 1))
+        L[N - 1, 0] = 1
+
+        H = np.zeros((1, N))
+        H[0, 0] = 1

        # Infinite covariance:
-        Pinf = lyap(F, -np.dot(L,np.dot( Qc[0,0],L.T)))
-        Pinf = 0.5*(Pinf + Pinf.T)
+        Pinf = lyap(F, -np.dot(L, np.dot(Qc[0, 0], L.T)))
+        Pinf = 0.5 * (Pinf + Pinf.T)
        # Allocating space for derivatives
-        dF    = np.empty([F.shape[0],F.shape[1],2])
-        dQc   = np.empty([Qc.shape[0],Qc.shape[1],2])
-        dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2])
+        dF = np.empty([F.shape[0], F.shape[1], 2])
+        dQc = np.empty([Qc.shape[0], Qc.shape[1], 2])
+        dPinf = np.empty([Pinf.shape[0], Pinf.shape[1], 2])

        # Derivatives:
        dFvariance = np.zeros(F.shape)
        dFlengthscale = np.zeros(F.shape)
-        dFlengthscale[-1,:] = -aa[-1:0:-1]/p_lengthscale * np.arange(-N,0,1)
+        dFlengthscale[-1, :] = -aa[-1:0:-1] / p_lengthscale * np.arange(-N, 0, 1)

-        dQcvariance = Qc/p_variance
-        dQclengthscale = np.array(( (p_variance*np.sqrt(2*np.pi)*fn*2**N*p_lengthscale**(-2*N)*(1-2*N),),))
-        
-        dPinf_variance = Pinf/p_variance
+        dQcvariance = Qc / p_variance
+        dQclengthscale = np.array(
+            (
+                (
+                    p_variance
+                    * np.sqrt(2 * np.pi)
+                    * fn
+                    * 2**N
+                    * p_lengthscale ** (-2 * N)
+                    * (1 - 2 * N),
+                ),
+            )
+        )
+
+        dPinf_variance = Pinf / p_variance

        lp = Pinf.shape[0]
-        coeff = np.arange(1,lp+1).reshape(lp,1) + np.arange(1,lp+1).reshape(1,lp) - 2
-        coeff[np.mod(coeff,2) != 0] = 0
-        dPinf_lengthscale = -1/p_lengthscale*Pinf*coeff
+        coeff = (
+            np.arange(1, lp + 1).reshape(lp, 1)
+            + np.arange(1, lp + 1).reshape(1, lp)
+            - 2
+        )
+        coeff[np.mod(coeff, 2) != 0] = 0
+        dPinf_lengthscale = -1 / p_lengthscale * Pinf * coeff

-        dF[:,:,0]    = dFvariance
-        dF[:,:,1]    = dFlengthscale
-        dQc[:,:,0]   = dQcvariance
-        dQc[:,:,1]   = dQclengthscale
-        dPinf[:,:,0] = dPinf_variance
-        dPinf[:,:,1] = dPinf_lengthscale
+        dF[:, :, 0] = dFvariance
+        dF[:, :, 1] = dFlengthscale
+        dQc[:, :, 0] = dQcvariance
+        dQc[:, :, 1] = dQclengthscale
+        dPinf[:, :, 0] = dPinf_variance
+        dPinf[:, :, 1] = dPinf_lengthscale

        P0 = Pinf.copy()
        dP0 = dPinf.copy()
@ -161,10 +186,14 @@ class sde_RBF(RBF):
            # Benefits of this are not very sound. Helps only in one case:
            # SVD Kalman + RBF kernel
            import GPy.models.state_space_main as ssm
-            (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf,dP0) = ssm.balance_ss_model(F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0 )
+
+            (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0) = ssm.balance_ss_model(
+                F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0
+            )

        return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)

+
 class sde_Exponential(Exponential):
    """

@ -195,30 +224,31 @@ class sde_Exponential(Exponential):
        variance = float(self.variance.values)
        lengthscale = float(self.lengthscale)

-        F  = np.array(((-1.0/lengthscale,),))
-        L  = np.array(((1.0,),))
-        Qc = np.array( ((2.0*variance/lengthscale,),) )
+        F = np.array(((-1.0 / lengthscale,),))
+        L = np.array(((1.0,),))
+        Qc = np.array(((2.0 * variance / lengthscale,),))
        H = np.array(((1.0,),))
        Pinf = np.array(((variance,),))
        P0 = Pinf.copy()

-        dF = np.zeros((1,1,2));
-        dQc = np.zeros((1,1,2));
-        dPinf = np.zeros((1,1,2));
+        dF = np.zeros((1, 1, 2))
+        dQc = np.zeros((1, 1, 2))
+        dPinf = np.zeros((1, 1, 2))

-        dF[:,:,0] = 0.0
-        dF[:,:,1] = 1.0/lengthscale**2
+        dF[:, :, 0] = 0.0
+        dF[:, :, 1] = 1.0 / lengthscale**2

-        dQc[:,:,0] = 2.0/lengthscale
-        dQc[:,:,1] = -2.0*variance/lengthscale**2
+        dQc[:, :, 0] = 2.0 / lengthscale
+        dQc[:, :, 1] = -2.0 * variance / lengthscale**2

-        dPinf[:,:,0] = 1.0
-        dPinf[:,:,1] = 0.0
+        dPinf[:, :, 0] = 1.0
+        dPinf[:, :, 1] = 0.0

        dP0 = dPinf.copy()

        return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)

+
 class sde_RatQuad(RatQuad):
    """

@ -238,12 +268,12 @@ class sde_RatQuad(RatQuad):
        Return the state space representation of the covariance.
        """

-        assert False, 'Not Implemented'
+        assert False, "Not Implemented"

        # Params to use:

        # self.lengthscale
        # self.variance
-        #self.power
+        # self.power

-        #return (F, L, Qc, H, Pinf, dF, dQc, dPinf)
+        # return (F, L, Qc, H, Pinf, dF, dQc, dPinf)
--- a/GPy/kern/src/standard_periodic.py
+++ b/GPy/kern/src/standard_periodic.py
@ -122,7 +122,6 @@ class StdPeriodic(Kern):

        pass

-
    def K(self, X, X2=None):
        """Compute the covariance matrix between X and X2."""
        if X2 is None:
@ -133,13 +132,372 @@ class StdPeriodic(Kern):

        return self.variance * exp_dist

-
    def Kdiag(self, X):
        """Compute the diagonal of the covariance matrix associated to X."""
        ret = np.empty(X.shape[0])
        ret[:] = self.variance
        return ret

+    def dK_dX(self, X, X2, dimX):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX]
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        dist = X[:,None,dimX] - X2[None,:,dimX]
+        base = np.pi*periodinv*dist
+
+        return -F*np.sin(2*base)*self._clean_K(X, X2)
+
+    def dK_dXdiag(self, X, dimX):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+
+        Returns only diagonal elements.
+        """
+        return np.zeros(X.shape[0])
+
+    def dK_dX2(self, X, X2, dimX2):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX2 of set X2.
+        """
+        return -self._clean_dK_dX(X, X2, dimX2)
+
+    def dK_dX2diag(self, X, dimX2):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements.
+        """
+        return np.zeros(X.shape[0])
+    
+    def dK2_dXdX2(self, X, X2, dimX, dimX2):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        dist = X[:,None,dimX2] - X2[None,:,dimX2]
+        base = np.pi*periodinv*dist
+
+        term = np.sin(2*base)*self._clean_dK_dX(X, X2, dimX)
+        if dimX == dimX2:
+            term += 2*np.pi*periodinv*np.cos(2*base)*self._clean_K(X, X2)
+        return F*term
+
+    def dK2_dXdX2diag(self, X, dimX, dimX2):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements.
+        """
+        if dimX == dimX2:
+            lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
+            periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
+            return (np.pi**2)*(lengthscaleinv**2)*(periodinv**2)*self.variance*np.ones(X.shape[0])
+        else:
+            return np.zeros(X.shape[0])
+
+    def dK2_dXdX(self, X, X2, dimX_0, dimX_1):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX_0 of set X, and
+            dimension dimX_1 of set X.
+        """
+        return -self._clean_dK2_dXdX2(X, X2, dimX_0, dimX_1)
+
+    def dK2_dXdXdiag(self, X, dimX_0, dimX_1):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX_0 of set X, and
+            dimension dimX_1 of set X.
+
+        Returns only diagonal elements.
+        """
+        return -self._clean_dK2_dXdX2diag(X, dimX_0, dimX_1)
+
+    def dK3_dXdXdX2(self, X, X2, dimX_0, dimX_1, dimX2):
+        """
+        Compute the third derivative of K with respect to:
+            dimension dimX_0 of set X,
+            dimension dimX_1 of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        dist = X[:,None,dimX2] - X2[None,:,dimX2]
+        base = np.pi*periodinv*dist
+
+        term = np.sin(2*base)*self._clean_dK2_dXdX(X, X2, dimX_0, dimX_1)
+        if dimX_0 == dimX2:
+            term += 2*np.pi*periodinv*np.cos(2*base)*self._clean_dK_dX(X, X2, dimX_1)
+        if dimX_1 == dimX2:
+            term += 2*np.pi*periodinv*np.cos(2*base)*self._clean_dK_dX(X, X2, dimX_0)
+        if dimX_0 == dimX_1 == dimX2:
+            term -= 4*(np.pi**2)*(periodinv**2)*np.sin(2*base)*self._clean_K(X, X2)
+        return F*term
+
+    def dK3_dXdXdX2diag(self, X, dimX_0, dimX_1, dimX2):
+        """
+        Compute the third derivative of K with respect to:
+            dimension dimX_0 of set X,
+            dimension dimX_1 of set X, and
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements of the covariance matrix.
+        """
+        return np.zeros(X.shape[0])
+
+    def dK_dvariance(self, X, X2):
+        """
+        Compute the derivative of K with respect to variance.
+        """
+        return self._clean_K(X, X2)/self.variance
+
+    def dK_dlengthscale(self, X, X2):
+        """
+        Compute the derivative(s) of K with respect to lengthscale(s).
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        periodinv = (np.ones(X.shape[1])/(self.period))
+
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+        base = np.pi*periodinv[:,None,None]*dist
+
+        K = self._clean_K(X, X2)
+
+        if self.ARD2:
+            g = []
+            for diml in range(self.input_dim):
+                g += [(lengthscaleinv[diml]**3)*np.square(np.sin(base[diml]))*K]
+        else:
+            g = (lengthscaleinv[0]**3)*np.sum(np.square(np.sin(base)), axis=0)*K
+        return g
+
+    def dK_dperiod(self, X, X2):
+        """
+        Compute the derivative(s) of K with respect to period(s).
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        periodinv = (np.ones(X.shape[1])/(self.period))
+
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+        base = np.pi*periodinv[:,None,None]*dist
+
+        K = self._clean_K(X, X2)
+
+        if self.ARD1:
+            g = []
+            for diml in range(self.input_dim):
+                g += [0.5*base[diml]*(lengthscaleinv[diml]**2)*periodinv[diml]*np.sin(2*base[diml])*K]
+        else:
+            g = 0.5*periodinv[0]*np.sum(base*(lengthscaleinv**2)[:,None,None]*np.sin(2*base), axis=0)*K
+        return g
+
+    def dK2_dvariancedX(self, X, X2, dimX):
+        """
+        Compute the second derivative of K with respect to:
+            variance, and
+            dimension dimX of set X.
+        """
+        return self._clean_dK_dX(X, X2, dimX)/self.variance
+
+    def dK2_dvariancedX2(self, X, X2, dimX2):
+        """
+        Compute the second derivative of K with respect to:
+            variance, and
+            dimension dimX2 of set X2.
+        """
+        return -self.dK2_dvariancedX(X, X2, dimX2)
+
+    def dK2_dlengthscaledX(self, X, X2, dimX):
+        """
+        Compute the second derivative(s) of K with respect to:
+            lengthscale(s), and
+            dimension dimX of set X.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX]
+
+        dist = X[:,None,dimX] - X2[None,:,dimX]
+        base = np.pi*periodinv*dist
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        K = self._clean_K(X, X2)
+        dK_dl = self.dK_dlengthscale(X, X2)
+
+        if self.ARD2:
+            g = []
+            for diml in range(self.input_dim):
+                term = dK_dl[diml]
+                if diml == dimX:
+                    term -= 2*lengthscaleinv*K
+                g += [-F*np.sin(2*base)*term]
+        else:
+            g = -F*np.sin(2*base)*(dK_dl - 2*lengthscaleinv*K)
+        return g
+
+    def dK2_dlengthscaledX2(self, X, X2, dimX2):
+        """
+        Compute the second derivative(s) of K with respect to:
+            lengthscale(s), and
+            dimension dimX2 of set X2.
+        """
+        dK2_dldX = self.dK2_dlengthscaledX(X, X2, dimX2)
+        if self.ARD2:
+            return [-1*g for g in dK2_dldX]
+        else:
+            return -1*dK2_dldX
+
+    def dK2_dperioddX(self, X, X2, dimX):
+        """
+        Compute the second derivative(s) of K with respect to:
+            period(s), and
+            dimension dimX of set X.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX]
+
+        dist = X[:,None,dimX] - X2[None,:,dimX]
+        base = np.pi*periodinv*dist
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        K = self._clean_K(X, X2)
+        dK_dT = self.dK_dperiod(X, X2)
+
+        if self.ARD1:
+            g = []
+            for dimT in range(self.input_dim):
+                term = np.sin(2*base)*dK_dT[dimT]
+                if dimT == dimX:
+                    term -= periodinv*(np.sin(2*base)+2*base*np.cos(2*base))*K
+                g += [-F*term]
+        else:
+            term = np.sin(2*base)*dK_dT
+            term -= periodinv*(np.sin(2*base)+2*base*np.cos(2*base))*K
+            g = -F*term
+        return g
+
+    def dK2_dperioddX2(self, X, X2, dimX2):
+        """
+        Compute the second derivative(s) of K with respect to:
+            period(s), and
+            dimension dimX2 of set X2.
+        """
+        dK2_dperioddX = self.dK2_dperioddX(X, X2, dimX2)
+        if self.ARD1:
+            return [-1*g for g in dK2_dperioddX]
+        else:
+            return -1*dK2_dperioddX
+
+    def dK3_dvariancedXdX2(self, X, X2, dimX, dimX2):
+        """
+        Compute the third derivative of K with respect to:
+            variance,
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        return self._clean_dK2_dXdX2(X, X2, dimX, dimX2)/self.variance
+
+    def dK3_dlengthscaledXdX2(self, X, X2, dimX, dimX2):
+        """
+        Compute the third derivative(s) of K with respect to:
+            lengthscale(s),
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
+
+        dist = X[:,None,dimX2] - X2[None,:,dimX2]
+        base = np.pi*periodinv*dist
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        dK2_dXdX2 = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
+        dK_dl = self.dK_dlengthscale(X, X2)
+        dK2_dldX = self.dK2_dlengthscaledX(X, X2, dimX)
+
+        if self.ARD2:
+            g = []
+            for diml in range(self.input_dim):
+                term = np.sin(2*base)*dK2_dldX[diml]
+                if dimX == dimX2:
+                    term += 2*np.pi*periodinv*np.cos(2*base)*dK_dl[diml]
+                term *= F
+                if diml == dimX2:
+                    term -= 2*lengthscaleinv*dK2_dXdX2
+                g += [term]
+        else:
+            term = np.sin(2*base)*dK2_dldX
+            if dimX == dimX2:
+                term += 2*np.pi*periodinv*np.cos(2*base)*dK_dl
+            term *= F
+            term -= 2*lengthscaleinv*dK2_dXdX2
+            g = term
+        return g
+
+    def dK3_dperioddXdX2(self, X, X2, dimX, dimX2):
+        """
+        Compute the third derivative(s) of K with respect to:
+            period(s),
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
+
+        dist = X[:,None,dimX2] - X2[None,:,dimX2]
+        base = np.pi*periodinv*dist
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        K = self._clean_K(X, X2)
+        dK_dX = self._clean_dK_dX(X, X2, dimX)
+        dK2_dXdX2 = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
+        dK_dT = self.dK_dperiod(X, X2)
+        dK2_dTdX = self.dK2_dperioddX(X, X2, dimX)
+
+        if self.ARD1:
+            g = []
+            for dimT in range(self.input_dim):
+                term = np.sin(2*base)*dK2_dTdX[dimT]
+                if dimT == dimX2:
+                    term -= 2*periodinv*np.cos(2*base)*base*dK_dX
+                if dimX == dimX2:
+                    term += 2*np.pi*periodinv*np.cos(2*base)*dK_dT[dimT]
+                if dimX == dimX2 == dimT:
+                    term += 2*np.pi*(periodinv**2)*(2*base*np.sin(2*base)-np.cos(2*base))*K
+                term *= F
+                if dimT == dimX2:
+                    term -= periodinv*dK2_dXdX2
+                g += [term]
+        else:
+            term = np.sin(2*base)*dK2_dTdX-2*periodinv*base*np.cos(2*base)*dK_dX
+            if dimX == dimX2:
+                term += 2*np.pi*periodinv*(np.cos(2*base)*dK_dT+periodinv*(2*base*np.sin(2*base)-np.cos(2*base))*K)
+            g = F*term-periodinv*dK2_dXdX2
+        return g
+
    def update_gradients_full(self, dL_dK, X, X2=None):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None:
@ -167,12 +525,52 @@ class StdPeriodic(Kern):
        else: # same lengthscales
            self.lengthscale.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)

+    def update_gradients_direct(self, dL_dVar, dL_dPer, dL_dLen):
+        self.variance.gradient = dL_dVar
+        self.period.gradient = dL_dPer
+        self.lengthscale.gradient = dL_dLen
+
+    def reset_gradients(self):
+        self.variance.gradient = 0.
+        if not self.ARD1:
+            self.period.gradient = 0.
+        else:
+            self.period.gradient = np.zeros(self.input_dim)
+        if not self.ARD2:
+            self.lengthscale.gradient = 0.
+        else:
+            self.lengthscale.gradient = np.zeros(self.input_dim)
+
    def update_gradients_diag(self, dL_dKdiag, X):
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
        self.variance.gradient = np.sum(dL_dKdiag)
        self.period.gradient = 0
        self.lengthscale.gradient = 0

+    def dgradients(self, X, X2):
+        g1 = self.dK_dvariance(X, X2)
+        g2 = self.dK_dperiod(X, X2)
+        g3 = self.dK_dlengthscale(X, X2)
+        return [g1, g2, g3]
+
+    def dgradients_dX(self, X, X2, dimX):
+        g1 = self.dK2_dvariancedX(X, X2, dimX)
+        g2 = self.dK2_dperioddX(X, X2, dimX)
+        g3 = self.dK2_dlengthscaledX(X, X2, dimX)
+        return [g1, g2, g3]
+
+    def dgradients_dX2(self, X, X2, dimX2):
+        g1 = self.dK2_dvariancedX2(X, X2, dimX2)
+        g2 = self.dK2_dperioddX2(X, X2, dimX2)
+        g3 = self.dK2_dlengthscaledX2(X, X2, dimX2)
+        return [g1, g2, g3]
+
+    def dgradients2_dXdX2(self, X, X2, dimX, dimX2):
+        g1 = self.dK3_dvariancedXdX2(X, X2, dimX, dimX2)
+        g2 = self.dK3_dperioddXdX2(X, X2, dimX, dimX2)
+        g3 = self.dK3_dlengthscaledXdX2(X, X2, dimX, dimX2)
+        return [g1, g2, g3]
+
    def gradients_X(self, dL_dK, X, X2=None):
        K = self.K(X, X2)
        if X2 is None:
@ -185,4 +583,4 @@ class StdPeriodic(Kern):
        return np.zeros(X.shape)
    
    def input_sensitivity(self, summarize=True):
-        return self.variance*np.ones(self.input_dim)/self.lengthscale**2
+        return self.variance*np.ones(self.input_dim)/self.lengthscale**2
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@ -306,7 +306,12 @@ class Stationary(Kern):
        l4 =  np.ones(X.shape[1])*self.lengthscale**2
        return dL_dK_diag * (np.eye(X.shape[1]) * -self.dK2_drdr_diag()/(l4))[None, :,:]# np.zeros(X.shape+(X.shape[1],))
        #return np.ones(X.shape) * d2L_dK * self.variance/self.lengthscale**2 # np.zeros(X.shape)
-    
+
+    def dgradients(self, X, X2):
+        g1 = self.dK_dvariance(X, X2)
+        g2 = self.dK_dlengthscale(X, X2)
+        return [g1, g2]
+
    def dgradients_dX(self, X, X2, dimX):
        g1 = self.dK2_dvariancedX(X, X2, dimX)
        g2 = self.dK2_dlengthscaledX(X, X2, dimX)
--- a/GPy/kern/src/stationary_cython.c
+++ b/GPy/kern/src/stationary_cython.c
--- a/GPy/kern/src/todo/eq_ode1.py
+++ b/GPy/kern/src/todo/eq_ode1.py
@ -121,7 +121,7 @@ class Eq_ode1(Kernpart):
            target+=self.initial_variance * np.exp(- self.decay * (t1_mat + t2_mat))

    def Kdiag(self,index,target):
-        #target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()]
+        #target += np.diag(self.B)[np.asarray(index,dtype=int).flatten()]
        pass
    
    def _param_grad_helper(self,dL_dK,X,X2,target):
@ -203,7 +203,7 @@ class Eq_ode1(Kernpart):
        self._t = X[:, 0]
        if not X.shape[1] == 2:
            raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
-        self._index = np.asarray(X[:, 1],dtype=np.int)
+        self._index = np.asarray(X[:, 1],dtype=int)
        # Sort indices so that outputs are in blocks for computational
        # convenience.
        self._order = self._index.argsort()
@ -220,7 +220,7 @@ class Eq_ode1(Kernpart):
            if not X2.shape[1] == 2:
                raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
            self._t2 = X2[:, 0]
-            self._index2 = np.asarray(X2[:, 1],dtype=np.int)
+            self._index2 = np.asarray(X2[:, 1],dtype=int)
            self._order2 = self._index2.argsort()
            self._index2 = self._index2[self._order2]
            self._t2 = self._t2[self._order2]
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@ -12,6 +12,7 @@ from ..core.parameterization import Param
 from paramz.transformations import Logexp
 from scipy.special import psi as digamma

+
 class StudentT(Likelihood):
    """
    Student T likelihood
@ -22,17 +23,18 @@ class StudentT(Likelihood):
        p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}

    """
-    def __init__(self,gp_link=None, deg_free=5, sigma2=2):
+
+    def __init__(self, gp_link=None, deg_free=5, sigma2=2):
        if gp_link is None:
            gp_link = link_functions.Identity()

-        super(StudentT, self).__init__(gp_link, name='Student_T')
+        super(StudentT, self).__init__(gp_link, name="Student_T")
        # sigma2 is not a noise parameter, it is a squared scale.
-        self.sigma2 = Param('t_scale2', float(sigma2), Logexp())
-        self.v = Param('deg_free', float(deg_free), Logexp())
+        self.sigma2 = Param("t_scale2", float(sigma2), Logexp())
+        self.v = Param("deg_free", float(deg_free), Logexp())
        self.link_parameter(self.sigma2)
        self.link_parameter(self.v)
-        #self.v.constrain_fixed()
+        # self.v.constrain_fixed()

        self.log_concave = False

@ -61,11 +63,14 @@ class StudentT(Likelihood):
        """
        assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
        e = y - inv_link_f
-        #Careful gamma(big_number) is infinity!
-        objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
-                     / (np.sqrt(self.v * np.pi * self.sigma2)))
-                     * ((1 + (1./float(self.v))*((e**2)/float(self.sigma2)))**(-0.5*(self.v + 1)))
-                    )
+        # Careful gamma(big_number) is infinity!
+        objective = (
+            np.exp(gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5))
+            / (np.sqrt(self.v * np.pi * self.sigma2))
+        ) * (
+            (1 + (1.0 / float(self.v)) * ((e**2) / float(self.sigma2)))
+            ** (-0.5 * (self.v + 1))
+        )
        return np.prod(objective)

    def logpdf_link(self, inv_link_f, y, Y_metadata=None):
@ -85,15 +90,16 @@ class StudentT(Likelihood):

        """
        e = y - inv_link_f
-        #FIXME:
-        #Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?!
-        #But np.log(1 + (1/float(self.v))*((y-inv_link_f)**2)/self.sigma2) throws it correctly
-        #print - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
-        objective = (+ gammaln((self.v + 1) * 0.5)
-                    - gammaln(self.v * 0.5)
-                    - 0.5*np.log(self.sigma2 * self.v * np.pi)
-                    - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
-                    )
+        # FIXME:
+        # Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?!
+        # But np.log(1 + (1/float(self.v))*((y-inv_link_f)**2)/self.sigma2) throws it correctly
+        # print - 0.5*(self.v + 1)*np.log(1 + (1/(self.v))*((e**2)/self.sigma2))
+        objective = (
+            +gammaln((self.v + 1) * 0.5)
+            - gammaln(self.v * 0.5)
+            - 0.5 * np.log(self.sigma2 * self.v * np.pi)
+            - 0.5 * (self.v + 1) * np.log(1 + (1 / (self.v)) * ((e**2) / self.sigma2))
+        )
        return objective

    def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
@ -138,7 +144,9 @@ class StudentT(Likelihood):
            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
        """
        e = y - inv_link_f
-        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
+        hess = ((self.v + 1) * (e**2 - self.v * self.sigma2)) / (
+            (self.sigma2 * self.v + e**2) ** 2
+        )
        return hess

    def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
@ -157,9 +165,9 @@ class StudentT(Likelihood):
        :rtype: Nx1 array
        """
        e = y - inv_link_f
-        d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
-                       ((e**2 + self.sigma2*self.v)**3)
-                    )
+        d3lik_dlink3 = -(
+            2 * (self.v + 1) * (-e) * (e**2 - 3 * self.v * self.sigma2)
+        ) / ((e**2 + self.sigma2 * self.v) ** 3)
        return d3lik_dlink3

    def dlogpdf_link_dvar(self, inv_link_f, y, Y_metadata=None):
@ -179,7 +187,11 @@ class StudentT(Likelihood):
        """
        e = y - inv_link_f
        e2 = np.square(e)
-        dlogpdf_dvar = self.v*(e2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e2))
+        dlogpdf_dvar = (
+            self.v
+            * (e2 - self.sigma2)
+            / (2 * self.sigma2 * (self.sigma2 * self.v + e2))
+        )
        return dlogpdf_dvar

    def dlogpdf_dlink_dvar(self, inv_link_f, y, Y_metadata=None):
@ -198,7 +210,9 @@ class StudentT(Likelihood):
        :rtype: Nx1 array
        """
        e = y - inv_link_f
-        dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
+        dlogpdf_dlink_dvar = (self.v * (self.v + 1) * (-e)) / (
+            (self.sigma2 * self.v + e**2) ** 2
+        )
        return dlogpdf_dlink_dvar

    def d2logpdf_dlink2_dvar(self, inv_link_f, y, Y_metadata=None):
@ -217,9 +231,9 @@ class StudentT(Likelihood):
        :rtype: Nx1 array
        """
        e = y - inv_link_f
-        d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
-                              / ((self.sigma2*self.v + (e**2))**3)
-                           )
+        d2logpdf_dlink2_dvar = (
+            self.v * (self.v + 1) * (self.sigma2 * self.v - 3 * (e**2))
+        ) / ((self.sigma2 * self.v + (e**2)) ** 3)
        return d2logpdf_dlink2_dvar

    def dlogpdf_link_dv(self, inv_link_f, y, Y_metadata=None):
@ -227,9 +241,11 @@ class StudentT(Likelihood):
        e2 = np.square(e)
        df = float(self.v[:])
        s2 = float(self.sigma2[:])
-        dlogpdf_dv =  0.5*digamma(0.5*(df+1)) - 0.5*digamma(0.5*df) - 1.0/(2*df)
-        dlogpdf_dv += 0.5*(df+1)*e2/(df*(e2 + s2*df))
-        dlogpdf_dv -= 0.5*np.log1p(e2/(s2*df))
+        dlogpdf_dv = (
+            0.5 * digamma(0.5 * (df + 1)) - 0.5 * digamma(0.5 * df) - 1.0 / (2 * df)
+        )
+        dlogpdf_dv += 0.5 * (df + 1) * e2 / (df * (e2 + s2 * df))
+        dlogpdf_dv -= 0.5 * np.log1p(e2 / (s2 * df))
        return dlogpdf_dv

    def dlogpdf_dlink_dv(self, inv_link_f, y, Y_metadata=None):
@ -237,7 +253,7 @@ class StudentT(Likelihood):
        e2 = np.square(e)
        df = float(self.v[:])
        s2 = float(self.sigma2[:])
-        dlogpdf_df_dv = e*(e2 - self.sigma2)/(e2 + s2*df)**2
+        dlogpdf_df_dv = e * (e2 - self.sigma2) / (e2 + s2 * df) ** 2
        return dlogpdf_df_dv

    def d2logpdf_dlink2_dv(self, inv_link_f, y, Y_metadata=None):
@ -245,8 +261,10 @@ class StudentT(Likelihood):
        e2 = np.square(e)
        df = float(self.v[:])
        s2 = float(self.sigma2[:])
-        e2_s2v = e**2 + s2*df
-        d2logpdf_df2_dv = (-s2*(df+1) + e2 - s2*df)/e2_s2v**2 - 2*s2*(df+1)*(e2 - s2*df)/e2_s2v**3
+        e2_s2v = e**2 + s2 * df
+        d2logpdf_df2_dv = (-s2 * (df + 1) + e2 - s2 * df) / e2_s2v**2 - 2 * s2 * (
+            df + 1
+        ) * (e2 - s2 * df) / e2_s2v**3
        return d2logpdf_df2_dv

    def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
@ -266,19 +284,23 @@ class StudentT(Likelihood):

    def predictive_mean(self, mu, sigma, Y_metadata=None):
        # The comment here confuses mean and median.
-        return self.gp_link.transf(mu) # only true if link is monotonic, which it is.
+        return self.gp_link.transf(mu)  # only true if link is monotonic, which it is.

-    def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
-        if self.deg_free<=2.:
-            return np.empty(mu.shape)*np.nan # does not exist for degrees of freedom <= 2.
+    def predictive_variance(self, mu, variance, predictive_mean=None, Y_metadata=None):
+        if self.deg_free <= 2.0:
+            return (
+                np.empty(mu.shape) * np.nan
+            )  # does not exist for degrees of freedom <= 2.
        else:
-            return super(StudentT, self).predictive_variance(mu, variance, predictive_mean, Y_metadata)
+            return super(StudentT, self).predictive_variance(
+                mu, variance, predictive_mean, Y_metadata
+            )

    def conditional_mean(self, gp):
        return self.gp_link.transf(gp)

    def conditional_variance(self, gp):
-        return self.deg_free/(self.deg_free - 2.)
+        return self.deg_free / (self.deg_free - 2.0)

    def samples(self, gp, Y_metadata=None):
        """
@ -288,11 +310,10 @@ class StudentT(Likelihood):
        """
        orig_shape = gp.shape
        gp = gp.flatten()
-        #FIXME: Very slow as we are computing a new random variable per input!
-        #Can't get it to sample all at the same time
-        #student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
-        dfs = np.ones_like(gp)*self.v
-        scales = np.ones_like(gp)*np.sqrt(self.sigma2)
-        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp),
-                                        scale=scales)
+        # FIXME: Very slow as we are computing a new random variable per input!
+        # Can't get it to sample all at the same time
+        # student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
+        dfs = np.ones_like(gp) * self.v
+        scales = np.ones_like(gp) * np.sqrt(self.sigma2)
+        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp), scale=scales)
        return student_t_samples.reshape(orig_shape)
--- a/GPy/models/multioutput_gp.py
+++ b/GPy/models/multioutput_gp.py
@ -9,6 +9,7 @@ from ..core.mapping import Mapping
 from .. import likelihoods
 from ..likelihoods.gaussian import Gaussian
 from .. import kern
+from ..kern import DiffKern
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
 from ..util.normalizer import Standardize
 from .. import util
@ -69,39 +70,80 @@ class MultioutputGP(GP):
            if Y_metadata is None:
                Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
        return super(MultioutputGP, self).predict_quantiles(X, quantiles, Y_metadata, kern, likelihood)
-    
-    def predictive_gradients(self, Xnew, kern=None):
-        if isinstance(Xnew, list):
-            Xnew, _, ind  = util.multioutput.build_XY(Xnew, None)
-            #if Y_metadata is None:
-                #Y_metadata={'output_index': ind}
-        return super(MultioutputGP, self).predictive_gradients(Xnew, kern)

-    def predictive_gradients(self, Xnew, kern=None): #XNEW IS NOT A LIST!!
+    def predictive_gradients(self, Xnew, kern=None):
        """
-        Compute the derivatives of the predicted latent function with respect to X*
+        Compute the derivatives of the predicted latent function with respect
+        to X*
+        
        Given a set of points at which to predict X* (size [N*,Q]), compute the
        derivatives of the mean and variance. Resulting arrays are sized:
-         dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
-        Note that this is not the same as computing the mean and variance of the derivative of the function!
+            dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP
+            (usually one).
+        
+        Note that this is not the same as computing the mean and variance of
+        the derivative of the function!
+
         dv_dX*  -- [N*, Q],    (since all outputs have the same variance)
        :param X: The points at which to get the predictive gradients
        :type X: np.ndarray (Xnew x self.input_dim)
        :returns: dmu_dX, dv_dX
        :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q) ]
+
        """
        
        if isinstance(Xnew, list):
            Xnew, _, ind  = util.multioutput.build_XY(Xnew, None)
        
        slices = index_to_slices(Xnew[:,-1])
-        
-        for i in range(len(slices)):
-            if ((self.kern.kern[i].name == 'diffKern' ) and len(slices[i])>0):
-                assert 0, "It is not (yet) possible to predict gradients of gradient observations, sorry :)"
- 
+
        if kern is None:
            kern = self.kern
+
+        if all([(isinstance(k, DiffKern)) for k in self.kern.kern[1:]]):
+            """
+            Compute the gradients of the predicted latent function and predicted
+            partial derivatives with respect to X*.
+
+            This works only for models that observe the gradient of the latent function.
+
+            Xnew is given as a list of arrays, where each array X*_i (size [N_i*, Q])
+            contains points at which to compute gradients for each predicted latent
+            function or partial derivative.
+
+            Resulting arrays are sized [sum_i^D : N_i*, Q]
+
+            Passing a list of only one array [X*] returns only gradients of
+            the predicted latent function and does not compute gradients of
+            predicted partial derivatives.
+
+            In this case the resulting arrays are sized [N*, Q].
+            
+            :param Xnew: points at which to compute predictive gradients
+            :type Xnew: list
+            :type Xnew[i]: np.darray (sum_i^D : N_i*, Q)
+            :returns: dmu_dX, dv_dX
+            :rtype: (np.ndarray (sum_i^D : N_i*, Q), np.ndarray (sum_i^D : N_i*, Q))
+            """
+
+            dims = Xnew.shape[1] - 1
+
+            mean_jac = np.empty((Xnew.shape[0], dims))
+            var_jac = np.empty((Xnew.shape[0], dims))
+
+            X = self._predictive_variable
+            alpha = self.posterior.woodbury_vector
+            Wi = self.posterior.woodbury_inv
+
+            k = kern.K(Xnew, X)
+            for dimX in range(dims):
+                dk_dx = kern.dK_dX(Xnew, X, dimX)
+                dk_dxdiag = kern.dK_dXdiag(Xnew, dimX)
+
+                mean_jac[:,dimX] = np.dot(dk_dx, alpha).flatten()
+                var_jac[:,dimX] = dk_dxdiag - 2*(np.dot(k, Wi)*dk_dx).sum(-1)
+            return mean_jac, var_jac
+
        mean_jac = np.empty((Xnew.shape[0],Xnew.shape[1]-1,self.output_dim))
        for i in range(self.output_dim):
            mean_jac[:,:,i] = kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self._predictive_variable)[:,0:-1]
--- a/GPy/models/sparse_gp_coregionalized_regression.py
+++ b/GPy/models/sparse_gp_coregionalized_regression.py
@ -7,6 +7,7 @@ from ..inference.latent_function_inference import VarDTC
 from .. import kern
 from .. import util

+
 class SparseGPCoregionalizedRegression(SparseGP):
    """
    Sparse Gaussian Process model for heteroscedastic multioutput regression
@ -34,34 +35,65 @@ class SparseGPCoregionalizedRegression(SparseGP):
    :type kernel_name: string
    """

-    def __init__(self, X_list, Y_list, Z_list=[], kernel=None, likelihoods_list=None, num_inducing=10, X_variance=None, name='SGPCR',W_rank=1,kernel_name='coreg'):
-
-        #Input and Output
-        X,Y,self.output_index = util.multioutput.build_XY(X_list,Y_list)
+    def __init__(
+        self,
+        X_list,
+        Y_list,
+        Z_list=[],
+        kernel=None,
+        likelihoods_list=None,
+        num_inducing=10,
+        X_variance=None,
+        name="SGPCR",
+        W_rank=1,
+        kernel_name="coreg",
+    ):
+        # Input and Output
+        X, Y, self.output_index = util.multioutput.build_XY(X_list, Y_list)
        Ny = len(Y_list)

-        #Kernel
+        # Kernel
        if kernel is None:
-            kernel = kern.RBF(X.shape[1]-1)
-            
-            kernel = util.multioutput.ICM(input_dim=X.shape[1]-1, num_outputs=Ny, kernel=kernel, W_rank=W_rank, name=kernel_name)
+            kernel = kern.RBF(X.shape[1] - 1)

-        #Likelihood
-        likelihood = util.multioutput.build_likelihood(Y_list,self.output_index,likelihoods_list)
+            kernel = util.multioutput.ICM(
+                input_dim=X.shape[1] - 1,
+                num_outputs=Ny,
+                kernel=kernel,
+                W_rank=W_rank,
+                name=kernel_name,
+            )

-        #Inducing inputs list
+        # Likelihood
+        likelihood = util.multioutput.build_likelihood(
+            Y_list, self.output_index, likelihoods_list
+        )
+
+        # Inducing inputs list
        if len(Z_list):
-            assert len(Z_list) == Ny, 'Number of outputs do not match length of inducing inputs list.'
+            assert (
+                len(Z_list) == Ny
+            ), "Number of outputs do not match length of inducing inputs list."
        else:
-            if isinstance(num_inducing,np.int):
+            if isinstance(num_inducing, int):
                num_inducing = [num_inducing] * Ny
            num_inducing = np.asarray(num_inducing)
-            assert num_inducing.size == Ny, 'Number of outputs do not match length of inducing inputs list.'
-            for ni,Xi in zip(num_inducing,X_list):
+            assert (
+                num_inducing.size == Ny
+            ), "Number of outputs do not match length of inducing inputs list."
+            for ni, Xi in zip(num_inducing, X_list):
                i = np.random.permutation(Xi.shape[0])[:ni]
                Z_list.append(Xi[i].copy())

        Z, _, Iz = util.multioutput.build_XY(Z_list)

-        super(SparseGPCoregionalizedRegression, self).__init__(X, Y, Z, kernel, likelihood, inference_method=VarDTC(), Y_metadata={'output_index':self.output_index})
-        self['.*inducing'][:,-1].fix()
+        super(SparseGPCoregionalizedRegression, self).__init__(
+            X,
+            Y,
+            Z,
+            kernel,
+            likelihood,
+            inference_method=VarDTC(),
+            Y_metadata={"output_index": self.output_index},
+        )
+        self[".*inducing"][:, -1].fix()
--- a/GPy/models/ss_mrd.py
+++ b/GPy/models/ss_mrd.py
@ -5,52 +5,110 @@ The Maniforld Relevance Determination model with the spike-and-slab prior
 import numpy as np
 from ..core import Model
 from .ss_gplvm import SSGPLVM
-from GPy.core.parameterization.variational import SpikeAndSlabPrior,NormalPosterior,VariationalPrior
+from GPy.core.parameterization.variational import (
+    SpikeAndSlabPrior,
+    NormalPosterior,
+    VariationalPrior,
+)
 from ..util.misc import param_to_array
 from ..kern import RBF
 from ..core import Param
 from numpy.linalg.linalg import LinAlgError

+
 class SSMRD(Model):
-    
-    def __init__(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx = 'PCA_concat', initz = 'permute', 
-                 num_inducing=10, Zs=None, kernels=None, inference_methods=None, likelihoods=None, group_spike=True,
-                 pi=0.5, name='ss_mrd', Ynames=None, mpi_comm=None, IBP=False, alpha=2., taus=None, ):
+    def __init__(
+        self,
+        Ylist,
+        input_dim,
+        X=None,
+        X_variance=None,
+        Gammas=None,
+        initx="PCA_concat",
+        initz="permute",
+        num_inducing=10,
+        Zs=None,
+        kernels=None,
+        inference_methods=None,
+        likelihoods=None,
+        group_spike=True,
+        pi=0.5,
+        name="ss_mrd",
+        Ynames=None,
+        mpi_comm=None,
+        IBP=False,
+        alpha=2.0,
+        taus=None,
+    ):
        super(SSMRD, self).__init__(name)
        self.mpi_comm = mpi_comm
        self._PROPAGATE_ = False
-        
+
        # initialize X for individual models
-        X, X_variance, Gammas, fracs = self._init_X(Ylist, input_dim, X, X_variance, Gammas, initx)
+        X, X_variance, Gammas, fracs = self._init_X(
+            Ylist, input_dim, X, X_variance, Gammas, initx
+        )
        self.X = NormalPosterior(means=X, variances=X_variance)
-        
+
        if kernels is None:
-            kernels = [RBF(input_dim, lengthscale=1./fracs, ARD=True) for i in range(len(Ylist))]
+            kernels = [
+                RBF(input_dim, lengthscale=1.0 / fracs, ARD=True)
+                for i in range(len(Ylist))
+            ]
        if Zs is None:
-            Zs = [None]* len(Ylist)
+            Zs = [None] * len(Ylist)
        if likelihoods is None:
-            likelihoods = [None]* len(Ylist)
+            likelihoods = [None] * len(Ylist)
        if inference_methods is None:
-            inference_methods = [None]* len(Ylist)
-        
+            inference_methods = [None] * len(Ylist)
+
        if IBP:
-            self.var_priors = [IBPPrior_SSMRD(len(Ylist),input_dim,alpha=alpha) for i in range(len(Ylist))]
+            self.var_priors = [
+                IBPPrior_SSMRD(len(Ylist), input_dim, alpha=alpha)
+                for i in range(len(Ylist))
+            ]
        else:
-            self.var_priors = [SpikeAndSlabPrior_SSMRD(nModels=len(Ylist),pi=pi,learnPi=False, group_spike=group_spike) for i in range(len(Ylist))]
-        self.models = [SSGPLVM(y, input_dim, X=X.copy(), X_variance=X_variance.copy(), Gamma=Gammas[i], num_inducing=num_inducing,Z=Zs[i], learnPi=False, group_spike=group_spike,
-                               kernel=kernels[i],inference_method=inference_methods[i],likelihood=likelihoods[i], variational_prior=self.var_priors[i], IBP=IBP, tau=None if taus is None else taus[i],
-                               name='model_'+str(i), mpi_comm=mpi_comm, sharedX=True) for i,y in enumerate(Ylist)]
-        self.link_parameters(*(self.models+[self.X]))
-        
+            self.var_priors = [
+                SpikeAndSlabPrior_SSMRD(
+                    nModels=len(Ylist), pi=pi, learnPi=False, group_spike=group_spike
+                )
+                for i in range(len(Ylist))
+            ]
+        self.models = [
+            SSGPLVM(
+                y,
+                input_dim,
+                X=X.copy(),
+                X_variance=X_variance.copy(),
+                Gamma=Gammas[i],
+                num_inducing=num_inducing,
+                Z=Zs[i],
+                learnPi=False,
+                group_spike=group_spike,
+                kernel=kernels[i],
+                inference_method=inference_methods[i],
+                likelihood=likelihoods[i],
+                variational_prior=self.var_priors[i],
+                IBP=IBP,
+                tau=None if taus is None else taus[i],
+                name="model_" + str(i),
+                mpi_comm=mpi_comm,
+                sharedX=True,
+            )
+            for i, y in enumerate(Ylist)
+        ]
+        self.link_parameters(*(self.models + [self.X]))
+
    def _propogate_X_val(self):
-        if self._PROPAGATE_: return
+        if self._PROPAGATE_:
+            return
        for m in self.models:
            m.X.mean.values[:] = self.X.mean.values
            m.X.variance.values[:] = self.X.variance.values
        varp_list = [m.X for m in self.models]
        [vp._update_inernal(varp_list) for vp in self.var_priors]
-        self._PROPAGATE_=True
-    
+        self._PROPAGATE_ = True
+
    def _collate_X_gradient(self):
        self._PROPAGATE_ = False
        self.X.mean.gradient[:] = 0
@ -58,86 +116,92 @@ class SSMRD(Model):
        for m in self.models:
            self.X.mean.gradient += m.X.mean.gradient
            self.X.variance.gradient += m.X.variance.gradient
-        
+
    def parameters_changed(self):
        super(SSMRD, self).parameters_changed()
        [m.parameters_changed() for m in self.models]
-        self._log_marginal_likelihood = sum([m._log_marginal_likelihood for m in self.models])
+        self._log_marginal_likelihood = sum(
+            [m._log_marginal_likelihood for m in self.models]
+        )
        self._collate_X_gradient()

    def log_likelihood(self):
        return self._log_marginal_likelihood
-    
-    def _init_X(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx='PCA_concat'):
-        
+
+    def _init_X(
+        self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx="PCA_concat"
+    ):
        # Divide latent dimensions
-        idx = np.empty((input_dim,),dtype=np.int)
-        residue = (input_dim)%(len(Ylist))
+        idx = np.empty((input_dim,), dtype=int)
+        residue = (input_dim) % (len(Ylist))
        for i in range(len(Ylist)):
            if i < residue:
-                size = input_dim/len(Ylist)+1
-                idx[i*size:(i+1)*size] = i
+                size = input_dim / len(Ylist) + 1
+                idx[i * size : (i + 1) * size] = i
            else:
-                size = input_dim/len(Ylist)
-                idx[i*size+residue:(i+1)*size+residue] = i
-        
+                size = input_dim / len(Ylist)
+                idx[i * size + residue : (i + 1) * size + residue] = i
+
        if X is None:
-            if initx == 'PCA_concat':
-                X = np.empty((Ylist[0].shape[0],input_dim))
+            if initx == "PCA_concat":
+                X = np.empty((Ylist[0].shape[0], input_dim))
                fracs = np.empty((input_dim,))
                from ..util.initialization import initialize_latent
+
                for i in range(len(Ylist)):
                    Y = Ylist[i]
-                    dim = (idx==i).sum()
-                    if dim>0:
-                        x, fr = initialize_latent('PCA', dim, Y)
-                        X[:,idx==i] = x
-                        fracs[idx==i] = fr
-            elif initx=='PCA_joint':
+                    dim = (idx == i).sum()
+                    if dim > 0:
+                        x, fr = initialize_latent("PCA", dim, Y)
+                        X[:, idx == i] = x
+                        fracs[idx == i] = fr
+            elif initx == "PCA_joint":
                y = np.hstack(Ylist)
                from ..util.initialization import initialize_latent
-                X, fracs = initialize_latent('PCA', input_dim, y)
+
+                X, fracs = initialize_latent("PCA", input_dim, y)
            else:
                X = np.random.randn(Ylist[0].shape[0], input_dim)
                fracs = np.ones(input_dim)
        else:
            fracs = np.ones(input_dim)
-            
-    
-        if X_variance is None: # The variance of the variational approximation (S)
-            X_variance = np.random.uniform(0,.1,X.shape)
-            
+
+        if X_variance is None:  # The variance of the variational approximation (S)
+            X_variance = np.random.uniform(0, 0.1, X.shape)
+
        if Gammas is None:
            Gammas = []
            for x in X:
-                gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+                gamma = np.empty_like(
+                    X
+                )  # The posterior probabilities of the binary variable in the variational approximation
                gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
-                gamma[gamma>1.-1e-9] = 1.-1e-9
-                gamma[gamma<1e-9] = 1e-9
+                gamma[gamma > 1.0 - 1e-9] = 1.0 - 1e-9
+                gamma[gamma < 1e-9] = 1e-9
                Gammas.append(gamma)
        return X, X_variance, Gammas, fracs

    @Model.optimizer_array.setter
    def optimizer_array(self, p):
        if self.mpi_comm != None:
-            if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0:
-                self.mpi_comm.Bcast(np.int32(1),root=0)
-            self.mpi_comm.Bcast(p, root=0)        
-        Model.optimizer_array.fset(self,p)
-        
+            if self._IN_OPTIMIZATION_ and self.mpi_comm.rank == 0:
+                self.mpi_comm.Bcast(np.int32(1), root=0)
+            self.mpi_comm.Bcast(p, root=0)
+        Model.optimizer_array.fset(self, p)
+
    def optimize(self, optimizer=None, start=None, **kwargs):
        self._IN_OPTIMIZATION_ = True
-        if self.mpi_comm==None:
-            super(SSMRD, self).optimize(optimizer,start,**kwargs)
-        elif self.mpi_comm.rank==0:
-            super(SSMRD, self).optimize(optimizer,start,**kwargs)
-            self.mpi_comm.Bcast(np.int32(-1),root=0)
-        elif self.mpi_comm.rank>0:
+        if self.mpi_comm == None:
+            super(SSMRD, self).optimize(optimizer, start, **kwargs)
+        elif self.mpi_comm.rank == 0:
+            super(SSMRD, self).optimize(optimizer, start, **kwargs)
+            self.mpi_comm.Bcast(np.int32(-1), root=0)
+        elif self.mpi_comm.rank > 0:
            x = self.optimizer_array.copy()
-            flag = np.empty(1,dtype=np.int32)
+            flag = np.empty(1, dtype=np.int32)
            while True:
-                self.mpi_comm.Bcast(flag,root=0)
-                if flag==1:
+                self.mpi_comm.Bcast(flag, root=0)
+                if flag == 1:
                    try:
                        self.optimizer_array = x
                        self._fail_count = 0
@ -145,29 +209,51 @@ class SSMRD(Model):
                        if self._fail_count >= self._allowed_failures:
                            raise
                        self._fail_count += 1
-                elif flag==-1:
+                elif flag == -1:
                    break
                else:
                    self._IN_OPTIMIZATION_ = False
                    raise Exception("Unrecognizable flag for synchronization!")
        self._IN_OPTIMIZATION_ = False
-        
+

 class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
-    def __init__(self, nModels, pi=0.5, learnPi=False, group_spike=True, variance = 1.0, name='SSMRDPrior', **kw):
+    def __init__(
+        self,
+        nModels,
+        pi=0.5,
+        learnPi=False,
+        group_spike=True,
+        variance=1.0,
+        name="SSMRDPrior",
+        **kw
+    ):
        self.nModels = nModels
        self._b_prob_all = 0.5
-        super(SpikeAndSlabPrior_SSMRD, self).__init__(pi=pi,learnPi=learnPi,group_spike=group_spike,variance=variance, name=name, **kw)
-    
+        super(SpikeAndSlabPrior_SSMRD, self).__init__(
+            pi=pi,
+            learnPi=learnPi,
+            group_spike=group_spike,
+            variance=variance,
+            name=name,
+            **kw
+        )
+
    def _update_inernal(self, varp_list):
        """Make an update of the internal status by gathering the variational posteriors for all the individual models."""
        # The probability for the binary variable for the same latent dimension of any of the models is on.
        if self.group_spike:
-            self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
-            [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+            self._b_prob_all = 1.0 - param_to_array(varp_list[0].gamma_group)
+            [
+                np.multiply(self._b_prob_all, 1.0 - vp.gamma_group, self._b_prob_all)
+                for vp in varp_list[1:]
+            ]
        else:
-            self._b_prob_all = 1.-param_to_array(varp_list[0].binary_prob)
-            [np.multiply(self._b_prob_all, 1.-vp.binary_prob, self._b_prob_all) for vp in varp_list[1:]]            
+            self._b_prob_all = 1.0 - param_to_array(varp_list[0].binary_prob)
+            [
+                np.multiply(self._b_prob_all, 1.0 - vp.binary_prob, self._b_prob_all)
+                for vp in varp_list[1:]
+            ]

    def KL_divergence(self, variational_posterior):
        mu = variational_posterior.mean
@ -176,16 +262,20 @@ class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
            gamma = variational_posterior.binary_prob[0]
        else:
            gamma = variational_posterior.binary_prob
-        if len(self.pi.shape)==2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+        if len(self.pi.shape) == 2:
+            idx = np.unique(gamma._raveled_index() / gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi

-        var_mean = np.square(mu)/self.variance
-        var_S = (S/self.variance - np.log(S))
-        var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
-        return var_gamma +((1.-self._b_prob_all)*(np.log(self.variance)-1. +var_mean + var_S)).sum()/(2.*self.nModels)
+        var_mean = np.square(mu) / self.variance
+        var_S = S / self.variance - np.log(S)
+        var_gamma = (gamma * np.log(gamma / pi)).sum() + (
+            (1 - gamma) * np.log((1 - gamma) / (1 - pi))
+        ).sum()
+        return var_gamma + (
+            (1.0 - self._b_prob_all) * (np.log(self.variance) - 1.0 + var_mean + var_S)
+        ).sum() / (2.0 * self.nModels)

    def update_gradients_KL(self, variational_posterior):
        mu = variational_posterior.mean
@ -195,63 +285,141 @@ class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
            gamma = variational_posterior.binary_prob.values[0]
        else:
            gamma = variational_posterior.binary_prob.values
-        if len(self.pi.shape)==2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+        if len(self.pi.shape) == 2:
+            idx = np.unique(gamma._raveled_index() / gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi

        if self.group_spike:
-            tmp = self._b_prob_all/(1.-gamma)
-            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))/N +tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+            tmp = self._b_prob_all / (1.0 - gamma)
+            variational_posterior.binary_prob.gradient -= (
+                np.log((1 - pi) / pi * gamma / (1.0 - gamma)) / N
+                + tmp
+                * (
+                    (np.square(mu) + S) / self.variance
+                    - np.log(S)
+                    + np.log(self.variance)
+                    - 1.0
+                )
+                / 2.0
+            )
        else:
-            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
-        mu.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
-        S.gradient -= (1./self.variance - 1./S) * (1.-self._b_prob_all) /(2.*self.nModels)
+            variational_posterior.binary_prob.gradient -= (
+                np.log((1 - pi) / pi * gamma / (1.0 - gamma))
+                + (
+                    (np.square(mu) + S) / self.variance
+                    - np.log(S)
+                    + np.log(self.variance)
+                    - 1.0
+                )
+                / 2.0
+            )
+        mu.gradient -= (1.0 - self._b_prob_all) * mu / (self.variance * self.nModels)
+        S.gradient -= (
+            (1.0 / self.variance - 1.0 / S)
+            * (1.0 - self._b_prob_all)
+            / (2.0 * self.nModels)
+        )
        if self.learnPi:
-            raise 'Not Supported!'
+            raise "Not Supported!"
+

 class IBPPrior_SSMRD(VariationalPrior):
-    def __init__(self, nModels, input_dim, alpha =2., tau=None, name='IBPPrior', **kw):
+    def __init__(self, nModels, input_dim, alpha=2.0, tau=None, name="IBPPrior", **kw):
        super(IBPPrior_SSMRD, self).__init__(name=name, **kw)
-        from paramz.transformations import Logexp, __fixed__  
+        from paramz.transformations import Logexp, __fixed__
+
        self.nModels = nModels
        self._b_prob_all = 0.5
        self.input_dim = input_dim
-        self.variance = 1.
-        self.alpha = Param('alpha', alpha, __fixed__)
+        self.variance = 1.0
+        self.alpha = Param("alpha", alpha, __fixed__)
        self.link_parameter(self.alpha)
-        
+
    def _update_inernal(self, varp_list):
        """Make an update of the internal status by gathering the variational posteriors for all the individual models."""
        # The probability for the binary variable for the same latent dimension of any of the models is on.
-        self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
-        [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+        self._b_prob_all = 1.0 - param_to_array(varp_list[0].gamma_group)
+        [
+            np.multiply(self._b_prob_all, 1.0 - vp.gamma_group, self._b_prob_all)
+            for vp in varp_list[1:]
+        ]

    def KL_divergence(self, variational_posterior):
-        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
-            
-        var_mean = np.square(mu)/self.variance
-        var_S = (S/self.variance - np.log(S))
-        part1 = ((1.-self._b_prob_all)* (np.log(self.variance)-1. +var_mean + var_S)).sum()/(2.*self.nModels)
-        
-        ad = self.alpha/self.input_dim
-        from scipy.special import betaln,digamma
-        part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + (betaln(ad,1.)*self.input_dim -betaln(tau[:,0], tau[:,1]).sum())/self.nModels \
-                 + (( (tau[:,0]-ad)/self.nModels -gamma)*digamma(tau[:,0])).sum() + \
-                (((tau[:,1]-1.)/self.nModels+gamma-1.)*digamma(tau[:,1])).sum() + (((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*digamma(tau.sum(axis=1))).sum()
-        return part1+part2
+        mu, S, gamma, tau = (
+            variational_posterior.mean.values,
+            variational_posterior.variance.values,
+            variational_posterior.gamma_group.values,
+            variational_posterior.tau.values,
+        )
+
+        var_mean = np.square(mu) / self.variance
+        var_S = S / self.variance - np.log(S)
+        part1 = (
+            (1.0 - self._b_prob_all) * (np.log(self.variance) - 1.0 + var_mean + var_S)
+        ).sum() / (2.0 * self.nModels)
+
+        ad = self.alpha / self.input_dim
+        from scipy.special import betaln, digamma
+
+        part2 = (
+            (gamma * np.log(gamma)).sum()
+            + ((1.0 - gamma) * np.log(1.0 - gamma)).sum()
+            + (betaln(ad, 1.0) * self.input_dim - betaln(tau[:, 0], tau[:, 1]).sum())
+            / self.nModels
+            + (((tau[:, 0] - ad) / self.nModels - gamma) * digamma(tau[:, 0])).sum()
+            + (
+                ((tau[:, 1] - 1.0) / self.nModels + gamma - 1.0) * digamma(tau[:, 1])
+            ).sum()
+            + (
+                ((1.0 + ad - tau[:, 0] - tau[:, 1]) / self.nModels + 1.0)
+                * digamma(tau.sum(axis=1))
+            ).sum()
+        )
+        return part1 + part2

    def update_gradients_KL(self, variational_posterior):
-        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+        mu, S, gamma, tau = (
+            variational_posterior.mean.values,
+            variational_posterior.variance.values,
+            variational_posterior.gamma_group.values,
+            variational_posterior.tau.values,
+        )

-        variational_posterior.mean.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
-        variational_posterior.variance.gradient -= (1./self.variance - 1./S) * (1.-self._b_prob_all) /(2.*self.nModels)
-        from scipy.special import digamma,polygamma
-        tmp = self._b_prob_all/(1.-gamma)
-        dgamma = (np.log(gamma/(1.-gamma))+ digamma(tau[:,1])-digamma(tau[:,0]))/variational_posterior.num_data
-        variational_posterior.binary_prob.gradient -= dgamma+tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
-        ad = self.alpha/self.input_dim
-        common = ((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*polygamma(1,tau.sum(axis=1))
-        variational_posterior.tau.gradient[:,0] = -(((tau[:,0]-ad)/self.nModels -gamma)*polygamma(1,tau[:,0])+common)
-        variational_posterior.tau.gradient[:,1] = -(((tau[:,1]-1.)/self.nModels+gamma-1.)*polygamma(1,tau[:,1])+common)
+        variational_posterior.mean.gradient -= (
+            (1.0 - self._b_prob_all) * mu / (self.variance * self.nModels)
+        )
+        variational_posterior.variance.gradient -= (
+            (1.0 / self.variance - 1.0 / S)
+            * (1.0 - self._b_prob_all)
+            / (2.0 * self.nModels)
+        )
+        from scipy.special import digamma, polygamma
+
+        tmp = self._b_prob_all / (1.0 - gamma)
+        dgamma = (
+            np.log(gamma / (1.0 - gamma)) + digamma(tau[:, 1]) - digamma(tau[:, 0])
+        ) / variational_posterior.num_data
+        variational_posterior.binary_prob.gradient -= (
+            dgamma
+            + tmp
+            * (
+                (np.square(mu) + S) / self.variance
+                - np.log(S)
+                + np.log(self.variance)
+                - 1.0
+            )
+            / 2.0
+        )
+        ad = self.alpha / self.input_dim
+        common = ((1.0 + ad - tau[:, 0] - tau[:, 1]) / self.nModels + 1.0) * polygamma(
+            1, tau.sum(axis=1)
+        )
+        variational_posterior.tau.gradient[:, 0] = -(
+            ((tau[:, 0] - ad) / self.nModels - gamma) * polygamma(1, tau[:, 0]) + common
+        )
+        variational_posterior.tau.gradient[:, 1] = -(
+            ((tau[:, 1] - 1.0) / self.nModels + gamma - 1.0) * polygamma(1, tau[:, 1])
+            + common
+        )
--- a/GPy/models/state_space_cython.c
+++ b/GPy/models/state_space_cython.c
--- a/GPy/models/state_space_main.py
+++ b/GPy/models/state_space_main.py
--- a/GPy/old_tests/bcgplvm_tests.py
+++ b/GPy/old_tests/bcgplvm_tests.py
@ -17,7 +17,7 @@ class BCGPLVMTests(unittest.TestCase):
        mapping = GPy.mappings.Kernel(output_dim=input_dim, X=Y, kernel=bk)
        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
        
    def test_linear_backconstraint(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
@ -30,7 +30,7 @@ class BCGPLVMTests(unittest.TestCase):
        mapping = GPy.mappings.Linear(output_dim=input_dim, input_dim=output_dim)
        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
        
    def test_mlp_backconstraint(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
@ -43,7 +43,7 @@ class BCGPLVMTests(unittest.TestCase):
        mapping = GPy.mappings.MLP(output_dim=input_dim, input_dim=output_dim, hidden_dim=[5, 4, 7])
        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

 if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
--- a/GPy/old_tests/gp_transformation_tests.py
+++ b/GPy/old_tests/gp_transformation_tests.py
@ -1,4 +1,3 @@
-from nose.tools import with_setup
 from GPy.models import GradientChecker
 from GPy.likelihoods.noise_models import gp_transformations
 import inspect
--- a/GPy/old_tests/gplvm_tests.py
+++ b/GPy/old_tests/gplvm_tests.py
@ -15,7 +15,7 @@ class GPLVMTests(unittest.TestCase):
        k = GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_linear_kern(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
@ -26,7 +26,7 @@ class GPLVMTests(unittest.TestCase):
        k = GPy.kern.Linear(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_rbf_kern(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
@ -37,7 +37,7 @@ class GPLVMTests(unittest.TestCase):
        k = GPy.kern.RBF(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

 if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
--- a/GPy/old_tests/psi_stat_gradient_tests.py
+++ b/GPy/old_tests/psi_stat_gradient_tests.py
@ -1,8 +1,8 @@
-'''
+"""
 Created on 22 Apr 2013

@author: maxz
-'''
+"""
 import unittest
 import numpy

@ -13,42 +13,66 @@ from GPy.core.parameterization.param import Param
 from GPy.core.parameterization.transformations import Logexp
 from GPy.core.parameterization.variational import NormalPosterior

+
 class PsiStatModel(Model):
    def __init__(self, which, X, X_variance, Z, num_inducing, kernel):
-        super(PsiStatModel, self).__init__(name='psi stat test')
+        super(PsiStatModel, self).__init__(name="psi stat test")
        self.which = which
        self.X = Param("X", X)
-        self.X_variance = Param('X_variance', X_variance, Logexp())
+        self.X_variance = Param("X_variance", X_variance, Logexp())
        self.q = NormalPosterior(self.X, self.X_variance)
        self.Z = Param("Z", Z)
        self.N, self.input_dim = X.shape
        self.num_inducing, input_dim = Z.shape
-        assert self.input_dim == input_dim, "shape missmatch: Z:{!s} X:{!s}".format(Z.shape, X.shape)
+        assert self.input_dim == input_dim, "shape missmatch: Z:{!s} X:{!s}".format(
+            Z.shape, X.shape
+        )
        self.kern = kernel
        self.psi_ = self.kern.__getattribute__(self.which)(self.Z, self.q)
        self.add_parameters(self.q, self.Z, self.kern)

    def log_likelihood(self):
-        return self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance).sum()
+        return self.kern.__getattribute__(self.which)(
+            self.Z, self.X, self.X_variance
+        ).sum()

    def parameters_changed(self):
-        psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(numpy.ones_like(self.psi_), self.Z, self.q)
+        psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(
+            numpy.ones_like(self.psi_), self.Z, self.q
+        )
        self.X.gradient = psimu
        self.X_variance.gradient = psiS
-        #psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim)
-        try: psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.q)
-        except AttributeError: psiZ = numpy.zeros_like(self.Z)
+        # psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim)
+        try:
+            psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(
+                numpy.ones_like(self.psi_), self.Z, self.q
+            )
+        except AttributeError:
+            psiZ = numpy.zeros_like(self.Z)
        self.Z.gradient = psiZ
-        #psiZ = numpy.ones(self.num_inducing * self.input_dim)
-        N,M = self.X.shape[0], self.Z.shape[0]
-        dL_dpsi0, dL_dpsi1, dL_dpsi2 = numpy.zeros([N]), numpy.zeros([N,M]), numpy.zeros([N,M,M])
-        if self.which == 'psi0': dL_dpsi0 += 1
-        if self.which == 'psi1': dL_dpsi1 += 1
-        if self.which == 'psi2': dL_dpsi2 += 1
-        self.kern.update_gradients_variational(numpy.zeros([1,1]),
-                                               dL_dpsi0,
-                                               dL_dpsi1,
-                                               dL_dpsi2, self.X, self.X_variance, self.Z)
+        # psiZ = numpy.ones(self.num_inducing * self.input_dim)
+        N, M = self.X.shape[0], self.Z.shape[0]
+        dL_dpsi0, dL_dpsi1, dL_dpsi2 = (
+            numpy.zeros([N]),
+            numpy.zeros([N, M]),
+            numpy.zeros([N, M, M]),
+        )
+        if self.which == "psi0":
+            dL_dpsi0 += 1
+        if self.which == "psi1":
+            dL_dpsi1 += 1
+        if self.which == "psi2":
+            dL_dpsi2 += 1
+        self.kern.update_gradients_variational(
+            numpy.zeros([1, 1]),
+            dL_dpsi0,
+            dL_dpsi1,
+            dL_dpsi2,
+            self.X,
+            self.X_variance,
+            self.Z,
+        )
+

 class DPsiStatTest(unittest.TestCase):
    input_dim = 5
@ -56,128 +80,206 @@ class DPsiStatTest(unittest.TestCase):
    num_inducing = 10
    input_dim = 20
    X = numpy.random.randn(N, input_dim)
-    X_var = .5 * numpy.ones_like(X) + .4 * numpy.clip(numpy.random.randn(*X.shape), 0, 1)
+    X_var = 0.5 * numpy.ones_like(X) + 0.4 * numpy.clip(
+        numpy.random.randn(*X.shape), 0, 1
+    )
    Z = numpy.random.permutation(X)[:num_inducing]
    Y = X.dot(numpy.random.randn(input_dim, input_dim))
-#     kernels = [GPy.kern.Linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)), GPy.kern.RBF(input_dim, ARD=True), GPy.kern.Bias(input_dim)]
+    #     kernels = [GPy.kern.Linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)), GPy.kern.RBF(input_dim, ARD=True), GPy.kern.Bias(input_dim)]

    kernels = [
-               GPy.kern.Linear(input_dim),
-               GPy.kern.RBF(input_dim),
-               #GPy.kern.Bias(input_dim),
-               #GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim),
-               #GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim)
-               ]
+        GPy.kern.Linear(input_dim),
+        GPy.kern.RBF(input_dim),
+        # GPy.kern.Bias(input_dim),
+        # GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim),
+        # GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim)
+    ]

    def testPsi0(self):
        for k in self.kernels:
-            m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,\
-                             num_inducing=self.num_inducing, kernel=k)
+            m = PsiStatModel(
+                "psi0",
+                X=self.X,
+                X_variance=self.X_var,
+                Z=self.Z,
+                num_inducing=self.num_inducing,
+                kernel=k,
+            )
            m.randomize()
-            assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k._parameters_)))
+            assert m.checkgrad(), "{} x psi0".format(
+                "+".join(map(lambda x: x.name, k._parameters_))
+            )

    def testPsi1(self):
        for k in self.kernels:
-            m = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     num_inducing=self.num_inducing, kernel=k)
+            m = PsiStatModel(
+                "psi1",
+                X=self.X,
+                X_variance=self.X_var,
+                Z=self.Z,
+                num_inducing=self.num_inducing,
+                kernel=k,
+            )
            m.randomize()
-            assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k._parameters_)))
+            assert m.checkgrad(), "{} x psi1".format(
+                "+".join(map(lambda x: x.name, k._parameters_))
+            )

    def testPsi2_lin(self):
        k = self.kernels[0]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                 num_inducing=self.num_inducing, kernel=k)
+        m = PsiStatModel(
+            "psi2",
+            X=self.X,
+            X_variance=self.X_var,
+            Z=self.Z,
+            num_inducing=self.num_inducing,
+            kernel=k,
+        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
+            "+".join(map(lambda x: x.name, k._parameters_))
+        )
+
    def testPsi2_lin_bia(self):
        k = self.kernels[3]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     num_inducing=self.num_inducing, kernel=k)
+        m = PsiStatModel(
+            "psi2",
+            X=self.X,
+            X_variance=self.X_var,
+            Z=self.Z,
+            num_inducing=self.num_inducing,
+            kernel=k,
+        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
+            "+".join(map(lambda x: x.name, k._parameters_))
+        )
+
    def testPsi2_rbf(self):
        k = self.kernels[1]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     num_inducing=self.num_inducing, kernel=k)
+        m = PsiStatModel(
+            "psi2",
+            X=self.X,
+            X_variance=self.X_var,
+            Z=self.Z,
+            num_inducing=self.num_inducing,
+            kernel=k,
+        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
+            "+".join(map(lambda x: x.name, k._parameters_))
+        )
+
    def testPsi2_rbf_bia(self):
        k = self.kernels[-1]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     num_inducing=self.num_inducing, kernel=k)
+        m = PsiStatModel(
+            "psi2",
+            X=self.X,
+            X_variance=self.X_var,
+            Z=self.Z,
+            num_inducing=self.num_inducing,
+            kernel=k,
+        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
+            "+".join(map(lambda x: x.name, k._parameters_))
+        )
+
    def testPsi2_bia(self):
        k = self.kernels[2]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     num_inducing=self.num_inducing, kernel=k)
+        m = PsiStatModel(
+            "psi2",
+            X=self.X,
+            X_variance=self.X_var,
+            Z=self.Z,
+            num_inducing=self.num_inducing,
+            kernel=k,
+        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
+            "+".join(map(lambda x: x.name, k._parameters_))
+        )


 if __name__ == "__main__":
    import sys
-    interactive = 'i' in sys.argv
+
+    interactive = "i" in sys.argv
    if interactive:
-#         N, num_inducing, input_dim, input_dim = 30, 5, 4, 30
-#         X = numpy.random.rand(N, input_dim)
-#         k = GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
-#         K = k.K(X)
-#         Y = numpy.random.multivariate_normal(numpy.zeros(N), K, input_dim).T
-#         Y -= Y.mean(axis=0)
-#         k = GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
-#         m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
-#         m.randomize()
-# #         self.assertTrue(m.checkgrad())
+        #         N, num_inducing, input_dim, input_dim = 30, 5, 4, 30
+        #         X = numpy.random.rand(N, input_dim)
+        #         k = GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
+        #         K = k.K(X)
+        #         Y = numpy.random.multivariate_normal(numpy.zeros(N), K, input_dim).T
+        #         Y -= Y.mean(axis=0)
+        #         k = GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
+        #         m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
+        #         m.randomize()
+        # #         assert m.checkgrad()
        numpy.random.seed(0)
        input_dim = 3
        N = 3
        num_inducing = 2
        D = 15
        X = numpy.random.randn(N, input_dim)
-        X_var = .5 * numpy.ones_like(X) + .1 * numpy.clip(numpy.random.randn(*X.shape), 0, 1)
+        X_var = 0.5 * numpy.ones_like(X) + 0.1 * numpy.clip(
+            numpy.random.randn(*X.shape), 0, 1
+        )
        Z = numpy.random.permutation(X)[:num_inducing]
        Y = X.dot(numpy.random.randn(input_dim, D))
-#         kernel = GPy.kern.Bias(input_dim)
-#
-#         kernels = [GPy.kern.Linear(input_dim), GPy.kern.RBF(input_dim), GPy.kern.Bias(input_dim),
-#                GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim),
-#                GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim)]
+        #         kernel = GPy.kern.Bias(input_dim)
+        #
+        #         kernels = [GPy.kern.Linear(input_dim), GPy.kern.RBF(input_dim), GPy.kern.Bias(input_dim),
+        #                GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim),
+        #                GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim)]

-#         for k in kernels:
-#             m = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
-#                      num_inducing=num_inducing, kernel=k)
-#             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
-#
-        m0 = PsiStatModel('psi0', X=X, X_variance=X_var, Z=Z,
-                         num_inducing=num_inducing, kernel=GPy.kern.RBF(input_dim)+GPy.kern.Bias(input_dim))
-#         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=kernel)
-#         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=kernel)
-#         m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=GPy.kern.RBF(input_dim))
-#         m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=GPy.kern.Linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
+        #         for k in kernels:
+        #             m = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
+        #                      num_inducing=num_inducing, kernel=k)
+        #             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
+        #
+        m0 = PsiStatModel(
+            "psi0",
+            X=X,
+            X_variance=X_var,
+            Z=Z,
+            num_inducing=num_inducing,
+            kernel=GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim),
+        )
+        #         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
+        #                          num_inducing=num_inducing, kernel=kernel)
+        #         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
+        #                          num_inducing=num_inducing, kernel=kernel)
+        #         m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+        #                          num_inducing=num_inducing, kernel=GPy.kern.RBF(input_dim))
+        #         m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+        #                          num_inducing=num_inducing, kernel=GPy.kern.Linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
        # + GPy.kern.Bias(input_dim))
-#         m = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing,
-#                          kernel=(
-#             GPy.kern.RBF(input_dim, ARD=1)
-#             +GPy.kern.Linear(input_dim, ARD=1)
-#             +GPy.kern.Bias(input_dim))
-#                          )
-#         m.ensure_default_constraints()
-        m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-                         num_inducing=num_inducing, kernel=(
-            GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1)
-            #+GPy.kern.Linear(input_dim, numpy.random.rand(input_dim), ARD=1)
-            #+GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1)
-            #+GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(), ARD=0)
-            +GPy.kern.Bias(input_dim)
-            +GPy.kern.White(input_dim)
-            )
-            )
-        #m2.ensure_default_constraints()
+        #         m = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+        #                          num_inducing=num_inducing,
+        #                          kernel=(
+        #             GPy.kern.RBF(input_dim, ARD=1)
+        #             +GPy.kern.Linear(input_dim, ARD=1)
+        #             +GPy.kern.Bias(input_dim))
+        #                          )
+        #         m.ensure_default_constraints()
+        m2 = PsiStatModel(
+            "psi2",
+            X=X,
+            X_variance=X_var,
+            Z=Z,
+            num_inducing=num_inducing,
+            kernel=(
+                GPy.kern.RBF(
+                    input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1
+                )
+                # +GPy.kern.Linear(input_dim, numpy.random.rand(input_dim), ARD=1)
+                # +GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1)
+                # +GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(), ARD=0)
+                + GPy.kern.Bias(input_dim)
+                + GPy.kern.White(input_dim)
+            ),
+        )
+        # m2.ensure_default_constraints()
    else:
        unittest.main()
--- a/GPy/old_tests/sparse_gplvm_tests.py
+++ b/GPy/old_tests/sparse_gplvm_tests.py
@ -16,7 +16,7 @@ class sparse_GPLVMTests(unittest.TestCase):
        k = GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_linear_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
@ -27,7 +27,7 @@ class sparse_GPLVMTests(unittest.TestCase):
        k = GPy.kern.Linear(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_rbf_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
@ -38,7 +38,7 @@ class sparse_GPLVMTests(unittest.TestCase):
        k = GPy.kern.RBF(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

 if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
--- a/GPy/plotting/gpy_plot/plot_util.py
+++ b/GPy/plotting/gpy_plot/plot_util.py
@ -72,7 +72,7 @@ def helper_predict_with_model(self, Xgrid, plot_raw, apply_link, percentiles, wh
    if 'Y_metadata' not in predict_kw:
        predict_kw['Y_metadata'] = {}
    if 'output_index' not in predict_kw['Y_metadata']:
-        predict_kw['Y_metadata']['output_index'] = Xgrid[:,-1:].astype(np.int)
+        predict_kw['Y_metadata']['output_index'] = Xgrid[:,-1:].astype(np.int64)

    mu, _ = self.predict(Xgrid, **predict_kw)

--- a/GPy/plotting/matplot_dep/base_plots.py
+++ b/GPy/plotting/matplot_dep/base_plots.py
@ -5,6 +5,7 @@ import numpy as np

 from .util import align_subplot_array, align_subplots

+
 def ax_default(fignum, ax):
    if ax is None:
        fig = plt.figure(fignum)
@ -13,11 +14,23 @@ def ax_default(fignum, ax):
        fig = ax.figure
    return fig, ax

-def meanplot(x, mu, color='#3300FF', ax=None, fignum=None, linewidth=2,**kw):
-    _, axes = ax_default(fignum, ax)
-    return axes.plot(x,mu,color=color,linewidth=linewidth,**kw)

-def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, fignum=None, **kwargs):
+def meanplot(x, mu, color="#3300FF", ax=None, fignum=None, linewidth=2, **kw):
+    _, axes = ax_default(fignum, ax)
+    return axes.plot(x, mu, color=color, linewidth=linewidth, **kw)
+
+
+def gpplot(
+    x,
+    mu,
+    lower,
+    upper,
+    edgecol="#3300FF",
+    fillcol="#33CCFF",
+    ax=None,
+    fignum=None,
+    **kwargs
+):
    _, axes = ax_default(fignum, ax)

    mu = mu.flatten()
@ -27,51 +40,62 @@ def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, f

    plots = []

-    #here's the mean
+    # here's the mean
    plots.append(meanplot(x, mu, edgecol, axes))

-    #here's the box
-    kwargs['linewidth']=0.5
-    if not 'alpha' in kwargs.keys():
-        kwargs['alpha'] = 0.3
-    plots.append(axes.fill(np.hstack((x,x[::-1])),np.hstack((upper,lower[::-1])),color=fillcol,**kwargs))
+    # here's the box
+    kwargs["linewidth"] = 0.5
+    if not "alpha" in kwargs.keys():
+        kwargs["alpha"] = 0.3
+    plots.append(
+        axes.fill(
+            np.hstack((x, x[::-1])),
+            np.hstack((upper, lower[::-1])),
+            color=fillcol,
+            **kwargs
+        )
+    )

-    #this is the edge:
-    plots.append(meanplot(x, upper,color=edgecol, linewidth=0.2, ax=axes))
-    plots.append(meanplot(x, lower,color=edgecol, linewidth=0.2, ax=axes))
+    # this is the edge:
+    plots.append(meanplot(x, upper, color=edgecol, linewidth=0.2, ax=axes))
+    plots.append(meanplot(x, lower, color=edgecol, linewidth=0.2, ax=axes))

    return plots

+
 def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
    _, ax = ax_default(fignum, ax)

    plots = []

-    #here's the box
-    if 'linewidth' not in kwargs:
-        kwargs['linewidth'] = 0.5
-    if not 'alpha' in kwargs.keys():
-        kwargs['alpha'] = 1./(len(percentiles))
+    # here's the box
+    if "linewidth" not in kwargs:
+        kwargs["linewidth"] = 0.5
+    if not "alpha" in kwargs.keys():
+        kwargs["alpha"] = 1.0 / (len(percentiles))

    # pop where from kwargs
-    where = kwargs.pop('where') if 'where' in kwargs else None
+    where = kwargs.pop("where") if "where" in kwargs else None
    # pop interpolate, which we actually do not do here!
-    if 'interpolate' in kwargs: kwargs.pop('interpolate')
+    if "interpolate" in kwargs:
+        kwargs.pop("interpolate")

    def pairwise(inlist):
        l = len(inlist)
-        for i in range(int(np.ceil(l/2.))):
-            yield inlist[:][i], inlist[:][(l-1)-i]
+        for i in range(int(np.ceil(l / 2.0))):
+            yield inlist[:][i], inlist[:][(l - 1) - i]

    polycol = []
    for y1, y2 in pairwise(percentiles):
        import matplotlib.mlab as mlab
+
        # Handle united data, such as dates
        ax._process_unit_info(xdata=x, ydata=y1)
        ax._process_unit_info(ydata=y2)

        # Convert the arrays so we can work with them
        from numpy import ma
+
        x = ma.masked_invalid(ax.convert_xunits(x))
        y1 = ma.masked_invalid(ax.convert_yunits(y1))
        y2 = ma.masked_invalid(ax.convert_yunits(y2))
@ -103,7 +127,7 @@ def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
                continue

            N = len(xslice)
-            X = np.zeros((2 * N + 2, 2), np.float)
+            X = np.zeros((2 * N + 2, 2), float)

            # the purpose of the next two lines is for when y2 is a
            # scalar like 0 and we want the fill to go all the way
@ -114,19 +138,21 @@ def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
            X[0] = start
            X[N + 1] = end

-            X[1:N + 1, 0] = xslice
-            X[1:N + 1, 1] = y1slice
-            X[N + 2:, 0] = xslice[::-1]
-            X[N + 2:, 1] = y2slice[::-1]
+            X[1 : N + 1, 0] = xslice
+            X[1 : N + 1, 1] = y1slice
+            X[N + 2 :, 0] = xslice[::-1]
+            X[N + 2 :, 1] = y2slice[::-1]

            polys.append(X)
        polycol.extend(polys)
    from matplotlib.collections import PolyCollection
+
    plots.append(PolyCollection(polycol, **kwargs))
    ax.add_collection(plots[-1], autolim=True)
    ax.autoscale_view()
    return plots

+
 def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
    _, axes = ax_default(fignum, ax)

@ -138,17 +164,19 @@ def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
    plots = []

    if edgecol is None:
-        edgecol='#3300FF'
+        edgecol = "#3300FF"

-    if not 'alpha' in kwargs.keys():
-        kwargs['alpha'] = 1.
+    if not "alpha" in kwargs.keys():
+        kwargs["alpha"] = 1.0

+    if not "lw" in kwargs.keys():
+        kwargs["lw"] = 1.0

-    if not 'lw' in kwargs.keys():
-        kwargs['lw'] = 1.
-
-
-    plots.append(axes.errorbar(x,mu,yerr=np.vstack([mu-lower,upper-mu]),color=edgecol,**kwargs))
+    plots.append(
+        axes.errorbar(
+            x, mu, yerr=np.vstack([mu - lower, upper - mu]), color=edgecol, **kwargs
+        )
+    )
    plots[-1][0].remove()
    return plots

@ -156,53 +184,60 @@ def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
 def removeRightTicks(ax=None):
    ax = ax or plt.gca()
    for i, line in enumerate(ax.get_yticklines()):
-        if i%2 == 1:   # odd indices
+        if i % 2 == 1:  # odd indices
            line.set_visible(False)

+
 def removeUpperTicks(ax=None):
    ax = ax or plt.gca()
    for i, line in enumerate(ax.get_xticklines()):
-        if i%2 == 1:   # odd indices
+        if i % 2 == 1:  # odd indices
            line.set_visible(False)

-def fewerXticks(ax=None,divideby=2):
+
+def fewerXticks(ax=None, divideby=2):
    ax = ax or plt.gca()
    ax.set_xticks(ax.get_xticks()[::divideby])

-def x_frame1D(X,plot_limits=None,resolution=None):
+
+def x_frame1D(X, plot_limits=None, resolution=None):
    """
    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
    """
-    assert X.shape[1] ==1, "x_frame1D is defined for one-dimensional inputs"
+    assert X.shape[1] == 1, "x_frame1D is defined for one-dimensional inputs"
    if plot_limits is None:
        from ...core.parameterization.variational import VariationalPosterior
+
        if isinstance(X, VariationalPosterior):
-            xmin,xmax = X.mean.min(0),X.mean.max(0)
+            xmin, xmax = X.mean.min(0), X.mean.max(0)
        else:
-            xmin,xmax = X.min(0),X.max(0)
-        xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
-    elif len(plot_limits)==2:
+            xmin, xmax = X.min(0), X.max(0)
+        xmin, xmax = xmin - 0.2 * (xmax - xmin), xmax + 0.2 * (xmax - xmin)
+    elif len(plot_limits) == 2:
        xmin, xmax = plot_limits
    else:
        raise ValueError("Bad limits for plotting")

-    Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
+    Xnew = np.linspace(xmin, xmax, resolution or 200)[:, None]
    return Xnew, xmin, xmax

-def x_frame2D(X,plot_limits=None,resolution=None):
+
+def x_frame2D(X, plot_limits=None, resolution=None):
    """
    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
    """
-    assert X.shape[1] ==2, "x_frame2D is defined for two-dimensional inputs"
+    assert X.shape[1] == 2, "x_frame2D is defined for two-dimensional inputs"
    if plot_limits is None:
-        xmin,xmax = X.min(0),X.max(0)
-        xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
-    elif len(plot_limits)==2:
+        xmin, xmax = X.min(0), X.max(0)
+        xmin, xmax = xmin - 0.2 * (xmax - xmin), xmax + 0.2 * (xmax - xmin)
+    elif len(plot_limits) == 2:
        xmin, xmax = plot_limits
    else:
        raise ValueError("Bad limits for plotting")

    resolution = resolution or 50
-    xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
-    Xnew = np.vstack((xx.flatten(),yy.flatten())).T
+    xx, yy = np.mgrid[
+        xmin[0] : xmax[0] : 1j * resolution, xmin[1] : xmax[1] : 1j * resolution
+    ]
+    Xnew = np.vstack((xx.flatten(), yy.flatten())).T
    return Xnew, xx, yy, xmin, xmax
--- a/GPy/plotting/matplot_dep/defaults.py
+++ b/GPy/plotting/matplot_dep/defaults.py
@ -1,4 +1,4 @@
-#===============================================================================
+# ===============================================================================
 # Copyright (c) 2015, Max Zwiessele
 # All rights reserved.
 #
@ -26,12 +26,12 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
+# ===============================================================================

-from matplotlib import cm
+from matplotlib import pyplot
 from .. import Tango

-'''
+"""
 This file is for defaults for the gpy plot, specific to the plotting library.

 Create a kwargs dictionary with the right name for the plotting function
@ -40,36 +40,55 @@ the plotting library will be used.

 In the code, always ise plotting.gpy_plots.defaults to get the defaults, as
 it gives back an empty default, when defaults are not defined.
-'''
+"""

 # Data plots:
-data_1d = dict(lw=1.5, marker='x', color='k')
-data_2d = dict(s=35, edgecolors='none', linewidth=0., cmap=cm.get_cmap('hot'), alpha=.5)
-inducing_1d = dict(lw=0, s=500, color=Tango.colorsHex['darkRed'])
-inducing_2d = dict(s=17, edgecolor='k', linewidth=.4, color='white', alpha=.5, marker='^')
-inducing_3d = dict(lw=.3, s=500, color=Tango.colorsHex['darkRed'], edgecolor='k')
-xerrorbar = dict(color='k', fmt='none', elinewidth=.5, alpha=.5)
-yerrorbar = dict(color=Tango.colorsHex['darkRed'], fmt='none', elinewidth=.5, alpha=.5)
+data_1d = dict(lw=1.5, marker="x", color="k")
+data_2d = dict(
+    s=35, edgecolors="none", linewidth=0.0, cmap=pyplot.get_cmap("hot"), alpha=0.5
+)
+inducing_1d = dict(lw=0, s=500, color=Tango.colorsHex["darkRed"])
+inducing_2d = dict(
+    s=17, edgecolor="k", linewidth=0.4, color="white", alpha=0.5, marker="^"
+)
+inducing_3d = dict(lw=0.3, s=500, color=Tango.colorsHex["darkRed"], edgecolor="k")
+xerrorbar = dict(color="k", fmt="none", elinewidth=0.5, alpha=0.5)
+yerrorbar = dict(
+    color=Tango.colorsHex["darkRed"], fmt="none", elinewidth=0.5, alpha=0.5
+)

 # GP plots:
-meanplot_1d = dict(color=Tango.colorsHex['mediumBlue'], linewidth=2)
-meanplot_2d = dict(cmap='hot', linewidth=.5)
-meanplot_3d = dict(linewidth=0, antialiased=True, cstride=1, rstride=1, cmap='hot', alpha=.3)
-samples_1d = dict(color=Tango.colorsHex['mediumBlue'], linewidth=.3)
-samples_3d = dict(cmap='hot', alpha=.1, antialiased=True, cstride=1, rstride=1, linewidth=0)
-confidence_interval = dict(edgecolor=Tango.colorsHex['darkBlue'], linewidth=.5, color=Tango.colorsHex['lightBlue'],alpha=.2)
-density = dict(alpha=.5, color=Tango.colorsHex['lightBlue'])
+meanplot_1d = dict(color=Tango.colorsHex["mediumBlue"], linewidth=2)
+meanplot_2d = dict(cmap="hot", linewidth=0.5)
+meanplot_3d = dict(
+    linewidth=0, antialiased=True, cstride=1, rstride=1, cmap="hot", alpha=0.3
+)
+samples_1d = dict(color=Tango.colorsHex["mediumBlue"], linewidth=0.3)
+samples_3d = dict(
+    cmap="hot", alpha=0.1, antialiased=True, cstride=1, rstride=1, linewidth=0
+)
+confidence_interval = dict(
+    edgecolor=Tango.colorsHex["darkBlue"],
+    linewidth=0.5,
+    color=Tango.colorsHex["lightBlue"],
+    alpha=0.2,
+)
+density = dict(alpha=0.5, color=Tango.colorsHex["lightBlue"])

 # GPLVM plots:
-data_y_1d = dict(linewidth=0, cmap='RdBu', s=40)
-data_y_1d_plot = dict(color='k', linewidth=1.5)
+data_y_1d = dict(linewidth=0, cmap="RdBu", s=40)
+data_y_1d_plot = dict(color="k", linewidth=1.5)

 # Kernel plots:
-ard = dict(edgecolor='k', linewidth=1.2)
+ard = dict(edgecolor="k", linewidth=1.2)

 # Input plots:
-latent = dict(aspect='auto', cmap='Greys', interpolation='bicubic')
-gradient = dict(aspect='auto', cmap='RdBu', interpolation='nearest', alpha=.7)
-magnification = dict(aspect='auto', cmap='Greys', interpolation='bicubic')
-latent_scatter = dict(s=20, linewidth=.2, edgecolor='k', alpha=.9)
-annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7)
+latent = dict(aspect="auto", cmap="Greys", interpolation="bicubic")
+gradient = dict(aspect="auto", cmap="RdBu", interpolation="nearest", alpha=0.7)
+magnification = dict(aspect="auto", cmap="Greys", interpolation="bicubic")
+latent_scatter = dict(s=20, linewidth=0.2, edgecolor="k", alpha=0.9)
+annotation = dict(
+    fontdict=dict(family="sans-serif", weight="light", fontsize=9),
+    zorder=0.3,
+    alpha=0.7,
+)
--- a/GPy/plotting/matplot_dep/plot_definitions.py
+++ b/GPy/plotting/matplot_dep/plot_definitions.py
@ -1,4 +1,4 @@
-#===============================================================================
+# ===============================================================================
 # Copyright (c) 2015, Max Zwiessele
 # All rights reserved.
 #
@ -26,7 +26,7 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
+# ===============================================================================
 import numpy as np
 from matplotlib import pyplot as plt
 from ..abstract_plotting_library import AbstractPlottingLibrary
@ -37,6 +37,7 @@ from .controllers import ImshowController, ImAnnotateController
 import itertools
 from .util import legend_ontop

+
 class MatplotlibPlots(AbstractPlottingLibrary):
    def __init__(self):
        super(MatplotlibPlots, self).__init__()
@ -49,54 +50,86 @@ class MatplotlibPlots(AbstractPlottingLibrary):
        fig.gridspec = plt.GridSpec(rows, cols, **gridspec_kwargs)
        return fig

-    def new_canvas(self, figure=None, row=1, col=1, projection='2d', xlabel=None, ylabel=None, zlabel=None, title=None, xlim=None, ylim=None, zlim=None, **kwargs):
-        if projection == '3d':
+    def new_canvas(
+        self,
+        figure=None,
+        row=1,
+        col=1,
+        projection="2d",
+        xlabel=None,
+        ylabel=None,
+        zlabel=None,
+        title=None,
+        xlim=None,
+        ylim=None,
+        zlim=None,
+        **kwargs
+    ):
+        if projection == "3d":
            from mpl_toolkits.mplot3d import Axes3D
-        elif projection == '2d':
+        elif projection == "2d":
            projection = None
-        if 'ax' in kwargs:
-            ax = kwargs.pop('ax')
+        if "ax" in kwargs:
+            ax = kwargs.pop("ax")
        else:
            if figure is not None:
                fig = figure
-            elif 'num' in kwargs and 'figsize' in kwargs:
-                fig = self.figure(num=kwargs.pop('num'), figsize=kwargs.pop('figsize'))
-            elif 'num' in kwargs:
-                fig = self.figure(num=kwargs.pop('num'))
-            elif 'figsize' in kwargs:
-                fig = self.figure(figsize=kwargs.pop('figsize'))
+            elif "num" in kwargs and "figsize" in kwargs:
+                fig = self.figure(num=kwargs.pop("num"), figsize=kwargs.pop("figsize"))
+            elif "num" in kwargs:
+                fig = self.figure(num=kwargs.pop("num"))
+            elif "figsize" in kwargs:
+                fig = self.figure(figsize=kwargs.pop("figsize"))
            else:
                fig = self.figure()

-            #if hasattr(fig, 'rows') and hasattr(fig, 'cols'):
-            ax = fig.add_subplot(fig.gridspec[row-1, col-1], projection=projection)
+            # if hasattr(fig, 'rows') and hasattr(fig, 'cols'):
+            ax = fig.add_subplot(fig.gridspec[row - 1, col - 1], projection=projection)

-        if xlim is not None: ax.set_xlim(xlim)
-        if ylim is not None: ax.set_ylim(ylim)
-        if xlabel is not None: ax.set_xlabel(xlabel)
-        if ylabel is not None: ax.set_ylabel(ylabel)
-        if title is not None: ax.set_title(title)
-        if projection == '3d':
-            if zlim is not None: ax.set_zlim(zlim)
-            if zlabel is not None: ax.set_zlabel(zlabel)
+        if xlim is not None:
+            ax.set_xlim(xlim)
+        if ylim is not None:
+            ax.set_ylim(ylim)
+        if xlabel is not None:
+            ax.set_xlabel(xlabel)
+        if ylabel is not None:
+            ax.set_ylabel(ylabel)
+        if title is not None:
+            ax.set_title(title)
+        if projection == "3d":
+            if zlim is not None:
+                ax.set_zlim(zlim)
+            if zlabel is not None:
+                ax.set_zlabel(zlabel)
        return ax, kwargs

    def add_to_canvas(self, ax, plots, legend=False, title=None, **kwargs):
-        #ax.autoscale_view()
-        fontdict=dict(family='sans-serif', weight='light', size=9)
+        # ax.autoscale_view()
+        fontdict = dict(family="sans-serif", weight="light", size=9)
        if legend is True:
            ax.legend(*ax.get_legend_handles_labels())
        elif legend >= 1:
-            #ax.legend(prop=fontdict)
+            # ax.legend(prop=fontdict)
            legend_ontop(ax, ncol=legend, fontdict=fontdict)
-        if title is not None: ax.figure.suptitle(title)
+        if title is not None:
+            ax.figure.suptitle(title)
        return plots

    def show_canvas(self, ax, **kwargs):
        ax.figure.canvas.draw()
        return ax.figure

-    def scatter(self, ax, X, Y, Z=None, color=Tango.colorsHex['mediumBlue'], label=None, marker='o', **kwargs):
+    def scatter(
+        self,
+        ax,
+        X,
+        Y,
+        Z=None,
+        color=Tango.colorsHex["mediumBlue"],
+        label=None,
+        marker="o",
+        **kwargs
+    ):
        if Z is not None:
            return ax.scatter(X, Y, c=color, zs=Z, label=label, marker=marker, **kwargs)
        return ax.scatter(X, Y, c=color, label=label, marker=marker, **kwargs)
@ -106,129 +139,258 @@ class MatplotlibPlots(AbstractPlottingLibrary):
            return ax.plot(X, Y, color=color, zs=Z, label=label, **kwargs)
        return ax.plot(X, Y, color=color, label=label, **kwargs)

-    def plot_axis_lines(self, ax, X, color=Tango.colorsHex['darkRed'], label=None, **kwargs):
+    def plot_axis_lines(
+        self, ax, X, color=Tango.colorsHex["darkRed"], label=None, **kwargs
+    ):
        from matplotlib import transforms
        from matplotlib.path import Path
-        if 'marker' not in kwargs:
-            kwargs['marker'] = Path([[-.2,0.],    [-.2,.5],    [0.,1.],    [.2,.5],     [.2,0.],     [-.2,0.]],
-                                    [Path.MOVETO, Path.LINETO, Path.LINETO, Path.LINETO, Path.LINETO, Path.CLOSEPOLY])
-        if 'transform' not in kwargs:
+
+        if "marker" not in kwargs:
+            kwargs["marker"] = Path(
+                [
+                    [-0.2, 0.0],
+                    [-0.2, 0.5],
+                    [0.0, 1.0],
+                    [0.2, 0.5],
+                    [0.2, 0.0],
+                    [-0.2, 0.0],
+                ],
+                [
+                    Path.MOVETO,
+                    Path.LINETO,
+                    Path.LINETO,
+                    Path.LINETO,
+                    Path.LINETO,
+                    Path.CLOSEPOLY,
+                ],
+            )
+        if "transform" not in kwargs:
            if X.shape[1] == 1:
-                kwargs['transform'] = transforms.blended_transform_factory(ax.transData, ax.transAxes)
+                kwargs["transform"] = transforms.blended_transform_factory(
+                    ax.transData, ax.transAxes
+                )
        if X.shape[1] == 2:
-            return ax.scatter(X[:,0], X[:,1], ax.get_zlim()[0], c=color, label=label, **kwargs)
+            return ax.scatter(
+                X[:, 0], X[:, 1], ax.get_zlim()[0], c=color, label=label, **kwargs
+            )
        return ax.scatter(X, np.zeros_like(X), c=color, label=label, **kwargs)

-    def barplot(self, ax, x, height, width=0.8, bottom=0, color=Tango.colorsHex['mediumBlue'], label=None, **kwargs):
-        if 'align' not in kwargs:
-            kwargs['align'] = 'center'
-        return ax.bar(x=x, height=height, width=width,
-               bottom=bottom, label=label, color=color,
-               **kwargs)
+    def barplot(
+        self,
+        ax,
+        x,
+        height,
+        width=0.8,
+        bottom=0,
+        color=Tango.colorsHex["mediumBlue"],
+        label=None,
+        **kwargs
+    ):
+        if "align" not in kwargs:
+            kwargs["align"] = "center"
+        return ax.bar(
+            x=x,
+            height=height,
+            width=width,
+            bottom=bottom,
+            label=label,
+            color=color,
+            **kwargs
+        )

-    def xerrorbar(self, ax, X, Y, error, color=Tango.colorsHex['darkRed'], label=None, **kwargs):
-        if not('linestyle' in kwargs or 'ls' in kwargs):
-            kwargs['ls'] = 'none'
-        #if Z is not None:
+    def xerrorbar(
+        self, ax, X, Y, error, color=Tango.colorsHex["darkRed"], label=None, **kwargs
+    ):
+        if not ("linestyle" in kwargs or "ls" in kwargs):
+            kwargs["ls"] = "none"
+        # if Z is not None:
        #    return ax.errorbar(X, Y, Z, xerr=error, ecolor=color, label=label, **kwargs)
        return ax.errorbar(X, Y, xerr=error, ecolor=color, label=label, **kwargs)

-    def yerrorbar(self, ax, X, Y, error, color=Tango.colorsHex['darkRed'], label=None, **kwargs):
-        if not('linestyle' in kwargs or 'ls' in kwargs):
-            kwargs['ls'] = 'none'
-        #if Z is not None:
+    def yerrorbar(
+        self, ax, X, Y, error, color=Tango.colorsHex["darkRed"], label=None, **kwargs
+    ):
+        if not ("linestyle" in kwargs or "ls" in kwargs):
+            kwargs["ls"] = "none"
+        # if Z is not None:
        #    return ax.errorbar(X, Y, Z, yerr=error, ecolor=color, label=label, **kwargs)
        return ax.errorbar(X, Y, yerr=error, ecolor=color, label=label, **kwargs)

-    def imshow(self, ax, X, extent=None, label=None, vmin=None, vmax=None, **imshow_kwargs):
-        if 'origin' not in imshow_kwargs:
-            imshow_kwargs['origin'] = 'lower'
-        #xmin, xmax, ymin, ymax = extent
-        #xoffset, yoffset = (xmax - xmin) / (2. * X.shape[0]), (ymax - ymin) / (2. * X.shape[1])
-        #xmin, xmax, ymin, ymax = extent = xmin-xoffset, xmax+xoffset, ymin-yoffset, ymax+yoffset
-        return ax.imshow(X, label=label, extent=extent, vmin=vmin, vmax=vmax, **imshow_kwargs)
+    def imshow(
+        self, ax, X, extent=None, label=None, vmin=None, vmax=None, **imshow_kwargs
+    ):
+        if "origin" not in imshow_kwargs:
+            imshow_kwargs["origin"] = "lower"
+        # xmin, xmax, ymin, ymax = extent
+        # xoffset, yoffset = (xmax - xmin) / (2. * X.shape[0]), (ymax - ymin) / (2. * X.shape[1])
+        # xmin, xmax, ymin, ymax = extent = xmin-xoffset, xmax+xoffset, ymin-yoffset, ymax+yoffset
+        return ax.imshow(
+            X, label=label, extent=extent, vmin=vmin, vmax=vmax, **imshow_kwargs
+        )

-    def imshow_interact(self, ax, plot_function, extent, label=None, resolution=None, vmin=None, vmax=None, **imshow_kwargs):
-        if imshow_kwargs is None: imshow_kwargs = {}
-        if 'origin' not in imshow_kwargs:
-            imshow_kwargs['origin'] = 'lower'
-        return ImshowController(ax, plot_function, extent, resolution=resolution, vmin=vmin, vmax=vmax, **imshow_kwargs)
+    def imshow_interact(
+        self,
+        ax,
+        plot_function,
+        extent,
+        label=None,
+        resolution=None,
+        vmin=None,
+        vmax=None,
+        **imshow_kwargs
+    ):
+        if imshow_kwargs is None:
+            imshow_kwargs = {}
+        if "origin" not in imshow_kwargs:
+            imshow_kwargs["origin"] = "lower"
+        return ImshowController(
+            ax,
+            plot_function,
+            extent,
+            resolution=resolution,
+            vmin=vmin,
+            vmax=vmax,
+            **imshow_kwargs
+        )

-    def annotation_heatmap(self, ax, X, annotation, extent=None, label=None, imshow_kwargs=None, **annotation_kwargs):
-        if imshow_kwargs is None: imshow_kwargs = {}
-        if 'origin' not in imshow_kwargs:
-            imshow_kwargs['origin'] = 'lower'
-        if ('ha' not in annotation_kwargs) and ('horizontalalignment' not in annotation_kwargs):
-            annotation_kwargs['ha'] = 'center'
-        if ('va' not in annotation_kwargs) and ('verticalalignment' not in annotation_kwargs):
-            annotation_kwargs['va'] = 'center'
+    def annotation_heatmap(
+        self,
+        ax,
+        X,
+        annotation,
+        extent=None,
+        label=None,
+        imshow_kwargs=None,
+        **annotation_kwargs
+    ):
+        if imshow_kwargs is None:
+            imshow_kwargs = {}
+        if "origin" not in imshow_kwargs:
+            imshow_kwargs["origin"] = "lower"
+        if ("ha" not in annotation_kwargs) and (
+            "horizontalalignment" not in annotation_kwargs
+        ):
+            annotation_kwargs["ha"] = "center"
+        if ("va" not in annotation_kwargs) and (
+            "verticalalignment" not in annotation_kwargs
+        ):
+            annotation_kwargs["va"] = "center"
        imshow = self.imshow(ax, X, extent, label, **imshow_kwargs)
        if extent is None:
            extent = (0, X.shape[0], 0, X.shape[1])
        xmin, xmax, ymin, ymax = extent
-        xoffset, yoffset = (xmax - xmin) / (2. * X.shape[0]), (ymax - ymin) / (2. * X.shape[1])
+        xoffset, yoffset = (xmax - xmin) / (2.0 * X.shape[0]), (ymax - ymin) / (
+            2.0 * X.shape[1]
+        )
        xlin = np.linspace(xmin, xmax, X.shape[0], endpoint=False)
        ylin = np.linspace(ymin, ymax, X.shape[1], endpoint=False)
        annotations = []
        for [i, x], [j, y] in itertools.product(enumerate(xlin), enumerate(ylin)):
-            annotations.append(ax.text(x+xoffset, y+yoffset, "{}".format(annotation[j, i]), **annotation_kwargs))
+            annotations.append(
+                ax.text(
+                    x + xoffset,
+                    y + yoffset,
+                    "{}".format(annotation[j, i]),
+                    **annotation_kwargs
+                )
+            )
        return imshow, annotations

-    def annotation_heatmap_interact(self, ax, plot_function, extent, label=None, resolution=15, imshow_kwargs=None, **annotation_kwargs):
-        if imshow_kwargs is None: imshow_kwargs = {}
-        if 'origin' not in imshow_kwargs:
-            imshow_kwargs['origin'] = 'lower'
-        return ImAnnotateController(ax, plot_function, extent, resolution=resolution, imshow_kwargs=imshow_kwargs or {}, **annotation_kwargs)
+    def annotation_heatmap_interact(
+        self,
+        ax,
+        plot_function,
+        extent,
+        label=None,
+        resolution=15,
+        imshow_kwargs=None,
+        **annotation_kwargs
+    ):
+        if imshow_kwargs is None:
+            imshow_kwargs = {}
+        if "origin" not in imshow_kwargs:
+            imshow_kwargs["origin"] = "lower"
+        return ImAnnotateController(
+            ax,
+            plot_function,
+            extent,
+            resolution=resolution,
+            imshow_kwargs=imshow_kwargs or {},
+            **annotation_kwargs
+        )

    def contour(self, ax, X, Y, C, levels=20, label=None, **kwargs):
-        return ax.contour(X, Y, C, levels=np.linspace(C.min(), C.max(), levels), label=label, **kwargs)
+        return ax.contour(
+            X, Y, C, levels=np.linspace(C.min(), C.max(), levels), label=label, **kwargs
+        )

    def surface(self, ax, X, Y, Z, color=None, label=None, **kwargs):
        return ax.plot_surface(X, Y, Z, label=label, **kwargs)

-    def fill_between(self, ax, X, lower, upper, color=Tango.colorsHex['mediumBlue'], label=None, **kwargs):
+    def fill_between(
+        self,
+        ax,
+        X,
+        lower,
+        upper,
+        color=Tango.colorsHex["mediumBlue"],
+        label=None,
+        **kwargs
+    ):
        return ax.fill_between(X, lower, upper, facecolor=color, label=label, **kwargs)

-    def fill_gradient(self, canvas, X, percentiles, color=Tango.colorsHex['mediumBlue'], label=None, **kwargs):
+    def fill_gradient(
+        self,
+        canvas,
+        X,
+        percentiles,
+        color=Tango.colorsHex["mediumBlue"],
+        label=None,
+        **kwargs
+    ):
        ax = canvas
        plots = []

-        if 'edgecolors' not in kwargs:
-            kwargs['edgecolors'] = 'none'
+        if "edgecolors" not in kwargs:
+            kwargs["edgecolors"] = "none"

-        if 'facecolors' in kwargs:
-            color = kwargs.pop('facecolors')
+        if "facecolors" in kwargs:
+            color = kwargs.pop("facecolors")

-        if 'array' in kwargs:
-            array = kwargs.pop('array')
+        if "array" in kwargs:
+            array = kwargs.pop("array")
        else:
-            array = 1.-np.abs(np.linspace(-.97, .97, len(percentiles)-1))
+            array = 1.0 - np.abs(np.linspace(-0.97, 0.97, len(percentiles) - 1))

-        if 'alpha' in kwargs:
-            alpha = kwargs.pop('alpha')
+        if "alpha" in kwargs:
+            alpha = kwargs.pop("alpha")
        else:
-            alpha = .8
+            alpha = 0.8

-        if 'cmap' in kwargs:
-            cmap = kwargs.pop('cmap')
+        if "cmap" in kwargs:
+            cmap = kwargs.pop("cmap")
        else:
-            cmap = LinearSegmentedColormap.from_list('WhToColor', (color, color), N=array.size)
+            cmap = LinearSegmentedColormap.from_list(
+                "WhToColor", (color, color), N=array.size
+            )
        cmap._init()
-        cmap._lut[:-3, -1] = alpha*array
+        cmap._lut[:-3, -1] = alpha * array

-        kwargs['facecolors'] = [cmap(i) for i in np.linspace(0,1,cmap.N)]
+        kwargs["facecolors"] = [cmap(i) for i in np.linspace(0, 1, cmap.N)]

        # pop where from kwargs
-        where = kwargs.pop('where') if 'where' in kwargs else None
+        where = kwargs.pop("where") if "where" in kwargs else None
        # pop interpolate, which we actually do not do here!
-        if 'interpolate' in kwargs: kwargs.pop('interpolate')
+        if "interpolate" in kwargs:
+            kwargs.pop("interpolate")

        def pairwise(iterable):
            "s -> (s0,s1), (s1,s2), (s2, s3), ..."
            from itertools import tee
-            #try:
+
+            # try:
            #    from itertools import izip as zip
-            #except ImportError:
+            # except ImportError:
            #    pass
            a, b = tee(iterable)
            next(b, None)
@ -245,6 +407,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
            ax._process_unit_info(ydata=y2)
            # Convert the arrays so we can work with them
            from numpy import ma
+
            x = ma.masked_invalid(ax.convert_xunits(X))
            y1 = ma.masked_invalid(ax.convert_yunits(y1))
            y2 = ma.masked_invalid(ax.convert_yunits(y2))
@ -263,6 +426,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                raise ValueError("Argument dimensions are incompatible")

            from functools import reduce
+
            mask = reduce(ma.mask_or, [ma.getmask(a) for a in (x, y1, y2)])
            if mask is not ma.nomask:
                where &= ~mask
@ -277,7 +441,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                    continue

                N = len(xslice)
-                p = np.zeros((2 * N + 2, 2), np.float)
+                p = np.zeros((2 * N + 2, 2), float)

                # the purpose of the next two lines is for when y2 is a
                # scalar like 0 and we want the fill to go all the way
@ -288,16 +452,17 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                p[0] = start
                p[N + 1] = end

-                p[1:N + 1, 0] = xslice
-                p[1:N + 1, 1] = y1slice
-                p[N + 2:, 0] = xslice[::-1]
-                p[N + 2:, 1] = y2slice[::-1]
+                p[1 : N + 1, 0] = xslice
+                p[1 : N + 1, 1] = y1slice
+                p[N + 2 :, 0] = xslice[::-1]
+                p[N + 2 :, 1] = y2slice[::-1]

                polys.append(p)
            polycol.extend(polys)
        from matplotlib.collections import PolyCollection
-        if 'zorder' not in kwargs:
-            kwargs['zorder'] = 0
+
+        if "zorder" not in kwargs:
+            kwargs["zorder"] = 0
        plots.append(PolyCollection(polycol, label=label, **kwargs))
        ax.add_collection(plots[-1], autolim=True)
        ax.autoscale_view()
--- a/GPy/plotting/matplot_dep/variational_plots.py
+++ b/GPy/plotting/matplot_dep/variational_plots.py
@ -1,4 +1,6 @@
-from matplotlib import pyplot as pb, numpy as np
+from matplotlib import pyplot as pb
+import numpy as np
+

 def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
    """
@ -17,6 +19,7 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
    if colors is None:
        from ..Tango import mediumList
        from itertools import cycle
+
        colors = cycle(mediumList)
        pb.clf()
    else:
@ -33,21 +36,30 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
            a = ax[i]
        else:
            raise ValueError("Need one ax per latent dimension input_dim")
-        bg_lines.append(a.plot(means, c='k', alpha=.3))
-        lines.extend(a.plot(x, means.T[i], c=next(colors), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
-        fills.append(a.fill_between(x,
-                        means.T[i] - 2 * np.sqrt(variances.T[i]),
-                        means.T[i] + 2 * np.sqrt(variances.T[i]),
-                        facecolor=lines[-1].get_color(),
-                        alpha=.3))
-        a.legend(borderaxespad=0.)
+        bg_lines.append(a.plot(means, c="k", alpha=0.3))
+        lines.extend(
+            a.plot(
+                x, means.T[i], c=next(colors), label=r"$\mathbf{{X_{{{}}}}}$".format(i)
+            )
+        )
+        fills.append(
+            a.fill_between(
+                x,
+                means.T[i] - 2 * np.sqrt(variances.T[i]),
+                means.T[i] + 2 * np.sqrt(variances.T[i]),
+                facecolor=lines[-1].get_color(),
+                alpha=0.3,
+            )
+        )
+        a.legend(borderaxespad=0.0)
        a.set_xlim(x.min(), x.max())
        if i < means.shape[1] - 1:
-            a.set_xticklabels('')
+            a.set_xticklabels("")
    pb.draw()
-    a.figure.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
+    a.figure.tight_layout(h_pad=0.01)  # , rect=(0, 0, 1, .95))
    return dict(lines=lines, fills=fills, bg_lines=bg_lines)

+
 def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_side=True):
    """
    Plot latent space X in 1D:
@ -62,45 +74,60 @@ def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_sid
    """
    if ax is None:
        if side_by_side:
-            fig = pb.figure(num=fignum, figsize=(16, min(12, (2 * parameterized.mean.shape[1]))))
+            fig = pb.figure(
+                num=fignum, figsize=(16, min(12, (2 * parameterized.mean.shape[1])))
+            )
        else:
-            fig = pb.figure(num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1]))))
+            fig = pb.figure(
+                num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1])))
+            )
    if colors is None:
        from ..Tango import mediumList
        from itertools import cycle
+
        colors = cycle(mediumList)
        pb.clf()
    else:
        colors = iter(colors)
    plots = []
-    means, variances, gamma = parameterized.mean, parameterized.variance, parameterized.binary_prob
+    means, variances, gamma = (
+        parameterized.mean,
+        parameterized.variance,
+        parameterized.binary_prob,
+    )
    x = np.arange(means.shape[0])
    for i in range(means.shape[1]):
        if side_by_side:
-            sub1 = (means.shape[1],2,2*i+1)
-            sub2 = (means.shape[1],2,2*i+2)
+            sub1 = (means.shape[1], 2, 2 * i + 1)
+            sub2 = (means.shape[1], 2, 2 * i + 2)
        else:
-            sub1 = (means.shape[1]*2,1,2*i+1)
-            sub2 = (means.shape[1]*2,1,2*i+2)
+            sub1 = (means.shape[1] * 2, 1, 2 * i + 1)
+            sub2 = (means.shape[1] * 2, 1, 2 * i + 2)

        # mean and variance plot
        a = fig.add_subplot(*sub1)
-        a.plot(means, c='k', alpha=.3)
-        plots.extend(a.plot(x, means.T[i], c=next(colors), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
-        a.fill_between(x,
-                        means.T[i] - 2 * np.sqrt(variances.T[i]),
-                        means.T[i] + 2 * np.sqrt(variances.T[i]),
-                        facecolor=plots[-1].get_color(),
-                        alpha=.3)
-        a.legend(borderaxespad=0.)
+        a.plot(means, c="k", alpha=0.3)
+        plots.extend(
+            a.plot(
+                x, means.T[i], c=next(colors), label=r"$\mathbf{{X_{{{}}}}}$".format(i)
+            )
+        )
+        a.fill_between(
+            x,
+            means.T[i] - 2 * np.sqrt(variances.T[i]),
+            means.T[i] + 2 * np.sqrt(variances.T[i]),
+            facecolor=plots[-1].get_color(),
+            alpha=0.3,
+        )
+        a.legend(borderaxespad=0.0)
        a.set_xlim(x.min(), x.max())
        if i < means.shape[1] - 1:
-            a.set_xticklabels('')
+            a.set_xticklabels("")
        # binary prob plot
        a = fig.add_subplot(*sub2)
-        a.bar(x,gamma[:,i],bottom=0.,linewidth=1.,width=1.0,align='center')
+        a.bar(x, gamma[:, i], bottom=0.0, linewidth=1.0, width=1.0, align="center")
        a.set_xlim(x.min(), x.max())
-        a.set_ylim([0.,1.])
+        a.set_ylim([0.0, 1.0])
    pb.draw()
-    fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
+    fig.tight_layout(h_pad=0.01)  # , rect=(0, 0, 1, .95))
    return fig
--- a/GPy/testing/init.py
+++ b/GPy/testing/init.py
@ -1,9 +0,0 @@
-# Copyright (c) 2014, Max Zwiessele, GPy Authors
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-import unittest
-import sys
-
-def deepTest(reason):
-    if reason:
-        return lambda x:x
-    return unittest.skip("Not deep scanning, enable deepscan by adding 'deep' argument to unittest call")
--- a/GPy/testing/cython_tests.py
+++ b/GPy/testing/cython_tests.py
@ -1,81 +0,0 @@
-import numpy as np
-import scipy as sp
-from GPy.util import choleskies
-import GPy
-import unittest
-
-from ..util.config import config
-
-try:
-    from ..util import choleskies_cython
-    choleskies_cython_working = config.getboolean('cython', 'working')
-except ImportError:
-    choleskies_cython_working = False
-
-try:
-    from ..kern.src import stationary_cython
-    stationary_cython_working = config.getboolean('cython', 'working')
-except ImportError:
-    stationary_cython_working = False
-
-"""
-These tests make sure that the pure python and cython codes work the same
-"""
-
-@unittest.skipIf(not choleskies_cython_working,"Cython cholesky module has not been built on this machine")
-class CythonTestChols(np.testing.TestCase):
-    def setUp(self):
-        self.flat = np.random.randn(45,5)
-        self.triang = np.array([np.eye(20) for i in range(3)])
-    def test_flat_to_triang(self):
-        L1 = choleskies._flat_to_triang_pure(self.flat)
-        L2 = choleskies._flat_to_triang_cython(self.flat)
-        np.testing.assert_allclose(L1, L2)
-    def test_triang_to_flat(self):
-        A1 = choleskies._triang_to_flat_pure(self.triang)
-        A2 = choleskies._triang_to_flat_cython(self.triang)
-        np.testing.assert_allclose(A1, A2)
-
-@unittest.skipIf(not stationary_cython_working,"Cython stationary module has not been built on this machine")
-class test_stationary(np.testing.TestCase):
-    def setUp(self):
-        self.k = GPy.kern.RBF(10)
-        self.X = np.random.randn(300,10)
-        self.Z = np.random.randn(20,10)
-        self.dKxx = np.random.randn(300,300)
-        self.dKzz = np.random.randn(20,20)
-        self.dKxz = np.random.randn(300,20)
-
-    def test_square_gradX(self):
-        g1 = self.k._gradients_X_cython(self.dKxx, self.X)
-        g2 = self.k._gradients_X_pure(self.dKxx, self.X)
-        np.testing.assert_allclose(g1, g2)
-
-    def test_rect_gradx(self):
-        g1 = self.k._gradients_X_cython(self.dKxz, self.X, self.Z)
-        g2 = self.k._gradients_X_pure(self.dKxz, self.X, self.Z)
-        np.testing.assert_allclose(g1, g2)
-
-    def test_square_lengthscales(self):
-        g1 = self.k._lengthscale_grads_pure(self.dKxx, self.X, self.X)
-        g2 = self.k._lengthscale_grads_cython(self.dKxx, self.X, self.X)
-        np.testing.assert_allclose(g1, g2)
-
-    def test_rect_lengthscales(self):
-        g1 = self.k._lengthscale_grads_pure(self.dKxz, self.X, self.Z)
-        g2 = self.k._lengthscale_grads_cython(self.dKxz, self.X, self.Z)
-        np.testing.assert_allclose(g1, g2)
-
-@unittest.skipIf(not choleskies_cython_working,"Cython cholesky module has not been built on this machine")
-class test_choleskies_backprop(np.testing.TestCase):
-    def setUp(self):
-        a =np.random.randn(10,12)
-        A = a.dot(a.T)
-        self.L = GPy.util.linalg.jitchol(A)
-        self.dL = np.random.randn(10,10)
-    def test(self):
-        r1 = choleskies._backprop_gradient_pure(self.dL, self.L)
-        r2 = choleskies_cython.backprop_gradient(self.dL, self.L)
-        r3 = choleskies_cython.backprop_gradient_par_c(self.dL, self.L)
-        np.testing.assert_allclose(r1, r2)
-        np.testing.assert_allclose(r1, r3)
--- a/GPy/testing/deactivated/deactivated_test_examples.py
+++ b/GPy/testing/deactivated/deactivated_test_examples.py
@ -1,61 +1,65 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import unittest
-import numpy as np
 import GPy
 import inspect
 import pkgutil
 import os
-import random
-from nose.tools import nottest
-import sys
-import itertools

-class ExamplesTests(unittest.TestCase):
-    def _checkgrad(self, Model):
-        self.assertTrue(Model.checkgrad())

-    def _model_instance(self, Model):
-        self.assertTrue(isinstance(Model, GPy.models))
+def check_grad(Model):
+    assert Model.checkgrad(), "Gradient check failed!"
+
+
+def check_model_instance(Model):
+    assert isinstance(Model, GPy.models), "Wrong type!"
+

 def model_checkgrads(model):
    model.randomize()
-    #NOTE: Step as 1e-4, this should be acceptable for more peaky models
+    # NOTE: Step as 1e-4, this should be acceptable for more peaky models
    return model.checkgrad(step=1e-4)

+
 def model_instance(model):
    return isinstance(model, GPy.core.model.Model)

+
 def flatten_nested(lst):
    result = []
    for element in lst:
-        if hasattr(element, '__iter__'):
+        if hasattr(element, "__iter__"):
            result.extend(flatten_nested(element))
        else:
            result.append(element)
    return result

-@nottest
+
 def test_models():
-    optimize=False
-    plot=True
+    optimize = False
+    plot = True
    examples_path = os.path.dirname(GPy.examples.__file__)
    # Load modules
    failing_models = {}
-    for loader, module_name, is_pkg in pkgutil.iter_modules([examples_path]):
+    for loader, module_name, _is_pkg in pkgutil.iter_modules([examples_path]):
        # Load examples
        module_examples = loader.find_module(module_name).load_module(module_name)
        print("MODULE", module_examples)
        print("Before")
        print(inspect.getmembers(module_examples, predicate=inspect.isfunction))
-        functions = [ func for func in inspect.getmembers(module_examples, predicate=inspect.isfunction) if func[0].startswith('_') is False ][::-1]
+        functions = [
+            func
+            for func in inspect.getmembers(
+                module_examples, predicate=inspect.isfunction
+            )
+            if func[0].startswith("_") is False
+        ][::-1]
        print("After")
        print(functions)
        for example in functions:
-            if example[0] in ['epomeo_gpx']:
-                #These are the edge cases that we might want to handle specially
-                if example[0] == 'epomeo_gpx' and not GPy.util.datasets.gpxpy_available:
+            if example[0] in ["epomeo_gpx"]:
+                # These are the edge cases that we might want to handle specially
+                if example[0] == "epomeo_gpx" and not GPy.util.datasets.gpxpy_available:
                    print("Skipping as gpxpy is not available to parse GPS")
                    continue

@ -63,14 +67,14 @@ def test_models():
            # Generate model

            try:
-                models = [ example[1](optimize=optimize, plot=plot) ]
-                #If more than one model returned, flatten them
+                models = [example[1](optimize=optimize, plot=plot)]
+                # If more than one model returned, flatten them
                models = flatten_nested(models)
            except Exception as e:
                failing_models[example[0]] = "Cannot make model: \n{e}".format(e=e)
            else:
                print(models)
-                model_checkgrads.description = 'test_checkgrads_%s' % example[0]
+                model_checkgrads.description = "test_checkgrads_%s" % example[0]
                try:
                    for model in models:
                        if not model_checkgrads(model):
@ -78,7 +82,7 @@ def test_models():
                except Exception as e:
                    failing_models[model_checkgrads.description] = e

-                model_instance.description = 'test_instance_%s' % example[0]
+                model_instance.description = "test_instance_%s" % example[0]
                try:
                    for model in models:
                        if not model_instance(model):
@ -86,8 +90,8 @@ def test_models():
                except Exception as e:
                    failing_models[model_instance.description] = e

-            #yield model_checkgrads, model
-            #yield model_instance, model
+            # yield model_checkgrads, model
+            # yield model_instance, model

        print("Finished checking module {m}".format(m=module_name))
        if len(failing_models.keys()) > 0:
@ -97,9 +101,3 @@ def test_models():
    if len(failing_models.keys()) > 0:
        print(failing_models)
        raise Exception(failing_models)
-
-
-if __name__ == "__main__":
-    print("Running unit tests, please be (very) patient...")
-    # unittest.main()
-    test_models()
--- a/GPy/testing/deactivated/deactivated_test_mpi.py
+++ b/GPy/testing/deactivated/deactivated_test_mpi.py
@ -1,16 +1,12 @@
 # Copyright (c) 2013-2014, Zhenwen Dai
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import unittest
 import numpy as np
-import GPy

 try:
-    from mpi4py import MPI
    import subprocess

-    class MPITests(unittest.TestCase):
-            
+    class TestMPI:
        def test_BayesianGPLVM_MPI(self):
            code = """
 import numpy as np
@ -33,17 +29,20 @@ if comm.rank==0:
    m._trigger_params_changed()
    print float(m.objective_function())
            """
-            with open('mpi_test__.py','w') as f:
+            with open("mpi_test__.py", "w") as f:
                f.write(code)
                f.close()
-            p = subprocess.Popen('mpirun -n 4 python mpi_test__.py',stdout=subprocess.PIPE,shell=True)
-            (stdout, stderr) = p.communicate()
-            L1 =  float(stdout.splitlines()[-2])
-            L2 =  float(stdout.splitlines()[-1])
-            self.assertTrue(np.allclose(L1,L2))
+            p = subprocess.Popen(
+                "mpirun -n 4 python mpi_test__.py", stdout=subprocess.PIPE, shell=True
+            )
+            (stdout, _stderr) = p.communicate()
+            L1 = float(stdout.splitlines()[-2])
+            L2 = float(stdout.splitlines()[-1])
+            self.assertTrue(np.allclose(L1, L2))
            import os
-            os.remove('mpi_test__.py')
-            
+
+            os.remove("mpi_test__.py")
+
        def test_SparseGPRegression_MPI(self):
            code = """
 import numpy as np
@ -66,27 +65,19 @@ if comm.rank==0:
    m._trigger_params_changed()
    print float(m.objective_function())
            """
-            with open('mpi_test__.py','w') as f:
+            with open("mpi_test__.py", "w") as f:
                f.write(code)
                f.close()
-            p = subprocess.Popen('mpirun -n 4 python mpi_test__.py',stdout=subprocess.PIPE,shell=True)
+            p = subprocess.Popen(
+                "mpirun -n 4 python mpi_test__.py", stdout=subprocess.PIPE, shell=True
+            )
            (stdout, stderr) = p.communicate()
-            L1 =  float(stdout.splitlines()[-2])
-            L2 =  float(stdout.splitlines()[-1])
-            self.assertTrue(np.allclose(L1,L2))
+            L1 = float(stdout.splitlines()[-2])
+            L2 = float(stdout.splitlines()[-1])
+            assert np.allclose(L1, L2)
            import os
-            os.remove('mpi_test__.py')

+            os.remove("mpi_test__.py")

 except:
    pass
-
-
-
-if __name__ == "__main__":
-    print("Running unit tests, please be (very) patient...")
-    try:
-        import mpi4py
-        unittest.main()
-    except:
-        pass
--- a/GPy/testing/fitc.py
+++ b/GPy/testing/fitc.py
@ -1,34 +1,38 @@
 # Copyright (c) 2014, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import unittest
 import numpy as np
 import GPy

-class FITCtest(unittest.TestCase):
-    def setUp(self):
+
+class FITCtest:
+    def setup(self):
        ######################################
        # # 1 dimensional example

        N = 20
        # sample inputs and outputs
-        self.X1D = np.random.uniform(-3., 3., (N, 1))
+        self.X1D = np.random.uniform(-3.0, 3.0, (N, 1))
        self.Y1D = np.sin(self.X1D) + np.random.randn(N, 1) * 0.05

        ######################################
        # # 2 dimensional example

        # sample inputs and outputs
-        self.X2D = np.random.uniform(-3., 3., (N, 2))
-        self.Y2D = np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2]) + np.random.randn(N, 1) * 0.05
+        self.X2D = np.random.uniform(-3.0, 3.0, (N, 2))
+        self.Y2D = (
+            np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2])
+            + np.random.randn(N, 1) * 0.05
+        )

    def test_fitc_1d(self):
+        self.setup()
        m = GPy.models.SparseGPRegression(self.X1D, self.Y1D)
-        m.inference_method=GPy.inference.latent_function_inference.FITC()
-        self.assertTrue(m.checkgrad())
+        m.inference_method = GPy.inference.latent_function_inference.FITC()
+        assert m.checkgrad(), "Gradient check failed!"

    def test_fitc_2d(self):
+        self.setup()
        m = GPy.models.SparseGPRegression(self.X2D, self.Y2D)
-        m.inference_method=GPy.inference.latent_function_inference.FITC()
-        self.assertTrue(m.checkgrad())
-
+        m.inference_method = GPy.inference.latent_function_inference.FITC()
+        assert m.checkgrad(), "Gradient check failed!"
--- a/GPy/testing/gpy_kernels_state_space_tests.py
+++ b/GPy/testing/gpy_kernels_state_space_tests.py
@ -1,454 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (c) 2015, Alex Grigorevskiy
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-"""
-Testing state space related functions.
-"""
-import unittest
-import numpy as np
-import GPy
-import GPy.models.state_space_model as SS_model
-from .state_space_main_tests import generate_x_points, generate_sine_data, \
-    generate_linear_data, generate_brownian_data, generate_linear_plus_sin
-from nose import SkipTest
-
-#from state_space_main_tests import generate_x_points, generate_sine_data, \
-#    generate_linear_data, generate_brownian_data, generate_linear_plus_sin
-
-class StateSpaceKernelsTests(np.testing.TestCase):
-    def setUp(self):
-        pass
-
-    def run_for_model(self, X, Y, ss_kernel, kalman_filter_type = 'regular',
-                      use_cython=False, check_gradients=True,
-                      optimize=True, optimize_max_iters=250, predict_X=None,
-                      compare_with_GP=True, gp_kernel=None,
-                      mean_compare_decimal=10, var_compare_decimal=7):
-
-        m1  = SS_model.StateSpace(X,Y, ss_kernel,
-                                kalman_filter_type=kalman_filter_type,
-                                use_cython=use_cython)
-
-        m1.likelihood[:] = Y.var()/100.
-
-        if check_gradients:
-            self.assertTrue(m1.checkgrad())
-
-        if 1:#optimize:
-            m1.optimize(optimizer='lbfgsb', max_iters=1)
-
-        if compare_with_GP and (predict_X is None):
-            predict_X = X
-
-        self.assertTrue(compare_with_GP)
-        if compare_with_GP:
-            m2  = GPy.models.GPRegression(X,Y, gp_kernel)
-
-            m2[:] = m1[:]
-
-            if (predict_X is not None):
-                x_pred_reg_1 = m1.predict(predict_X)
-                x_quant_reg_1 = m1.predict_quantiles(predict_X)
-
-            x_pred_reg_2 = m2.predict(predict_X)
-            x_quant_reg_2 = m2.predict_quantiles(predict_X)
-
-            np.testing.assert_array_almost_equal(x_pred_reg_1[0], x_pred_reg_2[0], mean_compare_decimal)
-            np.testing.assert_array_almost_equal(x_pred_reg_1[1], x_pred_reg_2[1], var_compare_decimal)
-            np.testing.assert_array_almost_equal(x_quant_reg_1[0], x_quant_reg_2[0], mean_compare_decimal)
-            np.testing.assert_array_almost_equal(x_quant_reg_1[1], x_quant_reg_2[1], mean_compare_decimal)
-            np.testing.assert_array_almost_equal(m1.gradient, m2.gradient, var_compare_decimal)
-            np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), var_compare_decimal)
-
-
-    def test_Matern32_kernel(self,):
-        np.random.seed(234) # seed the random number generator
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
-                        plot = False, points_num=50, x_interval = (0, 20), random=True)
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_Matern32(1,active_dims=[0,])
-        gp_kernel = GPy.kern.Matern32(1,active_dims=[0,])
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                           predict_X=X,
-                           compare_with_GP=True,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=5, var_compare_decimal=5)
-
-    def test_Matern52_kernel(self,):
-        np.random.seed(234) # seed the random number generator
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
-                        plot = False, points_num=50, x_interval = (0, 20), random=True)
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_Matern52(1,active_dims=[0,])
-        gp_kernel = GPy.kern.Matern52(1,active_dims=[0,])
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                           optimize = True, predict_X=X,
-                           compare_with_GP=True, gp_kernel=gp_kernel,
-                           mean_compare_decimal=5, var_compare_decimal=5)
-
-    def test_RBF_kernel(self,):
-        #import pdb;pdb.set_trace()
-        
-        np.random.seed(234) # seed the random number generator
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
-                        plot = False, points_num=50, x_interval = (0, 20), random=True)
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_RBF(1, 110., 1.5, active_dims=[0,], balance=True, approx_order=10)
-        gp_kernel = GPy.kern.RBF(1, 110., 1.5, active_dims=[0,])
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                           predict_X=X,
-                           gp_kernel=gp_kernel,
-                           optimize_max_iters=1000,
-                           mean_compare_decimal=2, var_compare_decimal=1)
-
-    def test_periodic_kernel(self,):
-        np.random.seed(322) # seed the random number generator
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
-                        plot = False, points_num=50, x_interval = (0, 20), random=True)
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_StdPeriodic(1,active_dims=[0,])
-        ss_kernel.lengthscale.constrain_bounded(0.27, 1000)
-        ss_kernel.period.constrain_bounded(0.17, 100)
-
-        gp_kernel = GPy.kern.StdPeriodic(1,active_dims=[0,])
-        gp_kernel.lengthscale.constrain_bounded(0.27, 1000)
-        gp_kernel.period.constrain_bounded(0.17, 100)
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                           predict_X=X,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=3, var_compare_decimal=3)
-
-    def test_quasi_periodic_kernel(self,):
-        np.random.seed(329) # seed the random number generator
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
-                        plot = False, points_num=50, x_interval = (0, 20), random=True)
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_Matern32(1)*GPy.kern.sde_StdPeriodic(1,active_dims=[0,])
-        ss_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        ss_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        gp_kernel = GPy.kern.Matern32(1)*GPy.kern.StdPeriodic(1,active_dims=[0,])
-        gp_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        gp_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                            predict_X=X,
-                            gp_kernel=gp_kernel,
-                            mean_compare_decimal=1, var_compare_decimal=2)
-
-    def test_linear_kernel(self,):
-
-        np.random.seed(234) # seed the random number generator
-        (X,Y) = generate_linear_data(x_points=None, tangent=2.0, add_term=20.0, noise_var=2.0,
-                    plot = False, points_num=50, x_interval = (0, 20), random=True)
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_Linear(1,X,active_dims=[0,]) + GPy.kern.sde_Bias(1, active_dims=[0,])
-        gp_kernel = GPy.kern.Linear(1, active_dims=[0,]) + GPy.kern.Bias(1, active_dims=[0,])
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients= False,
-                           predict_X=X,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=5, var_compare_decimal=5)
-
-    def test_brownian_kernel(self,):
-        np.random.seed(234) # seed the random number generator
-        (X,Y) = generate_brownian_data(x_points=None, kernel_var=2.0, noise_var = 0.1,
-                    plot = False, points_num=50, x_interval = (0, 20), random=True)
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_Brownian()
-        gp_kernel = GPy.kern.Brownian()
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                            predict_X=X,
-                            gp_kernel=gp_kernel,
-                            mean_compare_decimal=4, var_compare_decimal=4)
-
-    def test_exponential_kernel(self,):
-        np.random.seed(12345) # seed the random number generator
-        (X,Y) = generate_linear_data(x_points=None, tangent=1.0, add_term=20.0, noise_var=2.0,
-                    plot = False, points_num=10, x_interval = (0, 20), random=True)
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        ss_kernel = GPy.kern.sde_Exponential(1, Y.var(), X.ptp()/2., active_dims=[0,])
-        gp_kernel = GPy.kern.Exponential(1, Y.var(), X.ptp()/2., active_dims=[0,])
-
-        Y -= Y.mean()
-
-        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
-                      predict_X=X,
-                      gp_kernel=gp_kernel,
-                      optimize_max_iters=1000,
-                      mean_compare_decimal=2, var_compare_decimal=2)
-
-    def test_kernel_addition_svd(self,):
-        #np.random.seed(329) # seed the random number generator
-        np.random.seed(42)
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
-                        plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        (X1,Y1) = generate_linear_data(x_points=X, tangent=1.0, add_term=20.0, noise_var=0.0,
-                    plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        # Sine data <-
-        Y = Y + Y1
-        Y -= Y.mean()
-    
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        def get_new_kernels():
-            ss_kernel = GPy.kern.sde_Linear(1, X, variances=1) + GPy.kern.sde_StdPeriodic(1, period=5.0, variance=300, lengthscale=3, active_dims=[0,])
-            #ss_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-            #ss_kernel.std_periodic.period.constrain_bounded(3, 8)
-
-            gp_kernel = GPy.kern.Linear(1, variances=1) + GPy.kern.StdPeriodic(1, period=5.0, variance=300, lengthscale=3, active_dims=[0,])
-            #gp_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-            #gp_kernel.std_periodic.period.constrain_bounded(3, 8)
-
-            return ss_kernel, gp_kernel
-
-        # Cython is available only with svd.
-        ss_kernel, gp_kernel = get_new_kernels()
-        self.run_for_model(X, Y, ss_kernel, kalman_filter_type = 'svd',
-                           use_cython=True, optimize_max_iters=10, check_gradients=False,
-                           predict_X=X,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=3, var_compare_decimal=3)
-
-        ss_kernel, gp_kernel = get_new_kernels()
-        self.run_for_model(X, Y, ss_kernel, kalman_filter_type = 'svd',
-                           use_cython=False, optimize_max_iters=10, check_gradients=False,
-                           predict_X=X,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=3, var_compare_decimal=3)
-
-    def test_kernel_addition_regular(self,):
-        #np.random.seed(329) # seed the random number generator
-        np.random.seed(42)
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
-                        plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        (X1,Y1) = generate_linear_data(x_points=X, tangent=1.0, add_term=20.0, noise_var=0.0,
-                    plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        # Sine data <-
-        Y = Y + Y1
-        Y -= Y.mean()
-    
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        def get_new_kernels():
-            ss_kernel = GPy.kern.sde_Linear(1, X, variances=1) + GPy.kern.sde_StdPeriodic(1, period=5.0, variance=300, lengthscale=3, active_dims=[0,])
-            #ss_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-            #ss_kernel.std_periodic.period.constrain_bounded(3, 8)
-
-            gp_kernel = GPy.kern.Linear(1, variances=1) + GPy.kern.StdPeriodic(1, period=5.0, variance=300, lengthscale=3, active_dims=[0,])
-            #gp_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-            #gp_kernel.std_periodic.period.constrain_bounded(3, 8)
-
-            return ss_kernel, gp_kernel
-
-        ss_kernel, gp_kernel = get_new_kernels()
-        try:
-            self.run_for_model(X, Y, ss_kernel, kalman_filter_type = 'regular',
-                               use_cython=False, optimize_max_iters=10, check_gradients=True,
-                               predict_X=X,
-                               gp_kernel=gp_kernel,
-                               mean_compare_decimal=2, var_compare_decimal=2)
-        except AssertionError:
-            raise SkipTest("Skipping Regular kalman filter for kernel addition, because it is not stable (normal situation) for this data.")
-
-
-    def test_kernel_multiplication(self,):
-        np.random.seed(329) # seed the random number generator
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
-                        plot = False, points_num=50, x_interval = (0, 20), random=True)
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-
-        def get_new_kernels():
-            ss_kernel = GPy.kern.sde_Matern32(1)*GPy.kern.sde_Matern52(1)
-            gp_kernel = GPy.kern.Matern32(1)*GPy.kern.sde_Matern52(1)
-
-            return ss_kernel, gp_kernel
-
-        ss_kernel, gp_kernel = get_new_kernels()
-
-        #import ipdb;ipdb.set_trace()
-        self.run_for_model(X, Y, ss_kernel, kalman_filter_type = 'svd',
-                           use_cython=True, optimize_max_iters=10, check_gradients=True,
-                            predict_X=X,
-                            gp_kernel=gp_kernel,
-                            mean_compare_decimal=2, var_compare_decimal=2)
-
-        ss_kernel, gp_kernel = get_new_kernels()
-        self.run_for_model(X, Y, ss_kernel, kalman_filter_type = 'regular',
-                           use_cython=False, optimize_max_iters=10, check_gradients=True,
-                            predict_X=X,
-                            gp_kernel=gp_kernel,
-                            mean_compare_decimal=2, var_compare_decimal=2)
-
-        ss_kernel, gp_kernel = get_new_kernels()
-        self.run_for_model(X, Y, ss_kernel, kalman_filter_type = 'svd',
-                           use_cython=False, optimize_max_iters=10, check_gradients=True,
-                            predict_X=X,
-                            gp_kernel=gp_kernel,
-                            mean_compare_decimal=2, var_compare_decimal=2)
-
-    def test_forecast_regular(self,):
-        # Generate data ->
-        np.random.seed(339) # seed the random number generator
-        #import pdb; pdb.set_trace()
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
-                        plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        (X1,Y1) = generate_linear_data(x_points=X, tangent=1.0, add_term=20.0, noise_var=0.0,
-                    plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        Y = Y + Y1
-
-        X_train = X[X <= 20]
-        Y_train = Y[X <= 20]
-        X_test = X[X > 20]
-        Y_test = Y[X > 20]
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-        X_train.shape = (X_train.shape[0],1); Y_train.shape = (Y_train.shape[0],1)
-        X_test.shape = (X_test.shape[0],1); Y_test.shape = (Y_test.shape[0],1)
-        # Generate data <-
-
-        #import pdb; pdb.set_trace()
-
-        periodic_kernel = GPy.kern.StdPeriodic(1,active_dims=[0,])
-        gp_kernel = GPy.kern.Linear(1, active_dims=[0,]) + GPy.kern.Bias(1, active_dims=[0,]) + periodic_kernel
-        gp_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        gp_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        periodic_kernel = GPy.kern.sde_StdPeriodic(1,active_dims=[0,])
-        ss_kernel = GPy.kern.sde_Linear(1,X,active_dims=[0,]) + \
-            GPy.kern.sde_Bias(1, active_dims=[0,]) + periodic_kernel
-
-        ss_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        ss_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        self.run_for_model(X_train, Y_train, ss_kernel, kalman_filter_type = 'regular',
-                           use_cython=False, optimize_max_iters=30, check_gradients=True,
-                           predict_X=X_test,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=2, var_compare_decimal=2)
-
-    def test_forecast_svd(self,):
-        # Generate data ->
-        np.random.seed(339) # seed the random number generator
-        #import pdb; pdb.set_trace()
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
-                        plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        (X1,Y1) = generate_linear_data(x_points=X, tangent=1.0, add_term=20.0, noise_var=0.0,
-                    plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        Y = Y + Y1
-
-        X_train = X[X <= 20]
-        Y_train = Y[X <= 20]
-        X_test = X[X > 20]
-        Y_test = Y[X > 20]
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-        X_train.shape = (X_train.shape[0],1); Y_train.shape = (Y_train.shape[0],1)
-        X_test.shape = (X_test.shape[0],1); Y_test.shape = (Y_test.shape[0],1)
-        # Generate data <-
-
-        #import pdb; pdb.set_trace()
-
-        periodic_kernel = GPy.kern.StdPeriodic(1,active_dims=[0,])
-        gp_kernel = GPy.kern.Linear(1, active_dims=[0,]) + GPy.kern.Bias(1, active_dims=[0,]) + periodic_kernel
-        gp_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        gp_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        periodic_kernel = GPy.kern.sde_StdPeriodic(1,active_dims=[0,])
-        ss_kernel = GPy.kern.sde_Linear(1,X,active_dims=[0,]) + \
-            GPy.kern.sde_Bias(1, active_dims=[0,]) + periodic_kernel
-
-        ss_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        ss_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        self.run_for_model(X_train, Y_train, ss_kernel, kalman_filter_type = 'svd',
-                           use_cython=False, optimize_max_iters=30, check_gradients=False,
-                           predict_X=X_test,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=2, var_compare_decimal=2)
-
-    def test_forecast_svd_cython(self,):
-        # Generate data ->
-        np.random.seed(339) # seed the random number generator
-        #import pdb; pdb.set_trace()
-        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
-                        plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        (X1,Y1) = generate_linear_data(x_points=X, tangent=1.0, add_term=20.0, noise_var=0.0,
-                    plot = False, points_num=100, x_interval = (0, 40), random=True)
-
-        Y = Y + Y1
-
-        X_train = X[X <= 20]
-        Y_train = Y[X <= 20]
-        X_test = X[X > 20]
-        Y_test = Y[X > 20]
-
-        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)
-        X_train.shape = (X_train.shape[0],1); Y_train.shape = (Y_train.shape[0],1)
-        X_test.shape = (X_test.shape[0],1); Y_test.shape = (Y_test.shape[0],1)
-        # Generate data <-
-
-        #import pdb; pdb.set_trace()
-
-        periodic_kernel = GPy.kern.StdPeriodic(1,active_dims=[0,])
-        gp_kernel = GPy.kern.Linear(1, active_dims=[0,]) + GPy.kern.Bias(1, active_dims=[0,]) + periodic_kernel
-        gp_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        gp_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        periodic_kernel = GPy.kern.sde_StdPeriodic(1,active_dims=[0,])
-        ss_kernel = GPy.kern.sde_Linear(1,X,active_dims=[0,]) + \
-            GPy.kern.sde_Bias(1, active_dims=[0,]) + periodic_kernel
-
-        ss_kernel.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
-        ss_kernel.std_periodic.period.constrain_bounded(0.15, 100)
-
-        self.run_for_model(X_train, Y_train, ss_kernel, kalman_filter_type = 'svd',
-                           use_cython=True, optimize_max_iters=30, check_gradients=False,
-                           predict_X=X_test,
-                           gp_kernel=gp_kernel,
-                           mean_compare_decimal=2, var_compare_decimal=2)
-
-if __name__ == "__main__":
-    print("Running state-space inference tests...")
-    unittest.main()
-
-    #tt = StateSpaceKernelsTests('test_RBF_kernel')
-    #import pdb; pdb.set_trace()
-    #tt.test_Matern32_kernel()
-    #tt.test_Matern52_kernel()
-    #tt.test_RBF_kernel()
-    #tt.test_periodic_kernel()
-    #tt.test_quasi_periodic_kernel()
-    #tt.test_linear_kernel()
-    #tt.test_brownian_kernel()
-    #tt.test_exponential_kernel()
-    #tt.test_kernel_addition()
-    #tt.test_kernel_multiplication()
-    #tt.test_forecast()
-
--- a/GPy/testing/inference_tests.py
+++ b/GPy/testing/inference_tests.py
@ -1,179 +0,0 @@
-# Copyright (c) 2014, Max Zwiessele
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-"""
-The test cases for various inference algorithms
-"""
-
-import unittest
-import numpy as np
-import GPy
-#np.seterr(invalid='raise')
-
-class InferenceXTestCase(unittest.TestCase):
-
-    def genData(self):
-        np.random.seed(1111)
-        Ylist = GPy.examples.dimensionality_reduction._simulate_matern(5, 1, 1, 10, 3, False)[0]
-        return Ylist[0]
-
-    def test_inferenceX_BGPLVM_Linear(self):
-        Ys = self.genData()
-        m = GPy.models.BayesianGPLVM(Ys,3,kernel=GPy.kern.Linear(3,ARD=True))
-        m.optimize()
-        x, mi = m.infer_newX(m.Y, optimize=True)
-        np.testing.assert_array_almost_equal(m.X.mean, mi.X.mean, decimal=2)
-        np.testing.assert_array_almost_equal(m.X.variance, mi.X.variance, decimal=2)
-
-    def test_inferenceX_BGPLVM_RBF(self):
-        Ys = self.genData()
-        m = GPy.models.BayesianGPLVM(Ys,3,kernel=GPy.kern.RBF(3,ARD=True))
-        import warnings
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            m.optimize()
-        x, mi = m.infer_newX(m.Y, optimize=True)
-        np.testing.assert_array_almost_equal(m.X.mean, mi.X.mean, decimal=2)
-        np.testing.assert_array_almost_equal(m.X.variance, mi.X.variance, decimal=2)
-
-    def test_inferenceX_GPLVM_Linear(self):
-        Ys = self.genData()
-        m = GPy.models.GPLVM(Ys,3,kernel=GPy.kern.Linear(3,ARD=True))
-        m.optimize()
-        x, mi = m.infer_newX(m.Y, optimize=True)
-        np.testing.assert_array_almost_equal(m.X, mi.X, decimal=2)
-
-    def test_inferenceX_GPLVM_RBF(self):
-        Ys = self.genData()
-        m = GPy.models.GPLVM(Ys,3,kernel=GPy.kern.RBF(3,ARD=True))
-        m.optimize()
-        x, mi = m.infer_newX(m.Y, optimize=True)
-        np.testing.assert_array_almost_equal(m.X, mi.X, decimal=2)
-
-class InferenceGPEP(unittest.TestCase):
-
-    def genData(self):
-        np.random.seed(1)
-        k = GPy.kern.RBF(1, variance=7., lengthscale=0.2)
-        X = np.random.rand(200,1)
-        f = np.random.multivariate_normal(np.zeros(200), k.K(X) + 1e-5 * np.eye(X.shape[0]))
-        lik = GPy.likelihoods.Bernoulli()
-        p = lik.gp_link.transf(f) # squash the latent function
-        Y = lik.samples(f).reshape(-1,1)
-        return X, Y
-
-    def genNoisyData(self):
-        np.random.seed(1)
-        X = np.random.rand(100,1)
-        self.real_std = 0.1
-        noise = np.random.randn(*X[:, 0].shape)*self.real_std
-        Y = (np.sin(X[:, 0]*2*np.pi) + noise)[:, None]
-        self.f = np.random.rand(X.shape[0],1)
-        Y_extra_noisy = Y.copy()
-        Y_extra_noisy[50] += 4.
-        # Y_extra_noisy[80:83] -= 2.
-        return X, Y, Y_extra_noisy
-
-    def test_inference_EP(self):
-        from paramz import ObsAr
-        X, Y = self.genData()
-        lik = GPy.likelihoods.Bernoulli()
-        k = GPy.kern.RBF(1, variance=7., lengthscale=0.2)
-        inf = GPy.inference.latent_function_inference.expectation_propagation.EP(max_iters=30, delta=0.5)
-        self.model = GPy.core.GP(X=X,
-                        Y=Y,
-                        kernel=k,
-                        inference_method=inf,
-                        likelihood=lik)
-        K = self.model.kern.K(X)
-        mean_prior = np.zeros(K.shape[0])
-        post_params, ga_approx, cav_params, log_Z_tilde = self.model.inference_method.expectation_propagation(mean_prior, K, ObsAr(Y), lik, None)
-
-        mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
-        p, m, d = self.model.inference_method._inference(Y, mean_prior, K, ga_approx, cav_params, lik, Y_metadata=None,  Z_tilde=log_Z_tilde)
-        p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, inf).inference(k, X,lik ,mu_tilde[:,None], mean_function=None, variance=1./ga_approx.tau, K=K, Z_tilde=log_Z_tilde + np.sum(- 0.5*np.log(ga_approx.tau) + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)))
-
-        assert (np.sum(np.array([m - m0,
-                    np.sum(d['dL_dK'] - d0['dL_dK']),
-                    np.sum(d['dL_dthetaL'] - d0['dL_dthetaL']),
-                    np.sum(d['dL_dm'] - d0['dL_dm']),
-                    np.sum(p._woodbury_vector - p0._woodbury_vector),
-                    np.sum(p.woodbury_inv - p0.woodbury_inv)])) < 1e6)
-
-    # NOTE: adding a test like above for parameterized likelihood- the above test is
-    # only for probit likelihood which does not have any tunable hyperparameter which is why
-    # the term in dictionary of gradients: dL_dthetaL will always be zero. So here we repeat tests for
-    # student-t likelihood and heterodescastic gaussian noise case. This test simply checks if the posterior
-    # and gradients of log marginal are roughly the same for inference through EP and exact gaussian inference using
-    # the gaussian approximation for the individual likelihood site terms. For probit likelihood, it is possible to
-    # calculate moments analytically, but for other likelihoods, we will need to use numerical quadrature techniques,
-    # and it is possible that any error might creep up because of quadrature implementation.
-    def test_inference_EP_non_classification(self):
-        from paramz import ObsAr
-        X, Y, Y_extra_noisy = self.genNoisyData()
-        deg_freedom = 5.
-        init_noise_var = 0.08
-        lik_studentT = GPy.likelihoods.StudentT(deg_free=deg_freedom, sigma2=init_noise_var)
-        # like_gaussian_noise = GPy.likelihoods.MixedNoise()
-        k = GPy.kern.RBF(1, variance=2., lengthscale=1.1)
-        ep_inf_alt = GPy.inference.latent_function_inference.expectation_propagation.EP(max_iters=4, delta=0.5)
-        # ep_inf_nested = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode='nested', max_iters=100, delta=0.5)
-        m = GPy.core.GP(X=X,Y=Y_extra_noisy,kernel=k,likelihood=lik_studentT,inference_method=ep_inf_alt)
-        K = m.kern.K(X)
-        mean_prior = np.zeros(K.shape[0])
-        post_params, ga_approx, cav_params, log_Z_tilde = m.inference_method.expectation_propagation(mean_prior, K, ObsAr(Y_extra_noisy), lik_studentT, None)
-
-        mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
-        p, m, d = m.inference_method._inference(Y_extra_noisy, mean_prior, K, ga_approx, cav_params, lik_studentT, Y_metadata=None,  Z_tilde=log_Z_tilde)
-        p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, ep_inf_alt).inference(k, X,lik_studentT ,mu_tilde[:,None], mean_function=None, variance=1./ga_approx.tau, K=K, Z_tilde=log_Z_tilde + np.sum(- 0.5*np.log(ga_approx.tau) + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)))
-
-        assert (np.sum(np.array([m - m0,
-                    np.sum(d['dL_dK'] - d0['dL_dK']),
-                    np.sum(d['dL_dthetaL'] - d0['dL_dthetaL']),
-                    np.sum(d['dL_dm'] - d0['dL_dm']),
-                    np.sum(p._woodbury_vector - p0._woodbury_vector),
-                    np.sum(p.woodbury_inv - p0.woodbury_inv)])) < 1e6)
-
-class VarDtcTest(unittest.TestCase):
-
-    def test_var_dtc_inference_with_mean(self):
-        """ Check dL_dm in var_dtc is calculated correctly"""
-        np.random.seed(1)
-        x = np.linspace(0.,2*np.pi,100)[:,None]
-        y = -np.cos(x)+np.random.randn(*x.shape)*0.3+1
-        m = GPy.models.SparseGPRegression(x,y, mean_function=GPy.mappings.Linear(input_dim=1, output_dim=1))
-        self.assertTrue(m.checkgrad())
-
-
-class HMCSamplerTest(unittest.TestCase):
-
-    def test_sampling(self):
-        np.random.seed(1)
-        x = np.linspace(0.,2*np.pi,100)[:,None]
-        y = -np.cos(x)+np.random.randn(*x.shape)*0.3+1
-
-        m = GPy.models.GPRegression(x,y)
-        m.kern.lengthscale.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
-        m.kern.variance.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
-        m.likelihood.variance.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
-
-        hmc = GPy.inference.mcmc.HMC(m,stepsize=1e-2)
-        s = hmc.sample(num_samples=3)
-
-class MCMCSamplerTest(unittest.TestCase):
-
-    def test_sampling(self):
-        np.random.seed(1)
-        x = np.linspace(0.,2*np.pi,100)[:,None]
-        y = -np.cos(x)+np.random.randn(*x.shape)*0.3+1
-
-        m = GPy.models.GPRegression(x,y)
-        m.kern.lengthscale.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
-        m.kern.variance.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
-        m.likelihood.variance.set_prior(GPy.priors.Gamma.from_EV(1.,10.))
-
-        mcmc = GPy.inference.mcmc.Metropolis_Hastings(m)
-        mcmc.sample(Ntotal=100, Nburn=10)
-
-if __name__ == "__main__":
-    unittest.main()
--- a/GPy/testing/link_function_tests.py
+++ b/GPy/testing/link_function_tests.py
@ -1,148 +0,0 @@
-import numpy as np
-import scipy
-from scipy.special import cbrt
-from GPy.models import GradientChecker
-import random
-_lim_val = np.finfo(np.float64).max
-_lim_val_exp = np.log(_lim_val)
-_lim_val_square = np.sqrt(_lim_val)
-_lim_val_cube = cbrt(_lim_val)
-from GPy.likelihoods.link_functions import Identity, Probit, Cloglog, Log, Log_ex_1, Reciprocal, Heaviside, ScaledProbit
-
-class LinkFunctionTests(np.testing.TestCase):
-    def setUp(self):
-        self.small_f = np.array([[-1e-4]])
-        self.zero_f = np.array([[1e-4]])
-        self.mid_f = np.array([[5.0]])
-        self.large_f = np.array([[1e4]])
-        self.f_lower_lim = np.array(-np.inf)
-        self.f_upper_lim = np.array(np.inf)
-
-    def check_gradient(self, link_func, lim_of_inf, test_lim=False):
-        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.mid_f)
-        self.assertTrue(grad.checkgrad(verbose=True))
-        grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=self.mid_f)
-        self.assertTrue(grad2.checkgrad(verbose=True))
-        grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=self.mid_f)
-        self.assertTrue(grad3.checkgrad(verbose=True))
-
-        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.small_f)
-        self.assertTrue(grad.checkgrad(verbose=True))
-        grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=self.small_f)
-        self.assertTrue(grad2.checkgrad(verbose=True))
-        grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=self.small_f)
-        self.assertTrue(grad3.checkgrad(verbose=True))
-
-        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.zero_f)
-        self.assertTrue(grad.checkgrad(verbose=True))
-        grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=self.zero_f)
-        self.assertTrue(grad2.checkgrad(verbose=True))
-        grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=self.zero_f)
-        self.assertTrue(grad3.checkgrad(verbose=True))
-
-        #Do a limit test if the large f value is too large
-        large_f = np.clip(self.large_f, -np.inf, lim_of_inf-1e-3)
-        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=large_f)
-        self.assertTrue(grad.checkgrad(verbose=True))
-        grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=large_f)
-        self.assertTrue(grad2.checkgrad(verbose=True))
-        grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=large_f)
-        self.assertTrue(grad3.checkgrad(verbose=True))
-
-        if test_lim:
-            print("Testing limits")
-            #Remove some otherwise we are too close to the limit for gradcheck to work effectively
-            lim_of_inf = lim_of_inf - 1e-4
-            grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=lim_of_inf)
-            self.assertTrue(grad.checkgrad(verbose=True))
-            grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=lim_of_inf)
-            self.assertTrue(grad2.checkgrad(verbose=True))
-            grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=lim_of_inf)
-            self.assertTrue(grad3.checkgrad(verbose=True))
-
-    def check_overflow(self, link_func, lim_of_inf):
-        #Check that it does something sensible beyond this limit,
-        #note this is not checking the value is correct, just that it isn't nan
-        beyond_lim_of_inf = lim_of_inf + 100.0
-        self.assertFalse(np.isinf(link_func.transf(beyond_lim_of_inf)))
-        self.assertFalse(np.isinf(link_func.dtransf_df(beyond_lim_of_inf)))
-        self.assertFalse(np.isinf(link_func.d2transf_df2(beyond_lim_of_inf)))
-
-        self.assertFalse(np.isnan(link_func.transf(beyond_lim_of_inf)))
-        self.assertFalse(np.isnan(link_func.dtransf_df(beyond_lim_of_inf)))
-        self.assertFalse(np.isnan(link_func.d2transf_df2(beyond_lim_of_inf)))
-
-    def test_log_overflow(self):
-        link = Log()
-        lim_of_inf = _lim_val_exp
-
-        np.testing.assert_almost_equal(np.exp(self.mid_f), link.transf(self.mid_f))
-        assert np.isinf(np.exp(np.log(self.f_upper_lim)))
-        #Check the clipping works
-        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
-        self.assertTrue(np.isfinite(link.transf(self.f_upper_lim)))
-        self.check_overflow(link, lim_of_inf)
-
-        #Check that it would otherwise fail
-        beyond_lim_of_inf = lim_of_inf + 10.0
-        old_err_state = np.seterr(over='ignore')
-        self.assertTrue(np.isinf(np.exp(beyond_lim_of_inf)))
-        np.seterr(**old_err_state)
-
-    def test_log_ex_1_overflow(self):
-        link = Log_ex_1()
-        lim_of_inf = _lim_val_exp
-
-        np.testing.assert_almost_equal(scipy.special.log1p(np.exp(self.mid_f)), link.transf(self.mid_f))
-        assert np.isinf(scipy.special.log1p(np.exp(np.log(self.f_upper_lim))))
-        #Check the clipping works
-        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
-        #Need to look at most significant figures here rather than the decimals
-        np.testing.assert_approx_equal(link.transf(self.f_upper_lim), scipy.special.log1p(_lim_val), significant=5)
-        self.check_overflow(link, lim_of_inf)
-
-        #Check that it would otherwise fail
-        beyond_lim_of_inf = lim_of_inf + 10.0
-        old_err_state = np.seterr(over='ignore')
-        self.assertTrue(np.isinf(scipy.special.log1p(np.exp(beyond_lim_of_inf))))
-        np.seterr(**old_err_state)
-
-
-    def test_log_gradients(self):
-        # transf dtransf_df d2transf_df2 d3transf_df3
-        link = Log()
-        lim_of_inf = _lim_val_exp
-        self.check_gradient(link, lim_of_inf, test_lim=True)
-
-    def test_identity_gradients(self):
-        link = Identity()
-        lim_of_inf = _lim_val
-        #FIXME: Should be able to think of a way to test the limits of this
-        self.check_gradient(link, lim_of_inf, test_lim=False)
-
-    def test_probit_gradients(self):
-        link = Probit()
-        lim_of_inf = _lim_val
-        self.check_gradient(link, lim_of_inf, test_lim=True)
-        
-    def test_scaledprobit_gradients(self):
-        link = ScaledProbit(nu=random.random())
-        lim_of_inf = _lim_val
-        self.check_gradient(link, lim_of_inf, test_lim=True)
-
-    def test_Cloglog_gradients(self):
-        link = Cloglog()
-        lim_of_inf = _lim_val_exp
-        self.check_gradient(link, lim_of_inf, test_lim=True)
-
-    def test_Log_ex_1_gradients(self):
-        link = Log_ex_1()
-        lim_of_inf = _lim_val_exp
-        self.check_gradient(link, lim_of_inf, test_lim=True)
-        self.check_overflow(link, lim_of_inf)
-
-    def test_reciprocal_gradients(self):
-        link = Reciprocal()
-        lim_of_inf = _lim_val
-        #Does not work with much smaller values, and values closer to zero than 1e-5
-        self.check_gradient(link, lim_of_inf, test_lim=True)
--- a/GPy/testing/meanfunc_tests.py
+++ b/GPy/testing/meanfunc_tests.py
@ -1,95 +0,0 @@
-# Copyright (c) 2015, James Hensman
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import unittest
-import numpy as np
-import GPy
-
-class MFtests(unittest.TestCase):
-    def test_simple_mean_function(self):
-        """
-        The simplest possible mean function. No parameters, just a simple Sinusoid.
-        """
-        #create  simple mean function
-        mf = GPy.core.Mapping(1,1)
-        mf.f = np.sin
-        mf.update_gradients = lambda a,b: None
-
-        X = np.linspace(0,10,50).reshape(-1,1)
-        Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
-
-        k =GPy.kern.RBF(1)
-        lik = GPy.likelihoods.Gaussian()
-        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
-        self.assertTrue(m.checkgrad())
-
-    def test_parametric_mean_function(self):
-        """
-        A linear mean function with parameters that we'll learn alongside the kernel
-        """
-
-        X = np.linspace(-1,10,50).reshape(-1,1)
-        
-        Y = 3-np.abs((X-6))
-        Y += .5*np.cos(3*X) + 0.3*np.random.randn(*X.shape) 
-
-        mf = GPy.mappings.PiecewiseLinear(1, 1, [-1,1], [9,2])
-
-        k =GPy.kern.RBF(1)
-        lik = GPy.likelihoods.Gaussian()
-        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
-        self.assertTrue(m.checkgrad())
-
-    def test_parametric_mean_function_composition(self):
-        """
-        A linear mean function with parameters that we'll learn alongside the kernel
-        """
-
-        X = np.linspace(0,10,50).reshape(-1,1)
-        Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X
-
-        mf = GPy.mappings.Compound(GPy.mappings.Linear(1,1), 
-                                   GPy.mappings.Kernel(1, 1, np.random.normal(0,1,(1,1)), 
-                                                       GPy.kern.RBF(1))
-                                   )
-
-        k =GPy.kern.RBF(1)
-        lik = GPy.likelihoods.Gaussian()
-        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
-        self.assertTrue(m.checkgrad())
-
-    def test_parametric_mean_function_additive(self):
-        """
-        A linear mean function with parameters that we'll learn alongside the kernel
-        """
-
-        X = np.linspace(0,10,50).reshape(-1,1)
-        Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X
-
-        mf = GPy.mappings.Additive(GPy.mappings.Constant(1,1,3),
-               GPy.mappings.Additive(GPy.mappings.MLP(1,1),
-                     GPy.mappings.Identity(1,1)
-                           )
-                        )
-
-        k =GPy.kern.RBF(1)
-        lik = GPy.likelihoods.Gaussian()
-        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
-        self.assertTrue(m.checkgrad())
-
-    def test_svgp_mean_function(self):
-
-        # an instance of the SVIGOP with a men function
-        X = np.linspace(0,10,500).reshape(-1,1)
-        Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
-        Y = np.where(Y>0, 1,0) # make aclassificatino problem
-
-        mf = GPy.mappings.Linear(1,1)
-        Z = np.linspace(0,10,50).reshape(-1,1)
-        lik = GPy.likelihoods.Bernoulli()
-        k =GPy.kern.RBF(1) + GPy.kern.White(1, 1e-4)
-        m = GPy.core.SVGP(X, Y,Z=Z, kernel=k, likelihood=lik, mean_function=mf)
-        self.assertTrue(m.checkgrad())
-
-
-
--- a/GPy/testing/minibatch_tests.py
+++ b/GPy/testing/minibatch_tests.py
@ -1,230 +0,0 @@
-'''
-Created on 4 Sep 2015
-
-@author: maxz
-'''
-import unittest
-import numpy as np
-import GPy
-
-class BGPLVMTest(unittest.TestCase):
-
-
-    def setUp(self):
-        np.random.seed(12345)
-        X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13))
-        Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1]))
-        self.inan = np.random.binomial(1, .1, Y.shape).astype(bool)
-        self.X, self.W, self.Y = X,W,Y
-        self.Q = 3
-        self.m_full = GPy.models.BayesianGPLVM(Y, self.Q)
-
-    def test_lik_comparisons_m1_s0(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False)
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_predict_missing_data(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-
-        self.assertRaises(NotImplementedError, m.predict, m.X, full_cov=True)
-
-        mu1, var1 = m.predict(m.X, full_cov=False)
-        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
-        np.testing.assert_allclose(mu1, mu2)
-        np.testing.assert_allclose(var1, var2)
-
-        mu1, var1 = m.predict(m.X.mean, full_cov=True)
-        mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=True)
-        np.testing.assert_allclose(mu1, mu2)
-        np.testing.assert_allclose(var1[:,:,0], var2)
-
-        mu1, var1 = m.predict(m.X.mean, full_cov=False)
-        mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=False)
-        np.testing.assert_allclose(mu1, mu2)
-        np.testing.assert_allclose(var1[:,[0]], var2)
-
-    def test_lik_comparisons_m0_s0(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=self.m_full.X.variance.values, missing_data=False, stochastic=False)
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_lik_comparisons_m1_s1(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_lik_comparisons_m0_s1(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_gradients_missingdata(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False, batchsize=self.Y.shape[1])
-        assert(m.checkgrad())
-
-    def test_gradients_missingdata_stochastics(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=1)
-        assert(m.checkgrad())
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=4)
-        assert(m.checkgrad())
-
-    def test_gradients_stochastics(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=1)
-        assert(m.checkgrad())
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=4)
-        assert(m.checkgrad())
-
-    def test_predict(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-class SparseGPMinibatchTest(unittest.TestCase):
-
-
-    def setUp(self):
-        np.random.seed(12345)
-        X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13))
-        Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1]))
-        self.inan = np.random.binomial(1, .1, Y.shape).astype(bool)
-        self.X, self.W, self.Y = X,W,Y
-        self.Q = 3
-        self.m_full = GPy.models.SparseGPLVM(Y, self.Q, kernel=GPy.kern.RBF(self.Q, ARD=True))
-
-    def test_lik_comparisons_m1_s0(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=False)
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_sparsegp_init(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        try:
-            np.random.seed(1234)
-            Z = self.X[np.random.choice(self.X.shape[0], replace=False, size=10)].copy()
-            Q = Z.shape[1]
-            m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=True, stochastic=False)
-            assert(m.checkgrad())
-            m.optimize('adadelta', max_iters=10)
-            assert(m.checkgrad())
-    
-            m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=True, stochastic=True)
-            assert(m.checkgrad())
-            m.optimize('rprop', max_iters=10)
-            assert(m.checkgrad())
-            
-            m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=False, stochastic=False)
-            assert(m.checkgrad())
-            m.optimize('rprop', max_iters=10)
-            assert(m.checkgrad())
-            
-            m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=False, stochastic=True)
-            assert(m.checkgrad())
-            m.optimize('adadelta', max_iters=10)
-            assert(m.checkgrad())
-        except ImportError:
-            from nose import SkipTest
-            raise SkipTest('climin not installed, skipping stochastic gradients')
-
-    def test_predict_missing_data(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-
-        mu1, var1 = m.predict(m.X, full_cov=False)
-        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
-        np.testing.assert_allclose(mu1, mu2)
-        for i in range(var1.shape[1]):
-            np.testing.assert_allclose(var1[:,[i]], var2)
-
-        mu1, var1 = m.predict(m.X, full_cov=True)
-        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=True)
-        np.testing.assert_allclose(mu1, mu2)
-        for i in range(var1.shape[2]):
-            np.testing.assert_allclose(var1[:,:,i], var2)
-            
-    def test_lik_comparisons_m0_s0(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=False)
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_lik_comparisons_m1_s1(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_lik_comparisons_m0_s1(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-    def test_gradients_missingdata(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=False, batchsize=self.Y.shape[1])
-        assert(m.checkgrad())
-
-    def test_gradients_missingdata_stochastics(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=1)
-        assert(m.checkgrad())
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=4)
-        assert(m.checkgrad())
-
-    def test_gradients_stochastics(self):
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=True, batchsize=1)
-        assert(m.checkgrad())
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=True, batchsize=4)
-        assert(m.checkgrad())
-
-    def test_predict(self):
-        # Test if the different implementations give the exact same likelihood as the full model.
-        # All of the following settings should give the same likelihood and gradients as the full model:
-        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
-        m[:] = self.m_full[:]
-        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
-        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
-        assert(m.checkgrad())
-
-
-if __name__ == "__main__":
-    #import sys;sys.argv = ['', 'Test.testName']
-    unittest.main()
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
--- a/GPy/testing/mpi_test__.py
+++ b/GPy/testing/mpi_test__.py
@ -0,0 +1,21 @@
+
+import numpy as np
+import GPy
+from mpi4py import MPI
+np.random.seed(123456)
+comm = MPI.COMM_WORLD
+N = 100
+x = np.linspace(-6., 6., N)
+y = np.sin(x) + np.random.randn(N) * 0.05
+comm.Bcast(y)
+data = np.vstack([x,y])
+#infr = GPy.inference.latent_function_inference.VarDTC_minibatch(mpi_comm=comm)
+m = GPy.models.SparseGPRegression(data[:1].T,data[1:2].T,mpi_comm=comm)
+m.optimize(max_iters=10)
+if comm.rank==0:
+    print float(m.objective_function())
+    m.inference_method.mpi_comm=None
+    m.mpi_comm=None
+    m._trigger_params_changed()
+    print float(m.objective_function())
+            
--- a/GPy/testing/pickle_tests.py
+++ b/GPy/testing/pickle_tests.py
@ -1,130 +0,0 @@
-'''
-Created on 13 Mar 2014
-
-@author: maxz
-'''
-import unittest, itertools
-#import cPickle as pickle
-import pickle
-import numpy as np
-import tempfile
-from GPy.examples.dimensionality_reduction import mrd_simulation
-from GPy.core.parameterization.variational import NormalPosterior
-from GPy.models.gp_regression import GPRegression
-import GPy
-from nose import SkipTest
-
-def toy_model():
-    X = np.linspace(0,1,50)[:, None]
-    Y = np.sin(X)
-    m = GPRegression(X=X, Y=Y)
-    return m
-
-class ListDictTestCase(unittest.TestCase):
-    def assertListDictEquals(self, d1, d2, msg=None):
-        #py3 fix
-        #for k,v in d1.iteritems():
-        for k,v in d1.items():
-            self.assertListEqual(list(v), list(d2[k]), msg)
-    def assertArrayListEquals(self, l1, l2):
-        for a1, a2 in zip(l1,l2):
-            np.testing.assert_array_equal(a1, a2)
-
-class Test(ListDictTestCase):
-    @SkipTest
-    def test_load_pickle(self):
-        import os
-        m = GPy.load(os.path.join(os.path.abspath(os.path.split(__file__)[0]), 'pickle_test.pickle'))
-        self.assertTrue(m.checkgrad())
-        self.assertEqual(m.log_likelihood(), -4.7351019830022087)
-
-    def test_model(self):
-        par = toy_model()
-        pcopy = par.copy()
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
-        self.assertTrue(pcopy.checkgrad())
-        self.assert_(np.any(pcopy.gradient!=0.0))
-        with tempfile.TemporaryFile('w+b') as f:
-            par.pickle(f)
-            f.seek(0)
-            pcopy = pickle.load(f)
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assert_(pcopy.checkgrad())
-
-    def test_modelrecreation(self):
-        par = toy_model()
-        pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
-        np.testing.assert_allclose(par.param_array, pcopy.param_array)
-        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
-        self.assertTrue(pcopy.checkgrad())
-        self.assert_(np.any(pcopy.gradient!=0.0))
-        np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)
-        par.randomize()
-        with tempfile.TemporaryFile('w+b') as f:
-            par.pickle(f)
-            f.seek(0)
-            pcopy = pickle.load(f)
-        np.testing.assert_allclose(par.param_array, pcopy.param_array)
-        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full, atol=1e-6)
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assert_(pcopy.checkgrad())
-
-    def test_posterior(self):
-        X = np.random.randn(3,5)
-        Xv = np.random.rand(*X.shape)
-        par = NormalPosterior(X,Xv)
-        par.gradient = 10
-        pcopy = par.copy()
-        pcopy.gradient = 10
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
-        with tempfile.TemporaryFile('w+b') as f:
-            par.pickle(f)
-            f.seek(0)
-            pcopy = pickle.load(f)
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        pcopy.gradient = 10
-        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
-        np.testing.assert_allclose(pcopy.mean.gradient_full, 10)
-        self.assertSequenceEqual(str(par), str(pcopy))
-
-    def test_model_concat(self):
-        par = mrd_simulation(optimize=0, plot=0, plot_sim=0)
-        par.randomize()
-        pcopy = par.copy()
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
-        self.assertTrue(par.checkgrad())
-        self.assertTrue(pcopy.checkgrad())
-        self.assert_(np.any(pcopy.gradient!=0.0))
-        with tempfile.TemporaryFile('w+b') as f:
-            par.pickle(f)
-            f.seek(0)
-            pcopy = pickle.load(f)
-        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
-        self.assertSequenceEqual(str(par), str(pcopy))
-        self.assert_(pcopy.checkgrad())
-
-    def _callback(self, what, which):
-        what.count += 1
-
-
-if __name__ == "__main__":
-    #import sys;sys.argv = ['', 'Test.test_parameter_index_operations']
-    unittest.main()
--- a/GPy/testing/plotting_tests.py
+++ b/GPy/testing/plotting_tests.py
@ -1,509 +0,0 @@
-#===============================================================================
-# Copyright (c) 2015, Max Zwiessele
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of GPy nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
-
-
-#===============================================================================
-# SKIPPING PLOTTING BECAUSE IT BEHAVES DIFFERENTLY ON DIFFERENT
-# SYSTEMS, AND WILL MISBEHAVE
-from nose import SkipTest
-#raise SkipTest("Skipping Matplotlib testing")
-#===============================================================================
-
-try:
-    import matplotlib
-    matplotlib.use('agg')
-except ImportError:
-    # matplotlib not installed
-    from nose import SkipTest
-    raise SkipTest("Error importing matplotlib")
-
-from unittest.case import TestCase
-
-import numpy as np
-import GPy, os
-import logging
-
-from GPy.util.config import config
-from GPy.plotting import change_plotting_library, plotting_library
-
-class ConfigTest(TestCase):
-    def tearDown(self):
-        change_plotting_library('matplotlib')
-
-    def test_change_plotting(self):
-        self.assertRaises(ValueError, change_plotting_library, 'not+in9names')
-        change_plotting_library('none')
-        self.assertRaises(RuntimeError, plotting_library)
-
-change_plotting_library('matplotlib')
-if config.get('plotting', 'library') != 'matplotlib':
-    raise SkipTest("Matplotlib not installed, not testing plots")
-
-try:
-    from matplotlib import cbook, pyplot as plt
-    from matplotlib.testing.compare import compare_images
-except ImportError:
-    raise SkipTest("Matplotlib not installed, not testing plots")
-
-extensions = ['npz']
-
-basedir = os.path.dirname(os.path.relpath(os.path.abspath(__file__)))
-
-def _image_directories():
-    """
-    Compute the baseline and result image directories for testing *func*.
-    Create the result directory if it doesn't exist.
-    """
-    #module_name = __init__.__module__
-    #mods = module_name.split('.')
-    #basedir = os.path.join(*mods)
-    result_dir = os.path.join(basedir, 'testresult','.')
-    baseline_dir = os.path.join(basedir, 'baseline','.')
-    if not os.path.exists(result_dir):
-        os.makedirs(result_dir)
-    return baseline_dir, result_dir
-
-baseline_dir, result_dir = _image_directories()
-if not os.path.exists(baseline_dir):
-    raise SkipTest("Not installed from source, baseline not available. Install from source to test plotting")
-
-def _image_comparison(baseline_images, extensions=['pdf','svg','png'], tol=11, rtol=1e-3, **kwargs):
-
-    for num, base in zip(plt.get_fignums(), baseline_images):
-        for ext in extensions:
-            fig = plt.figure(num)
-            try:
-                fig.canvas.draw()
-            except Exception as e:
-                logging.error(base)
-                #raise SkipTest(e)
-            #fig.axes[0].set_axis_off()
-            #fig.set_frameon(False)
-            if ext in ['npz']:
-                figdict = flatten_axis(fig)
-                np.savez_compressed(os.path.join(result_dir, "{}.{}".format(base, ext)), **figdict)
-                try:
-                    fig.savefig(os.path.join(result_dir, "{}.{}".format(base, 'png')),
-                                transparent=True,
-                                edgecolor='none',
-                                facecolor='none',
-                                #bbox='tight'
-                                )
-                except:
-                    logging.error(base)
-                    # raise
-            else:
-                fig.savefig(os.path.join(result_dir, "{}.{}".format(base, ext)),
-                            transparent=True,
-                            edgecolor='none',
-                            facecolor='none',
-                            #bbox='tight'
-                            )
-    for num, base in zip(plt.get_fignums(), baseline_images):
-        for ext in extensions:
-            #plt.close(num)
-            actual = os.path.join(result_dir, "{}.{}".format(base, ext))
-            expected = os.path.join(baseline_dir, "{}.{}".format(base, ext))
-            if ext == 'npz':
-                def do_test():
-                    if not os.path.exists(expected):
-                        import shutil
-                        shutil.copy2(actual, expected)
-                        #shutil.copy2(os.path.join(result_dir, "{}.{}".format(base, 'png')), os.path.join(baseline_dir, "{}.{}".format(base, 'png')))
-                        raise IOError("Baseline file {} not found, copying result {}".format(expected, actual))
-                    else:
-                        exp_dict = dict(np.load(expected).items())
-                        act_dict = dict(np.load(actual).items())
-                        for name in act_dict:
-                            if name in exp_dict:
-                                try:
-                                    np.testing.assert_allclose(exp_dict[name], act_dict[name], err_msg="Mismatch in {}.{}".format(base, name), rtol=rtol, **kwargs)
-                                except AssertionError as e:
-                                    raise SkipTest(e)
-            else:
-                def do_test():
-                    err = compare_images(expected, actual, tol, in_decorator=True)
-                    if err:
-                        raise SkipTest("Error between {} and {} is {:.5f}, which is bigger then the tolerance of {:.5f}".format(actual, expected, err['rms'], tol))
-            yield do_test
-    plt.close('all')
-
-def flatten_axis(ax, prevname=''):
-    import inspect
-    members = inspect.getmembers(ax)
-
-    arrays = {}
-
-    def _flatten(l, pre):
-        arr = {}
-        if isinstance(l, np.ndarray):
-            if l.size:
-                arr[pre] = np.asarray(l)
-        elif isinstance(l, dict):
-            for _n in l:
-                _tmp = _flatten(l, pre+"."+_n+".")
-                for _nt in _tmp.keys():
-                    arrays[_nt] = _tmp[_nt]
-        elif isinstance(l, list) and len(l)>0:
-            for i in range(len(l)):
-                _tmp = _flatten(l[i], pre+"[{}]".format(i))
-                for _n in _tmp:
-                    arr["{}".format(_n)] = _tmp[_n]
-        else:
-            return flatten_axis(l, pre+'.')
-        return arr
-
-
-    for name, l in members:
-        if isinstance(l, np.ndarray):
-            arrays[prevname+name] = np.asarray(l)
-        elif isinstance(l, list) and len(l)>0:
-            for i in range(len(l)):
-                _tmp = _flatten(l[i], prevname+name+"[{}]".format(i))
-                for _n in _tmp:
-                    arrays["{}".format(_n)] = _tmp[_n]
-
-    return arrays
-
-def _a(x,y,decimal):
-    np.testing.assert_array_almost_equal(x, y, decimal)
-
-def compare_axis_dicts(x, y, decimal=6):
-    try:
-        assert(len(x)==len(y))
-        for name in x:
-            _a(x[name], y[name], decimal)
-    except AssertionError as e:
-        raise SkipTest(e.message)
-
-def test_figure():
-    np.random.seed(1239847)
-    from GPy.plotting import plotting_library as pl
-    #import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    import warnings
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-
-        ax, _ = pl().new_canvas(num="imshow_interact")
-        def test_func(x):
-            return x[:, 0].reshape(3,3)
-        pl().imshow_interact(ax, test_func, extent=(-1,1,-1,1), resolution=3)
-
-        ax, _ = pl().new_canvas()
-        def test_func_2(x):
-            y = x[:, 0].reshape(3,3)
-            anno = np.argmax(x, axis=1).reshape(3,3)
-            return y, anno
-
-        pl().annotation_heatmap_interact(ax, test_func_2, extent=(-1,1,-1,1), resolution=3)
-        pl().annotation_heatmap_interact(ax, test_func_2, extent=(-1,1,-1,1), resolution=3, imshow_kwargs=dict(interpolation='nearest'))
-
-        ax, _ = pl().new_canvas(figsize=(4,3))
-        x = np.linspace(0,1,100)
-        y = [0,1,2]
-        array = np.array([.4,.5])
-        cmap = matplotlib.colors.LinearSegmentedColormap.from_list('WhToColor', ('r', 'b'), N=array.size)
-
-        pl().fill_gradient(ax, x, y, facecolors=['r', 'g'], array=array, cmap=cmap)
-
-        ax, _ = pl().new_canvas(num="3d_plot", figsize=(4,3), projection='3d', xlabel='x', ylabel='y', zlabel='z', title='awsome title', xlim=(-1,1), ylim=(-1,1), zlim=(-3,3))
-        z = 2-np.abs(np.linspace(-2,2,(100)))+1
-        x, y = z*np.sin(np.linspace(-2*np.pi,2*np.pi,(100))), z*np.cos(np.linspace(-np.pi,np.pi,(100)))
-
-        pl().plot(ax, x, y, z, linewidth=2)
-
-        for do_test in _image_comparison(
-                baseline_images=['coverage_{}'.format(sub) for sub in ["imshow_interact",'annotation_interact','gradient','3d_plot',]],
-                extensions=extensions):
-            yield (do_test, )
-
-
-def test_kernel():
-    np.random.seed(1239847)
-    #import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    import warnings
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        k = GPy.kern.RBF(5, ARD=True) * GPy.kern.Linear(3, active_dims=[0,2,4], ARD=True) + GPy.kern.Bias(2)
-        k.randomize()
-        k2 = GPy.kern.RBF(5, ARD=True) * GPy.kern.Linear(3, active_dims=[0,2,4], ARD=True) + GPy.kern.Bias(2) + GPy.kern.White(4)
-        k2[:-1] = k[:]
-        k2.plot_ARD(['rbf', 'linear', 'bias'], legend=True)
-        k2.plot_covariance(visible_dims=[0, 3], plot_limits=(-1,3))
-        k2.plot_covariance(visible_dims=[2], plot_limits=(-1, 3))
-        k2.plot_covariance(visible_dims=[2, 4], plot_limits=((-1, 0), (5, 3)), projection='3d', rstride=10, cstride=10)
-        k2.plot_covariance(visible_dims=[1, 4])
-        for do_test in _image_comparison(
-                baseline_images=['kern_{}'.format(sub) for sub in ["ARD", 'cov_2d', 'cov_1d', 'cov_3d', 'cov_no_lim']],
-                extensions=extensions):
-            yield (do_test, )
-
-def test_plot():
-    np.random.seed(111)
-    import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    import warnings
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        X = np.random.uniform(-2, 2, (40, 1))
-        f = .2 * np.sin(1.3*X) + 1.3*np.cos(2*X)
-        Y = f+np.random.normal(0, .1, f.shape)
-        m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X)*[0.06])
-        #m.optimize()
-        m.plot_data()
-        m.plot_mean()
-        m.plot_confidence()
-        m.plot_density()
-        m.plot_errorbars_trainset()
-        m.plot_samples()
-        m.plot_data_error()
-    for do_test in _image_comparison(baseline_images=['gp_{}'.format(sub) for sub in ["data", "mean", 'conf',
-                                                                                      'density',
-                                                                                      'out_error',
-                                                                                      'samples', 'in_error']], extensions=extensions):
-        yield (do_test, )
-
-def test_twod():
-    np.random.seed(11111)
-    import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    X = np.random.uniform(-2, 2, (40, 2))
-    f = .2 * np.sin(1.3*X[:,[0]]) + 1.3*np.cos(2*X[:,[1]])
-    Y = f+np.random.normal(0, .1, f.shape)
-    m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X)*[0.01, 0.2])
-    #m.optimize()
-    m.plot_data()
-    m.plot_mean()
-    m.plot_inducing(legend=False, marker='s')
-    #m.plot_errorbars_trainset()
-    m.plot_data_error()
-    for do_test in _image_comparison(baseline_images=['gp_2d_{}'.format(sub) for sub in ["data", "mean",
-                                                                                         'inducing',
-                                                                                         #'out_error',
-                                                                                         'in_error',
-                                                                                         ]], extensions=extensions):
-        yield (do_test, )
-
-def test_threed():
-    np.random.seed(11111)
-    import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    X = np.random.uniform(-2, 2, (40, 2))
-    f = .2 * np.sin(1.3*X[:,[0]]) + 1.3*np.cos(2*X[:,[1]])
-    Y = f+np.random.normal(0, .1, f.shape)
-    m = GPy.models.SparseGPRegression(X, Y)
-    m.likelihood.variance = .1
-    #m.optimize()
-    m.plot_samples(projection='3d', samples=1)
-    m.plot_samples(projection='3d', plot_raw=False, samples=1)
-    plt.close('all')
-    m.plot_data(projection='3d')
-    m.plot_mean(projection='3d', rstride=10, cstride=10)
-    m.plot_inducing(projection='3d')
-    #m.plot_errorbars_trainset(projection='3d')
-    for do_test in _image_comparison(baseline_images=[
-        'gp_3d_{}'.format(sub) for sub in ["data", "mean", 'inducing',
-    ]], extensions=extensions):
-        yield (do_test, )
-
-def test_sparse():
-    np.random.seed(11111)
-    import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    X = np.random.uniform(-2, 2, (40, 1))
-    f = .2 * np.sin(1.3*X) + 1.3*np.cos(2*X)
-    Y = f+np.random.normal(0, .1, f.shape)
-    m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X)*0.1)
-    #m.optimize()
-    #m.plot_inducing()
-    _, ax = plt.subplots()
-    m.plot_data(ax=ax)
-    m.plot_data_error(ax=ax)
-    for do_test in _image_comparison(baseline_images=['sparse_gp_{}'.format(sub) for sub in ['data_error']], extensions=extensions):
-        yield (do_test, )
-
-def test_classification():
-    np.random.seed(11111)
-    import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    X = np.random.uniform(-2, 2, (40, 1))
-    f = .2 * np.sin(1.3*X) + 1.3*np.cos(2*X)
-    Y = f+np.random.normal(0, .1, f.shape)
-    m = GPy.models.GPClassification(X, Y>Y.mean())
-    #m.optimize()
-    _, ax = plt.subplots()
-    m.plot(plot_raw=False, apply_link=False, ax=ax, samples=3)
-    m.plot_errorbars_trainset(plot_raw=False, apply_link=False, ax=ax)
-    _, ax = plt.subplots()
-    m.plot(plot_raw=True, apply_link=False, ax=ax, samples=3)
-    m.plot_errorbars_trainset(plot_raw=True, apply_link=False, ax=ax)
-    _, ax = plt.subplots()
-    m.plot(plot_raw=True, apply_link=True, ax=ax, samples=3)
-    m.plot_errorbars_trainset(plot_raw=True, apply_link=True, ax=ax)
-    for do_test in _image_comparison(baseline_images=['gp_class_{}'.format(sub) for sub in ["likelihood", "raw", 'raw_link']], extensions=extensions):
-        yield (do_test, )
-
-
-def test_sparse_classification():
-    np.random.seed(11111)
-    import matplotlib
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    X = np.random.uniform(-2, 2, (40, 1))
-    f = .2 * np.sin(1.3*X) + 1.3*np.cos(2*X)
-    Y = f+np.random.normal(0, .1, f.shape)
-    m = GPy.models.SparseGPClassification(X, Y>Y.mean())
-    #m.optimize()
-    m.plot(plot_raw=False, apply_link=False, samples_likelihood=3)
-    np.random.seed(111)
-    m.plot(plot_raw=True, apply_link=False, samples=3)
-    np.random.seed(111)
-    m.plot(plot_raw=True, apply_link=True, samples=3)
-    for do_test in _image_comparison(baseline_images=['sparse_gp_class_{}'.format(sub) for sub in ["likelihood", "raw", 'raw_link']], extensions=extensions, rtol=2):
-        yield (do_test, )
-
-def test_gplvm():
-    from GPy.models import GPLVM
-    np.random.seed(12345)
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    #Q = 3
-    # Define dataset
-    #N = 60
-    #k1 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,10,10,0.1,0.1]), ARD=True)
-    #k2 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,0.1,10,0.1,10]), ARD=True)
-    #k3 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[0.1,0.1,10,10,10]), ARD=True)
-    #X = np.random.normal(0, 1, (N, 5))
-    #A = np.random.multivariate_normal(np.zeros(N), k1.K(X), Q).T
-    #B = np.random.multivariate_normal(np.zeros(N), k2.K(X), Q).T
-    #C = np.random.multivariate_normal(np.zeros(N), k3.K(X), Q).T
-    #Y = np.vstack((A,B,C))
-    #labels = np.hstack((np.zeros(A.shape[0]), np.ones(B.shape[0]), np.ones(C.shape[0])*2))
-
-    #k = RBF(Q, ARD=True, lengthscale=2)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
-    pars = np.load(os.path.join(basedir, 'b-gplvm-save.npz'))
-    Y = pars['Y']
-    Q = pars['Q']
-    labels = pars['labels']
-
-    import warnings
-    with warnings.catch_warnings(record=True) as w:
-        warnings.simplefilter('always')  # always print
-        m = GPLVM(Y, Q, initialize=False)
-    m.update_model(False)
-    m.initialize_parameter()
-    m[:] = pars['gplvm_p']
-    m.update_model(True)
-
-    #m.optimize(messages=0)
-    np.random.seed(111)
-    m.plot_latent(labels=labels)
-    np.random.seed(111)
-    m.plot_scatter(projection='3d', labels=labels)
-    np.random.seed(111)
-    m.plot_magnification(labels=labels)
-    m.plot_steepest_gradient_map(resolution=10, data_labels=labels)
-    for do_test in _image_comparison(baseline_images=['gplvm_{}'.format(sub) for sub in ["latent", "latent_3d", "magnification", 'gradient']],
-                                     extensions=extensions,
-                                     tol=12):
-        yield (do_test, )
-
-def test_bayesian_gplvm():
-    from ..models import BayesianGPLVM
-    np.random.seed(12345)
-    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
-    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
-    matplotlib.rcParams[u'text.usetex'] = False
-    #Q = 3
-    # Define dataset
-    #N = 10
-    #k1 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,10,10,0.1,0.1]), ARD=True)
-    #k2 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,0.1,10,0.1,10]), ARD=True)
-    #k3 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[0.1,0.1,10,10,10]), ARD=True)
-    #X = np.random.normal(0, 1, (N, 5))
-    #A = np.random.multivariate_normal(np.zeros(N), k1.K(X), Q).T
-    #B = np.random.multivariate_normal(np.zeros(N), k2.K(X), Q).T
-    #C = np.random.multivariate_normal(np.zeros(N), k3.K(X), Q).T
-
-    #Y = np.vstack((A,B,C))
-    #labels = np.hstack((np.zeros(A.shape[0]), np.ones(B.shape[0]), np.ones(C.shape[0])*2))
-
-    #k = RBF(Q, ARD=True, lengthscale=2)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
-    pars = np.load(os.path.join(basedir, 'b-gplvm-save.npz'))
-    Y = pars['Y']
-    Q = pars['Q']
-    labels = pars['labels']
-
-    import warnings
-    with warnings.catch_warnings(record=True) as w:
-        warnings.simplefilter('always')  # always print
-        m = BayesianGPLVM(Y, Q, initialize=False)
-    m.update_model(False)
-    m.initialize_parameter()
-    m[:] = pars['bgplvm_p']
-    m.update_model(True)
-
-    #m.optimize(messages=0)
-    np.random.seed(111)
-    m.plot_inducing(projection='2d')
-    np.random.seed(111)
-    m.plot_inducing(projection='3d')
-    np.random.seed(111)
-    m.plot_latent(projection='2d', labels=labels)
-    np.random.seed(111)
-    m.plot_scatter(projection='3d', labels=labels)
-    np.random.seed(111)
-    m.plot_magnification(labels=labels)
-    np.random.seed(111)
-    m.plot_steepest_gradient_map(resolution=10, data_labels=labels)
-    for do_test in _image_comparison(baseline_images=['bayesian_gplvm_{}'.format(sub) for sub in ["inducing", "inducing_3d", "latent", "latent_3d", "magnification", 'gradient']], extensions=extensions):
-        yield (do_test, )
-
-if __name__ == '__main__':
-    import nose
-    nose.main(defaultTest='./plotting_tests.py')
--- a/GPy/testing/run_coverage.sh
+++ b/GPy/testing/run_coverage.sh
@ -1 +1 @@
-nosetests . --with-coverage --logging-level=INFO --cover-html --cover-html-dir=coverage --cover-package=GPy --cover-erase
+pytest .
--- a/GPy/testing/rv_transformation_tests.py
+++ b/GPy/testing/rv_transformation_tests.py
@ -1,117 +0,0 @@
-# Written by Ilias Bilionis
-"""
-Test if hyperparameters in models are properly transformed.
-"""
-
-
-import unittest
-import numpy as np
-import scipy.stats as st
-import GPy
-
-
-class TestModel(GPy.core.Model):
-    """
-    A simple GPy model with one parameter.
-    """
-    def __init__(self, theta=1.):
-        super(TestModel, self).__init__('test_model')
-        theta = GPy.core.Param('theta', theta)
-        self.link_parameter(theta)
-
-    def log_likelihood(self):
-        return 0.
-
-
-class RVTransformationTestCase(unittest.TestCase):
-
-    def _test_trans(self, trans):
-        m = TestModel()
-        prior = GPy.priors.LogGaussian(.5, 0.1)
-        m.theta.set_prior(prior)
-        m.theta.unconstrain()
-        m.theta.constrain(trans)
-        # The PDF of the transformed variables
-        p_phi = lambda phi : np.exp(-m._objective_grads(phi)[0])
-        # To the empirical PDF of:
-        theta_s = prior.rvs(1e5)
-        phi_s = trans.finv(theta_s)
-        # which is essentially a kernel density estimation
-        kde = st.gaussian_kde(phi_s)
-        # We will compare the PDF here:
-        phi = np.linspace(phi_s.min(), phi_s.max(), 100)
-        # The transformed PDF of phi should be this:
-        pdf_phi = np.array([p_phi(p) for p in phi])
-        # UNCOMMENT TO SEE GRAPHICAL COMPARISON
-        #import matplotlib.pyplot as plt
-        #fig, ax = plt.subplots()
-        #ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Histogram')
-        #ax.plot(phi, kde(phi), '--', linewidth=2, label='Kernel Density Estimation')
-        #ax.plot(phi, pdf_phi, ':', linewidth=2, label='Transformed PDF')
-        #ax.set_xlabel(r'transformed $\theta$', fontsize=16)
-        #ax.set_ylabel('PDF', fontsize=16)
-        #plt.legend(loc='best')
-        #plt.show(block=True)
-        # END OF PLOT
-        # The following test cannot be very accurate
-        self.assertTrue(np.linalg.norm(pdf_phi - kde(phi)) / np.linalg.norm(kde(phi)) <= 1e-1)
-
-    def _test_grad(self, trans):
-        np.random.seed(1234)
-        m = TestModel(np.random.uniform(.5, 1.5, 20))
-        prior = GPy.priors.LogGaussian(.5, 0.1)
-        m.theta.set_prior(prior)
-        m.theta.constrain(trans)
-        m.randomize()
-        print(m)
-        self.assertTrue(m.checkgrad(1))
-
-    def test_Logexp(self):
-        self._test_trans(GPy.constraints.Logexp())
-
-    @unittest.skip("Gradient not checking right, @jameshensman what is going on here?")
-    def test_Logexp_grad(self):        
-        self._test_grad(GPy.constraints.Logexp())
-        
-    def test_Exponent(self):
-        self._test_trans(GPy.constraints.Exponent())
-    
-    @unittest.skip("Gradient not checking right, @jameshensman what is going on here?")
-    def test_Exponent_grad(self):
-        self._test_grad(GPy.constraints.Exponent())
-
-
-if __name__ == '__main__':
-    unittest.main()
-    quit()
-    m = TestModel()
-    prior = GPy.priors.LogGaussian(0., .9)
-    m.theta.set_prior(prior)
-
-    # The following should return the PDF in terms of the transformed quantities
-    p_phi = lambda phi : np.exp(-m._objective_grads(phi)[0])
-
-    # Let's look at the transformation phi = log(exp(theta - 1))
-    trans = GPy.constraints.Exponent()
-    m.theta.constrain(trans)
-    # Plot the transformed probability density
-    phi = np.linspace(-8, 8, 100)
-    fig, ax = plt.subplots()
-    # Let's draw some samples of theta and transform them so that we see
-    # which one is right
-    theta_s = prior.rvs(10000)
-    # Transform it to the new variables
-    phi_s = trans.finv(theta_s)
-    # And draw their histogram
-    ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Empirical')
-    # This is to be compared to the PDF of the model expressed in terms of these new
-    # variables
-    ax.plot(phi, [p_phi(p) for p in phi], label='Transformed PDF', linewidth=2)
-    ax.set_xlim(-3, 10)
-    ax.set_xlabel(r'transformed $\theta$', fontsize=16)
-    ax.set_ylabel('PDF', fontsize=16)
-    plt.legend(loc='best')
-    # Now let's test the gradients
-    m.checkgrad(verbose=True)
-    # And show the plot
-    plt.show(block=True)
--- a/GPy/testing/serialization_tests.py
+++ b/GPy/testing/serialization_tests.py
@ -1,279 +0,0 @@
-'''
-Created on 20 April 2017
-
-@author: pgmoren
-'''
-import unittest, itertools
-#import cPickle as pickle
-import pickle
-import numpy as np
-import tempfile
-import GPy
-from nose import SkipTest
-import numpy as np
-import os
-fixed_seed = 11
-
-
-class Test(unittest.TestCase):
-    def test_serialize_deserialize_kernels(self):
-        k1 = GPy.kern.RBF(2, variance=1.0, lengthscale=[1.0,1.0], ARD=True)
-        k2 = GPy.kern.RatQuad(2, variance=2.0, lengthscale=1.0, power=2.0, active_dims = [0,1])
-        k3 = GPy.kern.Bias(2, variance=2.0, active_dims = [1,0])
-        k4 = GPy.kern.StdPeriodic(2, variance=2.0, lengthscale=1.0, period=1.0, active_dims = [1,1])
-        k5 = GPy.kern.Linear(2, variances=[2.0, 1.0], ARD=True, active_dims = [1,1])
-        k6 = GPy.kern.Exponential(2, variance=1., lengthscale=2)
-        k7 = GPy.kern.Matern32(2, variance=1.0, lengthscale=[1.0,3.0], ARD=True, active_dims = [1,1])
-        k8 = GPy.kern.Matern52(2, variance=2.0, lengthscale=[2.0,1.0], ARD=True, active_dims = [1,0])
-        k9 = GPy.kern.ExpQuad(2, variance=3.0, lengthscale=[1.0,2.0], ARD=True, active_dims = [0,1])
-        k10 = GPy.kern.OU(2, variance=2.0, lengthscale=[2.0, 1.0], ARD=True, active_dims=[1, 0])
-        k11 = k1 + k1.copy() + k2 + k3 + k4 + k5 + k6
-        k12 = k1 * k2 * k2.copy() * k3 * k4 * k5
-        k13 = (k1 + k2) * (k3 + k4 + k5)
-        k14 = ((k1 + k2) * k3) + k4 + k5 * k7
-        k15 = ((k1 + k2) * k3) + k4 * k5 + k8 * k10
-        k16 = ((k1 * k2) * k3) + k4 * k5 + k8 + k9
-
-        k_list = [k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15,k16]
-
-        for kk in k_list:
-            kk_dict = kk.to_dict()
-            kk_r = GPy.kern.Kern.from_dict(kk_dict)
-            assert type(kk) == type(kk_r)
-            np.testing.assert_array_equal(kk[:], kk_r[:])
-            np.testing.assert_array_equal(np.array(kk.active_dims), np.array(kk_r.active_dims))
-
-    def test_serialize_deserialize_mappings(self):
-        m1 = GPy.mappings.Identity(3,2)
-        m2 = GPy.mappings.Constant(3,2,1)
-        m2_r = GPy.core.mapping.Mapping.from_dict(m2.to_dict())
-        np.testing.assert_array_equal(m2.C.values[:], m2_r.C.values[:])
-        m3 = GPy.mappings.Linear(3,2)
-        m3_r = GPy.core.mapping.Mapping.from_dict(m3.to_dict())
-        assert np.all(m3.A == m3_r.A)
-
-        m_list = [m1, m2, m3]
-        for mm in m_list:
-            mm_dict = mm.to_dict()
-            mm_r = GPy.core.mapping.Mapping.from_dict(mm_dict)
-            assert type(mm) == type(mm_r)
-            assert type(mm.input_dim) == type(mm_r.input_dim)
-            assert type(mm.output_dim) == type(mm_r.output_dim)
-
-    def test_serialize_deserialize_likelihoods(self):
-        l1 = GPy.likelihoods.Gaussian(GPy.likelihoods.link_functions.Identity(),variance=3.0)
-        l1_r = GPy.likelihoods.likelihood.Likelihood.from_dict(l1.to_dict())
-        l2 = GPy.likelihoods.Bernoulli(GPy.likelihoods.link_functions.Probit())
-        l2_r = GPy.likelihoods.likelihood.Likelihood.from_dict(l2.to_dict())
-        assert type(l1) == type(l1_r)
-        assert np.all(l1.variance == l1_r.variance)
-        assert type(l2) == type(l2_r)
-
-    def test_serialize_deserialize_normalizers(self):
-        n1 = GPy.util.normalizer.Standardize()
-        n1.scale_by(np.random.rand(10))
-        n1_r = GPy.util.normalizer._Norm.from_dict((n1.to_dict()))
-        assert type(n1) == type(n1_r)
-        assert np.all(n1.mean == n1_r.mean)
-        assert np.all(n1.std == n1_r.std)
-
-    def test_serialize_deserialize_link_functions(self):
-        l1 = GPy.likelihoods.link_functions.Identity()
-        l2 = GPy.likelihoods.link_functions.Probit()
-        l_list = [l1, l2]
-        for ll in l_list:
-            ll_dict = ll.to_dict()
-            ll_r = GPy.likelihoods.link_functions.GPTransformation.from_dict(ll_dict)
-            assert type(ll) == type(ll_r)
-
-    def test_serialize_deserialize_inference_methods(self):
-
-        e1 = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode="nested")
-        e1.ga_approx_old = GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10))
-        e1._ep_approximation = []
-        e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.posteriorParams(np.random.rand(10),np.random.rand(100).reshape((10,10))))
-        e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10)))
-        e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.cavityParams(10))
-        e1._ep_approximation[-1].v = np.random.rand(10)
-        e1._ep_approximation[-1].tau = np.random.rand(10)
-        e1._ep_approximation.append(np.random.rand(10))
-        e1_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e1.to_dict())
-
-        assert type(e1) == type(e1_r)
-        assert e1.epsilon==e1_r.epsilon
-        assert e1.eta==e1_r.eta
-        assert e1.delta==e1_r.delta
-        assert e1.always_reset==e1_r.always_reset
-        assert e1.max_iters==e1_r.max_iters
-        assert e1.ep_mode==e1_r.ep_mode
-        assert e1.parallel_updates==e1_r.parallel_updates
-
-        np.testing.assert_array_equal(e1.ga_approx_old.tau[:], e1_r.ga_approx_old.tau[:])
-        np.testing.assert_array_equal(e1.ga_approx_old.v[:], e1_r.ga_approx_old.v[:])
-        np.testing.assert_array_equal(e1._ep_approximation[0].mu[:], e1_r._ep_approximation[0].mu[:])
-        np.testing.assert_array_equal(e1._ep_approximation[0].Sigma[:], e1_r._ep_approximation[0].Sigma[:])
-        np.testing.assert_array_equal(e1._ep_approximation[1].tau[:], e1_r._ep_approximation[1].tau[:])
-        np.testing.assert_array_equal(e1._ep_approximation[1].v[:], e1_r._ep_approximation[1].v[:])
-        np.testing.assert_array_equal(e1._ep_approximation[2].tau[:], e1_r._ep_approximation[2].tau[:])
-        np.testing.assert_array_equal(e1._ep_approximation[2].v[:], e1_r._ep_approximation[2].v[:])
-        np.testing.assert_array_equal(e1._ep_approximation[3][:], e1_r._ep_approximation[3][:])
-
-        e2 = GPy.inference.latent_function_inference.expectation_propagation.EPDTC(ep_mode="nested")
-        e2.ga_approx_old = GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10))
-        e2._ep_approximation = []
-        e2._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.posteriorParamsDTC(np.random.rand(10),np.random.rand(10)))
-        e2._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10)))
-        e2._ep_approximation.append(100.0)
-        e2_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e2.to_dict())
-
-        assert type(e2) == type(e2_r)
-        assert e2.epsilon==e2_r.epsilon
-        assert e2.eta==e2_r.eta
-        assert e2.delta==e2_r.delta
-        assert e2.always_reset==e2_r.always_reset
-        assert e2.max_iters==e2_r.max_iters
-        assert e2.ep_mode==e2_r.ep_mode
-        assert e2.parallel_updates==e2_r.parallel_updates
-
-        np.testing.assert_array_equal(e2.ga_approx_old.tau[:], e2_r.ga_approx_old.tau[:])
-        np.testing.assert_array_equal(e2.ga_approx_old.v[:], e2_r.ga_approx_old.v[:])
-        np.testing.assert_array_equal(e2._ep_approximation[0].mu[:], e2_r._ep_approximation[0].mu[:])
-        np.testing.assert_array_equal(e2._ep_approximation[0].Sigma_diag[:], e2_r._ep_approximation[0].Sigma_diag[:])
-        np.testing.assert_array_equal(e2._ep_approximation[1].tau[:], e2_r._ep_approximation[1].tau[:])
-        np.testing.assert_array_equal(e2._ep_approximation[1].v[:], e2_r._ep_approximation[1].v[:])
-        assert(e2._ep_approximation[2] == e2_r._ep_approximation[2])
-
-        e3 = GPy.inference.latent_function_inference.exact_gaussian_inference.ExactGaussianInference()
-        e3_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e3.to_dict())
-
-        assert type(e3) == type(e3_r)
-
-
-    def test_serialize_deserialize_GP(self):
-        np.random.seed(fixed_seed)
-        N = 20
-        Nhalf = int(N/2)
-        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
-        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
-        kernel = GPy.kern.RBF(1)
-        likelihood = GPy.likelihoods.Bernoulli()
-        inference_method=GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode="nested")
-        mean_function=None
-
-        m = GPy.core.GP(X=X, Y=Y,  kernel=kernel, likelihood=likelihood, inference_method=inference_method, mean_function=mean_function, normalizer=True, name='gp_classification')
-        m.optimize()
-        m.save_model("temp_test_gp_with_data.json", compress=True, save_data=True)
-        m.save_model("temp_test_gp_without_data.json", compress=True, save_data=False)
-        m1_r = GPy.core.GP.load_model("temp_test_gp_with_data.json.zip")
-        m2_r = GPy.core.GP.load_model("temp_test_gp_without_data.json.zip", (X,Y))
-        os.remove("temp_test_gp_with_data.json.zip")
-        os.remove("temp_test_gp_without_data.json.zip")
-        var = m.predict(X)[0]
-        var1_r = m1_r.predict(X)[0]
-        var2_r = m2_r.predict(X)[0]
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var2_r).flatten())
-
-    def test_serialize_deserialize_SparseGP(self):
-        np.random.seed(fixed_seed)
-        N = 20
-        Nhalf = int(N/2)
-        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
-        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
-        kernel = GPy.kern.RBF(1)
-        likelihood = GPy.likelihoods.Bernoulli()
-        inference_method=GPy.inference.latent_function_inference.expectation_propagation.EPDTC(ep_mode="nested")
-        mean_function=None
-
-        sm = GPy.core.SparseGP(X=X, Y=Y, Z=X[0:20,:], kernel=kernel, likelihood=likelihood, inference_method=inference_method, mean_function=mean_function, normalizer=True, name='sparse_gp_classification')
-        sm.optimize()
-        sm.save_model("temp_test_gp_with_data.json", compress=True, save_data=True)
-        sm.save_model("temp_test_gp_without_data.json", compress=True, save_data=False)
-        sm1_r = GPy.core.GP.load_model("temp_test_gp_with_data.json.zip")
-        sm2_r = GPy.core.GP.load_model("temp_test_gp_without_data.json.zip", (X,Y))
-        os.remove("temp_test_gp_with_data.json.zip")
-        os.remove("temp_test_gp_without_data.json.zip")
-        var = sm.predict(X)[0]
-        var1_r = sm1_r.predict(X)[0]
-        var2_r = sm2_r.predict(X)[0]
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var2_r).flatten())
-
-    def test_serialize_deserialize_GPRegressor(self):
-        np.random.seed(fixed_seed)
-        N = 50
-        N_new = 50
-        D = 1
-        X = np.random.uniform(-3., 3., (N, 1))
-        Y = np.sin(X) + np.random.randn(N, D) * 0.05
-        X_new = np.random.uniform(-3., 3., (N_new, 1))
-        k = GPy.kern.RBF(input_dim=1, lengthscale=10)
-        m = GPy.models.GPRegression(X,Y,k)
-        m.optimize()
-        m.save_model("temp_test_gp_regressor_with_data.json", compress=True, save_data=True)
-        m.save_model("temp_test_gp_regressor_without_data.json", compress=True, save_data=False)
-        m1_r = GPy.models.GPRegression.load_model("temp_test_gp_regressor_with_data.json.zip")
-        m2_r = GPy.models.GPRegression.load_model("temp_test_gp_regressor_without_data.json.zip", (X,Y))
-        os.remove("temp_test_gp_regressor_with_data.json.zip")
-        os.remove("temp_test_gp_regressor_without_data.json.zip")
-
-        Xp = np.random.uniform(size=(int(1e5),1))
-        Xp[:,0] = Xp[:,0]*15-5
-
-        _, var = m.predict(Xp)
-        _, var1_r = m1_r.predict(Xp)
-        _, var2_r = m2_r.predict(Xp)
-        np.testing.assert_array_equal(var.flatten(), var1_r.flatten())
-        np.testing.assert_array_equal(var.flatten(), var2_r.flatten())
-
-    def test_serialize_deserialize_GPClassification(self):
-        np.random.seed(fixed_seed)
-        N = 50
-        Nhalf = int(N/2)
-        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
-        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
-        kernel = GPy.kern.RBF(1)
-        m = GPy.models.GPClassification(X, Y, kernel=kernel)
-        m.optimize()
-        m.save_model("temp_test_gp_classifier_with_data.json", compress=True, save_data=True)
-        m.save_model("temp_test_gp_classifier_without_data.json", compress=True, save_data=False)
-        m1_r = GPy.models.GPClassification.load_model("temp_test_gp_classifier_with_data.json.zip")
-        self.assertTrue(type(m) == type(m1_r), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m1_r)))
-        m2_r = GPy.models.GPClassification.load_model("temp_test_gp_classifier_without_data.json.zip", (X,Y))
-        self.assertTrue(type(m) == type(m2_r), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m2_r)))
-        os.remove("temp_test_gp_classifier_with_data.json.zip")
-        os.remove("temp_test_gp_classifier_without_data.json.zip")
-
-        var = m.predict(X)[0]
-        var1_r = m1_r.predict(X)[0]
-        var2_r = m2_r.predict(X)[0]
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
-
-    def test_serialize_deserialize_SparseGPClassification(self):
-        np.random.seed(fixed_seed)
-        N = 50
-        Nhalf = int(N/2)
-        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
-        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
-        kernel = GPy.kern.RBF(1)
-        m = GPy.models.SparseGPClassification(X, Y, num_inducing=3, kernel=kernel)
-        m.optimize()
-        m.save_model("temp_test_sparse_gp_classifier_with_data.json", compress=True, save_data=True)
-        m.save_model("temp_test_sparse_gp_classifier_without_data.json", compress=True, save_data=False)
-        m1_r = GPy.models.SparseGPClassification.load_model("temp_test_sparse_gp_classifier_with_data.json.zip")
-        self.assertTrue(type(m) == type(m1_r), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m1_r)))
-        m2_r = GPy.models.SparseGPClassification.load_model("temp_test_sparse_gp_classifier_without_data.json.zip", (X,Y))
-        self.assertTrue(type(m) == type(m2_r), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m2_r)))
-        os.remove("temp_test_sparse_gp_classifier_with_data.json.zip")
-        os.remove("temp_test_sparse_gp_classifier_without_data.json.zip")
-
-        var = m.predict(X)[0]
-        var1_r = m1_r.predict(X)[0]
-        var2_r = m2_r.predict(X)[0]
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
-        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
-
-if __name__ == "__main__":
-    #import sys;sys.argv = ['', 'Test.test_parameter_index_operations']
-    unittest.main()
--- a/GPy/testing/state_space_main_tests.py
+++ b/GPy/testing/state_space_main_tests.py
--- a/GPy/testing/svgp_tests.py
+++ b/GPy/testing/svgp_tests.py
@ -1,54 +0,0 @@
-import numpy as np
-import scipy as sp
-import GPy
-
-class SVGP_nonconvex(np.testing.TestCase):
-    """
-    Inference in the SVGP with a student-T likelihood
-    """
-    def setUp(self):
-        X = np.linspace(0,10,100).reshape(-1,1)
-        Z = np.linspace(0,10,10).reshape(-1,1)
-        Y = np.sin(X) + np.random.randn(*X.shape)*0.1
-        Y[50] += 3
-
-        lik = GPy.likelihoods.StudentT(deg_free=2)
-        k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
-        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
-    def test_grad(self):
-        assert self.m.checkgrad(step=1e-4)
-
-class SVGP_classification(np.testing.TestCase):
-    """
-    Inference in the SVGP with a Bernoulli likelihood
-    """
-    def setUp(self):
-        X = np.linspace(0,10,100).reshape(-1,1)
-        Z = np.linspace(0,10,10).reshape(-1,1)
-        Y = np.where((np.sin(X) + np.random.randn(*X.shape)*0.1)>0, 1,0)
-
-        lik = GPy.likelihoods.Bernoulli()
-        k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
-        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
-    def test_grad(self):
-        assert self.m.checkgrad(step=1e-4)
-
-class SVGP_Poisson_with_meanfunction(np.testing.TestCase):
-    """
-    Inference in the SVGP with a Bernoulli likelihood
-    """
-    def setUp(self):
-        X = np.linspace(0,10,100).reshape(-1,1)
-        Z = np.linspace(0,10,10).reshape(-1,1)
-        latent_f = np.exp(0.1*X * 0.05*X**2)
-        Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1,1)
-
-        mf = GPy.mappings.Linear(1,1)
-
-        lik = GPy.likelihoods.Poisson()
-        k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
-        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
-    def test_grad(self):
-        assert self.m.checkgrad(step=1e-4)
-
-
--- a/GPy/testing/test_cython.py
+++ b/GPy/testing/test_cython.py
@ -0,0 +1,118 @@
+import numpy as np
+from GPy.util import choleskies
+import GPy
+import pytest
+
+from ..util.config import config
+
+try:
+    from ..util import choleskies_cython
+
+    choleskies_cython_working = config.getboolean("cython", "working")
+except ImportError:
+    choleskies_cython_working = False
+
+try:
+    from ..kern.src import stationary_cython
+
+    stationary_cython_working = config.getboolean("cython", "working")
+except ImportError:
+    stationary_cython_working = False
+
+"""
+These tests make sure that the pure python and cython codes work the same
+"""
+
+
+class CythonTestChols:
+    def setup(self):
+        self.flat = np.random.randn(45, 5)
+        self.triang = np.array([np.eye(20) for i in range(3)])
+
+    @pytest.mark.skipif(
+        not choleskies_cython_working,
+        "Cython cholesky module has not been built on this machine",
+    )
+    def test_flat_to_triang(self):
+        L1 = choleskies._flat_to_triang_pure(self.flat)
+        L2 = choleskies._flat_to_triang_cython(self.flat)
+        assert np.allclose(L1, L2), "Triang mismatch!"
+
+    @pytest.mark.skipif(
+        not choleskies_cython_working,
+        "Cython cholesky module has not been built on this machine",
+    )
+    def test_triang_to_flat(self):
+        A1 = choleskies._triang_to_flat_pure(self.triang)
+        A2 = choleskies._triang_to_flat_cython(self.triang)
+        assert np.allclose(A1, A2), "Flat mismatch!"
+
+
+class TestStationary:
+    def setup(self):
+        self.k = GPy.kern.RBF(10)
+        self.X = np.random.randn(300, 10)
+        self.Z = np.random.randn(20, 10)
+        self.dKxx = np.random.randn(300, 300)
+        self.dKzz = np.random.randn(20, 20)
+        self.dKxz = np.random.randn(300, 20)
+
+    @pytest.mark.skipif(
+        not stationary_cython_working,
+        reason="Cython stationary module has not been built on this machine",
+    )
+    def test_square_gradX(self):
+        self.setup()
+        g1 = self.k._gradients_X_cython(self.dKxx, self.X)
+        g2 = self.k._gradients_X_pure(self.dKxx, self.X)
+        assert np.allclose(g1, g2), "Gradient mismatch on square X!"
+
+    @pytest.mark.skipif(
+        not stationary_cython_working,
+        reason="Cython stationary module has not been built on this machine",
+    )
+    def test_rect_gradx(self):
+        self.setup()
+        g1 = self.k._gradients_X_cython(self.dKxz, self.X, self.Z)
+        g2 = self.k._gradients_X_pure(self.dKxz, self.X, self.Z)
+        assert np.allclose(g1, g2), "Gradient mismatch on rect X!"
+
+    @pytest.mark.skipif(
+        not stationary_cython_working,
+        reason="Cython stationary module has not been built on this machine",
+    )
+    def test_square_lengthscales(self):
+        self.setup()
+        g1 = self.k._lengthscale_grads_pure(self.dKxx, self.X, self.X)
+        g2 = self.k._lengthscale_grads_cython(self.dKxx, self.X, self.X)
+        assert np.allclose(g1, g2), "Gradient mismatch on square lengthscale!"
+
+    @pytest.mark.skipif(
+        not stationary_cython_working,
+        reason="Cython stationary module has not been built on this machine",
+    )
+    def test_rect_lengthscales(self):
+        self.setup()
+        g1 = self.k._lengthscale_grads_pure(self.dKxz, self.X, self.Z)
+        g2 = self.k._lengthscale_grads_cython(self.dKxz, self.X, self.Z)
+        assert np.allclose(g1, g2), "Gradient mismatch on rect lengthscale!"
+
+
+class TestCholeskiesBackprop:
+    def setup(self):
+        a = np.random.randn(10, 12)
+        A = a.dot(a.T)
+        self.L = GPy.util.linalg.jitchol(A)
+        self.dL = np.random.randn(10, 10)
+
+    @pytest.mark.skipif(
+        not choleskies_cython_working,
+        reason="Cython cholesky module has not been built on this machine",
+    )
+    def test_backprop(self):
+        self.setup()
+        r1 = choleskies._backprop_gradient_pure(self.dL, self.L)
+        r2 = choleskies_cython.backprop_gradient(self.dL, self.L)
+        r3 = choleskies_cython.backprop_gradient_par_c(self.dL, self.L)
+        assert np.allclose(r1, r2), "Gradient mismatch!"
+        assert np.allclose(r1, r3), "Gradient mismatch!"
--- a/GPy/testing/ep_likelihood_tests.py
+++ b/GPy/testing/ep_likelihood_tests.py
@ -1,17 +1,19 @@
-
+import pytest
 import numpy as np
-import unittest
 import GPy
-from GPy.models import GradientChecker
+

 fixed_seed = 10
-from nose.tools import with_setup, nottest
+
+
+def rmse(Y, Ystar):
+    return np.sqrt(np.mean((Y - Ystar) ** 2))


 # this file will contain some high level tests, this is not unit testing, but will give us a higher level estimate
 # if things are going well under the hood.
-class TestObservationModels(unittest.TestCase):
-    def setUp(self):
+class TestObservationModels:
+    def setup(self):
        np.random.seed(fixed_seed)
        self.N = 100
        self.D = 2
@ -22,7 +24,7 @@ class TestObservationModels(unittest.TestCase):
        self.Y = (np.sin(self.X[:, 0] * 2 * np.pi) + noise)[:, None]
        self.num_points = self.X.shape[0]
        self.f = np.random.rand(self.N, 1)
-        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
+        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=int)[:, None]
        # self.binary_Y[self.binary_Y == 0.0] = -1.0
        self.positive_Y = np.exp(self.Y.copy())

@ -31,45 +33,72 @@ class TestObservationModels(unittest.TestCase):
        self.Y_noisy[75] += 1.3

        self.init_var = 0.15
-        self.deg_free = 4.
+        self.deg_free = 4.0
        censored = np.zeros_like(self.Y)
        random_inds = np.random.choice(self.N, int(self.N / 2), replace=True)
        censored[random_inds] = 1
        self.Y_metadata = dict()
-        self.Y_metadata['censored'] = censored
+        self.Y_metadata["censored"] = censored
        self.kernel1 = GPy.kern.RBF(self.X.shape[1]) + GPy.kern.White(self.X.shape[1])

-    def tearDown(self):
+    def tear_down(self):
        self.Y = None
        self.X = None
-        self.binary_Y =None
+        self.binary_Y = None
        self.positive_Y = None
        self.kernel1 = None

-    @with_setup(setUp, tearDown)
-    def testEPClassification(self):
+    def test_epccassification(self):
+        self.setup()
+
        bernoulli = GPy.likelihoods.Bernoulli()
        laplace_inf = GPy.inference.latent_function_inference.Laplace()

-        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
-        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
-        ep_inf_fractional = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.9)
+        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode="alternated")
+        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode="nested")
+        ep_inf_fractional = GPy.inference.latent_function_inference.EP(
+            ep_mode="nested", eta=0.9
+        )

-        m1 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=laplace_inf)
+        m1 = GPy.core.GP(
+            self.X,
+            self.binary_Y.copy(),
+            kernel=self.kernel1.copy(),
+            likelihood=bernoulli.copy(),
+            inference_method=laplace_inf,
+        )
        m1.randomize()

-        m2 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_alt)
+        m2 = GPy.core.GP(
+            self.X,
+            self.binary_Y.copy(),
+            kernel=self.kernel1.copy(),
+            likelihood=bernoulli.copy(),
+            inference_method=ep_inf_alt,
+        )
        m2.randomize()

-        m3 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_nested)
+        m3 = GPy.core.GP(
+            self.X,
+            self.binary_Y.copy(),
+            kernel=self.kernel1.copy(),
+            likelihood=bernoulli.copy(),
+            inference_method=ep_inf_nested,
+        )
        m3.randomize()
        #
-        m4 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_fractional)
+        m4 = GPy.core.GP(
+            self.X,
+            self.binary_Y.copy(),
+            kernel=self.kernel1.copy(),
+            likelihood=bernoulli.copy(),
+            inference_method=ep_inf_fractional,
+        )
        m4.randomize()

-        optimizer = 'bfgs'
+        optimizer = "bfgs"

-        #do gradcheck here ...
+        # do gradcheck here ...
        # self.assertTrue(m1.checkgrad())
        # self.assertTrue(m2.checkgrad())
        # self.assertTrue(m3.checkgrad())
@ -86,35 +115,53 @@ class TestObservationModels(unittest.TestCase):
        probs_mean_ep_nested, probs_var_ep_nested = m3.predict(self.X)

        # for simple single dimension data , marginal likelihood for laplace and EP approximations should not be so far apart.
-        self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(),delta=1)
-        self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), delta=1)
-        self.assertAlmostEqual(m1.log_likelihood(), m4.log_likelihood(), delta=5)
+        # TODO: the below were assertAlmostEqual, not sure if allclose will do the job here
+        #     I replace the old delta with the atol
+        assert np.allclose(m1.log_likelihood(), m2.log_likelihood(), atol=1.0)
+        assert np.allclose(m1.log_likelihood(), m3.log_likelihood(), atol=1)
+        assert np.allclose(m1.log_likelihood(), m4.log_likelihood(), atol=5.0)

        GPy.util.classification.conf_matrix(probs_mean_lap, self.binary_Y)
        GPy.util.classification.conf_matrix(probs_mean_ep_alt, self.binary_Y)
        GPy.util.classification.conf_matrix(probs_mean_ep_nested, self.binary_Y)

-    @nottest
-    def rmse(self, Y, Ystar):
-        return np.sqrt(np.mean((Y - Ystar) ** 2))
+    @pytest.mark.skip(
+        "Fails as a consequence of fixing the DSYR function. Needs to be reviewed!"
+    )
+    def test_ep_with_studentt(self):
+        self.setup()
+        self.tear_down()

-    @with_setup(setUp, tearDown)
-    @unittest.skip("Fails as a consequence of fixing the DSYR function. Needs to be reviewed!")
-    def test_EP_with_StudentT(self):
-        studentT = GPy.likelihoods.StudentT(deg_free=self.deg_free, sigma2=self.init_var)
+        studentT = GPy.likelihoods.StudentT(
+            deg_free=self.deg_free, sigma2=self.init_var
+        )
        laplace_inf = GPy.inference.latent_function_inference.Laplace()

-        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
-        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
-        ep_inf_frac = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.7)
+        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode="alternated")
+        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode="nested")
+        ep_inf_frac = GPy.inference.latent_function_inference.EP(
+            ep_mode="nested", eta=0.7
+        )

-        m1 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=laplace_inf)
+        m1 = GPy.core.GP(
+            self.X.copy(),
+            self.Y_noisy.copy(),
+            kernel=self.kernel1.copy(),
+            likelihood=studentT.copy(),
+            inference_method=laplace_inf,
+        )
        # optimize
-        m1['.*white'].constrain_fixed(1e-5)
+        m1[".*white"].constrain_fixed(1e-5)
        m1.randomize()

-        m2 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=ep_inf_alt)
-        m2['.*white'].constrain_fixed(1e-5)
+        m2 = GPy.core.GP(
+            self.X.copy(),
+            self.Y_noisy.copy(),
+            kernel=self.kernel1.copy(),
+            likelihood=studentT.copy(),
+            inference_method=ep_inf_alt,
+        )
+        m2[".*white"].constrain_fixed(1e-5)
        # m2.constrain_bounded('.*t_scale2', 0.001, 10)
        m2.randomize()

@ -123,12 +170,14 @@ class TestObservationModels(unittest.TestCase):
        # # m3.constrain_bounded('.*t_scale2', 0.001, 10)
        # m3.randomize()

-        optimizer='bfgs'
-        m1.optimize(optimizer=optimizer,max_iters=400)
+        optimizer = "bfgs"
+        m1.optimize(optimizer=optimizer, max_iters=400)
        m2.optimize(optimizer=optimizer, max_iters=400)
        # m3.optimize(optimizer=optimizer, max_iters=500)

-        self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(),delta=200)
+        # TODO: this was assertAlmostEqual, not sure if allclose will do the job here
+        #    I replace the old delta with the atol
+        assert np.allclose(m1.log_likelihood(), m2.log_likelihood(), atol=200.0)

        # self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), 3)

@ -140,9 +189,7 @@ class TestObservationModels(unittest.TestCase):
        # rmse_nested = self.rmse(preds_mean_nested, self.Y_noisy)

        if rmse_alt > rmse_lap:
-            self.assertAlmostEqual(rmse_lap, rmse_alt, delta=1.5)
+            # TODO: this was assertAlmostEqual, not sure if allclose will do the job here
+            #   I replace the old delta with the atol
+            assert np.allclose(rmse_lap, rmse_alt, atol=1.5)
        # m3.optimize(optimizer=optimizer, max_iters=500)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/GPy/testing/gp_tests.py
+++ b/GPy/testing/gp_tests.py
@ -1,36 +1,36 @@
-'''
+"""
 Created on 4 Sep 2015

@author: maxz
-'''
-import unittest
-import numpy as np, GPy
+"""
+import numpy as np
+import GPy
 from GPy.core.parameterization.variational import NormalPosterior

-class Test(unittest.TestCase):

-
-    def setUp(self):
+class TestGP:
+    def setup(self):
        np.random.seed(12345)
        self.N = 20
        self.N_new = 50
        self.D = 1
-        self.X = np.random.uniform(-3., 3., (self.N, 1))
+        self.X = np.random.uniform(-3.0, 3.0, (self.N, 1))
        self.Y = np.sin(self.X) + np.random.randn(self.N, self.D) * 0.05
-        self.X_new = np.random.uniform(-3., 3., (self.N_new, 1))
-
+        self.X_new = np.random.uniform(-3.0, 3.0, (self.N_new, 1))

    def test_setxy_bgplvm(self):
+        self.setup()
+
        k = GPy.kern.RBF(1)
        m = GPy.models.BayesianGPLVM(self.Y, 1, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X
        Xnew = NormalPosterior(m.X.mean[:10].copy(), m.X.variance[:10].copy())
        m.set_XY(Xnew, m.Y[:10].copy())
-        assert(m.checkgrad())
+        assert m.checkgrad()

-        assert(m.num_data == m.X.shape[0])
-        assert(m.input_dim == m.X.shape[1])
+        assert m.num_data == m.X.shape[0]
+        assert m.input_dim == m.X.shape[1]

        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)
@ -38,16 +38,18 @@ class Test(unittest.TestCase):
        np.testing.assert_allclose(var, var2)

    def test_setxy_gplvm(self):
+        self.setup()
+
        k = GPy.kern.RBF(1)
        m = GPy.models.GPLVM(self.Y, 1, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X.copy()
        Xnew = X[:10].copy()
        m.set_XY(Xnew, m.Y[:10].copy())
-        assert(m.checkgrad())
+        assert m.checkgrad()

-        assert(m.num_data == m.X.shape[0])
-        assert(m.input_dim == m.X.shape[1])
+        assert m.num_data == m.X.shape[0]
+        assert m.input_dim == m.X.shape[1]

        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)
@ -55,15 +57,17 @@ class Test(unittest.TestCase):
        np.testing.assert_allclose(var, var2)

    def test_setxy_gp(self):
+        self.setup()
+
        k = GPy.kern.RBF(1)
        m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X.copy()
        m.set_XY(m.X[:10], m.Y[:10])
-        assert(m.checkgrad())
+        assert m.checkgrad()

-        assert(m.num_data == m.X.shape[0])
-        assert(m.input_dim == m.X.shape[1])
+        assert m.num_data == m.X.shape[0]
+        assert m.input_dim == m.X.shape[1]

        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)
@ -73,39 +77,45 @@ class Test(unittest.TestCase):
    def test_mean_function(self):
        from GPy.core.parameterization.param import Param
        from GPy.core.mapping import Mapping
+
+        self.setup()
+
        class Parabola(Mapping):
-            def __init__(self, variance, degree=2, name='parabola'):
+            def __init__(self, variance, degree=2, name="parabola"):
                super(Parabola, self).__init__(1, 1, name)
-                self.variance = Param('variance', np.ones(degree+1) * variance)
+                self.variance = Param("variance", np.ones(degree + 1) * variance)
                self.degree = degree
                self.link_parameter(self.variance)

            def f(self, X):
                p = self.variance[0] * np.ones(X.shape)
-                for i in range(1, self.degree+1):
-                    p += self.variance[i] * X**(i)
+                for i in range(1, self.degree + 1):
+                    p += self.variance[i] * X ** (i)
                return p

            def gradients_X(self, dL_dF, X):
                grad = np.zeros(X.shape)
-                for i in range(1, self.degree+1):
-                    grad += (i) * self.variance[i] * X**(i-1)
+                for i in range(1, self.degree + 1):
+                    grad += (i) * self.variance[i] * X ** (i - 1)
                return grad

            def update_gradients(self, dL_dF, X):
-                for i in range(self.degree+1):
-                    self.variance.gradient[i] = (dL_dF * X**(i)).sum(0)
+                for i in range(self.degree + 1):
+                    self.variance.gradient[i] = (dL_dF * X ** (i)).sum(0)
+
        X = np.linspace(-2, 2, 100)[:, None]
        k = GPy.kern.RBF(1)
        k.randomize()
-        p = Parabola(.3)
+        p = Parabola(0.3)
        p.randomize()
-        Y = p.f(X) + np.random.multivariate_normal(np.zeros(X.shape[0]), k.K(X)+np.eye(X.shape[0])*1e-8)[:,None] + np.random.normal(0, .1, (X.shape[0], 1))
+        Y = (
+            p.f(X)
+            + np.random.multivariate_normal(
+                np.zeros(X.shape[0]), k.K(X) + np.eye(X.shape[0]) * 1e-8
+            )[:, None]
+            + np.random.normal(0, 0.1, (X.shape[0], 1))
+        )
        m = GPy.models.GPRegression(X, Y, mean_function=p)
        m.randomize()
-        assert(m.checkgrad())
+        assert m.checkgrad()
        _ = m.predict(m.X)
-
-if __name__ == "__main__":
-    #import sys;sys.argv = ['', 'Test.testName']
-    unittest.main()
--- a/GPy/testing/test_gpy_kernels_state_space.py
+++ b/GPy/testing/test_gpy_kernels_state_space.py
--- a/GPy/testing/grid_tests.py
+++ b/GPy/testing/grid_tests.py
@ -3,21 +3,33 @@

 # Kurt Cutajar

-import unittest
 import numpy as np
 import GPy

-class GridModelTest(unittest.TestCase):
-    def setUp(self):
+
+class TestGridModel:
+    def setup(self):
        ######################################
        # # 3 dimensional example

        # sample inputs and outputs
-        self.X = np.array([[0,0,0],[0,0,1],[0,1,0],[0,1,1],[1,0,0],[1,0,1],[1,1,0],[1,1,1]])
+        self.X = np.array(
+            [
+                [0, 0, 0],
+                [0, 0, 1],
+                [0, 1, 0],
+                [0, 1, 1],
+                [1, 0, 0],
+                [1, 0, 1],
+                [1, 1, 0],
+                [1, 1, 1],
+            ]
+        )
        self.Y = np.random.randn(8, 1) * 100
        self.dim = self.X.shape[1]

    def test_alpha_match(self):
+        self.setup()
        kernel = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
        m = GPy.models.GPRegressionGrid(self.X, self.Y, kernel)

@ -27,25 +39,31 @@ class GridModelTest(unittest.TestCase):
        np.testing.assert_almost_equal(m.posterior.alpha, m2.posterior.woodbury_vector)

    def test_gradient_match(self):
+        self.setup()
        kernel = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
        m = GPy.models.GPRegressionGrid(self.X, self.Y, kernel)

        kernel2 = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
        m2 = GPy.models.GPRegression(self.X, self.Y, kernel2)

-        np.testing.assert_almost_equal(kernel.variance.gradient, kernel2.variance.gradient)
-        np.testing.assert_almost_equal(kernel.lengthscale.gradient, kernel2.lengthscale.gradient)
-        np.testing.assert_almost_equal(m.likelihood.variance.gradient, m2.likelihood.variance.gradient)
-
+        np.testing.assert_almost_equal(
+            kernel.variance.gradient, kernel2.variance.gradient
+        )
+        np.testing.assert_almost_equal(
+            kernel.lengthscale.gradient, kernel2.lengthscale.gradient
+        )
+        np.testing.assert_almost_equal(
+            m.likelihood.variance.gradient, m2.likelihood.variance.gradient
+        )

    def test_prediction_match(self):
+        self.setup()
        kernel = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
        m = GPy.models.GPRegressionGrid(self.X, self.Y, kernel)

        kernel2 = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
        m2 = GPy.models.GPRegression(self.X, self.Y, kernel2)

-        test = np.array([[0,0,2],[-1,3,-4]])
+        test = np.array([[0, 0, 2], [-1, 3, -4]])

        np.testing.assert_almost_equal(m.predict(test), m2.predict(test))
-
--- a/GPy/testing/test_inference.py
+++ b/GPy/testing/test_inference.py
@ -0,0 +1,275 @@
+# Copyright (c) 2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+"""
+The test cases for various inference algorithms
+"""
+
+import numpy as np
+import GPy
+
+# np.seterr(invalid='raise')
+
+
+class TestInferenceXCase:
+    def get_data(self):
+        np.random.seed(1111)
+        Ylist = GPy.examples.dimensionality_reduction._simulate_matern(
+            5, 1, 1, 10, 3, False
+        )[0]
+        return Ylist[0]
+
+    def test_inferenceX_BGPLVM_Linear(self):
+        Ys = self.get_data()
+        m = GPy.models.BayesianGPLVM(Ys, 3, kernel=GPy.kern.Linear(3, ARD=True))
+        m.optimize()
+        x, mi = m.infer_newX(m.Y, optimize=True)
+        np.testing.assert_array_almost_equal(m.X.mean, mi.X.mean, decimal=2)
+        np.testing.assert_array_almost_equal(m.X.variance, mi.X.variance, decimal=2)
+
+    def test_inferenceX_BGPLVM_RBF(self):
+        Ys = self.get_data()
+        m = GPy.models.BayesianGPLVM(Ys, 3, kernel=GPy.kern.RBF(3, ARD=True))
+        import warnings
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            m.optimize()
+        _x, mi = m.infer_newX(m.Y, optimize=True)
+        np.testing.assert_array_almost_equal(m.X.mean, mi.X.mean, decimal=2)
+        np.testing.assert_array_almost_equal(m.X.variance, mi.X.variance, decimal=2)
+
+    def test_inferenceX_GPLVM_Linear(self):
+        Ys = self.get_data()
+        m = GPy.models.GPLVM(Ys, 3, kernel=GPy.kern.Linear(3, ARD=True))
+        m.optimize()
+        _x, mi = m.infer_newX(m.Y, optimize=True)
+        np.testing.assert_array_almost_equal(m.X, mi.X, decimal=2)
+
+    def test_inferenceX_GPLVM_RBF(self):
+        Ys = self.get_data()
+        m = GPy.models.GPLVM(Ys, 3, kernel=GPy.kern.RBF(3, ARD=True))
+        m.optimize()
+        _x, mi = m.infer_newX(m.Y, optimize=True)
+        np.testing.assert_array_almost_equal(m.X, mi.X, decimal=2)
+
+
+class TestInferenceGPEP:
+    def get_data(self):
+        np.random.seed(1)
+        k = GPy.kern.RBF(1, variance=7.0, lengthscale=0.2)
+        X = np.random.rand(200, 1)
+        f = np.random.multivariate_normal(
+            np.zeros(200), k.K(X) + 1e-5 * np.eye(X.shape[0])
+        )
+        lik = GPy.likelihoods.Bernoulli()
+        _p = lik.gp_link.transf(f)  # squash the latent function
+        Y = lik.samples(f).reshape(-1, 1)
+        return X, Y
+
+    def get_noisy_data(self):
+        np.random.seed(1)
+        X = np.random.rand(100, 1)
+        self.real_std = 0.1
+        noise = np.random.randn(*X[:, 0].shape) * self.real_std
+        Y = (np.sin(X[:, 0] * 2 * np.pi) + noise)[:, None]
+        self.f = np.random.rand(X.shape[0], 1)
+        Y_extra_noisy = Y.copy()
+        Y_extra_noisy[50] += 4.0
+        # Y_extra_noisy[80:83] -= 2.
+        return X, Y, Y_extra_noisy
+
+    def test_inference_EP(self):
+        from paramz import ObsAr
+
+        X, Y = self.get_data()
+        lik = GPy.likelihoods.Bernoulli()
+        k = GPy.kern.RBF(1, variance=7.0, lengthscale=0.2)
+        inf = GPy.inference.latent_function_inference.expectation_propagation.EP(
+            max_iters=30, delta=0.5
+        )
+        self.model = GPy.core.GP(
+            X=X, Y=Y, kernel=k, inference_method=inf, likelihood=lik
+        )
+        K = self.model.kern.K(X)
+        mean_prior = np.zeros(K.shape[0])
+        (
+            post_params,
+            ga_approx,
+            cav_params,
+            log_Z_tilde,
+        ) = self.model.inference_method.expectation_propagation(
+            mean_prior, K, ObsAr(Y), lik, None
+        )
+
+        mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
+        p, m, d = self.model.inference_method._inference(
+            Y,
+            mean_prior,
+            K,
+            ga_approx,
+            cav_params,
+            lik,
+            Y_metadata=None,
+            Z_tilde=log_Z_tilde,
+        )
+        p0, m0, d0 = super(
+            GPy.inference.latent_function_inference.expectation_propagation.EP, inf
+        ).inference(
+            k,
+            X,
+            lik,
+            mu_tilde[:, None],
+            mean_function=None,
+            variance=1.0 / ga_approx.tau,
+            K=K,
+            Z_tilde=log_Z_tilde
+            + np.sum(
+                -0.5 * np.log(ga_approx.tau)
+                + 0.5 * (ga_approx.v * ga_approx.v * 1.0 / ga_approx.tau)
+            ),
+        )
+
+        assert (
+            np.sum(
+                np.array(
+                    [
+                        m - m0,
+                        np.sum(d["dL_dK"] - d0["dL_dK"]),
+                        np.sum(d["dL_dthetaL"] - d0["dL_dthetaL"]),
+                        np.sum(d["dL_dm"] - d0["dL_dm"]),
+                        np.sum(p._woodbury_vector - p0._woodbury_vector),
+                        np.sum(p.woodbury_inv - p0.woodbury_inv),
+                    ]
+                )
+            )
+            < 1e6
+        )
+
+    # NOTE: adding a test like above for parameterized likelihood- the above test is
+    # only for probit likelihood which does not have any tunable hyperparameter which is why
+    # the term in dictionary of gradients: dL_dthetaL will always be zero. So here we repeat tests for
+    # student-t likelihood and heterodescastic gaussian noise case. This test simply checks if the posterior
+    # and gradients of log marginal are roughly the same for inference through EP and exact gaussian inference using
+    # the gaussian approximation for the individual likelihood site terms. For probit likelihood, it is possible to
+    # calculate moments analytically, but for other likelihoods, we will need to use numerical quadrature techniques,
+    # and it is possible that any error might creep up because of quadrature implementation.
+    def test_inference_EP_non_classification(self):
+        from paramz import ObsAr
+
+        X, _Y, Y_extra_noisy = self.get_noisy_data()
+        deg_freedom = 5.0
+        init_noise_var = 0.08
+        lik_studentT = GPy.likelihoods.StudentT(
+            deg_free=deg_freedom, sigma2=init_noise_var
+        )
+        # like_gaussian_noise = GPy.likelihoods.MixedNoise()
+        k = GPy.kern.RBF(1, variance=2.0, lengthscale=1.1)
+        ep_inf_alt = GPy.inference.latent_function_inference.expectation_propagation.EP(
+            max_iters=4, delta=0.5
+        )
+        # ep_inf_nested = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode='nested', max_iters=100, delta=0.5)
+        m = GPy.core.GP(
+            X=X,
+            Y=Y_extra_noisy,
+            kernel=k,
+            likelihood=lik_studentT,
+            inference_method=ep_inf_alt,
+        )
+        K = m.kern.K(X)
+        mean_prior = np.zeros(K.shape[0])
+        (
+            post_params,
+            ga_approx,
+            cav_params,
+            log_Z_tilde,
+        ) = m.inference_method.expectation_propagation(
+            mean_prior, K, ObsAr(Y_extra_noisy), lik_studentT, None
+        )
+
+        mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
+        p, m, d = m.inference_method._inference(
+            Y_extra_noisy,
+            mean_prior,
+            K,
+            ga_approx,
+            cav_params,
+            lik_studentT,
+            Y_metadata=None,
+            Z_tilde=log_Z_tilde,
+        )
+        p0, m0, d0 = super(
+            GPy.inference.latent_function_inference.expectation_propagation.EP,
+            ep_inf_alt,
+        ).inference(
+            k,
+            X,
+            lik_studentT,
+            mu_tilde[:, None],
+            mean_function=None,
+            variance=1.0 / ga_approx.tau,
+            K=K,
+            Z_tilde=log_Z_tilde
+            + np.sum(
+                -0.5 * np.log(ga_approx.tau)
+                + 0.5 * (ga_approx.v * ga_approx.v * 1.0 / ga_approx.tau)
+            ),
+        )
+
+        assert (
+            np.sum(
+                np.array(
+                    [
+                        m - m0,
+                        np.sum(d["dL_dK"] - d0["dL_dK"]),
+                        np.sum(d["dL_dthetaL"] - d0["dL_dthetaL"]),
+                        np.sum(d["dL_dm"] - d0["dL_dm"]),
+                        np.sum(p._woodbury_vector - p0._woodbury_vector),
+                        np.sum(p.woodbury_inv - p0.woodbury_inv),
+                    ]
+                )
+            )
+            < 1e6
+        )
+
+
+class TestVarDtc:
+    def test_var_dtc_inference_with_mean(self):
+        """Check dL_dm in var_dtc is calculated correctly"""
+        np.random.seed(1)
+        x = np.linspace(0.0, 2 * np.pi, 100)[:, None]
+        y = -np.cos(x) + np.random.randn(*x.shape) * 0.3 + 1
+        m = GPy.models.SparseGPRegression(
+            x, y, mean_function=GPy.mappings.Linear(input_dim=1, output_dim=1)
+        )
+        assert m.checkgrad()
+
+
+class TestHMCSampler:
+    def test_sampling(self):
+        np.random.seed(1)
+        x = np.linspace(0.0, 2 * np.pi, 100)[:, None]
+        y = -np.cos(x) + np.random.randn(*x.shape) * 0.3 + 1
+
+        m = GPy.models.GPRegression(x, y)
+        m.kern.lengthscale.set_prior(GPy.priors.Gamma.from_EV(1.0, 10.0))
+        m.kern.variance.set_prior(GPy.priors.Gamma.from_EV(1.0, 10.0))
+        m.likelihood.variance.set_prior(GPy.priors.Gamma.from_EV(1.0, 10.0))
+
+        hmc = GPy.inference.mcmc.HMC(m, stepsize=1e-2)
+        _s = hmc.sample(num_samples=3)
+
+
+class TestMCMCSampler:
+    def test_sampling(self):
+        np.random.seed(1)
+        x = np.linspace(0.0, 2 * np.pi, 100)[:, None]
+        y = -np.cos(x) + np.random.randn(*x.shape) * 0.3 + 1
+
+        m = GPy.models.GPRegression(x, y)
+        m.kern.lengthscale.set_prior(GPy.priors.Gamma.from_EV(1.0, 10.0))
+        m.kern.variance.set_prior(GPy.priors.Gamma.from_EV(1.0, 10.0))
+        m.likelihood.variance.set_prior(GPy.priors.Gamma.from_EV(1.0, 10.0))
+
+        mcmc = GPy.inference.mcmc.Metropolis_Hastings(m)
+        mcmc.sample(Ntotal=100, Nburn=10)
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
--- a/GPy/testing/test_linalg.py
+++ b/GPy/testing/test_linalg.py
@ -1,18 +1,19 @@
 import numpy as np
 import scipy as sp
-from ..util.linalg import jitchol,trace_dot, ijk_jlk_to_il, ijk_ljk_to_ilk
+from ..util.linalg import jitchol, trace_dot, ijk_jlk_to_il, ijk_ljk_to_ilk

-class LinalgTests(np.testing.TestCase):
-    def setUp(self):
-        #Create PD matrix
-        A = np.random.randn(20,100)
+
+class TestLinalg:
+    def setup(self):
+        # Create PD matrix
+        A = np.random.randn(20, 100)
        self.A = A.dot(A.T)
-        #compute Eigdecomp
+        # compute Eigdecomp
        vals, vectors = np.linalg.eig(self.A)
-        #Set smallest eigenval to be negative with 5 rounds worth of jitter
+        # Set smallest eigenval to be negative with 5 rounds worth of jitter
        vals[vals.argmin()] = 0
-        default_jitter = 1e-6*np.mean(vals)
-        vals[vals.argmin()] = -default_jitter*(10**3.5)
+        default_jitter = 1e-6 * np.mean(vals)
+        vals[vals.argmin()] = -default_jitter * (10**3.5)
        self.A_corrupt = (vectors * vals).dot(vectors.T)

    def test_jitchol_success(self):
@ -20,12 +21,16 @@ class LinalgTests(np.testing.TestCase):
        Expect 5 rounds of jitter to be added and for the recovered matrix to be
        identical to the corrupted matrix apart from the jitter added to the diagonal
        """
+        self.setup()
        L = jitchol(self.A_corrupt, maxtries=5)
        A_new = L.dot(L.T)
        diff = A_new - self.A_corrupt
-        np.testing.assert_allclose(diff, np.eye(A_new.shape[0])*np.diag(diff).mean(), atol=1e-13)
+        np.testing.assert_allclose(
+            diff, np.eye(A_new.shape[0]) * np.diag(diff).mean(), atol=1e-13
+        )

    def test_jitchol_failure(self):
+        self.setup()
        try:
            """
            Expecting an exception to be thrown as we expect it to require
@ -37,24 +42,27 @@ class LinalgTests(np.testing.TestCase):
            return True

    def test_trace_dot(self):
+        self.setup()
        N = 5
-        A = np.random.rand(N,N)
-        B = np.random.rand(N,N)
+        A = np.random.rand(N, N)
+        B = np.random.rand(N, N)
        trace = np.trace(A.dot(B))
-        test_trace = trace_dot(A,B)
-        np.testing.assert_allclose(trace,test_trace,atol=1e-13)
+        test_trace = trace_dot(A, B)
+        np.testing.assert_allclose(trace, test_trace, atol=1e-13)

    def test_einsum_ij_jlk_to_ilk(self):
+        self.setup()
        A = np.random.randn(15, 150, 5)
        B = np.random.randn(150, 50, 5)
-        pure = np.einsum('ijk,jlk->il', A, B)
-        quick = ijk_jlk_to_il(A,B)
+        pure = np.einsum("ijk,jlk->il", A, B)
+        quick = ijk_jlk_to_il(A, B)
        np.testing.assert_allclose(pure, quick)

    def test_einsum_ijk_ljk_to_ilk(self):
+        self.setup()
        A = np.random.randn(150, 20, 5)
        B = np.random.randn(150, 20, 5)
-        #B = A.copy()
-        pure = np.einsum('ijk,ljk->ilk', A, B)
-        quick = ijk_ljk_to_ilk(A,B)
+        # B = A.copy()
+        pure = np.einsum("ijk,ljk->ilk", A, B)
+        quick = ijk_ljk_to_ilk(A, B)
        np.testing.assert_allclose(pure, quick)
--- a/GPy/testing/test_link_function.py
+++ b/GPy/testing/test_link_function.py
@ -0,0 +1,196 @@
+import numpy as np
+import scipy
+from scipy.special import cbrt
+from GPy.models import GradientChecker
+import random
+
+_lim_val = np.finfo(np.float64).max
+_lim_val_exp = np.log(_lim_val)
+_lim_val_square = np.sqrt(_lim_val)
+_lim_val_cube = cbrt(_lim_val)
+from GPy.likelihoods.link_functions import (
+    Identity,
+    Probit,
+    Cloglog,
+    Log,
+    Log_ex_1,
+    Reciprocal,
+    Heaviside,
+    ScaledProbit,
+)
+
+
+class TestLinkFunction:
+    def setup(self):
+        self.small_f = np.array([[-1e-4]])
+        self.zero_f = np.array([[1e-4]])
+        self.mid_f = np.array([[5.0]])
+        self.large_f = np.array([[1e4]])
+        self.f_lower_lim = np.array(-np.inf)
+        self.f_upper_lim = np.array(np.inf)
+
+    def check_gradient(self, link_func, lim_of_inf, test_lim=False):
+        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.mid_f)
+        assert grad.checkgrad(verbose=True)
+        grad2 = GradientChecker(
+            link_func.dtransf_df, link_func.d2transf_df2, x0=self.mid_f
+        )
+        assert grad2.checkgrad(verbose=True)
+        grad3 = GradientChecker(
+            link_func.d2transf_df2, link_func.d3transf_df3, x0=self.mid_f
+        )
+        assert grad3.checkgrad(verbose=True)
+
+        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.small_f)
+        assert grad.checkgrad(verbose=True)
+        grad2 = GradientChecker(
+            link_func.dtransf_df, link_func.d2transf_df2, x0=self.small_f
+        )
+        assert grad2.checkgrad(verbose=True)
+        grad3 = GradientChecker(
+            link_func.d2transf_df2, link_func.d3transf_df3, x0=self.small_f
+        )
+        assert grad3.checkgrad(verbose=True)
+
+        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.zero_f)
+        assert grad.checkgrad(verbose=True)
+        grad2 = GradientChecker(
+            link_func.dtransf_df, link_func.d2transf_df2, x0=self.zero_f
+        )
+        assert grad2.checkgrad(verbose=True)
+        grad3 = GradientChecker(
+            link_func.d2transf_df2, link_func.d3transf_df3, x0=self.zero_f
+        )
+        assert grad3.checkgrad(verbose=True)
+
+        # Do a limit test if the large f value is too large
+        large_f = np.clip(self.large_f, -np.inf, lim_of_inf - 1e-3)
+        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=large_f)
+        assert grad.checkgrad(verbose=True)
+        grad2 = GradientChecker(
+            link_func.dtransf_df, link_func.d2transf_df2, x0=large_f
+        )
+        assert grad2.checkgrad(verbose=True)
+        grad3 = GradientChecker(
+            link_func.d2transf_df2, link_func.d3transf_df3, x0=large_f
+        )
+        assert grad3.checkgrad(verbose=True)
+
+        if test_lim:
+            print("Testing limits")
+            # Remove some otherwise we are too close to the limit for gradcheck to work effectively
+            lim_of_inf = lim_of_inf - 1e-4
+            grad = GradientChecker(
+                link_func.transf, link_func.dtransf_df, x0=lim_of_inf
+            )
+            assert grad.checkgrad(verbose=True)
+            grad2 = GradientChecker(
+                link_func.dtransf_df, link_func.d2transf_df2, x0=lim_of_inf
+            )
+            assert grad2.checkgrad(verbose=True)
+            grad3 = GradientChecker(
+                link_func.d2transf_df2, link_func.d3transf_df3, x0=lim_of_inf
+            )
+            assert grad3.checkgrad(verbose=True)
+
+    def check_overflow(self, link_func, lim_of_inf):
+        # Check that it does something sensible beyond this limit,
+        # note this is not checking the value is correct, just that it isn't nan
+        beyond_lim_of_inf = lim_of_inf + 100.0
+        assert not np.isinf(link_func.transf(beyond_lim_of_inf))
+        assert not np.isinf(link_func.dtransf_df(beyond_lim_of_inf))
+        assert not np.isinf(link_func.d2transf_df2(beyond_lim_of_inf))
+
+        assert not np.isnan(link_func.transf(beyond_lim_of_inf))
+        assert not np.isnan(link_func.dtransf_df(beyond_lim_of_inf))
+        assert not np.isnan(link_func.d2transf_df2(beyond_lim_of_inf))
+
+    def test_log_overflow(self):
+        self.setup()
+
+        link = Log()
+        lim_of_inf = _lim_val_exp
+
+        np.testing.assert_almost_equal(np.exp(self.mid_f), link.transf(self.mid_f))
+        assert np.isinf(np.exp(np.log(self.f_upper_lim)))
+        # Check the clipping works
+        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
+        assert np.isfinite(link.transf(self.f_upper_lim))
+        self.check_overflow(link, lim_of_inf)
+
+        # Check that it would otherwise fail
+        beyond_lim_of_inf = lim_of_inf + 10.0
+        old_err_state = np.seterr(over="ignore")
+        assert np.isinf(np.exp(beyond_lim_of_inf))
+        np.seterr(**old_err_state)
+
+    def test_log_ex_1_overflow(self):
+        self.setup()
+
+        link = Log_ex_1()
+        lim_of_inf = _lim_val_exp
+
+        np.testing.assert_almost_equal(
+            scipy.special.log1p(np.exp(self.mid_f)), link.transf(self.mid_f)
+        )
+        assert np.isinf(scipy.special.log1p(np.exp(np.log(self.f_upper_lim))))
+        # Check the clipping works
+        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
+        # Need to look at most significant figures here rather than the decimals
+        np.testing.assert_approx_equal(
+            link.transf(self.f_upper_lim), scipy.special.log1p(_lim_val), significant=5
+        )
+        self.check_overflow(link, lim_of_inf)
+
+        # Check that it would otherwise fail
+        beyond_lim_of_inf = lim_of_inf + 10.0
+        old_err_state = np.seterr(over="ignore")
+        assert np.isinf(scipy.special.log1p(np.exp(beyond_lim_of_inf)))
+        np.seterr(**old_err_state)
+
+    def test_log_gradients(self):
+        # transf dtransf_df d2transf_df2 d3transf_df3
+        self.setup()
+
+        link = Log()
+        lim_of_inf = _lim_val_exp
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+
+    def test_identity_gradients(self):
+        self.setup()
+        link = Identity()
+        lim_of_inf = _lim_val
+        # FIXME: Should be able to think of a way to test the limits of this
+        self.check_gradient(link, lim_of_inf, test_lim=False)
+
+    def test_probit_gradients(self):
+        self.setup()
+        link = Probit()
+        lim_of_inf = _lim_val
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+
+    def test_scaledprobit_gradients(self):
+        self.setup()
+        link = ScaledProbit(nu=random.random())
+        lim_of_inf = _lim_val
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+
+    def test_Cloglog_gradients(self):
+        self.setup()
+        link = Cloglog()
+        lim_of_inf = _lim_val_exp
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+
+    def test_Log_ex_1_gradients(self):
+        self.setup()
+        link = Log_ex_1()
+        lim_of_inf = _lim_val_exp
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+        self.check_overflow(link, lim_of_inf)
+
+    def test_reciprocal_gradients(self):
+        self.setup()
+        link = Reciprocal()
+        lim_of_inf = _lim_val
+        # Does not work with much smaller values, and values closer to zero than 1e-5
+        self.check_gradient(link, lim_of_inf, test_lim=True)
--- a/GPy/testing/mapping_tests.py
+++ b/GPy/testing/mapping_tests.py
@ -1,10 +1,10 @@
 # Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import unittest
 import numpy as np
 import GPy

+
 class MappingGradChecker(GPy.core.Model):
    """
    This class has everything we need to check the gradient of a mapping. It
@ -12,63 +12,60 @@ class MappingGradChecker(GPy.core.Model):
    mapping. the gradients are checked against the parameters of the mapping
    and the input.
    """
-    def __init__(self, mapping, X, name='map_grad_check'):
+
+    def __init__(self, mapping, X, name="map_grad_check"):
        super(MappingGradChecker, self).__init__(name)
        self.mapping = mapping
        self.link_parameter(self.mapping)
-        self.X = GPy.core.Param('X',X)
+        self.X = GPy.core.Param("X", X)
        self.link_parameter(self.X)
        self.dL_dY = np.random.randn(self.X.shape[0], self.mapping.output_dim)
+
    def log_likelihood(self):
        return np.sum(self.mapping.f(self.X) * self.dL_dY)
+
    def parameters_changed(self):
        self.X.gradient = self.mapping.gradients_X(self.dL_dY, self.X)
        self.mapping.update_gradients(self.dL_dY, self.X)


-class MappingTests(unittest.TestCase):
-
+class TestMapping:
    def test_kernelmapping(self):
-        X = np.random.randn(100,3)
-        Z = np.random.randn(10,3)
+        X = np.random.randn(100, 3)
+        Z = np.random.randn(10, 3)
        mapping = GPy.mappings.Kernel(3, 2, Z, GPy.kern.RBF(3))
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        assert MappingGradChecker(mapping, X).checkgrad()

    def test_linearmapping(self):
        mapping = GPy.mappings.Linear(3, 2)
-        X = np.random.randn(100,3)
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        X = np.random.randn(100, 3)
+        assert MappingGradChecker(mapping, X).checkgrad()

    def test_mlpmapping(self):
        mapping = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
-        X = np.random.randn(100,3)
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        X = np.random.randn(100, 3)
+        assert MappingGradChecker(mapping, X).checkgrad()

    def test_mlpextmapping(self):
        np.random.seed(42)
-        X = np.random.randn(100,3)
-        for activation in ['tanh', 'relu', 'sigmoid']:
-            mapping = GPy.mappings.MLPext(input_dim=3, hidden_dims=[5,5], output_dim=2, activation=activation)
-            self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        X = np.random.randn(100, 3)
+        for activation in ["tanh", "relu", "sigmoid"]:
+            mapping = GPy.mappings.MLPext(
+                input_dim=3, hidden_dims=[5, 5], output_dim=2, activation=activation
+            )
+            assert MappingGradChecker(mapping, X).checkgrad()

    def test_addmapping(self):
        m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
        m2 = GPy.mappings.Linear(input_dim=3, output_dim=2)
        mapping = GPy.mappings.Additive(m1, m2)
-        X = np.random.randn(100,3)
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        X = np.random.randn(100, 3)
+        assert MappingGradChecker(mapping, X).checkgrad()

    def test_compoundmapping(self):
        m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
-        Z = np.random.randn(10,2)
+        Z = np.random.randn(10, 2)
        m2 = GPy.mappings.Kernel(2, 4, Z, GPy.kern.RBF(2))
        mapping = GPy.mappings.Compound(m1, m2)
-        X = np.random.randn(100,3)
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
-
-
-
-
-if __name__ == "__main__":
-    print("Running unit tests, please be (very) patient...")
-    unittest.main()
+        X = np.random.randn(100, 3)
+        assert MappingGradChecker(mapping, X).checkgrad()
--- a/GPy/testing/test_meanfunc.py
+++ b/GPy/testing/test_meanfunc.py
@ -0,0 +1,90 @@
+# Copyright (c) 2015, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import GPy
+
+
+class TestMF:
+    def test_simple_mean_function(self):
+        """
+        The simplest possible mean function. No parameters, just a simple Sinusoid.
+        """
+        # create  simple mean function
+        mf = GPy.core.Mapping(1, 1)
+        mf.f = np.sin
+        mf.update_gradients = lambda a, b: None
+
+        X = np.linspace(0, 10, 50).reshape(-1, 1)
+        Y = np.sin(X) + 0.5 * np.cos(3 * X) + 0.1 * np.random.randn(*X.shape)
+
+        k = GPy.kern.RBF(1)
+        lik = GPy.likelihoods.Gaussian()
+        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+        assert m.checkgrad()
+
+    def test_parametric_mean_function(self):
+        """
+        A linear mean function with parameters that we'll learn alongside the kernel
+        """
+
+        X = np.linspace(-1, 10, 50).reshape(-1, 1)
+
+        Y = 3 - np.abs((X - 6))
+        Y += 0.5 * np.cos(3 * X) + 0.3 * np.random.randn(*X.shape)
+
+        mf = GPy.mappings.PiecewiseLinear(1, 1, [-1, 1], [9, 2])
+
+        k = GPy.kern.RBF(1)
+        lik = GPy.likelihoods.Gaussian()
+        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+        assert m.checkgrad()
+
+    def test_parametric_mean_function_composition(self):
+        """
+        A linear mean function with parameters that we'll learn alongside the kernel
+        """
+
+        X = np.linspace(0, 10, 50).reshape(-1, 1)
+        Y = np.sin(X) + 0.5 * np.cos(3 * X) + 0.1 * np.random.randn(*X.shape) + 3 * X
+
+        mf = GPy.mappings.Compound(
+            GPy.mappings.Linear(1, 1),
+            GPy.mappings.Kernel(1, 1, np.random.normal(0, 1, (1, 1)), GPy.kern.RBF(1)),
+        )
+
+        k = GPy.kern.RBF(1)
+        lik = GPy.likelihoods.Gaussian()
+        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+        assert m.checkgrad()
+
+    def test_parametric_mean_function_additive(self):
+        """
+        A linear mean function with parameters that we'll learn alongside the kernel
+        """
+
+        X = np.linspace(0, 10, 50).reshape(-1, 1)
+        Y = np.sin(X) + 0.5 * np.cos(3 * X) + 0.1 * np.random.randn(*X.shape) + 3 * X
+
+        mf = GPy.mappings.Additive(
+            GPy.mappings.Constant(1, 1, 3),
+            GPy.mappings.Additive(GPy.mappings.MLP(1, 1), GPy.mappings.Identity(1, 1)),
+        )
+
+        k = GPy.kern.RBF(1)
+        lik = GPy.likelihoods.Gaussian()
+        m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+        assert m.checkgrad()
+
+    def test_svgp_mean_function(self):
+        # an instance of the SVIGOP with a men function
+        X = np.linspace(0, 10, 500).reshape(-1, 1)
+        Y = np.sin(X) + 0.5 * np.cos(3 * X) + 0.1 * np.random.randn(*X.shape)
+        Y = np.where(Y > 0, 1, 0)  # make aclassificatino problem
+
+        mf = GPy.mappings.Linear(1, 1)
+        Z = np.linspace(0, 10, 50).reshape(-1, 1)
+        lik = GPy.likelihoods.Bernoulli()
+        k = GPy.kern.RBF(1) + GPy.kern.White(1, 1e-4)
+        m = GPy.core.SVGP(X, Y, Z=Z, kernel=k, likelihood=lik, mean_function=mf)
+        assert m.checkgrad()
--- a/GPy/testing/test_minibatch.py
+++ b/GPy/testing/test_minibatch.py
@ -0,0 +1,416 @@
+"""
+Created on 4 Sep 2015
+
+@author: maxz
+"""
+import pytest
+import numpy as np
+import GPy
+
+try:
+    import climin
+except ImportError:
+    climin = None
+
+
+class TestBGPLVM:
+    def setup(self):
+        np.random.seed(12345)
+        X, W = np.random.normal(0, 1, (100, 6)), np.random.normal(0, 1, (6, 13))
+        Y = X.dot(W) + np.random.normal(0, 0.1, (X.shape[0], W.shape[1]))
+        self.inan = np.random.binomial(1, 0.1, Y.shape).astype(bool)
+        self.X, self.W, self.Y = X, W, Y
+        self.Q = 3
+        self.m_full = GPy.models.BayesianGPLVM(Y, self.Q)
+
+    def test_lik_comparisons_m1_s0(self):
+        self.setup()
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y, self.Q, missing_data=True, stochastic=False
+        )
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert m.checkgrad()
+
+    def test_predict_missing_data(self):
+        self.setup()
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            missing_data=True,
+            stochastic=True,
+            batchsize=self.Y.shape[1],
+        )
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+
+        with pytest.raises(NotImplementedError):
+            m.predict(m.X, full_cov=True)
+
+        mu1, var1 = m.predict(m.X, full_cov=False)
+        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
+        np.testing.assert_allclose(mu1, mu2)
+        np.testing.assert_allclose(var1, var2)
+
+        mu1, var1 = m.predict(m.X.mean, full_cov=True)
+        mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=True)
+        np.testing.assert_allclose(mu1, mu2)
+        np.testing.assert_allclose(var1[:, :, 0], var2)
+
+        mu1, var1 = m.predict(m.X.mean, full_cov=False)
+        mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=False)
+        np.testing.assert_allclose(mu1, mu2)
+        np.testing.assert_allclose(var1[:, [0]], var2)
+
+    def test_lik_comparisons_m0_s0(self):
+        self.setup()
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            X_variance=self.m_full.X.variance.values,
+            missing_data=False,
+            stochastic=False,
+        )
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert m.checkgrad()
+
+    def test_lik_comparisons_m1_s1(self):
+        self.setup()
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            missing_data=True,
+            stochastic=True,
+            batchsize=self.Y.shape[1],
+        )
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert m.checkgrad()
+
+    def test_lik_comparisons_m0_s1(self):
+        self.setup()
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            missing_data=False,
+            stochastic=True,
+            batchsize=self.Y.shape[1],
+        )
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert m.checkgrad()
+
+    def test_gradients_missingdata(self):
+        self.setup()
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            missing_data=True,
+            stochastic=False,
+            batchsize=self.Y.shape[1],
+        )
+        assert m.checkgrad()
+
+    def test_gradients_missingdata_stochastics(self):
+        self.setup()
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y, self.Q, missing_data=True, stochastic=True, batchsize=1
+        )
+        assert m.checkgrad()
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y, self.Q, missing_data=True, stochastic=True, batchsize=4
+        )
+        assert m.checkgrad()
+
+    def test_gradients_stochastics(self):
+        self.setup()
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y, self.Q, missing_data=False, stochastic=True, batchsize=1
+        )
+        assert m.checkgrad()
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y, self.Q, missing_data=False, stochastic=True, batchsize=4
+        )
+        assert m.checkgrad()
+
+    def test_predict(self):
+        self.setup()
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            missing_data=True,
+            stochastic=True,
+            batchsize=self.Y.shape[1],
+        )
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert m.checkgrad()
+
+
+class TestSparseGPMinibatch:
+    def setup(self):
+        np.random.seed(12345)
+        X, W = np.random.normal(0, 1, (100, 6)), np.random.normal(0, 1, (6, 13))
+        Y = X.dot(W) + np.random.normal(0, 0.1, (X.shape[0], W.shape[1]))
+        self.inan = np.random.binomial(1, 0.1, Y.shape).astype(bool)
+        self.X, self.W, self.Y = X, W, Y
+        self.Q = 3
+        self.m_full = GPy.models.SparseGPLVM(
+            Y, self.Q, kernel=GPy.kern.RBF(self.Q, ARD=True)
+        )
+
+    def test_lik_comparisons_m1_s0(self):
+        self.setup()
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y, self.Q, X_variance=False, missing_data=True, stochastic=False
+        )
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert m.checkgrad()
+
+    @pytest.mark.skipif(climin is None, reason="climin not installed")
+    def test_sparsegp_init(self):
+        self.setup()
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        np.random.seed(1234)
+        Z = self.X[np.random.choice(self.X.shape[0], replace=False, size=10)].copy()
+        Q = Z.shape[1]
+        m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(
+            self.X,
+            self.Y,
+            Z,
+            GPy.kern.RBF(Q) + GPy.kern.Matern32(Q) + GPy.kern.Bias(Q),
+            GPy.likelihoods.Gaussian(),
+            missing_data=True,
+            stochastic=False,
+        )
+        assert m.checkgrad()
+        m.optimize("adadelta", max_iters=10)
+        assert m.checkgrad()
+
+        m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(
+            self.X,
+            self.Y,
+            Z,
+            GPy.kern.RBF(Q) + GPy.kern.Matern32(Q) + GPy.kern.Bias(Q),
+            GPy.likelihoods.Gaussian(),
+            missing_data=True,
+            stochastic=True,
+        )
+        assert m.checkgrad()
+        m.optimize("rprop", max_iters=10)
+        assert m.checkgrad()
+
+        m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(
+            self.X,
+            self.Y,
+            Z,
+            GPy.kern.RBF(Q) + GPy.kern.Matern32(Q) + GPy.kern.Bias(Q),
+            GPy.likelihoods.Gaussian(),
+            missing_data=False,
+            stochastic=False,
+        )
+        assert m.checkgrad()
+        m.optimize("rprop", max_iters=10)
+        assert m.checkgrad()
+
+        m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(
+            self.X,
+            self.Y,
+            Z,
+            GPy.kern.RBF(Q) + GPy.kern.Matern32(Q) + GPy.kern.Bias(Q),
+            GPy.likelihoods.Gaussian(),
+            missing_data=False,
+            stochastic=True,
+        )
+        assert m.checkgrad()
+        m.optimize("adadelta", max_iters=10)
+        assert m.checkgrad()
+
+    def test_predict_missing_data(self):
+        self.setup()
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            X_variance=False,
+            missing_data=True,
+            stochastic=True,
+            batchsize=self.Y.shape[1],
+        )
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+
+        mu1, var1 = m.predict(m.X, full_cov=False)
+        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
+        np.testing.assert_allclose(mu1, mu2)
+        for i in range(var1.shape[1]):
+            np.testing.assert_allclose(var1[:, [i]], var2)
+
+        mu1, var1 = m.predict(m.X, full_cov=True)
+        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=True)
+        np.testing.assert_allclose(mu1, mu2)
+        for i in range(var1.shape[2]):
+            np.testing.assert_allclose(var1[:, :, i], var2)
+
+    def test_lik_comparisons_m0_s0(self):
+        self.setup()
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y, self.Q, X_variance=False, missing_data=False, stochastic=False
+        )
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert m.checkgrad()
+
+    def test_lik_comparisons_m1_s1(self):
+        self.setup()
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            X_variance=False,
+            missing_data=True,
+            stochastic=True,
+            batchsize=self.Y.shape[1],
+        )
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert m.checkgrad()
+
+    def test_lik_comparisons_m0_s1(self):
+        self.setup()
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            X_variance=False,
+            missing_data=False,
+            stochastic=True,
+            batchsize=self.Y.shape[1],
+        )
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert m.checkgrad()
+
+    def test_gradients_missingdata(self):
+        self.setup()
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            X_variance=False,
+            missing_data=True,
+            stochastic=False,
+            batchsize=self.Y.shape[1],
+        )
+        assert m.checkgrad()
+
+    def test_gradients_missingdata_stochastics(self):
+        self.setup()
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            X_variance=False,
+            missing_data=True,
+            stochastic=True,
+            batchsize=1,
+        )
+        assert m.checkgrad()
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            X_variance=False,
+            missing_data=True,
+            stochastic=True,
+            batchsize=4,
+        )
+        assert m.checkgrad()
+
+    def test_gradients_stochastics(self):
+        self.setup()
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            X_variance=False,
+            missing_data=False,
+            stochastic=True,
+            batchsize=1,
+        )
+        assert m.checkgrad()
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            X_variance=False,
+            missing_data=False,
+            stochastic=True,
+            batchsize=4,
+        )
+        assert m.checkgrad()
+
+    def test_predict(self):
+        self.setup()
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
+            self.Y,
+            self.Q,
+            X_variance=False,
+            missing_data=True,
+            stochastic=True,
+            batchsize=self.Y.shape[1],
+        )
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(
+            m.log_likelihood(), self.m_full.log_likelihood(), 7
+        )
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert m.checkgrad()
--- a/GPy/testing/misc_tests.py
+++ b/GPy/testing/misc_tests.py
@ -1,27 +1,28 @@
-from __future__ import print_function
 import numpy as np
-import scipy as sp
 import GPy
 import warnings

-class MiscTests(np.testing.TestCase):
+
+class TestMisc:
    """
    Testing some utilities of misc
    """
-    def setUp(self):
+
+    def setup(self):
        self._lim_val = np.finfo(np.float64).max
        self._lim_val_exp = np.log(self._lim_val)

    def test_safe_exp_upper(self):
+        self.setup()
        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter('always')  # always print
+            warnings.simplefilter("always")  # always print
            assert np.isfinite(np.exp(self._lim_val_exp))
            assert np.isinf(np.exp(self._lim_val_exp + 1))
            assert np.isfinite(GPy.util.misc.safe_exp(self._lim_val_exp + 1))

            print(w)
            print(len(w))
-            assert len(w)<=1 # should have one overflow warning
+            assert len(w) <= 1  # should have one overflow warning

    def test_safe_exp_lower(self):
        assert GPy.util.misc.safe_exp(1e-10) < np.inf
--- a/GPy/testing/test_model.py
+++ b/GPy/testing/test_model.py
--- a/GPy/testing/pep_tests.py
+++ b/GPy/testing/pep_tests.py
@ -1,94 +1,98 @@
 # Copyright (c) 2014, James Hensman, 2016, Thang Bui
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import unittest
 import numpy as np
 import GPy

-class PEPgradienttest(unittest.TestCase):
-    def setUp(self):
+
+class TestPEPgradient:
+    def setup(self):
        ######################################
        # # 1 dimensional example
        np.random.seed(10)

        N = 20
        # sample inputs and outputs
-        self.X1D = np.random.uniform(-3., 3., (N, 1))
+        self.X1D = np.random.uniform(-3.0, 3.0, (N, 1))
        self.Y1D = np.sin(self.X1D) + np.random.randn(N, 1) * 0.05

        ######################################
        # # 2 dimensional example

        # sample inputs and outputs
-        self.X2D = np.random.uniform(-3., 3., (N, 2))
-        self.Y2D = np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2]) + np.random.randn(N, 1) * 0.05
+        self.X2D = np.random.uniform(-3.0, 3.0, (N, 2))
+        self.Y2D = (
+            np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2])
+            + np.random.randn(N, 1) * 0.05
+        )

        #######################################
        # # more datapoints, check in alpha limits, the log marginal likelihood
        # # is consistent with FITC and VFE/Var_DTC
        M = 5
        np.random.seed(42)
-        self.X1 = np.c_[np.linspace(-1., 1., N)]
+        self.X1 = np.c_[np.linspace(-1.0, 1.0, N)]
        self.Y1 = np.sin(self.X1) + np.random.randn(N, 1) * 0.05
        self.kernel = GPy.kern.RBF(input_dim=1, lengthscale=0.5, variance=1)
        self.Z = np.random.uniform(-1, 1, (M, 1))
        self.lik_noise_var = 0.01

    def test_pep_1d_gradients(self):
+        self.setup()
        m = GPy.models.SparseGPRegression(self.X1D, self.Y1D)
-        m.inference_method = GPy.inference.latent_function_inference.PEP(alpha=np.random.rand())
-        self.assertTrue(m.checkgrad())
+        m.inference_method = GPy.inference.latent_function_inference.PEP(
+            alpha=np.random.rand()
+        )
+        assert m.checkgrad()

    def test_pep_2d_gradients(self):
+        self.setup()
        m = GPy.models.SparseGPRegression(self.X2D, self.Y2D)
-        m.inference_method = GPy.inference.latent_function_inference.PEP(alpha=np.random.rand())
-        self.assertTrue(m.checkgrad())
+        m.inference_method = GPy.inference.latent_function_inference.PEP(
+            alpha=np.random.rand()
+        )
+        assert m.checkgrad()

    def test_pep_vfe_consistency(self):
+        self.setup()
        vfe_model = GPy.models.SparseGPRegression(
-            self.X1, 
-            self.Y1, 
-            kernel=self.kernel, 
-            Z=self.Z
+            self.X1, self.Y1, kernel=self.kernel, Z=self.Z
        )
        vfe_model.inference_method = GPy.inference.latent_function_inference.VarDTC()
        vfe_model.Gaussian_noise.variance = self.lik_noise_var
        vfe_lml = vfe_model.log_likelihood()

        pep_model = GPy.models.SparseGPRegression(
-            self.X1, 
-            self.Y1, 
-            kernel=self.kernel, 
-            Z=self.Z
+            self.X1, self.Y1, kernel=self.kernel, Z=self.Z
+        )
+        pep_model.inference_method = GPy.inference.latent_function_inference.PEP(
+            alpha=1e-5
        )
-        pep_model.inference_method = GPy.inference.latent_function_inference.PEP(alpha=1e-5)
        pep_model.Gaussian_noise.variance = self.lik_noise_var
        pep_lml = pep_model.log_likelihood()

-        self.assertAlmostEqual(vfe_lml[0, 0], pep_lml[0], delta=abs(0.01*pep_lml[0]))
+        np.testing.assert_almost_equal(
+            vfe_lml[0, 0], pep_lml[0], decimal=abs(0.01 * pep_lml[0])
+        )

    def test_pep_fitc_consistency(self):
+        self.setup()
        fitc_model = GPy.models.SparseGPRegression(
-            self.X1D, 
-            self.Y1D, 
-            kernel=self.kernel, 
-            Z=self.Z
+            self.X1D, self.Y1D, kernel=self.kernel, Z=self.Z
        )
        fitc_model.inference_method = GPy.inference.latent_function_inference.FITC()
        fitc_model.Gaussian_noise.variance = self.lik_noise_var
        fitc_lml = fitc_model.log_likelihood()

        pep_model = GPy.models.SparseGPRegression(
-            self.X1D, 
-            self.Y1D, 
-            kernel=self.kernel, 
-            Z=self.Z
+            self.X1D, self.Y1D, kernel=self.kernel, Z=self.Z
+        )
+        pep_model.inference_method = GPy.inference.latent_function_inference.PEP(
+            alpha=1
        )
-        pep_model.inference_method = GPy.inference.latent_function_inference.PEP(alpha=1)
        pep_model.Gaussian_noise.variance = self.lik_noise_var
        pep_lml = pep_model.log_likelihood()

-        self.assertAlmostEqual(fitc_lml, pep_lml[0], delta=abs(0.001*pep_lml[0]))
-
-
-
+        np.testing.assert_almost_equal(
+            fitc_lml, pep_lml[0], decimal=abs(0.001 * pep_lml[0])
+        )
--- a/GPy/testing/test_pickle.py
+++ b/GPy/testing/test_pickle.py
@ -0,0 +1,133 @@
+"""
+Created on 13 Mar 2014
+
+@author: maxz
+"""
+# import cPickle as pickle
+import pickle
+import pytest
+import numpy as np
+import tempfile
+from GPy.examples.dimensionality_reduction import mrd_simulation
+from GPy.core.parameterization.variational import NormalPosterior
+from GPy.models.gp_regression import GPRegression
+import GPy
+
+
+def toy_model():
+    X = np.linspace(0, 1, 50)[:, None]
+    Y = np.sin(X)
+    m = GPRegression(X=X, Y=Y)
+    return m
+
+
+class ListDictTestCase:
+    def assertListDictEquals(self, d1, d2, msg=None):
+        # py3 fix
+        # for k,v in d1.iteritems():
+        for k, v in d1.items():
+            self.assertListEqual(list(v), list(d2[k]), msg)
+
+    def assertArrayListEquals(self, l1, l2):
+        for a1, a2 in zip(l1, l2):
+            np.testing.assert_array_equal(a1, a2)
+
+
+class TestPickleSupport(ListDictTestCase):
+    @pytest.mark.skip(reason="")  # why is this test skipped?
+    def test_load_pickle(self):
+        import os
+
+        m = GPy.load(
+            os.path.join(
+                os.path.abspath(os.path.split(__file__)[0]), "pickle_test.pickle"
+            )
+        )
+        assert m.checkgrad()
+        assert m.log_likelihood(), -4.7351019830022087
+
+    def test_model(self):
+        par = toy_model()
+        pcopy = par.copy()
+        assert par.param_array.tolist() == pcopy.param_array.tolist()
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
+        assert str(par) == str(pcopy)
+        assert np.all(par.param_array == pcopy.param_array)
+        assert np.all(par.gradient_full == pcopy.gradient_full)
+        assert pcopy.checkgrad()
+        assert np.any(pcopy.gradient != 0.0)
+        with tempfile.TemporaryFile("w+b") as f:
+            par.pickle(f)
+            f.seek(0)
+            pcopy = pickle.load(f)
+        assert par.param_array.tolist() == pcopy.param_array.tolist()
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
+        assert str(par) == str(pcopy)
+        assert pcopy.checkgrad()
+
+    def test_modelrecreation(self):
+        par = toy_model()
+        pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
+        np.testing.assert_allclose(par.param_array, pcopy.param_array)
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
+        assert str(par) == str(pcopy)
+        assert np.all(par.param_array == pcopy.param_array)
+        assert np.all(par.gradient_full == pcopy.gradient_full)
+        assert pcopy.checkgrad()
+        assert np.any(pcopy.gradient != 0.0)
+        np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)
+        par.randomize()
+        with tempfile.TemporaryFile("w+b") as f:
+            par.pickle(f)
+            f.seek(0)
+            pcopy = pickle.load(f)
+        np.testing.assert_allclose(par.param_array, pcopy.param_array)
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full, atol=1e-6)
+        assert str(par) == str(pcopy)
+        assert pcopy.checkgrad()
+
+    def test_posterior(self):
+        X = np.random.randn(3, 5)
+        Xv = np.random.rand(*X.shape)
+        par = NormalPosterior(X, Xv)
+        par.gradient = 10
+        pcopy = par.copy()
+        pcopy.gradient = 10
+        assert par.param_array.tolist() == pcopy.param_array.tolist()
+        assert par.gradient_full.tolist() == pcopy.gradient_full.tolist()
+        assert str(par) == str(pcopy)
+        assert np.all(par.param_array == pcopy.param_array)
+        assert np.all(par.gradient_full == pcopy.gradient_full)
+        with tempfile.TemporaryFile("w+b") as f:
+            par.pickle(f)
+            f.seek(0)
+            pcopy = pickle.load(f)
+        assert par.param_array.tolist() == pcopy.param_array.tolist()
+        pcopy.gradient = 10
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
+        np.testing.assert_allclose(pcopy.mean.gradient_full, 10)
+        assert str(par) == str(pcopy)
+
+    def test_model_concat(self):
+        par = mrd_simulation(optimize=0, plot=0, plot_sim=0)
+        par.randomize()
+        pcopy = par.copy()
+        assert par.param_array.tolist() == pcopy.param_array.tolist()
+        assert par.gradient_full.tolist() == pcopy.gradient_full.tolist()
+        assert str(par) == str(pcopy)
+        assert np.all(par.param_array == pcopy.param_array)
+        assert np.all(par.gradient_full == pcopy.gradient_full)
+        assert par.checkgrad()
+        assert pcopy.checkgrad()
+        assert np.any(pcopy.gradient != 0.0)
+        with tempfile.TemporaryFile("w+b") as f:
+            par.pickle(f)
+            f.seek(0)
+            pcopy = pickle.load(f)
+        assert par.param_array.tolist() == pcopy.param_array.tolist()
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
+        assert str(par) == str(pcopy)
+        assert pcopy.checkgrad()
+
+    def _callback(self, what, which):
+        what.count += 1
--- a/GPy/testing/test_plotting.py
+++ b/GPy/testing/test_plotting.py
@ -0,0 +1,703 @@
+# ===============================================================================
+# Copyright (c) 2015, Max Zwiessele
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of GPy nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ===============================================================================
+
+
+# ===============================================================================
+# SKIPPING PLOTTING BECAUSE IT BEHAVES DIFFERENTLY ON DIFFERENT
+# SYSTEMS, AND WILL MISBEHAVE
+
+# raise SkipTest("Skipping Matplotlib testing")
+# ===============================================================================
+
+try:
+    import matplotlib
+    from matplotlib import pyplot as plt
+    from matplotlib.testing.compare import compare_images
+
+    matplotlib.use("agg")
+except ImportError:
+    # matplotlib not installed
+    matplotlib = None
+
+import pytest
+import numpy as np
+import GPy, os
+import logging
+
+from GPy.util.config import config
+from GPy.plotting import change_plotting_library, plotting_library
+
+
+class TestConfig:
+    def teardown(self):
+        change_plotting_library("matplotlib")
+
+    @pytest.mark.skipif(matplotlib is None, reason="Matplotlib not installed")
+    def test_change_plotting(self):
+        with pytest.raises(ValueError):
+            change_plotting_library("not+in9names")
+        change_plotting_library("none")
+        with pytest.raises(RuntimeError):
+            plotting_library()
+        self.teardown()
+
+
+change_plotting_library("matplotlib")
+
+extensions = ["npz"]
+
+basedir = os.path.dirname(os.path.relpath(os.path.abspath(__file__)))
+
+
+def _image_directories():
+    """
+    Compute the baseline and result image directories for testing *func*.
+    Create the result directory if it doesn't exist.
+    """
+    # module_name = __init__.__module__
+    # mods = module_name.split('.')
+    # basedir = os.path.join(*mods)
+    result_dir = os.path.join(basedir, "testresult", ".")
+    baseline_dir = os.path.join(basedir, "baseline", ".")
+    if not os.path.exists(result_dir):
+        os.makedirs(result_dir)
+    return baseline_dir, result_dir
+
+
+baseline_dir, result_dir = _image_directories()
+if not os.path.exists(baseline_dir):
+    baseline_dir = None
+
+
+def _image_comparison(
+    baseline_images, extensions=["pdf", "svg", "png"], tol=11, rtol=1e-3, **kwargs
+):
+    for num, base in zip(plt.get_fignums(), baseline_images):
+        for ext in extensions:
+            fig = plt.figure(num)
+            try:
+                fig.canvas.draw()
+            except Exception as e:
+                logging.error(base)
+                # raise SkipTest(e)
+            # fig.axes[0].set_axis_off()
+            # fig.set_frameon(False)
+            if ext in ["npz"]:
+                figdict = flatten_axis(fig)
+                np.savez_compressed(
+                    os.path.join(result_dir, "{}.{}".format(base, ext)), **figdict
+                )
+                try:
+                    fig.savefig(
+                        os.path.join(result_dir, "{}.{}".format(base, "png")),
+                        transparent=True,
+                        edgecolor="none",
+                        facecolor="none",
+                        # bbox='tight'
+                    )
+                except:
+                    logging.error(base)
+                    # raise
+            else:
+                fig.savefig(
+                    os.path.join(result_dir, "{}.{}".format(base, ext)),
+                    transparent=True,
+                    edgecolor="none",
+                    facecolor="none",
+                    # bbox='tight'
+                )
+    for num, base in zip(plt.get_fignums(), baseline_images):
+        for ext in extensions:
+            # plt.close(num)
+            actual = os.path.join(result_dir, "{}.{}".format(base, ext))
+            expected = os.path.join(baseline_dir, "{}.{}".format(base, ext))
+            if ext == "npz":
+
+                def do_test():
+                    with pytest.skip:
+                        if not os.path.exists(expected):
+                            import shutil
+
+                            shutil.copy2(actual, expected)
+                            # shutil.copy2(os.path.join(result_dir, "{}.{}".format(base, 'png')), os.path.join(baseline_dir, "{}.{}".format(base, 'png')))
+                            raise IOError(
+                                "Baseline file {} not found, copying result {}".format(
+                                    expected, actual
+                                )
+                            )
+                        else:
+                            exp_dict = dict(np.load(expected).items())
+                            act_dict = dict(np.load(actual).items())
+                            for name in act_dict:
+                                if name in exp_dict:
+                                    try:
+                                        np.testing.assert_allclose(
+                                            exp_dict[name],
+                                            act_dict[name],
+                                            err_msg="Mismatch in {}.{}".format(
+                                                base, name
+                                            ),
+                                            rtol=rtol,
+                                            **kwargs
+                                        )
+                                    except AssertionError as e:
+                                        pass
+
+            else:
+
+                def do_test():
+                    err = compare_images(expected, actual, tol, in_decorator=True)
+                    if err:
+                        print(
+                            "Error between {} and {} is {:.5f}, which is bigger then the tolerance of {:.5f}".format(
+                                actual, expected, err["rms"], tol
+                            )
+                        )
+                        pass
+
+            yield do_test
+    plt.close("all")
+
+
+def flatten_axis(ax, prevname=""):
+    import inspect
+
+    members = inspect.getmembers(ax)
+
+    arrays = {}
+
+    def _flatten(l, pre):
+        arr = {}
+        if isinstance(l, np.ndarray):
+            if l.size:
+                arr[pre] = np.asarray(l)
+        elif isinstance(l, dict):
+            for _n in l:
+                _tmp = _flatten(l, pre + "." + _n + ".")
+                for _nt in _tmp.keys():
+                    arrays[_nt] = _tmp[_nt]
+        elif isinstance(l, list) and len(l) > 0:
+            for i in range(len(l)):
+                _tmp = _flatten(l[i], pre + "[{}]".format(i))
+                for _n in _tmp:
+                    arr["{}".format(_n)] = _tmp[_n]
+        else:
+            return flatten_axis(l, pre + ".")
+        return arr
+
+    for name, l in members:
+        if isinstance(l, np.ndarray):
+            arrays[prevname + name] = np.asarray(l)
+        elif isinstance(l, list) and len(l) > 0:
+            for i in range(len(l)):
+                _tmp = _flatten(l[i], prevname + name + "[{}]".format(i))
+                for _n in _tmp:
+                    arrays["{}".format(_n)] = _tmp[_n]
+
+    return arrays
+
+
+def _a(x, y, decimal):
+    np.testing.assert_array_almost_equal(x, y, decimal)
+
+
+def compare_axis_dicts(x, y, decimal=6):
+    try:
+        assert len(x) == len(y)
+        for name in x:
+            _a(x[name], y[name], decimal)
+    except AssertionError as e:
+        print(e.message)
+        pass
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_figure():
+    np.random.seed(1239847)
+    from GPy.plotting import plotting_library as pl
+
+    # import matplotlib
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+
+        ax, _ = pl().new_canvas(num="imshow_interact")
+
+        def test_func(x):
+            return x[:, 0].reshape(3, 3)
+
+        pl().imshow_interact(ax, test_func, extent=(-1, 1, -1, 1), resolution=3)
+
+        ax, _ = pl().new_canvas()
+
+        def test_func_2(x):
+            y = x[:, 0].reshape(3, 3)
+            anno = np.argmax(x, axis=1).reshape(3, 3)
+            return y, anno
+
+        pl().annotation_heatmap_interact(
+            ax, test_func_2, extent=(-1, 1, -1, 1), resolution=3
+        )
+        pl().annotation_heatmap_interact(
+            ax,
+            test_func_2,
+            extent=(-1, 1, -1, 1),
+            resolution=3,
+            imshow_kwargs=dict(interpolation="nearest"),
+        )
+
+        ax, _ = pl().new_canvas(figsize=(4, 3))
+        x = np.linspace(0, 1, 100)
+        y = [0, 1, 2]
+        array = np.array([0.4, 0.5])
+        cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
+            "WhToColor", ("r", "b"), N=array.size
+        )
+
+        pl().fill_gradient(ax, x, y, facecolors=["r", "g"], array=array, cmap=cmap)
+
+        ax, _ = pl().new_canvas(
+            num="3d_plot",
+            figsize=(4, 3),
+            projection="3d",
+            xlabel="x",
+            ylabel="y",
+            zlabel="z",
+            title="awsome title",
+            xlim=(-1, 1),
+            ylim=(-1, 1),
+            zlim=(-3, 3),
+        )
+        z = 2 - np.abs(np.linspace(-2, 2, (100))) + 1
+        x, y = z * np.sin(np.linspace(-2 * np.pi, 2 * np.pi, (100))), z * np.cos(
+            np.linspace(-np.pi, np.pi, (100))
+        )
+
+        pl().plot(ax, x, y, z, linewidth=2)
+
+        for do_test in _image_comparison(
+            baseline_images=[
+                "coverage_{}".format(sub)
+                for sub in [
+                    "imshow_interact",
+                    "annotation_interact",
+                    "gradient",
+                    "3d_plot",
+                ]
+            ],
+            extensions=extensions,
+        ):
+            yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_kernel():
+    np.random.seed(1239847)
+    # import matplotlib
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        k = GPy.kern.RBF(5, ARD=True) * GPy.kern.Linear(
+            3, active_dims=[0, 2, 4], ARD=True
+        ) + GPy.kern.Bias(2)
+        k.randomize()
+        k2 = (
+            GPy.kern.RBF(5, ARD=True)
+            * GPy.kern.Linear(3, active_dims=[0, 2, 4], ARD=True)
+            + GPy.kern.Bias(2)
+            + GPy.kern.White(4)
+        )
+        k2[:-1] = k[:]
+        k2.plot_ARD(["rbf", "linear", "bias"], legend=True)
+        k2.plot_covariance(visible_dims=[0, 3], plot_limits=(-1, 3))
+        k2.plot_covariance(visible_dims=[2], plot_limits=(-1, 3))
+        k2.plot_covariance(
+            visible_dims=[2, 4],
+            plot_limits=((-1, 0), (5, 3)),
+            projection="3d",
+            rstride=10,
+            cstride=10,
+        )
+        k2.plot_covariance(visible_dims=[1, 4])
+        for do_test in _image_comparison(
+            baseline_images=[
+                "kern_{}".format(sub)
+                for sub in ["ARD", "cov_2d", "cov_1d", "cov_3d", "cov_no_lim"]
+            ],
+            extensions=extensions,
+        ):
+            yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_plot():
+    np.random.seed(111)
+    import matplotlib
+
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        X = np.random.uniform(-2, 2, (40, 1))
+        f = 0.2 * np.sin(1.3 * X) + 1.3 * np.cos(2 * X)
+        Y = f + np.random.normal(0, 0.1, f.shape)
+        m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X) * [0.06])
+        # m.optimize()
+        m.plot_data()
+        m.plot_mean()
+        m.plot_confidence()
+        m.plot_density()
+        m.plot_errorbars_trainset()
+        m.plot_samples()
+        m.plot_data_error()
+    for do_test in _image_comparison(
+        baseline_images=[
+            "gp_{}".format(sub)
+            for sub in [
+                "data",
+                "mean",
+                "conf",
+                "density",
+                "out_error",
+                "samples",
+                "in_error",
+            ]
+        ],
+        extensions=extensions,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_twod():
+    np.random.seed(11111)
+    import matplotlib
+
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    X = np.random.uniform(-2, 2, (40, 2))
+    f = 0.2 * np.sin(1.3 * X[:, [0]]) + 1.3 * np.cos(2 * X[:, [1]])
+    Y = f + np.random.normal(0, 0.1, f.shape)
+    m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X) * [0.01, 0.2])
+    # m.optimize()
+    m.plot_data()
+    m.plot_mean()
+    m.plot_inducing(legend=False, marker="s")
+    # m.plot_errorbars_trainset()
+    m.plot_data_error()
+    for do_test in _image_comparison(
+        baseline_images=[
+            "gp_2d_{}".format(sub)
+            for sub in [
+                "data",
+                "mean",
+                "inducing",
+                #'out_error',
+                "in_error",
+            ]
+        ],
+        extensions=extensions,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_threed():
+    np.random.seed(11111)
+    import matplotlib
+
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    X = np.random.uniform(-2, 2, (40, 2))
+    f = 0.2 * np.sin(1.3 * X[:, [0]]) + 1.3 * np.cos(2 * X[:, [1]])
+    Y = f + np.random.normal(0, 0.1, f.shape)
+    m = GPy.models.SparseGPRegression(X, Y)
+    m.likelihood.variance = 0.1
+    # m.optimize()
+    m.plot_samples(projection="3d", samples=1)
+    m.plot_samples(projection="3d", plot_raw=False, samples=1)
+    plt.close("all")
+    m.plot_data(projection="3d")
+    m.plot_mean(projection="3d", rstride=10, cstride=10)
+    m.plot_inducing(projection="3d")
+    # m.plot_errorbars_trainset(projection='3d')
+    for do_test in _image_comparison(
+        baseline_images=[
+            "gp_3d_{}".format(sub)
+            for sub in [
+                "data",
+                "mean",
+                "inducing",
+            ]
+        ],
+        extensions=extensions,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_sparse():
+    np.random.seed(11111)
+    import matplotlib
+
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    X = np.random.uniform(-2, 2, (40, 1))
+    f = 0.2 * np.sin(1.3 * X) + 1.3 * np.cos(2 * X)
+    Y = f + np.random.normal(0, 0.1, f.shape)
+    m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X) * 0.1)
+    # m.optimize()
+    # m.plot_inducing()
+    _, ax = plt.subplots()
+    m.plot_data(ax=ax)
+    m.plot_data_error(ax=ax)
+    for do_test in _image_comparison(
+        baseline_images=["sparse_gp_{}".format(sub) for sub in ["data_error"]],
+        extensions=extensions,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_classification():
+    np.random.seed(11111)
+    import matplotlib
+
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    X = np.random.uniform(-2, 2, (40, 1))
+    f = 0.2 * np.sin(1.3 * X) + 1.3 * np.cos(2 * X)
+    Y = f + np.random.normal(0, 0.1, f.shape)
+    m = GPy.models.GPClassification(X, Y > Y.mean())
+    # m.optimize()
+    _, ax = plt.subplots()
+    m.plot(plot_raw=False, apply_link=False, ax=ax, samples=3)
+    m.plot_errorbars_trainset(plot_raw=False, apply_link=False, ax=ax)
+    _, ax = plt.subplots()
+    m.plot(plot_raw=True, apply_link=False, ax=ax, samples=3)
+    m.plot_errorbars_trainset(plot_raw=True, apply_link=False, ax=ax)
+    _, ax = plt.subplots()
+    m.plot(plot_raw=True, apply_link=True, ax=ax, samples=3)
+    m.plot_errorbars_trainset(plot_raw=True, apply_link=True, ax=ax)
+    for do_test in _image_comparison(
+        baseline_images=[
+            "gp_class_{}".format(sub) for sub in ["likelihood", "raw", "raw_link"]
+        ],
+        extensions=extensions,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_sparse_classification():
+    np.random.seed(11111)
+    import matplotlib
+
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    X = np.random.uniform(-2, 2, (40, 1))
+    f = 0.2 * np.sin(1.3 * X) + 1.3 * np.cos(2 * X)
+    Y = f + np.random.normal(0, 0.1, f.shape)
+    m = GPy.models.SparseGPClassification(X, Y > Y.mean())
+    # m.optimize()
+    m.plot(plot_raw=False, apply_link=False, samples_likelihood=3)
+    np.random.seed(111)
+    m.plot(plot_raw=True, apply_link=False, samples=3)
+    np.random.seed(111)
+    m.plot(plot_raw=True, apply_link=True, samples=3)
+    for do_test in _image_comparison(
+        baseline_images=[
+            "sparse_gp_class_{}".format(sub)
+            for sub in ["likelihood", "raw", "raw_link"]
+        ],
+        extensions=extensions,
+        rtol=2,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_gplvm():
+    from GPy.models import GPLVM
+
+    np.random.seed(12345)
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    # Q = 3
+    # Define dataset
+    # N = 60
+    # k1 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,10,10,0.1,0.1]), ARD=True)
+    # k2 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,0.1,10,0.1,10]), ARD=True)
+    # k3 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[0.1,0.1,10,10,10]), ARD=True)
+    # X = np.random.normal(0, 1, (N, 5))
+    # A = np.random.multivariate_normal(np.zeros(N), k1.K(X), Q).T
+    # B = np.random.multivariate_normal(np.zeros(N), k2.K(X), Q).T
+    # C = np.random.multivariate_normal(np.zeros(N), k3.K(X), Q).T
+    # Y = np.vstack((A,B,C))
+    # labels = np.hstack((np.zeros(A.shape[0]), np.ones(B.shape[0]), np.ones(C.shape[0])*2))
+
+    # k = RBF(Q, ARD=True, lengthscale=2)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
+    pars = np.load(os.path.join(basedir, "b-gplvm-save.npz"))
+    Y = pars["Y"]
+    Q = pars["Q"]
+    labels = pars["labels"]
+
+    import warnings
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")  # always print
+        m = GPLVM(Y, Q, initialize=False)
+    m.update_model(False)
+    m.initialize_parameter()
+    m[:] = pars["gplvm_p"]
+    m.update_model(True)
+
+    # m.optimize(messages=0)
+    np.random.seed(111)
+    m.plot_latent(labels=labels)
+    np.random.seed(111)
+    m.plot_scatter(projection="3d", labels=labels)
+    np.random.seed(111)
+    m.plot_magnification(labels=labels)
+    m.plot_steepest_gradient_map(resolution=10, data_labels=labels)
+    for do_test in _image_comparison(
+        baseline_images=[
+            "gplvm_{}".format(sub)
+            for sub in ["latent", "latent_3d", "magnification", "gradient"]
+        ],
+        extensions=extensions,
+        tol=12,
+    ):
+        yield (do_test,)
+
+
+@pytest.mark.skipif(
+    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
+)
+def test_bayesian_gplvm():
+    from ..models import BayesianGPLVM
+
+    np.random.seed(12345)
+    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
+    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
+    matplotlib.rcParams["text.usetex"] = False
+    # Q = 3
+    # Define dataset
+    # N = 10
+    # k1 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,10,10,0.1,0.1]), ARD=True)
+    # k2 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,0.1,10,0.1,10]), ARD=True)
+    # k3 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[0.1,0.1,10,10,10]), ARD=True)
+    # X = np.random.normal(0, 1, (N, 5))
+    # A = np.random.multivariate_normal(np.zeros(N), k1.K(X), Q).T
+    # B = np.random.multivariate_normal(np.zeros(N), k2.K(X), Q).T
+    # C = np.random.multivariate_normal(np.zeros(N), k3.K(X), Q).T
+
+    # Y = np.vstack((A,B,C))
+    # labels = np.hstack((np.zeros(A.shape[0]), np.ones(B.shape[0]), np.ones(C.shape[0])*2))
+
+    # k = RBF(Q, ARD=True, lengthscale=2)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
+    pars = np.load(os.path.join(basedir, "b-gplvm-save.npz"))
+    Y = pars["Y"]
+    Q = pars["Q"]
+    labels = pars["labels"]
+
+    import warnings
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")  # always print
+        m = BayesianGPLVM(Y, Q, initialize=False)
+    m.update_model(False)
+    m.initialize_parameter()
+    m[:] = pars["bgplvm_p"]
+    m.update_model(True)
+
+    # m.optimize(messages=0)
+    np.random.seed(111)
+    m.plot_inducing(projection="2d")
+    np.random.seed(111)
+    m.plot_inducing(projection="3d")
+    np.random.seed(111)
+    m.plot_latent(projection="2d", labels=labels)
+    np.random.seed(111)
+    m.plot_scatter(projection="3d", labels=labels)
+    np.random.seed(111)
+    m.plot_magnification(labels=labels)
+    np.random.seed(111)
+    m.plot_steepest_gradient_map(resolution=10, data_labels=labels)
+    for do_test in _image_comparison(
+        baseline_images=[
+            "bayesian_gplvm_{}".format(sub)
+            for sub in [
+                "inducing",
+                "inducing_3d",
+                "latent",
+                "latent_3d",
+                "magnification",
+                "gradient",
+            ]
+        ],
+        extensions=extensions,
+    ):
+        yield (do_test,)
--- a/GPy/testing/prior_tests.py
+++ b/GPy/testing/prior_tests.py
@ -1,138 +1,142 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import unittest
+import pytest
 import numpy as np
 import GPy

-class PriorTests(unittest.TestCase):
+
+class TestPrior:
    def test_studentT(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        studentT = GPy.priors.StudentT(1, 2, 4)
-        
+
        m = GPy.models.SparseGPRegression(X, y)
        m.Z.set_prior(studentT)

        # setting a StudentT prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, studentT)
-        
+
+        with pytest.raises(AssertionError):
+            m.rbf.set_prior(studentT)
+
        # The gradients need to be checked
-        self.assertTrue(m.checkgrad())
-        
+        assert m.checkgrad()
+
        # Check the singleton pattern:
-        self.assertIs(studentT, GPy.priors.StudentT(1,2,4))
-        self.assertIsNot(studentT, GPy.priors.StudentT(2,2,4))
-    
+        assert studentT is GPy.priors.StudentT(1, 2, 4)
+        assert studentT is not GPy.priors.StudentT(2, 2, 4)
+
    def test_lognormal(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        lognormal = GPy.priors.LogGaussian(1, 2)
        m.rbf.set_prior(lognormal)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_Gamma(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        Gamma = GPy.priors.Gamma(1, 1)
        m.rbf.set_prior(Gamma)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_InverseGamma(self):
        # Test that this prior object can be instantiated and performs its basic functions
        # in integration.
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        InverseGamma = GPy.priors.InverseGamma(1, 1)
        m.rbf.set_prior(InverseGamma)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_incompatibility(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        gaussian = GPy.priors.Gaussian(1, 1)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, gaussian)
+        with pytest.raises(AssertionError):
+            m.rbf.set_prior(gaussian)

    def test_set_prior(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)

        gaussian = GPy.priors.Gaussian(1, 1)
-        #m.rbf.set_prior(gaussian)
+        # m.rbf.set_prior(gaussian)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, gaussian)
+        with pytest.raises(AssertionError):
+            m.rbf.set_prior(gaussian)

    def test_uniform(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.SparseGPRegression(X, y)
        uniform = GPy.priors.Uniform(0, 2)
        m.rbf.set_prior(uniform)
        m.randomize()
-        self.assertTrue(m.checkgrad())
-        
+        assert m.checkgrad()
+
        m.Z.set_prior(uniform)
        m.randomize()
-        self.assertTrue(m.checkgrad())
-        
+        assert m.checkgrad()
+
        m.Z.unconstrain()
        uniform = GPy.priors.Uniform(-1, 10)
        m.Z.set_prior(uniform)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

        m.Z.constrain_negative()
        uniform = GPy.priors.Uniform(-1, 0)
        m.Z.set_prior(uniform)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()

    def test_set_gaussian_for_reals(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.SparseGPRegression(X, y)

@ -140,16 +144,15 @@ class PriorTests(unittest.TestCase):
        m.Z.set_prior(gaussian)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        #self.assertRaises(AssertionError, m.Z.set_prior, gaussian)
-        self.assertTrue(m.checkgrad())
-
+        # self.assertRaises(AssertionError, m.Z.set_prior, gaussian)
+        assert m.checkgrad()

    def test_fixed_domain_check(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)

@ -157,14 +160,15 @@ class PriorTests(unittest.TestCase):
        gaussian = GPy.priors.Gaussian(1, 1)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, gaussian)
+        with pytest.raises(AssertionError):
+            m.rbf.set_prior(gaussian)

    def test_fixed_domain_check1(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y = b * X + C + 1 * np.sin(X)
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)

@ -172,8 +176,5 @@ class PriorTests(unittest.TestCase):
        gaussian = GPy.priors.Gaussian(1, 1)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, gaussian)
-
-if __name__ == "__main__":
-    print("Running unit tests, please be (very) patient...")
-    unittest.main()
+        with pytest.raises(AssertionError):
+            m.rbf.set_prior(gaussian)
--- a/GPy/testing/quadrature_tests.py
+++ b/GPy/testing/quadrature_tests.py
@ -1,23 +1,19 @@
 from __future__ import print_function, division
 import numpy as np
-import GPy
-import warnings
-from  ..util.quad_integrate import quadgk_int, quadvgk
+from ..util.quad_integrate import quadgk_int, quadvgk


-
-class QuadTests(np.testing.TestCase):
+class TestQuad:
    """
    test file for checking implementation of gaussian-kronrod quadrature.
    we will take a function which can be integrated analytically and check if quadgk result is similar or not!
    through this file we can test how numerically accurate quadrature implementation in native numpy or manual code is.
    """
-    def setUp(self):
-        pass

    def test_infinite_quad(self):
        def f(x):
-            return np.exp(-0.5*x**2)*np.power(x,np.arange(3)[:,None])
+            return np.exp(-0.5 * x**2) * np.power(x, np.arange(3)[:, None])
+
        quad_int_val = quadgk_int(f)
        real_val = np.sqrt(np.pi * 2)
        np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
@ -25,15 +21,18 @@ class QuadTests(np.testing.TestCase):
    def test_finite_quad(self):
        def f2(x):
            return x**2
-        quad_int_val = quadvgk(f2, 1.,2.)
-        real_val = 7/3.
+
+        quad_int_val = quadvgk(f2, 1.0, 2.0)
+        real_val = 7 / 3.0
        np.testing.assert_almost_equal(real_val, quad_int_val, decimal=5)

-if __name__ == '__main__':
+
+if __name__ == "__main__":
+
    def f(x):
-        return np.exp(-0.5 * x ** 2) * np.power(x, np.arange(3)[:, None])
+        return np.exp(-0.5 * x**2) * np.power(x, np.arange(3)[:, None])

    quad_int_val = quadgk_int(f)
-    real_val = np.sqrt(np.pi*2)
+    real_val = np.sqrt(np.pi * 2)
    np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
    print(quadgk_int(f))
--- a/GPy/testing/test_rv_transformation.py
+++ b/GPy/testing/test_rv_transformation.py
@ -0,0 +1,84 @@
+# Written by Ilias Bilionis
+"""
+Test if hyperparameters in models are properly transformed.
+"""
+
+import pytest
+import numpy as np
+import scipy.stats as st
+import GPy
+
+
+class Model(GPy.core.Model):
+    """
+    A simple GPy model with one parameter.
+    """
+
+    def __init__(self, theta=1.0):
+        super(Model, self).__init__("test_model")
+        theta = GPy.core.Param("theta", theta)
+        self.link_parameter(theta)
+
+    def log_likelihood(self):
+        return 0.0
+
+
+class TestRVTransformation:
+    def _test_trans(self, trans):
+        m = Model()
+        prior = GPy.priors.LogGaussian(0.5, 0.1)
+        m.theta.set_prior(prior)
+        m.theta.unconstrain()
+        m.theta.constrain(trans)
+        # The PDF of the transformed variables
+        p_phi = lambda phi: np.exp(-m._objective_grads(phi)[0])
+        # To the empirical PDF of:
+        theta_s = prior.rvs(1e5)
+        phi_s = trans.finv(theta_s)
+        # which is essentially a kernel density estimation
+        kde = st.gaussian_kde(phi_s)
+        # We will compare the PDF here:
+        phi = np.linspace(phi_s.min(), phi_s.max(), 100)
+        # The transformed PDF of phi should be this:
+        pdf_phi = np.array([p_phi(p) for p in phi])
+        # UNCOMMENT TO SEE GRAPHICAL COMPARISON
+        # import matplotlib.pyplot as plt
+        # fig, ax = plt.subplots()
+        # ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Histogram')
+        # ax.plot(phi, kde(phi), '--', linewidth=2, label='Kernel Density Estimation')
+        # ax.plot(phi, pdf_phi, ':', linewidth=2, label='Transformed PDF')
+        # ax.set_xlabel(r'transformed $\theta$', fontsize=16)
+        # ax.set_ylabel('PDF', fontsize=16)
+        # plt.legend(loc='best')
+        # plt.show(block=True)
+        # END OF PLOT
+        # The following test cannot be very accurate
+        assert np.linalg.norm(pdf_phi - kde(phi)) / np.linalg.norm(kde(phi)) <= 1e-1
+
+    def _test_grad(self, trans):
+        np.random.seed(1234)
+        m = Model(np.random.uniform(0.5, 1.5, 20))
+        prior = GPy.priors.LogGaussian(0.5, 0.1)
+        m.theta.set_prior(prior)
+        m.theta.constrain(trans)
+        m.randomize()
+        print(m)
+        assert m.checkgrad(1)
+
+    def test_Logexp(self):
+        self._test_trans(GPy.constraints.Logexp())
+
+    @pytest.mark.skip(
+        "Gradient not checking right, @jameshensman what is going on here?"
+    )
+    def test_Logexp_grad(self):
+        self._test_grad(GPy.constraints.Logexp())
+
+    def test_Exponent(self):
+        self._test_trans(GPy.constraints.Exponent())
+
+    @pytest.mark.skip(
+        "Gradient not checking right, @jameshensman what is going on here?"
+    )
+    def test_Exponent_grad(self):
+        self._test_grad(GPy.constraints.Exponent())
--- a/GPy/testing/test_serialization.py
+++ b/GPy/testing/test_serialization.py
@ -0,0 +1,440 @@
+"""
+Created on 20 April 2017
+
+@author: pgmoren
+"""
+import numpy as np
+import GPy
+import os
+
+fixed_seed = 11
+
+
+class TestSerialization:
+    def test_serialize_deserialize_kernels(self):
+        k1 = GPy.kern.RBF(2, variance=1.0, lengthscale=[1.0, 1.0], ARD=True)
+        k2 = GPy.kern.RatQuad(
+            2, variance=2.0, lengthscale=1.0, power=2.0, active_dims=[0, 1]
+        )
+        k3 = GPy.kern.Bias(2, variance=2.0, active_dims=[1, 0])
+        k4 = GPy.kern.StdPeriodic(
+            2, variance=2.0, lengthscale=1.0, period=1.0, active_dims=[1, 1]
+        )
+        k5 = GPy.kern.Linear(2, variances=[2.0, 1.0], ARD=True, active_dims=[1, 1])
+        k6 = GPy.kern.Exponential(2, variance=1.0, lengthscale=2)
+        k7 = GPy.kern.Matern32(
+            2, variance=1.0, lengthscale=[1.0, 3.0], ARD=True, active_dims=[1, 1]
+        )
+        k8 = GPy.kern.Matern52(
+            2, variance=2.0, lengthscale=[2.0, 1.0], ARD=True, active_dims=[1, 0]
+        )
+        k9 = GPy.kern.ExpQuad(
+            2, variance=3.0, lengthscale=[1.0, 2.0], ARD=True, active_dims=[0, 1]
+        )
+        k10 = GPy.kern.OU(
+            2, variance=2.0, lengthscale=[2.0, 1.0], ARD=True, active_dims=[1, 0]
+        )
+        k11 = k1 + k1.copy() + k2 + k3 + k4 + k5 + k6
+        k12 = k1 * k2 * k2.copy() * k3 * k4 * k5
+        k13 = (k1 + k2) * (k3 + k4 + k5)
+        k14 = ((k1 + k2) * k3) + k4 + k5 * k7
+        k15 = ((k1 + k2) * k3) + k4 * k5 + k8 * k10
+        k16 = ((k1 * k2) * k3) + k4 * k5 + k8 + k9
+
+        k_list = [k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14, k15, k16]
+
+        for kk in k_list:
+            kk_dict = kk.to_dict()
+            kk_r = GPy.kern.Kern.from_dict(kk_dict)
+            assert type(kk) == type(kk_r)
+            np.testing.assert_array_equal(kk[:], kk_r[:])
+            np.testing.assert_array_equal(
+                np.array(kk.active_dims), np.array(kk_r.active_dims)
+            )
+
+    def test_serialize_deserialize_mappings(self):
+        m1 = GPy.mappings.Identity(3, 2)
+        m2 = GPy.mappings.Constant(3, 2, 1)
+        m2_r = GPy.core.mapping.Mapping.from_dict(m2.to_dict())
+        np.testing.assert_array_equal(m2.C.values[:], m2_r.C.values[:])
+        m3 = GPy.mappings.Linear(3, 2)
+        m3_r = GPy.core.mapping.Mapping.from_dict(m3.to_dict())
+        assert np.all(m3.A == m3_r.A)
+
+        m_list = [m1, m2, m3]
+        for mm in m_list:
+            mm_dict = mm.to_dict()
+            mm_r = GPy.core.mapping.Mapping.from_dict(mm_dict)
+            assert type(mm) == type(mm_r)
+            assert type(mm.input_dim) == type(mm_r.input_dim)
+            assert type(mm.output_dim) == type(mm_r.output_dim)
+
+    def test_serialize_deserialize_likelihoods(self):
+        l1 = GPy.likelihoods.Gaussian(
+            GPy.likelihoods.link_functions.Identity(), variance=3.0
+        )
+        l1_r = GPy.likelihoods.likelihood.Likelihood.from_dict(l1.to_dict())
+        l2 = GPy.likelihoods.Bernoulli(GPy.likelihoods.link_functions.Probit())
+        l2_r = GPy.likelihoods.likelihood.Likelihood.from_dict(l2.to_dict())
+        assert type(l1) == type(l1_r)
+        assert np.all(l1.variance == l1_r.variance)
+        assert type(l2) == type(l2_r)
+
+    def test_serialize_deserialize_normalizers(self):
+        n1 = GPy.util.normalizer.Standardize()
+        n1.scale_by(np.random.rand(10))
+        n1_r = GPy.util.normalizer._Norm.from_dict((n1.to_dict()))
+        assert type(n1) == type(n1_r)
+        assert np.all(n1.mean == n1_r.mean)
+        assert np.all(n1.std == n1_r.std)
+
+    def test_serialize_deserialize_link_functions(self):
+        l1 = GPy.likelihoods.link_functions.Identity()
+        l2 = GPy.likelihoods.link_functions.Probit()
+        l_list = [l1, l2]
+        for ll in l_list:
+            ll_dict = ll.to_dict()
+            ll_r = GPy.likelihoods.link_functions.GPTransformation.from_dict(ll_dict)
+            assert type(ll) == type(ll_r)
+
+    def test_serialize_deserialize_inference_methods(self):
+        e1 = GPy.inference.latent_function_inference.expectation_propagation.EP(
+            ep_mode="nested"
+        )
+        e1.ga_approx_old = GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(
+            np.random.rand(10), np.random.rand(10)
+        )
+        e1._ep_approximation = []
+        e1._ep_approximation.append(
+            GPy.inference.latent_function_inference.expectation_propagation.posteriorParams(
+                np.random.rand(10), np.random.rand(100).reshape((10, 10))
+            )
+        )
+        e1._ep_approximation.append(
+            GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(
+                np.random.rand(10), np.random.rand(10)
+            )
+        )
+        e1._ep_approximation.append(
+            GPy.inference.latent_function_inference.expectation_propagation.cavityParams(
+                10
+            )
+        )
+        e1._ep_approximation[-1].v = np.random.rand(10)
+        e1._ep_approximation[-1].tau = np.random.rand(10)
+        e1._ep_approximation.append(np.random.rand(10))
+        e1_r = (
+            GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(
+                e1.to_dict()
+            )
+        )
+
+        assert type(e1) == type(e1_r)
+        assert e1.epsilon == e1_r.epsilon
+        assert e1.eta == e1_r.eta
+        assert e1.delta == e1_r.delta
+        assert e1.always_reset == e1_r.always_reset
+        assert e1.max_iters == e1_r.max_iters
+        assert e1.ep_mode == e1_r.ep_mode
+        assert e1.parallel_updates == e1_r.parallel_updates
+
+        np.testing.assert_array_equal(
+            e1.ga_approx_old.tau[:], e1_r.ga_approx_old.tau[:]
+        )
+        np.testing.assert_array_equal(e1.ga_approx_old.v[:], e1_r.ga_approx_old.v[:])
+        np.testing.assert_array_equal(
+            e1._ep_approximation[0].mu[:], e1_r._ep_approximation[0].mu[:]
+        )
+        np.testing.assert_array_equal(
+            e1._ep_approximation[0].Sigma[:], e1_r._ep_approximation[0].Sigma[:]
+        )
+        np.testing.assert_array_equal(
+            e1._ep_approximation[1].tau[:], e1_r._ep_approximation[1].tau[:]
+        )
+        np.testing.assert_array_equal(
+            e1._ep_approximation[1].v[:], e1_r._ep_approximation[1].v[:]
+        )
+        np.testing.assert_array_equal(
+            e1._ep_approximation[2].tau[:], e1_r._ep_approximation[2].tau[:]
+        )
+        np.testing.assert_array_equal(
+            e1._ep_approximation[2].v[:], e1_r._ep_approximation[2].v[:]
+        )
+        np.testing.assert_array_equal(
+            e1._ep_approximation[3][:], e1_r._ep_approximation[3][:]
+        )
+
+        e2 = GPy.inference.latent_function_inference.expectation_propagation.EPDTC(
+            ep_mode="nested"
+        )
+        e2.ga_approx_old = GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(
+            np.random.rand(10), np.random.rand(10)
+        )
+        e2._ep_approximation = []
+        e2._ep_approximation.append(
+            GPy.inference.latent_function_inference.expectation_propagation.posteriorParamsDTC(
+                np.random.rand(10), np.random.rand(10)
+            )
+        )
+        e2._ep_approximation.append(
+            GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(
+                np.random.rand(10), np.random.rand(10)
+            )
+        )
+        e2._ep_approximation.append(100.0)
+        e2_r = (
+            GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(
+                e2.to_dict()
+            )
+        )
+
+        assert type(e2) == type(e2_r)
+        assert e2.epsilon == e2_r.epsilon
+        assert e2.eta == e2_r.eta
+        assert e2.delta == e2_r.delta
+        assert e2.always_reset == e2_r.always_reset
+        assert e2.max_iters == e2_r.max_iters
+        assert e2.ep_mode == e2_r.ep_mode
+        assert e2.parallel_updates == e2_r.parallel_updates
+
+        np.testing.assert_array_equal(
+            e2.ga_approx_old.tau[:], e2_r.ga_approx_old.tau[:]
+        )
+        np.testing.assert_array_equal(e2.ga_approx_old.v[:], e2_r.ga_approx_old.v[:])
+        np.testing.assert_array_equal(
+            e2._ep_approximation[0].mu[:], e2_r._ep_approximation[0].mu[:]
+        )
+        np.testing.assert_array_equal(
+            e2._ep_approximation[0].Sigma_diag[:],
+            e2_r._ep_approximation[0].Sigma_diag[:],
+        )
+        np.testing.assert_array_equal(
+            e2._ep_approximation[1].tau[:], e2_r._ep_approximation[1].tau[:]
+        )
+        np.testing.assert_array_equal(
+            e2._ep_approximation[1].v[:], e2_r._ep_approximation[1].v[:]
+        )
+        assert e2._ep_approximation[2] == e2_r._ep_approximation[2]
+
+        e3 = (
+            GPy.inference.latent_function_inference.exact_gaussian_inference.ExactGaussianInference()
+        )
+        e3_r = (
+            GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(
+                e3.to_dict()
+            )
+        )
+
+        assert type(e3) == type(e3_r)
+
+    def test_serialize_deserialize_GP(self):
+        np.random.seed(fixed_seed)
+        N = 20
+        Nhalf = int(N / 2)
+        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[
+            :, None
+        ]
+        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
+        kernel = GPy.kern.RBF(1)
+        likelihood = GPy.likelihoods.Bernoulli()
+        inference_method = (
+            GPy.inference.latent_function_inference.expectation_propagation.EP(
+                ep_mode="nested"
+            )
+        )
+        mean_function = None
+
+        m = GPy.core.GP(
+            X=X,
+            Y=Y,
+            kernel=kernel,
+            likelihood=likelihood,
+            inference_method=inference_method,
+            mean_function=mean_function,
+            normalizer=True,
+            name="gp_classification",
+        )
+        m.optimize()
+        m.save_model("temp_test_gp_with_data.json", compress=True, save_data=True)
+        m.save_model("temp_test_gp_without_data.json", compress=True, save_data=False)
+        m1_r = GPy.core.GP.load_model("temp_test_gp_with_data.json.zip")
+        m2_r = GPy.core.GP.load_model("temp_test_gp_without_data.json.zip", (X, Y))
+        os.remove("temp_test_gp_with_data.json.zip")
+        os.remove("temp_test_gp_without_data.json.zip")
+        var = m.predict(X)[0]
+        var1_r = m1_r.predict(X)[0]
+        var2_r = m2_r.predict(X)[0]
+        np.testing.assert_array_equal(
+            np.array(var).flatten(), np.array(var1_r).flatten()
+        )
+        np.testing.assert_array_equal(
+            np.array(var).flatten(), np.array(var2_r).flatten()
+        )
+
+    def test_serialize_deserialize_SparseGP(self):
+        np.random.seed(fixed_seed)
+        N = 20
+        Nhalf = int(N / 2)
+        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[
+            :, None
+        ]
+        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
+        kernel = GPy.kern.RBF(1)
+        likelihood = GPy.likelihoods.Bernoulli()
+        inference_method = (
+            GPy.inference.latent_function_inference.expectation_propagation.EPDTC(
+                ep_mode="nested"
+            )
+        )
+        mean_function = None
+
+        sm = GPy.core.SparseGP(
+            X=X,
+            Y=Y,
+            Z=X[0:20, :],
+            kernel=kernel,
+            likelihood=likelihood,
+            inference_method=inference_method,
+            mean_function=mean_function,
+            normalizer=True,
+            name="sparse_gp_classification",
+        )
+        sm.optimize()
+        sm.save_model("temp_test_gp_with_data.json", compress=True, save_data=True)
+        sm.save_model("temp_test_gp_without_data.json", compress=True, save_data=False)
+        sm1_r = GPy.core.GP.load_model("temp_test_gp_with_data.json.zip")
+        sm2_r = GPy.core.GP.load_model("temp_test_gp_without_data.json.zip", (X, Y))
+        os.remove("temp_test_gp_with_data.json.zip")
+        os.remove("temp_test_gp_without_data.json.zip")
+        var = sm.predict(X)[0]
+        var1_r = sm1_r.predict(X)[0]
+        var2_r = sm2_r.predict(X)[0]
+        np.testing.assert_array_equal(
+            np.array(var).flatten(), np.array(var1_r).flatten()
+        )
+        np.testing.assert_array_equal(
+            np.array(var).flatten(), np.array(var2_r).flatten()
+        )
+
+    def test_serialize_deserialize_GPRegressor(self):
+        np.random.seed(fixed_seed)
+        N = 50
+        N_new = 50
+        D = 1
+        X = np.random.uniform(-3.0, 3.0, (N, 1))
+        Y = np.sin(X) + np.random.randn(N, D) * 0.05
+        X_new = np.random.uniform(-3.0, 3.0, (N_new, 1))
+        k = GPy.kern.RBF(input_dim=1, lengthscale=10)
+        m = GPy.models.GPRegression(X, Y, k)
+        m.optimize()
+        m.save_model(
+            "temp_test_gp_regressor_with_data.json", compress=True, save_data=True
+        )
+        m.save_model(
+            "temp_test_gp_regressor_without_data.json", compress=True, save_data=False
+        )
+        m1_r = GPy.models.GPRegression.load_model(
+            "temp_test_gp_regressor_with_data.json.zip"
+        )
+        m2_r = GPy.models.GPRegression.load_model(
+            "temp_test_gp_regressor_without_data.json.zip", (X, Y)
+        )
+        os.remove("temp_test_gp_regressor_with_data.json.zip")
+        os.remove("temp_test_gp_regressor_without_data.json.zip")
+
+        Xp = np.random.uniform(size=(int(1e5), 1))
+        Xp[:, 0] = Xp[:, 0] * 15 - 5
+
+        _, var = m.predict(Xp)
+        _, var1_r = m1_r.predict(Xp)
+        _, var2_r = m2_r.predict(Xp)
+        np.testing.assert_array_equal(var.flatten(), var1_r.flatten())
+        np.testing.assert_array_equal(var.flatten(), var2_r.flatten())
+
+    def test_serialize_deserialize_GPClassification(self):
+        np.random.seed(fixed_seed)
+        N = 50
+        Nhalf = int(N / 2)
+        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[
+            :, None
+        ]
+        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
+        kernel = GPy.kern.RBF(1)
+        m = GPy.models.GPClassification(X, Y, kernel=kernel)
+        m.optimize()
+        m.save_model(
+            "temp_test_gp_classifier_with_data.json", compress=True, save_data=True
+        )
+        m.save_model(
+            "temp_test_gp_classifier_without_data.json", compress=True, save_data=False
+        )
+        m1_r = GPy.models.GPClassification.load_model(
+            "temp_test_gp_classifier_with_data.json.zip"
+        )
+        assert type(m) == type(
+            m1_r
+        ), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m1_r))
+        m2_r = GPy.models.GPClassification.load_model(
+            "temp_test_gp_classifier_without_data.json.zip", (X, Y)
+        )
+        assert type(m) == type(
+            m2_r
+        ), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m2_r))
+        os.remove("temp_test_gp_classifier_with_data.json.zip")
+        os.remove("temp_test_gp_classifier_without_data.json.zip")
+
+        var = m.predict(X)[0]
+        var1_r = m1_r.predict(X)[0]
+        _var2_r = m2_r.predict(X)[0]
+        np.testing.assert_array_equal(
+            np.array(var).flatten(), np.array(var1_r).flatten()
+        )
+        np.testing.assert_array_equal(
+            np.array(var).flatten(), np.array(var1_r).flatten()
+        )
+
+    def test_serialize_deserialize_SparseGPClassification(self):
+        np.random.seed(fixed_seed)
+        N = 50
+        Nhalf = int(N / 2)
+        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[
+            :, None
+        ]
+        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
+        kernel = GPy.kern.RBF(1)
+        m = GPy.models.SparseGPClassification(X, Y, num_inducing=3, kernel=kernel)
+        m.optimize()
+        m.save_model(
+            "temp_test_sparse_gp_classifier_with_data.json",
+            compress=True,
+            save_data=True,
+        )
+        m.save_model(
+            "temp_test_sparse_gp_classifier_without_data.json",
+            compress=True,
+            save_data=False,
+        )
+        m1_r = GPy.models.SparseGPClassification.load_model(
+            "temp_test_sparse_gp_classifier_with_data.json.zip"
+        )
+        assert type(m) == type(
+            m1_r
+        ), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m1_r))
+        m2_r = GPy.models.SparseGPClassification.load_model(
+            "temp_test_sparse_gp_classifier_without_data.json.zip", (X, Y)
+        )
+        assert type(m) == type(
+            m2_r
+        ), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m2_r))
+        os.remove("temp_test_sparse_gp_classifier_with_data.json.zip")
+        os.remove("temp_test_sparse_gp_classifier_without_data.json.zip")
+
+        var = m.predict(X)[0]
+        var1_r = m1_r.predict(X)[0]
+        var2_r = m2_r.predict(X)[0]
+        np.testing.assert_array_equal(
+            np.array(var).flatten(), np.array(var1_r).flatten()
+        )
+        np.testing.assert_array_equal(
+            np.array(var).flatten(), np.array(var1_r).flatten()
+        )
--- a/GPy/testing/test_svgp.py
+++ b/GPy/testing/test_svgp.py
@ -0,0 +1,63 @@
+import numpy as np
+import GPy
+
+
+class TestSVGP_nonconvex:
+    """
+    Inference in the SVGP with a student-T likelihood
+    """
+
+    def setup(self):
+        X = np.linspace(0, 10, 100).reshape(-1, 1)
+        Z = np.linspace(0, 10, 10).reshape(-1, 1)
+        Y = np.sin(X) + np.random.randn(*X.shape) * 0.1
+        Y[50] += 3
+
+        lik = GPy.likelihoods.StudentT(deg_free=2)
+        k = GPy.kern.RBF(1, lengthscale=5.0) + GPy.kern.White(1, 1e-6)
+        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
+
+    def test_grad(self):
+        self.setup()
+        assert self.m.checkgrad(step=1e-4)
+
+
+class TestSVGP_classification:
+    """
+    Inference in the SVGP with a Bernoulli likelihood
+    """
+
+    def setup(self):
+        X = np.linspace(0, 10, 100).reshape(-1, 1)
+        Z = np.linspace(0, 10, 10).reshape(-1, 1)
+        Y = np.where((np.sin(X) + np.random.randn(*X.shape) * 0.1) > 0, 1, 0)
+
+        lik = GPy.likelihoods.Bernoulli()
+        k = GPy.kern.RBF(1, lengthscale=5.0) + GPy.kern.White(1, 1e-6)
+        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
+
+    def test_grad(self):
+        self.setup()
+        assert self.m.checkgrad(step=1e-4)
+
+
+class TestSVGP_Poisson_with_meanfunction:
+    """
+    Inference in the SVGP with a Bernoulli likelihood
+    """
+
+    def setup(self):
+        X = np.linspace(0, 10, 100).reshape(-1, 1)
+        Z = np.linspace(0, 10, 10).reshape(-1, 1)
+        latent_f = np.exp(0.1 * X * 0.05 * X**2)
+        Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1, 1)
+
+        mf = GPy.mappings.Linear(1, 1)
+
+        lik = GPy.likelihoods.Poisson()
+        k = GPy.kern.RBF(1, lengthscale=5.0) + GPy.kern.White(1, 1e-6)
+        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
+
+    def test_grad(self):
+        self.setup()
+        assert self.m.checkgrad(step=1e-4)
--- a/GPy/testing/tp_tests.py
+++ b/GPy/testing/tp_tests.py
@ -1,29 +1,30 @@
-'''
+"""
 Created on 14 Jul 2017, based on gp_tests

@author: javdrher
-'''
-import unittest
-import numpy as np, GPy
+"""
+import numpy as np
+import GPy


-class Test(unittest.TestCase):
-    def setUp(self):
+class TestTP:
+    def setup(self):
        np.random.seed(12345)
        self.N = 20
        self.N_new = 50
        self.D = 1
-        self.X = np.random.uniform(-3., 3., (self.N, 1))
+        self.X = np.random.uniform(-3.0, 3.0, (self.N, 1))
        self.Y = np.sin(self.X) + np.random.randn(self.N, self.D) * 0.05
-        self.X_new = np.random.uniform(-3., 3., (self.N_new, 1))
+        self.X_new = np.random.uniform(-3.0, 3.0, (self.N_new, 1))

    def test_setxy_gp(self):
+        self.setup()
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        m = GPy.models.TPRegression(self.X, self.Y, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X.copy()
        m.set_XY(m.X[:10], m.Y[:10])
-        assert (m.checkgrad(tolerance=1e-2))
+        assert m.checkgrad(tolerance=1e-2)
        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)
        np.testing.assert_allclose(mu, mu2)
@ -33,10 +34,12 @@ class Test(unittest.TestCase):
        from GPy.core.parameterization.param import Param
        from GPy.core.mapping import Mapping

+        self.setup()
+
        class Parabola(Mapping):
-            def __init__(self, variance, degree=2, name='parabola'):
+            def __init__(self, variance, degree=2, name="parabola"):
                super(Parabola, self).__init__(1, 1, name)
-                self.variance = Param('variance', np.ones(degree + 1) * variance)
+                self.variance = Param("variance", np.ones(degree + 1) * variance)
                self.degree = degree
                self.link_parameter(self.variance)

@ -59,21 +62,28 @@ class Test(unittest.TestCase):
        X = np.linspace(-2, 2, 100)[:, None]
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        k.randomize()
-        p = Parabola(.3)
+        p = Parabola(0.3)
        p.randomize()
-        Y = p.f(X) + np.random.multivariate_normal(np.zeros(X.shape[0]), k.K(X) + np.eye(X.shape[0]) * 1e-8)[:,
-                     None] + np.random.normal(0, .1, (X.shape[0], 1))
+        Y = (
+            p.f(X)
+            + np.random.multivariate_normal(
+                np.zeros(X.shape[0]), k.K(X) + np.eye(X.shape[0]) * 1e-8
+            )[:, None]
+            + np.random.normal(0, 0.1, (X.shape[0], 1))
+        )
        m = GPy.models.TPRegression(X, Y, kernel=k, mean_function=p)
-        assert (m.checkgrad(tolerance=2e-1))
+        assert m.checkgrad(tolerance=2e-1)
        _ = m.predict(m.X)

    def test_normalizer(self):
+        self.setup()
+
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        Y = self.Y
        mu, std = Y.mean(0), Y.std(0)
        m = GPy.models.TPRegression(self.X, Y, kernel=k, normalizer=True)
        m.optimize()
-        assert (m.checkgrad())
+        assert m.checkgrad()
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        m2 = GPy.models.TPRegression(self.X, (Y - mu) / std, kernel=k, normalizer=False)
        m2[:] = m[:]
@ -81,13 +91,13 @@ class Test(unittest.TestCase):
        mu1, var1 = m.predict(m.X, full_cov=True)
        mu2, var2 = m2.predict(m2.X, full_cov=True)
        np.testing.assert_allclose(mu1, (mu2 * std) + mu)
-        np.testing.assert_allclose(var1, var2 * std ** 2)
+        np.testing.assert_allclose(var1, var2 * std**2)

        mu1, var1 = m.predict(m.X, full_cov=False)
        mu2, var2 = m2.predict(m2.X, full_cov=False)

        np.testing.assert_allclose(mu1, (mu2 * std) + mu)
-        np.testing.assert_allclose(var1, var2 * std ** 2)
+        np.testing.assert_allclose(var1, var2 * std**2)

        q50n = m.predict_quantiles(m.X, (50,))
        q50 = m2.predict_quantiles(m2.X, (50,))
@ -102,10 +112,15 @@ class Test(unittest.TestCase):
        q95 = m2.predict_quantiles(self.X[[c]], qs)
        mu, var = m2.predict(self.X[[c]])
        from scipy.stats import t
-        np.testing.assert_allclose((mu + (t.ppf(qs / 100., m2.nu + m2.num_data) * np.sqrt(var))).flatten(),
-                                   np.array(q95).flatten())
+
+        np.testing.assert_allclose(
+            (mu + (t.ppf(qs / 100.0, m2.nu + m2.num_data) * np.sqrt(var))).flatten(),
+            np.array(q95).flatten(),
+        )

    def test_predict_equivalence(self):
+        self.setup()
+
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        m = GPy.models.TPRegression(self.X, self.Y, kernel=k)
        m.optimize()
@ -124,10 +139,12 @@ class Test(unittest.TestCase):
        mu3, var3 = m2._raw_predict(m.X)
        np.testing.assert_allclose(mu1, mu2)
        np.testing.assert_allclose(var1, var2)
-        self.assertFalse(np.allclose(mu1, mu3))
-        self.assertFalse(np.allclose(var1, var3))
+        assert not np.allclose(mu1, mu3)
+        assert not np.allclose(var1, var3)

    def test_gp_equivalence(self):
+        self.setup()
+
        k = GPy.kern.RBF(1)
        m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
        m.optimize()
@ -139,7 +156,3 @@ class Test(unittest.TestCase):
        mu2, var2 = m2.predict(self.X)
        np.testing.assert_allclose(mu1, mu2)
        np.testing.assert_allclose(var1, var2)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/GPy/testing/test_util.py
+++ b/GPy/testing/test_util.py
@ -0,0 +1,284 @@
+# ===============================================================================
+# Copyright (c) 2016, Max Zwiessele, Alan Saul
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of GPy.testing.util_tests nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ===============================================================================
+
+import numpy as np
+import GPy
+
+
+class UtilTest:
+    def test_checkFinite(self):
+        from GPy.util.debug import checkFinite
+
+        array = np.random.normal(0, 1, 100).reshape(25, 4)
+        assert checkFinite(array, name="test")
+
+        array[np.random.binomial(1, 0.3, array.shape).astype(bool)] = np.nan
+        assert not checkFinite(array)
+
+    def test_checkFullRank(self):
+        from GPy.util.debug import checkFullRank
+        from GPy.util.linalg import tdot
+
+        array = np.random.normal(0, 1, 100).reshape(25, 4)
+        assert not checkFullRank(tdot(array), name="test")
+
+        array = np.random.normal(0, 1, (25, 25))
+        assert checkFullRank(tdot(array))
+
+    def test_fixed_inputs_median(self):
+        """test fixed_inputs convenience function"""
+        from GPy.plotting.matplot_dep.util import fixed_inputs
+        import GPy
+
+        X = np.random.randn(10, 3)
+        Y = np.sin(X) + np.random.randn(10, 3) * 1e-3
+        m = GPy.models.GPRegression(X, Y)
+        fixed = fixed_inputs(m, [1], fix_routine="median", as_list=True, X_all=False)
+        assert (0, np.median(X[:, 0])) in fixed
+        assert (2, np.median(X[:, 2])) in fixed
+        assert (
+            len([t for t in fixed if t[0] == 1]) == 0
+        )  # Unfixed input should not be in fixed
+
+    def test_fixed_inputs_mean(self):
+        from GPy.plotting.matplot_dep.util import fixed_inputs
+        import GPy
+
+        X = np.random.randn(10, 3)
+        Y = np.sin(X) + np.random.randn(10, 3) * 1e-3
+        m = GPy.models.GPRegression(X, Y)
+        fixed = fixed_inputs(m, [1], fix_routine="mean", as_list=True, X_all=False)
+        assert (0, np.mean(X[:, 0])) in fixed
+        assert (2, np.mean(X[:, 2])) in fixed
+        assert (
+            len([t for t in fixed if t[0] == 1]) == 0
+        )  # Unfixed input should not be in fixed
+
+    def test_fixed_inputs_zero(self):
+        from GPy.plotting.matplot_dep.util import fixed_inputs
+        import GPy
+
+        X = np.random.randn(10, 3)
+        Y = np.sin(X) + np.random.randn(10, 3) * 1e-3
+        m = GPy.models.GPRegression(X, Y)
+        fixed = fixed_inputs(m, [1], fix_routine="zero", as_list=True, X_all=False)
+        assert (0, 0.0) in fixed
+        assert (2, 0.0) in fixed
+        assert (
+            len([t for t in fixed if t[0] == 1]) == 0
+        )  # Unfixed input should not be in fixed
+
+    def test_fixed_inputs_uncertain(self):
+        from GPy.plotting.matplot_dep.util import fixed_inputs
+        import GPy
+        from GPy.core.parameterization.variational import NormalPosterior
+
+        X_mu = np.random.randn(10, 3)
+        X_var = np.random.randn(10, 3)
+        X = NormalPosterior(X_mu, X_var)
+        Y = np.sin(X_mu) + np.random.randn(10, 3) * 1e-3
+        m = GPy.models.BayesianGPLVM(Y, X=X_mu, X_variance=X_var, input_dim=3)
+        fixed = fixed_inputs(m, [1], fix_routine="median", as_list=True, X_all=False)
+        assert (0, np.median(X.mean.values[:, 0])) in fixed
+        assert (2, np.median(X.mean.values[:, 2])) in fixed
+        assert (
+            len([t for t in fixed if t[0] == 1]) == 0
+        )  # Unfixed input should not be in fixed
+
+    def test_DSYR(self):
+        from GPy.util.linalg import DSYR, DSYR_numpy
+
+        A = np.arange(9.0).reshape(3, 3)
+        A = np.dot(A.T, A)
+        b = np.ones(3, dtype=float)
+        alpha = 1.0
+        DSYR(A, b, alpha)
+        R = np.array([[46, 55, 64], [55, 67, 79], [64, 79, 94]])
+        assert abs(np.sum(A - R)) < 1e-12
+
+    def test_subarray(self):
+        import GPy
+
+        X = np.zeros((3, 6), dtype=bool)
+        X[[1, 1, 1], [0, 4, 5]] = 1
+        X[1:, [2, 3]] = 1
+        d = GPy.util.subarray_and_sorting.common_subarrays(X, axis=1)
+        assert len(d) == 3
+        X[:, d[tuple(X[:, 0])]]
+        assert d[tuple(X[:, 4])] == d[tuple(X[:, 0])] == [0, 4, 5]
+        assert d[tuple(X[:, 1])] == [1]
+
+    def test_offset_cluster(self):
+        # Tests the GPy.util.cluster_with_offset.cluster utility with a small
+        # test data set. Not using random noise just in case it occasionally
+        # causes it not to cluster correctly.
+        # groundtruth cluster identifiers are: [0,1,1,0]
+
+        # data contains a list of the four sets of time series (3 per data point)
+
+        data = [
+            np.array(
+                [
+                    [2.18094245, 1.96529789, 2.00265523, 2.18218742, 2.06795428],
+                    [1.62254829, 1.75748448, 1.83879347, 1.87531326, 1.52503496],
+                    [1.54589609, 1.61607914, 2.00463192, 1.48771394, 1.63339218],
+                ]
+            ),
+            np.array(
+                [
+                    [2.86766106, 2.97953437, 2.91958876, 2.92510506, 3.03239241],
+                    [2.57368423, 2.59954886, 3.10000395, 2.75806125, 2.89865704],
+                    [2.58916318, 2.53698259, 2.63858411, 2.63102504, 2.51853901],
+                ]
+            ),
+            np.array(
+                [
+                    [2.77834168, 2.9618564, 2.88482141, 3.24259745, 2.9716821],
+                    [2.60675576, 2.67095624, 2.94824436, 2.80520631, 2.87247516],
+                    [2.49543562, 2.5492281, 2.6505866, 2.65015308, 2.59738616],
+                ]
+            ),
+            np.array(
+                [
+                    [1.76783086, 2.21666738, 2.07939706, 1.9268263, 2.23360121],
+                    [1.94305547, 1.94648592, 2.1278921, 2.09481457, 2.08575238],
+                    [1.69336013, 1.72285186, 1.6339506, 1.61212022, 1.39198698],
+                ]
+            ),
+        ]
+
+        # inputs contains their associated X values
+
+        inputs = [
+            np.array([[0.0], [0.68040097], [1.20316795], [1.798749], [2.14891733]]),
+            np.array([[0.0], [0.51910637], [0.98259352], [1.57442965], [1.82515098]]),
+            np.array([[0.0], [0.66645478], [1.59464591], [1.69769551], [1.80932752]]),
+            np.array([[0.0], [0.87512108], [1.71881079], [2.67162871], [3.23761907]]),
+        ]
+
+        # try doing the clustering
+        active = GPy.util.cluster_with_offset.cluster(data, inputs)
+        # check to see that the clustering has correctly clustered the time series.
+        clusters = set([frozenset(cluster) for cluster in active])
+        assert set([1, 2]) in clusters, "Offset Clustering algorithm failed"
+        assert set([0, 3]) in clusters, "Offset Clustering algoirthm failed"
+
+
+class TestUnivariateGaussian:
+    def setup(self):
+        self.zz = [-5.0, -0.8, 0.0, 0.5, 2.0, 10.0]
+
+    def test_logPdfNormal(self):
+        from GPy.util.univariate_Gaussian import logPdfNormal
+
+        self.setup()
+
+        pySols = [
+            -13.4189385332,
+            -1.2389385332,
+            -0.918938533205,
+            -1.0439385332,
+            -2.9189385332,
+            -50.9189385332,
+        ]
+        diff = 0.0
+        for i in range(len(pySols)):
+            diff += abs(logPdfNormal(self.zz[i]) - pySols[i])
+        assert diff < 1e-10
+
+    def test_cdfNormal(self):
+        from GPy.util.univariate_Gaussian import cdfNormal
+
+        self.setup()
+
+        pySols = [
+            2.86651571879e-07,
+            0.211855398583,
+            0.5,
+            0.691462461274,
+            0.977249868052,
+            1.0,
+        ]
+        diff = 0.0
+        for i in range(len(pySols)):
+            diff += abs(cdfNormal(self.zz[i]) - pySols[i])
+        assert diff < 1e-10
+
+    def test_logCdfNormal(self):
+        from GPy.util.univariate_Gaussian import logCdfNormal
+
+        self.setup()
+
+        pySols = [
+            -15.064998394,
+            -1.55185131919,
+            -0.69314718056,
+            -0.368946415289,
+            -0.023012909329,
+            0.0,
+        ]
+        diff = 0.0
+        for i in range(len(pySols)):
+            diff += abs(logCdfNormal(self.zz[i]) - pySols[i])
+        assert diff < 1e-10
+
+    def test_derivLogCdfNormal(self):
+        from GPy.util.univariate_Gaussian import derivLogCdfNormal
+
+        self.setup()
+
+        pySols = [
+            5.18650396941,
+            1.3674022693,
+            0.79788456081,
+            0.50916043387,
+            0.0552478626962,
+            0.0,
+        ]
+        diff = 0.0
+        for i in range(len(pySols)):
+            diff += abs(derivLogCdfNormal(self.zz[i]) - pySols[i])
+        assert diff < 1e-8
+
+
+class TestStandardize:
+    def setup(self):
+        self.normalizer = GPy.util.normalizer.Standardize()
+        y = np.stack([np.random.randn(10), 2 * np.random.randn(10)], axis=1)
+        self.normalizer.scale_by(y)
+
+    def test_inverse_covariance(self):
+        """
+        Test inverse covariance outputs correct size
+        """
+        self.setup()
+        covariance = np.random.rand(100, 100)
+        output = self.normalizer.inverse_covariance(covariance)
+        assert output.shape == (100, 100, 2)
--- a/GPy/testing/variational_tests.py
+++ b/GPy/testing/variational_tests.py
@ -1,4 +1,4 @@
-'''
+"""
 Copyright (c) 2015, Max Zwiessele
 All rights reserved.

@ -26,38 +26,35 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-'''
-import unittest
+"""
 import GPy, numpy as np

-class KLGrad(GPy.core.Model):
-            def __init__(self, Xvar, kl):   
-                super(KLGrad, self).__init__(name="klgrad")     
-                self.kl = kl
-                self.link_parameter(Xvar)
-                self.Xvar = Xvar
-                self._obj = 0
-            def parameters_changed(self):
-                self.Xvar.gradient[:] = 0
-                self.kl.update_gradients_KL(self.Xvar)
-                self._obj = self.kl.KL_divergence(self.Xvar)
-            def objective_function(self):
-                return self._obj
-        
-class Test(unittest.TestCase):

-    def setUp(self):
+class KLGrad(GPy.core.Model):
+    def __init__(self, Xvar, kl):
+        super(KLGrad, self).__init__(name="klgrad")
+        self.kl = kl
+        self.link_parameter(Xvar)
+        self.Xvar = Xvar
+        self._obj = 0
+
+    def parameters_changed(self):
+        self.Xvar.gradient[:] = 0
+        self.kl.update_gradients_KL(self.Xvar)
+        self._obj = self.kl.KL_divergence(self.Xvar)
+
+    def objective_function(self):
+        return self._obj
+
+
+class TestVariational:
+    def setup(self):
        np.random.seed(12345)
        self.Xvar = GPy.core.parameterization.variational.NormalPosterior(
-            np.random.uniform(0,1,(10,3)), 
-            np.random.uniform(1e-5,.01, (10,3))
-            )
+            np.random.uniform(0, 1, (10, 3)), np.random.uniform(1e-5, 0.01, (10, 3))
+        )

-
-    def testNormal(self):
+    def test_normal(self):
+        self.setup()
        klgrad = KLGrad(self.Xvar, GPy.core.parameterization.variational.NormalPrior())
        np.testing.assert_(klgrad.checkgrad())
-
-if __name__ == "__main__":
-    #import sys;sys.argv = ['', 'Test.testNormal']
-    unittest.main()
--- a/GPy/testing/todo.md
+++ b/GPy/testing/todo.md
@ -0,0 +1,14 @@
+As off now, I am once through all of the tests and basic migration is done.
+
+Now, fix the below things and todos before starting to get the tests running using pytest
+
+
+ update test script names according to pytest conversion
+ check for TODOs
+ + there are many associated with "iscloseto" functions from np.testing. Will have to figure out how these
+ + some tests are not that clear to me tbh
+ check nomenclature of test files and test classes and test functions
+ chatgpt says that I should replace delta with the decimal but a delta of 1e-4 should be decimal=4. Not sure about this yet  but that is something I need to fix later on
+--> this gives more content to it: https://docs.python.org/3/library/unittest.html#unittest.TestCase.assertAlmostEqual
+I need to write a custom function that behaves accordingly as in some cases, np.testing.assert_almost_equal won't be applicable, https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html
+or how about this: `np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)`
--- a/GPy/testing/util_tests.py
+++ b/GPy/testing/util_tests.py
@ -1,242 +0,0 @@
-#===============================================================================
-# Copyright (c) 2016, Max Zwiessele, Alan Saul
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of GPy.testing.util_tests nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
-
-import unittest
-import numpy as np
-import GPy
-
-class TestDebug(unittest.TestCase):
-    def test_checkFinite(self):
-        from GPy.util.debug import checkFinite
-        array = np.random.normal(0, 1, 100).reshape(25,4)
-        self.assertTrue(checkFinite(array, name='test'))
-
-        array[np.random.binomial(1, .3, array.shape).astype(bool)] = np.nan
-        self.assertFalse(checkFinite(array))
-
-    def test_checkFullRank(self):
-        from GPy.util.debug import checkFullRank
-        from GPy.util.linalg import tdot
-        array = np.random.normal(0, 1, 100).reshape(25,4)
-        self.assertFalse(checkFullRank(tdot(array), name='test'))
-
-        array = np.random.normal(0, 1, (25,25))
-        self.assertTrue(checkFullRank(tdot(array)))
-
-    def test_fixed_inputs_median(self):
-        """ test fixed_inputs convenience function """
-        from GPy.plotting.matplot_dep.util import fixed_inputs
-        import GPy
-        X = np.random.randn(10, 3)
-        Y = np.sin(X) + np.random.randn(10, 3)*1e-3
-        m = GPy.models.GPRegression(X, Y)
-        fixed = fixed_inputs(m, [1], fix_routine='median', as_list=True, X_all=False)
-        self.assertTrue((0, np.median(X[:,0])) in fixed)
-        self.assertTrue((2, np.median(X[:,2])) in fixed)
-        self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
-
-    def test_fixed_inputs_mean(self):
-        from GPy.plotting.matplot_dep.util import fixed_inputs
-        import GPy
-        X = np.random.randn(10, 3)
-        Y = np.sin(X) + np.random.randn(10, 3)*1e-3
-        m = GPy.models.GPRegression(X, Y)
-        fixed = fixed_inputs(m, [1], fix_routine='mean', as_list=True, X_all=False)
-        self.assertTrue((0, np.mean(X[:,0])) in fixed)
-        self.assertTrue((2, np.mean(X[:,2])) in fixed)
-        self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
-
-    def test_fixed_inputs_zero(self):
-        from GPy.plotting.matplot_dep.util import fixed_inputs
-        import GPy
-        X = np.random.randn(10, 3)
-        Y = np.sin(X) + np.random.randn(10, 3)*1e-3
-        m = GPy.models.GPRegression(X, Y)
-        fixed = fixed_inputs(m, [1], fix_routine='zero', as_list=True, X_all=False)
-        self.assertTrue((0, 0.0) in fixed)
-        self.assertTrue((2, 0.0) in fixed)
-        self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
-
-    def test_fixed_inputs_uncertain(self):
-        from GPy.plotting.matplot_dep.util import fixed_inputs
-        import GPy
-        from GPy.core.parameterization.variational import NormalPosterior
-        X_mu = np.random.randn(10, 3)
-        X_var = np.random.randn(10, 3)
-        X = NormalPosterior(X_mu, X_var)
-        Y = np.sin(X_mu) + np.random.randn(10, 3)*1e-3
-        m = GPy.models.BayesianGPLVM(Y, X=X_mu, X_variance=X_var, input_dim=3)
-        fixed = fixed_inputs(m, [1], fix_routine='median', as_list=True, X_all=False)
-        self.assertTrue((0, np.median(X.mean.values[:,0])) in fixed)
-        self.assertTrue((2, np.median(X.mean.values[:,2])) in fixed)
-        self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
-
-    def test_DSYR(self):
-        from GPy.util.linalg import DSYR, DSYR_numpy
-        A = np.arange(9.0).reshape(3,3)
-        A = np.dot(A.T, A)
-        b = np.ones(3, dtype=float)
-        alpha = 1.0
-        DSYR(A, b, alpha)
-        R = np.array([
-            [46, 55, 64],
-            [55, 67, 79],
-            [64, 79, 94]]
-            )
-        self.assertTrue(abs(np.sum(A - R)) < 1e-12)
-
-    def test_subarray(self):
-        import GPy
-        X = np.zeros((3,6), dtype=bool)
-        X[[1,1,1],[0,4,5]] = 1
-        X[1:,[2,3]] = 1
-        d = GPy.util.subarray_and_sorting.common_subarrays(X,axis=1)
-        self.assertTrue(len(d) == 3)
-        X[:, d[tuple(X[:,0])]]
-        self.assertTrue(d[tuple(X[:,4])] == d[tuple(X[:,0])] == [0, 4, 5])
-        self.assertTrue(d[tuple(X[:,1])] == [1])
-
-    def test_offset_cluster(self):
-        #Tests the GPy.util.cluster_with_offset.cluster utility with a small
-        #test data set. Not using random noise just in case it occasionally
-        #causes it not to cluster correctly.
-        #groundtruth cluster identifiers are: [0,1,1,0]
-
-        #data contains a list of the four sets of time series (3 per data point)
-
-        data = [np.array([[ 2.18094245,  1.96529789,  2.00265523,  2.18218742,  2.06795428],
-                [ 1.62254829,  1.75748448,  1.83879347,  1.87531326,  1.52503496],
-                [ 1.54589609,  1.61607914,  2.00463192,  1.48771394,  1.63339218]]),
-         np.array([[ 2.86766106,  2.97953437,  2.91958876,  2.92510506,  3.03239241],
-                [ 2.57368423,  2.59954886,  3.10000395,  2.75806125,  2.89865704],
-                [ 2.58916318,  2.53698259,  2.63858411,  2.63102504,  2.51853901]]),
-         np.array([[ 2.77834168,  2.9618564 ,  2.88482141,  3.24259745,  2.9716821 ],
-                [ 2.60675576,  2.67095624,  2.94824436,  2.80520631,  2.87247516],
-                [ 2.49543562,  2.5492281 ,  2.6505866 ,  2.65015308,  2.59738616]]),
-         np.array([[ 1.76783086,  2.21666738,  2.07939706,  1.9268263 ,  2.23360121],
-                [ 1.94305547,  1.94648592,  2.1278921 ,  2.09481457,  2.08575238],
-                [ 1.69336013,  1.72285186,  1.6339506 ,  1.61212022,  1.39198698]])]
-
-        #inputs contains their associated X values
-
-        inputs = [np.array([[ 0.        ],
-                [ 0.68040097],
-                [ 1.20316795],
-                [ 1.798749  ],
-                [ 2.14891733]]), np.array([[ 0.        ],
-                [ 0.51910637],
-                [ 0.98259352],
-                [ 1.57442965],
-                [ 1.82515098]]), np.array([[ 0.        ],
-                [ 0.66645478],
-                [ 1.59464591],
-                [ 1.69769551],
-                [ 1.80932752]]), np.array([[ 0.        ],
-                [ 0.87512108],
-                [ 1.71881079],
-                [ 2.67162871],
-                [ 3.23761907]])]
-
-        #try doing the clustering
-        active = GPy.util.cluster_with_offset.cluster(data,inputs)
-        #check to see that the clustering has correctly clustered the time series.
-        clusters = set([frozenset(cluster) for cluster in active])
-        assert set([1,2]) in clusters, "Offset Clustering algorithm failed"
-        assert set([0,3]) in clusters, "Offset Clustering algoirthm failed"
-
-
-class TestUnivariateGaussian(unittest.TestCase):
-    def setUp(self):
-        self.zz = [-5.0, -0.8, 0.0, 0.5, 2.0, 10.0]
-
-    def test_logPdfNormal(self):
-        from GPy.util.univariate_Gaussian import logPdfNormal
-        pySols = [-13.4189385332,
-            -1.2389385332,
-            -0.918938533205,
-            -1.0439385332,
-            -2.9189385332,
-            -50.9189385332]
-        diff = 0.0
-        for i in range(len(pySols)):
-            diff += abs(logPdfNormal(self.zz[i]) - pySols[i])
-        self.assertTrue(diff  < 1e-10)
-
-    def test_cdfNormal(self):
-        from GPy.util.univariate_Gaussian import cdfNormal
-        pySols = [2.86651571879e-07,
-          0.211855398583,
-          0.5,
-          0.691462461274,
-          0.977249868052,
-          1.0]
-        diff = 0.0
-        for i in range(len(pySols)):
-            diff += abs(cdfNormal(self.zz[i]) - pySols[i])
-        self.assertTrue(diff  < 1e-10)
-
-    def test_logCdfNormal(self):
-        from GPy.util.univariate_Gaussian import logCdfNormal
-        pySols = [-15.064998394,
-          -1.55185131919,
-          -0.69314718056,
-          -0.368946415289,
-          -0.023012909329,
-          0.0]
-        diff = 0.0
-        for i in range(len(pySols)):
-            diff += abs(logCdfNormal(self.zz[i]) - pySols[i])
-        self.assertTrue(diff  < 1e-10)
-    def test_derivLogCdfNormal(self):
-        from GPy.util.univariate_Gaussian import derivLogCdfNormal
-        pySols = [5.18650396941,
-          1.3674022693,
-          0.79788456081,
-          0.50916043387,
-          0.0552478626962,
-          0.0]
-        diff = 0.0
-        for i in range(len(pySols)):
-          diff += abs(derivLogCdfNormal(self.zz[i]) - pySols[i])
-        self.assertTrue(diff  < 1e-8)
-
-class TestStandardize(unittest.TestCase):
-    def setUp(self):
-        self.normalizer = GPy.util.normalizer.Standardize()
-        y = np.stack([np.random.randn(10), 2*np.random.randn(10)], axis=1)
-        self.normalizer.scale_by(y)
-    
-    def test_inverse_covariance(self):
-        """
-        Test inverse covariance outputs correct size
-        """
-        covariance = np.random.rand(100, 100)
-        output = self.normalizer.inverse_covariance(covariance)
-        self.assertTrue(output.shape == (100, 100, 2))
--- a/GPy/util/choleskies_cython.c
+++ b/GPy/util/choleskies_cython.c
--- a/GPy/util/classification.py
+++ b/GPy/util/classification.py
@ -2,7 +2,8 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np

-def conf_matrix(p,labels,names=['1','0'],threshold=.5,show=True):
+
+def conf_matrix(p, labels, names=["1", "0"], threshold=0.5, show=True):
    """
    Returns error rate and true/false positives in a binary classification problem
    - Actual classes are displayed by column.
@ -16,18 +17,18 @@ def conf_matrix(p,labels,names=['1','0'],threshold=.5,show=True):
    :type show: False|True
    """
    assert p.size == labels.size, "Arrays p and labels have different dimensions."
-    decision = np.ones((labels.size,1))
-    decision[p<threshold] = 0
+    decision = np.ones((labels.size, 1))
+    decision[p < threshold] = 0
    diff = decision - labels
    false_0 = diff[diff == -1].size
    false_1 = diff[diff == 1].size
-    true_1 = np.sum(decision[diff ==0])
+    true_1 = np.sum(decision[diff == 0])
    true_0 = labels.size - true_1 - false_0 - false_1
-    error = (false_1 + false_0)/np.float(labels.size)
+    error = (false_1 + false_0) / float(labels.size)
    if show:
-        print(100. - error * 100,'% instances correctly classified')
-        print('%-10s|  %-10s|  %-10s| ' % ('',names[0],names[1]))
-        print('----------|------------|------------|')
-        print('%-10s|  %-10s|  %-10s| ' % (names[0],true_1,false_0))
-        print('%-10s|  %-10s|  %-10s| ' % (names[1],false_1,true_0))
-    return error,true_1, false_1, true_0, false_0
+        print(100.0 - error * 100, "% instances correctly classified")
+        print("%-10s|  %-10s|  %-10s| " % ("", names[0], names[1]))
+        print("----------|------------|------------|")
+        print("%-10s|  %-10s|  %-10s| " % (names[0], true_1, false_0))
+        print("%-10s|  %-10s|  %-10s| " % (names[1], false_1, true_0))
+    return error, true_1, false_1, true_0, false_0
--- a/GPy/util/config.py
+++ b/GPy/util/config.py
@ -2,31 +2,46 @@
 # This loads the configuration
 #
 import os
+
 try:
-    #Attempt Python 2 ConfigParser setup
+    # Attempt Python 2 ConfigParser setup
    import ConfigParser
+
    config = ConfigParser.ConfigParser()
    from ConfigParser import NoOptionError
 except ImportError:
-    #Attempt Python 3 ConfigParser setup
+    # Attempt Python 3 ConfigParser setup
    import configparser
+
    config = configparser.ConfigParser()
    from configparser import NoOptionError

 # This is the default configuration file that always needs to be present.
-default_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'defaults.cfg'))
+default_file = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "..", "defaults.cfg")
+)

 # These files are optional
 # This specifies configurations that are typically specific to the machine (it is found alongside the GPy installation).
-local_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'installation.cfg'))
+local_file = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "..", "installation.cfg")
+)

 # This specifies configurations specific to the user (it is found in the user home directory)
-home = os.getenv('HOME') or os.getenv('USERPROFILE') or ''
-user_file = os.path.join(home,'.config','GPy', 'user.cfg')
+home = os.getenv("HOME") or os.getenv("USERPROFILE") or ""
+user_file = os.path.join(home, ".config", "GPy", "user.cfg")

 # Read in the given files.
-config.readfp(open(default_file))
+config.read_file(open(default_file))
 config.read([local_file, user_file])

 if not config:
-    raise ValueError("No configuration file found at either " + user_file + " or " + local_file + " or " + default_file + ".")
+    raise ValueError(
+        "No configuration file found at either "
+        + user_file
+        + " or "
+        + local_file
+        + " or "
+        + default_file
+        + "."
+    )
--- a/GPy/util/initialization.py
+++ b/GPy/util/initialization.py
@ -1,31 +1,59 @@
-'''
+"""
 Created on 24 Feb 2014

@author: maxz
-'''
+"""

 import numpy as np
+import warnings
 from ..util.pca import PCA

+
 def initialize_latent(init, input_dim, Y):
+    """
+    :param init: initialization method for the latent space, 'PCA' or 'random'
+    """
    Xr = np.asfortranarray(np.random.normal(0, 1, (Y.shape[0], input_dim)))
-    if 'PCA' in init:
+    if "PCA" == init:
        p = PCA(Y)
        PC = p.project(Y, min(input_dim, Y.shape[1]))
-        Xr[:PC.shape[0], :PC.shape[1]] = PC
-        var = .1*p.fracs[:input_dim]
-    elif init in 'empirical_samples':
+        Xr[: PC.shape[0], : PC.shape[1]] = PC
+        var = 0.1 * p.fracs[:input_dim]
+    elif init == "empirical_samples":
+        # dealing with depcrecated initialization method
+        # should be remove along the next major release
+        warnings.warn(
+            "Deprecated initialization method 'empirical_samples'. "
+            "Use 'random' instead.",
+            DeprecationWarning,
+        )
+
        from ..util.linalg import tdot
        from ..util import diag
+
        YYT = tdot(Y)
        diag.add(YYT, 1e-6)
-        EMP = np.asfortranarray(np.random.multivariate_normal(np.zeros(Y.shape[0]), YYT, min(input_dim, Y.shape[1])).T)
-        Xr[:EMP.shape[0], :EMP.shape[1]] = EMP
+        EMP = np.asfortranarray(
+            np.random.multivariate_normal(
+                np.zeros(Y.shape[0]), YYT, min(input_dim, Y.shape[1])
+            ).T
+        )
+        Xr[: EMP.shape[0], : EMP.shape[1]] = EMP
        var = np.random.uniform(0.5, 1.5, input_dim)
-    else:
+    elif init == "random":
        var = Xr.var(0)
+    else:
+        # dealing with depcrecated initialization method
+        # should be remove along the next major release
+        warnings.warn(
+            f"{init} is not a valid initialization method."
+            "Supoprt for anything else than 'PCA' or 'random' will be removed in the next major release.",
+            DeprecationWarning,
+        )
+        var = Xr.var(0)
+        

    Xr -= Xr.mean(0)
    Xr /= Xr.std(0)

-    return Xr, var/var.max()
+    return Xr, var / var.max()
--- a/GPy/util/linalg_cython.c
+++ b/GPy/util/linalg_cython.c
--- a/GPy/util/multioutput.py
+++ b/GPy/util/multioutput.py
@ -2,6 +2,7 @@ import numpy as np
 import warnings
 import GPy

+
 def index_to_slices(index):
    """
    take a numpy array of integers (index) and return a  nested list of slices such that the slices describe the start, stop points for each integer in the index.
@ -16,28 +17,35 @@ def index_to_slices(index):
    returns
    >>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
    """
-    if len(index)==0:
-        return[]
+    if len(index) == 0:
+        return []

-    #contruct the return structure
-    ind = np.asarray(index,dtype=np.int)
-    ret = [[] for i in range(ind.max()+1)]
+    # contruct the return structure
+    ind = np.asarray(index, dtype=int)
+    ret = [[] for i in range(ind.max() + 1)]

-    #find the switchpoints
-    ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
-    switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
+    # find the switchpoints
+    ind_ = np.hstack((ind, ind[0] + ind[-1] + 1))
+    switchpoints = np.nonzero(ind_ - np.roll(ind_, +1))[0]

-    [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
+    [
+        ret[ind_i].append(slice(*indexes_i))
+        for ind_i, indexes_i in zip(
+            ind[switchpoints[:-1]], zip(switchpoints, switchpoints[1:])
+        )
+    ]
    return ret

+
 def get_slices(input_list):
    num_outputs = len(input_list)
-    _s = [0] + [ _x.shape[0] for _x in input_list ]
+    _s = [0] + [_x.shape[0] for _x in input_list]
    _s = np.cumsum(_s)
-    slices = [slice(a,b) for a,b in zip(_s[:-1],_s[1:])]
+    slices = [slice(a, b) for a, b in zip(_s[:-1], _s[1:])]
    return slices

-def build_XY(input_list,output_list=None,index=None):
+
+def build_XY(input_list, output_list=None, index=None):
    num_outputs = len(input_list)
    if output_list is not None:
        assert num_outputs == len(output_list)
@ -47,27 +55,35 @@ def build_XY(input_list,output_list=None,index=None):

    if index is not None:
        assert len(index) == num_outputs
-        I = np.hstack( [np.repeat(j,_x.shape[0]) for _x,j in zip(input_list,index)] )
+        I = np.hstack([np.repeat(j, _x.shape[0]) for _x, j in zip(input_list, index)])
    else:
-        I = np.hstack( [np.repeat(j,_x.shape[0]) for _x,j in zip(input_list,range(num_outputs))] )
+        I = np.hstack(
+            [np.repeat(j, _x.shape[0]) for _x, j in zip(input_list, range(num_outputs))]
+        )

    X = np.vstack(input_list)
-    X = np.hstack([X,I[:,None]])
+    X = np.hstack([X, I[:, None]])

-    return X,Y,I[:,None]#slices
+    return X, Y, I[:, None]  # slices

-def build_likelihood(Y_list,noise_index,likelihoods_list=None):
+
+def build_likelihood(Y_list, noise_index, likelihoods_list=None):
    Ny = len(Y_list)
    if likelihoods_list is None:
-       likelihoods_list = [GPy.likelihoods.Gaussian(name="Gaussian_noise_%s" %j) for y,j in zip(Y_list,range(Ny))]
+        likelihoods_list = [
+            GPy.likelihoods.Gaussian(name="Gaussian_noise_%s" % j)
+            for y, j in zip(Y_list, range(Ny))
+        ]
    else:
        assert len(likelihoods_list) == Ny
-    #likelihood = GPy.likelihoods.mixed_noise.MixedNoise(likelihoods_list=likelihoods_list, noise_index=noise_index)
-    likelihood = GPy.likelihoods.mixed_noise.MixedNoise(likelihoods_list=likelihoods_list)
+    # likelihood = GPy.likelihoods.mixed_noise.MixedNoise(likelihoods_list=likelihoods_list, noise_index=noise_index)
+    likelihood = GPy.likelihoods.mixed_noise.MixedNoise(
+        likelihoods_list=likelihoods_list
+    )
    return likelihood


-def ICM(input_dim, num_outputs, kernel, W_rank=1,W=None,kappa=None,name='ICM'):
+def ICM(input_dim, num_outputs, kernel, W_rank=1, W=None, kappa=None, name="ICM"):
    """
    Builds a kernel for an Intrinsic Coregionalization Model

@ -80,13 +96,26 @@ def ICM(input_dim, num_outputs, kernel, W_rank=1,W=None,kappa=None,name='ICM'):
    """
    if kernel.input_dim != input_dim:
        kernel.input_dim = input_dim
-        warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
+        warnings.warn(
+            "kernel's input dimension overwritten to fit input_dim parameter."
+        )

-    K = kernel.prod(GPy.kern.Coregionalize(1, num_outputs, active_dims=[input_dim], rank=W_rank,W=W,kappa=kappa,name='B'),name=name)
+    K = kernel.prod(
+        GPy.kern.Coregionalize(
+            1,
+            num_outputs,
+            active_dims=[input_dim],
+            rank=W_rank,
+            W=W,
+            kappa=kappa,
+            name="B",
+        ),
+        name=name,
+    )
    return K


-def LCM(input_dim, num_outputs, kernels_list, W_rank=1,name='ICM'):
+def LCM(input_dim, num_outputs, kernels_list, W_rank=1, name="ICM"):
    """
    Builds a kernel for an Linear Coregionalization Model

@ -98,15 +127,15 @@ def LCM(input_dim, num_outputs, kernels_list, W_rank=1,name='ICM'):
    :type W_rank: integer
    """
    Nk = len(kernels_list)
-    K = ICM(input_dim,num_outputs,kernels_list[0],W_rank,name='%s%s' %(name,0))
+    K = ICM(input_dim, num_outputs, kernels_list[0], W_rank, name="%s%s" % (name, 0))
    j = 1
    for kernel in kernels_list[1:]:
-        K += ICM(input_dim,num_outputs,kernel,W_rank,name='%s%s' %(name,j))
+        K += ICM(input_dim, num_outputs, kernel, W_rank, name="%s%s" % (name, j))
        j += 1
    return K


-def Private(input_dim, num_outputs, kernel, output, kappa=None,name='X'):
+def Private(input_dim, num_outputs, kernel, output, kappa=None, name="X"):
    """
    Builds a kernel for an Intrinsic Coregionalization Model

@ -117,7 +146,7 @@ def Private(input_dim, num_outputs, kernel, output, kappa=None,name='X'):
    :param W_rank: number tuples of the corregionalization parameters 'W'
    :type W_rank: integer
    """
-    K = ICM(input_dim,num_outputs,kernel,W_rank=1,kappa=kappa,name=name)
+    K = ICM(input_dim, num_outputs, kernel, W_rank=1, kappa=kappa, name=name)
    K.B.W.fix(0)
    _range = range(num_outputs)
    _range.pop(output)
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -3,7 +3,6 @@ include doc/source/conf.py
 include doc/source/index.rst
 include doc/source/tuto*.rst
 include README.md
-include README.rst
 include AUTHORS.txt

 # Data and config
--- a/README.md
+++ b/README.md
@ -79,7 +79,7 @@ If that is the case, it is best to clean the repo and reinstall.
 [<img src="https://upload.wikimedia.org/wikipedia/commons/8/8e/OS_X-Logo.svg" height=40px>](http://www.apple.com/osx/)
 [<img src="https://upload.wikimedia.org/wikipedia/commons/3/35/Tux.svg" height=40px>](https://en.wikipedia.org/wiki/List_of_Linux_distributions)

-Python 3.5 and higher
+Python 3.9 and higher

 ## Citation

@ -129,7 +129,7 @@ If you're having trouble installing GPy via `pip install GPy` here is a probable
    cd GPy
    git checkout devel
    python setup.py build_ext --inplace
-    nosetests GPy/testing
+    pytest .

 ### Direct downloads

@ -171,13 +171,13 @@ print(m_load)

 New way of running tests is using coverage:

-Ensure nose and coverage is installed:
+Ensure pytest and coverage is installed:

-    pip install nose coverage
+    pip install pytest

 Run nosetests from root directory of repository:

-    coverage run travis_tests.py
+    python travis_tests.py

 Create coverage report in htmlcov/

--- a/appveyor.yml
+++ b/appveyor.yml
@ -1,92 +0,0 @@
-environment:
-  pip_access:
-    secure: 8/ZjXFwtd1S7ixd7PJOpptupKKEDhm2da/q3unabJ00=
-  COVERALLS_REPO_TOKEN:
-    secure: d3Luic/ESkGaWnZrvWZTKrzO+xaVwJWaRCEP0F+K/9DQGPSRZsJ/Du5g3s4XF+tS
-  gpy_version: 1.12.0
-  matrix:
-    - PYTHON_VERSION: 3.6
-      MINICONDA: C:\Miniconda3-x64
-      MPL_VERSION: 3.3.4
-    - PYTHON_VERSION: 3.7
-      MINICONDA: C:\Miniconda3-x64
-      MPL_VERSION: 3.3.4
-    - PYTHON_VERSION: 3.8
-      MINICONDA: C:\Miniconda3-x64
-      MPL_VERSION: 3.3.4
-    - PYTHON_VERSION: 3.9
-      MINICONDA: C:\Miniconda3-x64
-      MPL_VERSION: 3.3.4
-
-#configuration:
-#  - Debug
-#  - Release
-
-install:
- - "set PATH=%MINICONDA%;%MINICONDA%\\Scripts;%PATH%"
- - conda config --set always_yes yes --set changeps1 no
- - conda update -q conda
- - conda info -a
-# github issue #955: freeze build version of matplotlib
- - "conda create -q -n build-environment python=%PYTHON_VERSION% numpy scipy matplotlib=%MPL_VERSION%"
- - activate build-environment
- # We need wheel installed to build wheels
- - python -m pip install wheel
- # GPy needs paramz
- - python -m pip install paramz
- - python -m pip install nose-show-skipped
- - python -m pip install coverage
- - python -m pip install coveralls
- - python -m pip install codecov
- - python -m pip install twine
- - "python setup.py develop"
-
-build: off
-
-test_script:
-  # Put your test command here.
-  # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4,
-  # you can remove "build.cmd" from the front of the command, as it's
-  # only needed to support those cases.
-  # Note that you must use the environment variable %PYTHON% to refer to
-  # the interpreter you're using - Appveyor does not do anything special
-  # to put the Python evrsion you want to use on PATH.
-  #- "build.cmd %PYTHON%\\python.exe setup.py test"
-  - "coverage run travis_tests.py"
-
-after_test:
-  # This step builds your wheels.
-  - "python setup.py bdist_wheel"
-  - codecov
-
-artifacts:
-  # bdist_wheel puts your built wheel in the dist directory
-  - path: dist\*
-
-
-deploy_script:
- echo [distutils] > %USERPROFILE%\\.pypirc
- echo index-servers = >> %USERPROFILE%\\.pypirc
- echo     pypi >> %USERPROFILE%\\.pypirc
- echo     test >> %USERPROFILE%\\.pypirc
- echo[
- echo [pypi] >> %USERPROFILE%\\.pypirc
- echo username = maxz >> %USERPROFILE%\\.pypirc
- echo password = %pip_access% >> %USERPROFILE%\\.pypirc
- echo[
- echo [test] >> %USERPROFILE%\\.pypirc
- echo repository = https://testpypi.python.org/pypi >> %USERPROFILE%\\.pypirc
- echo username = maxz >> %USERPROFILE%\\.pypirc
- echo password = %pip_access% >> %USERPROFILE%\\.pypirc
- .appveyor_twine_upload.bat
-
-# deploy:
-#   - provider: GitHub
-#     release: GPy-v$(gpy_version)
-#     description: 'GPy windows install'
-#     artifact: dist/*.exe               # upload wininst to GitHub
-#     draft: false
-#     prerelease: false
-#     on:
-#         branch: deploy                 # release from deploy branch only
-#         appveyor_repo_tag: true        # deploy on tag push only
--- a/benchmarks/regression/evaluation.py
+++ b/benchmarks/regression/evaluation.py
@ -4,18 +4,19 @@
 import abc
 import numpy as np

+
 class Evaluation(object):
    __metaclass__ = abc.ABCMeta
-    
+
    @abc.abstractmethod
    def evaluate(self, gt, pred):
        """Compute a scalar for access the performance"""
        return None

+
 class RMSE(Evaluation):
    "Rooted Mean Square Error"
-    name = 'RMSE'
-    
+    name = "RMSE"
+
    def evaluate(self, gt, pred):
-        return np.sqrt(np.square(gt-pred).astype(np.float).mean())
-    
+        return np.sqrt(np.square(gt - pred).astype(float).mean())
--- a/doc/source/requirements.txt
+++ b/doc/source/requirements.txt
@ -7,4 +7,4 @@ paramz
 cython
 mock
 sympy
-nose
+pytest
--- a/Show more
+++ b/Show more