Merge pull request #1084 from SheffieldML/devel

Release version 1.13.2
2026-07-02 16:01:03 +02:00 · 2024-07-21 17:35:25 +02:00 · 2024-07-21 17:35:25 +02:00 · 7e1cb7adee
commit 7e1cb7adee
parent 6254451513 282fcd4d68
103 changed files with 35568 additions and 21650 deletions
--- a/.appveyor_twine_upload.bat
+++ b/.appveyor_twine_upload.bat
@ -1,5 +0,0 @@
 IF "%APPVEYOR_REPO_BRANCH%"=="deploy" (
  twine upload --skip-existing dist/*
 ) ELSE (
  ECHO Only deploy on deploy branch
 )
--- a/.github/workflows/test-and-deploy.yml
+++ b/.github/workflows/test-and-deploy.yml
@ -0,0 +1,268 @@
 name: "Test Python Lib"
 # Triggers: pushes to the long-lived branches, every pull request, and
 # published releases.
 on:
  push:
    branches:
      - main
      - devel
      - deploy
  pull_request:
  release:
    # Limit to `published`: without an activity-type filter a single release
    # fires several `release` events (created, published, released, ...), so
    # the build and deploy jobs below would each run more than once.
    types: [published]
 # Workflow-level token permissions: read-only access to repository contents
 # and pull requests.
 permissions:
  contents: read
  pull-requests: read
 jobs:
  # Windows test matrix: installs the package in development mode and runs the
  # pytest suite under GPy/testing once per listed Python version.
  test-windows:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os:
          - windows-latest
        # Versions stay quoted so that 3.10 is not read as the float 3.1.
        python:
          - '3.9'
          - '3.10'
          - '3.11'
          - '3.12'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python }}
      - name: Install build dependencies
        run: pip install setuptools
      - name: Install lib
        run: python setup.py develop
      - name: Install test dependencies
        run: |
          pip install matplotlib
          pip install pytest
      - name: pytest
        run: pytest GPy/testing
  # Linux test matrix: installs the package in development mode and runs the
  # pytest suite under GPy/testing once per listed Python version.
  test-linux:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os:
          - ubuntu-latest
        # Versions stay quoted so that 3.10 is not read as the float 3.1.
        python:
          - '3.9'
          - '3.10'
          - '3.11'
          - '3.12'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python }}
      - name: Install build dependencies
        run: pip install setuptools
      - name: Install lib
        run: python setup.py develop
      - name: Install test dependencies
        run: |
          pip install matplotlib
          pip install pytest
      - name: pytest
        run: pytest GPy/testing
  # macOS test matrix: installs the package in development mode and runs the
  # pytest suite under GPy/testing once per listed Python version.
  test-macos:
    strategy:
      matrix:
        os: [macos-latest]
        # 3.9 is left out here, as in build-macos, where it is noted to trigger
        # scipy issues when installing.
        python: ['3.10', '3.11', '3.12']
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python }}
      # Step name aligned with the test-windows and test-linux jobs.
      - name: Install build dependencies
        run: |
          pip install setuptools
      - name: Install lib
        run: |
          python setup.py develop
      - name: Install test dependencies
        run: |
          pip install matplotlib
          pip install pytest
      - name: pytest
        run: |
          pytest GPy/testing
  # Release-only job: builds the Windows wheel for each listed Python version,
  # plus a source distribution, and stores them as a workflow artifact.
  build-windows:
    if: github.event_name == 'release'
    strategy:
      matrix:
        os: [windows-latest]
        python: ['3.9', '3.10', '3.11', '3.12']
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python }}
      - name: Build lib
        run: |
          pip install wheel
          python setup.py develop
          # A single invocation produces both the sdist and the wheel; a
          # separate bdist_wheel run beforehand would only build the same
          # wheel twice.
          python setup.py sdist bdist_wheel
      - name: List contents of dist
        run: ls -R dist
      - name: Archive build artifacts
        uses: actions/upload-artifact@v3
        with:
          # One artifact per matrix entry; the deploy job collects all of them.
          name: dist-artifacts-${{ matrix.os }}-${{ matrix.python }}
          path: dist
  # Release-only job: builds the macOS wheel for each listed Python version and
  # stores it as a workflow artifact.
  build-macos:
    if: github.event_name == 'release'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os:
          - macos-latest
        # 3.9 triggers scipy issues when installing
        python:
          - '3.10'
          - '3.11'
          - '3.12'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python }}
      - name: Build lib
        run: |
          pip install wheel
          python setup.py develop
          python setup.py bdist_wheel
      - name: List contents of dist
        run: |
          ls -R dist
      - name: Archive build artifacts
        uses: actions/upload-artifact@v3
        with:
          name: dist-artifacts-${{ matrix.os }}-${{ matrix.python }}
          path: dist/*
  # Release-only job: builds wheels inside the manylinux2014 container, repairs
  # them with auditwheel and stores the repaired wheels as a workflow artifact.
  build-linux:
    if: github.event_name == 'release'
    strategy:
      matrix:
        # Interpreter tags; each one selects /opt/python/<tag>/bin/python in
        # the container.
        python: ['cp39-cp39', 'cp310-cp310', 'cp311-cp311', 'cp312-cp312']
    runs-on: ubuntu-latest
    container:
      image: quay.io/pypa/manylinux2014_x86_64
    steps:
      # NOTE(review): checkout is pinned to v3 here while the other jobs use
      # v4 - presumably because of the container image; confirm before bumping.
      - name: Checkout
        uses: actions/checkout@v3
      - name: Compile c headers
        run: |
          /opt/python/${{ matrix.python }}/bin/python setup.py develop
      - name: Build wheel files
        run: |
          /opt/python/${{ matrix.python }}/bin/python setup.py bdist_wheel
      - name: Install auditwheel  # this should be available?!
        run: |
          /opt/python/${{ matrix.python }}/bin/python -m pip install auditwheel
      - name: Repair wheel files
        run: |
          /opt/python/${{ matrix.python }}/bin/python -m auditwheel repair dist/*${{ matrix.python }}-linux_x86_64.whl
      - name: List contents of dist
        run: ls -R dist
      - name: List contents of wheelhouse
        run: ls -R wheelhouse
      # Replace the plain linux_x86_64 wheels in dist/ with the repaired ones
      # from wheelhouse/ so that only the repaired wheels are archived.
      - name: Move wheelhouse wheel files to dist
        run: |
          rm dist/*
          mv wheelhouse/* dist/
          rmdir wheelhouse
      - name: List contents of dist
        run: ls -R dist
      - name: Archive build artifacts
        uses: actions/upload-artifact@v3
        with:
          name: dist-artifacts-manylinux-${{ matrix.python }}
          path: dist/*
  # Release-only job: gathers the artifacts of all build jobs into dist/ and
  # uploads them to PyPI. Runs only after every test and build job succeeded.
  deploy:
    runs-on: ubuntu-latest
    needs: [test-windows, test-linux, test-macos, build-linux, build-windows, build-macos]
    if: github.event_name == 'release'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'
      - name: Install twine
        run: |
          pip install --upgrade pip
          pip install twine
      # Create the target directory before anything is downloaded into it;
      # running this after the download would have no effect.
      - name: Create dist directory
        run: mkdir -p dist
      # No artifact name is given, so every artifact is fetched; the next step
      # expects each one in its own subdirectory of dist/.
      - name: Download all artifacts to a specific directory
        uses: actions/download-artifact@v3
        with:
          path: dist
      # Flatten dist/<artifact>/<file> to dist/<file> so that the twine glob
      # below picks up every distribution.
      - name: Move files from subdirectories
        run: |
          for subdirectory in dist/*/; do
            dir_name=$(basename "$subdirectory")
            mv "$subdirectory"* dist/
            rm -r "$subdirectory"
            echo "Moved files from '$dir_name' to 'dist/'"
          done
      - name: Inspect wheel files
        run: |
          ls -R dist
      # --skip-existing keeps the upload from failing on files that are
      # already on PyPI.
      - name: Upload to PyPI using twine
        run: twine upload --skip-existing dist/*
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
--- a/.gitignore
+++ b/.gitignore
@ -56,3 +56,8 @@ GPy*.rst
 # vscode
 settings.json
 # local dev
 .eggs
 .venv
 .env
--- a/.travis.yml
+++ b/.travis.yml
@ -1,73 +0,0 @@
 sudo: false
 osx_image: xcode12.2
 os:
 - osx
 - linux
 addons:
  apt_packages:
    - pandoc
 #cache:
 #  directories:
 #  - "$HOME/download/"
 #  - "$HOME/install/"
 env:
  - PYTHON_VERSION=3.6
  - PYTHON_VERSION=3.7
  - PYTHON_VERSION=3.8
  - PYTHON_VERSION=3.9
 before_install:
 - wget https://github.com/mzwiessele/travis_scripts/raw/master/download_miniconda.sh
 - wget https://github.com/mzwiessele/travis_scripts/raw/master/install_retry.sh
 - source download_miniconda.sh
 - echo $PATH
 install:
 - echo $PATH
 - source install_retry.sh
 - if [[ "$TRAVIS_OS_NAME" == "osx" ]];
  then
    conda install --yes pandoc;
  fi;
 - pip install codecov
 - pip install coveralls
 - pip install pypandoc
 - pip install git+git://github.com/BRML/climin.git
 - pip install autograd
 - pip install nose-show-skipped
 - python setup.py develop
 script:
  - coverage run travis_tests.py
 after_success:
  - codecov
  - coveralls
 before_deploy:
  - if [[ "$TRAVIS_OS_NAME" == "linux" ]];
    then
      export DIST='sdist bdist_rpm bdist_dumb';
    elif [[ "$TRAVIS_OS_NAME" == "osx" ]];
    then
      export DIST='bdist_wheel';
    fi;
 deploy:
  provider: pypi
  user: maxz
  password:
    secure: "vMEOlP7DQhFJ7hQAKtKC5hrJXFl5BkUt4nXdosWWiw//Kg8E+PPLg88XPI2gqIosir9wwgtbSBBbbwCxkM6uxRNMpoNR8Ixyv9fmSXp4rLl7bbBY768W7IRXKIBjpuEy2brQjoT+CwDDSzUkckHvuUjJDNRvUv8ab4P/qYO1LG4="
  on:
    branch: deploy
  edge:
    branch: v1.8.45
  distributions: $DIST
  skip_existing: true
  skip_cleanup: true
  skip_upload_docs: false
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,29 @@
 # Changelog
 ## Unreleased
 ## v1.13.2 (2024-07-21)
 * update string checks in initialization method for latent variable and put `empirical_samples` init-method on a deprecation path
 * update dependencies to `numpy>=1.7.0,<2.0.0`
 * update import in `.plotting.matplot_dep.defaults` due to change in matplotlib
 * Correct dl_dm term in student t inference #1065
 ## v1.13.1 (2024-01-14)
 * limit `scipy<1.12` as macos and linux jobs install some pre-release version of `scipy==1.12` which breaks tests
 ## v1.13.0 (2023-12-20)
 * update `paramz` dependency to `>=0.9.6`
 * limit supported python versions to `">=3.9"` in accordance with numpy
 * Change from `nosetest` to `pytest`
 ## v1.9.8 (2019-05-17)
--- a/GPy/init.py
+++ b/GPy/init.py
@ -1,6 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import warnings
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 from . import core
@ -18,30 +19,25 @@ from .util import normalizer
 # backwards compatibility
 import sys
-backwards_compatibility = ['lists_and_dicts', 'observable_array', 'index_operations']
+
 backwards_compatibility = ["lists_and_dicts", "observable_array", "index_operations"]
 for bc in backwards_compatibility:
-    sys.modules['GPy.core.parameterization.{!s}'.format(bc)] = getattr(core.parameterization, bc)
+    sys.modules["GPy.core.parameterization.{!s}".format(bc)] = getattr(
        core.parameterization, bc
    )
 # Direct imports for convenience:
 from .core import Model
 from .core.parameterization import priors
-from .core.parameterization import Param, Parameterized, ObsAr, transformations as constraints
+from .core.parameterization import (
    Param,
    Parameterized,
    ObsAr,
    transformations as constraints,
 )
 from .__version__ import __version__
 from numpy.testing import Tester
 with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    try:
        #Get rid of nose dependency by only ignoring if you have nose installed
        from nose.tools import nottest
        @nottest
        def tests(verbose=10):
            Tester(testing).test(verbose=verbose)
    except:
        def tests(verbose=10):
            Tester(testing).test(verbose=verbose)
 def load(file_or_path):
    """
@ -52,10 +48,12 @@ def load(file_or_path):
    # This is the pickling pain when changing _src -> src
    import sys
    import inspect
-    sys.modules['GPy.kern._src'] = kern.src
+
    sys.modules["GPy.kern._src"] = kern.src
    for name, module in inspect.getmembers(kern.src):
-        if not name.startswith('_'):
+        if not name.startswith("_"):
-            sys.modules['GPy.kern._src.{}'.format(name)] = module
+            sys.modules["GPy.kern._src.{}".format(name)] = module
-    sys.modules['GPy.inference.optimization'] = inference.optimization
+    sys.modules["GPy.inference.optimization"] = inference.optimization
    import paramz
    return paramz.load(file_or_path)
--- a/GPy/version.py
+++ b/GPy/version.py
@ -1 +1 @@
-__version__ = "1.12.0"
+__version__ = "1.13.2"
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@ -13,14 +13,15 @@ import weakref
 class Prior(object):
    domain = None
    _instance = None
    def __new__(cls, *args, **kwargs):
        if not cls._instance or cls._instance.__class__ is not cls:
-                newfunc = super(Prior, cls).__new__
+            newfunc = super(Prior, cls).__new__
-                if newfunc is object.__new__:
+            if newfunc is object.__new__:
-                    cls._instance = newfunc(cls)
+                cls._instance = newfunc(cls)
-                else:
+            else:
-                    cls._instance = newfunc(cls, *args, **kwargs)
+                cls._instance = newfunc(cls, *args, **kwargs)
-                return cls._instance
+            return cls._instance
    def pdf(self, x):
        return np.exp(self.lnpdf(x))
@ -47,6 +48,7 @@ class Gaussian(Prior):
    .. Note:: Bishop 2006 notation is used throughout the code
    """
    domain = _REAL
    _instances = []
@ -82,6 +84,7 @@ class Gaussian(Prior):
    def rvs(self, n):
        return np.random.randn(n) * self.sigma + self.mu
 #     def __getstate__(self):
 #         return self.mu, self.sigma
 #
@ -91,6 +94,7 @@ class Gaussian(Prior):
 #         self.sigma2 = np.square(self.sigma)
 #         self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
 class Uniform(Prior):
    _instances = []
@ -132,6 +136,7 @@ class Uniform(Prior):
    def rvs(self, n):
        return np.random.uniform(self.lower, self.upper, size=n)
 #     def __getstate__(self):
 #         return self.lower, self.upper
 #
@ -139,6 +144,7 @@ class Uniform(Prior):
 #         self.lower = state[0]
 #         self.upper = state[1]
 class LogGaussian(Gaussian):
    """
    Implementation of the univariate *log*-Gaussian probability function, coupled with random variables.
@ -149,6 +155,7 @@ class LogGaussian(Gaussian):
    .. Note:: Bishop 2006 notation is used throughout the code
    """
    domain = _POSITIVE
    _instances = []
@ -176,10 +183,14 @@ class LogGaussian(Gaussian):
        return "lnN({:.2g}, {:.2g})".format(self.mu, self.sigma)
    def lnpdf(self, x):
-        return self.constant - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2 - np.log(x)
+        return (
            self.constant
            - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2
            - np.log(x)
        )
    def lnpdf_grad(self, x):
-        return -((np.log(x) - self.mu) / self.sigma2 + 1.) / x
+        return -((np.log(x) - self.mu) / self.sigma2 + 1.0) / x
    def rvs(self, n):
        return np.exp(np.random.randn(int(n)) * self.sigma + self.mu)
@ -195,16 +206,15 @@ class MultivariateGaussian(Prior):
    .. Note:: Bishop 2006 notation is used throughout the code
    """
    domain = _REAL
    _instances = []
    def __new__(cls, mu=0, var=1):  # Singleton:
        if cls._instances:
-            cls._instances[:] = [instance for instance in cls._instances if
+            cls._instances[:] = [instance for instance in cls._instances if instance()]
                                 instance()]
            for instance in cls._instances:
-                if np.all(instance().mu == mu) and np.all(
+                if np.all(instance().mu == mu) and np.all(instance().var == var):
                        instance().var == var):
                    return instance()
        newfunc = super(Prior, cls).__new__
        if newfunc is object.__new__:
@ -217,16 +227,17 @@ class MultivariateGaussian(Prior):
    def __init__(self, mu, var):
        self.mu = np.array(mu).flatten()
        self.var = np.array(var)
-        assert len(self.var.shape) == 2, 'Covariance must be a matrix'
+        assert len(self.var.shape) == 2, "Covariance must be a matrix"
-        assert self.var.shape[0] == self.var.shape[1], \
+        assert (
-            'Covariance must be a square matrix'
+            self.var.shape[0] == self.var.shape[1]
        ), "Covariance must be a square matrix"
        assert self.var.shape[0] == self.mu.size
        self.input_dim = self.mu.size
        self.inv, _, self.hld, _ = pdinv(self.var)
        self.constant = -0.5 * (self.input_dim * np.log(2 * np.pi) + self.hld)
    def __str__(self):
-        return 'MultiN(' + str(self.mu) + ', ' + str(np.diag(self.var)) + ')'
+        return "MultiN(" + str(self.mu) + ", " + str(np.diag(self.var)) + ")"
    def summary(self):
        raise NotImplementedError
@ -243,7 +254,7 @@ class MultivariateGaussian(Prior):
    def lnpdf_grad(self, x):
        x = np.array(x).flatten()
        d = x - self.mu
-        return - np.dot(self.inv, d)
+        return -np.dot(self.inv, d)
    def rvs(self, n):
        return np.random.multivariate_normal(self.mu, self.var, n)
@ -262,14 +273,16 @@ class MultivariateGaussian(Prior):
    def __setstate__(self, state):
        self.mu = np.array(state[0]).flatten()
        self.var = state[1]
-        assert len(self.var.shape) == 2, 'Covariance must be a matrix'
+        assert len(self.var.shape) == 2, "Covariance must be a matrix"
-        assert self.var.shape[0] == self.var.shape[1], \
+        assert (
-            'Covariance must be a square matrix'
+            self.var.shape[0] == self.var.shape[1]
        ), "Covariance must be a square matrix"
        assert self.var.shape[0] == self.mu.size
        self.input_dim = self.mu.size
        self.inv, _, self.hld, _ = pdinv(self.var)
        self.constant = -0.5 * (self.input_dim * np.log(2 * np.pi) + self.hld)
 def gamma_from_EV(E, V):
    warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning)
    return Gamma.from_EV(E, V)
@ -285,10 +298,11 @@ class Gamma(Prior):
    .. Note:: Bishop 2006 notation is used throughout the code
    """
    domain = _POSITIVE
    _instances = []
-    def __new__(cls, a=1, b=.5):  # Singleton:
+    def __new__(cls, a=1, b=0.5):  # Singleton:
        if cls._instances:
            cls._instances[:] = [instance for instance in cls._instances if instance()]
            for instance in cls._instances:
@ -319,24 +333,29 @@ class Gamma(Prior):
        return "Ga({:.2g}, {:.2g})".format(self.a, self.b)
    def summary(self):
-        ret = {"E[x]": self.a / self.b, \
+        ret = {
-               "E[ln x]": digamma(self.a) - np.log(self.b), \
+            "E[x]": self.a / self.b,
-               "var[x]": self.a / self.b / self.b, \
+            "E[ln x]": digamma(self.a) - np.log(self.b),
-               "Entropy": gammaln(self.a) - (self.a - 1.) * digamma(self.a) - np.log(self.b) + self.a}
+            "var[x]": self.a / self.b / self.b,
            "Entropy": gammaln(self.a)
            - (self.a - 1.0) * digamma(self.a)
            - np.log(self.b)
            + self.a,
        }
        if self.a > 1:
-            ret['Mode'] = (self.a - 1.) / self.b
+            ret["Mode"] = (self.a - 1.0) / self.b
        else:
-            ret['mode'] = np.nan
+            ret["mode"] = np.nan
        return ret
    def lnpdf(self, x):
        return self.constant + (self.a - 1) * np.log(x) - self.b * x
    def lnpdf_grad(self, x):
-        return (self.a - 1.) / x - self.b
+        return (self.a - 1.0) / x - self.b
    def rvs(self, n):
-        return np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
+        return np.random.gamma(scale=1.0 / self.b, shape=self.a, size=n)
    @staticmethod
    def from_EV(E, V):
@ -359,6 +378,7 @@ class Gamma(Prior):
        self._b = state[1]
        self.constant = -gammaln(self.a) + self.a * np.log(self.b)
 class InverseGamma(Gamma):
    """
    Implementation of the inverse-Gamma probability function, coupled with random variables.
@ -369,6 +389,7 @@ class InverseGamma(Gamma):
    .. Note:: Bishop 2006 notation is used throughout the code
    """
    domain = _POSITIVE
    _instances = []
@ -386,10 +407,11 @@ class InverseGamma(Gamma):
        return self.constant - (self.a + 1) * np.log(x) - self.b / x
    def lnpdf_grad(self, x):
-        return -(self.a + 1.) / x + self.b / x ** 2
+        return -(self.a + 1.0) / x + self.b / x**2
    def rvs(self, n):
-        return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
+        return 1.0 / np.random.gamma(scale=1.0 / self.b, shape=self.a, size=n)
 class DGPLVM_KFDA(Prior):
    """
@ -403,6 +425,7 @@ class DGPLVM_KFDA(Prior):
    .. Note:: Surpassing Human-Level Face paper dgplvm implementation
    """
    domain = _REAL
    # _instances = []
    # def __new__(cls, lambdaa, sigma2):  # Singleton:
@ -459,8 +482,8 @@ class DGPLVM_KFDA(Prior):
        lst_ni = []
        lst_ni1 = []
        lst_ni2 = []
-        f1 = (np.where(self.lbl[:, 0] == 1)[0])
+        f1 = np.where(self.lbl[:, 0] == 1)[0]
-        f2 = (np.where(self.lbl[:, 1] == 1)[0])
+        f2 = np.where(self.lbl[:, 1] == 1)[0]
        for idx in f1:
            lst_ni1.append(idx)
        for idx in f2:
@ -474,11 +497,11 @@ class DGPLVM_KFDA(Prior):
        count = 0
        for N_i in lst_ni:
            if N_i == lst_ni[0]:
-                a[count:count + N_i] = (float(1) / N_i) * a[count]
+                a[count : count + N_i] = (float(1) / N_i) * a[count]
                count += N_i
            else:
                if N_i == lst_ni[1]:
-                    a[count: count + N_i] = -(float(1) / N_i) * a[count]
+                    a[count : count + N_i] = -(float(1) / N_i) * a[count]
                    count += N_i
        return a
@ -486,8 +509,12 @@ class DGPLVM_KFDA(Prior):
        A = np.zeros((self.datanum, self.datanum))
        idx = 0
        for N_i in lst_ni:
-            B = float(1) / np.sqrt(N_i) * (np.eye(N_i) - ((float(1) / N_i) * np.ones((N_i, N_i))))
+            B = (
-            A[idx:idx + N_i, idx:idx + N_i] = B
+                float(1)
                / np.sqrt(N_i)
                * (np.eye(N_i) - ((float(1) / N_i) * np.ones((N_i, N_i))))
            )
            A[idx : idx + N_i, idx : idx + N_i] = B
            idx += N_i
        return A
@ -498,9 +525,11 @@ class DGPLVM_KFDA(Prior):
        a_trans = np.transpose(self.a)
        paran = self.lambdaa * np.eye(x.shape[0]) + self.A.dot(K).dot(self.A)
        inv_part = pdinv(paran)[0]
-        J = a_trans.dot(K).dot(self.a) - a_trans.dot(K).dot(self.A).dot(inv_part).dot(self.A).dot(K).dot(self.a)
+        J = a_trans.dot(K).dot(self.a) - a_trans.dot(K).dot(self.A).dot(inv_part).dot(
-        J_star = (1. / self.lambdaa) * J
+            self.A
-        return (-1. / self.sigma2) * J_star
+        ).dot(K).dot(self.a)
        J_star = (1.0 / self.lambdaa) * J
        return (-1.0 / self.sigma2) * J_star
    # Here gradient function
    def lnpdf_grad(self, x):
@ -511,15 +540,15 @@ class DGPLVM_KFDA(Prior):
        b = self.A.dot(inv_part).dot(self.A).dot(K).dot(self.a)
        a_Minus_b = self.a - b
        a_b_trans = np.transpose(a_Minus_b)
-        DJ_star_DK = (1. / self.lambdaa) * (a_Minus_b.dot(a_b_trans))
+        DJ_star_DK = (1.0 / self.lambdaa) * (a_Minus_b.dot(a_b_trans))
        DJ_star_DX = self.kern.gradients_X(DJ_star_DK, x)
-        return (-1. / self.sigma2) * DJ_star_DX
+        return (-1.0 / self.sigma2) * DJ_star_DX
    def rvs(self, n):
        return np.random.rand(n)  # A WRONG implementation
    def __str__(self):
-        return 'DGPLVM_prior'
+        return "DGPLVM_prior"
    def __getstate___(self):
        return self.lbl, self.lambdaa, self.sigma2, self.kern, self.x_shape
@ -547,6 +576,7 @@ class DGPLVM(Prior):
    .. Note:: DGPLVM for Classification paper implementation
    """
    domain = _REAL
    def __new__(cls, sigma2, lbl, x_shape):
@ -606,7 +636,7 @@ class DGPLVM(Prior):
        for i in data_idx:
            if len(lst_idx) == 0:
                pass
-                #Do nothing, because it is the first time list is created so is empty
+                # Do nothing, because it is the first time list is created so is empty
            else:
                lst_idx = []
            # Here we put indices of each class in to the list called lst_idx_all
@ -631,9 +661,9 @@ class DGPLVM(Prior):
            N_i = float(len(cls[i]))
            W_WT = np.zeros((self.dim, self.dim))
            for xk in cls[i]:
-                W = (xk - M_i[i])
+                W = xk - M_i[i]
                W_WT += np.outer(W, W)
-            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+            Sw += (N_i / self.datanum) * ((1.0 / N_i) * W_WT)
        return Sw
    # Calculating beta and Bi for Sb
@ -658,7 +688,6 @@ class DGPLVM(Prior):
        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
        return Sig_beta_B_i_all
    # Calculating W_j s separately so we can access all the W_j s anytime
    def compute_wj(self, data_idx, M_i):
        W_i = np.zeros((self.datanum, self.dim))
@ -667,7 +696,7 @@ class DGPLVM(Prior):
            for tpl in data_idx[i]:
                xj = tpl[1]
                j = tpl[0]
-                W_i[j] = (xj - M_i[i])
+                W_i[j] = xj - M_i[i]
        return W_i
    # Calculating alpha and Wj for Sw
@ -680,11 +709,11 @@ class DGPLVM(Prior):
                for j in lst_idx_all[i]:
                    if k == j:
                        alpha = 1 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
                    else:
                        alpha = 0 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
-        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+        Sig_alpha_W_i = (1.0 / self.datanum) * np.transpose(Sig_alpha_W_i)
        return Sig_alpha_W_i
    # This function calculates log of our prior
@ -696,9 +725,9 @@ class DGPLVM(Prior):
        Sb = self.compute_Sb(cls, M_i, M_0)
        Sw = self.compute_Sw(cls, M_i)
        # sb_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
    # This function calculates derivative of the log of prior function
@ -717,19 +746,20 @@ class DGPLVM(Prior):
        # Calculating inverse of Sb and its transpose and minus
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)
        # Calculating DJ/DXk
        DJ_Dxk = 2 * (
-            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
-                Sig_alpha_W_i))
+            + Sb_inv_N_trans.dot(Sig_alpha_W_i)
        )
        # Calculating derivative of the log of the prior
-        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        DPx_Dx = (-1 / self.sigma2) * DJ_Dxk
        return DPx_Dx.T
    # def frb(self, x):
@ -744,7 +774,7 @@ class DGPLVM(Prior):
        return np.random.rand(n)  # A WRONG implementation
    def __str__(self):
-        return 'DGPLVM_prior_Raq'
+        return "DGPLVM_prior_Raq"
 # ******************************************
@ -752,6 +782,7 @@ class DGPLVM(Prior):
 from . import Parameterized
 from . import Param
 class DGPLVM_Lamda(Prior, Parameterized):
    """
    Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
@ -761,6 +792,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
    .. Note:: DGPLVM for Classification paper implementation
    """
    domain = _REAL
    # _instances = []
    # def __new__(cls, mu, sigma): # Singleton:
@ -773,7 +805,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
    #     cls._instances.append(weakref.ref(o))
    #     return cls._instances[-1]()
-    def __init__(self, sigma2, lbl, x_shape, lamda, name='DP_prior'):
+    def __init__(self, sigma2, lbl, x_shape, lamda, name="DP_prior"):
        super(DGPLVM_Lamda, self).__init__(name=name)
        self.sigma2 = sigma2
        # self.x = x
@ -783,7 +815,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
        self.datanum = lbl.shape[0]
        self.x_shape = x_shape
        self.dim = x_shape[1]
-        self.lamda = Param('lamda', np.diag(lamda))
+        self.lamda = Param("lamda", np.diag(lamda))
        self.link_parameter(self.lamda)
    def get_class_label(self, y):
@ -831,7 +863,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
        for i in data_idx:
            if len(lst_idx) == 0:
                pass
-                #Do nothing, because it is the first time list is created so is empty
+                # Do nothing, because it is the first time list is created so is empty
            else:
                lst_idx = []
            # Here we put indices of each class in to the list called lst_idx_all
@ -856,9 +888,9 @@ class DGPLVM_Lamda(Prior, Parameterized):
            N_i = float(len(cls[i]))
            W_WT = np.zeros((self.dim, self.dim))
            for xk in cls[i]:
-                W = (xk - M_i[i])
+                W = xk - M_i[i]
                W_WT += np.outer(W, W)
-            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+            Sw += (N_i / self.datanum) * ((1.0 / N_i) * W_WT)
        return Sw
    # Calculating beta and Bi for Sb
@ -883,7 +915,6 @@ class DGPLVM_Lamda(Prior, Parameterized):
        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
        return Sig_beta_B_i_all
    # Calculating W_j s separately so we can access all the W_j s anytime
    def compute_wj(self, data_idx, M_i):
        W_i = np.zeros((self.datanum, self.dim))
@ -892,7 +923,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
            for tpl in data_idx[i]:
                xj = tpl[1]
                j = tpl[0]
-                W_i[j] = (xj - M_i[i])
+                W_i[j] = xj - M_i[i]
        return W_i
    # Calculating alpha and Wj for Sw
@ -905,11 +936,11 @@ class DGPLVM_Lamda(Prior, Parameterized):
                for j in lst_idx_all[i]:
                    if k == j:
                        alpha = 1 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
                    else:
                        alpha = 0 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
-        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+        Sig_alpha_W_i = (1.0 / self.datanum) * np.transpose(Sig_alpha_W_i)
        return Sig_alpha_W_i
    # This function calculates log of our prior
@ -917,7 +948,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
        x = x.reshape(self.x_shape)
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!
-        #self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()
+        # self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()
        xprime = x.dot(np.diagflat(self.lamda))
        x = xprime
@ -928,9 +959,9 @@ class DGPLVM_Lamda(Prior, Parameterized):
        Sb = self.compute_Sb(cls, M_i, M_0)
        Sw = self.compute_Sw(cls, M_i)
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.9)[0]
        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
    # This function calculates derivative of the log of prior function
@ -952,19 +983,20 @@ class DGPLVM_Lamda(Prior, Parameterized):
        # Calculating inverse of Sb and its transpose and minus
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.9)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)
        # Calculating DJ/DXk
        DJ_Dxk = 2 * (
-            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
-                Sig_alpha_W_i))
+            + Sb_inv_N_trans.dot(Sig_alpha_W_i)
        )
        # Calculating derivative of the log of the prior
-        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        DPx_Dx = (-1 / self.sigma2) * DJ_Dxk
        DPxprim_Dx = np.diagflat(self.lamda).dot(DPx_Dx)
@ -980,7 +1012,6 @@ class DGPLVM_Lamda(Prior, Parameterized):
        # print DPxprim_Dx
        return DPxprim_Dx
    # def frb(self, x):
    #     from functools import partial
    #     from GPy.models import GradientChecker
@ -993,10 +1024,12 @@ class DGPLVM_Lamda(Prior, Parameterized):
        return np.random.rand(n)  # A WRONG implementation
    def __str__(self):
-        return 'DGPLVM_prior_Raq_Lamda'
+        return "DGPLVM_prior_Raq_Lamda"
 # ******************************************
 class DGPLVM_T(Prior):
    """
    Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.
@ -1006,6 +1039,7 @@ class DGPLVM_T(Prior):
    .. Note:: DGPLVM for Classification paper implementation
    """
    domain = _REAL
    # _instances = []
    # def __new__(cls, mu, sigma): # Singleton:
@ -1028,7 +1062,6 @@ class DGPLVM_T(Prior):
        self.dim = x_shape[1]
        self.vec = vec
    def get_class_label(self, y):
        for idx, v in enumerate(y):
            if v == 1:
@ -1075,7 +1108,7 @@ class DGPLVM_T(Prior):
        for i in data_idx:
            if len(lst_idx) == 0:
                pass
-                #Do nothing, because it is the first time list is created so is empty
+                # Do nothing, because it is the first time list is created so is empty
            else:
                lst_idx = []
            # Here we put indices of each class in to the list called lst_idx_all
@ -1100,9 +1133,9 @@ class DGPLVM_T(Prior):
            N_i = float(len(cls[i]))
            W_WT = np.zeros((self.dim, self.dim))
            for xk in cls[i]:
-                W = (xk - M_i[i])
+                W = xk - M_i[i]
                W_WT += np.outer(W, W)
-            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
+            Sw += (N_i / self.datanum) * ((1.0 / N_i) * W_WT)
        return Sw
    # Calculating beta and Bi for Sb
@ -1127,7 +1160,6 @@ class DGPLVM_T(Prior):
        Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
        return Sig_beta_B_i_all
    # Calculating W_j s separately so we can access all the W_j s anytime
    def compute_wj(self, data_idx, M_i):
        W_i = np.zeros((self.datanum, self.dim))
@ -1136,7 +1168,7 @@ class DGPLVM_T(Prior):
            for tpl in data_idx[i]:
                xj = tpl[1]
                j = tpl[0]
-                W_i[j] = (xj - M_i[i])
+                W_i[j] = xj - M_i[i]
        return W_i
    # Calculating alpha and Wj for Sw
@ -1149,11 +1181,11 @@ class DGPLVM_T(Prior):
                for j in lst_idx_all[i]:
                    if k == j:
                        alpha = 1 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
                    else:
                        alpha = 0 - (float(1) / N_i)
-                        Sig_alpha_W_i[k] += (alpha * W_i[j])
+                        Sig_alpha_W_i[k] += alpha * W_i[j]
-        Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
+        Sig_alpha_W_i = (1.0 / self.datanum) * np.transpose(Sig_alpha_W_i)
        return Sig_alpha_W_i
    # This function calculates log of our prior
@ -1168,10 +1200,10 @@ class DGPLVM_T(Prior):
        Sb = self.compute_Sb(cls, M_i, M_0)
        Sw = self.compute_Sw(cls, M_i)
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #print 'SB_inv: ', Sb_inv_N
+        # print 'SB_inv: ', Sb_inv_N
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
    # This function calculates derivative of the log of prior function
@ -1193,20 +1225,21 @@ class DGPLVM_T(Prior):
        # Calculating inverse of Sb and its transpose and minus
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
-        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
+        # Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #print 'SB_inv: ',Sb_inv_N
+        # print 'SB_inv: ',Sb_inv_N
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
+        # Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)
        # Calculating DJ/DXk
        DJ_Dxk = 2 * (
-            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
+            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
-                Sig_alpha_W_i))
+            + Sb_inv_N_trans.dot(Sig_alpha_W_i)
        )
        # Calculating derivative of the log of the prior
-        DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
+        DPx_Dx = (-1 / self.sigma2) * DJ_Dxk
        return DPx_Dx.T
    # def frb(self, x):
@ -1221,9 +1254,7 @@ class DGPLVM_T(Prior):
        return np.random.rand(n)  # A WRONG implementation
    def __str__(self):
-        return 'DGPLVM_prior_Raq_TTT'
+        return "DGPLVM_prior_Raq_TTT"
 class HalfT(Prior):
@ -1234,6 +1265,7 @@ class HalfT(Prior):
    :param nu: degrees of freedom
    """
    domain = _POSITIVE
    _instances = []
@ -1250,13 +1282,22 @@ class HalfT(Prior):
    def __init__(self, A, nu):
        self.A = float(A)
        self.nu = float(nu)
-        self.constant = gammaln(.5*(self.nu+1.)) - gammaln(.5*self.nu) - .5*np.log(np.pi*self.A*self.nu)
+        self.constant = (
            gammaln(0.5 * (self.nu + 1.0))
            - gammaln(0.5 * self.nu)
            - 0.5 * np.log(np.pi * self.A * self.nu)
        )
    def __str__(self):
        return "hT({:.2g}, {:.2g})".format(self.A, self.nu)
    def lnpdf(self, theta):
-        return (theta > 0) * (self.constant - .5*(self.nu + 1) * np.log(1. + (1./self.nu) * (theta/self.A)**2))
+        return (theta > 0) * (
            self.constant
            - 0.5
            * (self.nu + 1)
            * np.log(1.0 + (1.0 / self.nu) * (theta / self.A) ** 2)
        )
        # theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
        # lnpdfs = np.zeros_like(theta)
@ -1268,7 +1309,7 @@ class HalfT(Prior):
        # lnpdfs[above_zero] = (+ gammaln((v + 1) * 0.5)
        #     - gammaln(v * 0.5)
        #     - 0.5*np.log(sigma2 * v * np.pi)
-        #     - 0.5*(v + 1)*np.log(1 + (1/np.float(v))*((theta[above_zero][0]**2)/sigma2))
+        #     - 0.5*(v + 1)*np.log(1 + (1/float(v))*((theta[above_zero][0]**2)/sigma2))
        # )
        # return lnpdfs
@ -1278,12 +1319,18 @@ class HalfT(Prior):
        above_zero = theta > 1e-6
        v = self.nu
        sigma2 = self.A
-        grad[above_zero] = -0.5*(v+1)*(2*theta[above_zero])/(v*sigma2 + theta[above_zero][0]**2)
+        grad[above_zero] = (
            -0.5
            * (v + 1)
            * (2 * theta[above_zero])
            / (v * sigma2 + theta[above_zero][0] ** 2)
        )
        return grad
    def rvs(self, n):
        # return np.random.randn(n) * self.sigma + self.mu
        from scipy.stats import t
        # [np.abs(x) for x in t.rvs(df=4,loc=0,scale=50, size=10000)])
        ret = t.rvs(self.nu, loc=0, scale=self.A, size=n)
        ret[ret < 0] = 0
@ -1298,6 +1345,7 @@ class Exponential(Prior):
    :param l: shape parameter
    """
    domain = _POSITIVE
    _instances = []
@ -1318,22 +1366,25 @@ class Exponential(Prior):
        return "Exp({:.2g})".format(self.l)
    def summary(self):
-        ret = {"E[x]": 1. / self.l,
+        ret = {
-               "E[ln x]": np.nan,
+            "E[x]": 1.0 / self.l,
-               "var[x]": 1. / self.l**2,
+            "E[ln x]": np.nan,
-               "Entropy": 1. - np.log(self.l),
+            "var[x]": 1.0 / self.l**2,
-               "Mode": 0.}
+            "Entropy": 1.0 - np.log(self.l),
            "Mode": 0.0,
        }
        return ret
    def lnpdf(self, x):
        """Log-density of the exponential prior with rate ``self.l`` evaluated at ``x``."""
        rate = self.l
        log_rate = np.log(rate)
        return log_rate - rate * x
    def lnpdf_grad(self, x):
-        return - self.l
+        return -self.l
    def rvs(self, n):
        """
        Draw ``n`` samples from the exponential prior.

        ``self.l`` is the rate of the distribution: ``lnpdf`` evaluates
        ``log(l) - l * x`` and ``summary`` reports ``E[x] = 1 / l``.
        ``np.random.exponential`` is parameterised by the scale (the mean,
        i.e. ``1 / rate``), so the rate has to be inverted before sampling.

        :param n: number of samples to draw
        :returns: array of ``n`` samples with mean ``1 / self.l``
        """
        return np.random.exponential(scale=1.0 / self.l, size=n)
 class StudentT(Prior):
    """
    Implementation of the student t probability function, coupled with random variables.
@ -1345,6 +1396,7 @@ class StudentT(Prior):
    .. Note:: Bishop 2006 notation is used throughout the code
    """
    domain = _REAL
    _instances = []
@ -1352,7 +1404,11 @@ class StudentT(Prior):
        if cls._instances:
            cls._instances[:] = [instance for instance in cls._instances if instance()]
            for instance in cls._instances:
-                if instance().mu == mu and instance().sigma == sigma and instance().nu == nu:
+                if (
                    instance().mu == mu
                    and instance().sigma == sigma
                    and instance().nu == nu
                ):
                    return instance()
        newfunc = super(Prior, cls).__new__
        if newfunc is object.__new__:
@ -1373,13 +1429,18 @@ class StudentT(Prior):
    def lnpdf(self, x):
        from scipy.stats import t
-        return t.logpdf(x,self.nu,self.mu,self.sigma)
+
        return t.logpdf(x, self.nu, self.mu, self.sigma)
    def lnpdf_grad(self, x):
-        return -(self.nu + 1.)*(x - self.mu)/( self.nu*self.sigma2 + np.square(x - self.mu) )
+        return (
            -(self.nu + 1.0)
            * (x - self.mu)
            / (self.nu * self.sigma2 + np.square(x - self.mu))
        )
    def rvs(self, n):
        """Draw ``n`` samples from scipy's Student-t with ``nu`` degrees of freedom, location ``mu`` and scale ``sigma``."""
        from scipy import stats

        return stats.t.rvs(self.nu, loc=self.mu, scale=self.sigma, size=n)
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -771,3 +771,117 @@ def multioutput_gp_with_derivative_observations(plot=True):
    mu, var = m.predict_noiseless(Xnew=[xpred, np.empty((0, 1))])
    return m
 def multioutput_gp_with_derivative_observations_2D(optimize=True, plot=False):
    '''
    This is an example of how to use a MultioutputGP model with gradient
    observations and multiple single-dimensional kernels of differing types.
    The latent function is f(x) = exp(-x0**2) + cos(w*x1). The model is built
    from noisy observations of f and of both partial derivatives at ten
    hard-coded 2D training inputs, and is then used to predict the function
    and its gradient on a regular 25 x 25 grid.
    :param optimize: if True, call ``model.optimize()`` after building the model
    :param plot: if True and matplotlib is available (``MPL_AVAILABLE``), draw
        contour plots of the true and of the predicted function and gradients
    :returns: the constructed MultioutputGP model
    '''
    period = 3
    w = 2*np.pi/period # angular frequency
    bounds = (-period, period)
    # latent function and gradient
    # f returns a column of shape (N, 1); df returns one column per partial
    # derivative, i.e. shape (N, 2)
    f = lambda x: (np.exp(-x[:,0]**2) + np.cos(w*x[:,1]))[:,None]
    df = lambda x: np.array([-2*np.exp(-x[:,0]**2)*x[:,0], -w*np.sin(w*x[:,1])]).T
    # 2D input grid
    ppa = 25 # points per axis
    x = np.linspace(*bounds, ppa)
    xx, yy = np.meshgrid(x, x)
    # flatten the mesh into a (ppa*ppa, 2) array of grid points
    grid = np.array([xx.reshape(-1), yy.reshape(-1)]).T
    # ground truth on the grid, only used for plotting
    fgrid = f(grid)
    dfgrid = df(grid)
    # 10 random training points generated with a space-filling sobol sequence
    # (hard-coded here, so the training inputs are the same on every run)
    X = np.array([
        [ 0.50421399,  2.1331483 ],
        [-2.15717152, -1.70295936],
        [-1.46704334,  1.37111521],
        [ 2.79064536, -0.9649018 ],
        [ 1.60728264,  0.27702713],
        [-0.30712366, -0.57372129],
        [-2.6140632 ,  2.49192488],
        [ 0.89078772, -2.85873686],
        [ 1.15813136,  0.96910322],
        [-2.83307021, -1.38155383]
    ])
    # Note!
    # This example uses the same inputs for function and gradient observations.
    noise_std = 1e-2
    # function observations
    # (the observation noise is drawn from the unseeded global numpy RNG, so
    # Y and dY differ between runs)
    Y = f(X) + np.random.normal(scale=noise_std, size=(len(X), 1))
    # gradient observations
    dY = df(X) + np.random.normal(scale=noise_std, size=(len(X), 2))
    # gather inputs and observations into lists
    X_list = [X, X, X]
    # once for function observations, and once for each partial derivative
    # make sure all arrays are of shape (N x dims), where N is # of training points
    Y_list = [Y, dY[:,0,None], dY[:,1,None]]
    # create a kernel that is the product of two one-dimensional kernels
    # the first kernel is an RBF kernel
    kern0 = GPy.kern.RBF(input_dim=1, active_dims=[0])
    # as the function is periodic in the second dimension, we use a StdP kernel
    kern1 = GPy.kern.StdPeriodic(input_dim=1, active_dims=[1], period=period)
    # the period is known, so it is fixed rather than optimized
    # NOTE(review): presumably constrain_fixed() excludes it from optimization - confirm
    kern1.period.constrain_fixed()
    # the kernels can be multiplied together into a product kernel
    kern = kern0 * kern1
    # with gradient observations, we need to define a DiffKern for each dimension
    # the DiffKern is given the main kernel as a base kernel
    diffkern0 = GPy.kern.DiffKern(kern, 0)
    diffkern1 = GPy.kern.DiffKern(kern, 1)
    # gather the main kernel and diffkerns into a list
    kern_list = [kern, diffkern0, diffkern1]
    # define a likelihood and repeat it in a list
    # (list repetition stores three references to the same likelihood object)
    likelihood_list = [GPy.likelihoods.Gaussian(variance=noise_std**2)]*3
    # create the MultioutputGP model and optimize
    model = GPy.models.MultioutputGP(X_list, Y_list, kern_list, likelihood_list)
    # the noise variance was set from noise_std above, so it is fixed as well
    model.likelihood.constrain_fixed()
    if optimize:
        model.optimize()
    # make function predictions
    # index=[0] selects the function output (first entry of kern_list)
    Xnew, _, ind = GPy.util.multioutput.build_XY([grid], index=[0])
    Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
    mu, var = model.predict(Xnew, Y_metadata=Y_metadata)
    # make gradient predictions
    # index=[1, 2] selects the two derivative outputs, each evaluated on the grid
    Xnew, _, ind = GPy.util.multioutput.build_XY([grid]*2, index=[1, 2])
    Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
    mu_d, var_d = model.predict(Xnew, Y_metadata=Y_metadata)
    # split the stacked predictions into their two halves (first derivative,
    # then second derivative) and place them side by side as columns
    # assumes predict returns (2*len(grid), 1) column vectors - TODO confirm
    mu_d = np.array([mu_d[:len(grid)], mu_d[len(grid):]]).T[0]
    var_d = np.array([var_d[:len(grid)], var_d[len(grid):]]).T[0]
    if plot and MPL_AVAILABLE:
        # first figure: ground truth of f and of both partial derivatives
        fig, axs = plt.subplots(1, 3)
        for ax in axs: ax.set_box_aspect(1)
        axs[0].set_title('true f')
        axs[0].contourf(xx, yy, fgrid.reshape(ppa, ppa), levels=25)
        axs[1].set_title('true df1')
        axs[1].contourf(xx, yy, dfgrid[:,0].reshape(ppa, ppa), levels=25)
        axs[2].set_title('true df2')
        axs[2].contourf(xx, yy, dfgrid[:,1].reshape(ppa, ppa), levels=25)
        # second figure: the model's predictive means for the same quantities
        fig, axs = plt.subplots(1, 3)
        for ax in axs: ax.set_box_aspect(1)
        axs[0].set_title('pred f')
        axs[0].contourf(xx, yy, mu.reshape(ppa, ppa), levels=25)
        axs[1].set_title('pred df1')
        axs[1].contourf(xx, yy, mu_d[:,0].reshape(ppa, ppa), levels=25)
        axs[2].set_title('pred df2')
        axs[2].contourf(xx, yy, mu_d[:,1].reshape(ppa, ppa), levels=25)
    return model
--- a/GPy/inference/latent_function_inference/exact_studentt_inference.py
+++ b/GPy/inference/latent_function_inference/exact_studentt_inference.py
@ -35,15 +35,20 @@ class ExactStudentTInference(LatentFunctionInference):
        # Log marginal
        N = Y.shape[0]
        D = Y.shape[1]
-        log_marginal = 0.5 * (-N * np.log((nu - 2) * np.pi) - W_logdet - (nu + N) * np.log(1 + beta / (nu - 2)))
+        log_marginal = 0.5 * (
            -N * np.log((nu - 2) * np.pi)
            - W_logdet
            - (nu + N) * np.log(1 + beta / (nu - 2))
        )
        log_marginal += gammaln((nu + N) / 2) - gammaln(nu / 2)
        # Gradients
        dL_dK = 0.5 * ((nu + N) / (nu + beta - 2) * tdot(alpha) - D * Wi)
-        dL_dnu = -N / (nu - 2.) + digamma(0.5 * (nu + N)) - digamma(0.5 * nu)
+        dL_dnu = -N / (nu - 2.0) + digamma(0.5 * (nu + N)) - digamma(0.5 * nu)
-        dL_dnu -= np.log(1 + beta / (nu - 2.))
+        dL_dnu -= np.log(1 + beta / (nu - 2.0))
        dL_dnu += ((nu + N) * beta) / ((nu - 2) * (beta + nu - 2))
        dL_dnu *= 0.5
-        gradients = {'dL_dK': dL_dK, 'dL_dnu': dL_dnu, 'dL_dm': alpha}
+        dL_dm = (nu + N) / (nu + beta - 2) * alpha
        gradients = {"dL_dK": dL_dK, "dL_dnu": dL_dnu, "dL_dm": dL_dm}
        return posterior, log_marginal, gradients
--- a/GPy/kern/src/coregionalize.py
+++ b/GPy/kern/src/coregionalize.py
@ -5,13 +5,16 @@ from .kern import Kern
 import numpy as np
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
-from ...util.config import config # for assesing whether to use cython
+from ...util.config import config  # for assesing whether to use cython
 try:
    from . import coregionalize_cython
-    use_coregionalize_cython = config.getboolean('cython', 'working')
+
    use_coregionalize_cython = config.getboolean("cython", "working")
 except ImportError:
-    print('warning in coregionalize: failed to import cython module: falling back to numpy')
+    print(
        "warning in coregionalize: failed to import cython module: falling back to numpy"
    )
    use_coregionalize_cython = False
@ -43,22 +46,34 @@ class Coregionalize(Kern):
    .. note: see coregionalization examples in GPy.examples.regression for some usage.
    """
-    def __init__(self, input_dim, output_dim, rank=1, W=None, kappa=None, active_dims=None, name='coregion'):
+
    def __init__(
        self,
        input_dim,
        output_dim,
        rank=1,
        W=None,
        kappa=None,
        active_dims=None,
        name="coregion",
    ):
        super(Coregionalize, self).__init__(input_dim, active_dims, name=name)
        self.output_dim = output_dim
        self.rank = rank
-        if self.rank>output_dim:
+        if self.rank > output_dim:
-            print("Warning: Unusual choice of rank, it should normally be less than the output_dim.")
+            print(
                "Warning: Unusual choice of rank, it should normally be less than the output_dim."
            )
        if W is None:
-            W = 0.5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
+            W = 0.5 * np.random.randn(self.output_dim, self.rank) / np.sqrt(self.rank)
        else:
-            assert W.shape==(self.output_dim, self.rank)
+            assert W.shape == (self.output_dim, self.rank)
-        self.W = Param('W', W)
+        self.W = Param("W", W)
        if kappa is None:
-            kappa = 0.5*np.ones(self.output_dim)
+            kappa = 0.5 * np.ones(self.output_dim)
        else:
-            assert kappa.shape==(self.output_dim, )
+            assert kappa.shape == (self.output_dim,)
-        self.kappa = Param('kappa', kappa, Logexp())
+        self.kappa = Param("kappa", kappa, Logexp())
        self.link_parameters(self.W, self.kappa)
    def parameters_changed(self):
@ -70,63 +85,69 @@ class Coregionalize(Kern):
        else:
            return self._K_numpy(X, X2)
    def _K_numpy(self, X, X2=None):
-        index = np.asarray(X, dtype=np.int)
+        index = np.asarray(X, dtype=int)
        if X2 is None:
-            return self.B[index,index.T]
+            return self.B[index, index.T]
        else:
-            index2 = np.asarray(X2, dtype=np.int)
+            index2 = np.asarray(X2, dtype=int)
-            return self.B[index,index2.T]
+            return self.B[index, index2.T]
    def _K_cython(self, X, X2=None):
        if X2 is None:
-            return coregionalize_cython.K_symmetric(self.B, np.asarray(X, dtype=np.int64)[:,0])
+            return coregionalize_cython.K_symmetric(
-        return coregionalize_cython.K_asymmetric(self.B, np.asarray(X, dtype=np.int64)[:,0], np.asarray(X2, dtype=np.int64)[:,0])
+                self.B, np.asarray(X, dtype=np.int64)[:, 0]
-
+            )
        return coregionalize_cython.K_asymmetric(
            self.B,
            np.asarray(X, dtype=np.int64)[:, 0],
            np.asarray(X2, dtype=np.int64)[:, 0],
        )
    def Kdiag(self, X):
-        return np.diag(self.B)[np.asarray(X, dtype=np.int).flatten()]
+        return np.diag(self.B)[np.asarray(X, dtype=int).flatten()]
    def update_gradients_full(self, dL_dK, X, X2=None):
-        index = np.asarray(X, dtype=np.int)
+        index = np.asarray(X, dtype=int)
        if X2 is None:
            index2 = index
        else:
-            index2 = np.asarray(X2, dtype=np.int)
+            index2 = np.asarray(X2, dtype=int)
-        #attempt to use cython for a nasty double indexing loop: fall back to numpy
+        # attempt to use cython for a nasty double indexing loop: fall back to numpy
        if use_coregionalize_cython:
            dL_dK_small = self._gradient_reduce_cython(dL_dK, index, index2)
        else:
            dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
        dkappa = np.diag(dL_dK_small).copy()
        dL_dK_small += dL_dK_small.T
-        dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
+        dW = (self.W[:, None, :] * dL_dK_small[:, :, None]).sum(0)
        self.W.gradient = dW
        self.kappa.gradient = dkappa
    def _gradient_reduce_numpy(self, dL_dK, index, index2):
-        index, index2 = index[:,0], index2[:,0]
+        index, index2 = index[:, 0], index2[:, 0]
        dL_dK_small = np.zeros_like(self.B)
        for i in range(self.output_dim):
-            tmp1 = dL_dK[index==i]
+            tmp1 = dL_dK[index == i]
            for j in range(self.output_dim):
-                dL_dK_small[j,i] = tmp1[:,index2==j].sum()
+                dL_dK_small[j, i] = tmp1[:, index2 == j].sum()
        return dL_dK_small
    def _gradient_reduce_cython(self, dL_dK, index, index2):
-        index, index2 = np.int64(index[:,0]), np.int64(index2[:,0])
+        index, index2 = np.int64(index[:, 0]), np.int64(index2[:, 0])
-        return coregionalize_cython.gradient_reduce(self.B.shape[0], dL_dK, index, index2)
+        return coregionalize_cython.gradient_reduce(
-
+            self.B.shape[0], dL_dK, index, index2
        )
    def update_gradients_diag(self, dL_dKdiag, X):
-        index = np.asarray(X, dtype=np.int).flatten()
+        index = np.asarray(X, dtype=int).flatten()
-        dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in range(self.output_dim)])
+        dL_dKdiag_small = np.array(
-        self.W.gradient = 2.*self.W*dL_dKdiag_small[:, None]
+            [dL_dKdiag[index == i].sum() for i in range(self.output_dim)]
        )
        self.W.gradient = 2.0 * self.W * dL_dKdiag_small[:, None]
        self.kappa.gradient = dL_dKdiag_small
    def gradients_X(self, dL_dK, X, X2=None):
@ -154,8 +175,8 @@ class Coregionalize(Kern):
    @staticmethod
    def _build_from_input_dict(kernel_class, input_dict):
-        useGPU = input_dict.pop('useGPU', None)
+        useGPU = input_dict.pop("useGPU", None)
        # W and kappa must be converted back to numpy arrays
-        input_dict['W'] = np.array(input_dict['W'])
+        input_dict["W"] = np.array(input_dict["W"])
-        input_dict['kappa'] = np.array(input_dict['kappa'])
+        input_dict["kappa"] = np.array(input_dict["kappa"])
        return Coregionalize(**input_dict)
--- a/GPy/kern/src/coregionalize_cython.c
+++ b/GPy/kern/src/coregionalize_cython.c
--- a/GPy/kern/src/diff_kern.py
+++ b/GPy/kern/src/diff_kern.py
@ -23,24 +23,42 @@ class DiffKern(Kern):
        self.base_kern.parameters_changed()
    @Cache_this(limit=3, ignore_args=())
-    def K(self, X, X2=None, dimX2 = None): #X in dimension self.dimension
+    def K(self, X, X2=None, dimX2=None): #X in dimension self.dimension
        if X2 is None:
            X2 = X
        if dimX2 is None:
            dimX2 = self.dimension
-        return self.base_kern.dK2_dXdX2(X,X2, self.dimension, dimX2)
+        return self.base_kern.dK2_dXdX2(X, X2, self.dimension, dimX2)
    @Cache_this(limit=3, ignore_args=())
    def dK_dX(self, X, X2, dimX, dimX2=None):
        if dimX2 is None:
            dimX2 = self.dimension
        return self.base_kern.dK3_dXdXdX2(X, X2, dimX, self.dimension, dimX2)
    @Cache_this(limit=3, ignore_args=())
    def Kdiag(self, X):
-        return np.diag(self.base_kern.dK2_dXdX2(X,X, self.dimension, self.dimension))
+        return self.base_kern.dK2_dXdX2diag(X, self.dimension, self.dimension)
    @Cache_this(limit=3, ignore_args=())
    def dK_dXdiag(self, X, dimX):
        return self.base_kern.dK3_dXdXdX2diag(X, dimX, self.dimension, self.dimension)
    @Cache_this(limit=3, ignore_args=())
    def dK_dX_wrap(self, X, X2): #X in dimension self.dimension
-        return self.base_kern.dK_dX(X,X2, self.dimension)
+        return self.base_kern.dK_dX(X, X2, self.dimension)
    @Cache_this(limit=3, ignore_args=())
    def dK_dX2_wrap(self, X, X2): #X in dimension self.dimension
-        return self.base_kern.dK_dX2(X,X2, self.dimension)
+        return self.base_kern.dK_dX2(X, X2, self.dimension)
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX2_wrap(self, X, X2, dimX):
        return self.base_kern.dK2_dXdX2(X, X2, dimX, self.dimension)
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX_wrap(self, X, X2, dimX):
        return self.base_kern.dK2_dXdX(X, X2, dimX, self.dimension)
    def reset_gradients(self):
        self.base_kern.reset_gradients()
@ -56,32 +74,32 @@ class DiffKern(Kern):
    def update_gradients_full(self, dL_dK, X, X2=None, dimX2=None):
        if dimX2 is None:
            dimX2 = self.dimension
-        gradients = self.base_kern.dgradients2_dXdX2(X,X2,self.dimension,dimX2)
+        gradients = self.base_kern.dgradients2_dXdX2(X, X2, self.dimension, dimX2)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
    def update_gradients_diag(self, dL_dK_diag, X):
-        gradients = self.base_kern.dgradients2_dXdX2(X,X, self.dimension, self.dimension)
+        gradients = self.base_kern.dgradients2_dXdX2(X, X, self.dimension, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK_diag, gradient, f=np.diag) for gradient in gradients])
    def update_gradients_dK_dX(self, dL_dK, X, X2=None):
        if X2 is None:
            X2 = X
-        gradients = self.base_kern.dgradients_dX(X,X2, self.dimension)
+        gradients = self.base_kern.dgradients_dX(X, X2, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
    def update_gradients_dK_dX2(self, dL_dK, X, X2=None):
-        gradients = self.base_kern.dgradients_dX2(X,X2, self.dimension)
+        gradients = self.base_kern.dgradients_dX2(X, X2, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
    def gradients_X(self, dL_dK, X, X2):
-        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,:, self.dimension]
+        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,:,self.dimension]
        return np.sum(tmp, axis=1)
    def gradients_X2(self, dL_dK, X, X2):
-        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:, :, self.dimension, :]
+        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,self.dimension,:]
        return np.sum(tmp, axis=1)
-    def _convert_gradients(self, l,g, f = lambda x:x):
+    def _convert_gradients(self, l, g, f=lambda x:x):
        if type(g) is np.ndarray:
            return np.sum(f(l)*f(g))
        else:
--- a/GPy/kern/src/eq_ode1.py
+++ b/GPy/kern/src/eq_ode1.py
@ -8,6 +8,7 @@ from ...core.parameterization import Param
 from paramz.transformations import Logexp
 from paramz.caching import Cache_this
 class EQ_ODE1(Kern):
    """
    Covariance function for first order differential equation driven by an exponentiated quadratic covariance.
@ -33,23 +34,36 @@ class EQ_ODE1(Kern):
    .. Note: see first order differential equation examples in GPy.examples.regression for some usage.
    """
-    def __init__(self, input_dim=2, output_dim=1, rank=1, W = None, lengthscale=None,  decay=None, active_dims=None, name='eq_ode1'):
+
    def __init__(
        self,
        input_dim=2,
        output_dim=1,
        rank=1,
        W=None,
        lengthscale=None,
        decay=None,
        active_dims=None,
        name="eq_ode1",
    ):
        assert input_dim == 2, "only defined for 1 input dims"
-        super(EQ_ODE1, self).__init__(input_dim=input_dim, active_dims=active_dims, name=name)
+        super(EQ_ODE1, self).__init__(
            input_dim=input_dim, active_dims=active_dims, name=name
        )
        self.rank = rank
        self.output_dim = output_dim
        if lengthscale is None:
-            lengthscale = .5 + np.random.rand(self.rank)
+            lengthscale = 0.5 + np.random.rand(self.rank)
        else:
            lengthscale = np.asarray(lengthscale)
            assert lengthscale.size in [1, self.rank], "Bad number of lengthscales"
            if lengthscale.size != self.rank:
-                lengthscale = np.ones(self.rank)*lengthscale
+                lengthscale = np.ones(self.rank) * lengthscale
        if W is None:
-            W = .5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
+            W = 0.5 * np.random.randn(self.output_dim, self.rank) / np.sqrt(self.rank)
        else:
            assert W.shape == (self.output_dim, self.rank)
@ -59,168 +73,181 @@ class EQ_ODE1(Kern):
            decay = np.asarray(decay)
            assert decay.size in [1, self.output_dim], "Bad number of decay"
            if decay.size != self.output_dim:
-                decay = np.ones(self.output_dim)*decay
+                decay = np.ones(self.output_dim) * decay
-#        if kappa is None:
+        #        if kappa is None:
-#            self.kappa = np.ones(self.output_dim)
+        #            self.kappa = np.ones(self.output_dim)
-#        else:
+        #        else:
-#            kappa = np.asarray(kappa)
+        #            kappa = np.asarray(kappa)
-#            assert kappa.size in [1, self.output_dim], "Bad number of kappa"
+        #            assert kappa.size in [1, self.output_dim], "Bad number of kappa"
-#            if decay.size != self.output_dim:
+        #            if decay.size != self.output_dim:
-#                decay = np.ones(self.output_dim)*kappa
+        #                decay = np.ones(self.output_dim)*kappa
-        #self.kappa = Param('kappa', kappa, Logexp())
+        # self.kappa = Param('kappa', kappa, Logexp())
-        #self.delay = Param('delay', delay, Logexp())
+        # self.delay = Param('delay', delay, Logexp())
-        #self.is_normalized = True
+        # self.is_normalized = True
-        #self.is_stationary = False
+        # self.is_stationary = False
-        #self.gaussian_initial = False
+        # self.gaussian_initial = False
-        self.lengthscale = Param('lengthscale', lengthscale, Logexp())
+        self.lengthscale = Param("lengthscale", lengthscale, Logexp())
-        self.decay = Param('decay', decay, Logexp())
+        self.decay = Param("decay", decay, Logexp())
-        self.W = Param('W', W)
+        self.W = Param("W", W)
        self.link_parameters(self.lengthscale, self.decay, self.W)
    @Cache_this(limit=3)
    def K(self, X, X2=None):
-        #This way is not working, indexes are lost after using k._slice_X
+        # This way is not working, indexes are lost after using k._slice_X
-        #index = np.asarray(X, dtype=np.int)
+        # index = np.asarray(X, dtype=int)
-        #index = index.reshape(index.size,)
+        # index = index.reshape(index.size,)
-        if hasattr(X, 'values'):
+        if hasattr(X, "values"):
            X = X.values
        index = np.int_(np.round(X[:, 1]))
-        index = index.reshape(index.size,)
+        index = index.reshape(
            index.size,
        )
        X_flag = index[0] >= self.output_dim
        if X2 is None:
            if X_flag:
-                #Calculate covariance function for the latent functions
+                # Calculate covariance function for the latent functions
                index -= self.output_dim
                return self._Kuu(X, index)
            else:
                raise NotImplementedError
        else:
-            #This way is not working, indexes are lost after using k._slice_X
+            # This way is not working, indexes are lost after using k._slice_X
-            #index2 = np.asarray(X2, dtype=np.int)
+            # index2 = np.asarray(X2, dtype=int)
-            #index2 = index2.reshape(index2.size,)
+            # index2 = index2.reshape(index2.size,)
-            if hasattr(X2, 'values'):
+            if hasattr(X2, "values"):
                X2 = X2.values
            index2 = np.int_(np.round(X2[:, 1]))
-            index2 = index2.reshape(index2.size,)
+            index2 = index2.reshape(
                index2.size,
            )
            X2_flag = index2[0] >= self.output_dim
-            #Calculate cross-covariance function
+            # Calculate cross-covariance function
            if not X_flag and X2_flag:
                index2 -= self.output_dim
-                return self._Kfu(X, index, X2, index2) #Kfu
+                return self._Kfu(X, index, X2, index2)  # Kfu
            elif X_flag and not X2_flag:
                index -= self.output_dim
-                return self._Kfu(X2, index2, X, index).T #Kuf
+                return self._Kfu(X2, index2, X, index).T  # Kuf
            elif X_flag and X2_flag:
                index -= self.output_dim
                index2 -= self.output_dim
-                return self._Kusu(X, index, X2, index2) #Ku_s u
+                return self._Kusu(X, index, X2, index2)  # Ku_s u
            else:
-                raise NotImplementedError #Kf_s f
+                raise NotImplementedError  # Kf_s f
-    #Calculate the covariance function for diag(Kff(X,X))
+    # Calculate the covariance function for diag(Kff(X,X))
    def Kdiag(self, X):
-        if hasattr(X, 'values'):
+        if hasattr(X, "values"):
            index = np.int_(np.round(X[:, 1].values))
        else:
            index = np.int_(np.round(X[:, 1]))
-        index = index.reshape(index.size,)
+        index = index.reshape(
            index.size,
        )
        X_flag = index[0] >= self.output_dim
-        if X_flag: #Kuudiag        
+        if X_flag:  # Kuudiag
-            return np.ones(X[:,0].shape)
+            return np.ones(X[:, 0].shape)
-        else: #Kffdiag
+        else:  # Kffdiag
            kdiag = self._Kdiag(X)
            return np.sum(kdiag, axis=1)
    def _Kdiag(self, X):
-        #This way is not working, indexes are lost after using k._slice_X
+        # This way is not working, indexes are lost after using k._slice_X
-        #index = np.asarray(X, dtype=np.int)
+        # index = np.asarray(X, dtype=int)
-        #index = index.reshape(index.size,)
+        # index = index.reshape(index.size,)
-        if hasattr(X, 'values'):
+        if hasattr(X, "values"):
            X = X.values
        index = np.int_(X[:, 1])
-        index = index.reshape(index.size,)
+        index = index.reshape(
            index.size,
        )
-        #terms that move along t
+        # terms that move along t
        t = X[:, 0].reshape(X.shape[0], 1)
-        d = np.unique(index) #Output Indexes
+        d = np.unique(index)  # Output Indexes
        B = self.decay.values[d]
        S = self.W.values[d, :]
-        #Index transformation
+        # Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]
        B = B.reshape(B.size, 1)
-        #Terms that move along q
+        # Terms that move along q
        lq = self.lengthscale.values.reshape(1, self.rank)
-        S2 = S*S
+        S2 = S * S
-        kdiag = np.empty((t.size, ))
+        kdiag = np.empty((t.size,))
-        #Dx1 terms
+        # Dx1 terms
-        c0 = (S2/B)*((.5*np.sqrt(np.pi))*lq)
+        c0 = (S2 / B) * ((0.5 * np.sqrt(np.pi)) * lq)
-        #DxQ terms
+        # DxQ terms
-        nu = lq*(B*.5)
+        nu = lq * (B * 0.5)
-        nu2 = nu*nu
+        nu2 = nu * nu
-        #Nx1 terms
+        # Nx1 terms
-        gamt = -2.*B
+        gamt = -2.0 * B
-        gamt = gamt[index]*t
+        gamt = gamt[index] * t
-        #NxQ terms
+        # NxQ terms
-        t_lq = t/lq
+        t_lq = t / lq
        # Upsilon Calculations
        # Using wofz
-        #erfnu = erf(nu)
+        # erfnu = erf(nu)
-        upm = np.exp(nu2[index, :] + lnDifErf( nu[index, :] ,t_lq+nu[index,:] ))
+        upm = np.exp(nu2[index, :] + lnDifErf(nu[index, :], t_lq + nu[index, :]))
-        upm[t[:, 0] == 0, :] = 0.
+        upm[t[:, 0] == 0, :] = 0.0
        upv = np.exp(
            nu2[index, :] + gamt + lnDifErf(-t_lq + nu[index, :], nu[index, :])
        )
        upv[t[:, 0] == 0, :] = 0.0
-        upv = np.exp(nu2[index, :] + gamt + lnDifErf( -t_lq+nu[index,:], nu[index, :] ) )
+        # Covariance calculation
-        upv[t[:, 0] == 0, :] = 0.
+        # kdiag = np.sum(c0[index, :]*(upm-upv), axis=1)
-
+        kdiag = c0[index, :] * (upm - upv)
        #Covariance calculation
        #kdiag = np.sum(c0[index, :]*(upm-upv), axis=1)
        kdiag = c0[index, :]*(upm-upv)
        return kdiag
-    def update_gradients_full(self, dL_dK, X, X2 = None):
+    def update_gradients_full(self, dL_dK, X, X2=None):
-        #index = np.asarray(X, dtype=np.int)
+        # index = np.asarray(X, dtype=int)
-        #index = index.reshape(index.size,)
+        # index = index.reshape(index.size,)
-        if hasattr(X, 'values'):
+        if hasattr(X, "values"):
            X = X.values
        self.decay.gradient = np.zeros(self.decay.shape)
        self.W.gradient = np.zeros(self.W.shape)
        self.lengthscale.gradient = np.zeros(self.lengthscale.shape)
        index = np.int_(np.round(X[:, 1]))
-        index = index.reshape(index.size,)
+        index = index.reshape(
            index.size,
        )
        X_flag = index[0] >= self.output_dim
        if X2 is None:
-            if X_flag: #Kuu or Kmm
+            if X_flag:  # Kuu or Kmm
                index -= self.output_dim
-                tmp = dL_dK*self._gkuu_lq(X, index)
+                tmp = dL_dK * self._gkuu_lq(X, index)
                for q in np.unique(index):
                    ind = np.where(index == q)
                    self.lengthscale.gradient[q] = tmp[np.ix_(ind[0], ind[0])].sum()
            else:
                raise NotImplementedError
-        else: #Kfu or Knm
+        else:  # Kfu or Knm
-            #index2 = np.asarray(X2, dtype=np.int)
+            # index2 = np.asarray(X2, dtype=int)
-            #index2 = index2.reshape(index2.size,)
+            # index2 = index2.reshape(index2.size,)
-            if hasattr(X2, 'values'):
+            if hasattr(X2, "values"):
                X2 = X2.values
            index2 = np.int_(np.round(X2[:, 1]))
-            index2 = index2.reshape(index2.size,)
+            index2 = index2.reshape(
                index2.size,
            )
            X2_flag = index2[0] >= self.output_dim
-            if not X_flag and X2_flag: #Kfu
+            if not X_flag and X2_flag:  # Kfu
                index2 -= self.output_dim
-            else: #Kuf
+            else:  # Kuf
-                dL_dK = dL_dK.T #so we obtaing dL_Kfu
+                dL_dK = dL_dK.T  # so we obtaing dL_Kfu
                indtemp = index - self.output_dim
                Xtemp = X
                X = X2
@ -228,12 +255,12 @@ class EQ_ODE1(Kern):
                index = index2
                index2 = indtemp
            glq, gSdq, gB = self._gkfu(X, index, X2, index2)
-            tmp = dL_dK*glq
+            tmp = dL_dK * glq
            for q in np.unique(index2):
                ind = np.where(index2 == q)
                self.lengthscale.gradient[q] = tmp[:, ind].sum()
-            tmpB = dL_dK*gB
+            tmpB = dL_dK * gB
-            tmp = dL_dK*gSdq
+            tmp = dL_dK * gSdq
            for d in np.unique(index):
                ind = np.where(index == d)
                self.decay.gradient[d] = tmpB[ind, :].sum()
@ -242,404 +269,459 @@ class EQ_ODE1(Kern):
                    self.W.gradient[d, q] = tmp[np.ix_(ind[0], ind2[0])].sum()
    def update_gradients_diag(self, dL_dKdiag, X):
-        #index = np.asarray(X, dtype=np.int)
+        # index = np.asarray(X, dtype=int)
-        #index = index.reshape(index.size,)
+        # index = index.reshape(index.size,)
-        if hasattr(X, 'values'):
+        if hasattr(X, "values"):
            X = X.values
        self.decay.gradient = np.zeros(self.decay.shape)
        self.W.gradient = np.zeros(self.W.shape)
        self.lengthscale.gradient = np.zeros(self.lengthscale.shape)
        index = np.int_(X[:, 1])
-        index = index.reshape(index.size,)
+        index = index.reshape(
            index.size,
        )
        glq, gS, gB = self._gkdiag(X, index)
        if dL_dKdiag.size == X.shape[0]:
            dL_dKdiag = np.reshape(dL_dKdiag, (index.size, 1))
-        tmp = dL_dKdiag*glq
+        tmp = dL_dKdiag * glq
        self.lengthscale.gradient = tmp.sum(0)
-        tmpB = dL_dKdiag*gB
+        tmpB = dL_dKdiag * gB
-        tmp = dL_dKdiag*gS
+        tmp = dL_dKdiag * gS
        for d in np.unique(index):
            ind = np.where(index == d)
            self.decay.gradient[d] = tmpB[ind, :].sum()
            self.W.gradient[d, :] = tmp[ind].sum(0)
    def gradients_X(self, dL_dK, X, X2=None):
-        #index = np.asarray(X, dtype=np.int)
+        # index = np.asarray(X, dtype=int)
-        #index = index.reshape(index.size,)
+        # index = index.reshape(index.size,)
-        if hasattr(X, 'values'):
+        if hasattr(X, "values"):
            X = X.values
        index = np.int_(np.round(X[:, 1]))
-        index = index.reshape(index.size,)
+        index = index.reshape(
            index.size,
        )
        X_flag = index[0] >= self.output_dim
-        #If input_dim == 1, use this
+        # If input_dim == 1, use this
-        #gX = np.zeros((X.shape[0], 1))
+        # gX = np.zeros((X.shape[0], 1))
-        #Cheat to allow gradient for input_dim==2
+        # Cheat to allow gradient for input_dim==2
        gX = np.zeros(X.shape)
-        if X2 is None: #Kuu or Kmm
+        if X2 is None:  # Kuu or Kmm
            if X_flag:
                index -= self.output_dim
-                gX[:, 0] = 2.*(dL_dK*self._gkuu_X(X, index)).sum(0)
+                gX[:, 0] = 2.0 * (dL_dK * self._gkuu_X(X, index)).sum(0)
                return gX
            else:
                raise NotImplementedError
-        else: #Kuf or Kmn
+        else:  # Kuf or Kmn
-            #index2 = np.asarray(X2, dtype=np.int)
+            # index2 = np.asarray(X2, dtype=int)
-            #index2 = index2.reshape(index2.size,)
+            # index2 = index2.reshape(index2.size,)
-            if hasattr(X2, 'values'):
+            if hasattr(X2, "values"):
                X2 = X2.values
            index2 = np.int_(np.round(X2[:, 1]))
-            index2 = index2.reshape(index2.size,)
+            index2 = index2.reshape(
                index2.size,
            )
            X2_flag = index2[0] >= self.output_dim
-            if X_flag and not X2_flag: #gradient of Kuf(Z, X) wrt Z
+            if X_flag and not X2_flag:  # gradient of Kuf(Z, X) wrt Z
                index -= self.output_dim
-                gX[:, 0] = (dL_dK*self._gkfu_z(X2, index2, X, index).T).sum(1)
+                gX[:, 0] = (dL_dK * self._gkfu_z(X2, index2, X, index).T).sum(1)
                return gX
            else:
                raise NotImplementedError
-    #---------------------------------------#
+    # ---------------------------------------#
    #             Helper functions          #
-    #---------------------------------------#
+    # ---------------------------------------#
-    #Evaluation of squared exponential for LFM
+    # Evaluation of squared exponential for LFM
    def _Kuu(self, X, index):
-        index = index.reshape(index.size,)
+        index = index.reshape(
-        t = X[:, 0].reshape(X.shape[0],)
+            index.size,
-        lq = self.lengthscale.values.reshape(self.rank,)
+        )
-        lq2 = lq*lq
+        t = X[:, 0].reshape(
-        #Covariance matrix initialization
+            X.shape[0],
        )
        lq = self.lengthscale.values.reshape(
            self.rank,
        )
        lq2 = lq * lq
        # Covariance matrix initialization
        kuu = np.zeros((t.size, t.size))
-        #Assign 1. to diagonal terms
+        # Assign 1. to diagonal terms
-        kuu[np.diag_indices(t.size)] = 1.
+        kuu[np.diag_indices(t.size)] = 1.0
-        #Upper triangular indices
+        # Upper triangular indices
        indtri1, indtri2 = np.triu_indices(t.size, 1)
-        #Block Diagonal indices among Upper Triangular indices
+        # Block Diagonal indices among Upper Triangular indices
        ind = np.where(index[indtri1] == index[indtri2])
        indr = indtri1[ind]
        indc = indtri2[ind]
        r = t[indr] - t[indc]
-        r2 = r*r
+        r2 = r * r
-        #Calculation of  covariance function
+        # Calculation of  covariance function
-        kuu[indr, indc] = np.exp(-r2/lq2[index[indr]])
+        kuu[indr, indc] = np.exp(-r2 / lq2[index[indr]])
-        #Completion of lower triangular part
+        # Completion of lower triangular part
        kuu[indc, indr] = kuu[indr, indc]
        return kuu
    def _Kusu(self, X, index, X2, index2):
-        index = index.reshape(index.size,)
+        index = index.reshape(
-        index2 = index2.reshape(index2.size,)
+            index.size,
-        t = X[:, 0].reshape(X.shape[0],1)
+        )
-        t2 = X2[:, 0].reshape(1,X2.shape[0])
+        index2 = index2.reshape(
-        lq = self.lengthscale.values.reshape(self.rank,)
+            index2.size,
-        #Covariance matrix initialization
+        )
        t = X[:, 0].reshape(X.shape[0], 1)
        t2 = X2[:, 0].reshape(1, X2.shape[0])
        lq = self.lengthscale.values.reshape(
            self.rank,
        )
        # Covariance matrix initialization
        kuu = np.zeros((t.size, t2.size))
        for q in range(self.rank):
            ind1 = index == q
            ind2 = index2 == q
-            r = t[ind1]/lq[q] - t2[0,ind2]/lq[q]
+            r = t[ind1] / lq[q] - t2[0, ind2] / lq[q]
-            r2 = r*r
+            r2 = r * r
-            #Calculation of  covariance function
+            # Calculation of  covariance function
            kuu[np.ix_(ind1, ind2)] = np.exp(-r2)
        return kuu
-    #Evaluation of cross-covariance function
+    # Evaluation of cross-covariance function
    def _Kfu(self, X, index, X2, index2):
-        #terms that move along t
+        # terms that move along t
        t = X[:, 0].reshape(X.shape[0], 1)
-        d = np.unique(index) #Output Indexes
+        d = np.unique(index)  # Output Indexes
        B = self.decay.values[d]
        S = self.W.values[d, :]
-        #Index transformation
+        # Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]
-        #Output related variables must be column-wise
+        # Output related variables must be column-wise
        B = B.reshape(B.size, 1)
-        #Input related variables must be row-wise
+        # Input related variables must be row-wise
        z = X2[:, 0].reshape(1, X2.shape[0])
        lq = self.lengthscale.values.reshape((1, self.rank))
        kfu = np.empty((t.size, z.size))
-        #DxQ terms
+        # DxQ terms
-        c0 = S*((.5*np.sqrt(np.pi))*lq)
+        c0 = S * ((0.5 * np.sqrt(np.pi)) * lq)
-        nu = B*(.5*lq)
+        nu = B * (0.5 * lq)
        nu2 = nu**2
-        #1xM terms
+        # 1xM terms
-        z_lq = z/lq[0, index2]
+        z_lq = z / lq[0, index2]
-        #NxM terms
+        # NxM terms
-        tz = t-z
+        tz = t - z
-        tz_lq = tz/lq[0, index2]
+        tz_lq = tz / lq[0, index2]
        # Upsilon Calculations
        fullind = np.ix_(index, index2)
-        upsi = np.exp(nu2[fullind] - B[index]*tz + lnDifErf( -tz_lq + nu[fullind], z_lq+nu[fullind]))
+        upsi = np.exp(
-        upsi[t[:, 0] == 0, :] = 0.
+            nu2[fullind]
-        #Covariance calculation
+            - B[index] * tz
-        kfu = c0[fullind]*upsi
+            + lnDifErf(-tz_lq + nu[fullind], z_lq + nu[fullind])
        )
        upsi[t[:, 0] == 0, :] = 0.0
        # Covariance calculation
        kfu = c0[fullind] * upsi
        return kfu
-    #Gradient of Kuu wrt lengthscale
+    # Gradient of Kuu wrt lengthscale
    def _gkuu_lq(self, X, index):
-        t = X[:, 0].reshape(X.shape[0],)
+        t = X[:, 0].reshape(
-        index = index.reshape(X.shape[0],)
+            X.shape[0],
-        lq = self.lengthscale.values.reshape(self.rank,)
+        )
-        lq2 = lq*lq
+        index = index.reshape(
-        #Covariance matrix initialization
+            X.shape[0],
        )
        lq = self.lengthscale.values.reshape(
            self.rank,
        )
        lq2 = lq * lq
        # Covariance matrix initialization
        glq = np.zeros((t.size, t.size))
-        #Upper triangular indices
+        # Upper triangular indices
        indtri1, indtri2 = np.triu_indices(t.size, 1)
-        #Block Diagonal indices among Upper Triangular indices
+        # Block Diagonal indices among Upper Triangular indices
        ind = np.where(index[indtri1] == index[indtri2])
        indr = indtri1[ind]
        indc = indtri2[ind]
        r = t[indr] - t[indc]
-        r2 = r*r
+        r2 = r * r
-        r2_lq2 = r2/lq2[index[indr]]
+        r2_lq2 = r2 / lq2[index[indr]]
-        #Calculation of  covariance function
+        # Calculation of  covariance function
        er2_lq2 = np.exp(-r2_lq2)
-        #Gradient wrt lq
+        # Gradient wrt lq
-        c = 2.*r2_lq2/lq[index[indr]]
+        c = 2.0 * r2_lq2 / lq[index[indr]]
-        glq[indr, indc] = er2_lq2*c
+        glq[indr, indc] = er2_lq2 * c
-        #Complete the lower triangular
+        # Complete the lower triangular
        glq[indc, indr] = glq[indr, indc]
        return glq
-    #Be careful this derivative should be transpose it
+    # Be careful this derivative should be transpose it
-    def _gkuu_X(self, X, index): #Diagonal terms are always zero
+    def _gkuu_X(self, X, index):  # Diagonal terms are always zero
-        t = X[:, 0].reshape(X.shape[0],)
+        t = X[:, 0].reshape(
-        index = index.reshape(index.size,)
+            X.shape[0],
-        lq = self.lengthscale.values.reshape(self.rank,)
+        )
-        lq2 = lq*lq
+        index = index.reshape(
-        #Covariance matrix initialization
+            index.size,
        )
        lq = self.lengthscale.values.reshape(
            self.rank,
        )
        lq2 = lq * lq
        # Covariance matrix initialization
        gt = np.zeros((t.size, t.size))
-        #Upper triangular indices
+        # Upper triangular indices
-        indtri1, indtri2 = np.triu_indices(t.size, 1) #Offset of 1 from the diagonal
+        indtri1, indtri2 = np.triu_indices(t.size, 1)  # Offset of 1 from the diagonal
-        #Block Diagonal indices among Upper Triangular indices
+        # Block Diagonal indices among Upper Triangular indices
        ind = np.where(index[indtri1] == index[indtri2])
        indr = indtri1[ind]
        indc = indtri2[ind]
        r = t[indr] - t[indc]
-        r2 = r*r
+        r2 = r * r
-        r2_lq2 = r2/(-lq2[index[indr]])
+        r2_lq2 = r2 / (-lq2[index[indr]])
-        #Calculation of  covariance function
+        # Calculation of  covariance function
        er2_lq2 = np.exp(r2_lq2)
-        #Gradient wrt t
+        # Gradient wrt t
-        c = 2.*r/lq2[index[indr]]
+        c = 2.0 * r / lq2[index[indr]]
-        gt[indr, indc] = er2_lq2*c
+        gt[indr, indc] = er2_lq2 * c
-        #Complete the lower triangular
+        # Complete the lower triangular
        gt[indc, indr] = -gt[indr, indc]
        return gt
-    #Gradients for Diagonal Kff
+    # Gradients for Diagonal Kff
    def _gkdiag(self, X, index):
-        index = index.reshape(index.size,)
+        index = index.reshape(
-        #terms that move along t
+            index.size,
        )
        # terms that move along t
        d = np.unique(index)
        B = self.decay[d].values
        S = self.W[d, :].values
-        #Index transformation
+        # Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]
-        #Output related variables must be column-wise
+        # Output related variables must be column-wise
        t = X[:, 0].reshape(X.shape[0], 1)
        B = B.reshape(B.size, 1)
-        S2 = S*S
+        S2 = S * S
-        #Input related variables must be row-wise
+        # Input related variables must be row-wise
        lq = self.lengthscale.values.reshape(1, self.rank)
        gB = np.empty((t.size,))
        glq = np.empty((t.size, lq.size))
        gS = np.empty((t.size, lq.size))
-        #Dx1 terms
+        # Dx1 terms
-        c0 = S2*lq*np.sqrt(np.pi)
+        c0 = S2 * lq * np.sqrt(np.pi)
-        #DxQ terms
+        # DxQ terms
-        nu = (.5*lq)*B
+        nu = (0.5 * lq) * B
-        nu2 = nu*nu
+        nu2 = nu * nu
-        #Nx1 terms
+        # Nx1 terms
-        gamt = -B[index]*t
+        gamt = -B[index] * t
        egamt = np.exp(gamt)
-        e2gamt = egamt*egamt
+        e2gamt = egamt * egamt
-        #NxQ terms
+        # NxQ terms
-        t_lq = t/lq
+        t_lq = t / lq
-        t2_lq2 = -t_lq*t_lq
+        t2_lq2 = -t_lq * t_lq
-        etlq2gamt = np.exp(t2_lq2 + gamt) #NXQ
+        etlq2gamt = np.exp(t2_lq2 + gamt)  # NXQ
        ##Upsilon calculations
-        #erfnu = erf(nu) #TODO: This can be improved
+        # erfnu = erf(nu) #TODO: This can be improved
-        upm = np.exp(nu2[index, :] + lnDifErf( nu[index, :], t_lq + nu[index, :]) )
+        upm = np.exp(nu2[index, :] + lnDifErf(nu[index, :], t_lq + nu[index, :]))
-        upm[t[:, 0] == 0, :] = 0.
+        upm[t[:, 0] == 0, :] = 0.0
-        upv = np.exp(nu2[index, :] + 2.*gamt + lnDifErf(-t_lq + nu[index, :], nu[index, :]) ) #egamt*upv
+        upv = np.exp(
-        upv[t[:, 0] == 0, :] = 0.
+            nu2[index, :] + 2.0 * gamt + lnDifErf(-t_lq + nu[index, :], nu[index, :])
        )  # egamt*upv
        upv[t[:, 0] == 0, :] = 0.0
-        #Gradient wrt S
+        # Gradient wrt S
-        c0_S = (S/B)*(lq*np.sqrt(np.pi))
+        c0_S = (S / B) * (lq * np.sqrt(np.pi))
-        gS = c0_S[index]*(upm - upv)
+        gS = c0_S[index] * (upm - upv)
-        #For B
+        # For B
-        CB1 = (.5*lq)**2 - .5/B**2 #DXQ
+        CB1 = (0.5 * lq) ** 2 - 0.5 / B**2  # DXQ
-        lq2_2B = (.5*lq**2)*(S2/B) #DXQ
+        lq2_2B = (0.5 * lq**2) * (S2 / B)  # DXQ
-        CB2 = 2.*etlq2gamt - e2gamt - 1. #NxQ
+        CB2 = 2.0 * etlq2gamt - e2gamt - 1.0  # NxQ
        # gradient wrt B NxZ
-        gB = c0[index, :]*(CB1[index, :]*upm - (CB1[index, :] - t/B[index])*upv) + \
+        gB = (
-        lq2_2B[index, :]*CB2
+            c0[index, :] * (CB1[index, :] * upm - (CB1[index, :] - t / B[index]) * upv)
            + lq2_2B[index, :] * CB2
        )
-        #Gradient wrt lengthscale
+        # Gradient wrt lengthscale
-        #DxQ terms
+        # DxQ terms
-        c0 = (.5*np.sqrt(np.pi))*(S2/B)*(1.+.5*(lq*B)**2)
+        c0 = (0.5 * np.sqrt(np.pi)) * (S2 / B) * (1.0 + 0.5 * (lq * B) ** 2)
-        Clq1 = S2*(lq*.5)
+        Clq1 = S2 * (lq * 0.5)
-        glq = c0[index]*(upm - upv) + Clq1[index]*CB2
+        glq = c0[index] * (upm - upv) + Clq1[index] * CB2
        return glq, gS, gB
    def _gkfu(self, X, index, Z, index2):
-        index = index.reshape(index.size,)
+        index = index.reshape(
-        #TODO: reduce memory usage
+            index.size,
-        #terms that move along t
+        )
        # TODO: reduce memory usage
        # terms that move along t
        d = np.unique(index)
        B = self.decay[d].values
        S = self.W[d, :].values
-        #Index transformation
+        # Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]
-        #t column
+        # t column
        t = X[:, 0].reshape(X.shape[0], 1)
        B = B.reshape(B.size, 1)
-        #z row
+        # z row
        z = Z[:, 0].reshape(1, Z.shape[0])
-        index2 = index2.reshape(index2.size,)
+        index2 = index2.reshape(
            index2.size,
        )
        lq = self.lengthscale.values.reshape((1, self.rank))
-        #kfu = np.empty((t.size, z.size))
+        # kfu = np.empty((t.size, z.size))
        glq = np.empty((t.size, z.size))
        gSdq = np.empty((t.size, z.size))
        gB = np.empty((t.size, z.size))
-        #Dx1 terms
+        # Dx1 terms
-        B_2 = B*.5
+        B_2 = B * 0.5
-        S_pi = S*(.5*np.sqrt(np.pi))
+        S_pi = S * (0.5 * np.sqrt(np.pi))
-        #DxQ terms
+        # DxQ terms
-        c0 = S_pi*lq #lq*Sdq*sqrt(pi)
+        c0 = S_pi * lq  # lq*Sdq*sqrt(pi)
-        nu = B*lq*.5
+        nu = B * lq * 0.5
-        nu2 = nu*nu
+        nu2 = nu * nu
-        #1xM terms
+        # 1xM terms
-        z_lq = z/lq[0, index2]
+        z_lq = z / lq[0, index2]
-        #NxM terms
+        # NxM terms
-        tz = t-z
+        tz = t - z
-        tz_lq = tz/lq[0, index2]
+        tz_lq = tz / lq[0, index2]
-        etz_lq2 = -np.exp(-tz_lq*tz_lq)
+        etz_lq2 = -np.exp(-tz_lq * tz_lq)
-        ez_lq_Bt = np.exp(-z_lq*z_lq -B[index]*t)
+        ez_lq_Bt = np.exp(-z_lq * z_lq - B[index] * t)
        # Upsilon calculations
        fullind = np.ix_(index, index2)
-        upsi = np.exp(nu2[fullind] - B[index]*tz + lnDifErf( -tz_lq + nu[fullind], z_lq+nu[fullind] ) )
+        upsi = np.exp(
-        upsi[t[:, 0] == 0., :] = 0.
+            nu2[fullind]
            - B[index] * tz
            + lnDifErf(-tz_lq + nu[fullind], z_lq + nu[fullind])
        )
        upsi[t[:, 0] == 0.0, :] = 0.0
-        #Gradient wrt S
+        # Gradient wrt S
-        #DxQ term
+        # DxQ term
-        Sa1 = lq*(.5*np.sqrt(np.pi))
+        Sa1 = lq * (0.5 * np.sqrt(np.pi))
-        gSdq = Sa1[0,index2]*upsi
+        gSdq = Sa1[0, index2] * upsi
-        #Gradient wrt lq
+        # Gradient wrt lq
-        la1 = S_pi*(1. + 2.*nu2)
+        la1 = S_pi * (1.0 + 2.0 * nu2)
-        Slq = S*lq
+        Slq = S * lq
-        uplq = etz_lq2*(tz_lq/lq[0, index2] + B_2[index])
+        uplq = etz_lq2 * (tz_lq / lq[0, index2] + B_2[index])
-        uplq += ez_lq_Bt*(-z_lq/lq[0, index2] + B_2[index])
+        uplq += ez_lq_Bt * (-z_lq / lq[0, index2] + B_2[index])
-        glq = la1[fullind]*upsi
+        glq = la1[fullind] * upsi
-        glq += Slq[fullind]*uplq
+        glq += Slq[fullind] * uplq
-        #Gradient wrt B
+        # Gradient wrt B
-        Slq = Slq*lq
+        Slq = Slq * lq
-        nulq = nu*lq
+        nulq = nu * lq
        upBd = etz_lq2 + ez_lq_Bt
-        gB = c0[fullind]*(nulq[fullind] - tz)*upsi + .5*Slq[fullind]*upBd
+        gB = c0[fullind] * (nulq[fullind] - tz) * upsi + 0.5 * Slq[fullind] * upBd
        return glq, gSdq, gB
-    #TODO: reduce memory usage
+    # TODO: reduce memory usage
-    def _gkfu_z(self, X, index, Z, index2): #Kfu(t,z)
+    def _gkfu_z(self, X, index, Z, index2):  # Kfu(t,z)
-        index = index.reshape(index.size,)
+        index = index.reshape(
-        #terms that move along t
+            index.size,
        )
        # terms that move along t
        d = np.unique(index)
        B = self.decay[d].values
        S = self.W[d, :].values
-        #Index transformation
+        # Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]
-        #t column
+        # t column
        t = X[:, 0].reshape(X.shape[0], 1)
        B = B.reshape(B.size, 1)
-        #z row
+        # z row
        z = Z[:, 0].reshape(1, Z.shape[0])
-        index2 = index2.reshape(index2.size,)
+        index2 = index2.reshape(
            index2.size,
        )
        lq = self.lengthscale.values.reshape((1, self.rank))
-        #kfu = np.empty((t.size, z.size))
+        # kfu = np.empty((t.size, z.size))
        gz = np.empty((t.size, z.size))
-        #Dx1 terms
+        # Dx1 terms
-        S_pi =S*(.5*np.sqrt(np.pi))
+        S_pi = S * (0.5 * np.sqrt(np.pi))
-        #DxQ terms
+        # DxQ terms
-        #Slq = S*lq
+        # Slq = S*lq
-        c0 = S_pi*lq #lq*Sdq*sqrt(pi)
+        c0 = S_pi * lq  # lq*Sdq*sqrt(pi)
-        nu = (.5*lq)*B
+        nu = (0.5 * lq) * B
-        nu2 = nu*nu
+        nu2 = nu * nu
-        #1xM terms
+        # 1xM terms
-        z_lq = z/lq[0, index2]
+        z_lq = z / lq[0, index2]
-        z_lq2 = -z_lq*z_lq
+        z_lq2 = -z_lq * z_lq
-        #NxQ terms
+        # NxQ terms
-        t_lq = t/lq
+        t_lq = t / lq
-        #NxM terms
+        # NxM terms
        zt_lq = z_lq - t_lq[:, index2]
-        zt_lq2 = -zt_lq*zt_lq
+        zt_lq2 = -zt_lq * zt_lq
        # Upsilon calculations
        fullind = np.ix_(index, index2)
        z2 = z_lq + nu[fullind]
        z1 = z2 - t_lq[:, index2]
-        upsi = np.exp(nu2[fullind] - B[index]*(t-z) + lnDifErf(z1,z2) )
+        upsi = np.exp(nu2[fullind] - B[index] * (t - z) + lnDifErf(z1, z2))
-        upsi[t[:, 0] == 0., :] = 0.
+        upsi[t[:, 0] == 0.0, :] = 0.0
-        #Gradient wrt z
+        # Gradient wrt z
-        za1 = c0*B
+        za1 = c0 * B
-        #za2 = S_w
+        # za2 = S_w
-        gz = za1[fullind]*upsi + S[fullind]*( np.exp(z_lq2 - B[index]*t) -np.exp(zt_lq2) )
+        gz = za1[fullind] * upsi + S[fullind] * (
            np.exp(z_lq2 - B[index] * t) - np.exp(zt_lq2)
        )
        return gz
-def lnDifErf(z1,z2):
+
-    #Z2 is always positive
+def lnDifErf(z1, z2):
    # Z2 is always positive
    logdiferf = np.zeros(z1.shape)
-    ind = np.where(z1>0.)
+    ind = np.where(z1 > 0.0)
-    ind2 = np.where(z1<=0.)
+    ind2 = np.where(z1 <= 0.0)
    if ind[0].shape > 0:
        z1i = z1[ind]
-        z12 = z1i*z1i
+        z12 = z1i * z1i
        z2i = z2[ind]
-        logdiferf[ind] = -z12 + np.log(erfcx(z1i) - erfcx(z2i)*np.exp(z12-z2i**2))
+        logdiferf[ind] = -z12 + np.log(erfcx(z1i) - erfcx(z2i) * np.exp(z12 - z2i**2))
    if ind2[0].shape > 0:
        z1i = z1[ind2]
--- a/GPy/kern/src/eq_ode2.py
+++ b/GPy/kern/src/eq_ode2.py
--- a/GPy/kern/src/kernel_slice_operations.py
+++ b/GPy/kern/src/kernel_slice_operations.py
@ -22,7 +22,14 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
        put_clean(dct, 'dK_dX', _slice_dK_dX)
        put_clean(dct, 'dK_dX2', _slice_dK_dX)
        put_clean(dct, 'dK2_dXdX2', _slice_dK2_dXdX2)
        put_clean(dct, 'dK2_dXdX', _slice_dK2_dXdX2)
        put_clean(dct, 'dK3_dXdXdX2', _slice_dK3_dXdXdX2)
        put_clean(dct, 'Kdiag', _slice_Kdiag)
        put_clean(dct, 'dK_dXdiag', _slice_dK_dXdiag)
        put_clean(dct, 'dK_dX2diag', _slice_dK_dXdiag)
        put_clean(dct, 'dK2_dXdX2diag', _slice_dK2_dXdX2diag)
        put_clean(dct, 'dK2_dXdXdiag', _slice_dK2_dXdX2diag)
        put_clean(dct, 'dK3_dXdXdX2diag', _slice_dK3_dXdXdX2diag)
        put_clean(dct, 'phi', _slice_Kdiag)
        put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
        put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
@ -35,9 +42,10 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
        put_clean(dct, 'gradients_XX_diag', _slice_gradients_XX_diag)
        put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)
-        put_clean(dct, 'dgradients_dX',_slice_partial_gradients_list_X)
+        put_clean(dct, 'dgradients', _slice_partial_gradients_list)
-        put_clean(dct, 'dgradients_dX2',_slice_partial_gradients_list_X)
+        put_clean(dct, 'dgradients_dX', _slice_partial_gradients_list_X)
-        put_clean(dct, 'dgradients2_dXdX2',_slice_partial_gradients_list_XX)
+        put_clean(dct, 'dgradients_dX2', _slice_partial_gradients_list_X)
        put_clean(dct, 'dgradients2_dXdX2', _slice_partial_gradients_list_XX)
        put_clean(dct, 'psi0', _slice_psi)
        put_clean(dct, 'psi1', _slice_psi)
@ -155,6 +163,18 @@ def _slice_dK_dX(f):
        return ret
    return wrap
 def _slice_dK_dXdiag(f):
    """
    Decorator for diagonal first-derivative methods ``f(self, X, dim, ...)``.

    The wrapped method is called with the sliced input ``s.X`` provided by
    ``_Slice_wrap`` and with ``dim`` translated through
    ``s.k._project_dim``.  When the projection yields ``None`` a zero
    vector with one entry per row of ``X`` is returned instead of calling
    ``f``.
    """
    @wraps(f)
    def wrap(self, X, dim, *a, **kw):
        with _Slice_wrap(self, X, None) as s:
            d = s.k._project_dim(dim)
            if d is None:
                # No projected index for this dimension: derivative is zero.
                ret = np.zeros(X.shape[0])
            else:
                # s.X is the sliced input, so the column index must be the
                # projected one (d), as in the other derivative wrappers.
                ret = f(self, s.X, d, *a, **kw)
        return ret
    return wrap
 def _slice_dK2_dXdX2(f):
    @wraps(f)
    def wrap(self, X, X2, dimX, dimX2, *a, **kw):
@ -168,6 +188,59 @@ def _slice_dK2_dXdX2(f):
        return ret
    return wrap
 def _slice_dK2_dXdX2diag(f):
    """
    Decorator for diagonal second-derivative methods
    ``f(self, X, dimX, dimX2, ...)``.

    Both dimensions are translated with ``_project_dim``; if either
    translation is ``None`` a zero vector of length ``X.shape[0]`` is
    returned, otherwise ``f`` is called on the sliced input with the
    projected indices.
    """
    @wraps(f)
    def wrap(self, X, dimX, dimX2, *a, **kw):
        with _Slice_wrap(self, X, None) as sliced:
            local_dims = (
                sliced.k._project_dim(dimX),
                sliced.k._project_dim(dimX2),
            )
            if any(idx is None for idx in local_dims):
                out = np.zeros(X.shape[0])
            else:
                out = f(self, sliced.X, *local_dims, *a, **kw)
        return out
    return wrap
 def _slice_dK3_dXdXdX2(f):
    """
    Decorator for third-derivative methods
    ``f(self, X, X2, dim, dimX, dimX2, ...)``.

    All three dimensions are translated with ``_project_dim``; if any
    translation is ``None`` a zero matrix of shape
    ``(X.shape[0], X2.shape[0])`` is returned, otherwise ``f`` is called on
    the sliced inputs with the projected indices.
    """
    @wraps(f)
    def wrap(self, X, X2, dim, dimX, dimX2, *a, **kw):
        with _Slice_wrap(self, X, X2) as sliced:
            local_dims = [sliced.k._project_dim(g) for g in (dim, dimX, dimX2)]
            if any(idx is None for idx in local_dims):
                out = np.zeros((X.shape[0], X2.shape[0]))
            else:
                out = f(self, sliced.X, sliced.X2, *local_dims, *a, **kw)
        return out
    return wrap
 def _slice_dK3_dXdXdX2diag(f):
    """
    Decorator for diagonal third-derivative methods
    ``f(self, X, dim, dimX, dimX2, ...)``.

    All three dimensions are translated with ``_project_dim``; if any
    translation is ``None`` a zero vector of length ``X.shape[0]`` is
    returned, otherwise ``f`` is called on the sliced input with the
    projected indices.
    """
    @wraps(f)
    def wrap(self, X, dim, dimX, dimX2, *a, **kw):
        with _Slice_wrap(self, X, None) as sliced:
            local_dims = [sliced.k._project_dim(g) for g in (dim, dimX, dimX2)]
            if any(idx is None for idx in local_dims):
                out = np.zeros(X.shape[0])
            else:
                out = f(self, sliced.X, *local_dims, *a, **kw)
        return out
    return wrap
 def _slice_partial_gradients_list(f):
    """
    Decorator for methods ``f(self, X, X2)`` that return partial gradients.

    ``_Slice_wrap`` is given ``ret_shape=(N, M)`` where ``N`` is the number
    of rows of ``X`` and ``M`` the number of rows of ``X2`` (or of ``X``
    when ``X2`` is ``None``); ``f`` is then called on the sliced inputs.
    """
    @wraps(f)
    def wrap(self, X, X2):
        n_rows = X.shape[0]
        n_cols = n_rows if X2 is None else X2.shape[0]
        with _Slice_wrap(self, X, X2, ret_shape=(n_rows, n_cols)) as sliced:
            result = f(self, sliced.X, sliced.X2)
        return result
    return wrap
 def _slice_partial_gradients_X(f):
    @wraps(f)
    def wrap(self, X, X2, dim):
--- a/GPy/kern/src/multioutput_derivative_kern.py
+++ b/GPy/kern/src/multioutput_derivative_kern.py
@ -7,20 +7,24 @@ import numpy as np
 from functools import partial
 class KernWrapper(Kern):
-    def __init__(self, fk, fug, fg, base_kern):
+    def __init__(self, fk, fdk, fug, fg, base_kern):
        self.fk = fk
        self.fdk = fdk
        self.fug = fug
        self.fg = fg
        self.base_kern = base_kern
-        super(KernWrapper, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='KernWrapper',useGPU=False)
+        super(KernWrapper, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='KernWrapper', useGPU=False)
    def K(self, X, X2=None):
-        return self.fk(X,X2=X2)
+        return self.fk(X, X2=X2)
-    def update_gradients_full(self,dL_dK, X, X2=None):
+    def dK_dX(self, X, X2, dimX):
        return self.fdk(X, X2, dimX)
    def update_gradients_full(self, dL_dK, X, X2=None):
        return self.fug(dL_dK, X, X2=X2)
-    def gradients_X(self,dL_dK, X, X2=None):
+    def gradients_X(self, dL_dK, X, X2=None):
        return self.fg(dL_dK, X, X2=X2)
    @property
@ -57,24 +61,42 @@ class MultioutputDerivativeKern(MultioutputKern):
        #build covariance structure
        covariance = [[None for i in range(nl)] for j in range(nl)]
        linked = []
-        for i in range(0,nl):
+        for i in range(0, nl):
-            unique=True
+            unique = True
-            for j in range(0,nl):
+            for j in range(0, nl):
-                if i==j or (kernels[i] is kernels[j]):
+                if (i == j) or (kernels[i] is kernels[j]):
                    kern = kernels[i]
-                    if i>j:
+                    if i > j:
-                        unique=False
+                        unique = False
                elif cross_covariances.get((i,j)) is not None: #cross covariance is given
                    kern = cross_covariances.get((i,j))
-                elif kernels[i].name == 'DiffKern' and kernels[i].base_kern == kernels[j]: # one is derivative of other
+                elif (kernels[i].name == 'DiffKern') and (kernels[i].base_kern == kernels[j]): # one is derivative of other
-                    kern = KernWrapper(kernels[i].dK_dX_wrap,kernels[i].update_gradients_dK_dX,kernels[i].gradients_X, kernels[j])
+                    kern = KernWrapper(
                        kernels[i].dK_dX_wrap,
                        kernels[i].dK2_dXdX_wrap,
                        kernels[i].update_gradients_dK_dX,
                        kernels[i].gradients_X,
                        kernels[j]
                        )
                    unique=False
-                elif kernels[j].name == 'DiffKern' and kernels[j].base_kern == kernels[i]: # one is derivative of other
+                elif (kernels[j].name == 'DiffKern') and (kernels[j].base_kern == kernels[i]): # one is derivative of other
-                    kern = KernWrapper(kernels[j].dK_dX2_wrap,kernels[j].update_gradients_dK_dX2,kernels[j].gradients_X2, kernels[i])
+                    kern = KernWrapper(
-                elif kernels[i].name == 'DiffKern' and kernels[j].name == 'DiffKern' and kernels[i].base_kern == kernels[j].base_kern: #both are partial derivatives
+                        kernels[j].dK_dX2_wrap,
-                    kern = KernWrapper(partial(kernels[i].K, dimX2=kernels[j].dimension), partial(kernels[i].update_gradients_full, dimX2=kernels[j].dimension),None, kernels[i].base_kern)
+                        kernels[j].dK2_dXdX2_wrap,
-                    if i>j:
+                        kernels[j].update_gradients_dK_dX2,
-                        unique=False
+                        kernels[j].gradients_X2,
                        kernels[i]
                        )
                elif (kernels[i].name == 'DiffKern') and (kernels[j].name == 'DiffKern') and (kernels[i].base_kern == kernels[j].base_kern): #both are partial derivatives
                    kern = KernWrapper(
                        partial(kernels[i].K, dimX2=kernels[j].dimension),
                        partial(kernels[i].dK_dX, dimX2=kernels[j].dimension),
                        partial(kernels[i].update_gradients_full, dimX2=kernels[j].dimension),
                        None,
                        kernels[i].base_kern
                        )
                    if i > j:
                        unique = False
                else:
                    kern = ZeroKern()
                covariance[i][j] = kern
--- a/GPy/kern/src/multioutput_kern.py
+++ b/GPy/kern/src/multioutput_kern.py
@ -85,21 +85,63 @@ class MultioutputKern(CombinationKernel):
        self.link_parameters(*[kernels[i] for i in linked])
    @Cache_this(limit=3, ignore_args=())
-    def K(self, X ,X2=None):
+    def K(self, X, X2=None):
        if X2 is None:
            X2 = X
        slices = index_to_slices(X[:,self.index_dim])
        slices2 = index_to_slices(X2[:,self.index_dim])
        target =  np.zeros((X.shape[0], X2.shape[0]))
-        [[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j].K(X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))]  
+        for j in range(len(slices2)):
            for i in range(len(slices)):
                for l in range(len(slices2[j])):
                    for k in range(len(slices[i])):
                        cov_K = self.covariance[i][j].K(X[slices[i][k],:], X2[slices2[j][l],:])
                        target.__setitem__((slices[i][k], slices2[j][l]), cov_K)
        return target
    @Cache_this(limit=3, ignore_args=())
-    def Kdiag(self,X):
+    def Kdiag(self, X):
        slices = index_to_slices(X[:,self.index_dim])
        kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
        target = np.zeros(X.shape[0])
-        [[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)]
+        for kern, slices_i in zip(kerns, slices):
            for s in slices_i:
                np.copyto(target[s], kern.Kdiag(X[s]))
        return target
    @Cache_this(limit=3, ignore_args=())
    def dK_dX(self, X, X2, dimX):
        """
        Derivative of K with respect to dimension dimX of set X.

        Rows of X and X2 are grouped by ``index_to_slices`` applied to the
        ``self.index_dim`` column (presumably the output index of each row).
        For every pair of groups the matching entry of ``self.covariance``
        fills the corresponding block of the result.  ``X2=None`` means
        ``X2 = X``.
        """
        if X2 is None:
            X2 = X
        row_groups = index_to_slices(X[:,self.index_dim])
        col_groups = index_to_slices(X2[:,self.index_dim])
        target = np.zeros((X.shape[0], X2.shape[0]))
        for j, col_group in enumerate(col_groups):
            for i, row_group in enumerate(row_groups):
                for cols in col_group:
                    for rows in row_group:
                        block = self.covariance[i][j].dK_dX(X[rows,:], X2[cols,:], dimX)
                        target[rows, cols] = block
        return target
    @Cache_this(limit=3, ignore_args=())
    def dK_dXdiag(self, X, dimX):
        """
        Diagonal of the derivative of K with respect to dimension dimX of
        set X.

        Rows are grouped by ``index_to_slices`` applied to the
        ``self.index_dim`` column; each group is filled from the
        ``dK_dXdiag`` of its kernel (the same ``self.kern`` for every group
        when ``self.single_kern`` is set).
        """
        groups = index_to_slices(X[:,self.index_dim])
        if self.single_kern:
            kernels = itertools.repeat(self.kern)
        else:
            kernels = self.kern
        diag = np.zeros(X.shape[0])
        for kernel, group in zip(kernels, groups):
            for block in group:
                np.copyto(diag[block], kernel.dK_dXdiag(X[block], dimX))
        return diag
    def _update_gradients_full_wrapper(self, kern, dL_dK, X, X2):
@ -115,19 +157,35 @@ class MultioutputKern(CombinationKernel):
    def reset_gradients(self):
        for kern in self.kern: kern.reset_gradients()
-    def update_gradients_full(self,dL_dK, X, X2=None):
+    def update_gradients_full(self, dL_dK, X, X2=None):
-        self.reset_gradients()
+        if X2 is None:
            X2 = X
        slices = index_to_slices(X[:,self.index_dim])
-        if X2 is not None:
+        slices2 = index_to_slices(X2[:,self.index_dim])
-            slices2 = index_to_slices(X2[:,self.index_dim])
+
-            [[[[ self._update_gradients_full_wrapper(self.covariance[i][j], dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
+        self.reset_gradients()
-        else:
+        for j in range(len(slices2)):
-            [[[[ self._update_gradients_full_wrapper(self.covariance[i][j], dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], X[slices[j][l],:]) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
+            for i in range(len(slices)):
                for l in range(len(slices2[j])):
                    for k in range(len(slices[i])):
                        self._update_gradients_full_wrapper(
                            self.covariance[i][j],
                            dL_dK[slices[i][k],slices2[j][l]],
                            X[slices[i][k],:],
                            X2[slices2[j][l],:]
                            )
    def update_gradients_diag(self, dL_dKdiag, X):
        self.reset_gradients()
        slices = index_to_slices(X[:,self.index_dim])
-        [[ self._update_gradients_diag_wrapper(self.covariance[i][i], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))]
+
        self.reset_gradients()
        for i in range(len(slices)):
            for k in range(len(slices[i])):
                self._update_gradients_diag_wrapper(
                    self.covariance[i][i],
                    dL_dKdiag[slices[i][k]],
                    X[slices[i][k],:]
                    )
    def gradients_X(self,dL_dK, X, X2=None):
        slices = index_to_slices(X[:,self.index_dim])
--- a/GPy/kern/src/prod.py
+++ b/GPy/kern/src/prod.py
@ -70,6 +70,310 @@ class Prod(CombinationKernel):
            which_parts = self.parts
        return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))
    def reset_gradients(self):
        """Call ``reset_gradients`` on every part of this kernel."""
        for kernel in self.parts:
            kernel.reset_gradients()
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK_dX(self, X, X2, dimX, which_parts=None):
        """
        Derivative of the product kernel with respect to dimension dimX of
        set X.

        Product rule: each part is differentiated in turn (``dK_dX``) and
        weighted by the product of ``K`` of all the other parts; the terms
        are summed.  ``which_parts`` defaults to ``self.parts``.
        """
        parts = self.parts if which_parts is None else which_parts
        total = np.zeros((X.shape[0], X2.shape[0]))
        for others in itertools.combinations(parts, len(parts) - 1):
            if others:
                weight = reduce(np.multiply, [p.K(X, X2) for p in others])
            else:
                # Single part: there is nothing to multiply with.
                weight = np.ones(total.shape)
            # The one part left out of `others` is the one being differentiated.
            differentiated = list(set(parts) - set(others))[0]
            total += weight*differentiated.dK_dX(X, X2, dimX)
        return total
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK_dXdiag(self, X, dimX, which_parts=None):
        """
        Diagonal of the derivative of the product kernel with respect to
        dimension dimX of set X.

        Each part is differentiated in turn (``dK_dXdiag``) and weighted by
        the product of ``Kdiag`` of all the other parts; the terms are
        summed.  ``which_parts`` defaults to ``self.parts``.
        """
        parts = self.parts if which_parts is None else which_parts
        total = np.zeros(X.shape[0])
        for others in itertools.combinations(parts, len(parts) - 1):
            if others:
                weight = reduce(np.multiply, [p.Kdiag(X) for p in others])
            else:
                # Single part: there is nothing to multiply with.
                weight = np.ones(total.shape)
            differentiated = list(set(parts) - set(others))[0]
            total += weight*differentiated.dK_dXdiag(X, dimX)
        return total
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK_dX2(self, X, X2, dimX2, which_parts=None):
        """
        Derivative of the product kernel with respect to dimension dimX2 of
        set X2.

        Each part is differentiated in turn (``dK_dX2``) and weighted by
        the product of ``K`` of all the other parts; the terms are summed.
        ``which_parts`` defaults to ``self.parts``.
        """
        parts = self.parts if which_parts is None else which_parts
        total = np.zeros((X.shape[0], X2.shape[0]))
        for others in itertools.combinations(parts, len(parts) - 1):
            if others:
                weight = reduce(np.multiply, [p.K(X, X2) for p in others])
            else:
                # Single part: there is nothing to multiply with.
                weight = np.ones(total.shape)
            differentiated = list(set(parts) - set(others))[0]
            total += weight*differentiated.dK_dX2(X, X2, dimX2)
        return total
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK2_dXdX2(self, X, X2, dimX, dimX2, which_parts=None):
        """
        Second derivative of the product kernel with respect to dimension
        dimX of set X and dimension dimX2 of set X2.

        Two kinds of product-rule terms are summed:
          * one part carries both derivatives (``dK2_dXdX2``), times ``K``
            of all other parts;
          * one part carries the X derivative and a different part the X2
            derivative, times ``K`` of the remaining parts.
        ``which_parts`` defaults to ``self.parts``.
        """
        parts = self.parts if which_parts is None else which_parts
        total = np.zeros((X.shape[0], X2.shape[0]))
        for others in itertools.combinations(parts, len(parts) - 1):
            if others:
                weight = reduce(np.multiply, [p.K(X, X2) for p in others])
            else:
                weight = np.ones(total.shape)
            first = list(set(parts) - set(others))[0]
            total += weight*first.dK2_dXdX2(X, X2, dimX, dimX2)
            if len(parts) <= 1:
                # No second part available for the cross terms.
                continue
            for rest in itertools.combinations(others, len(others) - 1):
                if rest:
                    weight = reduce(np.multiply, [p.K(X, X2) for p in rest])
                else:
                    weight = np.ones(total.shape)
                second = list(set(others) - set(rest))[0]
                total += weight*first.dK_dX(X, X2, dimX)*second.dK_dX2(X, X2, dimX2)
        return total
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK2_dXdX2diag(self, X, dimX, dimX2, which_parts=None):
        """
        Compute the second derivative of K with respect to:
            dimension dimX of set X, and
            dimension dimX2 of set X2.
        Returns only diagonal elements.

        Two kinds of product-rule terms are summed:
          * one part carries both derivatives (``dK2_dXdX2diag``), times
            ``Kdiag`` of all other parts;
          * one part carries the X derivative (``dK_dXdiag``) and a
            different part the X2 derivative (``dK_dX2diag``), times
            ``Kdiag`` of the remaining parts.
        ``which_parts`` defaults to ``self.parts``.
        """
        if which_parts is None:
            which_parts = self.parts
        prod_sum = np.zeros(X.shape[0])
        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
            if len(combination1) > 0:
                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination1])
            else:
                prod = np.ones(prod_sum.shape)
            to_update1 = list(set(which_parts) - set(combination1))[0]
            prod_sum += prod*to_update1.dK2_dXdX2diag(X, dimX, dimX2)
            if len(which_parts) > 1:
                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
                    if len(combination2) > 0:
                        prod = reduce(np.multiply, [p.Kdiag(X) for p in combination2])
                    else:
                        prod = np.ones(prod_sum.shape)
                    to_update2 = list(set(combination1) - set(combination2))[0]
                    # The X2 derivative must use dimX2 (not dimX), matching
                    # the cross term of the non-diagonal dK2_dXdX2.
                    prod_sum += prod*to_update1.dK_dXdiag(X, dimX)*to_update2.dK_dX2diag(X, dimX2)
        return prod_sum
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK2_dXdX(self, X, X2, dimX_0, dimX_1, which_parts=None):
        """
        Second derivative of the product kernel with respect to dimensions
        dimX_0 and dimX_1, both of set X.

        Two kinds of product-rule terms are summed:
          * one part carries both derivatives (``dK2_dXdX``), times ``K``
            of all other parts;
          * one part carries the dimX_0 derivative and a different part the
            dimX_1 derivative, times ``K`` of the remaining parts.
        ``which_parts`` defaults to ``self.parts``.
        """
        parts = self.parts if which_parts is None else which_parts
        total = np.zeros((X.shape[0], X2.shape[0]))
        for others in itertools.combinations(parts, len(parts) - 1):
            if others:
                weight = reduce(np.multiply, [p.K(X, X2) for p in others])
            else:
                weight = np.ones(total.shape)
            first = list(set(parts) - set(others))[0]
            total += weight*first.dK2_dXdX(X, X2, dimX_0, dimX_1)
            if len(parts) <= 1:
                # No second part available for the cross terms.
                continue
            for rest in itertools.combinations(others, len(others) - 1):
                if rest:
                    weight = reduce(np.multiply, [p.K(X, X2) for p in rest])
                else:
                    weight = np.ones(total.shape)
                second = list(set(others) - set(rest))[0]
                total += weight*first.dK_dX(X, X2, dimX_0)*second.dK_dX(X, X2, dimX_1)
        return total
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK3_dXdXdX2(self, X, X2, dimX_0, dimX_1, dimX2, which_parts=None):
        """
        Compute the third derivative of K with respect to:
            dimension dimX_0 of set X,
            dimension dimX_1 of set X, and
            dimension dimX2 of set X2.

        The result is accumulated as a product-rule sum over the parts in
        ``which_parts`` (default ``self.parts``), in three nested levels:
        all three derivatives on one part; two on one part and one on
        another; one derivative on each of three different parts.  Each
        term is weighted by the product of ``K`` of the parts that are not
        differentiated.
        """
        if which_parts is None:
            which_parts = self.parts
        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
        # Level 1: to_update1 is the single part left out of combination1.
        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
            if len(combination1) > 0:
                prod = reduce(np.multiply, [p.K(X, X2) for p in combination1])
            else:
                prod = np.ones(prod_sum.shape)
            to_update1 = list(set(which_parts) - set(combination1))[0]
            # All three derivatives fall on to_update1.
            prod_sum += prod*to_update1.dK3_dXdXdX2(X, X2, dimX_0, dimX_1, dimX2)
            if len(which_parts) > 1:
                # Level 2: to_update2 is a second, different part.
                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
                    # `prod` is rebound to the product over the parts that
                    # are neither to_update1 nor to_update2.
                    if len(combination2) > 0:
                        prod = reduce(np.multiply, [p.K(X, X2) for p in combination2])
                    else:
                        prod = np.ones(prod_sum.shape)
                    to_update2 = list(set(combination1) - set(combination2))[0]
                    # Two derivatives on one part, the third on the other.
                    prod_sum += prod*to_update1.dK2_dXdX2(X, X2, dimX_0, dimX2)*to_update2.dK_dX(X, X2, dimX_1)
                    prod_sum += prod*to_update1.dK2_dXdX(X, X2, dimX_0, dimX_1)*to_update2.dK_dX2(X, X2, dimX2)
                    prod_sum += prod*to_update1.dK_dX(X, X2, dimX_0)*to_update2.dK2_dXdX2(X, X2, dimX_1, dimX2)
                    if len(which_parts) > 2:
                        # Level 3: one derivative on each of three parts.
                        for combination3 in itertools.combinations(combination2, len(combination2) - 1):
                            if len(combination3) > 0:
                                prod = reduce(np.multiply, [p.K(X, X2) for p in combination3])
                            else:
                                prod = np.ones(prod_sum.shape)
                            to_update3 = list(set(combination2) - set(combination3))[0]
                            prod_sum += prod*to_update1.dK_dX(X, X2, dimX_0)*to_update2.dK_dX2(X, X2, dimX2)*to_update3.dK_dX(X, X2, dimX_1)
        return prod_sum
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK3_dXdXdX2diag(self, X, dimX_0, dimX_1, dimX2, which_parts=None):
        """
        Compute the third derivative of K with respect to:
            dimension dimX_0 of set X,
            dimension dimX_1 of set X, and
            dimension dimX2 of set X2.
        Returns only diagonal elements of the covariance matrix.

        The result is accumulated as a product-rule sum over the parts in
        ``which_parts`` (default ``self.parts``), in three nested levels:
        all three derivatives on one part; two on one part and one on
        another; one derivative on each of three different parts.  Each
        term is weighted by the product of ``Kdiag`` of the parts that are
        not differentiated.
        """
        if which_parts is None:
            which_parts = self.parts
        prod_sum = np.zeros(X.shape[0])
        # Level 1: to_update1 is the single part left out of combination1.
        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
            if len(combination1) > 0:
                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination1])
            else:
                prod = np.ones(prod_sum.shape)
            to_update1 = list(set(which_parts) - set(combination1))[0]
            # All three derivatives fall on to_update1.
            prod_sum += prod*to_update1.dK3_dXdXdX2diag(X, dimX_0, dimX_1, dimX2)
            if len(which_parts) > 1:
                # Level 2: to_update2 is a second, different part.
                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
                    # `prod` is rebound to the product over the parts that
                    # are neither to_update1 nor to_update2.
                    if len(combination2) > 0:
                        prod = reduce(np.multiply, [p.Kdiag(X) for p in combination2])
                    else:
                        prod = np.ones(prod_sum.shape)
                    to_update2 = list(set(combination1) - set(combination2))[0]
                    # Two derivatives on one part, the third on the other.
                    prod_sum += prod*to_update1.dK2_dXdX2diag(X, dimX_0, dimX2)*to_update2.dK_dXdiag(X, dimX_1)
                    prod_sum += prod*to_update1.dK2_dXdXdiag(X, dimX_0, dimX_1)*to_update2.dK_dX2diag(X, dimX2)
                    prod_sum += prod*to_update1.dK_dXdiag(X, dimX_0)*to_update2.dK2_dXdX2diag(X, dimX_1, dimX2)
                    if len(which_parts) > 2:
                        # Level 3: one derivative on each of three parts.
                        for combination3 in itertools.combinations(combination2, len(combination2) - 1):
                            if len(combination3) > 0:
                                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination3])
                            else:
                                prod = np.ones(prod_sum.shape)
                            to_update3 = list(set(combination2) - set(combination3))[0]
                            prod_sum += prod*to_update1.dK_dXdiag(X, dimX_0)*to_update2.dK_dX2diag(X, dimX2)*to_update3.dK_dXdiag(X, dimX_1)
        return prod_sum
    def update_gradients_direct(self, *args):
        """
        Forward one group of gradient arguments to each part.

        ``args`` is paired positionally with ``self.parts``; every group is
        unpacked into that part's ``update_gradients_direct``.
        """
        for grads, part in zip(args, self.parts):
            part.update_gradients_direct(*grads)
    def dgradients_dX(self, X, X2, dimX, parts=None):
        """
        Hyperparameter gradients of dK_dX, the derivative of K with respect
        to dimension dimX of set X.

        Returns one list per part (``parts`` defaults to ``self.parts``).
        For each entry of the part's ``dgradients`` / ``dgradients_dX`` the
        product rule against the remaining parts is applied:
        ``g*K_dx + g_dx*K``, where ``K`` and ``K_dx`` are this kernel's
        ``K`` and ``dK_dX`` restricted to the other parts.
        """
        if parts is None:
            parts = self.parts
        per_part = []
        for part in parts:
            others = [p for p in parts if p is not part]
            if others:
                K_rest = self.K(X, X2, which_parts=others)
                K_rest_dx = self.dK_dX(X, X2, dimX, which_parts=others)
            else:
                # No other parts: the rest of the product is the constant 1.
                K_rest = np.ones((X.shape[0], X2.shape[0]))
                K_rest_dx = np.zeros((X.shape[0], X2.shape[0]))
            grads = part.dgradients(X, X2)
            grads_dx = part.dgradients_dX(X, X2, dimX)
            per_part.append(
                [(g_i*K_rest_dx + g_dx_i*K_rest) for (g_i, g_dx_i) in zip(grads, grads_dx)]
            )
        return per_part
    def dgradients_dX2(self, X, X2, dimX2, parts=None):
        """
        Hyperparameter gradients of dK_dX2, the derivative of K with respect
        to dimension dimX2 of set X2.

        Returns one list per part (``parts`` defaults to ``self.parts``).
        For each entry of the part's ``dgradients`` / ``dgradients_dX2`` the
        product rule against the remaining parts is applied:
        ``g*K_dx2 + g_dx2*K``, where ``K`` and ``K_dx2`` are this kernel's
        ``K`` and ``dK_dX2`` restricted to the other parts.
        """
        if parts is None:
            parts = self.parts
        per_part = []
        for part in parts:
            others = [p for p in parts if p is not part]
            if others:
                K_rest = self.K(X, X2, which_parts=others)
                K_rest_dx2 = self.dK_dX2(X, X2, dimX2, which_parts=others)
            else:
                # No other parts: the rest of the product is the constant 1.
                K_rest = np.ones((X.shape[0], X2.shape[0]))
                K_rest_dx2 = np.zeros((X.shape[0], X2.shape[0]))
            grads = part.dgradients(X, X2)
            grads_dx2 = part.dgradients_dX2(X, X2, dimX2)
            per_part.append(
                [(g_i*K_rest_dx2 + g_dx2_i*K_rest) for (g_i, g_dx2_i) in zip(grads, grads_dx2)]
            )
        return per_part
    def dgradients2_dXdX2(self, X, X2, dimX, dimX2, parts=None):
        """
        Compute the hyperparameter gradients of:
            the second derivative of K with respect to:
                dimension dimX of set X, and
                dimension dimX2 of set X2
            ("dK2_dXdX2").

        Returns one list per part (``parts`` defaults to ``self.parts``).
        For each entry of the part's gradient lists the product rule against
        the remaining parts is applied:
        ``g*K_dxdx2 + g_dx*K_dx2 + g_dx2*K_dx + g_dxdx2*K``.
        """
        if parts is None:
            parts = self.parts
        gradients = []
        for part in parts:
            neq_parts = [p for p in parts if p is not part]
            if len(neq_parts) > 0:
                K = self.K(X, X2, which_parts=neq_parts)
                K_dx = self.dK_dX(X, X2, dimX, which_parts=neq_parts)
                K_dx2 = self.dK_dX2(X, X2, dimX2, which_parts=neq_parts)
                K_dxdx2 = self.dK2_dXdX2(X, X2, dimX, dimX2, which_parts=neq_parts)
            else:
                # Single part: the rest of the product is the constant 1, so
                # all of its derivatives vanish (same convention as
                # dgradients_dX / dgradients_dX2).
                shape = (X.shape[0], X2.shape[0])
                K = np.ones(shape)
                K_dx = np.zeros(shape)
                K_dx2 = np.zeros(shape)
                K_dxdx2 = np.zeros(shape)
            g = part.dgradients(X, X2)
            g_dx = part.dgradients_dX(X, X2, dimX)
            g_dx2 = part.dgradients_dX2(X, X2, dimX2)
            g_dxdx2 = part.dgradients2_dXdX2(X, X2, dimX, dimX2)
            gradients += [[(g_i*K_dxdx2 + g_dx_i*K_dx2 + g_dx2_i*K_dx + g_dxdx2_i*K) for (g_i, g_dx_i, g_dx2_i, g_dxdx2_i) in zip(g, g_dx, g_dx2, g_dxdx2)]]
        return gradients
    def update_gradients_full(self, dL_dK, X, X2=None):
        if len(self.parts)==2:
            self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
--- a/GPy/kern/src/rbf.py
+++ b/GPy/kern/src/rbf.py
@ -53,24 +53,126 @@ class RBF(Stationary):
    @Cache_this(limit=3, ignore_args=())
    def dK_dX(self, X, X2, dimX):
-        r = self._scaled_dist(X, X2)
+        """
-        K = self.K_of_r(r)
+        Compute the derivative of K with respect to:
-        dist = X[:,None,dimX]-X2[None,:,dimX]
+            dimension dimX of set X.
-        lengthscale2inv = (np.ones((X.shape[1]))/(self.lengthscale**2))[dimX]
+        """
-        return -1.*K*dist*lengthscale2inv
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
        dist = X[:,None,dimX] - X2[None,:,dimX]
        return -dist*(lengthscaleinv**2)*self._clean_K(X, X2)
    @Cache_this(limit=3, ignore_args=())
    def dK_dXdiag(self, X, dimX):
        """
        Diagonal of the derivative of K with respect to dimension dimX of
        set X: a zero vector with one entry per row of X.
        """
        n_points = X.shape[0]
        return np.zeros(n_points)
    @Cache_this(limit=3, ignore_args=())
    def dK_dX2(self, X, X2, dimX2):
-        return -self.dK_dX(X,X2, dimX2)
+        """
        Compute the derivative of K with respect to:
            dimension dimX2 of set X2.
        """
        return -self._clean_dK_dX(X, X2, dimX2)
    @Cache_this(limit=3, ignore_args=())
    def dK_dX2diag(self, X, dimX2):
        """
        Diagonal of the derivative of K with respect to dimension dimX2 of
        set X2: a zero vector with one entry per row of X.
        """
        n_points = X.shape[0]
        return np.zeros(n_points)
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX2(self, X, X2, dimX, dimX2):
-        r = self._scaled_dist(X, X2)
+        """
-        K = self.K_of_r(r)
+        Compute the second derivative of K with respect to:
-        if X2 is None:
+            dimension dimX of set X, and
-            X2=X
+            dimension dimX2 of set X2.
-        dist = X[:,None,:]-X2[None,:,:]
+        """
-        lengthscale2inv = np.ones((X.shape[1]))/(self.lengthscale**2)
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
-        return -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX]*lengthscale2inv[dimX2] + (dimX==dimX2)*K*lengthscale2inv[dimX]
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
        term = dist[dimX]*(lengthscaleinv[dimX]**2)
        term *= dist[dimX2]*(lengthscaleinv[dimX2]**2)
        if dimX == dimX2:
            term -= (lengthscaleinv[dimX]**2)
        return -term*self._clean_K(X, X2)
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX2diag(self, X, dimX, dimX2):
        """
        Diagonal of the second derivative of K with respect to dimension
        dimX of set X and dimension dimX2 of set X2.

        Zero when the two dimensions differ; otherwise the constant
        ``variance / lengthscale[dimX]**2`` for every row of X.
        """
        if dimX != dimX2:
            return np.zeros(X.shape[0])
        inv_lengthscale = np.ones((X.shape[1]))/(self.lengthscale)
        return np.ones(X.shape[0])*(inv_lengthscale[dimX]**2)*self.variance
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX(self, X, X2, dimX_0, dimX_1):
        """
        Second derivative of K with respect to dimensions dimX_0 and dimX_1,
        both of set X.  Computed as the negative of
        ``_clean_dK2_dXdX2(X, X2, dimX_0, dimX_1)``.
        """
        mixed = self._clean_dK2_dXdX2(X, X2, dimX_0, dimX_1)
        return -mixed
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdXdiag(self, X, dimX_0, dimX_1):
        """
        Diagonal of the second derivative of K with respect to dimensions
        dimX_0 and dimX_1, both of set X.  Computed as the negative of
        ``_clean_dK2_dXdX2diag(X, dimX_0, dimX_1)``.
        """
        mixed_diag = self._clean_dK2_dXdX2diag(X, dimX_0, dimX_1)
        return -mixed_diag
    @Cache_this(limit=3, ignore_args=())
    def dK3_dXdXdX2(self, X, X2, dimX_0, dimX_1, dimX2):
        """
        Third derivative of K with respect to dimensions dimX_0 and dimX_1
        of set X and dimension dimX2 of set X2.

        With ``a_d = (X_d - X2_d) / lengthscale_d**2`` the factor applied to
        ``_clean_K(X, X2)`` is ``a_0*a_1*a_2`` minus, for every pair of
        equal dimensions, the remaining ``a`` divided by the squared
        lengthscale of the repeated dimension.
        """
        inv_ell = (np.ones(X.shape[1])/(self.lengthscale))
        diffs = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)

        def scaled(d):
            # Pairwise difference along dimension d over lengthscale_d**2.
            return diffs[d]*(inv_ell[d]**2)

        factor = scaled(dimX_0)
        factor *= scaled(dimX_1)
        factor *= scaled(dimX2)
        if dimX_0 == dimX_1:
            factor -= scaled(dimX2)*(inv_ell[dimX_0]**2)
        if dimX_0 == dimX2:
            factor -= scaled(dimX_1)*(inv_ell[dimX_0]**2)
        if dimX_1 == dimX2:
            factor -= scaled(dimX_0)*(inv_ell[dimX_1]**2)
        return factor*self._clean_K(X, X2)
    @Cache_this(limit=3, ignore_args=())
    def dK3_dXdXdX2diag(self, X, dimX_0, dimX_1, dimX2):
        """
        Diagonal of the third derivative of K with respect to dimensions
        dimX_0 and dimX_1 of set X and dimension dimX2 of set X2: a zero
        vector with one entry per row of X.
        """
        n_points = X.shape[0]
        return np.zeros(n_points)
    def dK_dr(self, r):
        """Return ``-r * K_of_r(r)``."""
        neg_r = -r
        return neg_r*self.K_of_r(r)
@ -82,71 +184,130 @@ class RBF(Stationary):
        return -self.variance # as the diagonal of r is always filled with zeros
    @Cache_this(limit=3, ignore_args=())
-    def dK_dvariance(self,X,X2):
+    def dK_dvariance(self, X, X2):
-        return self.K(X,X2)/self.variance
+        """
        Compute the derivative of K with respect to variance.
        """
        return self._clean_K(X, X2)/self.variance
    @Cache_this(limit=3, ignore_args=())
-    def dK2_dvariancedX(self, X, X2, dim):
+    def dK_dlengthscale(self, X, X2):
-        return self.dK_dX(X,X2, dim)/self.variance
+        """
        Compute the derivative(s) of K with respect to lengthscale(s).
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
        K = self._clean_K(X, X2)
        if self.ARD:
            g = []
            for diml in range(self.input_dim):
                g += [(dist[diml]**2)*(lengthscaleinv[diml]**3)*K]
        else:
            g = (lengthscaleinv[0]**3)*np.sum(dist**2, axis=0)*K
        return g
    @Cache_this(limit=3, ignore_args=())
-    def dK2_dvariancedX2(self, X, X2, dim):
+    def dK2_dvariancedX(self, X, X2, dimX):
-        return self.dK_dX2(X,X2, dim)/self.variance
+        """
        Compute the second derivative of K with respect to:
            variance, and
            dimension dimX of set X.
        """
        return self._clean_dK_dX(X, X2, dimX)/self.variance
    @Cache_this(limit=3, ignore_args=())
-    def dK3_dvariancedXdX2(self, X, X2, dim, dimX2):
+    def dK2_dvariancedX2(self, X, X2, dimX2):
-        return self.dK2_dXdX2(X, X2, dim, dimX2)/self.variance
+        """
        Compute the second derivative of K with respect to:
            variance, and
            dimension dimX2 of set X2.
        """
        return -self.dK2_dvariancedX(X, X2, dimX2)
    @Cache_this(limit=3, ignore_args=())
    def dK2_dlengthscaledX(self, X, X2, dimX):
-        r = self._scaled_dist(X, X2)
+        """
-        K = self.K_of_r(r)
+        Compute the second derivative(s) of K with respect to:
-        if X2 is None:
+            lengthscale(s), and
-            X2=X
+            dimension dimX of set X.
-        dist = X[:,None,:]-X2[None,:,:]
+        """
-        lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
        dK_dX = self._clean_dK_dX(X, X2, dimX)
        dK_dl = self.dK_dlengthscale(X, X2)
        if self.ARD:
            g = []
-            for diml in range(X.shape[1]):
+            for diml in range(self.input_dim):
-                g += [-1.*K*dist[:,:,dimX]*(dist[:,:,diml]**2)*(lengthscaleinv[dimX]**2)*(lengthscaleinv[diml]**3) + 2.*dist[:,:,dimX]*(lengthscaleinv[diml]**3)*K*(dimX == diml)]
+                term = -dist[dimX]*(lengthscaleinv[dimX]**2)*dK_dl[diml]
                if diml == dimX:
                    term -= 2*lengthscaleinv[dimX]*dK_dX
                g += [term]
        else:
-            g = -1.*K*dist[:,:,dimX]*np.sum(dist**2, axis=2)*(lengthscaleinv[dimX]**5) + 2.*dist[:,:,dimX]*(lengthscaleinv[dimX]**3)*K
+            term = -dist[dimX]*(lengthscaleinv[0]**2)*dK_dl
            term -= 2*lengthscaleinv[0]*dK_dX
            g = term
        return g
    @Cache_this(limit=3, ignore_args=())
    def dK2_dlengthscaledX2(self, X, X2, dimX2):
-        tmp = self.dK2_dlengthscaledX(X, X2, dimX2)
+        """
        Compute the second derivative(s) of K with respect to:
            lengthscale(s), and
            dimension dimX2 of set X2.
        """
        dK2_dlengthscaledX = self.dK2_dlengthscaledX(X, X2, dimX2)
        if self.ARD:
-            return [-1.*g for g in tmp]
+            return [-1.*g for g in dK2_dlengthscaledX]
        else:
-            return -1*tmp
+            return -1*dK2_dlengthscaledX
    @Cache_this(limit=3, ignore_args=())
    def dK3_dvariancedXdX2(self, X, X2, dimX, dimX2):
        """
        Third derivative of K taken with respect to:
            the variance,
            dimension dimX of set X, and
            dimension dimX2 of set X2.
        Computed as the mixed second derivative from self._clean_dK2_dXdX2
        divided by the variance.
        """
        mixed_second = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
        return mixed_second/self.variance
    @Cache_this(limit=3, ignore_args=())
    def dK3_dlengthscaledXdX2(self, X, X2, dimX, dimX2):
-        r = self._scaled_dist(X, X2)
+        """
-        K = self.K_of_r(r)
+        Compute the third derivative(s) of K with respect to:
-        if X2 is None:
+            lengthscale(s),
-            X2=X
+            dimension dimX of set X, and
-        dist = X[:,None,:]-X2[None,:,:]
+            dimension dimX2 of set X2.
-        lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
+        """
-        lengthscale2inv = lengthscaleinv**2
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
        K = self._clean_K(X, X2)
        dK_dX = self._clean_dK_dX(X, X2, dimX)
        dK_dX2 = self._clean_dK_dX(X, X2, dimX2)
        dK2_dXdX2 = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
        if self.ARD:
            g = []
-            for diml in range(X.shape[1]):
+            for diml in range(self.input_dim):
-                tmp = -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*(dist[:,:,diml]**2)*lengthscale2inv[dimX]*lengthscale2inv[dimX2]*(lengthscaleinv[diml]**3)
+                term = (dist[diml]**2)*(lengthscaleinv[diml]**3)*dK2_dXdX2
                if dimX == dimX2:
                    tmp += K*lengthscale2inv[dimX]*(lengthscaleinv[diml]**3)*(dist[:,:,diml]**2)
                if diml == dimX:
-                    tmp += 2.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX2]*(lengthscaleinv[dimX]**3)
+                    term -= 2*dist[dimX]*(lengthscaleinv[dimX]**3)*dK_dX2
                if diml == dimX2:
-                    tmp += 2.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX]*(lengthscaleinv[dimX2]**3)
+                    term -= 2*dist[dimX2]*(lengthscaleinv[dimX2]**3)*dK_dX
-                    if dimX == dimX2:
+                if diml == dimX == dimX2:
-                        tmp += -2.*K*(lengthscaleinv[dimX]**3)
+                    term -= 2*(lengthscaleinv[dimX]**3)*K
-                g += [tmp]
+                g += [term]
        else:
-            g = -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*np.sum(dist**2, axis=2)*(lengthscaleinv[dimX]**7) +4*K*dist[:,:,dimX]*dist[:,:,dimX2]*(lengthscaleinv[dimX]**5)
+            term = np.sum(dist**2, axis=0)*dK2_dXdX2
            term -= 4*dist[dimX2]*dK_dX
            if dimX == dimX2:
-                g += -2.*K*(lengthscaleinv[dimX]**3) + K*(lengthscaleinv[dimX]**5)*np.sum(dist**2, axis=2)
+                term -= 2*K
            g = (lengthscaleinv[0]**3)*term
        return g
    def __getstate__(self):
--- a/GPy/kern/src/sde_standard_periodic.py
+++ b/GPy/kern/src/sde_standard_periodic.py
@ -13,6 +13,7 @@ import warnings
 from scipy import special as special
 class sde_StdPeriodic(StdPeriodic):
    """
@ -27,6 +28,7 @@ class sde_StdPeriodic(StdPeriodic):
       \left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
    """
    # TODO: write comment to the constructor arguments
    def __init__(self, *args, **kwargs):
        """
@ -42,18 +44,17 @@ class sde_StdPeriodic(StdPeriodic):
        :type balance: bool
        """
-        #import pdb; pdb.set_trace()
+        # import pdb; pdb.set_trace()
-        if 'approx_order' in kwargs:
+        if "approx_order" in kwargs:
-            self.approx_order = kwargs.get('approx_order')
+            self.approx_order = kwargs.get("approx_order")
-            del kwargs['approx_order']
+            del kwargs["approx_order"]
        else:
            self.approx_order = 7
-        
+        if "balance" in kwargs:
-        if 'balance' in kwargs:
+            self.balance = bool(kwargs.get("balance"))
-            self.balance = bool( kwargs.get('balance') )
+            del kwargs["balance"]
            del kwargs['balance']
        else:
            self.balance = False
@ -84,40 +85,57 @@ class sde_StdPeriodic(StdPeriodic):
        300 data points the low limit is 0.15.
        """
-        #import pdb; pdb.set_trace()
+        # import pdb; pdb.set_trace()
        # Params to use: (in that order)
-        #self.variance
+        # self.variance
-        #self.period
+        # self.period
-        #self.lengthscale
+        # self.lengthscale
        if self.approx_order is not None:
            N = int(self.approx_order)
        else:
-            N = 7 # approximation order        
+            N = 7  # approximation order
        p_period = float(self.period)
-        p_lengthscale = 2*float(self.lengthscale)
+        p_lengthscale = 2 * float(self.lengthscale)
        p_variance = float(self.variance)
-        w0 = 2*np.pi/p_period # frequency
+        w0 = 2 * np.pi / p_period  # frequency
        # lengthscale is multiplied by 2 because of different definition of lengthscale
-        [q2,dq2l] = seriescoeff(N, p_lengthscale, p_variance)        
+        [q2, dq2l] = seriescoeff(N, p_lengthscale, p_variance)
-        dq2l = 2*dq2l  # This is because the lengthscale if multiplied by 2.
+        dq2l = 2 * dq2l  # This is because the lengthscale if multiplied by 2.
        eps = 1e-12
-        if np.any( np.isfinite(q2) == False) or np.any( np.abs(q2) > 1.0/eps) or np.any( np.abs(q2) < eps):
+        if (
-            warnings.warn("sde_Periodic:  Infinite, too small, or too large (eps={0:e}) values in q2 :".format(eps) + q2.__format__("") )
+            np.any(np.isfinite(q2) == False)
            or np.any(np.abs(q2) > 1.0 / eps)
            or np.any(np.abs(q2) < eps)
        ):
            warnings.warn(
                "sde_Periodic:  Infinite, too small, or too large (eps={0:e}) values in q2 :".format(
                    eps
                )
                + q2.__format__("")
            )
-        if np.any( np.isfinite(dq2l) == False) or np.any( np.abs(dq2l) > 1.0/eps) or np.any( np.abs(dq2l) < eps):
+        if (
-            warnings.warn("sde_Periodic:  Infinite, too small, or too large (eps={0:e}) values in dq2l :".format(eps) + q2.__format__("") )
+            np.any(np.isfinite(dq2l) == False)
            or np.any(np.abs(dq2l) > 1.0 / eps)
            or np.any(np.abs(dq2l) < eps)
        ):
            warnings.warn(
                "sde_Periodic:  Infinite, too small, or too large (eps={0:e}) values in dq2l :".format(
                    eps
                )
                + q2.__format__("")
            )
-                 
+        F = np.kron(np.diag(range(0, N + 1)), np.array(((0, -w0), (w0, 0))))
-        F    = np.kron(np.diag(range(0,N+1)),np.array( ((0, -w0), (w0, 0)) ) )
+        L = np.eye(2 * (N + 1))
-        L    = np.eye(2*(N+1))
+        Qc = np.zeros((2 * (N + 1), 2 * (N + 1)))
-        Qc   = np.zeros((2*(N+1), 2*(N+1)))
+        P_inf = np.kron(np.diag(q2), np.eye(2))
-        P_inf = np.kron(np.diag(q2),np.eye(2))
+        H = np.kron(np.ones((1, N + 1)), np.array((1, 0)))
        H    = np.kron(np.ones((1,N+1)),np.array((1,0)) )
        P0 = P_inf.copy()
        # Derivatives
@ -126,32 +144,35 @@ class sde_StdPeriodic(StdPeriodic):
        dP_inf = np.empty((P_inf.shape[0], P_inf.shape[1], 3))
        # Derivatives wrt self.variance
-        dF[:,:,0] = np.zeros(F.shape)
+        dF[:, :, 0] = np.zeros(F.shape)
-        dQc[:,:,0] = np.zeros(Qc.shape)
+        dQc[:, :, 0] = np.zeros(Qc.shape)
-        dP_inf[:,:,0] = P_inf / p_variance
+        dP_inf[:, :, 0] = P_inf / p_variance
        # Derivatives self.period
-        dF[:,:,1] = np.kron(np.diag(range(0,N+1)),np.array( ((0,  w0), (-w0, 0)) ) / p_period );
+        dF[:, :, 1] = np.kron(
-        dQc[:,:,1] = np.zeros(Qc.shape)
+            np.diag(range(0, N + 1)), np.array(((0, w0), (-w0, 0))) / p_period
-        dP_inf[:,:,1] = np.zeros(P_inf.shape)      
+        )
        dQc[:, :, 1] = np.zeros(Qc.shape)
        dP_inf[:, :, 1] = np.zeros(P_inf.shape)
        # Derivatives self.lengthscales
-        dF[:,:,2] = np.zeros(F.shape)
+        dF[:, :, 2] = np.zeros(F.shape)
-        dQc[:,:,2] = np.zeros(Qc.shape)
+        dQc[:, :, 2] = np.zeros(Qc.shape)
-        dP_inf[:,:,2] = np.kron(np.diag(dq2l),np.eye(2))
+        dP_inf[:, :, 2] = np.kron(np.diag(dq2l), np.eye(2))
        dP0 = dP_inf.copy()
        if self.balance:
            # Benefits of this are not very sound.
            import GPy.models.state_space_main as ssm
-            (F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf,dP0) = ssm.balance_ss_model(F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0 )
+
            (F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0) = ssm.balance_ss_model(
                F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0
            )
        return (F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0)
-        
+def seriescoeff(m=6, lengthScale=1.0, magnSigma2=1.0, true_covariance=False):
 def seriescoeff(m=6,lengthScale=1.0,magnSigma2=1.0, true_covariance=False):
    """
    Calculate the coefficients q_j^2 for the covariance function
    approximation:
@ -192,34 +213,67 @@ def seriescoeff(m=6,lengthScale=1.0,magnSigma2=1.0, true_covariance=False):
    if true_covariance:
-        bb = lambda j,m: (1.0 + np.array((j != 0), dtype=np.float64) ) / (2**(j)) *\
+        bb = (
-            sp.special.binom(j, sp.floor( (j-m)/2.0 * np.array(m<=j, dtype=np.float64) ))*\
+            lambda j, m: (1.0 + np.array((j != 0), dtype=np.float64))
-            np.array(m<=j, dtype=np.float64) *np.array(sp.mod(j-m,2)==0, dtype=np.float64)
+            / (2 ** (j))
            * sp.special.binom(
                j, sp.floor((j - m) / 2.0 * np.array(m <= j, dtype=np.float64))
            )
            * np.array(m <= j, dtype=np.float64)
            * np.array(sp.mod(j - m, 2) == 0, dtype=np.float64)
        )
-        M,J = np.meshgrid(range(0,m+1),range(0,m+1))
+        M, J = np.meshgrid(range(0, m + 1), range(0, m + 1))
-        coeffs = bb(J,M) / sp.misc.factorial(J) * sp.exp( -lengthScale**(-2) ) *\
+        coeffs = (
-             (lengthScale**(-2))**J  *magnSigma2
+            bb(J, M)
            / sp.misc.factorial(J)
            * np.exp(-(lengthScale ** (-2)))
            * (lengthScale ** (-2)) ** J
            * magnSigma2
        )
-        coeffs_dl = np.sum( coeffs*lengthScale**(-3)*(2.0-2.0*J*lengthScale**2),0)         
+        coeffs_dl = np.sum(
            coeffs * lengthScale ** (-3) * (2.0 - 2.0 * J * lengthScale**2), 0
        )
-        coeffs = np.sum(coeffs,0)
+        coeffs = np.sum(coeffs, 0)
    else:
-        coeffs = 2*magnSigma2*sp.exp( -lengthScale**(-2) ) * special.iv(range(0,m+1),1.0/lengthScale**(2))
+        coeffs = (
-        if np.any( np.isfinite(coeffs) == False):
+            2
            * magnSigma2
            * np.exp(-(lengthScale ** (-2)))
            * special.iv(range(0, m + 1), 1.0 / lengthScale ** (2))
        )
        if np.any(np.isfinite(coeffs) == False):
            raise ValueError("sde_standard_periodic: Coefficients are not finite!")
-        #import pdb; pdb.set_trace()
+        # import pdb; pdb.set_trace()
-        coeffs[0] = 0.5*coeffs[0]
+        coeffs[0] = 0.5 * coeffs[0]
-        #print(coeffs)
+        # print(coeffs)
        # Derivatives wrt (lengthScale)
-        coeffs_dl = np.zeros(m+1)
+        coeffs_dl = np.zeros(m + 1)
-        coeffs_dl[1:] = magnSigma2*lengthScale**(-3) * sp.exp(-lengthScale**(-2))*\
+        coeffs_dl[1:] = (
-        (-4*special.iv(range(0,m),lengthScale**(-2)) + 4*(1+np.arange(1,m+1)*lengthScale**(2))*special.iv(range(1,m+1),lengthScale**(-2)) )    
+            magnSigma2
            * lengthScale ** (-3)
            * np.exp(-(lengthScale ** (-2)))
            * (
                -4 * special.iv(range(0, m), lengthScale ** (-2))
                + 4
                * (1 + np.arange(1, m + 1) * lengthScale ** (2))
                * special.iv(range(1, m + 1), lengthScale ** (-2))
            )
        )
        # The first element
-        coeffs_dl[0] = magnSigma2*lengthScale**(-3) * np.exp(-lengthScale**(-2))*\
+        coeffs_dl[0] = (
-            (2*special.iv(0,lengthScale**(-2)) - 2*special.iv(1,lengthScale**(-2)) )     
+            magnSigma2
-        
+            * lengthScale ** (-3)
            * np.exp(-(lengthScale ** (-2)))
            * (
                2 * special.iv(0, lengthScale ** (-2))
                - 2 * special.iv(1, lengthScale ** (-2))
            )
        )
    return coeffs.squeeze(), coeffs_dl.squeeze()
--- a/GPy/kern/src/sde_stationary.py
+++ b/GPy/kern/src/sde_stationary.py
@ -11,12 +11,14 @@ from .stationary import RatQuad
 import numpy as np
 import scipy as sp
 try:
    from scipy.linalg import solve_continuous_lyapunov as lyap
 except ImportError:
    from scipy.linalg import solve_lyapunov as lyap
 import warnings
 class sde_RBF(RBF):
    """
@ -30,6 +32,7 @@ class sde_RBF(RBF):
        k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \\ \\ \\ \\  \text{ where  } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
    """
    def __init__(self, *args, **kwargs):
        """
        Init constructior.
@ -44,21 +47,18 @@ class sde_RBF(RBF):
        :type balance: bool
        """
-        if 'balance' in kwargs:
+        if "balance" in kwargs:
-            self.balance = bool( kwargs.get('balance') )
+            self.balance = bool(kwargs.get("balance"))
-            del kwargs['balance']
+            del kwargs["balance"]
        else:
            self.balance = True
-        
+        if "approx_order" in kwargs:
-        if 'approx_order' in kwargs:
+            self.approx_order = kwargs.get("approx_order")
-            self.approx_order = kwargs.get('approx_order')
+            del kwargs["approx_order"]
            del kwargs['approx_order']
        else:
            self.approx_order = 6
        super(sde_RBF, self).__init__(*args, **kwargs)
    def sde_update_gradient_full(self, gradients):
@ -83,76 +83,101 @@ class sde_RBF(RBF):
        The above facts do not take into accout regularization.
        """
-        #import pdb; pdb.set_trace()
+        # import pdb; pdb.set_trace()
        if self.approx_order is not None:
            N = self.approx_order
        else:
-            N = 10# approximation order ( number of terms in exponent series expansion)
+            N = 10  # approximation order ( number of terms in exponent series expansion)
        roots_rounding_decimals = 6
        fn = np.math.factorial(N)
-        p_lengthscale = float( self.lengthscale )
+        p_lengthscale = float(self.lengthscale)
        p_variance = float(self.variance)
-        kappa = 1.0/2.0/p_lengthscale**2
+        kappa = 1.0 / 2.0 / p_lengthscale**2
-        Qc = np.array( ((p_variance*np.sqrt(np.pi/kappa)*fn*(4*kappa)**N,),) )
+        Qc = np.array(((p_variance * np.sqrt(np.pi / kappa) * fn * (4 * kappa) ** N,),))
        eps = 1e-12
-        if (float(Qc) > 1.0/eps) or (float(Qc) < eps):
+        if (float(Qc) > 1.0 / eps) or (float(Qc) < eps):
-            warnings.warn("""sde_RBF kernel: the noise variance Qc is either very large or very small. 
+            warnings.warn(
-                                It influece conditioning of P_inf: {0:e}""".format(float(Qc)) )
+                """sde_RBF kernel: the noise variance Qc is either very large or very small. 
                                It influece conditioning of P_inf: {0:e}""".format(
                    float(Qc)
                )
            )
-        pp1 = np.zeros((2*N+1,)) # array of polynomial coefficients from higher power to lower
+        pp1 = np.zeros(
            (2 * N + 1,)
        )  # array of polynomial coefficients from higher power to lower
-        for n in range(0, N+1): # (2N+1) - number of polynomial coefficients
+        for n in range(0, N + 1):  # (2N+1) - number of polynomial coefficients
-            pp1[2*(N-n)] = fn*(4.0*kappa)**(N-n)/np.math.factorial(n)*(-1)**n
+            pp1[2 * (N - n)] = (
                fn * (4.0 * kappa) ** (N - n) / np.math.factorial(n) * (-1) ** n
            )
-        pp = sp.poly1d(pp1)
+        pp = np.poly1d(pp1)
-        roots = sp.roots(pp)
+        roots = np.roots(pp)
-        neg_real_part_roots = roots[np.round(np.real(roots) ,roots_rounding_decimals) < 0]
+        neg_real_part_roots = roots[
-        aa = sp.poly1d(neg_real_part_roots, r=True).coeffs
+            np.round(np.real(roots), roots_rounding_decimals) < 0
        ]
        aa = np.poly1d(neg_real_part_roots, r=True).coeffs
-        F = np.diag(np.ones((N-1,)),1)
+        F = np.diag(np.ones((N - 1,)), 1)
-        F[-1,:] = -aa[-1:0:-1]
+        F[-1, :] = -aa[-1:0:-1]
-        L= np.zeros((N,1))
+        L = np.zeros((N, 1))
-        L[N-1,0] = 1
+        L[N - 1, 0] = 1
-        H = np.zeros((1,N))
+        H = np.zeros((1, N))
-        H[0,0] = 1
+        H[0, 0] = 1
        # Infinite covariance:
-        Pinf = lyap(F, -np.dot(L,np.dot( Qc[0,0],L.T)))
+        Pinf = lyap(F, -np.dot(L, np.dot(Qc[0, 0], L.T)))
-        Pinf = 0.5*(Pinf + Pinf.T)
+        Pinf = 0.5 * (Pinf + Pinf.T)
        # Allocating space for derivatives
-        dF    = np.empty([F.shape[0],F.shape[1],2])
+        dF = np.empty([F.shape[0], F.shape[1], 2])
-        dQc   = np.empty([Qc.shape[0],Qc.shape[1],2])
+        dQc = np.empty([Qc.shape[0], Qc.shape[1], 2])
-        dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2])
+        dPinf = np.empty([Pinf.shape[0], Pinf.shape[1], 2])
        # Derivatives:
        dFvariance = np.zeros(F.shape)
        dFlengthscale = np.zeros(F.shape)
-        dFlengthscale[-1,:] = -aa[-1:0:-1]/p_lengthscale * np.arange(-N,0,1)
+        dFlengthscale[-1, :] = -aa[-1:0:-1] / p_lengthscale * np.arange(-N, 0, 1)
-        dQcvariance = Qc/p_variance
+        dQcvariance = Qc / p_variance
-        dQclengthscale = np.array(( (p_variance*np.sqrt(2*np.pi)*fn*2**N*p_lengthscale**(-2*N)*(1-2*N),),))
+        dQclengthscale = np.array(
            (
                (
                    p_variance
                    * np.sqrt(2 * np.pi)
                    * fn
                    * 2**N
                    * p_lengthscale ** (-2 * N)
                    * (1 - 2 * N),
                ),
            )
        )
-        dPinf_variance = Pinf/p_variance
+        dPinf_variance = Pinf / p_variance
        lp = Pinf.shape[0]
-        coeff = np.arange(1,lp+1).reshape(lp,1) + np.arange(1,lp+1).reshape(1,lp) - 2
+        coeff = (
-        coeff[np.mod(coeff,2) != 0] = 0
+            np.arange(1, lp + 1).reshape(lp, 1)
-        dPinf_lengthscale = -1/p_lengthscale*Pinf*coeff
+            + np.arange(1, lp + 1).reshape(1, lp)
            - 2
        )
        coeff[np.mod(coeff, 2) != 0] = 0
        dPinf_lengthscale = -1 / p_lengthscale * Pinf * coeff
-        dF[:,:,0]    = dFvariance
+        dF[:, :, 0] = dFvariance
-        dF[:,:,1]    = dFlengthscale
+        dF[:, :, 1] = dFlengthscale
-        dQc[:,:,0]   = dQcvariance
+        dQc[:, :, 0] = dQcvariance
-        dQc[:,:,1]   = dQclengthscale
+        dQc[:, :, 1] = dQclengthscale
-        dPinf[:,:,0] = dPinf_variance
+        dPinf[:, :, 0] = dPinf_variance
-        dPinf[:,:,1] = dPinf_lengthscale
+        dPinf[:, :, 1] = dPinf_lengthscale
        P0 = Pinf.copy()
        dP0 = dPinf.copy()
@ -161,10 +186,14 @@ class sde_RBF(RBF):
            # Benefits of this are not very sound. Helps only in one case:
            # SVD Kalman + RBF kernel
            import GPy.models.state_space_main as ssm
-            (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf,dP0) = ssm.balance_ss_model(F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0 )
+
            (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0) = ssm.balance_ss_model(
                F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0
            )
        return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
 class sde_Exponential(Exponential):
    """
@ -195,30 +224,31 @@ class sde_Exponential(Exponential):
        variance = float(self.variance.values)
        lengthscale = float(self.lengthscale)
-        F  = np.array(((-1.0/lengthscale,),))
+        F = np.array(((-1.0 / lengthscale,),))
-        L  = np.array(((1.0,),))
+        L = np.array(((1.0,),))
-        Qc = np.array( ((2.0*variance/lengthscale,),) )
+        Qc = np.array(((2.0 * variance / lengthscale,),))
        H = np.array(((1.0,),))
        Pinf = np.array(((variance,),))
        P0 = Pinf.copy()
-        dF = np.zeros((1,1,2));
+        dF = np.zeros((1, 1, 2))
-        dQc = np.zeros((1,1,2));
+        dQc = np.zeros((1, 1, 2))
-        dPinf = np.zeros((1,1,2));
+        dPinf = np.zeros((1, 1, 2))
-        dF[:,:,0] = 0.0
+        dF[:, :, 0] = 0.0
-        dF[:,:,1] = 1.0/lengthscale**2
+        dF[:, :, 1] = 1.0 / lengthscale**2
-        dQc[:,:,0] = 2.0/lengthscale
+        dQc[:, :, 0] = 2.0 / lengthscale
-        dQc[:,:,1] = -2.0*variance/lengthscale**2
+        dQc[:, :, 1] = -2.0 * variance / lengthscale**2
-        dPinf[:,:,0] = 1.0
+        dPinf[:, :, 0] = 1.0
-        dPinf[:,:,1] = 0.0
+        dPinf[:, :, 1] = 0.0
        dP0 = dPinf.copy()
        return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
 class sde_RatQuad(RatQuad):
    """
@ -238,12 +268,12 @@ class sde_RatQuad(RatQuad):
        Return the state space representation of the covariance.
        """
-        assert False, 'Not Implemented'
+        assert False, "Not Implemented"
        # Params to use:
        # self.lengthscale
        # self.variance
-        #self.power
+        # self.power
-        #return (F, L, Qc, H, Pinf, dF, dQc, dPinf)
+        # return (F, L, Qc, H, Pinf, dF, dQc, dPinf)
--- a/GPy/kern/src/standard_periodic.py
+++ b/GPy/kern/src/standard_periodic.py
@ -122,7 +122,6 @@ class StdPeriodic(Kern):
        pass
    def K(self, X, X2=None):
        """Compute the covariance matrix between X and X2."""
        if X2 is None:
@ -133,13 +132,372 @@ class StdPeriodic(Kern):
        return self.variance * exp_dist
    def Kdiag(self, X):
        """Return the diagonal of the covariance matrix of X with itself.

        Every diagonal entry equals ``self.variance``, so the result is a
        1-D array with one entry per row of X.
        """
        num_points = X.shape[0]
        diagonal = np.empty(num_points)
        # Broadcast-assign so a length-1 variance fills every entry.
        diagonal[...] = self.variance
        return diagonal
    def dK_dX(self, X, X2, dimX):
        """
        Derivative of K with respect to input dimension dimX of set X.

        Returns an array with one row per point in X and one column per
        point in X2.
        """
        num_dims = X.shape[1]
        # Dividing a ones-vector broadcasts a shared parameter to every dimension.
        inv_ell = (np.ones(num_dims) / self.lengthscale)[dimX]
        inv_period = (np.ones(num_dims) / self.period)[dimX]
        prefactor = 0.5 * np.pi * (inv_ell ** 2) * inv_period
        # Pairwise differences along dimX, scaled to the sine argument.
        delta = X[:, None, dimX] - X2[None, :, dimX]
        phase = np.pi * inv_period * delta
        return -prefactor * np.sin(2 * phase) * self._clean_K(X, X2)
    def dK_dXdiag(self, X, dimX):
        """
        Diagonal of the derivative of K with respect to dimension dimX of
        set X.

        Always a zero vector with one entry per row of X.
        """
        num_points = X.shape[0]
        return np.zeros(num_points)
    def dK_dX2(self, X, X2, dimX2):
        """
        Derivative of K with respect to input dimension dimX2 of set X2.

        Computed as the negation of the derivative with respect to the same
        dimension of X.
        """
        wrt_first_arg = self._clean_dK_dX(X, X2, dimX2)
        return -wrt_first_arg
    def dK_dX2diag(self, X, dimX2):
        """
        Diagonal of the derivative of K with respect to dimension dimX2 of
        set X2.

        Always a zero vector with one entry per row of X.
        """
        num_points = X.shape[0]
        return np.zeros(num_points)
    def dK2_dXdX2(self, X, X2, dimX, dimX2):
        """
        Second derivative of K with respect to dimension dimX of set X and
        dimension dimX2 of set X2.
        """
        num_dims = X.shape[1]
        inv_ell = (np.ones(num_dims) / self.lengthscale)[dimX2]
        inv_period = (np.ones(num_dims) / self.period)[dimX2]
        prefactor = 0.5 * np.pi * (inv_ell ** 2) * inv_period
        delta = X[:, None, dimX2] - X2[None, :, dimX2]
        phase = np.pi * inv_period * delta
        # Product-rule term that is present for every pair of dimensions.
        acc = np.sin(2 * phase) * self._clean_dK_dX(X, X2, dimX)
        if dimX == dimX2:
            # Extra term from differentiating the sine factor itself.
            acc += 2 * np.pi * inv_period * np.cos(2 * phase) * self._clean_K(X, X2)
        return prefactor * acc
    def dK2_dXdX2diag(self, X, dimX, dimX2):
        """
        Diagonal of the second derivative of K with respect to dimension
        dimX of set X and dimension dimX2 of set X2.

        Zero for distinct dimensions; otherwise a constant vector built from
        the variance, lengthscale and period of that dimension.
        """
        num_points = X.shape[0]
        if dimX != dimX2:
            return np.zeros(num_points)
        num_dims = X.shape[1]
        inv_ell = (np.ones(num_dims) / self.lengthscale)[dimX2]
        inv_period = (np.ones(num_dims) / self.period)[dimX2]
        return (np.pi ** 2) * (inv_ell ** 2) * (inv_period ** 2) * self.variance * np.ones(num_points)
    def dK2_dXdX(self, X, X2, dimX_0, dimX_1):
        """
        Second derivative of K with respect to dimensions dimX_0 and dimX_1,
        both of set X.

        Computed as the negation of the mixed X/X2 second derivative.
        """
        mixed = self._clean_dK2_dXdX2(X, X2, dimX_0, dimX_1)
        return -mixed
    def dK2_dXdXdiag(self, X, dimX_0, dimX_1):
        """
        Diagonal of the second derivative of K with respect to dimensions
        dimX_0 and dimX_1, both of set X.

        Computed as the negation of the diagonal of the mixed X/X2 second
        derivative.
        """
        mixed_diag = self._clean_dK2_dXdX2diag(X, dimX_0, dimX_1)
        return -mixed_diag
    def dK3_dXdXdX2(self, X, X2, dimX_0, dimX_1, dimX2):
        """
        Third derivative of K with respect to dimensions dimX_0 and dimX_1
        of set X and dimension dimX2 of set X2.
        """
        num_dims = X.shape[1]
        inv_ell = (np.ones(num_dims) / self.lengthscale)[dimX2]
        inv_period = (np.ones(num_dims) / self.period)[dimX2]
        prefactor = 0.5 * np.pi * (inv_ell ** 2) * inv_period
        delta = X[:, None, dimX2] - X2[None, :, dimX2]
        phase = np.pi * inv_period * delta
        sin_2phase = np.sin(2 * phase)
        # Term always present: sine factor times the X/X second derivative.
        acc = sin_2phase * self._clean_dK2_dXdX(X, X2, dimX_0, dimX_1)
        # Each X dimension that coincides with dimX2 adds a cosine term
        # multiplied by the first derivative along the other X dimension.
        if dimX_0 == dimX2:
            acc += 2 * np.pi * inv_period * np.cos(2 * phase) * self._clean_dK_dX(X, X2, dimX_1)
        if dimX_1 == dimX2:
            acc += 2 * np.pi * inv_period * np.cos(2 * phase) * self._clean_dK_dX(X, X2, dimX_0)
        # When all three dimensions coincide the sine factor is differentiated twice.
        if dimX_0 == dimX_1 == dimX2:
            acc -= 4 * (np.pi ** 2) * (inv_period ** 2) * sin_2phase * self._clean_K(X, X2)
        return prefactor * acc
    def dK3_dXdXdX2diag(self, X, dimX_0, dimX_1, dimX2):
        """
        Diagonal of the third derivative of K with respect to dimensions
        dimX_0 and dimX_1 of set X and dimension dimX2 of set X2.

        Always a zero vector with one entry per row of X.
        """
        num_points = X.shape[0]
        return np.zeros(num_points)
    def dK_dvariance(self, X, X2):
        """
        Derivative of K with respect to the variance parameter.

        Computed as the covariance matrix divided by the variance.
        """
        covariance = self._clean_K(X, X2)
        return covariance / self.variance
    def dK_dlengthscale(self, X, X2):
        """
        Derivative(s) of K with respect to the lengthscale(s).

        Returns a list with one array per input dimension when ``self.ARD2``
        is set, otherwise a single array summed over all dimensions.
        """
        num_dims = X.shape[1]
        inv_ell = np.ones(num_dims) / self.lengthscale
        inv_period = np.ones(num_dims) / self.period
        # Pairwise differences with the input dimension moved to the front.
        delta = np.transpose(X[:, None, :] - X2[None, :, :], (2, 0, 1))
        phase = np.pi * inv_period[:, None, None] * delta
        K = self._clean_K(X, X2)
        if not self.ARD2:
            # A shared lengthscale: read it from the first broadcast entry.
            return (inv_ell[0] ** 3) * np.sum(np.square(np.sin(phase)), axis=0) * K
        return [
            (inv_ell[d] ** 3) * np.square(np.sin(phase[d])) * K
            for d in range(self.input_dim)
        ]
    def dK_dperiod(self, X, X2):
        """
        Derivative(s) of K with respect to the period(s).

        Returns a list with one array per input dimension when ``self.ARD1``
        is set, otherwise a single array summed over all dimensions.
        """
        num_dims = X.shape[1]
        inv_ell = np.ones(num_dims) / self.lengthscale
        inv_period = np.ones(num_dims) / self.period
        # Pairwise differences with the input dimension moved to the front.
        delta = np.transpose(X[:, None, :] - X2[None, :, :], (2, 0, 1))
        phase = np.pi * inv_period[:, None, None] * delta
        K = self._clean_K(X, X2)
        if not self.ARD1:
            # A shared period: read it from the first broadcast entry.
            summed = np.sum(phase * (inv_ell ** 2)[:, None, None] * np.sin(2 * phase), axis=0)
            return 0.5 * inv_period[0] * summed * K
        return [
            0.5 * phase[d] * (inv_ell[d] ** 2) * inv_period[d] * np.sin(2 * phase[d]) * K
            for d in range(self.input_dim)
        ]
    def dK2_dvariancedX(self, X, X2, dimX):
        """
        Second derivative of K with respect to the variance and dimension
        dimX of set X.

        Computed as the X-derivative of K divided by the variance.
        """
        wrt_x = self._clean_dK_dX(X, X2, dimX)
        return wrt_x / self.variance
    def dK2_dvariancedX2(self, X, X2, dimX2):
        """
        Second derivative of K with respect to the variance and dimension
        dimX2 of set X2.

        Computed as the negation of the corresponding derivative taken with
        respect to X.
        """
        wrt_first_arg = self.dK2_dvariancedX(X, X2, dimX2)
        return -wrt_first_arg
    def dK2_dlengthscaledX(self, X, X2, dimX):
        """
        Compute the second derivative(s) of K with respect to:
            lengthscale(s), and
            dimension dimX of set X.

        Returns a list with one array per input dimension when ``self.ARD2``
        is set, otherwise a single array.
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
        periodinv = (np.ones(X.shape[1])/(self.period))[dimX]
        dist = X[:,None,dimX] - X2[None,:,dimX]
        base = np.pi*periodinv*dist
        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
        K = self._clean_K(X, X2)
        dK_dl = self.dK_dlengthscale(X, X2)
        # Correction arising from the lengthscale dependence of F itself;
        # it only applies to the lengthscale of dimension dimX.
        own_lengthscale_term = 2*lengthscaleinv*K
        if self.ARD2:
            g = []
            for diml in range(self.input_dim):
                term = dK_dl[diml]
                if diml == dimX:
                    # Build a new array rather than using `-=`, so the array
                    # returned by dK_dlengthscale is never modified in place.
                    term = term - own_lengthscale_term
                g += [-F*np.sin(2*base)*term]
        else:
            g = -F*np.sin(2*base)*(dK_dl - own_lengthscale_term)
        return g
    def dK2_dlengthscaledX2(self, X, X2, dimX2):
        """
        Second derivative(s) of K with respect to the lengthscale(s) and
        dimension dimX2 of set X2.

        Negates the corresponding derivative taken with respect to X,
        element by element when ``self.ARD2`` makes it a list.
        """
        wrt_first_arg = self.dK2_dlengthscaledX(X, X2, dimX2)
        if not self.ARD2:
            return -1 * wrt_first_arg
        return [-1 * grad for grad in wrt_first_arg]
    def dK2_dperioddX(self, X, X2, dimX):
        """
        Second derivative(s) of K with respect to the period(s) and
        dimension dimX of set X.

        Returns a list with one array per input dimension when ``self.ARD1``
        is set, otherwise a single array.
        """
        num_dims = X.shape[1]
        inv_ell = (np.ones(num_dims) / self.lengthscale)[dimX]
        inv_period = (np.ones(num_dims) / self.period)[dimX]
        delta = X[:, None, dimX] - X2[None, :, dimX]
        phase = np.pi * inv_period * delta
        prefactor = 0.5 * np.pi * (inv_ell ** 2) * inv_period
        K = self._clean_K(X, X2)
        dK_dT = self.dK_dperiod(X, X2)
        sin_2phase = np.sin(2 * phase)
        # Contribution of the period of dimension dimX through the prefactor
        # and the sine argument; it only applies to that one period.
        own_period_term = inv_period * (sin_2phase + 2 * phase * np.cos(2 * phase)) * K
        if not self.ARD1:
            return -prefactor * (sin_2phase * dK_dT - own_period_term)
        grads = []
        for dimT in range(self.input_dim):
            acc = sin_2phase * dK_dT[dimT]
            if dimT == dimX:
                acc = acc - own_period_term
            grads.append(-prefactor * acc)
        return grads
    def dK2_dperioddX2(self, X, X2, dimX2):
        """
        Second derivative(s) of K with respect to the period(s) and
        dimension dimX2 of set X2.

        Negates the corresponding derivative taken with respect to X,
        element by element when ``self.ARD1`` makes it a list.
        """
        wrt_first_arg = self.dK2_dperioddX(X, X2, dimX2)
        if not self.ARD1:
            return -1 * wrt_first_arg
        return [-1 * grad for grad in wrt_first_arg]
    def dK3_dvariancedXdX2(self, X, X2, dimX, dimX2):
        """
        Third derivative of K with respect to the variance, dimension dimX
        of set X and dimension dimX2 of set X2.

        Computed as the mixed X/X2 second derivative divided by the variance.
        """
        mixed = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
        return mixed / self.variance
    def dK3_dlengthscaledXdX2(self, X, X2, dimX, dimX2):
        """
        Compute the third derivative(s) of K with respect to:
            lengthscale(s),
            dimension dimX of set X, and
            dimension dimX2 of set X2.

        Returns a list with one array per input dimension when self.ARD2 is
        set, otherwise a single array.
        """
        # Per-dimension parameters are read for dimX2; dividing a ones-vector
        # broadcasts a shared (non-ARD) parameter to every dimension.
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
        # Pairwise differences along dimX2, scaled to the sine argument.
        dist = X[:,None,dimX2] - X2[None,:,dimX2]
        base = np.pi*periodinv*dist
        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
        # Lower-order derivatives that the terms below are combined from.
        dK2_dXdX2 = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
        dK_dl = self.dK_dlengthscale(X, X2)
        dK2_dldX = self.dK2_dlengthscaledX(X, X2, dimX)
        if self.ARD2:
            g = []
            for diml in range(self.input_dim):
                # `term` is a fresh array (result of a product), so the
                # in-place updates below do not touch dK2_dldX or dK_dl.
                term = np.sin(2*base)*dK2_dldX[diml]
                if dimX == dimX2:
                    term += 2*np.pi*periodinv*np.cos(2*base)*dK_dl[diml]
                term *= F
                # F depends on the lengthscale of dimX2 only, so this
                # correction is applied just for that lengthscale.
                if diml == dimX2:
                    term -= 2*lengthscaleinv*dK2_dXdX2
                g += [term]
        else:
            # Shared lengthscale: the correction for F always applies.
            term = np.sin(2*base)*dK2_dldX
            if dimX == dimX2:
                term += 2*np.pi*periodinv*np.cos(2*base)*dK_dl
            term *= F
            term -= 2*lengthscaleinv*dK2_dXdX2
            g = term
        return g
    def dK3_dperioddXdX2(self, X, X2, dimX, dimX2):
        """
        Compute the third derivative(s) of K with respect to:
            period(s),
            dimension dimX of set X, and
            dimension dimX2 of set X2.

        Returns a list with one array per input dimension when self.ARD1 is
        set, otherwise a single array.
        """
        # Per-dimension parameters are read for dimX2; dividing a ones-vector
        # broadcasts a shared (non-ARD) parameter to every dimension.
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
        # Pairwise differences along dimX2, scaled to the sine argument.
        dist = X[:,None,dimX2] - X2[None,:,dimX2]
        base = np.pi*periodinv*dist
        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
        # Lower-order derivatives that the terms below are combined from.
        K = self._clean_K(X, X2)
        dK_dX = self._clean_dK_dX(X, X2, dimX)
        dK2_dXdX2 = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
        dK_dT = self.dK_dperiod(X, X2)
        dK2_dTdX = self.dK2_dperioddX(X, X2, dimX)
        if self.ARD1:
            g = []
            for dimT in range(self.input_dim):
                # `term` is a fresh array (result of a product), so the
                # in-place updates below do not touch dK2_dTdX or dK_dT.
                term = np.sin(2*base)*dK2_dTdX[dimT]
                # `base` depends on the period of dimX2 only.
                if dimT == dimX2:
                    term -= 2*periodinv*np.cos(2*base)*base*dK_dX
                if dimX == dimX2:
                    term += 2*np.pi*periodinv*np.cos(2*base)*dK_dT[dimT]
                if dimX == dimX2 == dimT:
                    term += 2*np.pi*(periodinv**2)*(2*base*np.sin(2*base)-np.cos(2*base))*K
                term *= F
                # F depends on the period of dimX2 only, so this correction
                # is applied just for that period.
                if dimT == dimX2:
                    term -= periodinv*dK2_dXdX2
                g += [term]
        else:
            # Shared period: every period-dependent correction always applies.
            term = np.sin(2*base)*dK2_dTdX-2*periodinv*base*np.cos(2*base)*dK_dX
            if dimX == dimX2:
                term += 2*np.pi*periodinv*(np.cos(2*base)*dK_dT+periodinv*(2*base*np.sin(2*base)-np.cos(2*base))*K)
            g = F*term-periodinv*dK2_dXdX2
        return g
    def update_gradients_full(self, dL_dK, X, X2=None):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None:
@ -167,12 +525,52 @@ class StdPeriodic(Kern):
        else: # same lengthscales
            self.lengthscale.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)
    def update_gradients_direct(self, dL_dVar, dL_dPer, dL_dLen):
        """Store the supplied gradients on variance, period and lengthscale."""
        assignments = (
            (self.variance, dL_dVar),
            (self.period, dL_dPer),
            (self.lengthscale, dL_dLen),
        )
        for parameter, gradient in assignments:
            parameter.gradient = gradient
    def reset_gradients(self):
        """Zero the gradients of variance, period and lengthscale.

        Period and lengthscale receive a zero vector of length
        ``self.input_dim`` when their ARD flag (``ARD1`` for period, ``ARD2``
        for lengthscale) is set, and a scalar zero otherwise.
        """
        self.variance.gradient = 0.
        if self.ARD1:
            self.period.gradient = np.zeros(self.input_dim)
        else:
            self.period.gradient = 0.
        if self.ARD2:
            self.lengthscale.gradient = np.zeros(self.input_dim)
        else:
            self.lengthscale.gradient = 0.
    def update_gradients_diag(self, dL_dKdiag, X):
        """Set parameter gradients from the gradient w.r.t. the diagonal of K.

        The variance gradient is the sum of ``dL_dKdiag``; the period and
        lengthscale gradients are set to zero.
        """
        variance_gradient = np.sum(dL_dKdiag)
        self.variance.gradient = variance_gradient
        self.period.gradient = 0
        self.lengthscale.gradient = 0
    def dgradients(self, X, X2):
        """Return the derivatives of K w.r.t. [variance, period, lengthscale]."""
        return [
            self.dK_dvariance(X, X2),
            self.dK_dperiod(X, X2),
            self.dK_dlengthscale(X, X2),
        ]
    def dgradients_dX(self, X, X2, dimX):
        """Return the derivatives w.r.t. dimension dimX of X of the
        [variance, period, lengthscale] derivatives of K."""
        return [
            self.dK2_dvariancedX(X, X2, dimX),
            self.dK2_dperioddX(X, X2, dimX),
            self.dK2_dlengthscaledX(X, X2, dimX),
        ]
    def dgradients_dX2(self, X, X2, dimX2):
        """Return the derivatives w.r.t. dimension dimX2 of X2 of the
        [variance, period, lengthscale] derivatives of K."""
        return [
            self.dK2_dvariancedX2(X, X2, dimX2),
            self.dK2_dperioddX2(X, X2, dimX2),
            self.dK2_dlengthscaledX2(X, X2, dimX2),
        ]
    def dgradients2_dXdX2(self, X, X2, dimX, dimX2):
        """Return the mixed derivatives (dimension dimX of X, dimension dimX2
        of X2) of the [variance, period, lengthscale] derivatives of K."""
        return [
            self.dK3_dvariancedXdX2(X, X2, dimX, dimX2),
            self.dK3_dperioddXdX2(X, X2, dimX, dimX2),
            self.dK3_dlengthscaledXdX2(X, X2, dimX, dimX2),
        ]
    def gradients_X(self, dL_dK, X, X2=None):
        K = self.K(X, X2)
        if X2 is None:
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@ -307,6 +307,11 @@ class Stationary(Kern):
        return dL_dK_diag * (np.eye(X.shape[1]) * -self.dK2_drdr_diag()/(l4))[None, :,:]# np.zeros(X.shape+(X.shape[1],))
        #return np.ones(X.shape) * d2L_dK * self.variance/self.lengthscale**2 # np.zeros(X.shape)
    def dgradients(self, X, X2):
        """Return the derivatives of K w.r.t. [variance, lengthscale]."""
        return [
            self.dK_dvariance(X, X2),
            self.dK_dlengthscale(X, X2),
        ]
    def dgradients_dX(self, X, X2, dimX):
        g1 = self.dK2_dvariancedX(X, X2, dimX)
        g2 = self.dK2_dlengthscaledX(X, X2, dimX)
--- a/GPy/kern/src/stationary_cython.c
+++ b/GPy/kern/src/stationary_cython.c
--- a/GPy/kern/src/todo/eq_ode1.py
+++ b/GPy/kern/src/todo/eq_ode1.py
@ -121,7 +121,7 @@ class Eq_ode1(Kernpart):
            target+=self.initial_variance * np.exp(- self.decay * (t1_mat + t2_mat))
    def Kdiag(self,index,target):
-        #target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()]
+        #target += np.diag(self.B)[np.asarray(index,dtype=int).flatten()]
        pass
    def _param_grad_helper(self,dL_dK,X,X2,target):
@ -203,7 +203,7 @@ class Eq_ode1(Kernpart):
        self._t = X[:, 0]
        if not X.shape[1] == 2:
            raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
-        self._index = np.asarray(X[:, 1],dtype=np.int)
+        self._index = np.asarray(X[:, 1],dtype=int)
        # Sort indices so that outputs are in blocks for computational
        # convenience.
        self._order = self._index.argsort()
@ -220,7 +220,7 @@ class Eq_ode1(Kernpart):
            if not X2.shape[1] == 2:
                raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
            self._t2 = X2[:, 0]
-            self._index2 = np.asarray(X2[:, 1],dtype=np.int)
+            self._index2 = np.asarray(X2[:, 1],dtype=int)
            self._order2 = self._index2.argsort()
            self._index2 = self._index2[self._order2]
            self._t2 = self._t2[self._order2]
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@ -12,6 +12,7 @@ from ..core.parameterization import Param
 from paramz.transformations import Logexp
 from scipy.special import psi as digamma
 class StudentT(Likelihood):
    """
    Student T likelihood
@ -22,17 +23,18 @@ class StudentT(Likelihood):
        p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
    """
-    def __init__(self,gp_link=None, deg_free=5, sigma2=2):
+
    def __init__(self, gp_link=None, deg_free=5, sigma2=2):
        if gp_link is None:
            gp_link = link_functions.Identity()
-        super(StudentT, self).__init__(gp_link, name='Student_T')
+        super(StudentT, self).__init__(gp_link, name="Student_T")
        # sigma2 is not a noise parameter, it is a squared scale.
-        self.sigma2 = Param('t_scale2', float(sigma2), Logexp())
+        self.sigma2 = Param("t_scale2", float(sigma2), Logexp())
-        self.v = Param('deg_free', float(deg_free), Logexp())
+        self.v = Param("deg_free", float(deg_free), Logexp())
        self.link_parameter(self.sigma2)
        self.link_parameter(self.v)
-        #self.v.constrain_fixed()
+        # self.v.constrain_fixed()
        self.log_concave = False
@ -61,11 +63,14 @@ class StudentT(Likelihood):
        """
        assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
        e = y - inv_link_f
-        #Careful gamma(big_number) is infinity!
+        # Careful gamma(big_number) is infinity!
-        objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
+        objective = (
-                     / (np.sqrt(self.v * np.pi * self.sigma2)))
+            np.exp(gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5))
-                     * ((1 + (1./float(self.v))*((e**2)/float(self.sigma2)))**(-0.5*(self.v + 1)))
+            / (np.sqrt(self.v * np.pi * self.sigma2))
-                    )
+        ) * (
            (1 + (1.0 / float(self.v)) * ((e**2) / float(self.sigma2)))
            ** (-0.5 * (self.v + 1))
        )
        return np.prod(objective)
    def logpdf_link(self, inv_link_f, y, Y_metadata=None):
@ -85,15 +90,16 @@ class StudentT(Likelihood):
        """
        e = y - inv_link_f
-        #FIXME:
+        # FIXME:
-        #Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?!
+        # Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?!
-        #But np.log(1 + (1/float(self.v))*((y-inv_link_f)**2)/self.sigma2) throws it correctly
+        # But np.log(1 + (1/float(self.v))*((y-inv_link_f)**2)/self.sigma2) throws it correctly
-        #print - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
+        # print - 0.5*(self.v + 1)*np.log(1 + (1/(self.v))*((e**2)/self.sigma2))
-        objective = (+ gammaln((self.v + 1) * 0.5)
+        objective = (
-                    - gammaln(self.v * 0.5)
+            +gammaln((self.v + 1) * 0.5)
-                    - 0.5*np.log(self.sigma2 * self.v * np.pi)
+            - gammaln(self.v * 0.5)
-                    - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
+            - 0.5 * np.log(self.sigma2 * self.v * np.pi)
-                    )
+            - 0.5 * (self.v + 1) * np.log(1 + (1 / (self.v)) * ((e**2) / self.sigma2))
        )
        return objective
    def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
@ -138,7 +144,9 @@ class StudentT(Likelihood):
            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
        """
        e = y - inv_link_f
-        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
+        hess = ((self.v + 1) * (e**2 - self.v * self.sigma2)) / (
            (self.sigma2 * self.v + e**2) ** 2
        )
        return hess
    def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
@ -157,9 +165,9 @@ class StudentT(Likelihood):
        :rtype: Nx1 array
        """
        e = y - inv_link_f
-        d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+        d3lik_dlink3 = -(
-                       ((e**2 + self.sigma2*self.v)**3)
+            2 * (self.v + 1) * (-e) * (e**2 - 3 * self.v * self.sigma2)
-                    )
+        ) / ((e**2 + self.sigma2 * self.v) ** 3)
        return d3lik_dlink3
    def dlogpdf_link_dvar(self, inv_link_f, y, Y_metadata=None):
@ -179,7 +187,11 @@ class StudentT(Likelihood):
        """
        e = y - inv_link_f
        e2 = np.square(e)
-        dlogpdf_dvar = self.v*(e2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e2))
+        dlogpdf_dvar = (
            self.v
            * (e2 - self.sigma2)
            / (2 * self.sigma2 * (self.sigma2 * self.v + e2))
        )
        return dlogpdf_dvar
    def dlogpdf_dlink_dvar(self, inv_link_f, y, Y_metadata=None):
@ -198,7 +210,9 @@ class StudentT(Likelihood):
        :rtype: Nx1 array
        """
        e = y - inv_link_f
-        dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
+        dlogpdf_dlink_dvar = (self.v * (self.v + 1) * (-e)) / (
            (self.sigma2 * self.v + e**2) ** 2
        )
        return dlogpdf_dlink_dvar
    def d2logpdf_dlink2_dvar(self, inv_link_f, y, Y_metadata=None):
@ -217,9 +231,9 @@ class StudentT(Likelihood):
        :rtype: Nx1 array
        """
        e = y - inv_link_f
-        d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
+        d2logpdf_dlink2_dvar = (
-                              / ((self.sigma2*self.v + (e**2))**3)
+            self.v * (self.v + 1) * (self.sigma2 * self.v - 3 * (e**2))
-                           )
+        ) / ((self.sigma2 * self.v + (e**2)) ** 3)
        return d2logpdf_dlink2_dvar
    def dlogpdf_link_dv(self, inv_link_f, y, Y_metadata=None):
@ -227,9 +241,11 @@ class StudentT(Likelihood):
        e2 = np.square(e)
        df = float(self.v[:])
        s2 = float(self.sigma2[:])
-        dlogpdf_dv =  0.5*digamma(0.5*(df+1)) - 0.5*digamma(0.5*df) - 1.0/(2*df)
+        dlogpdf_dv = (
-        dlogpdf_dv += 0.5*(df+1)*e2/(df*(e2 + s2*df))
+            0.5 * digamma(0.5 * (df + 1)) - 0.5 * digamma(0.5 * df) - 1.0 / (2 * df)
-        dlogpdf_dv -= 0.5*np.log1p(e2/(s2*df))
+        )
        dlogpdf_dv += 0.5 * (df + 1) * e2 / (df * (e2 + s2 * df))
        dlogpdf_dv -= 0.5 * np.log1p(e2 / (s2 * df))
        return dlogpdf_dv
    def dlogpdf_dlink_dv(self, inv_link_f, y, Y_metadata=None):
@ -237,7 +253,7 @@ class StudentT(Likelihood):
        e2 = np.square(e)
        df = float(self.v[:])
        s2 = float(self.sigma2[:])
-        dlogpdf_df_dv = e*(e2 - self.sigma2)/(e2 + s2*df)**2
+        dlogpdf_df_dv = e * (e2 - self.sigma2) / (e2 + s2 * df) ** 2
        return dlogpdf_df_dv
    def d2logpdf_dlink2_dv(self, inv_link_f, y, Y_metadata=None):
@ -245,8 +261,10 @@ class StudentT(Likelihood):
        e2 = np.square(e)
        df = float(self.v[:])
        s2 = float(self.sigma2[:])
-        e2_s2v = e**2 + s2*df
+        e2_s2v = e**2 + s2 * df
-        d2logpdf_df2_dv = (-s2*(df+1) + e2 - s2*df)/e2_s2v**2 - 2*s2*(df+1)*(e2 - s2*df)/e2_s2v**3
+        d2logpdf_df2_dv = (-s2 * (df + 1) + e2 - s2 * df) / e2_s2v**2 - 2 * s2 * (
            df + 1
        ) * (e2 - s2 * df) / e2_s2v**3
        return d2logpdf_df2_dv
    def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
@ -266,19 +284,23 @@ class StudentT(Likelihood):
    def predictive_mean(self, mu, sigma, Y_metadata=None):
        # The comment here confuses mean and median.
-        return self.gp_link.transf(mu) # only true if link is monotonic, which it is.
+        return self.gp_link.transf(mu)  # only true if link is monotonic, which it is.
-    def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
+    def predictive_variance(self, mu, variance, predictive_mean=None, Y_metadata=None):
-        if self.deg_free<=2.:
+        if self.deg_free <= 2.0:
-            return np.empty(mu.shape)*np.nan # does not exist for degrees of freedom <= 2.
+            return (
                np.empty(mu.shape) * np.nan
            )  # does not exist for degrees of freedom <= 2.
        else:
-            return super(StudentT, self).predictive_variance(mu, variance, predictive_mean, Y_metadata)
+            return super(StudentT, self).predictive_variance(
                mu, variance, predictive_mean, Y_metadata
            )
    def conditional_mean(self, gp):
        return self.gp_link.transf(gp)
    def conditional_variance(self, gp):
-        return self.deg_free/(self.deg_free - 2.)
+        return self.deg_free / (self.deg_free - 2.0)
    def samples(self, gp, Y_metadata=None):
        """
@ -288,11 +310,10 @@ class StudentT(Likelihood):
        """
        orig_shape = gp.shape
        gp = gp.flatten()
-        #FIXME: Very slow as we are computing a new random variable per input!
+        # FIXME: Very slow as we are computing a new random variable per input!
-        #Can't get it to sample all at the same time
+        # Can't get it to sample all at the same time
-        #student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
+        # student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
-        dfs = np.ones_like(gp)*self.v
+        dfs = np.ones_like(gp) * self.v
-        scales = np.ones_like(gp)*np.sqrt(self.sigma2)
+        scales = np.ones_like(gp) * np.sqrt(self.sigma2)
-        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp),
+        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp), scale=scales)
                                        scale=scales)
        return student_t_samples.reshape(orig_shape)
--- a/GPy/models/multioutput_gp.py
+++ b/GPy/models/multioutput_gp.py
@ -9,6 +9,7 @@ from ..core.mapping import Mapping
 from .. import likelihoods
 from ..likelihoods.gaussian import Gaussian
 from .. import kern
 from ..kern import DiffKern
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
 from ..util.normalizer import Standardize
 from .. import util
@ -71,24 +72,24 @@ class MultioutputGP(GP):
        return super(MultioutputGP, self).predict_quantiles(X, quantiles, Y_metadata, kern, likelihood)
    def predictive_gradients(self, Xnew, kern=None):
        if isinstance(Xnew, list):
            Xnew, _, ind  = util.multioutput.build_XY(Xnew, None)
            #if Y_metadata is None:
                #Y_metadata={'output_index': ind}
        return super(MultioutputGP, self).predictive_gradients(Xnew, kern)
    def predictive_gradients(self, Xnew, kern=None): #XNEW IS NOT A LIST!!
        """
-        Compute the derivatives of the predicted latent function with respect to X*
+        Compute the derivatives of the predicted latent function with respect
        to X*
        Given a set of points at which to predict X* (size [N*,Q]), compute the
        derivatives of the mean and variance. Resulting arrays are sized:
-         dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
+            dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP
-        Note that this is not the same as computing the mean and variance of the derivative of the function!
+            (usually one).
        Note that this is not the same as computing the mean and variance of
        the derivative of the function!
         dv_dX*  -- [N*, Q],    (since all outputs have the same variance)
        :param X: The points at which to get the predictive gradients
        :type X: np.ndarray (Xnew x self.input_dim)
        :returns: dmu_dX, dv_dX
        :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q) ]
        """
        if isinstance(Xnew, list):
@ -96,12 +97,53 @@ class MultioutputGP(GP):
        slices = index_to_slices(Xnew[:,-1])
        for i in range(len(slices)):
            if ((self.kern.kern[i].name == 'diffKern' ) and len(slices[i])>0):
                assert 0, "It is not (yet) possible to predict gradients of gradient observations, sorry :)"
        if kern is None:
            kern = self.kern
        if all([(isinstance(k, DiffKern)) for k in self.kern.kern[1:]]):
            """
            Compute the gradients of the predicted latent function and predicted
            partial derivatives with respect to X*.
            This works only for models that observe the gradient of the latent function.
            Xnew is given as a list of arrays, where each array X*_i (size [N_i*, Q])
            contains points at which to compute gradients for each predicted latent
            function or partial derivative.
            Resulting arrays are sized [sum_i^D : N_i*, Q]
            Passing a list of only one array [X*] returns only gradients of
            the predicted latent function and does not compute gradients of
            predicted partial derivatives.
            In this case the resulting arrays are sized [N*, Q].
            :param Xnew: points at which to compute predictive gradients
            :type Xnew: list
            :type Xnew[i]: np.darray (sum_i^D : N_i*, Q)
            :returns: dmu_dX, dv_dX
            :rtype: (np.ndarray (sum_i^D : N_i*, Q), np.ndarray (sum_i^D : N_i*, Q))
            """
            dims = Xnew.shape[1] - 1
            mean_jac = np.empty((Xnew.shape[0], dims))
            var_jac = np.empty((Xnew.shape[0], dims))
            X = self._predictive_variable
            alpha = self.posterior.woodbury_vector
            Wi = self.posterior.woodbury_inv
            k = kern.K(Xnew, X)
            for dimX in range(dims):
                dk_dx = kern.dK_dX(Xnew, X, dimX)
                dk_dxdiag = kern.dK_dXdiag(Xnew, dimX)
                mean_jac[:,dimX] = np.dot(dk_dx, alpha).flatten()
                var_jac[:,dimX] = dk_dxdiag - 2*(np.dot(k, Wi)*dk_dx).sum(-1)
            return mean_jac, var_jac
        mean_jac = np.empty((Xnew.shape[0],Xnew.shape[1]-1,self.output_dim))
        for i in range(self.output_dim):
            mean_jac[:,:,i] = kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self._predictive_variable)[:,0:-1]
--- a/GPy/models/sparse_gp_coregionalized_regression.py
+++ b/GPy/models/sparse_gp_coregionalized_regression.py
@ -7,6 +7,7 @@ from ..inference.latent_function_inference import VarDTC
 from .. import kern
 from .. import util
 class SparseGPCoregionalizedRegression(SparseGP):
    """
    Sparse Gaussian Process model for heteroscedastic multioutput regression
@ -34,34 +35,65 @@ class SparseGPCoregionalizedRegression(SparseGP):
    :type kernel_name: string
    """
-    def __init__(self, X_list, Y_list, Z_list=[], kernel=None, likelihoods_list=None, num_inducing=10, X_variance=None, name='SGPCR',W_rank=1,kernel_name='coreg'):
+    def __init__(
-
+        self,
-        #Input and Output
+        X_list,
-        X,Y,self.output_index = util.multioutput.build_XY(X_list,Y_list)
+        Y_list,
        Z_list=[],
        kernel=None,
        likelihoods_list=None,
        num_inducing=10,
        X_variance=None,
        name="SGPCR",
        W_rank=1,
        kernel_name="coreg",
    ):
        # Input and Output
        X, Y, self.output_index = util.multioutput.build_XY(X_list, Y_list)
        Ny = len(Y_list)
-        #Kernel
+        # Kernel
        if kernel is None:
-            kernel = kern.RBF(X.shape[1]-1)
+            kernel = kern.RBF(X.shape[1] - 1)
-            kernel = util.multioutput.ICM(input_dim=X.shape[1]-1, num_outputs=Ny, kernel=kernel, W_rank=W_rank, name=kernel_name)
+            kernel = util.multioutput.ICM(
                input_dim=X.shape[1] - 1,
                num_outputs=Ny,
                kernel=kernel,
                W_rank=W_rank,
                name=kernel_name,
            )
-        #Likelihood
+        # Likelihood
-        likelihood = util.multioutput.build_likelihood(Y_list,self.output_index,likelihoods_list)
+        likelihood = util.multioutput.build_likelihood(
            Y_list, self.output_index, likelihoods_list
        )
-        #Inducing inputs list
+        # Inducing inputs list
        if len(Z_list):
-            assert len(Z_list) == Ny, 'Number of outputs do not match length of inducing inputs list.'
+            assert (
                len(Z_list) == Ny
            ), "Number of outputs do not match length of inducing inputs list."
        else:
-            if isinstance(num_inducing,np.int):
+            if isinstance(num_inducing, int):
                num_inducing = [num_inducing] * Ny
            num_inducing = np.asarray(num_inducing)
-            assert num_inducing.size == Ny, 'Number of outputs do not match length of inducing inputs list.'
+            assert (
-            for ni,Xi in zip(num_inducing,X_list):
+                num_inducing.size == Ny
            ), "Number of outputs do not match length of inducing inputs list."
            for ni, Xi in zip(num_inducing, X_list):
                i = np.random.permutation(Xi.shape[0])[:ni]
                Z_list.append(Xi[i].copy())
        Z, _, Iz = util.multioutput.build_XY(Z_list)
-        super(SparseGPCoregionalizedRegression, self).__init__(X, Y, Z, kernel, likelihood, inference_method=VarDTC(), Y_metadata={'output_index':self.output_index})
+        super(SparseGPCoregionalizedRegression, self).__init__(
-        self['.*inducing'][:,-1].fix()
+            X,
            Y,
            Z,
            kernel,
            likelihood,
            inference_method=VarDTC(),
            Y_metadata={"output_index": self.output_index},
        )
        self[".*inducing"][:, -1].fix()
--- a/GPy/models/ss_mrd.py
+++ b/GPy/models/ss_mrd.py
@ -5,51 +5,109 @@ The Maniforld Relevance Determination model with the spike-and-slab prior
 import numpy as np
 from ..core import Model
 from .ss_gplvm import SSGPLVM
-from GPy.core.parameterization.variational import SpikeAndSlabPrior,NormalPosterior,VariationalPrior
+from GPy.core.parameterization.variational import (
    SpikeAndSlabPrior,
    NormalPosterior,
    VariationalPrior,
 )
 from ..util.misc import param_to_array
 from ..kern import RBF
 from ..core import Param
 from numpy.linalg.linalg import LinAlgError
 class SSMRD(Model):
-    def __init__(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx = 'PCA_concat', initz = 'permute', 
+class SSMRD(Model):
-                 num_inducing=10, Zs=None, kernels=None, inference_methods=None, likelihoods=None, group_spike=True,
+    def __init__(
-                 pi=0.5, name='ss_mrd', Ynames=None, mpi_comm=None, IBP=False, alpha=2., taus=None, ):
+        self,
        Ylist,
        input_dim,
        X=None,
        X_variance=None,
        Gammas=None,
        initx="PCA_concat",
        initz="permute",
        num_inducing=10,
        Zs=None,
        kernels=None,
        inference_methods=None,
        likelihoods=None,
        group_spike=True,
        pi=0.5,
        name="ss_mrd",
        Ynames=None,
        mpi_comm=None,
        IBP=False,
        alpha=2.0,
        taus=None,
    ):
        super(SSMRD, self).__init__(name)
        self.mpi_comm = mpi_comm
        self._PROPAGATE_ = False
        # initialize X for individual models
-        X, X_variance, Gammas, fracs = self._init_X(Ylist, input_dim, X, X_variance, Gammas, initx)
+        X, X_variance, Gammas, fracs = self._init_X(
            Ylist, input_dim, X, X_variance, Gammas, initx
        )
        self.X = NormalPosterior(means=X, variances=X_variance)
        if kernels is None:
-            kernels = [RBF(input_dim, lengthscale=1./fracs, ARD=True) for i in range(len(Ylist))]
+            kernels = [
                RBF(input_dim, lengthscale=1.0 / fracs, ARD=True)
                for i in range(len(Ylist))
            ]
        if Zs is None:
-            Zs = [None]* len(Ylist)
+            Zs = [None] * len(Ylist)
        if likelihoods is None:
-            likelihoods = [None]* len(Ylist)
+            likelihoods = [None] * len(Ylist)
        if inference_methods is None:
-            inference_methods = [None]* len(Ylist)
+            inference_methods = [None] * len(Ylist)
        if IBP:
-            self.var_priors = [IBPPrior_SSMRD(len(Ylist),input_dim,alpha=alpha) for i in range(len(Ylist))]
+            self.var_priors = [
                IBPPrior_SSMRD(len(Ylist), input_dim, alpha=alpha)
                for i in range(len(Ylist))
            ]
        else:
-            self.var_priors = [SpikeAndSlabPrior_SSMRD(nModels=len(Ylist),pi=pi,learnPi=False, group_spike=group_spike) for i in range(len(Ylist))]
+            self.var_priors = [
-        self.models = [SSGPLVM(y, input_dim, X=X.copy(), X_variance=X_variance.copy(), Gamma=Gammas[i], num_inducing=num_inducing,Z=Zs[i], learnPi=False, group_spike=group_spike,
+                SpikeAndSlabPrior_SSMRD(
-                               kernel=kernels[i],inference_method=inference_methods[i],likelihood=likelihoods[i], variational_prior=self.var_priors[i], IBP=IBP, tau=None if taus is None else taus[i],
+                    nModels=len(Ylist), pi=pi, learnPi=False, group_spike=group_spike
-                               name='model_'+str(i), mpi_comm=mpi_comm, sharedX=True) for i,y in enumerate(Ylist)]
+                )
-        self.link_parameters(*(self.models+[self.X]))
+                for i in range(len(Ylist))
            ]
        self.models = [
            SSGPLVM(
                y,
                input_dim,
                X=X.copy(),
                X_variance=X_variance.copy(),
                Gamma=Gammas[i],
                num_inducing=num_inducing,
                Z=Zs[i],
                learnPi=False,
                group_spike=group_spike,
                kernel=kernels[i],
                inference_method=inference_methods[i],
                likelihood=likelihoods[i],
                variational_prior=self.var_priors[i],
                IBP=IBP,
                tau=None if taus is None else taus[i],
                name="model_" + str(i),
                mpi_comm=mpi_comm,
                sharedX=True,
            )
            for i, y in enumerate(Ylist)
        ]
        self.link_parameters(*(self.models + [self.X]))
    def _propogate_X_val(self):
-        if self._PROPAGATE_: return
+        if self._PROPAGATE_:
            return
        for m in self.models:
            m.X.mean.values[:] = self.X.mean.values
            m.X.variance.values[:] = self.X.variance.values
        varp_list = [m.X for m in self.models]
        [vp._update_inernal(varp_list) for vp in self.var_priors]
-        self._PROPAGATE_=True
+        self._PROPAGATE_ = True
    def _collate_X_gradient(self):
        self._PROPAGATE_ = False
@ -62,82 +120,88 @@ class SSMRD(Model):
    def parameters_changed(self):
        super(SSMRD, self).parameters_changed()
        [m.parameters_changed() for m in self.models]
-        self._log_marginal_likelihood = sum([m._log_marginal_likelihood for m in self.models])
+        self._log_marginal_likelihood = sum(
            [m._log_marginal_likelihood for m in self.models]
        )
        self._collate_X_gradient()
    def log_likelihood(self):
        return self._log_marginal_likelihood
-    def _init_X(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx='PCA_concat'):
+    def _init_X(
-        
+        self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx="PCA_concat"
    ):
        # Divide latent dimensions
-        idx = np.empty((input_dim,),dtype=np.int)
+        idx = np.empty((input_dim,), dtype=int)
-        residue = (input_dim)%(len(Ylist))
+        residue = (input_dim) % (len(Ylist))
        for i in range(len(Ylist)):
            if i < residue:
-                size = input_dim/len(Ylist)+1
+                size = input_dim / len(Ylist) + 1
-                idx[i*size:(i+1)*size] = i
+                idx[i * size : (i + 1) * size] = i
            else:
-                size = input_dim/len(Ylist)
+                size = input_dim / len(Ylist)
-                idx[i*size+residue:(i+1)*size+residue] = i
+                idx[i * size + residue : (i + 1) * size + residue] = i
        if X is None:
-            if initx == 'PCA_concat':
+            if initx == "PCA_concat":
-                X = np.empty((Ylist[0].shape[0],input_dim))
+                X = np.empty((Ylist[0].shape[0], input_dim))
                fracs = np.empty((input_dim,))
                from ..util.initialization import initialize_latent
                for i in range(len(Ylist)):
                    Y = Ylist[i]
-                    dim = (idx==i).sum()
+                    dim = (idx == i).sum()
-                    if dim>0:
+                    if dim > 0:
-                        x, fr = initialize_latent('PCA', dim, Y)
+                        x, fr = initialize_latent("PCA", dim, Y)
-                        X[:,idx==i] = x
+                        X[:, idx == i] = x
-                        fracs[idx==i] = fr
+                        fracs[idx == i] = fr
-            elif initx=='PCA_joint':
+            elif initx == "PCA_joint":
                y = np.hstack(Ylist)
                from ..util.initialization import initialize_latent
-                X, fracs = initialize_latent('PCA', input_dim, y)
+
                X, fracs = initialize_latent("PCA", input_dim, y)
            else:
                X = np.random.randn(Ylist[0].shape[0], input_dim)
                fracs = np.ones(input_dim)
        else:
            fracs = np.ones(input_dim)
-    
+        if X_variance is None:  # The variance of the variational approximation (S)
-        if X_variance is None: # The variance of the variational approximation (S)
+            X_variance = np.random.uniform(0, 0.1, X.shape)
            X_variance = np.random.uniform(0,.1,X.shape)
        if Gammas is None:
            Gammas = []
            for x in X:
-                gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+                gamma = np.empty_like(
                    X
                )  # The posterior probabilities of the binary variable in the variational approximation
                gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
-                gamma[gamma>1.-1e-9] = 1.-1e-9
+                gamma[gamma > 1.0 - 1e-9] = 1.0 - 1e-9
-                gamma[gamma<1e-9] = 1e-9
+                gamma[gamma < 1e-9] = 1e-9
                Gammas.append(gamma)
        return X, X_variance, Gammas, fracs
    @Model.optimizer_array.setter
    def optimizer_array(self, p):
        if self.mpi_comm != None:
-            if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0:
+            if self._IN_OPTIMIZATION_ and self.mpi_comm.rank == 0:
-                self.mpi_comm.Bcast(np.int32(1),root=0)
+                self.mpi_comm.Bcast(np.int32(1), root=0)
            self.mpi_comm.Bcast(p, root=0)
-        Model.optimizer_array.fset(self,p)
+        Model.optimizer_array.fset(self, p)
    def optimize(self, optimizer=None, start=None, **kwargs):
        self._IN_OPTIMIZATION_ = True
-        if self.mpi_comm==None:
+        if self.mpi_comm == None:
-            super(SSMRD, self).optimize(optimizer,start,**kwargs)
+            super(SSMRD, self).optimize(optimizer, start, **kwargs)
-        elif self.mpi_comm.rank==0:
+        elif self.mpi_comm.rank == 0:
-            super(SSMRD, self).optimize(optimizer,start,**kwargs)
+            super(SSMRD, self).optimize(optimizer, start, **kwargs)
-            self.mpi_comm.Bcast(np.int32(-1),root=0)
+            self.mpi_comm.Bcast(np.int32(-1), root=0)
-        elif self.mpi_comm.rank>0:
+        elif self.mpi_comm.rank > 0:
            x = self.optimizer_array.copy()
-            flag = np.empty(1,dtype=np.int32)
+            flag = np.empty(1, dtype=np.int32)
            while True:
-                self.mpi_comm.Bcast(flag,root=0)
+                self.mpi_comm.Bcast(flag, root=0)
-                if flag==1:
+                if flag == 1:
                    try:
                        self.optimizer_array = x
                        self._fail_count = 0
@ -145,7 +209,7 @@ class SSMRD(Model):
                        if self._fail_count >= self._allowed_failures:
                            raise
                        self._fail_count += 1
-                elif flag==-1:
+                elif flag == -1:
                    break
                else:
                    self._IN_OPTIMIZATION_ = False
@ -154,20 +218,42 @@ class SSMRD(Model):
 class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
-    def __init__(self, nModels, pi=0.5, learnPi=False, group_spike=True, variance = 1.0, name='SSMRDPrior', **kw):
+    def __init__(
        self,
        nModels,
        pi=0.5,
        learnPi=False,
        group_spike=True,
        variance=1.0,
        name="SSMRDPrior",
        **kw
    ):
        self.nModels = nModels
        self._b_prob_all = 0.5
-        super(SpikeAndSlabPrior_SSMRD, self).__init__(pi=pi,learnPi=learnPi,group_spike=group_spike,variance=variance, name=name, **kw)
+        super(SpikeAndSlabPrior_SSMRD, self).__init__(
            pi=pi,
            learnPi=learnPi,
            group_spike=group_spike,
            variance=variance,
            name=name,
            **kw
        )
    def _update_inernal(self, varp_list):
        """Make an update of the internal status by gathering the variational posteriors for all the individual models."""
        # The probability for the binary variable for the same latent dimension of any of the models is on.
        if self.group_spike:
-            self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
+            self._b_prob_all = 1.0 - param_to_array(varp_list[0].gamma_group)
-            [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+            [
                np.multiply(self._b_prob_all, 1.0 - vp.gamma_group, self._b_prob_all)
                for vp in varp_list[1:]
            ]
        else:
-            self._b_prob_all = 1.-param_to_array(varp_list[0].binary_prob)
+            self._b_prob_all = 1.0 - param_to_array(varp_list[0].binary_prob)
-            [np.multiply(self._b_prob_all, 1.-vp.binary_prob, self._b_prob_all) for vp in varp_list[1:]]            
+            [
                np.multiply(self._b_prob_all, 1.0 - vp.binary_prob, self._b_prob_all)
                for vp in varp_list[1:]
            ]
    def KL_divergence(self, variational_posterior):
        mu = variational_posterior.mean
@ -176,16 +262,20 @@ class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
            gamma = variational_posterior.binary_prob[0]
        else:
            gamma = variational_posterior.binary_prob
-        if len(self.pi.shape)==2:
+        if len(self.pi.shape) == 2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            idx = np.unique(gamma._raveled_index() / gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi
-        var_mean = np.square(mu)/self.variance
+        var_mean = np.square(mu) / self.variance
-        var_S = (S/self.variance - np.log(S))
+        var_S = S / self.variance - np.log(S)
-        var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
+        var_gamma = (gamma * np.log(gamma / pi)).sum() + (
-        return var_gamma +((1.-self._b_prob_all)*(np.log(self.variance)-1. +var_mean + var_S)).sum()/(2.*self.nModels)
+            (1 - gamma) * np.log((1 - gamma) / (1 - pi))
        ).sum()
        return var_gamma + (
            (1.0 - self._b_prob_all) * (np.log(self.variance) - 1.0 + var_mean + var_S)
        ).sum() / (2.0 * self.nModels)
    def update_gradients_KL(self, variational_posterior):
        mu = variational_posterior.mean
@ -195,63 +285,141 @@ class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
            gamma = variational_posterior.binary_prob.values[0]
        else:
            gamma = variational_posterior.binary_prob.values
-        if len(self.pi.shape)==2:
+        if len(self.pi.shape) == 2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            idx = np.unique(gamma._raveled_index() / gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi
        if self.group_spike:
-            tmp = self._b_prob_all/(1.-gamma)
+            tmp = self._b_prob_all / (1.0 - gamma)
-            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))/N +tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+            variational_posterior.binary_prob.gradient -= (
                np.log((1 - pi) / pi * gamma / (1.0 - gamma)) / N
                + tmp
                * (
                    (np.square(mu) + S) / self.variance
                    - np.log(S)
                    + np.log(self.variance)
                    - 1.0
                )
                / 2.0
            )
        else:
-            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+            variational_posterior.binary_prob.gradient -= (
-        mu.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
+                np.log((1 - pi) / pi * gamma / (1.0 - gamma))
-        S.gradient -= (1./self.variance - 1./S) * (1.-self._b_prob_all) /(2.*self.nModels)
+                + (
                    (np.square(mu) + S) / self.variance
                    - np.log(S)
                    + np.log(self.variance)
                    - 1.0
                )
                / 2.0
            )
        mu.gradient -= (1.0 - self._b_prob_all) * mu / (self.variance * self.nModels)
        S.gradient -= (
            (1.0 / self.variance - 1.0 / S)
            * (1.0 - self._b_prob_all)
            / (2.0 * self.nModels)
        )
        if self.learnPi:
-            raise 'Not Supported!'
+            raise "Not Supported!"
 class IBPPrior_SSMRD(VariationalPrior):
-    def __init__(self, nModels, input_dim, alpha =2., tau=None, name='IBPPrior', **kw):
+    def __init__(self, nModels, input_dim, alpha=2.0, tau=None, name="IBPPrior", **kw):
        super(IBPPrior_SSMRD, self).__init__(name=name, **kw)
        from paramz.transformations import Logexp, __fixed__
        self.nModels = nModels
        self._b_prob_all = 0.5
        self.input_dim = input_dim
-        self.variance = 1.
+        self.variance = 1.0
-        self.alpha = Param('alpha', alpha, __fixed__)
+        self.alpha = Param("alpha", alpha, __fixed__)
        self.link_parameter(self.alpha)
    def _update_inernal(self, varp_list):
        """Make an update of the internal status by gathering the variational posteriors for all the individual models."""
        # The probability for the binary variable for the same latent dimension of any of the models is on.
-        self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
+        self._b_prob_all = 1.0 - param_to_array(varp_list[0].gamma_group)
-        [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+        [
            np.multiply(self._b_prob_all, 1.0 - vp.gamma_group, self._b_prob_all)
            for vp in varp_list[1:]
        ]
    def KL_divergence(self, variational_posterior):
-        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+        mu, S, gamma, tau = (
            variational_posterior.mean.values,
            variational_posterior.variance.values,
            variational_posterior.gamma_group.values,
            variational_posterior.tau.values,
        )
-        var_mean = np.square(mu)/self.variance
+        var_mean = np.square(mu) / self.variance
-        var_S = (S/self.variance - np.log(S))
+        var_S = S / self.variance - np.log(S)
-        part1 = ((1.-self._b_prob_all)* (np.log(self.variance)-1. +var_mean + var_S)).sum()/(2.*self.nModels)
+        part1 = (
            (1.0 - self._b_prob_all) * (np.log(self.variance) - 1.0 + var_mean + var_S)
        ).sum() / (2.0 * self.nModels)
-        ad = self.alpha/self.input_dim
+        ad = self.alpha / self.input_dim
-        from scipy.special import betaln,digamma
+        from scipy.special import betaln, digamma
-        part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + (betaln(ad,1.)*self.input_dim -betaln(tau[:,0], tau[:,1]).sum())/self.nModels \
+
-                 + (( (tau[:,0]-ad)/self.nModels -gamma)*digamma(tau[:,0])).sum() + \
+        part2 = (
-                (((tau[:,1]-1.)/self.nModels+gamma-1.)*digamma(tau[:,1])).sum() + (((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*digamma(tau.sum(axis=1))).sum()
+            (gamma * np.log(gamma)).sum()
-        return part1+part2
+            + ((1.0 - gamma) * np.log(1.0 - gamma)).sum()
            + (betaln(ad, 1.0) * self.input_dim - betaln(tau[:, 0], tau[:, 1]).sum())
            / self.nModels
            + (((tau[:, 0] - ad) / self.nModels - gamma) * digamma(tau[:, 0])).sum()
            + (
                ((tau[:, 1] - 1.0) / self.nModels + gamma - 1.0) * digamma(tau[:, 1])
            ).sum()
            + (
                ((1.0 + ad - tau[:, 0] - tau[:, 1]) / self.nModels + 1.0)
                * digamma(tau.sum(axis=1))
            ).sum()
        )
        return part1 + part2
    def update_gradients_KL(self, variational_posterior):
-        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+        mu, S, gamma, tau = (
            variational_posterior.mean.values,
            variational_posterior.variance.values,
            variational_posterior.gamma_group.values,
            variational_posterior.tau.values,
        )
-        variational_posterior.mean.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
+        variational_posterior.mean.gradient -= (
-        variational_posterior.variance.gradient -= (1./self.variance - 1./S) * (1.-self._b_prob_all) /(2.*self.nModels)
+            (1.0 - self._b_prob_all) * mu / (self.variance * self.nModels)
-        from scipy.special import digamma,polygamma
+        )
-        tmp = self._b_prob_all/(1.-gamma)
+        variational_posterior.variance.gradient -= (
-        dgamma = (np.log(gamma/(1.-gamma))+ digamma(tau[:,1])-digamma(tau[:,0]))/variational_posterior.num_data
+            (1.0 / self.variance - 1.0 / S)
-        variational_posterior.binary_prob.gradient -= dgamma+tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+            * (1.0 - self._b_prob_all)
-        ad = self.alpha/self.input_dim
+            / (2.0 * self.nModels)
-        common = ((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*polygamma(1,tau.sum(axis=1))
+        )
-        variational_posterior.tau.gradient[:,0] = -(((tau[:,0]-ad)/self.nModels -gamma)*polygamma(1,tau[:,0])+common)
+        from scipy.special import digamma, polygamma
-        variational_posterior.tau.gradient[:,1] = -(((tau[:,1]-1.)/self.nModels+gamma-1.)*polygamma(1,tau[:,1])+common)
+
        tmp = self._b_prob_all / (1.0 - gamma)
        dgamma = (
            np.log(gamma / (1.0 - gamma)) + digamma(tau[:, 1]) - digamma(tau[:, 0])
        ) / variational_posterior.num_data
        variational_posterior.binary_prob.gradient -= (
            dgamma
            + tmp
            * (
                (np.square(mu) + S) / self.variance
                - np.log(S)
                + np.log(self.variance)
                - 1.0
            )
            / 2.0
        )
        ad = self.alpha / self.input_dim
        common = ((1.0 + ad - tau[:, 0] - tau[:, 1]) / self.nModels + 1.0) * polygamma(
            1, tau.sum(axis=1)
        )
        variational_posterior.tau.gradient[:, 0] = -(
            ((tau[:, 0] - ad) / self.nModels - gamma) * polygamma(1, tau[:, 0]) + common
        )
        variational_posterior.tau.gradient[:, 1] = -(
            ((tau[:, 1] - 1.0) / self.nModels + gamma - 1.0) * polygamma(1, tau[:, 1])
            + common
        )
--- a/GPy/models/state_space_cython.c
+++ b/GPy/models/state_space_cython.c
--- a/GPy/models/state_space_main.py
+++ b/GPy/models/state_space_main.py
--- a/GPy/old_tests/bcgplvm_tests.py
+++ b/GPy/old_tests/bcgplvm_tests.py
@ -17,7 +17,7 @@ class BCGPLVMTests(unittest.TestCase):
        mapping = GPy.mappings.Kernel(output_dim=input_dim, X=Y, kernel=bk)
        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
    def test_linear_backconstraint(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
@ -30,7 +30,7 @@ class BCGPLVMTests(unittest.TestCase):
        mapping = GPy.mappings.Linear(output_dim=input_dim, input_dim=output_dim)
        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
    def test_mlp_backconstraint(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
@ -43,7 +43,7 @@ class BCGPLVMTests(unittest.TestCase):
        mapping = GPy.mappings.MLP(output_dim=input_dim, input_dim=output_dim, hidden_dim=[5, 4, 7])
        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
 if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
--- a/GPy/old_tests/gp_transformation_tests.py
+++ b/GPy/old_tests/gp_transformation_tests.py
@ -1,4 +1,3 @@
 from nose.tools import with_setup
 from GPy.models import GradientChecker
 from GPy.likelihoods.noise_models import gp_transformations
 import inspect
--- a/GPy/old_tests/gplvm_tests.py
+++ b/GPy/old_tests/gplvm_tests.py
@ -15,7 +15,7 @@ class GPLVMTests(unittest.TestCase):
        k = GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
    def test_linear_kern(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
@ -26,7 +26,7 @@ class GPLVMTests(unittest.TestCase):
        k = GPy.kern.Linear(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
    def test_rbf_kern(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
@ -37,7 +37,7 @@ class GPLVMTests(unittest.TestCase):
        k = GPy.kern.RBF(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
 if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
--- a/GPy/old_tests/psi_stat_gradient_tests.py
+++ b/GPy/old_tests/psi_stat_gradient_tests.py
@ -1,8 +1,8 @@
-'''
+"""
 Created on 22 Apr 2013
@author: maxz
-'''
+"""
 import unittest
 import numpy
@ -13,42 +13,66 @@ from GPy.core.parameterization.param import Param
 from GPy.core.parameterization.transformations import Logexp
 from GPy.core.parameterization.variational import NormalPosterior
 class PsiStatModel(Model):
    def __init__(self, which, X, X_variance, Z, num_inducing, kernel):
-        super(PsiStatModel, self).__init__(name='psi stat test')
+        super(PsiStatModel, self).__init__(name="psi stat test")
        self.which = which
        self.X = Param("X", X)
-        self.X_variance = Param('X_variance', X_variance, Logexp())
+        self.X_variance = Param("X_variance", X_variance, Logexp())
        self.q = NormalPosterior(self.X, self.X_variance)
        self.Z = Param("Z", Z)
        self.N, self.input_dim = X.shape
        self.num_inducing, input_dim = Z.shape
-        assert self.input_dim == input_dim, "shape missmatch: Z:{!s} X:{!s}".format(Z.shape, X.shape)
+        assert self.input_dim == input_dim, "shape missmatch: Z:{!s} X:{!s}".format(
            Z.shape, X.shape
        )
        self.kern = kernel
        self.psi_ = self.kern.__getattribute__(self.which)(self.Z, self.q)
        self.add_parameters(self.q, self.Z, self.kern)
    def log_likelihood(self):
-        return self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance).sum()
+        return self.kern.__getattribute__(self.which)(
            self.Z, self.X, self.X_variance
        ).sum()
    def parameters_changed(self):
-        psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(numpy.ones_like(self.psi_), self.Z, self.q)
+        psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(
            numpy.ones_like(self.psi_), self.Z, self.q
        )
        self.X.gradient = psimu
        self.X_variance.gradient = psiS
-        #psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim)
+        # psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim)
-        try: psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.q)
+        try:
-        except AttributeError: psiZ = numpy.zeros_like(self.Z)
+            psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(
                numpy.ones_like(self.psi_), self.Z, self.q
            )
        except AttributeError:
            psiZ = numpy.zeros_like(self.Z)
        self.Z.gradient = psiZ
-        #psiZ = numpy.ones(self.num_inducing * self.input_dim)
+        # psiZ = numpy.ones(self.num_inducing * self.input_dim)
-        N,M = self.X.shape[0], self.Z.shape[0]
+        N, M = self.X.shape[0], self.Z.shape[0]
-        dL_dpsi0, dL_dpsi1, dL_dpsi2 = numpy.zeros([N]), numpy.zeros([N,M]), numpy.zeros([N,M,M])
+        dL_dpsi0, dL_dpsi1, dL_dpsi2 = (
-        if self.which == 'psi0': dL_dpsi0 += 1
+            numpy.zeros([N]),
-        if self.which == 'psi1': dL_dpsi1 += 1
+            numpy.zeros([N, M]),
-        if self.which == 'psi2': dL_dpsi2 += 1
+            numpy.zeros([N, M, M]),
-        self.kern.update_gradients_variational(numpy.zeros([1,1]),
+        )
-                                               dL_dpsi0,
+        if self.which == "psi0":
-                                               dL_dpsi1,
+            dL_dpsi0 += 1
-                                               dL_dpsi2, self.X, self.X_variance, self.Z)
+        if self.which == "psi1":
            dL_dpsi1 += 1
        if self.which == "psi2":
            dL_dpsi2 += 1
        self.kern.update_gradients_variational(
            numpy.zeros([1, 1]),
            dL_dpsi0,
            dL_dpsi1,
            dL_dpsi2,
            self.X,
            self.X_variance,
            self.Z,
        )
 class DPsiStatTest(unittest.TestCase):
    input_dim = 5
@ -56,128 +80,206 @@ class DPsiStatTest(unittest.TestCase):
    num_inducing = 10
    input_dim = 20
    X = numpy.random.randn(N, input_dim)
-    X_var = .5 * numpy.ones_like(X) + .4 * numpy.clip(numpy.random.randn(*X.shape), 0, 1)
+    X_var = 0.5 * numpy.ones_like(X) + 0.4 * numpy.clip(
        numpy.random.randn(*X.shape), 0, 1
    )
    Z = numpy.random.permutation(X)[:num_inducing]
    Y = X.dot(numpy.random.randn(input_dim, input_dim))
-#     kernels = [GPy.kern.Linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)), GPy.kern.RBF(input_dim, ARD=True), GPy.kern.Bias(input_dim)]
+    #     kernels = [GPy.kern.Linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)), GPy.kern.RBF(input_dim, ARD=True), GPy.kern.Bias(input_dim)]
    kernels = [
-               GPy.kern.Linear(input_dim),
+        GPy.kern.Linear(input_dim),
-               GPy.kern.RBF(input_dim),
+        GPy.kern.RBF(input_dim),
-               #GPy.kern.Bias(input_dim),
+        # GPy.kern.Bias(input_dim),
-               #GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim),
+        # GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim),
-               #GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim)
+        # GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim)
-               ]
+    ]
    def testPsi0(self):
        for k in self.kernels:
-            m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,\
+            m = PsiStatModel(
-                             num_inducing=self.num_inducing, kernel=k)
+                "psi0",
                X=self.X,
                X_variance=self.X_var,
                Z=self.Z,
                num_inducing=self.num_inducing,
                kernel=k,
            )
            m.randomize()
-            assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k._parameters_)))
+            assert m.checkgrad(), "{} x psi0".format(
                "+".join(map(lambda x: x.name, k._parameters_))
            )
    def testPsi1(self):
        for k in self.kernels:
-            m = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z,
+            m = PsiStatModel(
-                     num_inducing=self.num_inducing, kernel=k)
+                "psi1",
                X=self.X,
                X_variance=self.X_var,
                Z=self.Z,
                num_inducing=self.num_inducing,
                kernel=k,
            )
            m.randomize()
-            assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k._parameters_)))
+            assert m.checkgrad(), "{} x psi1".format(
                "+".join(map(lambda x: x.name, k._parameters_))
            )
    def testPsi2_lin(self):
        k = self.kernels[0]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
+        m = PsiStatModel(
-                 num_inducing=self.num_inducing, kernel=k)
+            "psi2",
            X=self.X,
            X_variance=self.X_var,
            Z=self.Z,
            num_inducing=self.num_inducing,
            kernel=k,
        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
            "+".join(map(lambda x: x.name, k._parameters_))
        )
    def testPsi2_lin_bia(self):
        k = self.kernels[3]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
+        m = PsiStatModel(
-                     num_inducing=self.num_inducing, kernel=k)
+            "psi2",
            X=self.X,
            X_variance=self.X_var,
            Z=self.Z,
            num_inducing=self.num_inducing,
            kernel=k,
        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
            "+".join(map(lambda x: x.name, k._parameters_))
        )
    def testPsi2_rbf(self):
        k = self.kernels[1]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
+        m = PsiStatModel(
-                     num_inducing=self.num_inducing, kernel=k)
+            "psi2",
            X=self.X,
            X_variance=self.X_var,
            Z=self.Z,
            num_inducing=self.num_inducing,
            kernel=k,
        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
            "+".join(map(lambda x: x.name, k._parameters_))
        )
    def testPsi2_rbf_bia(self):
        k = self.kernels[-1]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
+        m = PsiStatModel(
-                     num_inducing=self.num_inducing, kernel=k)
+            "psi2",
            X=self.X,
            X_variance=self.X_var,
            Z=self.Z,
            num_inducing=self.num_inducing,
            kernel=k,
        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
            "+".join(map(lambda x: x.name, k._parameters_))
        )
    def testPsi2_bia(self):
        k = self.kernels[2]
-        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
+        m = PsiStatModel(
-                     num_inducing=self.num_inducing, kernel=k)
+            "psi2",
            X=self.X,
            X_variance=self.X_var,
            Z=self.Z,
            num_inducing=self.num_inducing,
            kernel=k,
        )
        m.randomize()
-        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k._parameters_)))
+        assert m.checkgrad(), "{} x psi2".format(
            "+".join(map(lambda x: x.name, k._parameters_))
        )
 if __name__ == "__main__":
    import sys
-    interactive = 'i' in sys.argv
+
    interactive = "i" in sys.argv
    if interactive:
-#         N, num_inducing, input_dim, input_dim = 30, 5, 4, 30
+        #         N, num_inducing, input_dim, input_dim = 30, 5, 4, 30
-#         X = numpy.random.rand(N, input_dim)
+        #         X = numpy.random.rand(N, input_dim)
-#         k = GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
+        #         k = GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
-#         K = k.K(X)
+        #         K = k.K(X)
-#         Y = numpy.random.multivariate_normal(numpy.zeros(N), K, input_dim).T
+        #         Y = numpy.random.multivariate_normal(numpy.zeros(N), K, input_dim).T
-#         Y -= Y.mean(axis=0)
+        #         Y -= Y.mean(axis=0)
-#         k = GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
+        #         k = GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
-#         m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
+        #         m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
-#         m.randomize()
+        #         m.randomize()
-# #         self.assertTrue(m.checkgrad())
+        # #         assert m.checkgrad()
        numpy.random.seed(0)
        input_dim = 3
        N = 3
        num_inducing = 2
        D = 15
        X = numpy.random.randn(N, input_dim)
-        X_var = .5 * numpy.ones_like(X) + .1 * numpy.clip(numpy.random.randn(*X.shape), 0, 1)
+        X_var = 0.5 * numpy.ones_like(X) + 0.1 * numpy.clip(
            numpy.random.randn(*X.shape), 0, 1
        )
        Z = numpy.random.permutation(X)[:num_inducing]
        Y = X.dot(numpy.random.randn(input_dim, D))
-#         kernel = GPy.kern.Bias(input_dim)
+        #         kernel = GPy.kern.Bias(input_dim)
-#
+        #
-#         kernels = [GPy.kern.Linear(input_dim), GPy.kern.RBF(input_dim), GPy.kern.Bias(input_dim),
+        #         kernels = [GPy.kern.Linear(input_dim), GPy.kern.RBF(input_dim), GPy.kern.Bias(input_dim),
-#                GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim),
+        #                GPy.kern.Linear(input_dim) + GPy.kern.Bias(input_dim),
-#                GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim)]
+        #                GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim)]
-#         for k in kernels:
+        #         for k in kernels:
-#             m = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
+        #             m = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
-#                      num_inducing=num_inducing, kernel=k)
+        #                      num_inducing=num_inducing, kernel=k)
-#             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
+        #             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
-#
+        #
-        m0 = PsiStatModel('psi0', X=X, X_variance=X_var, Z=Z,
+        m0 = PsiStatModel(
-                         num_inducing=num_inducing, kernel=GPy.kern.RBF(input_dim)+GPy.kern.Bias(input_dim))
+            "psi0",
-#         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
+            X=X,
-#                          num_inducing=num_inducing, kernel=kernel)
+            X_variance=X_var,
-#         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
+            Z=Z,
-#                          num_inducing=num_inducing, kernel=kernel)
+            num_inducing=num_inducing,
-#         m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+            kernel=GPy.kern.RBF(input_dim) + GPy.kern.Bias(input_dim),
-#                          num_inducing=num_inducing, kernel=GPy.kern.RBF(input_dim))
+        )
-#         m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+        #         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=GPy.kern.Linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
+        #                          num_inducing=num_inducing, kernel=kernel)
        #         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
        #                          num_inducing=num_inducing, kernel=kernel)
        #         m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
        #                          num_inducing=num_inducing, kernel=GPy.kern.RBF(input_dim))
        #         m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
        #                          num_inducing=num_inducing, kernel=GPy.kern.Linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
        # + GPy.kern.Bias(input_dim))
-#         m = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+        #         m = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing,
+        #                          num_inducing=num_inducing,
-#                          kernel=(
+        #                          kernel=(
-#             GPy.kern.RBF(input_dim, ARD=1)
+        #             GPy.kern.RBF(input_dim, ARD=1)
-#             +GPy.kern.Linear(input_dim, ARD=1)
+        #             +GPy.kern.Linear(input_dim, ARD=1)
-#             +GPy.kern.Bias(input_dim))
+        #             +GPy.kern.Bias(input_dim))
-#                          )
+        #                          )
-#         m.ensure_default_constraints()
+        #         m.ensure_default_constraints()
-        m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+        m2 = PsiStatModel(
-                         num_inducing=num_inducing, kernel=(
+            "psi2",
-            GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1)
+            X=X,
-            #+GPy.kern.Linear(input_dim, numpy.random.rand(input_dim), ARD=1)
+            X_variance=X_var,
-            #+GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1)
+            Z=Z,
-            #+GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(), ARD=0)
+            num_inducing=num_inducing,
-            +GPy.kern.Bias(input_dim)
+            kernel=(
-            +GPy.kern.White(input_dim)
+                GPy.kern.RBF(
-            )
+                    input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1
-            )
+                )
-        #m2.ensure_default_constraints()
+                # +GPy.kern.Linear(input_dim, numpy.random.rand(input_dim), ARD=1)
                # +GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1)
                # +GPy.kern.RBF(input_dim, numpy.random.rand(), numpy.random.rand(), ARD=0)
                + GPy.kern.Bias(input_dim)
                + GPy.kern.White(input_dim)
            ),
        )
        # m2.ensure_default_constraints()
    else:
        unittest.main()
--- a/GPy/old_tests/sparse_gplvm_tests.py
+++ b/GPy/old_tests/sparse_gplvm_tests.py
@ -16,7 +16,7 @@ class sparse_GPLVMTests(unittest.TestCase):
        k = GPy.kern.Bias(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
    def test_linear_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
@ -27,7 +27,7 @@ class sparse_GPLVMTests(unittest.TestCase):
        k = GPy.kern.Linear(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
    def test_rbf_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
@ -38,7 +38,7 @@ class sparse_GPLVMTests(unittest.TestCase):
        k = GPy.kern.RBF(input_dim) + GPy.kern.White(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
 if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
--- a/GPy/plotting/gpy_plot/plot_util.py
+++ b/GPy/plotting/gpy_plot/plot_util.py
@ -72,7 +72,7 @@ def helper_predict_with_model(self, Xgrid, plot_raw, apply_link, percentiles, wh
    if 'Y_metadata' not in predict_kw:
        predict_kw['Y_metadata'] = {}
    if 'output_index' not in predict_kw['Y_metadata']:
-        predict_kw['Y_metadata']['output_index'] = Xgrid[:,-1:].astype(np.int)
+        predict_kw['Y_metadata']['output_index'] = Xgrid[:,-1:].astype(np.int64)
    mu, _ = self.predict(Xgrid, **predict_kw)
--- a/GPy/plotting/matplot_dep/base_plots.py
+++ b/GPy/plotting/matplot_dep/base_plots.py
@ -5,6 +5,7 @@ import numpy as np
 from .util import align_subplot_array, align_subplots
 def ax_default(fignum, ax):
    if ax is None:
        fig = plt.figure(fignum)
@ -13,11 +14,23 @@ def ax_default(fignum, ax):
        fig = ax.figure
    return fig, ax
 def meanplot(x, mu, color='#3300FF', ax=None, fignum=None, linewidth=2,**kw):
    _, axes = ax_default(fignum, ax)
    return axes.plot(x,mu,color=color,linewidth=linewidth,**kw)
-def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, fignum=None, **kwargs):
+def meanplot(x, mu, color="#3300FF", ax=None, fignum=None, linewidth=2, **kw):
    _, axes = ax_default(fignum, ax)
    return axes.plot(x, mu, color=color, linewidth=linewidth, **kw)
 def gpplot(
    x,
    mu,
    lower,
    upper,
    edgecol="#3300FF",
    fillcol="#33CCFF",
    ax=None,
    fignum=None,
    **kwargs
 ):
    _, axes = ax_default(fignum, ax)
    mu = mu.flatten()
@ -27,51 +40,62 @@ def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, f
    plots = []
-    #here's the mean
+    # here's the mean
    plots.append(meanplot(x, mu, edgecol, axes))
-    #here's the box
+    # here's the box
-    kwargs['linewidth']=0.5
+    kwargs["linewidth"] = 0.5
-    if not 'alpha' in kwargs.keys():
+    if not "alpha" in kwargs.keys():
-        kwargs['alpha'] = 0.3
+        kwargs["alpha"] = 0.3
-    plots.append(axes.fill(np.hstack((x,x[::-1])),np.hstack((upper,lower[::-1])),color=fillcol,**kwargs))
+    plots.append(
        axes.fill(
            np.hstack((x, x[::-1])),
            np.hstack((upper, lower[::-1])),
            color=fillcol,
            **kwargs
        )
    )
-    #this is the edge:
+    # this is the edge:
-    plots.append(meanplot(x, upper,color=edgecol, linewidth=0.2, ax=axes))
+    plots.append(meanplot(x, upper, color=edgecol, linewidth=0.2, ax=axes))
-    plots.append(meanplot(x, lower,color=edgecol, linewidth=0.2, ax=axes))
+    plots.append(meanplot(x, lower, color=edgecol, linewidth=0.2, ax=axes))
    return plots
 def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
    _, ax = ax_default(fignum, ax)
    plots = []
-    #here's the box
+    # here's the box
-    if 'linewidth' not in kwargs:
+    if "linewidth" not in kwargs:
-        kwargs['linewidth'] = 0.5
+        kwargs["linewidth"] = 0.5
-    if not 'alpha' in kwargs.keys():
+    if not "alpha" in kwargs.keys():
-        kwargs['alpha'] = 1./(len(percentiles))
+        kwargs["alpha"] = 1.0 / (len(percentiles))
    # pop where from kwargs
-    where = kwargs.pop('where') if 'where' in kwargs else None
+    where = kwargs.pop("where") if "where" in kwargs else None
    # pop interpolate, which we actually do not do here!
-    if 'interpolate' in kwargs: kwargs.pop('interpolate')
+    if "interpolate" in kwargs:
        kwargs.pop("interpolate")
    def pairwise(inlist):
        l = len(inlist)
-        for i in range(int(np.ceil(l/2.))):
+        for i in range(int(np.ceil(l / 2.0))):
-            yield inlist[:][i], inlist[:][(l-1)-i]
+            yield inlist[:][i], inlist[:][(l - 1) - i]
    polycol = []
    for y1, y2 in pairwise(percentiles):
        import matplotlib.mlab as mlab
        # Handle united data, such as dates
        ax._process_unit_info(xdata=x, ydata=y1)
        ax._process_unit_info(ydata=y2)
        # Convert the arrays so we can work with them
        from numpy import ma
        x = ma.masked_invalid(ax.convert_xunits(x))
        y1 = ma.masked_invalid(ax.convert_yunits(y1))
        y2 = ma.masked_invalid(ax.convert_yunits(y2))
@ -103,7 +127,7 @@ def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
                continue
            N = len(xslice)
-            X = np.zeros((2 * N + 2, 2), np.float)
+            X = np.zeros((2 * N + 2, 2), float)
            # the purpose of the next two lines is for when y2 is a
            # scalar like 0 and we want the fill to go all the way
@ -114,19 +138,21 @@ def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
            X[0] = start
            X[N + 1] = end
-            X[1:N + 1, 0] = xslice
+            X[1 : N + 1, 0] = xslice
-            X[1:N + 1, 1] = y1slice
+            X[1 : N + 1, 1] = y1slice
-            X[N + 2:, 0] = xslice[::-1]
+            X[N + 2 :, 0] = xslice[::-1]
-            X[N + 2:, 1] = y2slice[::-1]
+            X[N + 2 :, 1] = y2slice[::-1]
            polys.append(X)
        polycol.extend(polys)
    from matplotlib.collections import PolyCollection
    plots.append(PolyCollection(polycol, **kwargs))
    ax.add_collection(plots[-1], autolim=True)
    ax.autoscale_view()
    return plots
 def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
    _, axes = ax_default(fignum, ax)
@ -138,17 +164,19 @@ def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
    plots = []
    if edgecol is None:
-        edgecol='#3300FF'
+        edgecol = "#3300FF"
-    if not 'alpha' in kwargs.keys():
+    if not "alpha" in kwargs.keys():
-        kwargs['alpha'] = 1.
+        kwargs["alpha"] = 1.0
    if not "lw" in kwargs.keys():
        kwargs["lw"] = 1.0
-    if not 'lw' in kwargs.keys():
+    plots.append(
-        kwargs['lw'] = 1.
+        axes.errorbar(
-
+            x, mu, yerr=np.vstack([mu - lower, upper - mu]), color=edgecol, **kwargs
-
+        )
-    plots.append(axes.errorbar(x,mu,yerr=np.vstack([mu-lower,upper-mu]),color=edgecol,**kwargs))
+    )
    plots[-1][0].remove()
    return plots
@ -156,53 +184,60 @@ def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
 def removeRightTicks(ax=None):
    ax = ax or plt.gca()
    for i, line in enumerate(ax.get_yticklines()):
-        if i%2 == 1:   # odd indices
+        if i % 2 == 1:  # odd indices
            line.set_visible(False)
 def removeUpperTicks(ax=None):
    ax = ax or plt.gca()
    for i, line in enumerate(ax.get_xticklines()):
-        if i%2 == 1:   # odd indices
+        if i % 2 == 1:  # odd indices
            line.set_visible(False)
-def fewerXticks(ax=None,divideby=2):
+
 def fewerXticks(ax=None, divideby=2):
    ax = ax or plt.gca()
    ax.set_xticks(ax.get_xticks()[::divideby])
-def x_frame1D(X,plot_limits=None,resolution=None):
+
 def x_frame1D(X, plot_limits=None, resolution=None):
    """
    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
    """
-    assert X.shape[1] ==1, "x_frame1D is defined for one-dimensional inputs"
+    assert X.shape[1] == 1, "x_frame1D is defined for one-dimensional inputs"
    if plot_limits is None:
        from ...core.parameterization.variational import VariationalPosterior
        if isinstance(X, VariationalPosterior):
-            xmin,xmax = X.mean.min(0),X.mean.max(0)
+            xmin, xmax = X.mean.min(0), X.mean.max(0)
        else:
-            xmin,xmax = X.min(0),X.max(0)
+            xmin, xmax = X.min(0), X.max(0)
-        xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
+        xmin, xmax = xmin - 0.2 * (xmax - xmin), xmax + 0.2 * (xmax - xmin)
-    elif len(plot_limits)==2:
+    elif len(plot_limits) == 2:
        xmin, xmax = plot_limits
    else:
        raise ValueError("Bad limits for plotting")
-    Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
+    Xnew = np.linspace(xmin, xmax, resolution or 200)[:, None]
    return Xnew, xmin, xmax
-def x_frame2D(X,plot_limits=None,resolution=None):
+
 def x_frame2D(X, plot_limits=None, resolution=None):
    """
    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
    """
-    assert X.shape[1] ==2, "x_frame2D is defined for two-dimensional inputs"
+    assert X.shape[1] == 2, "x_frame2D is defined for two-dimensional inputs"
    if plot_limits is None:
-        xmin,xmax = X.min(0),X.max(0)
+        xmin, xmax = X.min(0), X.max(0)
-        xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
+        xmin, xmax = xmin - 0.2 * (xmax - xmin), xmax + 0.2 * (xmax - xmin)
-    elif len(plot_limits)==2:
+    elif len(plot_limits) == 2:
        xmin, xmax = plot_limits
    else:
        raise ValueError("Bad limits for plotting")
    resolution = resolution or 50
-    xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
+    xx, yy = np.mgrid[
-    Xnew = np.vstack((xx.flatten(),yy.flatten())).T
+        xmin[0] : xmax[0] : 1j * resolution, xmin[1] : xmax[1] : 1j * resolution
    ]
    Xnew = np.vstack((xx.flatten(), yy.flatten())).T
    return Xnew, xx, yy, xmin, xmax
--- a/GPy/plotting/matplot_dep/defaults.py
+++ b/GPy/plotting/matplot_dep/defaults.py
@ -1,4 +1,4 @@
-#===============================================================================
+# ===============================================================================
 # Copyright (c) 2015, Max Zwiessele
 # All rights reserved.
 #
@ -26,12 +26,12 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
+# ===============================================================================
-from matplotlib import cm
+from matplotlib import pyplot
 from .. import Tango
-'''
+"""
 This file is for defaults for the gpy plot, specific to the plotting library.
 Create a kwargs dictionary with the right name for the plotting function
@ -40,36 +40,55 @@ the plotting library will be used.
 In the code, always ise plotting.gpy_plots.defaults to get the defaults, as
 it gives back an empty default, when defaults are not defined.
-'''
+"""
 # Data plots:
-data_1d = dict(lw=1.5, marker='x', color='k')
+data_1d = dict(lw=1.5, marker="x", color="k")
-data_2d = dict(s=35, edgecolors='none', linewidth=0., cmap=cm.get_cmap('hot'), alpha=.5)
+data_2d = dict(
-inducing_1d = dict(lw=0, s=500, color=Tango.colorsHex['darkRed'])
+    s=35, edgecolors="none", linewidth=0.0, cmap=pyplot.get_cmap("hot"), alpha=0.5
-inducing_2d = dict(s=17, edgecolor='k', linewidth=.4, color='white', alpha=.5, marker='^')
+)
-inducing_3d = dict(lw=.3, s=500, color=Tango.colorsHex['darkRed'], edgecolor='k')
+inducing_1d = dict(lw=0, s=500, color=Tango.colorsHex["darkRed"])
-xerrorbar = dict(color='k', fmt='none', elinewidth=.5, alpha=.5)
+inducing_2d = dict(
-yerrorbar = dict(color=Tango.colorsHex['darkRed'], fmt='none', elinewidth=.5, alpha=.5)
+    s=17, edgecolor="k", linewidth=0.4, color="white", alpha=0.5, marker="^"
 )
 inducing_3d = dict(lw=0.3, s=500, color=Tango.colorsHex["darkRed"], edgecolor="k")
 xerrorbar = dict(color="k", fmt="none", elinewidth=0.5, alpha=0.5)
 yerrorbar = dict(
    color=Tango.colorsHex["darkRed"], fmt="none", elinewidth=0.5, alpha=0.5
 )
 # GP plots:
-meanplot_1d = dict(color=Tango.colorsHex['mediumBlue'], linewidth=2)
+meanplot_1d = dict(color=Tango.colorsHex["mediumBlue"], linewidth=2)
-meanplot_2d = dict(cmap='hot', linewidth=.5)
+meanplot_2d = dict(cmap="hot", linewidth=0.5)
-meanplot_3d = dict(linewidth=0, antialiased=True, cstride=1, rstride=1, cmap='hot', alpha=.3)
+meanplot_3d = dict(
-samples_1d = dict(color=Tango.colorsHex['mediumBlue'], linewidth=.3)
+    linewidth=0, antialiased=True, cstride=1, rstride=1, cmap="hot", alpha=0.3
-samples_3d = dict(cmap='hot', alpha=.1, antialiased=True, cstride=1, rstride=1, linewidth=0)
+)
-confidence_interval = dict(edgecolor=Tango.colorsHex['darkBlue'], linewidth=.5, color=Tango.colorsHex['lightBlue'],alpha=.2)
+samples_1d = dict(color=Tango.colorsHex["mediumBlue"], linewidth=0.3)
-density = dict(alpha=.5, color=Tango.colorsHex['lightBlue'])
+samples_3d = dict(
    cmap="hot", alpha=0.1, antialiased=True, cstride=1, rstride=1, linewidth=0
 )
 confidence_interval = dict(
    edgecolor=Tango.colorsHex["darkBlue"],
    linewidth=0.5,
    color=Tango.colorsHex["lightBlue"],
    alpha=0.2,
 )
 density = dict(alpha=0.5, color=Tango.colorsHex["lightBlue"])
 # GPLVM plots:
-data_y_1d = dict(linewidth=0, cmap='RdBu', s=40)
+data_y_1d = dict(linewidth=0, cmap="RdBu", s=40)
-data_y_1d_plot = dict(color='k', linewidth=1.5)
+data_y_1d_plot = dict(color="k", linewidth=1.5)
 # Kernel plots:
-ard = dict(edgecolor='k', linewidth=1.2)
+ard = dict(edgecolor="k", linewidth=1.2)
 # Input plots:
-latent = dict(aspect='auto', cmap='Greys', interpolation='bicubic')
+latent = dict(aspect="auto", cmap="Greys", interpolation="bicubic")
-gradient = dict(aspect='auto', cmap='RdBu', interpolation='nearest', alpha=.7)
+gradient = dict(aspect="auto", cmap="RdBu", interpolation="nearest", alpha=0.7)
-magnification = dict(aspect='auto', cmap='Greys', interpolation='bicubic')
+magnification = dict(aspect="auto", cmap="Greys", interpolation="bicubic")
-latent_scatter = dict(s=20, linewidth=.2, edgecolor='k', alpha=.9)
+latent_scatter = dict(s=20, linewidth=0.2, edgecolor="k", alpha=0.9)
-annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7)
+annotation = dict(
    fontdict=dict(family="sans-serif", weight="light", fontsize=9),
    zorder=0.3,
    alpha=0.7,
 )
--- a/GPy/plotting/matplot_dep/plot_definitions.py
+++ b/GPy/plotting/matplot_dep/plot_definitions.py
@ -1,4 +1,4 @@
-#===============================================================================
+# ===============================================================================
 # Copyright (c) 2015, Max Zwiessele
 # All rights reserved.
 #
@ -26,7 +26,7 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#===============================================================================
+# ===============================================================================
 import numpy as np
 from matplotlib import pyplot as plt
 from ..abstract_plotting_library import AbstractPlottingLibrary
@ -37,6 +37,7 @@ from .controllers import ImshowController, ImAnnotateController
 import itertools
 from .util import legend_ontop
 class MatplotlibPlots(AbstractPlottingLibrary):
    def __init__(self):
        super(MatplotlibPlots, self).__init__()
@ -49,54 +50,86 @@ class MatplotlibPlots(AbstractPlottingLibrary):
        fig.gridspec = plt.GridSpec(rows, cols, **gridspec_kwargs)
        return fig
-    def new_canvas(self, figure=None, row=1, col=1, projection='2d', xlabel=None, ylabel=None, zlabel=None, title=None, xlim=None, ylim=None, zlim=None, **kwargs):
+    def new_canvas(
-        if projection == '3d':
+        self,
        figure=None,
        row=1,
        col=1,
        projection="2d",
        xlabel=None,
        ylabel=None,
        zlabel=None,
        title=None,
        xlim=None,
        ylim=None,
        zlim=None,
        **kwargs
    ):
        if projection == "3d":
            from mpl_toolkits.mplot3d import Axes3D
-        elif projection == '2d':
+        elif projection == "2d":
            projection = None
-        if 'ax' in kwargs:
+        if "ax" in kwargs:
-            ax = kwargs.pop('ax')
+            ax = kwargs.pop("ax")
        else:
            if figure is not None:
                fig = figure
-            elif 'num' in kwargs and 'figsize' in kwargs:
+            elif "num" in kwargs and "figsize" in kwargs:
-                fig = self.figure(num=kwargs.pop('num'), figsize=kwargs.pop('figsize'))
+                fig = self.figure(num=kwargs.pop("num"), figsize=kwargs.pop("figsize"))
-            elif 'num' in kwargs:
+            elif "num" in kwargs:
-                fig = self.figure(num=kwargs.pop('num'))
+                fig = self.figure(num=kwargs.pop("num"))
-            elif 'figsize' in kwargs:
+            elif "figsize" in kwargs:
-                fig = self.figure(figsize=kwargs.pop('figsize'))
+                fig = self.figure(figsize=kwargs.pop("figsize"))
            else:
                fig = self.figure()
-            #if hasattr(fig, 'rows') and hasattr(fig, 'cols'):
+            # if hasattr(fig, 'rows') and hasattr(fig, 'cols'):
-            ax = fig.add_subplot(fig.gridspec[row-1, col-1], projection=projection)
+            ax = fig.add_subplot(fig.gridspec[row - 1, col - 1], projection=projection)
-        if xlim is not None: ax.set_xlim(xlim)
+        if xlim is not None:
-        if ylim is not None: ax.set_ylim(ylim)
+            ax.set_xlim(xlim)
-        if xlabel is not None: ax.set_xlabel(xlabel)
+        if ylim is not None:
-        if ylabel is not None: ax.set_ylabel(ylabel)
+            ax.set_ylim(ylim)
-        if title is not None: ax.set_title(title)
+        if xlabel is not None:
-        if projection == '3d':
+            ax.set_xlabel(xlabel)
-            if zlim is not None: ax.set_zlim(zlim)
+        if ylabel is not None:
-            if zlabel is not None: ax.set_zlabel(zlabel)
+            ax.set_ylabel(ylabel)
        if title is not None:
            ax.set_title(title)
        if projection == "3d":
            if zlim is not None:
                ax.set_zlim(zlim)
            if zlabel is not None:
                ax.set_zlabel(zlabel)
        return ax, kwargs
    def add_to_canvas(self, ax, plots, legend=False, title=None, **kwargs):
-        #ax.autoscale_view()
+        # ax.autoscale_view()
-        fontdict=dict(family='sans-serif', weight='light', size=9)
+        fontdict = dict(family="sans-serif", weight="light", size=9)
        if legend is True:
            ax.legend(*ax.get_legend_handles_labels())
        elif legend >= 1:
-            #ax.legend(prop=fontdict)
+            # ax.legend(prop=fontdict)
            legend_ontop(ax, ncol=legend, fontdict=fontdict)
-        if title is not None: ax.figure.suptitle(title)
+        if title is not None:
            ax.figure.suptitle(title)
        return plots
    def show_canvas(self, ax, **kwargs):
        ax.figure.canvas.draw()
        return ax.figure
-    def scatter(self, ax, X, Y, Z=None, color=Tango.colorsHex['mediumBlue'], label=None, marker='o', **kwargs):
+    def scatter(
        self,
        ax,
        X,
        Y,
        Z=None,
        color=Tango.colorsHex["mediumBlue"],
        label=None,
        marker="o",
        **kwargs
    ):
        if Z is not None:
            return ax.scatter(X, Y, c=color, zs=Z, label=label, marker=marker, **kwargs)
        return ax.scatter(X, Y, c=color, label=label, marker=marker, **kwargs)
@ -106,129 +139,258 @@ class MatplotlibPlots(AbstractPlottingLibrary):
            return ax.plot(X, Y, color=color, zs=Z, label=label, **kwargs)
        return ax.plot(X, Y, color=color, label=label, **kwargs)
-    def plot_axis_lines(self, ax, X, color=Tango.colorsHex['darkRed'], label=None, **kwargs):
+    def plot_axis_lines(
        self, ax, X, color=Tango.colorsHex["darkRed"], label=None, **kwargs
    ):
        from matplotlib import transforms
        from matplotlib.path import Path
-        if 'marker' not in kwargs:
+
-            kwargs['marker'] = Path([[-.2,0.],    [-.2,.5],    [0.,1.],    [.2,.5],     [.2,0.],     [-.2,0.]],
+        if "marker" not in kwargs:
-                                    [Path.MOVETO, Path.LINETO, Path.LINETO, Path.LINETO, Path.LINETO, Path.CLOSEPOLY])
+            kwargs["marker"] = Path(
-        if 'transform' not in kwargs:
+                [
                    [-0.2, 0.0],
                    [-0.2, 0.5],
                    [0.0, 1.0],
                    [0.2, 0.5],
                    [0.2, 0.0],
                    [-0.2, 0.0],
                ],
                [
                    Path.MOVETO,
                    Path.LINETO,
                    Path.LINETO,
                    Path.LINETO,
                    Path.LINETO,
                    Path.CLOSEPOLY,
                ],
            )
        if "transform" not in kwargs:
            if X.shape[1] == 1:
-                kwargs['transform'] = transforms.blended_transform_factory(ax.transData, ax.transAxes)
+                kwargs["transform"] = transforms.blended_transform_factory(
                    ax.transData, ax.transAxes
                )
        if X.shape[1] == 2:
-            return ax.scatter(X[:,0], X[:,1], ax.get_zlim()[0], c=color, label=label, **kwargs)
+            return ax.scatter(
                X[:, 0], X[:, 1], ax.get_zlim()[0], c=color, label=label, **kwargs
            )
        return ax.scatter(X, np.zeros_like(X), c=color, label=label, **kwargs)
-    def barplot(self, ax, x, height, width=0.8, bottom=0, color=Tango.colorsHex['mediumBlue'], label=None, **kwargs):
+    def barplot(
-        if 'align' not in kwargs:
+        self,
-            kwargs['align'] = 'center'
+        ax,
-        return ax.bar(x=x, height=height, width=width,
+        x,
-               bottom=bottom, label=label, color=color,
+        height,
-               **kwargs)
+        width=0.8,
        bottom=0,
        color=Tango.colorsHex["mediumBlue"],
        label=None,
        **kwargs
    ):
        if "align" not in kwargs:
            kwargs["align"] = "center"
        return ax.bar(
            x=x,
            height=height,
            width=width,
            bottom=bottom,
            label=label,
            color=color,
            **kwargs
        )
-    def xerrorbar(self, ax, X, Y, error, color=Tango.colorsHex['darkRed'], label=None, **kwargs):
+    def xerrorbar(
-        if not('linestyle' in kwargs or 'ls' in kwargs):
+        self, ax, X, Y, error, color=Tango.colorsHex["darkRed"], label=None, **kwargs
-            kwargs['ls'] = 'none'
+    ):
-        #if Z is not None:
+        if not ("linestyle" in kwargs or "ls" in kwargs):
            kwargs["ls"] = "none"
        # if Z is not None:
        #    return ax.errorbar(X, Y, Z, xerr=error, ecolor=color, label=label, **kwargs)
        return ax.errorbar(X, Y, xerr=error, ecolor=color, label=label, **kwargs)
-    def yerrorbar(self, ax, X, Y, error, color=Tango.colorsHex['darkRed'], label=None, **kwargs):
+    def yerrorbar(
-        if not('linestyle' in kwargs or 'ls' in kwargs):
+        self, ax, X, Y, error, color=Tango.colorsHex["darkRed"], label=None, **kwargs
-            kwargs['ls'] = 'none'
+    ):
-        #if Z is not None:
+        if not ("linestyle" in kwargs or "ls" in kwargs):
            kwargs["ls"] = "none"
        # if Z is not None:
        #    return ax.errorbar(X, Y, Z, yerr=error, ecolor=color, label=label, **kwargs)
        return ax.errorbar(X, Y, yerr=error, ecolor=color, label=label, **kwargs)
-    def imshow(self, ax, X, extent=None, label=None, vmin=None, vmax=None, **imshow_kwargs):
+    def imshow(
-        if 'origin' not in imshow_kwargs:
+        self, ax, X, extent=None, label=None, vmin=None, vmax=None, **imshow_kwargs
-            imshow_kwargs['origin'] = 'lower'
+    ):
-        #xmin, xmax, ymin, ymax = extent
+        if "origin" not in imshow_kwargs:
-        #xoffset, yoffset = (xmax - xmin) / (2. * X.shape[0]), (ymax - ymin) / (2. * X.shape[1])
+            imshow_kwargs["origin"] = "lower"
-        #xmin, xmax, ymin, ymax = extent = xmin-xoffset, xmax+xoffset, ymin-yoffset, ymax+yoffset
+        # xmin, xmax, ymin, ymax = extent
-        return ax.imshow(X, label=label, extent=extent, vmin=vmin, vmax=vmax, **imshow_kwargs)
+        # xoffset, yoffset = (xmax - xmin) / (2. * X.shape[0]), (ymax - ymin) / (2. * X.shape[1])
        # xmin, xmax, ymin, ymax = extent = xmin-xoffset, xmax+xoffset, ymin-yoffset, ymax+yoffset
        return ax.imshow(
            X, label=label, extent=extent, vmin=vmin, vmax=vmax, **imshow_kwargs
        )
-    def imshow_interact(self, ax, plot_function, extent, label=None, resolution=None, vmin=None, vmax=None, **imshow_kwargs):
+    def imshow_interact(
-        if imshow_kwargs is None: imshow_kwargs = {}
+        self,
-        if 'origin' not in imshow_kwargs:
+        ax,
-            imshow_kwargs['origin'] = 'lower'
+        plot_function,
-        return ImshowController(ax, plot_function, extent, resolution=resolution, vmin=vmin, vmax=vmax, **imshow_kwargs)
+        extent,
        label=None,
        resolution=None,
        vmin=None,
        vmax=None,
        **imshow_kwargs
    ):
        if imshow_kwargs is None:
            imshow_kwargs = {}
        if "origin" not in imshow_kwargs:
            imshow_kwargs["origin"] = "lower"
        return ImshowController(
            ax,
            plot_function,
            extent,
            resolution=resolution,
            vmin=vmin,
            vmax=vmax,
            **imshow_kwargs
        )
-    def annotation_heatmap(self, ax, X, annotation, extent=None, label=None, imshow_kwargs=None, **annotation_kwargs):
+    def annotation_heatmap(
-        if imshow_kwargs is None: imshow_kwargs = {}
+        self,
-        if 'origin' not in imshow_kwargs:
+        ax,
-            imshow_kwargs['origin'] = 'lower'
+        X,
-        if ('ha' not in annotation_kwargs) and ('horizontalalignment' not in annotation_kwargs):
+        annotation,
-            annotation_kwargs['ha'] = 'center'
+        extent=None,
-        if ('va' not in annotation_kwargs) and ('verticalalignment' not in annotation_kwargs):
+        label=None,
-            annotation_kwargs['va'] = 'center'
+        imshow_kwargs=None,
        **annotation_kwargs
    ):
        if imshow_kwargs is None:
            imshow_kwargs = {}
        if "origin" not in imshow_kwargs:
            imshow_kwargs["origin"] = "lower"
        if ("ha" not in annotation_kwargs) and (
            "horizontalalignment" not in annotation_kwargs
        ):
            annotation_kwargs["ha"] = "center"
        if ("va" not in annotation_kwargs) and (
            "verticalalignment" not in annotation_kwargs
        ):
            annotation_kwargs["va"] = "center"
        imshow = self.imshow(ax, X, extent, label, **imshow_kwargs)
        if extent is None:
            extent = (0, X.shape[0], 0, X.shape[1])
        xmin, xmax, ymin, ymax = extent
-        xoffset, yoffset = (xmax - xmin) / (2. * X.shape[0]), (ymax - ymin) / (2. * X.shape[1])
+        xoffset, yoffset = (xmax - xmin) / (2.0 * X.shape[0]), (ymax - ymin) / (
            2.0 * X.shape[1]
        )
        xlin = np.linspace(xmin, xmax, X.shape[0], endpoint=False)
        ylin = np.linspace(ymin, ymax, X.shape[1], endpoint=False)
        annotations = []
        for [i, x], [j, y] in itertools.product(enumerate(xlin), enumerate(ylin)):
-            annotations.append(ax.text(x+xoffset, y+yoffset, "{}".format(annotation[j, i]), **annotation_kwargs))
+            annotations.append(
                ax.text(
                    x + xoffset,
                    y + yoffset,
                    "{}".format(annotation[j, i]),
                    **annotation_kwargs
                )
            )
        return imshow, annotations
-    def annotation_heatmap_interact(self, ax, plot_function, extent, label=None, resolution=15, imshow_kwargs=None, **annotation_kwargs):
+    def annotation_heatmap_interact(
-        if imshow_kwargs is None: imshow_kwargs = {}
+        self,
-        if 'origin' not in imshow_kwargs:
+        ax,
-            imshow_kwargs['origin'] = 'lower'
+        plot_function,
-        return ImAnnotateController(ax, plot_function, extent, resolution=resolution, imshow_kwargs=imshow_kwargs or {}, **annotation_kwargs)
+        extent,
        label=None,
        resolution=15,
        imshow_kwargs=None,
        **annotation_kwargs
    ):
        if imshow_kwargs is None:
            imshow_kwargs = {}
        if "origin" not in imshow_kwargs:
            imshow_kwargs["origin"] = "lower"
        return ImAnnotateController(
            ax,
            plot_function,
            extent,
            resolution=resolution,
            imshow_kwargs=imshow_kwargs or {},
            **annotation_kwargs
        )
    def contour(self, ax, X, Y, C, levels=20, label=None, **kwargs):
-        return ax.contour(X, Y, C, levels=np.linspace(C.min(), C.max(), levels), label=label, **kwargs)
+        return ax.contour(
            X, Y, C, levels=np.linspace(C.min(), C.max(), levels), label=label, **kwargs
        )
    def surface(self, ax, X, Y, Z, color=None, label=None, **kwargs):
        """Draw a surface on ``ax`` by delegating to ``ax.plot_surface``.

        :param ax: axes object providing ``plot_surface``.
        :param X, Y, Z: positional data handed through unchanged.
        :param color: accepted by the signature but not forwarded to
            ``plot_surface``.
        :param label: passed on as the ``label`` keyword.
        :param kwargs: any further keywords, passed on unchanged.
        :returns: whatever ``ax.plot_surface`` returns.
        """
        # ``label`` is a named parameter, so it can never also be in kwargs.
        surface_kwargs = dict(kwargs, label=label)
        return ax.plot_surface(X, Y, Z, **surface_kwargs)
-    def fill_between(self, ax, X, lower, upper, color=Tango.colorsHex['mediumBlue'], label=None, **kwargs):
+    def fill_between(
        self,
        ax,
        X,
        lower,
        upper,
        color=Tango.colorsHex["mediumBlue"],
        label=None,
        **kwargs
    ):
        return ax.fill_between(X, lower, upper, facecolor=color, label=label, **kwargs)
-    def fill_gradient(self, canvas, X, percentiles, color=Tango.colorsHex['mediumBlue'], label=None, **kwargs):
+    def fill_gradient(
        self,
        canvas,
        X,
        percentiles,
        color=Tango.colorsHex["mediumBlue"],
        label=None,
        **kwargs
    ):
        ax = canvas
        plots = []
-        if 'edgecolors' not in kwargs:
+        if "edgecolors" not in kwargs:
-            kwargs['edgecolors'] = 'none'
+            kwargs["edgecolors"] = "none"
-        if 'facecolors' in kwargs:
+        if "facecolors" in kwargs:
-            color = kwargs.pop('facecolors')
+            color = kwargs.pop("facecolors")
-        if 'array' in kwargs:
+        if "array" in kwargs:
-            array = kwargs.pop('array')
+            array = kwargs.pop("array")
        else:
-            array = 1.-np.abs(np.linspace(-.97, .97, len(percentiles)-1))
+            array = 1.0 - np.abs(np.linspace(-0.97, 0.97, len(percentiles) - 1))
-        if 'alpha' in kwargs:
+        if "alpha" in kwargs:
-            alpha = kwargs.pop('alpha')
+            alpha = kwargs.pop("alpha")
        else:
-            alpha = .8
+            alpha = 0.8
-        if 'cmap' in kwargs:
+        if "cmap" in kwargs:
-            cmap = kwargs.pop('cmap')
+            cmap = kwargs.pop("cmap")
        else:
-            cmap = LinearSegmentedColormap.from_list('WhToColor', (color, color), N=array.size)
+            cmap = LinearSegmentedColormap.from_list(
                "WhToColor", (color, color), N=array.size
            )
        cmap._init()
-        cmap._lut[:-3, -1] = alpha*array
+        cmap._lut[:-3, -1] = alpha * array
-        kwargs['facecolors'] = [cmap(i) for i in np.linspace(0,1,cmap.N)]
+        kwargs["facecolors"] = [cmap(i) for i in np.linspace(0, 1, cmap.N)]
        # pop where from kwargs
-        where = kwargs.pop('where') if 'where' in kwargs else None
+        where = kwargs.pop("where") if "where" in kwargs else None
        # pop interpolate, which we actually do not do here!
-        if 'interpolate' in kwargs: kwargs.pop('interpolate')
+        if "interpolate" in kwargs:
            kwargs.pop("interpolate")
        def pairwise(iterable):
            "s -> (s0,s1), (s1,s2), (s2, s3), ..."
            from itertools import tee
-            #try:
+
            # try:
            #    from itertools import izip as zip
-            #except ImportError:
+            # except ImportError:
            #    pass
            a, b = tee(iterable)
            next(b, None)
@ -245,6 +407,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
            ax._process_unit_info(ydata=y2)
            # Convert the arrays so we can work with them
            from numpy import ma
            x = ma.masked_invalid(ax.convert_xunits(X))
            y1 = ma.masked_invalid(ax.convert_yunits(y1))
            y2 = ma.masked_invalid(ax.convert_yunits(y2))
@ -263,6 +426,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                raise ValueError("Argument dimensions are incompatible")
            from functools import reduce
            mask = reduce(ma.mask_or, [ma.getmask(a) for a in (x, y1, y2)])
            if mask is not ma.nomask:
                where &= ~mask
@ -277,7 +441,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                    continue
                N = len(xslice)
-                p = np.zeros((2 * N + 2, 2), np.float)
+                p = np.zeros((2 * N + 2, 2), float)
                # the purpose of the next two lines is for when y2 is a
                # scalar like 0 and we want the fill to go all the way
@ -288,16 +452,17 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                p[0] = start
                p[N + 1] = end
-                p[1:N + 1, 0] = xslice
+                p[1 : N + 1, 0] = xslice
-                p[1:N + 1, 1] = y1slice
+                p[1 : N + 1, 1] = y1slice
-                p[N + 2:, 0] = xslice[::-1]
+                p[N + 2 :, 0] = xslice[::-1]
-                p[N + 2:, 1] = y2slice[::-1]
+                p[N + 2 :, 1] = y2slice[::-1]
                polys.append(p)
            polycol.extend(polys)
        from matplotlib.collections import PolyCollection
-        if 'zorder' not in kwargs:
+
-            kwargs['zorder'] = 0
+        if "zorder" not in kwargs:
            kwargs["zorder"] = 0
        plots.append(PolyCollection(polycol, label=label, **kwargs))
        ax.add_collection(plots[-1], autolim=True)
        ax.autoscale_view()
--- a/GPy/plotting/matplot_dep/variational_plots.py
+++ b/GPy/plotting/matplot_dep/variational_plots.py
@ -1,4 +1,6 @@
-from matplotlib import pyplot as pb, numpy as np
+from matplotlib import pyplot as pb
 import numpy as np
 def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
    """
@ -17,6 +19,7 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
    if colors is None:
        from ..Tango import mediumList
        from itertools import cycle
        colors = cycle(mediumList)
        pb.clf()
    else:
@ -33,21 +36,30 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
            a = ax[i]
        else:
            raise ValueError("Need one ax per latent dimension input_dim")
-        bg_lines.append(a.plot(means, c='k', alpha=.3))
+        bg_lines.append(a.plot(means, c="k", alpha=0.3))
-        lines.extend(a.plot(x, means.T[i], c=next(colors), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
+        lines.extend(
-        fills.append(a.fill_between(x,
+            a.plot(
-                        means.T[i] - 2 * np.sqrt(variances.T[i]),
+                x, means.T[i], c=next(colors), label=r"$\mathbf{{X_{{{}}}}}$".format(i)
-                        means.T[i] + 2 * np.sqrt(variances.T[i]),
+            )
-                        facecolor=lines[-1].get_color(),
+        )
-                        alpha=.3))
+        fills.append(
-        a.legend(borderaxespad=0.)
+            a.fill_between(
                x,
                means.T[i] - 2 * np.sqrt(variances.T[i]),
                means.T[i] + 2 * np.sqrt(variances.T[i]),
                facecolor=lines[-1].get_color(),
                alpha=0.3,
            )
        )
        a.legend(borderaxespad=0.0)
        a.set_xlim(x.min(), x.max())
        if i < means.shape[1] - 1:
-            a.set_xticklabels('')
+            a.set_xticklabels("")
    pb.draw()
-    a.figure.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
+    a.figure.tight_layout(h_pad=0.01)  # , rect=(0, 0, 1, .95))
    return dict(lines=lines, fills=fills, bg_lines=bg_lines)
 def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_side=True):
    """
    Plot latent space X in 1D:
@ -62,45 +74,60 @@ def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_sid
    """
    if ax is None:
        if side_by_side:
-            fig = pb.figure(num=fignum, figsize=(16, min(12, (2 * parameterized.mean.shape[1]))))
+            fig = pb.figure(
                num=fignum, figsize=(16, min(12, (2 * parameterized.mean.shape[1])))
            )
        else:
-            fig = pb.figure(num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1]))))
+            fig = pb.figure(
                num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1])))
            )
    if colors is None:
        from ..Tango import mediumList
        from itertools import cycle
        colors = cycle(mediumList)
        pb.clf()
    else:
        colors = iter(colors)
    plots = []
-    means, variances, gamma = parameterized.mean, parameterized.variance, parameterized.binary_prob
+    means, variances, gamma = (
        parameterized.mean,
        parameterized.variance,
        parameterized.binary_prob,
    )
    x = np.arange(means.shape[0])
    for i in range(means.shape[1]):
        if side_by_side:
-            sub1 = (means.shape[1],2,2*i+1)
+            sub1 = (means.shape[1], 2, 2 * i + 1)
-            sub2 = (means.shape[1],2,2*i+2)
+            sub2 = (means.shape[1], 2, 2 * i + 2)
        else:
-            sub1 = (means.shape[1]*2,1,2*i+1)
+            sub1 = (means.shape[1] * 2, 1, 2 * i + 1)
-            sub2 = (means.shape[1]*2,1,2*i+2)
+            sub2 = (means.shape[1] * 2, 1, 2 * i + 2)
        # mean and variance plot
        a = fig.add_subplot(*sub1)
-        a.plot(means, c='k', alpha=.3)
+        a.plot(means, c="k", alpha=0.3)
-        plots.extend(a.plot(x, means.T[i], c=next(colors), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
+        plots.extend(
-        a.fill_between(x,
+            a.plot(
-                        means.T[i] - 2 * np.sqrt(variances.T[i]),
+                x, means.T[i], c=next(colors), label=r"$\mathbf{{X_{{{}}}}}$".format(i)
-                        means.T[i] + 2 * np.sqrt(variances.T[i]),
+            )
-                        facecolor=plots[-1].get_color(),
+        )
-                        alpha=.3)
+        a.fill_between(
-        a.legend(borderaxespad=0.)
+            x,
            means.T[i] - 2 * np.sqrt(variances.T[i]),
            means.T[i] + 2 * np.sqrt(variances.T[i]),
            facecolor=plots[-1].get_color(),
            alpha=0.3,
        )
        a.legend(borderaxespad=0.0)
        a.set_xlim(x.min(), x.max())
        if i < means.shape[1] - 1:
-            a.set_xticklabels('')
+            a.set_xticklabels("")
        # binary prob plot
        a = fig.add_subplot(*sub2)
-        a.bar(x,gamma[:,i],bottom=0.,linewidth=1.,width=1.0,align='center')
+        a.bar(x, gamma[:, i], bottom=0.0, linewidth=1.0, width=1.0, align="center")
        a.set_xlim(x.min(), x.max())
-        a.set_ylim([0.,1.])
+        a.set_ylim([0.0, 1.0])
    pb.draw()
-    fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
+    fig.tight_layout(h_pad=0.01)  # , rect=(0, 0, 1, .95))
    return fig
--- a/GPy/testing/init.py
+++ b/GPy/testing/init.py
@ -1,9 +0,0 @@
 # Copyright (c) 2014, Max Zwiessele, GPy Authors
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import unittest
 import sys
 def deepTest(reason):
    """Return a decorator that keeps or skips a "deep" test.

    :param reason: truthy to keep the decorated object as-is; falsy to
        get a ``unittest.skip`` decorator carrying the fixed explanation.
    :returns: an identity function or a ``unittest.skip`` decorator.
    """
    if not reason:
        return unittest.skip("Not deep scanning, enable deepscan by adding 'deep' argument to unittest call")
    return lambda x: x
--- a/GPy/testing/cython_tests.py
+++ b/GPy/testing/cython_tests.py
@ -1,81 +0,0 @@
 import numpy as np
 import scipy as sp
 from GPy.util import choleskies
 import GPy
 import unittest
 from ..util.config import config
 try:
    from ..util import choleskies_cython
    choleskies_cython_working = config.getboolean('cython', 'working')
 except ImportError:
    choleskies_cython_working = False
 try:
    from ..kern.src import stationary_cython
    stationary_cython_working = config.getboolean('cython', 'working')
 except ImportError:
    stationary_cython_working = False
 """
 These tests make sure that the pure python and cython codes work the same
 """
# Checks that the pure-Python and Cython conversions between the flat and
# triangular representations in GPy.util.choleskies give the same result.
# Skipped when choleskies_cython could not be imported or the
# 'cython'/'working' config flag is false (see the try/except above).
@unittest.skipIf(not choleskies_cython_working,"Cython cholesky module has not been built on this machine")
 class CythonTestChols(np.testing.TestCase):
    def setUp(self):
        # Random 45 x 5 input for the flat -> triangular direction.
        # NOTE(review): 45 = 9*10/2, presumably the packed lower triangle of
        # a 9 x 9 matrix with 5 stacked outputs -- confirm against choleskies.
        self.flat = np.random.randn(45,5)
        # Stack of three 20 x 20 identity matrices for the reverse direction.
        self.triang = np.array([np.eye(20) for i in range(3)])
    def test_flat_to_triang(self):
        # Both implementations must agree on the same flat input.
        L1 = choleskies._flat_to_triang_pure(self.flat)
        L2 = choleskies._flat_to_triang_cython(self.flat)
        np.testing.assert_allclose(L1, L2)
    def test_triang_to_flat(self):
        # Both implementations must agree on the same triangular input.
        A1 = choleskies._triang_to_flat_pure(self.triang)
        A2 = choleskies._triang_to_flat_cython(self.triang)
        np.testing.assert_allclose(A1, A2)
# Checks that the pure-Python and Cython gradient routines of an RBF kernel
# return matching values. Skipped when stationary_cython could not be
# imported or the 'cython'/'working' config flag is false.
@unittest.skipIf(not stationary_cython_working,"Cython stationary module has not been built on this machine")
 class test_stationary(np.testing.TestCase):
    def setUp(self):
        # RBF kernel constructed with argument 10, matching the 10 columns
        # of X and Z below.
        self.k = GPy.kern.RBF(10)
        # Random inputs: 300 x 10 for X, 20 x 10 for Z.
        self.X = np.random.randn(300,10)
        self.Z = np.random.randn(20,10)
        # Random matrices shaped like the X-X, Z-Z and X-Z pairings; used as
        # the first argument of the gradient routines under test.
        self.dKxx = np.random.randn(300,300)
        self.dKzz = np.random.randn(20,20)
        self.dKxz = np.random.randn(300,20)
    def test_square_gradX(self):
        # Gradient w.r.t. X with only X supplied (square case).
        g1 = self.k._gradients_X_cython(self.dKxx, self.X)
        g2 = self.k._gradients_X_pure(self.dKxx, self.X)
        np.testing.assert_allclose(g1, g2)
    def test_rect_gradx(self):
        # Gradient w.r.t. X with a separate Z supplied (rectangular case).
        g1 = self.k._gradients_X_cython(self.dKxz, self.X, self.Z)
        g2 = self.k._gradients_X_pure(self.dKxz, self.X, self.Z)
        np.testing.assert_allclose(g1, g2)
    def test_square_lengthscales(self):
        # Lengthscale gradients with X passed as both inputs.
        g1 = self.k._lengthscale_grads_pure(self.dKxx, self.X, self.X)
        g2 = self.k._lengthscale_grads_cython(self.dKxx, self.X, self.X)
        np.testing.assert_allclose(g1, g2)
    def test_rect_lengthscales(self):
        # Lengthscale gradients with distinct X and Z inputs.
        g1 = self.k._lengthscale_grads_pure(self.dKxz, self.X, self.Z)
        g2 = self.k._lengthscale_grads_cython(self.dKxz, self.X, self.Z)
        np.testing.assert_allclose(g1, g2)
# Checks that the pure-Python backprop gradient in GPy.util.choleskies
# matches both Cython variants (backprop_gradient and
# backprop_gradient_par_c). Skipped when the Cython cholesky module is not
# usable on this machine.
@unittest.skipIf(not choleskies_cython_working,"Cython cholesky module has not been built on this machine")
 class test_choleskies_backprop(np.testing.TestCase):
    def setUp(self):
        # Build a 10 x 10 matrix as a.dot(a.T) from a random 10 x 12 matrix.
        a =np.random.randn(10,12)
        A = a.dot(a.T)
        # NOTE(review): jitchol presumably returns a Cholesky factor of A
        # (adding jitter if needed) -- confirm in GPy.util.linalg.
        self.L = GPy.util.linalg.jitchol(A)
        # Random 10 x 10 matrix passed as the first argument to each routine.
        self.dL = np.random.randn(10,10)
    def test(self):
        # r1 is the reference; r2 and r3 are the two Cython implementations.
        r1 = choleskies._backprop_gradient_pure(self.dL, self.L)
        r2 = choleskies_cython.backprop_gradient(self.dL, self.L)
        r3 = choleskies_cython.backprop_gradient_par_c(self.dL, self.L)
        np.testing.assert_allclose(r1, r2)
        np.testing.assert_allclose(r1, r3)
--- a/GPy/testing/deactivated/deactivated_test_examples.py
+++ b/GPy/testing/deactivated/deactivated_test_examples.py
@ -1,61 +1,65 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import unittest
 import numpy as np
 import GPy
 import inspect
 import pkgutil
 import os
 import random
 from nose.tools import nottest
 import sys
 import itertools
 class ExamplesTests(unittest.TestCase):
    def _checkgrad(self, Model):
        self.assertTrue(Model.checkgrad())
-    def _model_instance(self, Model):
+def check_grad(Model):
-        self.assertTrue(isinstance(Model, GPy.models))
+    assert Model.checkgrad(), "Gradient check failed!"
 def check_model_instance(Model):
    """Assert that ``Model`` is a GPy model object.

    :param Model: the object to check.
    :raises AssertionError: with the message "Wrong type!" when ``Model``
        is not an instance of ``GPy.core.model.Model``.
    """
    # ``GPy.models`` is the namespace that holds the model classes (e.g.
    # ``GPy.models.SparseGPRegression``), not a class itself, and
    # isinstance() raises TypeError when its second argument is not a type.
    # Check against the model base class instead, which is the same class
    # ``model_instance`` in this file tests for.
    assert isinstance(Model, GPy.core.model.Model), "Wrong type!"
 def model_checkgrads(model):
    model.randomize()
-    #NOTE: Step as 1e-4, this should be acceptable for more peaky models
+    # NOTE: Step as 1e-4, this should be acceptable for more peaky models
    return model.checkgrad(step=1e-4)
 def model_instance(model):
    """Report whether ``model`` is an instance of ``GPy.core.model.Model``.

    :param model: the object to check.
    :returns: ``True`` or ``False`` from the ``isinstance`` check.
    """
    expected_type = GPy.core.model.Model
    return isinstance(model, expected_type)
 def flatten_nested(lst):
    result = []
    for element in lst:
-        if hasattr(element, '__iter__'):
+        if hasattr(element, "__iter__"):
            result.extend(flatten_nested(element))
        else:
            result.append(element)
    return result
-@nottest
+
 def test_models():
-    optimize=False
+    optimize = False
-    plot=True
+    plot = True
    examples_path = os.path.dirname(GPy.examples.__file__)
    # Load modules
    failing_models = {}
-    for loader, module_name, is_pkg in pkgutil.iter_modules([examples_path]):
+    for loader, module_name, _is_pkg in pkgutil.iter_modules([examples_path]):
        # Load examples
        module_examples = loader.find_module(module_name).load_module(module_name)
        print("MODULE", module_examples)
        print("Before")
        print(inspect.getmembers(module_examples, predicate=inspect.isfunction))
-        functions = [ func for func in inspect.getmembers(module_examples, predicate=inspect.isfunction) if func[0].startswith('_') is False ][::-1]
+        functions = [
            func
            for func in inspect.getmembers(
                module_examples, predicate=inspect.isfunction
            )
            if func[0].startswith("_") is False
        ][::-1]
        print("After")
        print(functions)
        for example in functions:
-            if example[0] in ['epomeo_gpx']:
+            if example[0] in ["epomeo_gpx"]:
-                #These are the edge cases that we might want to handle specially
+                # These are the edge cases that we might want to handle specially
-                if example[0] == 'epomeo_gpx' and not GPy.util.datasets.gpxpy_available:
+                if example[0] == "epomeo_gpx" and not GPy.util.datasets.gpxpy_available:
                    print("Skipping as gpxpy is not available to parse GPS")
                    continue
@ -63,14 +67,14 @@ def test_models():
            # Generate model
            try:
-                models = [ example[1](optimize=optimize, plot=plot) ]
+                models = [example[1](optimize=optimize, plot=plot)]
-                #If more than one model returned, flatten them
+                # If more than one model returned, flatten them
                models = flatten_nested(models)
            except Exception as e:
                failing_models[example[0]] = "Cannot make model: \n{e}".format(e=e)
            else:
                print(models)
-                model_checkgrads.description = 'test_checkgrads_%s' % example[0]
+                model_checkgrads.description = "test_checkgrads_%s" % example[0]
                try:
                    for model in models:
                        if not model_checkgrads(model):
@ -78,7 +82,7 @@ def test_models():
                except Exception as e:
                    failing_models[model_checkgrads.description] = e
-                model_instance.description = 'test_instance_%s' % example[0]
+                model_instance.description = "test_instance_%s" % example[0]
                try:
                    for model in models:
                        if not model_instance(model):
@ -86,8 +90,8 @@ def test_models():
                except Exception as e:
                    failing_models[model_instance.description] = e
-            #yield model_checkgrads, model
+            # yield model_checkgrads, model
-            #yield model_instance, model
+            # yield model_instance, model
        print("Finished checking module {m}".format(m=module_name))
        if len(failing_models.keys()) > 0:
@ -97,9 +101,3 @@ def test_models():
    if len(failing_models.keys()) > 0:
        print(failing_models)
        raise Exception(failing_models)
 if __name__ == "__main__":
    print("Running unit tests, please be (very) patient...")
    # unittest.main()
    test_models()
--- a/GPy/testing/deactivated/deactivated_test_mpi.py
+++ b/GPy/testing/deactivated/deactivated_test_mpi.py
@ -1,16 +1,12 @@
 # Copyright (c) 2013-2014, Zhenwen Dai
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import unittest
 import numpy as np
 import GPy
 try:
    from mpi4py import MPI
    import subprocess
-    class MPITests(unittest.TestCase):
+    class TestMPI:
        def test_BayesianGPLVM_MPI(self):
            code = """
 import numpy as np
@ -33,16 +29,19 @@ if comm.rank==0:
    m._trigger_params_changed()
    print float(m.objective_function())
            """
-            with open('mpi_test__.py','w') as f:
+            with open("mpi_test__.py", "w") as f:
                f.write(code)
                f.close()
-            p = subprocess.Popen('mpirun -n 4 python mpi_test__.py',stdout=subprocess.PIPE,shell=True)
+            p = subprocess.Popen(
-            (stdout, stderr) = p.communicate()
+                "mpirun -n 4 python mpi_test__.py", stdout=subprocess.PIPE, shell=True
-            L1 =  float(stdout.splitlines()[-2])
+            )
-            L2 =  float(stdout.splitlines()[-1])
+            (stdout, _stderr) = p.communicate()
-            self.assertTrue(np.allclose(L1,L2))
+            L1 = float(stdout.splitlines()[-2])
            L2 = float(stdout.splitlines()[-1])
            self.assertTrue(np.allclose(L1, L2))
            import os
-            os.remove('mpi_test__.py')
+
            os.remove("mpi_test__.py")
        def test_SparseGPRegression_MPI(self):
            code = """
@ -66,27 +65,19 @@ if comm.rank==0:
    m._trigger_params_changed()
    print float(m.objective_function())
            """
-            with open('mpi_test__.py','w') as f:
+            with open("mpi_test__.py", "w") as f:
                f.write(code)
                f.close()
-            p = subprocess.Popen('mpirun -n 4 python mpi_test__.py',stdout=subprocess.PIPE,shell=True)
+            p = subprocess.Popen(
                "mpirun -n 4 python mpi_test__.py", stdout=subprocess.PIPE, shell=True
            )
            (stdout, stderr) = p.communicate()
-            L1 =  float(stdout.splitlines()[-2])
+            L1 = float(stdout.splitlines()[-2])
-            L2 =  float(stdout.splitlines()[-1])
+            L2 = float(stdout.splitlines()[-1])
-            self.assertTrue(np.allclose(L1,L2))
+            assert np.allclose(L1, L2)
            import os
            os.remove('mpi_test__.py')
            os.remove("mpi_test__.py")
 except:
    pass
 if __name__ == "__main__":
    print("Running unit tests, please be (very) patient...")
    try:
        import mpi4py
        unittest.main()
    except:
        pass
--- a/GPy/testing/fitc.py
+++ b/GPy/testing/fitc.py
@ -1,34 +1,38 @@
 # Copyright (c) 2014, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import unittest
 import numpy as np
 import GPy
-class FITCtest(unittest.TestCase):
+
-    def setUp(self):
+class FITCtest:
    def setup(self):
        ######################################
        # # 1 dimensional example
        N = 20
        # sample inputs and outputs
-        self.X1D = np.random.uniform(-3., 3., (N, 1))
+        self.X1D = np.random.uniform(-3.0, 3.0, (N, 1))
        self.Y1D = np.sin(self.X1D) + np.random.randn(N, 1) * 0.05
        ######################################
        # # 2 dimensional example
        # sample inputs and outputs
-        self.X2D = np.random.uniform(-3., 3., (N, 2))
+        self.X2D = np.random.uniform(-3.0, 3.0, (N, 2))
-        self.Y2D = np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2]) + np.random.randn(N, 1) * 0.05
+        self.Y2D = (
            np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2])
            + np.random.randn(N, 1) * 0.05
        )
    def test_fitc_1d(self):
        self.setup()
        m = GPy.models.SparseGPRegression(self.X1D, self.Y1D)
-        m.inference_method=GPy.inference.latent_function_inference.FITC()
+        m.inference_method = GPy.inference.latent_function_inference.FITC()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad(), "Gradient check failed!"
    def test_fitc_2d(self):
        self.setup()
        m = GPy.models.SparseGPRegression(self.X2D, self.Y2D)
-        m.inference_method=GPy.inference.latent_function_inference.FITC()
+        m.inference_method = GPy.inference.latent_function_inference.FITC()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad(), "Gradient check failed!"
--- a/GPy/testing/gpy_kernels_state_space_tests.py
+++ b/GPy/testing/gpy_kernels_state_space_tests.py
@ -1,454 +0,0 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2015, Alex Grigorevskiy
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 """
 Testing state space related functions.
 """
 import unittest
 import numpy as np
 import GPy
 import GPy.models.state_space_model as SS_model
 from .state_space_main_tests import generate_x_points, generate_sine_data, \
    generate_linear_data, generate_brownian_data, generate_linear_plus_sin
 from nose import SkipTest
 #from state_space_main_tests import generate_x_points, generate_sine_data, \
 #    generate_linear_data, generate_brownian_data, generate_linear_plus_sin
 class StateSpaceKernelsTests(np.testing.TestCase):
    def setUp(self):
        """Nothing to prepare: each test seeds numpy and builds its own data."""
        return None
    def run_for_model(self, X, Y, ss_kernel, kalman_filter_type = 'regular',
                      use_cython=False, check_gradients=True,
                      optimize=True, optimize_max_iters=250, predict_X=None,
                      compare_with_GP=True, gp_kernel=None,
                      mean_compare_decimal=10, var_compare_decimal=7):
        """
        Build a state-space model and compare it with a plain GP regression.

        A ``StateSpace`` model is created from ``ss_kernel``, optionally
        gradient-checked, optimised for a single L-BFGS-B iteration, and then
        compared with a ``GPRegression`` model built from ``gp_kernel`` that
        receives the same parameter vector.

        :param X: training inputs.
        :param Y: training outputs.
        :param ss_kernel: kernel handed to the state-space model.
        :param kalman_filter_type: forwarded to ``StateSpace``.
        :param use_cython: forwarded to ``StateSpace``.
        :param check_gradients: run ``checkgrad()`` on the state-space model.
        :param optimize: not consulted by this helper.
        :param optimize_max_iters: not consulted by this helper.
        :param predict_X: prediction inputs; falls back to ``X`` when None.
        :param compare_with_GP: must be truthy, otherwise the helper fails.
        :param gp_kernel: kernel handed to the reference ``GPRegression``.
        :param mean_compare_decimal: decimals used for predictive means and
            quantiles.
        :param var_compare_decimal: decimals used for predictive variances,
            gradients and log likelihoods.
        """
        ss_model = SS_model.StateSpace(
            X, Y, ss_kernel,
            kalman_filter_type=kalman_filter_type,
            use_cython=use_cython,
        )
        ss_model.likelihood[:] = Y.var()/100.

        if check_gradients:
            self.assertTrue(ss_model.checkgrad())

        # Always one iteration, regardless of ``optimize`` and
        # ``optimize_max_iters``.
        ss_model.optimize(optimizer='lbfgsb', max_iters=1)

        if compare_with_GP and (predict_X is None):
            predict_X = X

        self.assertTrue(compare_with_GP)
        if compare_with_GP:
            gp_model = GPy.models.GPRegression(X, Y, gp_kernel)
            # Evaluate both models at identical parameter values.
            gp_model[:] = ss_model[:]

            if predict_X is not None:
                ss_pred = ss_model.predict(predict_X)
                ss_quant = ss_model.predict_quantiles(predict_X)
            gp_pred = gp_model.predict(predict_X)
            gp_quant = gp_model.predict_quantiles(predict_X)

            array_checks = (
                (ss_pred[0], gp_pred[0], mean_compare_decimal),
                (ss_pred[1], gp_pred[1], var_compare_decimal),
                (ss_quant[0], gp_quant[0], mean_compare_decimal),
                (ss_quant[1], gp_quant[1], mean_compare_decimal),
                (ss_model.gradient, gp_model.gradient, var_compare_decimal),
            )
            for actual, desired, decimals in array_checks:
                np.testing.assert_array_almost_equal(actual, desired, decimals)

            np.testing.assert_almost_equal(
                ss_model.log_likelihood(), gp_model.log_likelihood(),
                var_compare_decimal,
            )
    def test_Matern32_kernel(self,):
        """Compare ``sde_Matern32`` with ``Matern32`` on ``generate_sine_data`` output."""
        np.random.seed(234)  # fixed seed so the generated data is repeatable
        inputs, targets = generate_sine_data(
            x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
            plot=False, points_num=50, x_interval=(0, 20), random=True,
        )
        # Turn both arrays into single-column matrices, in place.
        inputs.shape = (inputs.shape[0], 1)
        targets.shape = (targets.shape[0], 1)

        state_space_kern = GPy.kern.sde_Matern32(1, active_dims=[0])
        reference_kern = GPy.kern.Matern32(1, active_dims=[0])

        self.run_for_model(
            inputs, targets, state_space_kern,
            check_gradients=True,
            predict_X=inputs,
            compare_with_GP=True,
            gp_kernel=reference_kern,
            mean_compare_decimal=5, var_compare_decimal=5,
        )
    def test_Matern52_kernel(self,):
        """Compare ``sde_Matern52`` with ``Matern52`` on ``generate_sine_data`` output."""
        np.random.seed(234)  # fixed seed so the generated data is repeatable
        data_options = dict(
            x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
            plot=False, points_num=50, x_interval=(0, 20), random=True,
        )
        inputs, targets = generate_sine_data(**data_options)
        # In-place reshape to column vectors.
        inputs.shape = (inputs.shape[0], 1)
        targets.shape = (targets.shape[0], 1)

        state_space_kern = GPy.kern.sde_Matern52(1, active_dims=[0])
        reference_kern = GPy.kern.Matern52(1, active_dims=[0])

        self.run_for_model(
            inputs, targets, state_space_kern,
            check_gradients=True,
            optimize=True,
            predict_X=inputs,
            compare_with_GP=True,
            gp_kernel=reference_kern,
            mean_compare_decimal=5, var_compare_decimal=5,
        )
    def test_RBF_kernel(self,):
        """Compare ``sde_RBF`` (balanced, approximation order 10) with ``RBF``."""
        np.random.seed(234)  # fixed seed so the generated data is repeatable
        inputs, targets = generate_sine_data(
            x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
            plot=False, points_num=50, x_interval=(0, 20), random=True,
        )
        inputs.shape = (inputs.shape[0], 1)
        targets.shape = (targets.shape[0], 1)

        state_space_kern = GPy.kern.sde_RBF(
            1, 110., 1.5, active_dims=[0], balance=True, approx_order=10
        )
        reference_kern = GPy.kern.RBF(1, 110., 1.5, active_dims=[0])

        # Looser tolerances than the Matern tests: 2 decimals on the means,
        # 1 decimal on the variances.
        self.run_for_model(
            inputs, targets, state_space_kern,
            check_gradients=True,
            predict_X=inputs,
            gp_kernel=reference_kern,
            optimize_max_iters=1000,
            mean_compare_decimal=2, var_compare_decimal=1,
        )
    def test_periodic_kernel(self,):
        """Compare ``sde_StdPeriodic`` with ``StdPeriodic`` under identical bounds."""
        np.random.seed(322)  # fixed seed so the generated data is repeatable
        inputs, targets = generate_sine_data(
            x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
            plot=False, points_num=50, x_interval=(0, 20), random=True,
        )
        inputs.shape = (inputs.shape[0], 1)
        targets.shape = (targets.shape[0], 1)

        # Both kernels get the same bounds on lengthscale and period.
        state_space_kern = GPy.kern.sde_StdPeriodic(1, active_dims=[0])
        state_space_kern.lengthscale.constrain_bounded(0.27, 1000)
        state_space_kern.period.constrain_bounded(0.17, 100)

        reference_kern = GPy.kern.StdPeriodic(1, active_dims=[0])
        reference_kern.lengthscale.constrain_bounded(0.27, 1000)
        reference_kern.period.constrain_bounded(0.17, 100)

        self.run_for_model(
            inputs, targets, state_space_kern,
            check_gradients=True,
            predict_X=inputs,
            gp_kernel=reference_kern,
            mean_compare_decimal=3, var_compare_decimal=3,
        )
    def test_quasi_periodic_kernel(self,):
        """Compare the Matern32 * StdPeriodic product in state-space and GP form."""
        np.random.seed(329)  # fixed seed so the generated data is repeatable
        inputs, targets = generate_sine_data(
            x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
            plot=False, points_num=50, x_interval=(0, 20), random=True,
        )
        inputs.shape = (inputs.shape[0], 1)
        targets.shape = (targets.shape[0], 1)

        ss_matern = GPy.kern.sde_Matern32(1)
        ss_periodic = GPy.kern.sde_StdPeriodic(1, active_dims=[0])
        state_space_kern = ss_matern * ss_periodic
        state_space_kern.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
        state_space_kern.std_periodic.period.constrain_bounded(0.15, 100)

        gp_matern = GPy.kern.Matern32(1)
        gp_periodic = GPy.kern.StdPeriodic(1, active_dims=[0])
        reference_kern = gp_matern * gp_periodic
        reference_kern.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
        reference_kern.std_periodic.period.constrain_bounded(0.15, 100)

        self.run_for_model(
            inputs, targets, state_space_kern,
            check_gradients=True,
            predict_X=inputs,
            gp_kernel=reference_kern,
            mean_compare_decimal=1, var_compare_decimal=2,
        )
    def test_linear_kernel(self,):
        """Compare Linear + Bias in state-space and GP form (no gradient check)."""
        np.random.seed(234)  # fixed seed so the generated data is repeatable
        inputs, targets = generate_linear_data(
            x_points=None, tangent=2.0, add_term=20.0, noise_var=2.0,
            plot=False, points_num=50, x_interval=(0, 20), random=True,
        )
        inputs.shape = (inputs.shape[0], 1)
        targets.shape = (targets.shape[0], 1)

        # ``sde_Linear`` additionally receives the input locations.
        ss_linear = GPy.kern.sde_Linear(1, inputs, active_dims=[0])
        ss_bias = GPy.kern.sde_Bias(1, active_dims=[0])
        state_space_kern = ss_linear + ss_bias

        gp_linear = GPy.kern.Linear(1, active_dims=[0])
        gp_bias = GPy.kern.Bias(1, active_dims=[0])
        reference_kern = gp_linear + gp_bias

        self.run_for_model(
            inputs, targets, state_space_kern,
            check_gradients=False,
            predict_X=inputs,
            gp_kernel=reference_kern,
            mean_compare_decimal=5, var_compare_decimal=5,
        )
    def test_brownian_kernel(self,):
        """Compare ``sde_Brownian`` with ``Brownian`` on ``generate_brownian_data`` output."""
        np.random.seed(234)  # fixed seed so the generated data is repeatable
        inputs, targets = generate_brownian_data(
            x_points=None, kernel_var=2.0, noise_var=0.1,
            plot=False, points_num=50, x_interval=(0, 20), random=True,
        )
        inputs.shape = (inputs.shape[0], 1)
        targets.shape = (targets.shape[0], 1)

        state_space_kern = GPy.kern.sde_Brownian()
        reference_kern = GPy.kern.Brownian()

        self.run_for_model(
            inputs, targets, state_space_kern,
            check_gradients=True,
            predict_X=inputs,
            gp_kernel=reference_kern,
            mean_compare_decimal=4, var_compare_decimal=4,
        )
    def test_exponential_kernel(self,):
        """
        Compare ``sde_Exponential`` with ``Exponential`` on
        ``generate_linear_data`` output.

        Both kernels start from the variance of the (uncentred) outputs and
        half the peak-to-peak range of the inputs; the outputs are centred
        afterwards, before the models are built.
        """
        np.random.seed(12345) # seed the random number generator
        (X,Y) = generate_linear_data(x_points=None, tangent=1.0, add_term=20.0, noise_var=2.0,
                    plot = False, points_num=10, x_interval = (0, 20), random=True)
        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)

        # np.ptp() instead of the ndarray.ptp() method: the method was removed
        # in NumPy 2.0, the function exists in every NumPy version.
        initial_variance = Y.var()
        initial_lengthscale = np.ptp(X)/2.
        ss_kernel = GPy.kern.sde_Exponential(1, initial_variance, initial_lengthscale, active_dims=[0,])
        gp_kernel = GPy.kern.Exponential(1, initial_variance, initial_lengthscale, active_dims=[0,])

        # Centre the outputs only after the kernel variance has been taken.
        Y -= Y.mean()

        self.run_for_model(X, Y, ss_kernel, check_gradients=True,
                      predict_X=X,
                      gp_kernel=gp_kernel,
                      optimize_max_iters=1000,
                      mean_compare_decimal=2, var_compare_decimal=2)
    def test_kernel_addition_svd(self,):
        """Compare Linear + StdPeriodic with the 'svd' Kalman filter, with and without Cython."""
        np.random.seed(42)
        inputs, sine_part = generate_sine_data(
            x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
            plot=False, points_num=100, x_interval=(0, 40), random=True,
        )
        _, linear_part = generate_linear_data(
            x_points=inputs, tangent=1.0, add_term=20.0, noise_var=0.0,
            plot=False, points_num=100, x_interval=(0, 40), random=True,
        )

        # Targets are the sum of both signals, centred around zero.
        targets = sine_part + linear_part
        targets -= targets.mean()
        inputs.shape = (inputs.shape[0], 1)
        targets.shape = (targets.shape[0], 1)

        def fresh_kernels():
            """Return a new (state-space, GP) kernel pair for one run."""
            ss_sum = GPy.kern.sde_Linear(1, inputs, variances=1) + GPy.kern.sde_StdPeriodic(
                1, period=5.0, variance=300, lengthscale=3, active_dims=[0]
            )
            gp_sum = GPy.kern.Linear(1, variances=1) + GPy.kern.StdPeriodic(
                1, period=5.0, variance=300, lengthscale=3, active_dims=[0]
            )
            return ss_sum, gp_sum

        # Cython is available only with svd.
        for cython_flag in (True, False):
            state_space_kern, reference_kern = fresh_kernels()
            self.run_for_model(
                inputs, targets, state_space_kern,
                kalman_filter_type='svd',
                use_cython=cython_flag,
                optimize_max_iters=10,
                check_gradients=False,
                predict_X=inputs,
                gp_kernel=reference_kern,
                mean_compare_decimal=3, var_compare_decimal=3,
            )
    def test_kernel_addition_regular(self,):
        """Compare Linear + StdPeriodic with the 'regular' Kalman filter; an assertion failure becomes a skip."""
        np.random.seed(42)
        inputs, sine_part = generate_sine_data(
            x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
            plot=False, points_num=100, x_interval=(0, 40), random=True,
        )
        _, linear_part = generate_linear_data(
            x_points=inputs, tangent=1.0, add_term=20.0, noise_var=0.0,
            plot=False, points_num=100, x_interval=(0, 40), random=True,
        )

        # Targets are the sum of both signals, centred around zero.
        targets = sine_part + linear_part
        targets -= targets.mean()
        inputs.shape = (inputs.shape[0], 1)
        targets.shape = (targets.shape[0], 1)

        def fresh_kernels():
            """Return a new (state-space, GP) kernel pair."""
            ss_sum = GPy.kern.sde_Linear(1, inputs, variances=1) + GPy.kern.sde_StdPeriodic(
                1, period=5.0, variance=300, lengthscale=3, active_dims=[0]
            )
            gp_sum = GPy.kern.Linear(1, variances=1) + GPy.kern.StdPeriodic(
                1, period=5.0, variance=300, lengthscale=3, active_dims=[0]
            )
            return ss_sum, gp_sum

        state_space_kern, reference_kern = fresh_kernels()
        run_options = dict(
            kalman_filter_type='regular',
            use_cython=False,
            optimize_max_iters=10,
            check_gradients=True,
            predict_X=inputs,
            gp_kernel=reference_kern,
            mean_compare_decimal=2, var_compare_decimal=2,
        )
        try:
            self.run_for_model(inputs, targets, state_space_kern, **run_options)
        except AssertionError:
            # A failed comparison is reported as a skip rather than a failure.
            raise SkipTest("Skipping Regular kalman filter for kernel addition, because it is not stable (normal situation) for this data.")
    def test_kernel_multiplication(self,):
        """
        Compare the Matern32 * Matern52 product in state-space and GP form.

        The comparison runs three times, each with a fresh kernel pair:
        'svd' filter with Cython, 'regular' filter without Cython, and 'svd'
        filter without Cython.
        """
        np.random.seed(329) # seed the random number generator
        (X,Y) = generate_sine_data(x_points=None, sin_period=5.0, sin_ampl=10.0, noise_var=2.0,
                        plot = False, points_num=50, x_interval = (0, 20), random=True)
        X.shape = (X.shape[0],1); Y.shape = (Y.shape[0],1)

        def get_new_kernels():
            """Return a fresh (state-space, GP) product-kernel pair."""
            ss_kernel = GPy.kern.sde_Matern32(1)*GPy.kern.sde_Matern52(1)
            # The reference GP kernel is built from the plain kernel classes,
            # as in the other tests of this class; its second factor used to
            # be the state-space ``sde_Matern52`` variant.
            gp_kernel = GPy.kern.Matern32(1)*GPy.kern.Matern52(1)
            return ss_kernel, gp_kernel

        configurations = (
            ('svd', True),
            ('regular', False),
            ('svd', False),
        )
        for filter_type, cython_flag in configurations:
            ss_kernel, gp_kernel = get_new_kernels()
            self.run_for_model(X, Y, ss_kernel, kalman_filter_type=filter_type,
                               use_cython=cython_flag, optimize_max_iters=10, check_gradients=True,
                               predict_X=X,
                               gp_kernel=gp_kernel,
                               mean_compare_decimal=2, var_compare_decimal=2)
    def test_forecast_regular(self,):
        """Train on inputs <= 20, predict on inputs > 20, 'regular' Kalman filter."""
        np.random.seed(339)  # fixed seed so the generated data is repeatable
        x_all, sine_part = generate_sine_data(
            x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
            plot=False, points_num=100, x_interval=(0, 40), random=True,
        )
        _, linear_part = generate_linear_data(
            x_points=x_all, tangent=1.0, add_term=20.0, noise_var=0.0,
            plot=False, points_num=100, x_interval=(0, 40), random=True,
        )
        y_all = sine_part + linear_part

        # Split before reshaping, while the arrays can still be masked directly.
        x_train = x_all[x_all <= 20]
        y_train = y_all[x_all <= 20]
        x_test = x_all[x_all > 20]
        y_test = y_all[x_all > 20]

        # Reshape every array to a single column, in place.
        for arr in (x_all, y_all, x_train, y_train, x_test, y_test):
            arr.shape = (arr.shape[0], 1)

        reference_kern = (
            GPy.kern.Linear(1, active_dims=[0])
            + GPy.kern.Bias(1, active_dims=[0])
            + GPy.kern.StdPeriodic(1, active_dims=[0])
        )
        reference_kern.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
        reference_kern.std_periodic.period.constrain_bounded(0.15, 100)

        # ``sde_Linear`` receives the full input set, not only the training part.
        state_space_kern = (
            GPy.kern.sde_Linear(1, x_all, active_dims=[0])
            + GPy.kern.sde_Bias(1, active_dims=[0])
            + GPy.kern.sde_StdPeriodic(1, active_dims=[0])
        )
        state_space_kern.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
        state_space_kern.std_periodic.period.constrain_bounded(0.15, 100)

        self.run_for_model(
            x_train, y_train, state_space_kern,
            kalman_filter_type='regular',
            use_cython=False,
            optimize_max_iters=30,
            check_gradients=True,
            predict_X=x_test,
            gp_kernel=reference_kern,
            mean_compare_decimal=2, var_compare_decimal=2,
        )
    def test_forecast_svd(self,):
        """Train on inputs <= 20, predict on inputs > 20, 'svd' Kalman filter without Cython."""
        np.random.seed(339)  # fixed seed so the generated data is repeatable
        x_all, sine_part = generate_sine_data(
            x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
            plot=False, points_num=100, x_interval=(0, 40), random=True,
        )
        _, linear_part = generate_linear_data(
            x_points=x_all, tangent=1.0, add_term=20.0, noise_var=0.0,
            plot=False, points_num=100, x_interval=(0, 40), random=True,
        )
        y_all = sine_part + linear_part

        # Split before reshaping, while the arrays can still be masked directly.
        x_train = x_all[x_all <= 20]
        y_train = y_all[x_all <= 20]
        x_test = x_all[x_all > 20]
        y_test = y_all[x_all > 20]

        # Reshape every array to a single column, in place.
        for arr in (x_all, y_all, x_train, y_train, x_test, y_test):
            arr.shape = (arr.shape[0], 1)

        reference_kern = (
            GPy.kern.Linear(1, active_dims=[0])
            + GPy.kern.Bias(1, active_dims=[0])
            + GPy.kern.StdPeriodic(1, active_dims=[0])
        )
        reference_kern.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
        reference_kern.std_periodic.period.constrain_bounded(0.15, 100)

        # ``sde_Linear`` receives the full input set, not only the training part.
        state_space_kern = (
            GPy.kern.sde_Linear(1, x_all, active_dims=[0])
            + GPy.kern.sde_Bias(1, active_dims=[0])
            + GPy.kern.sde_StdPeriodic(1, active_dims=[0])
        )
        state_space_kern.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
        state_space_kern.std_periodic.period.constrain_bounded(0.15, 100)

        self.run_for_model(
            x_train, y_train, state_space_kern,
            kalman_filter_type='svd',
            use_cython=False,
            optimize_max_iters=30,
            check_gradients=False,
            predict_X=x_test,
            gp_kernel=reference_kern,
            mean_compare_decimal=2, var_compare_decimal=2,
        )
    def test_forecast_svd_cython(self,):
        """Train on inputs <= 20, predict on inputs > 20, 'svd' Kalman filter with Cython."""
        np.random.seed(339)  # fixed seed so the generated data is repeatable
        x_all, sine_part = generate_sine_data(
            x_points=None, sin_period=5.0, sin_ampl=5.0, noise_var=2.0,
            plot=False, points_num=100, x_interval=(0, 40), random=True,
        )
        _, linear_part = generate_linear_data(
            x_points=x_all, tangent=1.0, add_term=20.0, noise_var=0.0,
            plot=False, points_num=100, x_interval=(0, 40), random=True,
        )
        y_all = sine_part + linear_part

        # Split before reshaping, while the arrays can still be masked directly.
        x_train = x_all[x_all <= 20]
        y_train = y_all[x_all <= 20]
        x_test = x_all[x_all > 20]
        y_test = y_all[x_all > 20]

        # Reshape every array to a single column, in place.
        for arr in (x_all, y_all, x_train, y_train, x_test, y_test):
            arr.shape = (arr.shape[0], 1)

        reference_kern = (
            GPy.kern.Linear(1, active_dims=[0])
            + GPy.kern.Bias(1, active_dims=[0])
            + GPy.kern.StdPeriodic(1, active_dims=[0])
        )
        reference_kern.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
        reference_kern.std_periodic.period.constrain_bounded(0.15, 100)

        # ``sde_Linear`` receives the full input set, not only the training part.
        state_space_kern = (
            GPy.kern.sde_Linear(1, x_all, active_dims=[0])
            + GPy.kern.sde_Bias(1, active_dims=[0])
            + GPy.kern.sde_StdPeriodic(1, active_dims=[0])
        )
        state_space_kern.std_periodic.lengthscale.constrain_bounded(0.25, 1000)
        state_space_kern.std_periodic.period.constrain_bounded(0.15, 100)

        self.run_for_model(
            x_train, y_train, state_space_kern,
            kalman_filter_type='svd',
            use_cython=True,
            optimize_max_iters=30,
            check_gradients=False,
            predict_X=x_test,
            gp_kernel=reference_kern,
            mean_compare_decimal=2, var_compare_decimal=2,
        )
 # Script entry point: announce the run, then hand over to unittest.
 if __name__ == "__main__":
    print("Running state-space inference tests...")
    unittest.main()
    # Commented-out calls for running individual tests by hand.
    #tt = StateSpaceKernelsTests('test_RBF_kernel')
    #import pdb; pdb.set_trace()
    #tt.test_Matern32_kernel()
    #tt.test_Matern52_kernel()
    #tt.test_RBF_kernel()
    #tt.test_periodic_kernel()
    #tt.test_quasi_periodic_kernel()
    #tt.test_linear_kernel()
    #tt.test_brownian_kernel()
    #tt.test_exponential_kernel()
    #tt.test_kernel_addition()
    #tt.test_kernel_multiplication()
    #tt.test_forecast()
--- a/GPy/testing/inference_tests.py
+++ b/GPy/testing/inference_tests.py
@ -1,179 +0,0 @@
 # Copyright (c) 2014, Max Zwiessele
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 """
 The test cases for various inference algorithms
 """
 import unittest
 import numpy as np
 import GPy
 #np.seterr(invalid='raise')
 class InferenceXTestCase(unittest.TestCase):
    def genData(self):
        """Return the first entry of the first result of the seeded ``_simulate_matern`` example helper."""
        np.random.seed(1111)
        simulated = GPy.examples.dimensionality_reduction._simulate_matern(5, 1, 1, 10, 3, False)
        return simulated[0][0]
    def test_inferenceX_BGPLVM_Linear(self):
        """``infer_newX`` on the training outputs must recover the latent mean and variance (Bayesian GPLVM, linear ARD kernel)."""
        observed = self.genData()
        linear_kern = GPy.kern.Linear(3, ARD=True)
        model = GPy.models.BayesianGPLVM(observed, 3, kernel=linear_kern)
        model.optimize()

        _, inferred = model.infer_newX(model.Y, optimize=True)

        # Agreement is required to two decimals, first for the mean, then the variance.
        for attr in ("mean", "variance"):
            np.testing.assert_array_almost_equal(
                getattr(model.X, attr), getattr(inferred.X, attr), decimal=2
            )
    def test_inferenceX_BGPLVM_RBF(self):
        """``infer_newX`` on the training outputs must recover the latent mean and variance (Bayesian GPLVM, RBF ARD kernel)."""
        import warnings

        observed = self.genData()
        rbf_kern = GPy.kern.RBF(3, ARD=True)
        model = GPy.models.BayesianGPLVM(observed, 3, kernel=rbf_kern)

        # Warnings raised during optimisation are suppressed.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model.optimize()

        _, inferred = model.infer_newX(model.Y, optimize=True)

        # Agreement is required to two decimals, first for the mean, then the variance.
        for attr in ("mean", "variance"):
            np.testing.assert_array_almost_equal(
                getattr(model.X, attr), getattr(inferred.X, attr), decimal=2
            )
    def test_inferenceX_GPLVM_Linear(self):
        """``infer_newX`` on the training outputs must recover the latent points (GPLVM, linear ARD kernel)."""
        observed = self.genData()
        linear_kern = GPy.kern.Linear(3, ARD=True)
        model = GPy.models.GPLVM(observed, 3, kernel=linear_kern)
        model.optimize()

        _, inferred = model.infer_newX(model.Y, optimize=True)

        np.testing.assert_array_almost_equal(model.X, inferred.X, decimal=2)
    def test_inferenceX_GPLVM_RBF(self):
        """``infer_newX`` on the training outputs must recover the latent points (GPLVM, RBF ARD kernel)."""
        observed = self.genData()
        rbf_kern = GPy.kern.RBF(3, ARD=True)
        model = GPy.models.GPLVM(observed, 3, kernel=rbf_kern)
        model.optimize()

        _, inferred = model.infer_newX(model.Y, optimize=True)

        np.testing.assert_array_almost_equal(model.X, inferred.X, decimal=2)
 class InferenceGPEP(unittest.TestCase):
    def genData(self):
        """
        Build a seeded data set: 200 uniform inputs, a latent draw from an RBF
        GP prior, and outputs drawn with ``Bernoulli.samples``.

        :returns: tuple ``(X, Y)``; ``Y`` is reshaped to a single column.
        """
        np.random.seed(1)
        kern = GPy.kern.RBF(1, variance=7., lengthscale=0.2)
        inputs = np.random.rand(200, 1)

        # A small diagonal term is added to the covariance before sampling.
        jittered_cov = kern.K(inputs) + 1e-5 * np.eye(inputs.shape[0])
        latent = np.random.multivariate_normal(np.zeros(200), jittered_cov)

        bernoulli = GPy.likelihoods.Bernoulli()
        squashed = bernoulli.gp_link.transf(latent)  # squash the latent function (value unused)
        labels = bernoulli.samples(latent).reshape(-1, 1)
        return inputs, labels
    def genNoisyData(self):
        np.random.seed(1)
        X = np.random.rand(100,1)
        self.real_std = 0.1
        noise = np.random.randn(*X[:, 0].shape)*self.real_std
        Y = (np.sin(X[:, 0]*2*np.pi) + noise)[:, None]
        self.f = np.random.rand(X.shape[0],1)
        Y_extra_noisy = Y.copy()
        Y_extra_noisy[50] += 4.
        # Y_extra_noisy[80:83] -= 2.
        return X, Y, Y_extra_noisy
    def test_inference_EP(self):
        """
        Compare EP's ``_inference`` with the parent class's ``inference``
        applied to the EP site approximation, using a Bernoulli likelihood.
        """
        from paramz import ObsAr

        X, Y = self.genData()
        lik = GPy.likelihoods.Bernoulli()
        k = GPy.kern.RBF(1, variance=7., lengthscale=0.2)
        ep_module = GPy.inference.latent_function_inference.expectation_propagation
        inf = ep_module.EP(max_iters=30, delta=0.5)
        self.model = GPy.core.GP(
            X=X, Y=Y, kernel=k, inference_method=inf, likelihood=lik
        )

        K = self.model.kern.K(X)
        mean_prior = np.zeros(K.shape[0])
        ep_method = self.model.inference_method
        post_params, ga_approx, cav_params, log_Z_tilde = ep_method.expectation_propagation(
            mean_prior, K, ObsAr(Y), lik, None
        )

        mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
        p, m, d = self.model.inference_method._inference(
            Y, mean_prior, K, ga_approx, cav_params, lik,
            Y_metadata=None, Z_tilde=log_Z_tilde,
        )

        # Extra term added to Z_tilde for the parent-class inference call.
        site_term = np.sum(
            - 0.5*np.log(ga_approx.tau)
            + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)
        )
        p0, m0, d0 = super(ep_module.EP, inf).inference(
            k, X, lik, mu_tilde[:, None],
            mean_function=None,
            variance=1./ga_approx.tau,
            K=K,
            Z_tilde=log_Z_tilde + site_term,
        )

        differences = np.array([
            m - m0,
            np.sum(d['dL_dK'] - d0['dL_dK']),
            np.sum(d['dL_dthetaL'] - d0['dL_dthetaL']),
            np.sum(d['dL_dm'] - d0['dL_dm']),
            np.sum(p._woodbury_vector - p0._woodbury_vector),
            np.sum(p.woodbury_inv - p0.woodbury_inv),
        ])
        # NOTE(review): the bound is 1e6 on a signed sum, so this check is very
        # permissive - confirm whether a tight tolerance such as 1e-6 was intended.
        assert (np.sum(differences) < 1e6)
    # NOTE: this adds a test like the one above for a parameterized likelihood. The test above
    # only covers the probit likelihood, which has no tunable hyperparameters, which is why
    # the dL_dthetaL term in the dictionary of gradients will always be zero. So here we repeat the tests for
    # the Student-t likelihood and the heteroscedastic Gaussian noise case. This test simply checks whether the posterior
    # and the gradients of the log marginal are roughly the same for inference through EP and exact Gaussian inference using
    # the Gaussian approximation for the individual likelihood site terms. For the probit likelihood it is possible to
    # calculate the moments analytically, but for other likelihoods we need to use numerical quadrature techniques,
    # and it is possible that errors creep in because of the quadrature implementation.
    def test_inference_EP_non_classification(self):
        """
        Compare EP's ``_inference`` with the parent class's ``inference``
        applied to the EP site approximation, using a Student-t likelihood on
        the targets returned last by ``genNoisyData``.
        """
        from paramz import ObsAr

        X, Y, Y_extra_noisy = self.genNoisyData()
        deg_freedom = 5.
        init_noise_var = 0.08
        lik_studentT = GPy.likelihoods.StudentT(deg_free=deg_freedom, sigma2=init_noise_var)
        k = GPy.kern.RBF(1, variance=2., lengthscale=1.1)
        ep_module = GPy.inference.latent_function_inference.expectation_propagation
        ep_inf_alt = ep_module.EP(max_iters=4, delta=0.5)
        model = GPy.core.GP(
            X=X, Y=Y_extra_noisy, kernel=k,
            likelihood=lik_studentT, inference_method=ep_inf_alt,
        )

        K = model.kern.K(X)
        mean_prior = np.zeros(K.shape[0])
        post_params, ga_approx, cav_params, log_Z_tilde = model.inference_method.expectation_propagation(
            mean_prior, K, ObsAr(Y_extra_noisy), lik_studentT, None
        )

        mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
        p, m, d = model.inference_method._inference(
            Y_extra_noisy, mean_prior, K, ga_approx, cav_params, lik_studentT,
            Y_metadata=None, Z_tilde=log_Z_tilde,
        )

        # Extra term added to Z_tilde for the parent-class inference call.
        site_term = np.sum(
            - 0.5*np.log(ga_approx.tau)
            + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)
        )
        p0, m0, d0 = super(ep_module.EP, ep_inf_alt).inference(
            k, X, lik_studentT, mu_tilde[:, None],
            mean_function=None,
            variance=1./ga_approx.tau,
            K=K,
            Z_tilde=log_Z_tilde + site_term,
        )

        differences = np.array([
            m - m0,
            np.sum(d['dL_dK'] - d0['dL_dK']),
            np.sum(d['dL_dthetaL'] - d0['dL_dthetaL']),
            np.sum(d['dL_dm'] - d0['dL_dm']),
            np.sum(p._woodbury_vector - p0._woodbury_vector),
            np.sum(p.woodbury_inv - p0.woodbury_inv),
        ])
        # NOTE(review): the bound is 1e6 on a signed sum, so this check is very
        # permissive - confirm whether a tight tolerance such as 1e-6 was intended.
        assert (np.sum(differences) < 1e6)
 class VarDtcTest(unittest.TestCase):
    def test_var_dtc_inference_with_mean(self):
        """Check dL_dm in var_dtc is calculated correctly."""
        np.random.seed(1)
        inputs = np.linspace(0., 2*np.pi, 100)[:, None]
        noise = np.random.randn(*inputs.shape)*0.3
        targets = -np.cos(inputs) + noise + 1

        linear_mean = GPy.mappings.Linear(input_dim=1, output_dim=1)
        model = GPy.models.SparseGPRegression(inputs, targets, mean_function=linear_mean)

        gradients_ok = model.checkgrad()
        self.assertTrue(gradients_ok)
 class HMCSamplerTest(unittest.TestCase):
    def test_sampling(self):
        """Smoke test: draw three HMC samples from a GP regression whose parameters carry Gamma priors."""
        np.random.seed(1)
        inputs = np.linspace(0., 2*np.pi, 100)[:, None]
        noise = np.random.randn(*inputs.shape)*0.3
        targets = -np.cos(inputs) + noise + 1

        model = GPy.models.GPRegression(inputs, targets)

        def gamma_prior():
            """Gamma prior built with ``from_EV(1., 10.)``."""
            return GPy.priors.Gamma.from_EV(1., 10.)

        model.kern.lengthscale.set_prior(gamma_prior())
        model.kern.variance.set_prior(gamma_prior())
        model.likelihood.variance.set_prior(gamma_prior())

        sampler = GPy.inference.mcmc.HMC(model, stepsize=1e-2)
        # Only checks that sampling runs; the samples are not inspected.
        samples = sampler.sample(num_samples=3)
 class MCMCSamplerTest(unittest.TestCase):
    def test_sampling(self):
        """Smoke test: run Metropolis-Hastings (Ntotal=100, Nburn=10) on a GP regression whose parameters carry Gamma priors."""
        np.random.seed(1)
        inputs = np.linspace(0., 2*np.pi, 100)[:, None]
        noise = np.random.randn(*inputs.shape)*0.3
        targets = -np.cos(inputs) + noise + 1

        model = GPy.models.GPRegression(inputs, targets)

        def gamma_prior():
            """Gamma prior built with ``from_EV(1., 10.)``."""
            return GPy.priors.Gamma.from_EV(1., 10.)

        model.kern.lengthscale.set_prior(gamma_prior())
        model.kern.variance.set_prior(gamma_prior())
        model.likelihood.variance.set_prior(gamma_prior())

        sampler = GPy.inference.mcmc.Metropolis_Hastings(model)
        # Only checks that sampling runs; nothing is asserted about the chain.
        sampler.sample(Ntotal=100, Nburn=10)
 # Script entry point: run every test case in this module through unittest.
 if __name__ == "__main__":
    unittest.main()
--- a/GPy/testing/link_function_tests.py
+++ b/GPy/testing/link_function_tests.py
@ -1,148 +0,0 @@
 import numpy as np
 import scipy
 from scipy.special import cbrt
 from GPy.models import GradientChecker
 import random
 # Largest finite float64 value.
 _lim_val = np.finfo(np.float64).max
 # Largest argument for which exp() stays at or below _lim_val.
 _lim_val_exp = np.log(_lim_val)
 # Square root and cube root of _lim_val
 # (NOTE(review): neither is referenced by the tests visible in this module).
 _lim_val_square = np.sqrt(_lim_val)
 _lim_val_cube = cbrt(_lim_val)
 from GPy.likelihoods.link_functions import Identity, Probit, Cloglog, Log, Log_ex_1, Reciprocal, Heaviside, ScaledProbit
 class LinkFunctionTests(np.testing.TestCase):
    """
    Numerical checks for the link functions in GPy.likelihoods.link_functions.

    Gradient tests compare each transformation and its first two derivatives
    against the next-higher analytic derivative using GradientChecker.
    Overflow tests verify that the transformations return finite values for
    inputs beyond the point where the unclipped formula overflows float64.
    """
    def setUp(self):
        # Evaluation points used by check_gradient, each a 1x1 array.
        self.small_f = np.array([[-1e-4]])
        self.zero_f = np.array([[1e-4]])
        self.mid_f = np.array([[5.0]])
        self.large_f = np.array([[1e4]])
        # Infinite inputs used by the overflow/clipping tests.
        self.f_lower_lim = np.array(-np.inf)
        self.f_upper_lim = np.array(np.inf)
    def _check_derivative_chain(self, link_func, x0):
        """
        Gradient-check transf, dtransf_df and d2transf_df2 of link_func at x0,
        each against the next derivative in the chain.
        """
        derivative_pairs = (
            (link_func.transf, link_func.dtransf_df),
            (link_func.dtransf_df, link_func.d2transf_df2),
            (link_func.d2transf_df2, link_func.d3transf_df3),
        )
        for func, dfunc in derivative_pairs:
            grad = GradientChecker(func, dfunc, x0=x0)
            self.assertTrue(grad.checkgrad(verbose=True))
    def check_gradient(self, link_func, lim_of_inf, test_lim=False):
        """
        Gradient-check link_func at the mid, small, near-zero and large points.

        :param link_func: link function instance under test
        :param lim_of_inf: largest input considered safe for this link function
        :param test_lim: when True, additionally check just below lim_of_inf
        """
        for x0 in (self.mid_f, self.small_f, self.zero_f):
            self._check_derivative_chain(link_func, x0)
        #Do a limit test if the large f value is too large
        large_f = np.clip(self.large_f, -np.inf, lim_of_inf-1e-3)
        self._check_derivative_chain(link_func, large_f)
        if test_lim:
            print("Testing limits")
            #Remove some otherwise we are too close to the limit for gradcheck to work effectively
            lim_of_inf = lim_of_inf - 1e-4
            self._check_derivative_chain(link_func, lim_of_inf)
    def check_overflow(self, link_func, lim_of_inf):
        """
        Check that link_func does something sensible beyond lim_of_inf.

        This does not check that the values are correct, only that transf,
        dtransf_df and d2transf_df2 are neither inf nor nan there.
        """
        beyond_lim_of_inf = lim_of_inf + 100.0
        for func in (link_func.transf, link_func.dtransf_df, link_func.d2transf_df2):
            value = func(beyond_lim_of_inf)
            self.assertFalse(np.isinf(value))
            self.assertFalse(np.isnan(value))
    def test_log_overflow(self):
        """Log link: transf matches exp() at mid_f and stays finite at the limits."""
        link = Log()
        lim_of_inf = _lim_val_exp
        np.testing.assert_almost_equal(np.exp(self.mid_f), link.transf(self.mid_f))
        self.assertTrue(np.isinf(np.exp(np.log(self.f_upper_lim))))
        #Check the clipping works
        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
        self.assertTrue(np.isfinite(link.transf(self.f_upper_lim)))
        self.check_overflow(link, lim_of_inf)
        #Check that it would otherwise fail
        beyond_lim_of_inf = lim_of_inf + 10.0
        # errstate restores the previous error settings even if the assertion
        # fails, unlike a manual np.seterr save/restore pair.
        with np.errstate(over='ignore'):
            self.assertTrue(np.isinf(np.exp(beyond_lim_of_inf)))
    def test_log_ex_1_overflow(self):
        """Log_ex_1 link: transf matches log1p(exp()) at mid_f and stays finite at the limits."""
        link = Log_ex_1()
        lim_of_inf = _lim_val_exp
        np.testing.assert_almost_equal(scipy.special.log1p(np.exp(self.mid_f)), link.transf(self.mid_f))
        self.assertTrue(np.isinf(scipy.special.log1p(np.exp(np.log(self.f_upper_lim)))))
        #Check the clipping works
        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
        #Need to look at most significant figures here rather than the decimals
        np.testing.assert_approx_equal(link.transf(self.f_upper_lim), scipy.special.log1p(_lim_val), significant=5)
        self.check_overflow(link, lim_of_inf)
        #Check that it would otherwise fail
        beyond_lim_of_inf = lim_of_inf + 10.0
        # errstate restores the previous error settings even if the assertion fails.
        with np.errstate(over='ignore'):
            self.assertTrue(np.isinf(scipy.special.log1p(np.exp(beyond_lim_of_inf))))
    def test_log_gradients(self):
        """Gradient-check the Log link, including just below its overflow limit."""
        # transf dtransf_df d2transf_df2 d3transf_df3
        link = Log()
        lim_of_inf = _lim_val_exp
        self.check_gradient(link, lim_of_inf, test_lim=True)
    def test_identity_gradients(self):
        """Gradient-check the Identity link (limit check disabled)."""
        link = Identity()
        lim_of_inf = _lim_val
        #FIXME: Should be able to think of a way to test the limits of this
        self.check_gradient(link, lim_of_inf, test_lim=False)
    def test_probit_gradients(self):
        """Gradient-check the Probit link, including the limit check."""
        link = Probit()
        lim_of_inf = _lim_val
        self.check_gradient(link, lim_of_inf, test_lim=True)
    def test_scaledprobit_gradients(self):
        """Gradient-check ScaledProbit with a random nu drawn from [0, 1)."""
        link = ScaledProbit(nu=random.random())
        lim_of_inf = _lim_val
        self.check_gradient(link, lim_of_inf, test_lim=True)
    def test_Cloglog_gradients(self):
        """Gradient-check the Cloglog link, including the limit check."""
        link = Cloglog()
        lim_of_inf = _lim_val_exp
        self.check_gradient(link, lim_of_inf, test_lim=True)
    def test_Log_ex_1_gradients(self):
        """Gradient-check the Log_ex_1 link and verify it stays finite past the limit."""
        link = Log_ex_1()
        lim_of_inf = _lim_val_exp
        self.check_gradient(link, lim_of_inf, test_lim=True)
        self.check_overflow(link, lim_of_inf)
    def test_reciprocal_gradients(self):
        """Gradient-check the Reciprocal link, including the limit check."""
        link = Reciprocal()
        lim_of_inf = _lim_val
        #Does not work with much smaller values, and values closer to zero than 1e-5
        self.check_gradient(link, lim_of_inf, test_lim=True)
--- a/GPy/testing/meanfunc_tests.py
+++ b/GPy/testing/meanfunc_tests.py
@ -1,95 +0,0 @@
 # Copyright (c) 2015, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import unittest
 import numpy as np
 import GPy
 class MFtests(unittest.TestCase):
    """Gradient checks for GP models that are built with a mean function."""
    def test_simple_mean_function(self):
        """
        The simplest possible mean function. No parameters, just a simple Sinusoid.
        """
        # Build the mean function by patching a bare Mapping: f is np.sin and
        # update_gradients is a no-op because there are no parameters.
        mean_fn = GPy.core.Mapping(1,1)
        mean_fn.f = np.sin
        mean_fn.update_gradients = lambda a,b: None
        inputs = np.linspace(0,10,50).reshape(-1,1)
        targets = np.sin(inputs) + 0.5*np.cos(3*inputs) + 0.1*np.random.randn(*inputs.shape)
        kernel = GPy.kern.RBF(1)
        likelihood = GPy.likelihoods.Gaussian()
        model = GPy.core.GP(inputs, targets, kernel=kernel, likelihood=likelihood, mean_function=mean_fn)
        self.assertTrue(model.checkgrad())
    def test_parametric_mean_function(self):
        """
        A piecewise-linear mean function with parameters that we'll learn alongside the kernel
        """
        inputs = np.linspace(-1,10,50).reshape(-1,1)
        targets = 3-np.abs((inputs-6))
        targets += .5*np.cos(3*inputs) + 0.3*np.random.randn(*inputs.shape)
        mean_fn = GPy.mappings.PiecewiseLinear(1, 1, [-1,1], [9,2])
        kernel = GPy.kern.RBF(1)
        likelihood = GPy.likelihoods.Gaussian()
        model = GPy.core.GP(inputs, targets, kernel=kernel, likelihood=likelihood, mean_function=mean_fn)
        self.assertTrue(model.checkgrad())
    def test_parametric_mean_function_composition(self):
        """
        A Compound mean function (a Linear mapping combined with a Kernel mapping)
        with parameters that we'll learn alongside the kernel
        """
        inputs = np.linspace(0,10,50).reshape(-1,1)
        targets = np.sin(inputs) + 0.5*np.cos(3*inputs) + 0.1*np.random.randn(*inputs.shape) + 3*inputs
        linear_part = GPy.mappings.Linear(1,1)
        kernel_part = GPy.mappings.Kernel(1, 1, np.random.normal(0,1,(1,1)), GPy.kern.RBF(1))
        mean_fn = GPy.mappings.Compound(linear_part, kernel_part)
        kernel = GPy.kern.RBF(1)
        likelihood = GPy.likelihoods.Gaussian()
        model = GPy.core.GP(inputs, targets, kernel=kernel, likelihood=likelihood, mean_function=mean_fn)
        self.assertTrue(model.checkgrad())
    def test_parametric_mean_function_additive(self):
        """
        An Additive mean function (Constant + (MLP + Identity)) with parameters
        that we'll learn alongside the kernel
        """
        inputs = np.linspace(0,10,50).reshape(-1,1)
        targets = np.sin(inputs) + 0.5*np.cos(3*inputs) + 0.1*np.random.randn(*inputs.shape) + 3*inputs
        constant_part = GPy.mappings.Constant(1,1,3)
        inner_sum = GPy.mappings.Additive(GPy.mappings.MLP(1,1), GPy.mappings.Identity(1,1))
        mean_fn = GPy.mappings.Additive(constant_part, inner_sum)
        kernel = GPy.kern.RBF(1)
        likelihood = GPy.likelihoods.Gaussian()
        model = GPy.core.GP(inputs, targets, kernel=kernel, likelihood=likelihood, mean_function=mean_fn)
        self.assertTrue(model.checkgrad())
    def test_svgp_mean_function(self):
        """An instance of SVGP with a linear mean function on a binary classification problem."""
        inputs = np.linspace(0,10,500).reshape(-1,1)
        latent = np.sin(inputs) + 0.5*np.cos(3*inputs) + 0.1*np.random.randn(*inputs.shape)
        # Threshold at zero to turn the regression targets into 0/1 class labels.
        labels = np.where(latent>0, 1,0)
        mean_fn = GPy.mappings.Linear(1,1)
        inducing = np.linspace(0,10,50).reshape(-1,1)
        likelihood = GPy.likelihoods.Bernoulli()
        kernel = GPy.kern.RBF(1) + GPy.kern.White(1, 1e-4)
        model = GPy.core.SVGP(inputs, labels,Z=inducing, kernel=kernel, likelihood=likelihood, mean_function=mean_fn)
        self.assertTrue(model.checkgrad())
--- a/GPy/testing/minibatch_tests.py
+++ b/GPy/testing/minibatch_tests.py
@ -1,230 +0,0 @@
 '''
 Created on 4 Sep 2015
@author: maxz
 '''
 import unittest
 import numpy as np
 import GPy
 class BGPLVMTest(unittest.TestCase):
    """
    Compare BayesianGPLVMMiniBatch against the full BayesianGPLVM model.

    Each comparison copies the full model's parameters into the minibatch
    model and then expects the same log likelihood and gradient.
    """
    def setUp(self):
        np.random.seed(12345)
        X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13))
        Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1]))
        # Random boolean mask with the shape of Y (not used by the tests in this class).
        self.inan = np.random.binomial(1, .1, Y.shape).astype(bool)
        self.X, self.W, self.Y = X,W,Y
        self.Q = 3
        self.m_full = GPy.models.BayesianGPLVM(Y, self.Q)
    def _minibatch_model(self, **kwargs):
        """Build a BayesianGPLVMMiniBatch on self.Y with self.Q latent dimensions."""
        return GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, **kwargs)
    def _assert_matches_full_model(self, m):
        """Copy the full model's parameters into m and compare likelihood and gradient."""
        m[:] = self.m_full[:]
        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
    def test_lik_comparisons_m1_s0(self):
        # Test if the different implementations give the exact same likelihood as the full model.
        # All of the following settings should give the same likelihood and gradients as the full model:
        m = self._minibatch_model(missing_data=True, stochastic=False)
        self._assert_matches_full_model(m)
        self.assertTrue(m.checkgrad())
    def test_predict_missing_data(self):
        """Predictions of the minibatch model must agree with those of the full model."""
        m = self._minibatch_model(missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
        self._assert_matches_full_model(m)
        # A full covariance prediction at m.X is expected to be unsupported.
        self.assertRaises(NotImplementedError, m.predict, m.X, full_cov=True)
        mu1, var1 = m.predict(m.X, full_cov=False)
        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
        np.testing.assert_allclose(mu1, mu2)
        np.testing.assert_allclose(var1, var2)
        mu1, var1 = m.predict(m.X.mean, full_cov=True)
        mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=True)
        np.testing.assert_allclose(mu1, mu2)
        np.testing.assert_allclose(var1[:,:,0], var2)
        mu1, var1 = m.predict(m.X.mean, full_cov=False)
        mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=False)
        np.testing.assert_allclose(mu1, mu2)
        np.testing.assert_allclose(var1[:,[0]], var2)
    def test_lik_comparisons_m0_s0(self):
        # Test if the different implementations give the exact same likelihood as the full model.
        # All of the following settings should give the same likelihood and gradients as the full model:
        m = self._minibatch_model(X_variance=self.m_full.X.variance.values, missing_data=False, stochastic=False)
        self._assert_matches_full_model(m)
        self.assertTrue(m.checkgrad())
    def test_lik_comparisons_m1_s1(self):
        # Test if the different implementations give the exact same likelihood as the full model.
        # All of the following settings should give the same likelihood and gradients as the full model:
        m = self._minibatch_model(missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
        self._assert_matches_full_model(m)
        self.assertTrue(m.checkgrad())
    def test_lik_comparisons_m0_s1(self):
        # Test if the different implementations give the exact same likelihood as the full model.
        # All of the following settings should give the same likelihood and gradients as the full model:
        m = self._minibatch_model(missing_data=False, stochastic=True, batchsize=self.Y.shape[1])
        self._assert_matches_full_model(m)
        self.assertTrue(m.checkgrad())
    def test_gradients_missingdata(self):
        m = self._minibatch_model(missing_data=True, stochastic=False, batchsize=self.Y.shape[1])
        self.assertTrue(m.checkgrad())
    def test_gradients_missingdata_stochastics(self):
        for batchsize in (1, 4):
            m = self._minibatch_model(missing_data=True, stochastic=True, batchsize=batchsize)
            self.assertTrue(m.checkgrad())
    def test_gradients_stochastics(self):
        for batchsize in (1, 4):
            m = self._minibatch_model(missing_data=False, stochastic=True, batchsize=batchsize)
            self.assertTrue(m.checkgrad())
    def test_predict(self):
        # NOTE(review): despite its name this test performs the same checks as
        # test_lik_comparisons_m1_s1 and never calls predict().
        m = self._minibatch_model(missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
        self._assert_matches_full_model(m)
        self.assertTrue(m.checkgrad())
 class SparseGPMinibatchTest(unittest.TestCase):
    """
    Compare BayesianGPLVMMiniBatch (built with X_variance=False) against a
    SparseGPLVM reference model, and smoke-test SparseGPMiniBatch.
    """
    def setUp(self):
        np.random.seed(12345)
        X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13))
        Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1]))
        # Random boolean mask with the shape of Y (not used by the tests in this class).
        self.inan = np.random.binomial(1, .1, Y.shape).astype(bool)
        self.X, self.W, self.Y = X,W,Y
        self.Q = 3
        self.m_full = GPy.models.SparseGPLVM(Y, self.Q, kernel=GPy.kern.RBF(self.Q, ARD=True))
    def _minibatch_model(self, **kwargs):
        """Build a BayesianGPLVMMiniBatch on self.Y with X_variance=False."""
        return GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, **kwargs)
    def _assert_matches_full_model(self, m):
        """Copy the full model's parameters into m and compare likelihood and gradient."""
        m[:] = self.m_full[:]
        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
    def test_lik_comparisons_m1_s0(self):
        # Test if the different implementations give the exact same likelihood as the full model.
        # All of the following settings should give the same likelihood and gradients as the full model:
        m = self._minibatch_model(missing_data=True, stochastic=False)
        self._assert_matches_full_model(m)
        self.assertTrue(m.checkgrad())
    def test_sparsegp_init(self):
        """
        Build SparseGPMiniBatch for every missing_data/stochastic combination and
        check the gradients before and after a short optimisation run.
        """
        try:
            np.random.seed(1234)
            Z = self.X[np.random.choice(self.X.shape[0], replace=False, size=10)].copy()
            Q = Z.shape[1]
            # (missing_data, stochastic, optimizer) in the order originally tested.
            settings = (
                (True, False, 'adadelta'),
                (True, True, 'rprop'),
                (False, False, 'rprop'),
                (False, True, 'adadelta'),
            )
            for missing_data, stochastic, optimizer in settings:
                kernel = GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q)
                m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, kernel, GPy.likelihoods.Gaussian(), missing_data=missing_data, stochastic=stochastic)
                self.assertTrue(m.checkgrad())
                m.optimize(optimizer, max_iters=10)
                self.assertTrue(m.checkgrad())
        except ImportError:
            # Use unittest's own skip mechanism instead of importing the
            # unmaintained nose package just to obtain SkipTest.
            self.skipTest('climin not installed, skipping stochastic gradients')
    def test_predict_missing_data(self):
        """Predictions of the minibatch model must agree with those of the full model."""
        m = self._minibatch_model(missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
        self._assert_matches_full_model(m)
        mu1, var1 = m.predict(m.X, full_cov=False)
        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
        np.testing.assert_allclose(mu1, mu2)
        # Every variance column of the minibatch model is compared with var2.
        for i in range(var1.shape[1]):
            np.testing.assert_allclose(var1[:,[i]], var2)
        mu1, var1 = m.predict(m.X, full_cov=True)
        mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=True)
        np.testing.assert_allclose(mu1, mu2)
        for i in range(var1.shape[2]):
            np.testing.assert_allclose(var1[:,:,i], var2)
    def test_lik_comparisons_m0_s0(self):
        # Test if the different implementations give the exact same likelihood as the full model.
        # All of the following settings should give the same likelihood and gradients as the full model:
        m = self._minibatch_model(missing_data=False, stochastic=False)
        self._assert_matches_full_model(m)
        self.assertTrue(m.checkgrad())
    def test_lik_comparisons_m1_s1(self):
        # Test if the different implementations give the exact same likelihood as the full model.
        # All of the following settings should give the same likelihood and gradients as the full model:
        m = self._minibatch_model(missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
        self._assert_matches_full_model(m)
        self.assertTrue(m.checkgrad())
    def test_lik_comparisons_m0_s1(self):
        # Test if the different implementations give the exact same likelihood as the full model.
        # All of the following settings should give the same likelihood and gradients as the full model:
        m = self._minibatch_model(missing_data=False, stochastic=True, batchsize=self.Y.shape[1])
        self._assert_matches_full_model(m)
        self.assertTrue(m.checkgrad())
    def test_gradients_missingdata(self):
        m = self._minibatch_model(missing_data=True, stochastic=False, batchsize=self.Y.shape[1])
        self.assertTrue(m.checkgrad())
    def test_gradients_missingdata_stochastics(self):
        for batchsize in (1, 4):
            m = self._minibatch_model(missing_data=True, stochastic=True, batchsize=batchsize)
            self.assertTrue(m.checkgrad())
    def test_gradients_stochastics(self):
        for batchsize in (1, 4):
            m = self._minibatch_model(missing_data=False, stochastic=True, batchsize=batchsize)
            self.assertTrue(m.checkgrad())
    def test_predict(self):
        # NOTE(review): despite its name this test performs the same checks as
        # test_lik_comparisons_m1_s1 and never calls predict().
        m = self._minibatch_model(missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
        self._assert_matches_full_model(m)
        self.assertTrue(m.checkgrad())
 # Run every test in this module when the file is executed directly as a script.
 if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.testName']
    unittest.main()
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
--- a/GPy/testing/mpi_test__.py
+++ b/GPy/testing/mpi_test__.py
@ -0,0 +1,21 @@
 import numpy as np
 import GPy
 from mpi4py import MPI
 # Seed NumPy so the synthetic data below is reproducible.
 np.random.seed(123456)
 comm = MPI.COMM_WORLD
 # Noisy sine data: N inputs evenly spaced on [-6, 6].
 N = 100
 x = np.linspace(-6., 6., N)
 y = np.sin(x) + np.random.randn(N) * 0.05
 # Broadcast y over the communicator
 # (presumably so every rank fits identical targets -- TODO confirm).
 comm.Bcast(y)
 data = np.vstack([x,y])
 #infr = GPy.inference.latent_function_inference.VarDTC_minibatch(mpi_comm=comm)
 # Row 0 of data holds the inputs and row 1 the targets; both become column vectors.
 m = GPy.models.SparseGPRegression(data[:1].T,data[1:2].T,mpi_comm=comm)
 m.optimize(max_iters=10)
 if comm.rank==0:
    # print() is called as a function: the former Python 2 print statement is
    # a SyntaxError on Python 3, and this form prints the same on both.
    print(float(m.objective_function()))
    # Detach MPI from the model and recompute, so the objective with and
    # without the communicator can be compared by eye.
    m.inference_method.mpi_comm=None
    m.mpi_comm=None
    m._trigger_params_changed()
    print(float(m.objective_function()))
--- a/GPy/testing/pickle_tests.py
+++ b/GPy/testing/pickle_tests.py
@ -1,130 +0,0 @@
 '''
 Created on 13 Mar 2014
@author: maxz
 '''
 import unittest, itertools
 #import cPickle as pickle
 import pickle
 import numpy as np
 import tempfile
 from GPy.examples.dimensionality_reduction import mrd_simulation
 from GPy.core.parameterization.variational import NormalPosterior
 from GPy.models.gp_regression import GPRegression
 import GPy
 from nose import SkipTest
 def toy_model():
    """Return a GPRegression model fitted to sin(x) at 50 evenly spaced points on [0, 1]."""
    inputs = np.linspace(0,1,50)[:, None]
    targets = np.sin(inputs)
    return GPRegression(X=inputs, Y=targets)
 class ListDictTestCase(unittest.TestCase):
    """TestCase with extra assertions for dicts of sequences and lists of arrays."""
    def assertListDictEquals(self, d1, d2, msg=None):
        """Assert that each value of d1, as a list, equals the value under the same key in d2."""
        # .items() is used rather than the Python 2 only .iteritems().
        for key, expected in d1.items():
            expected_list = list(expected)
            self.assertListEqual(expected_list, list(d2[key]), msg)
    def assertArrayListEquals(self, l1, l2):
        """Assert pairwise array equality of l1 and l2 (pairs are formed with zip)."""
        for left, right in zip(l1,l2):
            np.testing.assert_array_equal(left, right)
 class Test(ListDictTestCase):
    """Copy and pickle round-trip tests for GPy models and variational posteriors."""
    # unittest.skip reports the test as skipped. The former nose SkipTest
    # "decorator" replaced the method with an exception instance, so the test
    # silently disappeared from collection.
    @unittest.skip("loading the stored pickle_test.pickle fixture is disabled")
    def test_load_pickle(self):
        """Load the pickle stored next to this file and check its gradients and likelihood."""
        import os
        m = GPy.load(os.path.join(os.path.abspath(os.path.split(__file__)[0]), 'pickle_test.pickle'))
        self.assertTrue(m.checkgrad())
        self.assertEqual(m.log_likelihood(), -4.7351019830022087)
    def test_model(self):
        """A copied and a pickled GPRegression model must match the original."""
        par = toy_model()
        pcopy = par.copy()
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        self.assertSequenceEqual(str(par), str(pcopy))
        # The copy must own its arrays rather than share them with the original.
        self.assertIsNot(par.param_array, pcopy.param_array)
        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
        self.assertTrue(pcopy.checkgrad())
        # assertTrue replaces the assert_ alias, which was removed in Python 3.12.
        self.assertTrue(np.any(pcopy.gradient!=0.0))
        with tempfile.TemporaryFile('w+b') as f:
            par.pickle(f)
            f.seek(0)
            pcopy = pickle.load(f)
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assertTrue(pcopy.checkgrad())
    def test_modelrecreation(self):
        """A model rebuilt from copies of X, Y and the kernel must match, as must its pickle."""
        par = toy_model()
        pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
        np.testing.assert_allclose(par.param_array, pcopy.param_array)
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assertIsNot(par.param_array, pcopy.param_array)
        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
        self.assertTrue(pcopy.checkgrad())
        # assertTrue replaces the assert_ alias, which was removed in Python 3.12.
        self.assertTrue(np.any(pcopy.gradient!=0.0))
        np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)
        # Randomize before pickling so the round trip is checked on changed parameters.
        par.randomize()
        with tempfile.TemporaryFile('w+b') as f:
            par.pickle(f)
            f.seek(0)
            pcopy = pickle.load(f)
        np.testing.assert_allclose(par.param_array, pcopy.param_array)
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full, atol=1e-6)
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assertTrue(pcopy.checkgrad())
    def test_posterior(self):
        """A copied and a pickled NormalPosterior must match the original."""
        X = np.random.randn(3,5)
        Xv = np.random.rand(*X.shape)
        par = NormalPosterior(X,Xv)
        par.gradient = 10
        pcopy = par.copy()
        pcopy.gradient = 10
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
        self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assertIsNot(par.param_array, pcopy.param_array)
        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
        with tempfile.TemporaryFile('w+b') as f:
            par.pickle(f)
            f.seek(0)
            pcopy = pickle.load(f)
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
        # The gradient is assigned again on the unpickled object before comparing.
        pcopy.gradient = 10
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        np.testing.assert_allclose(pcopy.mean.gradient_full, 10)
        self.assertSequenceEqual(str(par), str(pcopy))
    def test_model_concat(self):
        """A copied and a pickled mrd_simulation model must match the original."""
        par = mrd_simulation(optimize=0, plot=0, plot_sim=0)
        par.randomize()
        pcopy = par.copy()
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
        self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assertIsNot(par.param_array, pcopy.param_array)
        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
        self.assertTrue(par.checkgrad())
        self.assertTrue(pcopy.checkgrad())
        # assertTrue replaces the assert_ alias, which was removed in Python 3.12.
        self.assertTrue(np.any(pcopy.gradient!=0.0))
        with tempfile.TemporaryFile('w+b') as f:
            par.pickle(f)
            f.seek(0)
            pcopy = pickle.load(f)
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assertTrue(pcopy.checkgrad())
    def _callback(self, what, which):
        # Increments the counter on `what`; `which` is ignored.
        what.count += 1
 # Run every test in this module when the file is executed directly as a script.
 if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.test_parameter_index_operations']
    unittest.main()
--- a/GPy/testing/plotting_tests.py
+++ b/GPy/testing/plotting_tests.py
@ -1,509 +0,0 @@
 #===============================================================================
 # Copyright (c) 2015, Max Zwiessele
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 # * Redistributions of source code must retain the above copyright notice, this
 #   list of conditions and the following disclaimer.
 #
 # * Redistributions in binary form must reproduce the above copyright notice,
 #   this list of conditions and the following disclaimer in the documentation
 #   and/or other materials provided with the distribution.
 #
 # * Neither the name of GPy nor the names of its
 #   contributors may be used to endorse or promote products derived from
 #   this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #===============================================================================
 #===============================================================================
 # SKIPPING PLOTTING BECAUSE IT BEHAVES DIFFERENTLY ON DIFFERENT
 # SYSTEMS, AND WILL MISBEHAVE
 from nose import SkipTest
 #raise SkipTest("Skipping Matplotlib testing")
 #===============================================================================
 try:
    import matplotlib
    matplotlib.use('agg')
 except ImportError:
    # matplotlib not installed
    from nose import SkipTest
    raise SkipTest("Error importing matplotlib")
 from unittest.case import TestCase
 import numpy as np
 import GPy, os
 import logging
 from GPy.util.config import config
 from GPy.plotting import change_plotting_library, plotting_library
 class ConfigTest(TestCase):
    """Checks for switching the active plotting library."""
    def tearDown(self):
        # Switch back to matplotlib after every test in this case.
        change_plotting_library('matplotlib')
    def test_change_plotting(self):
        """An unknown library name raises ValueError; with 'none' selected,
        asking for the plotting library raises RuntimeError."""
        with self.assertRaises(ValueError):
            change_plotting_library('not+in9names')
        change_plotting_library('none')
        with self.assertRaises(RuntimeError):
            plotting_library()
 change_plotting_library('matplotlib')
 if config.get('plotting', 'library') != 'matplotlib':
    raise SkipTest("Matplotlib not installed, not testing plots")
 try:
    from matplotlib import cbook, pyplot as plt
    from matplotlib.testing.compare import compare_images
 except ImportError:
    raise SkipTest("Matplotlib not installed, not testing plots")
 extensions = ['npz']
 basedir = os.path.dirname(os.path.relpath(os.path.abspath(__file__)))
 def _image_directories():
    """
    Return the ``(baseline_dir, result_dir)`` pair used by the plotting tests.

    Both paths are built underneath the module-level ``basedir``. The result
    directory is created when it does not exist yet; the baseline directory
    is only named, never created.
    """
    baseline_dir = os.path.join(basedir, 'baseline','.')
    result_dir = os.path.join(basedir, 'testresult','.')
    if os.path.exists(result_dir):
        return baseline_dir, result_dir
    os.makedirs(result_dir)
    return baseline_dir, result_dir
 baseline_dir, result_dir = _image_directories()
 if not os.path.exists(baseline_dir):
    raise SkipTest("Not installed from source, baseline not available. Install from source to test plotting")
 def _image_comparison(baseline_images, extensions=['pdf','svg','png'], tol=11, rtol=1e-3, **kwargs):
    """
    Save every open matplotlib figure and yield one comparison callable per
    (figure, extension) pair.

    Open figures (``plt.get_fignums()``) are paired with ``baseline_images``
    by position. Each figure is written into ``result_dir``; for the ``npz``
    extension the flattened figure arrays are stored and a PNG is saved
    alongside on a best-effort basis.

    :param baseline_images: base file names (without extension), one per figure.
    :param extensions: file extensions to produce and compare.
    :param tol: tolerance handed to ``compare_images`` for image formats.
    :param rtol: relative tolerance for ``np.testing.assert_allclose`` (npz).
    :param kwargs: extra keyword arguments for ``np.testing.assert_allclose``.
    :returns: generator of zero-argument callables. A callable raises
        ``SkipTest`` on a mismatch, and ``IOError`` when the npz baseline is
        missing (the fresh result is copied into place first).
    """
    for num, base in zip(plt.get_fignums(), baseline_images):
        for ext in extensions:
            fig = plt.figure(num)
            try:
                fig.canvas.draw()
            except Exception:
                # Best effort: log which figure failed to draw and carry on.
                logging.error(base)
                #raise SkipTest(e)
            #fig.axes[0].set_axis_off()
            #fig.set_frameon(False)
            if ext in ['npz']:
                figdict = flatten_axis(fig)
                np.savez_compressed(os.path.join(result_dir, "{}.{}".format(base, ext)), **figdict)
                try:
                    fig.savefig(os.path.join(result_dir, "{}.{}".format(base, 'png')),
                                transparent=True,
                                edgecolor='none',
                                facecolor='none',
                                #bbox='tight'
                                )
                except Exception:
                    # The PNG is only a visual aid; catch Exception rather than
                    # everything so KeyboardInterrupt/SystemExit still propagate.
                    logging.error(base)
                    # raise
            else:
                fig.savefig(os.path.join(result_dir, "{}.{}".format(base, ext)),
                            transparent=True,
                            edgecolor='none',
                            facecolor='none',
                            #bbox='tight'
                            )
    for num, base in zip(plt.get_fignums(), baseline_images):
        for ext in extensions:
            #plt.close(num)
            actual = os.path.join(result_dir, "{}.{}".format(base, ext))
            expected = os.path.join(baseline_dir, "{}.{}".format(base, ext))
            # The loop variables are bound as default arguments so that every
            # yielded callable keeps its own paths even if the consumer runs
            # it after the generator has advanced (closures bind late).
            if ext == 'npz':
                def do_test(expected=expected, actual=actual, base=base):
                    if not os.path.exists(expected):
                        import shutil
                        shutil.copy2(actual, expected)
                        #shutil.copy2(os.path.join(result_dir, "{}.{}".format(base, 'png')), os.path.join(baseline_dir, "{}.{}".format(base, 'png')))
                        raise IOError("Baseline file {} not found, copying result {}".format(expected, actual))
                    else:
                        exp_dict = dict(np.load(expected).items())
                        act_dict = dict(np.load(actual).items())
                        # Only arrays present in both files are compared.
                        for name in act_dict:
                            if name in exp_dict:
                                try:
                                    np.testing.assert_allclose(exp_dict[name], act_dict[name], err_msg="Mismatch in {}.{}".format(base, name), rtol=rtol, **kwargs)
                                except AssertionError as e:
                                    raise SkipTest(e)
            else:
                def do_test(expected=expected, actual=actual):
                    err = compare_images(expected, actual, tol, in_decorator=True)
                    if err:
                        raise SkipTest("Error between {} and {} is {:.5f}, which is bigger then the tolerance of {:.5f}".format(actual, expected, err['rms'], tol))
            yield do_test
    plt.close('all')
 def flatten_axis(ax, prevname=''):
    """
    Collect the numpy arrays reachable from the members of ``ax`` into a flat
    dictionary.

    Direct ndarray members are stored under ``prevname + member_name``.
    Non-empty list members are walked element by element; nested lists add
    ``[i]`` to the key, nested dicts add ``.key``, and any other object is
    flattened recursively through its own members.

    :param ax: object to inspect (e.g. a matplotlib figure or axis).
    :param prevname: prefix prepended to every generated key.
    :returns: dict mapping generated names to numpy arrays.
    """
    import inspect
    members = inspect.getmembers(ax)
    arrays = {}
    def _flatten(l, pre):
        # Flatten one list element ``l`` whose key prefix is ``pre``.
        arr = {}
        if isinstance(l, np.ndarray):
            # Empty arrays nested in lists are dropped.
            if l.size:
                arr[pre] = np.asarray(l)
        elif isinstance(l, dict):
            # Recurse into each *value*. Recursing on the dict itself never
            # terminates for a non-empty dict.
            for _n in l:
                arr.update(_flatten(l[_n], "{}.{}".format(pre, _n)))
        elif isinstance(l, list) and len(l)>0:
            for i in range(len(l)):
                _tmp = _flatten(l[i], pre+"[{}]".format(i))
                for _n in _tmp:
                    arr["{}".format(_n)] = _tmp[_n]
        else:
            return flatten_axis(l, pre+'.')
        return arr
    for name, l in members:
        if isinstance(l, np.ndarray):
            arrays[prevname+name] = np.asarray(l)
        elif isinstance(l, list) and len(l)>0:
            for i in range(len(l)):
                _tmp = _flatten(l[i], prevname+name+"[{}]".format(i))
                for _n in _tmp:
                    arrays["{}".format(_n)] = _tmp[_n]
    return arrays
 def _a(x,y,decimal):
    np.testing.assert_array_almost_equal(x, y, decimal)
 def compare_axis_dicts(x, y, decimal=6):
    """
    Compare two dictionaries of arrays entry by entry.

    :param x: dict of arrays; its keys drive the comparison.
    :param y: dict of arrays looked up with the keys of ``x``.
    :param decimal: number of decimal places that must agree.
    :raises SkipTest: when the lengths differ or any pair of arrays differs.
    """
    try:
        assert(len(x)==len(y))
        for name in x:
            _a(x[name], y[name], decimal)
    except AssertionError as e:
        # Exceptions have no ``message`` attribute on Python 3; str(e) works
        # on both major versions.
        raise SkipTest(str(e))
 def test_figure():
    """
    Generator test: draw four canvases through the active plotting library
    (imshow_interact, annotation_heatmap_interact, fill_gradient and a 3d
    line plot) and yield one comparison per ``coverage_*`` baseline.
    """
    np.random.seed(1239847)
    from GPy.plotting import plotting_library as pl
    #import matplotlib
    # Reset rcParams to the defaults and switch usetex off before drawing.
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams[u'text.usetex'] = False
    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ax, _ = pl().new_canvas(num="imshow_interact")
        def test_func(x):
            # First input column reshaped to a 3x3 image (resolution=3 below).
            return x[:, 0].reshape(3,3)
        pl().imshow_interact(ax, test_func, extent=(-1,1,-1,1), resolution=3)
        ax, _ = pl().new_canvas()
        def test_func_2(x):
            # Returns the image plus a per-cell annotation (argmax over columns).
            y = x[:, 0].reshape(3,3)
            anno = np.argmax(x, axis=1).reshape(3,3)
            return y, anno
        pl().annotation_heatmap_interact(ax, test_func_2, extent=(-1,1,-1,1), resolution=3)
        pl().annotation_heatmap_interact(ax, test_func_2, extent=(-1,1,-1,1), resolution=3, imshow_kwargs=dict(interpolation='nearest'))
        ax, _ = pl().new_canvas(figsize=(4,3))
        x = np.linspace(0,1,100)
        y = [0,1,2]
        array = np.array([.4,.5])
        cmap = matplotlib.colors.LinearSegmentedColormap.from_list('WhToColor', ('r', 'b'), N=array.size)
        pl().fill_gradient(ax, x, y, facecolors=['r', 'g'], array=array, cmap=cmap)
        ax, _ = pl().new_canvas(num="3d_plot", figsize=(4,3), projection='3d', xlabel='x', ylabel='y', zlabel='z', title='awsome title', xlim=(-1,1), ylim=(-1,1), zlim=(-3,3))
        z = 2-np.abs(np.linspace(-2,2,(100)))+1
        x, y = z*np.sin(np.linspace(-2*np.pi,2*np.pi,(100))), z*np.cos(np.linspace(-np.pi,np.pi,(100)))
        pl().plot(ax, x, y, z, linewidth=2)
        # Baseline names are paired with the open figures by position.
        for do_test in _image_comparison(
                baseline_images=['coverage_{}'.format(sub) for sub in ["imshow_interact",'annotation_interact','gradient','3d_plot',]],
                extensions=extensions):
            yield (do_test, )
 def test_kernel():
    """
    Generator test: plot the ARD view and several covariance views of a
    combined kernel and yield one comparison per ``kern_*`` baseline.
    """
    np.random.seed(1239847)
    #import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams[u'text.usetex'] = False
    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        k = GPy.kern.RBF(5, ARD=True) * GPy.kern.Linear(3, active_dims=[0,2,4], ARD=True) + GPy.kern.Bias(2)
        k.randomize()
        k2 = GPy.kern.RBF(5, ARD=True) * GPy.kern.Linear(3, active_dims=[0,2,4], ARD=True) + GPy.kern.Bias(2) + GPy.kern.White(4)
        # Copy the randomized values of k into all but the last parameter of k2.
        k2[:-1] = k[:]
        k2.plot_ARD(['rbf', 'linear', 'bias'], legend=True)
        k2.plot_covariance(visible_dims=[0, 3], plot_limits=(-1,3))
        k2.plot_covariance(visible_dims=[2], plot_limits=(-1, 3))
        k2.plot_covariance(visible_dims=[2, 4], plot_limits=((-1, 0), (5, 3)), projection='3d', rstride=10, cstride=10)
        k2.plot_covariance(visible_dims=[1, 4])
        # Baseline names are paired with the open figures by position.
        for do_test in _image_comparison(
                baseline_images=['kern_{}'.format(sub) for sub in ["ARD", 'cov_2d', 'cov_1d', 'cov_3d', 'cov_no_lim']],
                extensions=extensions):
            yield (do_test, )
 def test_plot():
    """
    Generator test: call the individual plot methods of a 1-d
    SparseGPRegression model built with input variance and yield one
    comparison per ``gp_*`` baseline.
    """
    np.random.seed(111)
    import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams[u'text.usetex'] = False
    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Noisy samples of a smooth 1-d function on [-2, 2].
        X = np.random.uniform(-2, 2, (40, 1))
        f = .2 * np.sin(1.3*X) + 1.3*np.cos(2*X)
        Y = f+np.random.normal(0, .1, f.shape)
        m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X)*[0.06])
        #m.optimize()
        m.plot_data()
        m.plot_mean()
        m.plot_confidence()
        m.plot_density()
        m.plot_errorbars_trainset()
        m.plot_samples()
        m.plot_data_error()
    # Baseline names are paired with the open figures by position.
    for do_test in _image_comparison(baseline_images=['gp_{}'.format(sub) for sub in ["data", "mean", 'conf',
                                                                                      'density',
                                                                                      'out_error',
                                                                                      'samples', 'in_error']], extensions=extensions):
        yield (do_test, )
 def test_twod():
    """
    Generator test: plot data, mean, inducing inputs and input error of a
    SparseGPRegression model with two input columns and yield one comparison
    per ``gp_2d_*`` baseline.
    """
    np.random.seed(11111)
    import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams[u'text.usetex'] = False
    X = np.random.uniform(-2, 2, (40, 2))
    f = .2 * np.sin(1.3*X[:,[0]]) + 1.3*np.cos(2*X[:,[1]])
    Y = f+np.random.normal(0, .1, f.shape)
    # A different input variance is used for each of the two columns.
    m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X)*[0.01, 0.2])
    #m.optimize()
    m.plot_data()
    m.plot_mean()
    m.plot_inducing(legend=False, marker='s')
    #m.plot_errorbars_trainset()
    m.plot_data_error()
    # Baseline names are paired with the open figures by position.
    for do_test in _image_comparison(baseline_images=['gp_2d_{}'.format(sub) for sub in ["data", "mean",
                                                                                         'inducing',
                                                                                         #'out_error',
                                                                                         'in_error',
                                                                                         ]], extensions=extensions):
        yield (do_test, )
 def test_threed():
    """
    Generator test: draw 3d-projection plots (data, mean, inducing inputs) of
    a SparseGPRegression model with two input columns and yield one
    comparison per ``gp_3d_*`` baseline.
    """
    np.random.seed(11111)
    import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams[u'text.usetex'] = False
    X = np.random.uniform(-2, 2, (40, 2))
    f = .2 * np.sin(1.3*X[:,[0]]) + 1.3*np.cos(2*X[:,[1]])
    Y = f+np.random.normal(0, .1, f.shape)
    m = GPy.models.SparseGPRegression(X, Y)
    m.likelihood.variance = .1
    #m.optimize()
    # The sample plots are drawn but closed again before the compared figures
    # are created, so they are only exercised, not compared.
    m.plot_samples(projection='3d', samples=1)
    m.plot_samples(projection='3d', plot_raw=False, samples=1)
    plt.close('all')
    m.plot_data(projection='3d')
    m.plot_mean(projection='3d', rstride=10, cstride=10)
    m.plot_inducing(projection='3d')
    #m.plot_errorbars_trainset(projection='3d')
    for do_test in _image_comparison(baseline_images=[
        'gp_3d_{}'.format(sub) for sub in ["data", "mean", 'inducing',
    ]], extensions=extensions):
        yield (do_test, )
 def test_sparse():
    """
    Generator test: draw the data and its input error of a SparseGPRegression
    model on one shared axis and compare against ``sparse_gp_data_error``.
    """
    np.random.seed(11111)
    import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams[u'text.usetex'] = False
    X = np.random.uniform(-2, 2, (40, 1))
    f = .2 * np.sin(1.3*X) + 1.3*np.cos(2*X)
    Y = f+np.random.normal(0, .1, f.shape)
    m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X)*0.1)
    #m.optimize()
    #m.plot_inducing()
    # Both plots go onto the same axis, so only one figure is produced.
    _, ax = plt.subplots()
    m.plot_data(ax=ax)
    m.plot_data_error(ax=ax)
    for do_test in _image_comparison(baseline_images=['sparse_gp_{}'.format(sub) for sub in ['data_error']], extensions=extensions):
        yield (do_test, )
 def test_classification():
    """
    Generator test: plot a GPClassification model with three combinations of
    ``plot_raw`` / ``apply_link`` (one figure each) and yield one comparison
    per ``gp_class_*`` baseline.
    """
    np.random.seed(11111)
    import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams[u'text.usetex'] = False
    X = np.random.uniform(-2, 2, (40, 1))
    f = .2 * np.sin(1.3*X) + 1.3*np.cos(2*X)
    Y = f+np.random.normal(0, .1, f.shape)
    # Binary labels: whether each target lies above the mean target.
    m = GPy.models.GPClassification(X, Y>Y.mean())
    #m.optimize()
    _, ax = plt.subplots()
    m.plot(plot_raw=False, apply_link=False, ax=ax, samples=3)
    m.plot_errorbars_trainset(plot_raw=False, apply_link=False, ax=ax)
    _, ax = plt.subplots()
    m.plot(plot_raw=True, apply_link=False, ax=ax, samples=3)
    m.plot_errorbars_trainset(plot_raw=True, apply_link=False, ax=ax)
    _, ax = plt.subplots()
    m.plot(plot_raw=True, apply_link=True, ax=ax, samples=3)
    m.plot_errorbars_trainset(plot_raw=True, apply_link=True, ax=ax)
    for do_test in _image_comparison(baseline_images=['gp_class_{}'.format(sub) for sub in ["likelihood", "raw", 'raw_link']], extensions=extensions):
        yield (do_test, )
 def test_sparse_classification():
    """
    Generator test: plot a SparseGPClassification model with three
    combinations of ``plot_raw`` / ``apply_link`` and yield one comparison per
    ``sparse_gp_class_*`` baseline.
    """
    np.random.seed(11111)
    import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams[u'text.usetex'] = False
    X = np.random.uniform(-2, 2, (40, 1))
    f = .2 * np.sin(1.3*X) + 1.3*np.cos(2*X)
    Y = f+np.random.normal(0, .1, f.shape)
    # Binary labels: whether each target lies above the mean target.
    m = GPy.models.SparseGPClassification(X, Y>Y.mean())
    #m.optimize()
    m.plot(plot_raw=False, apply_link=False, samples_likelihood=3)
    # The RNG is re-seeded before each of the remaining plots.
    np.random.seed(111)
    m.plot(plot_raw=True, apply_link=False, samples=3)
    np.random.seed(111)
    m.plot(plot_raw=True, apply_link=True, samples=3)
    # Note the much looser relative tolerance (rtol=2) used for this model.
    for do_test in _image_comparison(baseline_images=['sparse_gp_class_{}'.format(sub) for sub in ["likelihood", "raw", 'raw_link']], extensions=extensions, rtol=2):
        yield (do_test, )
 def test_gplvm():
    """
    Generator test: rebuild a GPLVM from the parameters stored in
    ``b-gplvm-save.npz`` and compare its latent, 3d scatter, magnification and
    steepest-gradient plots against the ``gplvm_*`` baselines.
    """
    from GPy.models import GPLVM
    np.random.seed(12345)
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams[u'text.usetex'] = False
    #Q = 3
    # Define dataset
    #N = 60
    #k1 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,10,10,0.1,0.1]), ARD=True)
    #k2 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,0.1,10,0.1,10]), ARD=True)
    #k3 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[0.1,0.1,10,10,10]), ARD=True)
    #X = np.random.normal(0, 1, (N, 5))
    #A = np.random.multivariate_normal(np.zeros(N), k1.K(X), Q).T
    #B = np.random.multivariate_normal(np.zeros(N), k2.K(X), Q).T
    #C = np.random.multivariate_normal(np.zeros(N), k3.K(X), Q).T
    #Y = np.vstack((A,B,C))
    #labels = np.hstack((np.zeros(A.shape[0]), np.ones(B.shape[0]), np.ones(C.shape[0])*2))
    #k = RBF(Q, ARD=True, lengthscale=2)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
    # Data, latent dimensionality, labels and model parameters come from a
    # saved archive next to this module instead of being regenerated.
    pars = np.load(os.path.join(basedir, 'b-gplvm-save.npz'))
    Y = pars['Y']
    Q = pars['Q']
    labels = pars['labels']
    import warnings
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always')  # always print
        m = GPLVM(Y, Q, initialize=False)
    # Updates are switched off while the stored parameter vector is written
    # into the model, then switched back on.
    m.update_model(False)
    m.initialize_parameter()
    m[:] = pars['gplvm_p']
    m.update_model(True)
    #m.optimize(messages=0)
    np.random.seed(111)
    m.plot_latent(labels=labels)
    np.random.seed(111)
    m.plot_scatter(projection='3d', labels=labels)
    np.random.seed(111)
    m.plot_magnification(labels=labels)
    m.plot_steepest_gradient_map(resolution=10, data_labels=labels)
    for do_test in _image_comparison(baseline_images=['gplvm_{}'.format(sub) for sub in ["latent", "latent_3d", "magnification", 'gradient']],
                                     extensions=extensions,
                                     tol=12):
        yield (do_test, )
 def test_bayesian_gplvm():
    """
    Generator test: rebuild a BayesianGPLVM from the parameters stored in
    ``b-gplvm-save.npz`` and compare its inducing, latent, 3d scatter,
    magnification and steepest-gradient plots against the
    ``bayesian_gplvm_*`` baselines.
    """
    from ..models import BayesianGPLVM
    np.random.seed(12345)
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    #matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams[u'text.usetex'] = False
    #Q = 3
    # Define dataset
    #N = 10
    #k1 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,10,10,0.1,0.1]), ARD=True)
    #k2 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,0.1,10,0.1,10]), ARD=True)
    #k3 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[0.1,0.1,10,10,10]), ARD=True)
    #X = np.random.normal(0, 1, (N, 5))
    #A = np.random.multivariate_normal(np.zeros(N), k1.K(X), Q).T
    #B = np.random.multivariate_normal(np.zeros(N), k2.K(X), Q).T
    #C = np.random.multivariate_normal(np.zeros(N), k3.K(X), Q).T
    #Y = np.vstack((A,B,C))
    #labels = np.hstack((np.zeros(A.shape[0]), np.ones(B.shape[0]), np.ones(C.shape[0])*2))
    #k = RBF(Q, ARD=True, lengthscale=2)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
    # Data, latent dimensionality, labels and model parameters come from a
    # saved archive next to this module instead of being regenerated.
    pars = np.load(os.path.join(basedir, 'b-gplvm-save.npz'))
    Y = pars['Y']
    Q = pars['Q']
    labels = pars['labels']
    import warnings
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always')  # always print
        m = BayesianGPLVM(Y, Q, initialize=False)
    # Updates are switched off while the stored parameter vector is written
    # into the model, then switched back on.
    m.update_model(False)
    m.initialize_parameter()
    m[:] = pars['bgplvm_p']
    m.update_model(True)
    #m.optimize(messages=0)
    # The RNG is re-seeded before every plot call.
    np.random.seed(111)
    m.plot_inducing(projection='2d')
    np.random.seed(111)
    m.plot_inducing(projection='3d')
    np.random.seed(111)
    m.plot_latent(projection='2d', labels=labels)
    np.random.seed(111)
    m.plot_scatter(projection='3d', labels=labels)
    np.random.seed(111)
    m.plot_magnification(labels=labels)
    np.random.seed(111)
    m.plot_steepest_gradient_map(resolution=10, data_labels=labels)
    for do_test in _image_comparison(baseline_images=['bayesian_gplvm_{}'.format(sub) for sub in ["inducing", "inducing_3d", "latent", "latent_3d", "magnification", 'gradient']], extensions=extensions):
        yield (do_test, )
 if __name__ == '__main__':
    import nose
    nose.main(defaultTest='./plotting_tests.py')
--- a/GPy/testing/run_coverage.sh
+++ b/GPy/testing/run_coverage.sh
@ -1 +1 @@
-nosetests . --with-coverage --logging-level=INFO --cover-html --cover-html-dir=coverage --cover-package=GPy --cover-erase
+pytest .
--- a/GPy/testing/rv_transformation_tests.py
+++ b/GPy/testing/rv_transformation_tests.py
@ -1,117 +0,0 @@
 # Written by Ilias Bilionis
 """
 Test if hyperparameters in models are properly transformed.
 """
 import unittest
 import numpy as np
 import scipy.stats as st
 import GPy
 class TestModel(GPy.core.Model):
    """
    Minimal GPy model that exposes a single linked parameter named ``theta``.
    """
    def __init__(self, theta=1.):
        super(TestModel, self).__init__('test_model')
        # Wrap the initial value in a Param and attach it to the model.
        self.link_parameter(GPy.core.Param('theta', theta))
    def log_likelihood(self):
        # Constant log-likelihood.
        return 0.
 class RVTransformationTestCase(unittest.TestCase):
    """
    Checks that a prior placed on ``theta`` is handled consistently when the
    parameter is constrained with a transformation.
    """
    def _test_trans(self, trans):
        """
        Compare the model's density over the transformed variable with a
        kernel density estimate built from transformed prior samples.

        :param trans: constraint object providing ``finv``.
        """
        m = TestModel()
        prior = GPy.priors.LogGaussian(.5, 0.1)
        m.theta.set_prior(prior)
        m.theta.unconstrain()
        m.theta.constrain(trans)
        # The PDF of the transformed variables
        p_phi = lambda phi : np.exp(-m._objective_grads(phi)[0])
        # To the empirical PDF of:
        # (the sample count is passed as an integer; a float such as 1e5 is
        # not a valid size for numpy's random generators)
        theta_s = prior.rvs(100000)
        phi_s = trans.finv(theta_s)
        # which is essentially a kernel density estimation
        kde = st.gaussian_kde(phi_s)
        # We will compare the PDF here:
        phi = np.linspace(phi_s.min(), phi_s.max(), 100)
        # The transformed PDF of phi should be this:
        pdf_phi = np.array([p_phi(p) for p in phi])
        # UNCOMMENT TO SEE GRAPHICAL COMPARISON
        #import matplotlib.pyplot as plt
        #fig, ax = plt.subplots()
        #ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Histogram')
        #ax.plot(phi, kde(phi), '--', linewidth=2, label='Kernel Density Estimation')
        #ax.plot(phi, pdf_phi, ':', linewidth=2, label='Transformed PDF')
        #ax.set_xlabel(r'transformed $\theta$', fontsize=16)
        #ax.set_ylabel('PDF', fontsize=16)
        #plt.legend(loc='best')
        #plt.show(block=True)
        # END OF PLOT
        # The following test cannot be very accurate
        self.assertTrue(np.linalg.norm(pdf_phi - kde(phi)) / np.linalg.norm(kde(phi)) <= 1e-1)
    def _test_grad(self, trans):
        """
        Run the gradient check on a 20-element ``theta`` constrained with
        ``trans`` and carrying a LogGaussian prior.

        :param trans: constraint object applied to ``theta``.
        """
        np.random.seed(1234)
        m = TestModel(np.random.uniform(.5, 1.5, 20))
        prior = GPy.priors.LogGaussian(.5, 0.1)
        m.theta.set_prior(prior)
        m.theta.constrain(trans)
        m.randomize()
        print(m)
        self.assertTrue(m.checkgrad(1))
    def test_Logexp(self):
        self._test_trans(GPy.constraints.Logexp())
    @unittest.skip("Gradient not checking right, @jameshensman what is going on here?")
    def test_Logexp_grad(self):
        self._test_grad(GPy.constraints.Logexp())
    def test_Exponent(self):
        self._test_trans(GPy.constraints.Exponent())
    @unittest.skip("Gradient not checking right, @jameshensman what is going on here?")
    def test_Exponent_grad(self):
        self._test_grad(GPy.constraints.Exponent())
 if __name__ == '__main__':
    unittest.main()
    quit()
    # NOTE(review): everything below sits after quit() and is therefore never
    # executed; it reads like a leftover interactive demo. It also uses
    # ``plt``, which is not among the imports visible at the top of this
    # module - confirm before reviving it.
    m = TestModel()
    prior = GPy.priors.LogGaussian(0., .9)
    m.theta.set_prior(prior)
    # The following should return the PDF in terms of the transformed quantities
    p_phi = lambda phi : np.exp(-m._objective_grads(phi)[0])
    # Let's look at the transformation phi = log(exp(theta - 1))
    trans = GPy.constraints.Exponent()
    m.theta.constrain(trans)
    # Plot the transformed probability density
    phi = np.linspace(-8, 8, 100)
    fig, ax = plt.subplots()
    # Let's draw some samples of theta and transform them so that we see
    # which one is right
    theta_s = prior.rvs(10000)
    # Transform it to the new variables
    phi_s = trans.finv(theta_s)
    # And draw their histogram
    ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Empirical')
    # This is to be compared to the PDF of the model expressed in terms of these new
    # variables
    ax.plot(phi, [p_phi(p) for p in phi], label='Transformed PDF', linewidth=2)
    ax.set_xlim(-3, 10)
    ax.set_xlabel(r'transformed $\theta$', fontsize=16)
    ax.set_ylabel('PDF', fontsize=16)
    plt.legend(loc='best')
    # Now let's test the gradients
    m.checkgrad(verbose=True)
    # And show the plot
    plt.show(block=True)
--- a/GPy/testing/serialization_tests.py
+++ b/GPy/testing/serialization_tests.py
@ -1,279 +0,0 @@
 '''
 Created on 20 April 2017
@author: pgmoren
 '''
 import unittest, itertools
 #import cPickle as pickle
 import pickle
 import numpy as np
 import tempfile
 import GPy
 from nose import SkipTest
 import numpy as np
 import os
 fixed_seed = 11
 class Test(unittest.TestCase):
    def test_serialize_deserialize_kernels(self):
        k1 = GPy.kern.RBF(2, variance=1.0, lengthscale=[1.0,1.0], ARD=True)
        k2 = GPy.kern.RatQuad(2, variance=2.0, lengthscale=1.0, power=2.0, active_dims = [0,1])
        k3 = GPy.kern.Bias(2, variance=2.0, active_dims = [1,0])
        k4 = GPy.kern.StdPeriodic(2, variance=2.0, lengthscale=1.0, period=1.0, active_dims = [1,1])
        k5 = GPy.kern.Linear(2, variances=[2.0, 1.0], ARD=True, active_dims = [1,1])
        k6 = GPy.kern.Exponential(2, variance=1., lengthscale=2)
        k7 = GPy.kern.Matern32(2, variance=1.0, lengthscale=[1.0,3.0], ARD=True, active_dims = [1,1])
        k8 = GPy.kern.Matern52(2, variance=2.0, lengthscale=[2.0,1.0], ARD=True, active_dims = [1,0])
        k9 = GPy.kern.ExpQuad(2, variance=3.0, lengthscale=[1.0,2.0], ARD=True, active_dims = [0,1])
        k10 = GPy.kern.OU(2, variance=2.0, lengthscale=[2.0, 1.0], ARD=True, active_dims=[1, 0])
        k11 = k1 + k1.copy() + k2 + k3 + k4 + k5 + k6
        k12 = k1 * k2 * k2.copy() * k3 * k4 * k5
        k13 = (k1 + k2) * (k3 + k4 + k5)
        k14 = ((k1 + k2) * k3) + k4 + k5 * k7
        k15 = ((k1 + k2) * k3) + k4 * k5 + k8 * k10
        k16 = ((k1 * k2) * k3) + k4 * k5 + k8 + k9
        k_list = [k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15,k16]
        for kk in k_list:
            kk_dict = kk.to_dict()
            kk_r = GPy.kern.Kern.from_dict(kk_dict)
            assert type(kk) == type(kk_r)
            np.testing.assert_array_equal(kk[:], kk_r[:])
            np.testing.assert_array_equal(np.array(kk.active_dims), np.array(kk_r.active_dims))
    def test_serialize_deserialize_mappings(self):
        m1 = GPy.mappings.Identity(3,2)
        m2 = GPy.mappings.Constant(3,2,1)
        m2_r = GPy.core.mapping.Mapping.from_dict(m2.to_dict())
        np.testing.assert_array_equal(m2.C.values[:], m2_r.C.values[:])
        m3 = GPy.mappings.Linear(3,2)
        m3_r = GPy.core.mapping.Mapping.from_dict(m3.to_dict())
        assert np.all(m3.A == m3_r.A)
        m_list = [m1, m2, m3]
        for mm in m_list:
            mm_dict = mm.to_dict()
            mm_r = GPy.core.mapping.Mapping.from_dict(mm_dict)
            assert type(mm) == type(mm_r)
            assert type(mm.input_dim) == type(mm_r.input_dim)
            assert type(mm.output_dim) == type(mm_r.output_dim)
    def test_serialize_deserialize_likelihoods(self):
        l1 = GPy.likelihoods.Gaussian(GPy.likelihoods.link_functions.Identity(),variance=3.0)
        l1_r = GPy.likelihoods.likelihood.Likelihood.from_dict(l1.to_dict())
        l2 = GPy.likelihoods.Bernoulli(GPy.likelihoods.link_functions.Probit())
        l2_r = GPy.likelihoods.likelihood.Likelihood.from_dict(l2.to_dict())
        assert type(l1) == type(l1_r)
        assert np.all(l1.variance == l1_r.variance)
        assert type(l2) == type(l2_r)
    def test_serialize_deserialize_normalizers(self):
        n1 = GPy.util.normalizer.Standardize()
        n1.scale_by(np.random.rand(10))
        n1_r = GPy.util.normalizer._Norm.from_dict((n1.to_dict()))
        assert type(n1) == type(n1_r)
        assert np.all(n1.mean == n1_r.mean)
        assert np.all(n1.std == n1_r.std)
    def test_serialize_deserialize_link_functions(self):
        l1 = GPy.likelihoods.link_functions.Identity()
        l2 = GPy.likelihoods.link_functions.Probit()
        l_list = [l1, l2]
        for ll in l_list:
            ll_dict = ll.to_dict()
            ll_r = GPy.likelihoods.link_functions.GPTransformation.from_dict(ll_dict)
            assert type(ll) == type(ll_r)
    def test_serialize_deserialize_inference_methods(self):
        e1 = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode="nested")
        e1.ga_approx_old = GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10))
        e1._ep_approximation = []
        e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.posteriorParams(np.random.rand(10),np.random.rand(100).reshape((10,10))))
        e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10)))
        e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.cavityParams(10))
        e1._ep_approximation[-1].v = np.random.rand(10)
        e1._ep_approximation[-1].tau = np.random.rand(10)
        e1._ep_approximation.append(np.random.rand(10))
        e1_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e1.to_dict())
        assert type(e1) == type(e1_r)
        assert e1.epsilon==e1_r.epsilon
        assert e1.eta==e1_r.eta
        assert e1.delta==e1_r.delta
        assert e1.always_reset==e1_r.always_reset
        assert e1.max_iters==e1_r.max_iters
        assert e1.ep_mode==e1_r.ep_mode
        assert e1.parallel_updates==e1_r.parallel_updates
        np.testing.assert_array_equal(e1.ga_approx_old.tau[:], e1_r.ga_approx_old.tau[:])
        np.testing.assert_array_equal(e1.ga_approx_old.v[:], e1_r.ga_approx_old.v[:])
        np.testing.assert_array_equal(e1._ep_approximation[0].mu[:], e1_r._ep_approximation[0].mu[:])
        np.testing.assert_array_equal(e1._ep_approximation[0].Sigma[:], e1_r._ep_approximation[0].Sigma[:])
        np.testing.assert_array_equal(e1._ep_approximation[1].tau[:], e1_r._ep_approximation[1].tau[:])
        np.testing.assert_array_equal(e1._ep_approximation[1].v[:], e1_r._ep_approximation[1].v[:])
        np.testing.assert_array_equal(e1._ep_approximation[2].tau[:], e1_r._ep_approximation[2].tau[:])
        np.testing.assert_array_equal(e1._ep_approximation[2].v[:], e1_r._ep_approximation[2].v[:])
        np.testing.assert_array_equal(e1._ep_approximation[3][:], e1_r._ep_approximation[3][:])
        e2 = GPy.inference.latent_function_inference.expectation_propagation.EPDTC(ep_mode="nested")
        e2.ga_approx_old = GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10))
        e2._ep_approximation = []
        e2._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.posteriorParamsDTC(np.random.rand(10),np.random.rand(10)))
        e2._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10)))
        e2._ep_approximation.append(100.0)
        e2_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e2.to_dict())
        assert type(e2) == type(e2_r)
        assert e2.epsilon==e2_r.epsilon
        assert e2.eta==e2_r.eta
        assert e2.delta==e2_r.delta
        assert e2.always_reset==e2_r.always_reset
        assert e2.max_iters==e2_r.max_iters
        assert e2.ep_mode==e2_r.ep_mode
        assert e2.parallel_updates==e2_r.parallel_updates
        np.testing.assert_array_equal(e2.ga_approx_old.tau[:], e2_r.ga_approx_old.tau[:])
        np.testing.assert_array_equal(e2.ga_approx_old.v[:], e2_r.ga_approx_old.v[:])
        np.testing.assert_array_equal(e2._ep_approximation[0].mu[:], e2_r._ep_approximation[0].mu[:])
        np.testing.assert_array_equal(e2._ep_approximation[0].Sigma_diag[:], e2_r._ep_approximation[0].Sigma_diag[:])
        np.testing.assert_array_equal(e2._ep_approximation[1].tau[:], e2_r._ep_approximation[1].tau[:])
        np.testing.assert_array_equal(e2._ep_approximation[1].v[:], e2_r._ep_approximation[1].v[:])
        assert(e2._ep_approximation[2] == e2_r._ep_approximation[2])
        e3 = GPy.inference.latent_function_inference.exact_gaussian_inference.ExactGaussianInference()
        e3_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e3.to_dict())
        assert type(e3) == type(e3_r)
    def test_serialize_deserialize_GP(self):
        np.random.seed(fixed_seed)
        N = 20
        Nhalf = int(N/2)
        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
        kernel = GPy.kern.RBF(1)
        likelihood = GPy.likelihoods.Bernoulli()
        inference_method=GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode="nested")
        mean_function=None
        m = GPy.core.GP(X=X, Y=Y,  kernel=kernel, likelihood=likelihood, inference_method=inference_method, mean_function=mean_function, normalizer=True, name='gp_classification')
        m.optimize()
        m.save_model("temp_test_gp_with_data.json", compress=True, save_data=True)
        m.save_model("temp_test_gp_without_data.json", compress=True, save_data=False)
        m1_r = GPy.core.GP.load_model("temp_test_gp_with_data.json.zip")
        m2_r = GPy.core.GP.load_model("temp_test_gp_without_data.json.zip", (X,Y))
        os.remove("temp_test_gp_with_data.json.zip")
        os.remove("temp_test_gp_without_data.json.zip")
        var = m.predict(X)[0]
        var1_r = m1_r.predict(X)[0]
        var2_r = m2_r.predict(X)[0]
        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var2_r).flatten())
    def test_serialize_deserialize_SparseGP(self):
        np.random.seed(fixed_seed)
        N = 20
        Nhalf = int(N/2)
        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
        kernel = GPy.kern.RBF(1)
        likelihood = GPy.likelihoods.Bernoulli()
        inference_method=GPy.inference.latent_function_inference.expectation_propagation.EPDTC(ep_mode="nested")
        mean_function=None
        sm = GPy.core.SparseGP(X=X, Y=Y, Z=X[0:20,:], kernel=kernel, likelihood=likelihood, inference_method=inference_method, mean_function=mean_function, normalizer=True, name='sparse_gp_classification')
        sm.optimize()
        sm.save_model("temp_test_gp_with_data.json", compress=True, save_data=True)
        sm.save_model("temp_test_gp_without_data.json", compress=True, save_data=False)
        sm1_r = GPy.core.GP.load_model("temp_test_gp_with_data.json.zip")
        sm2_r = GPy.core.GP.load_model("temp_test_gp_without_data.json.zip", (X,Y))
        os.remove("temp_test_gp_with_data.json.zip")
        os.remove("temp_test_gp_without_data.json.zip")
        var = sm.predict(X)[0]
        var1_r = sm1_r.predict(X)[0]
        var2_r = sm2_r.predict(X)[0]
        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var2_r).flatten())
    def test_serialize_deserialize_GPRegressor(self):
        np.random.seed(fixed_seed)
        N = 50
        N_new = 50
        D = 1
        X = np.random.uniform(-3., 3., (N, 1))
        Y = np.sin(X) + np.random.randn(N, D) * 0.05
        X_new = np.random.uniform(-3., 3., (N_new, 1))
        k = GPy.kern.RBF(input_dim=1, lengthscale=10)
        m = GPy.models.GPRegression(X,Y,k)
        m.optimize()
        m.save_model("temp_test_gp_regressor_with_data.json", compress=True, save_data=True)
        m.save_model("temp_test_gp_regressor_without_data.json", compress=True, save_data=False)
        m1_r = GPy.models.GPRegression.load_model("temp_test_gp_regressor_with_data.json.zip")
        m2_r = GPy.models.GPRegression.load_model("temp_test_gp_regressor_without_data.json.zip", (X,Y))
        os.remove("temp_test_gp_regressor_with_data.json.zip")
        os.remove("temp_test_gp_regressor_without_data.json.zip")
        Xp = np.random.uniform(size=(int(1e5),1))
        Xp[:,0] = Xp[:,0]*15-5
        _, var = m.predict(Xp)
        _, var1_r = m1_r.predict(Xp)
        _, var2_r = m2_r.predict(Xp)
        np.testing.assert_array_equal(var.flatten(), var1_r.flatten())
        np.testing.assert_array_equal(var.flatten(), var2_r.flatten())
    def test_serialize_deserialize_GPClassification(self):
        np.random.seed(fixed_seed)
        N = 50
        Nhalf = int(N/2)
        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
        kernel = GPy.kern.RBF(1)
        m = GPy.models.GPClassification(X, Y, kernel=kernel)
        m.optimize()
        m.save_model("temp_test_gp_classifier_with_data.json", compress=True, save_data=True)
        m.save_model("temp_test_gp_classifier_without_data.json", compress=True, save_data=False)
        m1_r = GPy.models.GPClassification.load_model("temp_test_gp_classifier_with_data.json.zip")
        self.assertTrue(type(m) == type(m1_r), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m1_r)))
        m2_r = GPy.models.GPClassification.load_model("temp_test_gp_classifier_without_data.json.zip", (X,Y))
        self.assertTrue(type(m) == type(m2_r), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m2_r)))
        os.remove("temp_test_gp_classifier_with_data.json.zip")
        os.remove("temp_test_gp_classifier_without_data.json.zip")
        var = m.predict(X)[0]
        var1_r = m1_r.predict(X)[0]
        var2_r = m2_r.predict(X)[0]
        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
    def test_serialize_deserialize_SparseGPClassification(self):
        np.random.seed(fixed_seed)
        N = 50
        Nhalf = int(N/2)
        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
        kernel = GPy.kern.RBF(1)
        m = GPy.models.SparseGPClassification(X, Y, num_inducing=3, kernel=kernel)
        m.optimize()
        m.save_model("temp_test_sparse_gp_classifier_with_data.json", compress=True, save_data=True)
        m.save_model("temp_test_sparse_gp_classifier_without_data.json", compress=True, save_data=False)
        m1_r = GPy.models.SparseGPClassification.load_model("temp_test_sparse_gp_classifier_with_data.json.zip")
        self.assertTrue(type(m) == type(m1_r), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m1_r)))
        m2_r = GPy.models.SparseGPClassification.load_model("temp_test_sparse_gp_classifier_without_data.json.zip", (X,Y))
        self.assertTrue(type(m) == type(m2_r), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m2_r)))
        os.remove("temp_test_sparse_gp_classifier_with_data.json.zip")
        os.remove("temp_test_sparse_gp_classifier_without_data.json.zip")
        var = m.predict(X)[0]
        var1_r = m1_r.predict(X)[0]
        var2_r = m2_r.predict(X)[0]
        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
 if __name__ == "__main__":
    # Run every test in this module when the file is executed as a script.
    unittest.main()
--- a/GPy/testing/state_space_main_tests.py
+++ b/GPy/testing/state_space_main_tests.py
--- a/GPy/testing/svgp_tests.py
+++ b/GPy/testing/svgp_tests.py
@ -1,54 +0,0 @@
 import numpy as np
 import scipy as sp
 import GPy
 class SVGP_nonconvex(np.testing.TestCase):
    """
    Gradient check for an SVGP model with a Student-t likelihood.
    """
    def setUp(self):
        """Build the model on a noisy sine curve with one shifted observation."""
        inputs = np.linspace(0, 10, 100)[:, None]
        inducing = np.linspace(0, 10, 10)[:, None]
        noise = np.random.randn(*inputs.shape) * 0.1
        targets = np.sin(inputs) + noise
        # Shift a single observation (row 50) upwards by 3.
        targets[50] += 3
        likelihood = GPy.likelihoods.StudentT(deg_free=2)
        rbf = GPy.kern.RBF(1, lengthscale=5.)
        white = GPy.kern.White(1, 1e-6)
        self.m = GPy.core.SVGP(inputs, targets, Z=inducing, likelihood=likelihood, kernel=rbf + white)

    def test_grad(self):
        """The model's gradients must pass ``checkgrad`` with step 1e-4."""
        gradients_ok = self.m.checkgrad(step=1e-4)
        assert gradients_ok
 class SVGP_classification(np.testing.TestCase):
    """
    Gradient check for an SVGP model with a Bernoulli likelihood.
    """
    def setUp(self):
        """Build the model on 0/1 labels given by the sign of a noisy sine curve."""
        inputs = np.linspace(0, 10, 100)[:, None]
        inducing = np.linspace(0, 10, 10)[:, None]
        noisy_sine = np.sin(inputs) + np.random.randn(*inputs.shape) * 0.1
        labels = np.where(noisy_sine > 0, 1, 0)
        likelihood = GPy.likelihoods.Bernoulli()
        rbf = GPy.kern.RBF(1, lengthscale=5.)
        white = GPy.kern.White(1, 1e-6)
        self.m = GPy.core.SVGP(inputs, labels, Z=inducing, likelihood=likelihood, kernel=rbf + white)

    def test_grad(self):
        """The model's gradients must pass ``checkgrad`` with step 1e-4."""
        gradients_ok = self.m.checkgrad(step=1e-4)
        assert gradients_ok
 class SVGP_Poisson_with_meanfunction(np.testing.TestCase):
    """
    Gradient check for an SVGP model with a Poisson likelihood and a linear
    mean function.
    """
    def setUp(self):
        """Build the model on Poisson counts drawn from an exponentiated rate."""
        inputs = np.linspace(0, 10, 100)[:, None]
        inducing = np.linspace(0, 10, 10)[:, None]
        # NOTE(review): the exponent multiplies the linear and quadratic
        # terms; confirm that a product rather than a sum is intended.
        rates = np.exp(0.1*inputs * 0.05*inputs**2)
        # One Poisson draw per rate, taken in input order.
        draws = [np.random.poisson(rate) for rate in rates.flatten()]
        counts = np.array(draws).reshape(-1, 1)
        mean_function = GPy.mappings.Linear(1,1)
        likelihood = GPy.likelihoods.Poisson()
        rbf = GPy.kern.RBF(1, lengthscale=5.)
        white = GPy.kern.White(1, 1e-6)
        self.m = GPy.core.SVGP(inputs, counts, Z=inducing, likelihood=likelihood, kernel=rbf + white, mean_function=mean_function)

    def test_grad(self):
        """The model's gradients must pass ``checkgrad`` with step 1e-4."""
        gradients_ok = self.m.checkgrad(step=1e-4)
        assert gradients_ok
--- a/GPy/testing/test_cython.py
+++ b/GPy/testing/test_cython.py
@ -0,0 +1,118 @@
 import numpy as np
 from GPy.util import choleskies
 import GPy
 import pytest
 from ..util.config import config
 try:
    from ..util import choleskies_cython
    choleskies_cython_working = config.getboolean("cython", "working")
 except ImportError:
    choleskies_cython_working = False
 try:
    from ..kern.src import stationary_cython
    stationary_cython_working = config.getboolean("cython", "working")
 except ImportError:
    stationary_cython_working = False
 """
 These tests make sure that the pure python and cython codes work the same
 """
 class CythonTestChols:
    """Compare the pure-Python and Cython flat/triangular conversions.

    NOTE(review): the class name does not start with ``Test``, so pytest's
    default collection rules will not pick these tests up -- confirm whether
    that is intentional before renaming.
    """

    def setup(self):
        """Create a random flat array and a stack of three identity matrices."""
        self.flat = np.random.randn(45, 5)
        self.triang = np.array([np.eye(20) for i in range(3)])

    # The skip reason is passed by keyword: a second positional argument to
    # skipif is interpreted as another condition, not as the reason.
    @pytest.mark.skipif(
        not choleskies_cython_working,
        reason="Cython cholesky module has not been built on this machine",
    )
    def test_flat_to_triang(self):
        """Both flat-to-triangular implementations must return the same result."""
        # Build the fixtures explicitly, as the other test classes in this
        # module do, instead of relying on the runner to call setup().
        self.setup()
        L1 = choleskies._flat_to_triang_pure(self.flat)
        L2 = choleskies._flat_to_triang_cython(self.flat)
        assert np.allclose(L1, L2), "Triang mismatch!"

    @pytest.mark.skipif(
        not choleskies_cython_working,
        reason="Cython cholesky module has not been built on this machine",
    )
    def test_triang_to_flat(self):
        """Both triangular-to-flat implementations must return the same result."""
        self.setup()
        A1 = choleskies._triang_to_flat_pure(self.triang)
        A2 = choleskies._triang_to_flat_cython(self.triang)
        assert np.allclose(A1, A2), "Flat mismatch!"
 class TestStationary:
    """Check that the Cython and pure-Python gradient routines of an RBF kernel agree."""

    def setup(self):
        """Create the kernel, random inputs and random matrices fed to the gradient routines."""
        self.k = GPy.kern.RBF(10)
        # Tuple right-hand sides are evaluated left to right, so the random
        # draws happen in the order X, Z, dKxx, dKzz, dKxz.
        self.X, self.Z = np.random.randn(300, 10), np.random.randn(20, 10)
        self.dKxx, self.dKzz = np.random.randn(300, 300), np.random.randn(20, 20)
        self.dKxz = np.random.randn(300, 20)

    @pytest.mark.skipif(
        not stationary_cython_working,
        reason="Cython stationary module has not been built on this machine",
    )
    def test_square_gradX(self):
        """Gradients with respect to X, called with X only."""
        self.setup()
        kernel = self.k
        from_cython = kernel._gradients_X_cython(self.dKxx, self.X)
        from_pure = kernel._gradients_X_pure(self.dKxx, self.X)
        assert np.allclose(from_cython, from_pure), "Gradient mismatch on square X!"

    @pytest.mark.skipif(
        not stationary_cython_working,
        reason="Cython stationary module has not been built on this machine",
    )
    def test_rect_gradx(self):
        """Gradients with respect to X, called with X and Z."""
        self.setup()
        kernel = self.k
        from_cython = kernel._gradients_X_cython(self.dKxz, self.X, self.Z)
        from_pure = kernel._gradients_X_pure(self.dKxz, self.X, self.Z)
        assert np.allclose(from_cython, from_pure), "Gradient mismatch on rect X!"

    @pytest.mark.skipif(
        not stationary_cython_working,
        reason="Cython stationary module has not been built on this machine",
    )
    def test_square_lengthscales(self):
        """Lengthscale gradients, called with X as both inputs."""
        self.setup()
        kernel = self.k
        from_pure = kernel._lengthscale_grads_pure(self.dKxx, self.X, self.X)
        from_cython = kernel._lengthscale_grads_cython(self.dKxx, self.X, self.X)
        assert np.allclose(from_pure, from_cython), "Gradient mismatch on square lengthscale!"

    @pytest.mark.skipif(
        not stationary_cython_working,
        reason="Cython stationary module has not been built on this machine",
    )
    def test_rect_lengthscales(self):
        """Lengthscale gradients, called with X and Z."""
        self.setup()
        kernel = self.k
        from_pure = kernel._lengthscale_grads_pure(self.dKxz, self.X, self.Z)
        from_cython = kernel._lengthscale_grads_cython(self.dKxz, self.X, self.Z)
        assert np.allclose(from_pure, from_cython), "Gradient mismatch on rect lengthscale!"
 class TestCholeskiesBackprop:
    """Check that the Cython Cholesky back-propagation routines match the pure-Python one."""

    def setup(self):
        """Factorise a random matrix product with ``jitchol`` and draw a random 10x10 matrix."""
        base = np.random.randn(10, 12)
        gram = np.dot(base, base.T)
        self.L = GPy.util.linalg.jitchol(gram)
        self.dL = np.random.randn(10, 10)

    @pytest.mark.skipif(
        not choleskies_cython_working,
        reason="Cython cholesky module has not been built on this machine",
    )
    def test_backprop(self):
        """Both Cython variants must agree with the pure-Python reference."""
        self.setup()
        dL, L = self.dL, self.L
        reference = choleskies._backprop_gradient_pure(dL, L)
        cython_result = choleskies_cython.backprop_gradient(dL, L)
        cython_par_result = choleskies_cython.backprop_gradient_par_c(dL, L)
        assert np.allclose(reference, cython_result), "Gradient mismatch!"
        assert np.allclose(reference, cython_par_result), "Gradient mismatch!"
--- a/GPy/testing/ep_likelihood_tests.py
+++ b/GPy/testing/ep_likelihood_tests.py
@ -1,17 +1,19 @@
-
+import pytest
 import numpy as np
 import unittest
 import GPy
-from GPy.models import GradientChecker
+
 fixed_seed = 10
-from nose.tools import with_setup, nottest
+
 def rmse(Y, Ystar):
    return np.sqrt(np.mean((Y - Ystar) ** 2))
 # this file will contain some high level tests, this is not unit testing, but will give us a higher level estimate
 # if things are going well under the hood.
-class TestObservationModels(unittest.TestCase):
+class TestObservationModels:
-    def setUp(self):
+    def setup(self):
        np.random.seed(fixed_seed)
        self.N = 100
        self.D = 2
@ -22,7 +24,7 @@ class TestObservationModels(unittest.TestCase):
        self.Y = (np.sin(self.X[:, 0] * 2 * np.pi) + noise)[:, None]
        self.num_points = self.X.shape[0]
        self.f = np.random.rand(self.N, 1)
-        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
+        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=int)[:, None]
        # self.binary_Y[self.binary_Y == 0.0] = -1.0
        self.positive_Y = np.exp(self.Y.copy())
@ -31,45 +33,72 @@ class TestObservationModels(unittest.TestCase):
        self.Y_noisy[75] += 1.3
        self.init_var = 0.15
-        self.deg_free = 4.
+        self.deg_free = 4.0
        censored = np.zeros_like(self.Y)
        random_inds = np.random.choice(self.N, int(self.N / 2), replace=True)
        censored[random_inds] = 1
        self.Y_metadata = dict()
-        self.Y_metadata['censored'] = censored
+        self.Y_metadata["censored"] = censored
        self.kernel1 = GPy.kern.RBF(self.X.shape[1]) + GPy.kern.White(self.X.shape[1])
-    def tearDown(self):
+    def tear_down(self):
        self.Y = None
        self.X = None
-        self.binary_Y =None
+        self.binary_Y = None
        self.positive_Y = None
        self.kernel1 = None
-    @with_setup(setUp, tearDown)
+    def test_epccassification(self):
-    def testEPClassification(self):
+        self.setup()
        bernoulli = GPy.likelihoods.Bernoulli()
        laplace_inf = GPy.inference.latent_function_inference.Laplace()
-        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
+        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode="alternated")
-        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
+        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode="nested")
-        ep_inf_fractional = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.9)
+        ep_inf_fractional = GPy.inference.latent_function_inference.EP(
            ep_mode="nested", eta=0.9
        )
-        m1 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=laplace_inf)
+        m1 = GPy.core.GP(
            self.X,
            self.binary_Y.copy(),
            kernel=self.kernel1.copy(),
            likelihood=bernoulli.copy(),
            inference_method=laplace_inf,
        )
        m1.randomize()
-        m2 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_alt)
+        m2 = GPy.core.GP(
            self.X,
            self.binary_Y.copy(),
            kernel=self.kernel1.copy(),
            likelihood=bernoulli.copy(),
            inference_method=ep_inf_alt,
        )
        m2.randomize()
-        m3 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_nested)
+        m3 = GPy.core.GP(
            self.X,
            self.binary_Y.copy(),
            kernel=self.kernel1.copy(),
            likelihood=bernoulli.copy(),
            inference_method=ep_inf_nested,
        )
        m3.randomize()
        #
-        m4 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_fractional)
+        m4 = GPy.core.GP(
            self.X,
            self.binary_Y.copy(),
            kernel=self.kernel1.copy(),
            likelihood=bernoulli.copy(),
            inference_method=ep_inf_fractional,
        )
        m4.randomize()
-        optimizer = 'bfgs'
+        optimizer = "bfgs"
-        #do gradcheck here ...
+        # do gradcheck here ...
        # self.assertTrue(m1.checkgrad())
        # self.assertTrue(m2.checkgrad())
        # self.assertTrue(m3.checkgrad())
@ -86,35 +115,53 @@ class TestObservationModels(unittest.TestCase):
        probs_mean_ep_nested, probs_var_ep_nested = m3.predict(self.X)
        # for simple single dimension data , marginal likelihood for laplace and EP approximations should not be so far apart.
-        self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(),delta=1)
+        # TODO: the below were assertAlmostEqual, not sure if allclose will do the job here
-        self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), delta=1)
+        #     I replace the old delta with the atol
-        self.assertAlmostEqual(m1.log_likelihood(), m4.log_likelihood(), delta=5)
+        assert np.allclose(m1.log_likelihood(), m2.log_likelihood(), atol=1.0)
        assert np.allclose(m1.log_likelihood(), m3.log_likelihood(), atol=1)
        assert np.allclose(m1.log_likelihood(), m4.log_likelihood(), atol=5.0)
        GPy.util.classification.conf_matrix(probs_mean_lap, self.binary_Y)
        GPy.util.classification.conf_matrix(probs_mean_ep_alt, self.binary_Y)
        GPy.util.classification.conf_matrix(probs_mean_ep_nested, self.binary_Y)
-    @nottest
+    @pytest.mark.skip(
-    def rmse(self, Y, Ystar):
+        "Fails as a consequence of fixing the DSYR function. Needs to be reviewed!"
-        return np.sqrt(np.mean((Y - Ystar) ** 2))
+    )
    def test_ep_with_studentt(self):
        self.setup()
        self.tear_down()
-    @with_setup(setUp, tearDown)
+        studentT = GPy.likelihoods.StudentT(
-    @unittest.skip("Fails as a consequence of fixing the DSYR function. Needs to be reviewed!")
+            deg_free=self.deg_free, sigma2=self.init_var
-    def test_EP_with_StudentT(self):
+        )
        studentT = GPy.likelihoods.StudentT(deg_free=self.deg_free, sigma2=self.init_var)
        laplace_inf = GPy.inference.latent_function_inference.Laplace()
-        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
+        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode="alternated")
-        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
+        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode="nested")
-        ep_inf_frac = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.7)
+        ep_inf_frac = GPy.inference.latent_function_inference.EP(
            ep_mode="nested", eta=0.7
        )
-        m1 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=laplace_inf)
+        m1 = GPy.core.GP(
            self.X.copy(),
            self.Y_noisy.copy(),
            kernel=self.kernel1.copy(),
            likelihood=studentT.copy(),
            inference_method=laplace_inf,
        )
        # optimize
-        m1['.*white'].constrain_fixed(1e-5)
+        m1[".*white"].constrain_fixed(1e-5)
        m1.randomize()
-        m2 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=ep_inf_alt)
+        m2 = GPy.core.GP(
-        m2['.*white'].constrain_fixed(1e-5)
+            self.X.copy(),
            self.Y_noisy.copy(),
            kernel=self.kernel1.copy(),
            likelihood=studentT.copy(),
            inference_method=ep_inf_alt,
        )
        m2[".*white"].constrain_fixed(1e-5)
        # m2.constrain_bounded('.*t_scale2', 0.001, 10)
        m2.randomize()
@ -123,12 +170,14 @@ class TestObservationModels(unittest.TestCase):
        # # m3.constrain_bounded('.*t_scale2', 0.001, 10)
        # m3.randomize()
-        optimizer='bfgs'
+        optimizer = "bfgs"
-        m1.optimize(optimizer=optimizer,max_iters=400)
+        m1.optimize(optimizer=optimizer, max_iters=400)
        m2.optimize(optimizer=optimizer, max_iters=400)
        # m3.optimize(optimizer=optimizer, max_iters=500)
-        self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(),delta=200)
+        # TODO: this was assertAlmostEqual, not sure if allclose will do the job here
        #    I replace the old delta with the atol
        assert np.allclose(m1.log_likelihood(), m2.log_likelihood(), atol=200.0)
        # self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), 3)
@ -140,9 +189,7 @@ class TestObservationModels(unittest.TestCase):
        # rmse_nested = self.rmse(preds_mean_nested, self.Y_noisy)
        if rmse_alt > rmse_lap:
-            self.assertAlmostEqual(rmse_lap, rmse_alt, delta=1.5)
+            # TODO: this was assertAlmostEqual, not sure if allclose will do the job here
            #   I replace the old delta with the atol
            assert np.allclose(rmse_lap, rmse_alt, atol=1.5)
        # m3.optimize(optimizer=optimizer, max_iters=500)
 if __name__ == "__main__":
    unittest.main()
--- a/GPy/testing/gp_tests.py
+++ b/GPy/testing/gp_tests.py
@ -1,36 +1,36 @@
-'''
+"""
 Created on 4 Sep 2015
@author: maxz
-'''
+"""
-import unittest
+import numpy as np
-import numpy as np, GPy
+import GPy
 from GPy.core.parameterization.variational import NormalPosterior
 class Test(unittest.TestCase):
-
+class TestGP:
-    def setUp(self):
+    def setup(self):
        np.random.seed(12345)
        self.N = 20
        self.N_new = 50
        self.D = 1
-        self.X = np.random.uniform(-3., 3., (self.N, 1))
+        self.X = np.random.uniform(-3.0, 3.0, (self.N, 1))
        self.Y = np.sin(self.X) + np.random.randn(self.N, self.D) * 0.05
-        self.X_new = np.random.uniform(-3., 3., (self.N_new, 1))
+        self.X_new = np.random.uniform(-3.0, 3.0, (self.N_new, 1))
    def test_setxy_bgplvm(self):
        self.setup()
        k = GPy.kern.RBF(1)
        m = GPy.models.BayesianGPLVM(self.Y, 1, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X
        Xnew = NormalPosterior(m.X.mean[:10].copy(), m.X.variance[:10].copy())
        m.set_XY(Xnew, m.Y[:10].copy())
-        assert(m.checkgrad())
+        assert m.checkgrad()
-        assert(m.num_data == m.X.shape[0])
+        assert m.num_data == m.X.shape[0]
-        assert(m.input_dim == m.X.shape[1])
+        assert m.input_dim == m.X.shape[1]
        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)
@ -38,16 +38,18 @@ class Test(unittest.TestCase):
        np.testing.assert_allclose(var, var2)
    def test_setxy_gplvm(self):
        self.setup()
        k = GPy.kern.RBF(1)
        m = GPy.models.GPLVM(self.Y, 1, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X.copy()
        Xnew = X[:10].copy()
        m.set_XY(Xnew, m.Y[:10].copy())
-        assert(m.checkgrad())
+        assert m.checkgrad()
-        assert(m.num_data == m.X.shape[0])
+        assert m.num_data == m.X.shape[0]
-        assert(m.input_dim == m.X.shape[1])
+        assert m.input_dim == m.X.shape[1]
        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)
@ -55,15 +57,17 @@ class Test(unittest.TestCase):
        np.testing.assert_allclose(var, var2)
    def test_setxy_gp(self):
        self.setup()
        k = GPy.kern.RBF(1)
        m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X.copy()
        m.set_XY(m.X[:10], m.Y[:10])
-        assert(m.checkgrad())
+        assert m.checkgrad()
-        assert(m.num_data == m.X.shape[0])
+        assert m.num_data == m.X.shape[0]
-        assert(m.input_dim == m.X.shape[1])
+        assert m.input_dim == m.X.shape[1]
        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)
@ -73,39 +77,45 @@ class Test(unittest.TestCase):
    def test_mean_function(self):
        from GPy.core.parameterization.param import Param
        from GPy.core.mapping import Mapping
        self.setup()
        class Parabola(Mapping):
-            def __init__(self, variance, degree=2, name='parabola'):
+            def __init__(self, variance, degree=2, name="parabola"):
                super(Parabola, self).__init__(1, 1, name)
-                self.variance = Param('variance', np.ones(degree+1) * variance)
+                self.variance = Param("variance", np.ones(degree + 1) * variance)
                self.degree = degree
                self.link_parameter(self.variance)
            def f(self, X):
                p = self.variance[0] * np.ones(X.shape)
-                for i in range(1, self.degree+1):
+                for i in range(1, self.degree + 1):
-                    p += self.variance[i] * X**(i)
+                    p += self.variance[i] * X ** (i)
                return p
            def gradients_X(self, dL_dF, X):
                grad = np.zeros(X.shape)
-                for i in range(1, self.degree+1):
+                for i in range(1, self.degree + 1):
-                    grad += (i) * self.variance[i] * X**(i-1)
+                    grad += (i) * self.variance[i] * X ** (i - 1)
                return grad
            def update_gradients(self, dL_dF, X):
-                for i in range(self.degree+1):
+                for i in range(self.degree + 1):
-                    self.variance.gradient[i] = (dL_dF * X**(i)).sum(0)
+                    self.variance.gradient[i] = (dL_dF * X ** (i)).sum(0)
        X = np.linspace(-2, 2, 100)[:, None]
        k = GPy.kern.RBF(1)
        k.randomize()
-        p = Parabola(.3)
+        p = Parabola(0.3)
        p.randomize()
-        Y = p.f(X) + np.random.multivariate_normal(np.zeros(X.shape[0]), k.K(X)+np.eye(X.shape[0])*1e-8)[:,None] + np.random.normal(0, .1, (X.shape[0], 1))
+        Y = (
            p.f(X)
            + np.random.multivariate_normal(
                np.zeros(X.shape[0]), k.K(X) + np.eye(X.shape[0]) * 1e-8
            )[:, None]
            + np.random.normal(0, 0.1, (X.shape[0], 1))
        )
        m = GPy.models.GPRegression(X, Y, mean_function=p)
        m.randomize()
-        assert(m.checkgrad())
+        assert m.checkgrad()
        _ = m.predict(m.X)
 if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.testName']
    unittest.main()
--- a/GPy/testing/test_gpy_kernels_state_space.py
+++ b/GPy/testing/test_gpy_kernels_state_space.py
--- a/GPy/testing/grid_tests.py
+++ b/GPy/testing/grid_tests.py
@ -3,21 +3,33 @@
 # Kurt Cutajar
 import unittest
 import numpy as np
 import GPy
-class GridModelTest(unittest.TestCase):
+
-    def setUp(self):
+class TestGridModel:
    def setup(self):
        ######################################
        # # 3 dimensional example
        # sample inputs and outputs
-        self.X = np.array([[0,0,0],[0,0,1],[0,1,0],[0,1,1],[1,0,0],[1,0,1],[1,1,0],[1,1,1]])
+        self.X = np.array(
            [
                [0, 0, 0],
                [0, 0, 1],
                [0, 1, 0],
                [0, 1, 1],
                [1, 0, 0],
                [1, 0, 1],
                [1, 1, 0],
                [1, 1, 1],
            ]
        )
        self.Y = np.random.randn(8, 1) * 100
        self.dim = self.X.shape[1]
    def test_alpha_match(self):
        self.setup()
        kernel = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
        m = GPy.models.GPRegressionGrid(self.X, self.Y, kernel)
@ -27,25 +39,31 @@ class GridModelTest(unittest.TestCase):
        np.testing.assert_almost_equal(m.posterior.alpha, m2.posterior.woodbury_vector)
    def test_gradient_match(self):
        self.setup()
        kernel = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
        m = GPy.models.GPRegressionGrid(self.X, self.Y, kernel)
        kernel2 = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
        m2 = GPy.models.GPRegression(self.X, self.Y, kernel2)
-        np.testing.assert_almost_equal(kernel.variance.gradient, kernel2.variance.gradient)
+        np.testing.assert_almost_equal(
-        np.testing.assert_almost_equal(kernel.lengthscale.gradient, kernel2.lengthscale.gradient)
+            kernel.variance.gradient, kernel2.variance.gradient
-        np.testing.assert_almost_equal(m.likelihood.variance.gradient, m2.likelihood.variance.gradient)
+        )
-
+        np.testing.assert_almost_equal(
            kernel.lengthscale.gradient, kernel2.lengthscale.gradient
        )
        np.testing.assert_almost_equal(
            m.likelihood.variance.gradient, m2.likelihood.variance.gradient
        )
    def test_prediction_match(self):
        self.setup()
        kernel = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
        m = GPy.models.GPRegressionGrid(self.X, self.Y, kernel)
        kernel2 = GPy.kern.RBF(input_dim=self.dim, variance=1, ARD=True)
        m2 = GPy.models.GPRegression(self.X, self.Y, kernel2)
-        test = np.array([[0,0,2],[-1,3,-4]])
+        test = np.array([[0, 0, 2], [-1, 3, -4]])
        np.testing.assert_almost_equal(m.predict(test), m2.predict(test))
--- a/GPy/testing/test_inference.py
+++ b/GPy/testing/test_inference.py
@ -0,0 +1,275 @@
 # Copyright (c) 2014, Max Zwiessele
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 """
 The test cases for various inference algorithms
 """
 import numpy as np
 import GPy
 # np.seterr(invalid='raise')
 class TestInferenceXCase:
    def get_data(self):
        """Seed NumPy's global RNG and return one simulated data set.

        Returns element ``[0][0]`` of the result of
        ``_simulate_matern(5, 1, 1, 10, 3, False)``.
        """
        np.random.seed(1111)
        simulate = GPy.examples.dimensionality_reduction._simulate_matern
        simulated = simulate(5, 1, 1, 10, 3, False)
        return simulated[0][0]
    def test_inferenceX_BGPLVM_Linear(self):
        """Bayesian GPLVM, linear ARD kernel: re-inferring the latent inputs
        for the training outputs must match the optimised latent posterior
        (mean and variance) to two decimals."""
        Ys = self.get_data()
        linear_kernel = GPy.kern.Linear(3, ARD=True)
        model = GPy.models.BayesianGPLVM(Ys, 3, kernel=linear_kernel)
        model.optimize()
        _x, inferred = model.infer_newX(model.Y, optimize=True)
        np.testing.assert_array_almost_equal(
            model.X.mean, inferred.X.mean, decimal=2
        )
        np.testing.assert_array_almost_equal(
            model.X.variance, inferred.X.variance, decimal=2
        )
    def test_inferenceX_BGPLVM_RBF(self):
        """Bayesian GPLVM, RBF ARD kernel: re-inferring the latent inputs for
        the training outputs must match the optimised latent posterior
        (mean and variance) to two decimals."""
        import warnings

        Ys = self.get_data()
        rbf_kernel = GPy.kern.RBF(3, ARD=True)
        model = GPy.models.BayesianGPLVM(Ys, 3, kernel=rbf_kernel)
        # Warnings raised while optimising are silenced; inference below is not.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model.optimize()
        _x, inferred = model.infer_newX(model.Y, optimize=True)
        np.testing.assert_array_almost_equal(
            model.X.mean, inferred.X.mean, decimal=2
        )
        np.testing.assert_array_almost_equal(
            model.X.variance, inferred.X.variance, decimal=2
        )
    def test_inferenceX_GPLVM_Linear(self):
        """GPLVM, linear ARD kernel: latent inputs re-inferred from the
        training outputs must match the optimised ones to two decimals."""
        Ys = self.get_data()
        linear_kernel = GPy.kern.Linear(3, ARD=True)
        model = GPy.models.GPLVM(Ys, 3, kernel=linear_kernel)
        model.optimize()
        _x, inferred = model.infer_newX(model.Y, optimize=True)
        np.testing.assert_array_almost_equal(model.X, inferred.X, decimal=2)
    def test_inferenceX_GPLVM_RBF(self):
        """GPLVM, RBF ARD kernel: latent inputs re-inferred from the training
        outputs must match the optimised ones to two decimals."""
        Ys = self.get_data()
        rbf_kernel = GPy.kern.RBF(3, ARD=True)
        model = GPy.models.GPLVM(Ys, 3, kernel=rbf_kernel)
        model.optimize()
        _x, inferred = model.infer_newX(model.Y, optimize=True)
        np.testing.assert_array_almost_equal(model.X, inferred.X, decimal=2)
 class TestInferenceGPEP:
    def get_data(self):
        """Build a seeded one-dimensional data set for the Bernoulli EP test.

        Returns:
            tuple: ``(X, Y)`` with ``X`` of shape (200, 1) drawn uniformly and
            ``Y`` the output of ``Bernoulli().samples(f)`` as a column, where
            ``f`` is one draw from a zero-mean Gaussian with the jittered RBF
            covariance of ``X``.
        """
        np.random.seed(1)
        rbf = GPy.kern.RBF(1, variance=7.0, lengthscale=0.2)
        X = np.random.rand(200, 1)
        jittered_cov = rbf.K(X) + 1e-5 * np.eye(X.shape[0])
        latent = np.random.multivariate_normal(np.zeros(200), jittered_cov)
        bernoulli = GPy.likelihoods.Bernoulli()
        # squash the latent function (result is not used afterwards)
        _squashed = bernoulli.gp_link.transf(latent)
        labels = bernoulli.samples(latent).reshape(-1, 1)
        return X, labels
    def get_noisy_data(self):
        """Build a seeded noisy sine data set plus a copy with one outlier.

        Side effects:
            Sets ``self.real_std`` (noise standard deviation, 0.1) and
            ``self.f`` (uniform random column with one row per input).

        Returns:
            tuple: ``(X, Y, Y_extra_noisy)``; all three have shape (100, 1).
            ``Y_extra_noisy`` equals ``Y`` except that row 50 is shifted by
            +4.0.
        """
        np.random.seed(1)
        X = np.random.rand(100, 1)
        self.real_std = 0.1
        inputs_1d = X[:, 0]
        noise = np.random.randn(*inputs_1d.shape) * self.real_std
        clean_plus_noise = np.sin(inputs_1d * 2 * np.pi) + noise
        Y = clean_plus_noise[:, np.newaxis]
        self.f = np.random.rand(X.shape[0], 1)
        # Y_extra_noisy[80:83] -= 2.
        Y_extra_noisy = Y.copy()
        Y_extra_noisy[50] += 4.0
        return X, Y, Y_extra_noisy
    def test_inference_EP(self):
        """Compare EP's own ``_inference`` result with the parent-class
        ``inference`` for a Bernoulli likelihood.

        EP is run explicitly via ``expectation_propagation``; its site
        parameters (``ga_approx.v``, ``ga_approx.tau``) are then handed to
        ``super(EP, inf).inference`` as pseudo-observations ``v / tau`` with
        variance ``1 / tau``. The log marginals, gradient dictionaries and
        posterior Woodbury quantities of the two routes are differenced and
        the total is asserted to be below a threshold.
        """
        from paramz import ObsAr
        X, Y = self.get_data()
        lik = GPy.likelihoods.Bernoulli()
        k = GPy.kern.RBF(1, variance=7.0, lengthscale=0.2)
        inf = GPy.inference.latent_function_inference.expectation_propagation.EP(
            max_iters=30, delta=0.5
        )
        self.model = GPy.core.GP(
            X=X, Y=Y, kernel=k, inference_method=inf, likelihood=lik
        )
        K = self.model.kern.K(X)
        mean_prior = np.zeros(K.shape[0])
        # Run the EP loop directly to get hold of the site approximation.
        (
            post_params,
            ga_approx,
            cav_params,
            log_Z_tilde,
        ) = self.model.inference_method.expectation_propagation(
            mean_prior, K, ObsAr(Y), lik, None
        )
        # Pseudo-observations implied by the sites: v / tau.
        mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
        # Route 1: EP's own posterior / log-marginal / gradient computation.
        p, m, d = self.model.inference_method._inference(
            Y,
            mean_prior,
            K,
            ga_approx,
            cav_params,
            lik,
            Y_metadata=None,
            Z_tilde=log_Z_tilde,
        )
        # Route 2: the inference method that follows EP in inf's MRO, fed the
        # pseudo-observations and per-point variance 1 / tau.
        p0, m0, d0 = super(
            GPy.inference.latent_function_inference.expectation_propagation.EP, inf
        ).inference(
            k,
            X,
            lik,
            mu_tilde[:, None],
            mean_function=None,
            variance=1.0 / ga_approx.tau,
            K=K,
            Z_tilde=log_Z_tilde
            + np.sum(
                -0.5 * np.log(ga_approx.tau)
                + 0.5 * (ga_approx.v * ga_approx.v * 1.0 / ga_approx.tau)
            ),
        )
        # NOTE(review): this sums *signed* differences and compares against
        # 1e6, so it can only fail on enormous positive discrepancies (or NaN).
        # Possibly absolute differences and a small tolerance such as 1e-6
        # were intended -- confirm before tightening.
        assert (
            np.sum(
                np.array(
                    [
                        m - m0,
                        np.sum(d["dL_dK"] - d0["dL_dK"]),
                        np.sum(d["dL_dthetaL"] - d0["dL_dthetaL"]),
                        np.sum(d["dL_dm"] - d0["dL_dm"]),
                        np.sum(p._woodbury_vector - p0._woodbury_vector),
                        np.sum(p.woodbury_inv - p0.woodbury_inv),
                    ]
                )
            )
            < 1e6
        )
    # NOTE: adding a test like the one above for a parameterized likelihood. The above test is
    # only for the probit likelihood, which does not have any tunable hyperparameters, which is why
    # the dL_dthetaL term in the dictionary of gradients will always be zero. So here we repeat the tests for
    # the Student-t likelihood and the heteroscedastic Gaussian noise case. This test simply checks whether the posterior
    # and the gradients of the log marginal are roughly the same for inference through EP and for exact Gaussian inference using
    # the Gaussian approximation of the individual likelihood site terms. For the probit likelihood it is possible to
    # calculate the moments analytically, but for other likelihoods we need to use numerical quadrature techniques,
    # and it is possible that errors creep in because of the quadrature implementation.
    def test_inference_EP_non_classification(self):
        """Compare EP's ``_inference`` with the parent-class ``inference`` for
        a Student-t likelihood on the outlier-contaminated sine data.

        The EP site parameters are turned into pseudo-observations
        ``v / tau`` with variance ``1 / tau`` and passed to
        ``super(EP, ep).inference``; log marginals, gradient dictionaries and
        Woodbury quantities of both routes are differenced and the total is
        asserted to be below a threshold.
        """
        from paramz import ObsAr

        X, _Y, Y_outlier = self.get_noisy_data()

        student_t = GPy.likelihoods.StudentT(deg_free=5.0, sigma2=0.08)
        kernel = GPy.kern.RBF(1, variance=2.0, lengthscale=1.1)
        ep_module = GPy.inference.latent_function_inference.expectation_propagation
        ep = ep_module.EP(max_iters=4, delta=0.5)
        model = GPy.core.GP(
            X=X,
            Y=Y_outlier,
            kernel=kernel,
            likelihood=student_t,
            inference_method=ep,
        )

        K = model.kern.K(X)
        prior_mean = np.zeros(K.shape[0])

        # Run the EP loop directly to get hold of the site approximation.
        ep_result = model.inference_method.expectation_propagation(
            prior_mean, K, ObsAr(Y_outlier), student_t, None
        )
        _post_params, ga_approx, cav_params, log_Z_tilde = ep_result
        tau, v = ga_approx.tau, ga_approx.v
        mu_tilde = v / tau.astype(float)

        # Route 1: EP's own posterior / log-marginal / gradient computation.
        post_ep, lml_ep, grads_ep = model.inference_method._inference(
            Y_outlier,
            prior_mean,
            K,
            ga_approx,
            cav_params,
            student_t,
            Y_metadata=None,
            Z_tilde=log_Z_tilde,
        )

        # Route 2: the inference method that follows EP in the MRO, fed the
        # pseudo-observations and per-point variance 1 / tau.
        site_term = np.sum(-0.5 * np.log(tau) + 0.5 * (v * v * 1.0 / tau))
        post_ref, lml_ref, grads_ref = super(ep_module.EP, ep).inference(
            kernel,
            X,
            student_t,
            mu_tilde[:, None],
            mean_function=None,
            variance=1.0 / tau,
            K=K,
            Z_tilde=log_Z_tilde + site_term,
        )

        signed_gaps = [lml_ep - lml_ref]
        signed_gaps.extend(
            np.sum(grads_ep[key] - grads_ref[key])
            for key in ("dL_dK", "dL_dthetaL", "dL_dm")
        )
        signed_gaps.append(np.sum(post_ep._woodbury_vector - post_ref._woodbury_vector))
        signed_gaps.append(np.sum(post_ep.woodbury_inv - post_ref.woodbury_inv))
        # NOTE(review): signed sum compared against 1e6 -- a very loose bound;
        # confirm whether a small tolerance on absolute gaps was intended.
        assert np.sum(np.array(signed_gaps)) < 1e6
 class TestVarDtc:
    def test_var_dtc_inference_with_mean(self):
        """Check dL_dm in var_dtc is calculated correctly"""
        np.random.seed(1)
        inputs = np.linspace(0.0, 2 * np.pi, 100)[:, np.newaxis]
        noise = np.random.randn(*inputs.shape) * 0.3
        targets = -np.cos(inputs) + noise + 1
        linear_mean = GPy.mappings.Linear(input_dim=1, output_dim=1)
        model = GPy.models.SparseGPRegression(
            inputs, targets, mean_function=linear_mean
        )
        # The model-wide gradient check is the actual assertion.
        assert model.checkgrad()
 class TestHMCSampler:
    def test_sampling(self):
        """Smoke test: draw three HMC samples from a GP regression model whose
        kernel and noise parameters carry Gamma priors. Nothing is asserted
        beyond the sampler running without raising."""
        np.random.seed(1)
        inputs = np.linspace(0.0, 2 * np.pi, 100)[:, np.newaxis]
        noise = np.random.randn(*inputs.shape) * 0.3
        targets = -np.cos(inputs) + noise + 1
        model = GPy.models.GPRegression(inputs, targets)
        for param in (
            model.kern.lengthscale,
            model.kern.variance,
            model.likelihood.variance,
        ):
            param.set_prior(GPy.priors.Gamma.from_EV(1.0, 10.0))
        sampler = GPy.inference.mcmc.HMC(model, stepsize=1e-2)
        _s = sampler.sample(num_samples=3)
 class TestMCMCSampler:
    def test_sampling(self):
        """Smoke test: run Metropolis-Hastings (100 total, 10 burn-in) on a GP
        regression model whose kernel and noise parameters carry Gamma priors.
        Nothing is asserted beyond the sampler running without raising."""
        np.random.seed(1)
        inputs = np.linspace(0.0, 2 * np.pi, 100)[:, np.newaxis]
        noise = np.random.randn(*inputs.shape) * 0.3
        targets = -np.cos(inputs) + noise + 1
        model = GPy.models.GPRegression(inputs, targets)
        for param in (
            model.kern.lengthscale,
            model.kern.variance,
            model.likelihood.variance,
        ):
            param.set_prior(GPy.priors.Gamma.from_EV(1.0, 10.0))
        sampler = GPy.inference.mcmc.Metropolis_Hastings(model)
        sampler.sample(Ntotal=100, Nburn=10)
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
--- a/GPy/testing/test_linalg.py
+++ b/GPy/testing/test_linalg.py
@ -1,18 +1,19 @@
 import numpy as np
 import scipy as sp
-from ..util.linalg import jitchol,trace_dot, ijk_jlk_to_il, ijk_ljk_to_ilk
+from ..util.linalg import jitchol, trace_dot, ijk_jlk_to_il, ijk_ljk_to_ilk
-class LinalgTests(np.testing.TestCase):
+
-    def setUp(self):
+class TestLinalg:
-        #Create PD matrix
+    def setup(self):
-        A = np.random.randn(20,100)
+        # Create PD matrix
        A = np.random.randn(20, 100)
        self.A = A.dot(A.T)
-        #compute Eigdecomp
+        # compute Eigdecomp
        vals, vectors = np.linalg.eig(self.A)
-        #Set smallest eigenval to be negative with 5 rounds worth of jitter
+        # Set smallest eigenval to be negative with 5 rounds worth of jitter
        vals[vals.argmin()] = 0
-        default_jitter = 1e-6*np.mean(vals)
+        default_jitter = 1e-6 * np.mean(vals)
-        vals[vals.argmin()] = -default_jitter*(10**3.5)
+        vals[vals.argmin()] = -default_jitter * (10**3.5)
        self.A_corrupt = (vectors * vals).dot(vectors.T)
    def test_jitchol_success(self):
@ -20,12 +21,16 @@ class LinalgTests(np.testing.TestCase):
        Expect 5 rounds of jitter to be added and for the recovered matrix to be
        identical to the corrupted matrix apart from the jitter added to the diagonal
        """
        self.setup()
        L = jitchol(self.A_corrupt, maxtries=5)
        A_new = L.dot(L.T)
        diff = A_new - self.A_corrupt
-        np.testing.assert_allclose(diff, np.eye(A_new.shape[0])*np.diag(diff).mean(), atol=1e-13)
+        np.testing.assert_allclose(
            diff, np.eye(A_new.shape[0]) * np.diag(diff).mean(), atol=1e-13
        )
    def test_jitchol_failure(self):
        self.setup()
        try:
            """
            Expecting an exception to be thrown as we expect it to require
@ -37,24 +42,27 @@ class LinalgTests(np.testing.TestCase):
            return True
    def test_trace_dot(self):
        self.setup()
        N = 5
-        A = np.random.rand(N,N)
+        A = np.random.rand(N, N)
-        B = np.random.rand(N,N)
+        B = np.random.rand(N, N)
        trace = np.trace(A.dot(B))
-        test_trace = trace_dot(A,B)
+        test_trace = trace_dot(A, B)
-        np.testing.assert_allclose(trace,test_trace,atol=1e-13)
+        np.testing.assert_allclose(trace, test_trace, atol=1e-13)
    def test_einsum_ij_jlk_to_ilk(self):
        self.setup()
        A = np.random.randn(15, 150, 5)
        B = np.random.randn(150, 50, 5)
-        pure = np.einsum('ijk,jlk->il', A, B)
+        pure = np.einsum("ijk,jlk->il", A, B)
-        quick = ijk_jlk_to_il(A,B)
+        quick = ijk_jlk_to_il(A, B)
        np.testing.assert_allclose(pure, quick)
    def test_einsum_ijk_ljk_to_ilk(self):
        self.setup()
        A = np.random.randn(150, 20, 5)
        B = np.random.randn(150, 20, 5)
-        #B = A.copy()
+        # B = A.copy()
-        pure = np.einsum('ijk,ljk->ilk', A, B)
+        pure = np.einsum("ijk,ljk->ilk", A, B)
-        quick = ijk_ljk_to_ilk(A,B)
+        quick = ijk_ljk_to_ilk(A, B)
        np.testing.assert_allclose(pure, quick)
--- a/GPy/testing/test_link_function.py
+++ b/GPy/testing/test_link_function.py
@ -0,0 +1,196 @@
 import numpy as np
 import scipy
 from scipy.special import cbrt
 from GPy.models import GradientChecker
 import random
 _lim_val = np.finfo(np.float64).max
 _lim_val_exp = np.log(_lim_val)
 _lim_val_square = np.sqrt(_lim_val)
 _lim_val_cube = cbrt(_lim_val)
 from GPy.likelihoods.link_functions import (
    Identity,
    Probit,
    Cloglog,
    Log,
    Log_ex_1,
    Reciprocal,
    Heaviside,
    ScaledProbit,
 )
 class TestLinkFunction:
    def setup(self):
        self.small_f = np.array([[-1e-4]])
        self.zero_f = np.array([[1e-4]])
        self.mid_f = np.array([[5.0]])
        self.large_f = np.array([[1e4]])
        self.f_lower_lim = np.array(-np.inf)
        self.f_upper_lim = np.array(np.inf)
    def check_gradient(self, link_func, lim_of_inf, test_lim=False):
        """Gradient-check a link function and its first two derivatives.

        At every probe point three ``GradientChecker`` models are built and
        asserted: ``transf`` against ``dtransf_df``, ``dtransf_df`` against
        ``d2transf_df2`` and ``d2transf_df2`` against ``d3transf_df3``.

        Probe points, in order: ``self.mid_f``, ``self.small_f``,
        ``self.zero_f``, ``self.large_f`` clipped to ``lim_of_inf - 1e-3``
        and, only when ``test_lim`` is true, ``lim_of_inf - 1e-4``.

        Args:
            link_func: object exposing ``transf``, ``dtransf_df``,
                ``d2transf_df2`` and ``d3transf_df3``.
            lim_of_inf: upper limit used for the clipped and limit probes.
            test_lim: whether to also probe just below ``lim_of_inf``.
        """
        derivative_pairs = (
            (link_func.transf, link_func.dtransf_df),
            (link_func.dtransf_df, link_func.d2transf_df2),
            (link_func.d2transf_df2, link_func.d3transf_df3),
        )

        def assert_pairs_at(x0):
            # One checker per (function, derivative) pair, in derivative order.
            for func, dfunc in derivative_pairs:
                checker = GradientChecker(func, dfunc, x0=x0)
                assert checker.checkgrad(verbose=True)

        for x0 in (self.mid_f, self.small_f, self.zero_f):
            assert_pairs_at(x0)

        # Do a limit test if the large f value is too large
        assert_pairs_at(np.clip(self.large_f, -np.inf, lim_of_inf - 1e-3))

        if test_lim:
            print("Testing limits")
            # Remove some otherwise we are too close to the limit for gradcheck to work effectively
            assert_pairs_at(lim_of_inf - 1e-4)
    def check_overflow(self, link_func, lim_of_inf):
        """Assert the link function stays finite 100 units past ``lim_of_inf``.

        ``transf``, ``dtransf_df`` and ``d2transf_df2`` are each evaluated at
        ``lim_of_inf + 100.0`` and must be neither infinite nor NaN. The
        values themselves are not checked for correctness.
        """
        probe = lim_of_inf + 100.0
        evaluators = (
            link_func.transf,
            link_func.dtransf_df,
            link_func.d2transf_df2,
        )
        # All infinity checks first, then all NaN checks.
        for is_bad in (np.isinf, np.isnan):
            for evaluate in evaluators:
                assert not is_bad(evaluate(probe))
    def test_log_overflow(self):
        """Check that the ``Log`` link stays finite where a plain ``np.exp``
        would overflow.

        The overflow warning for the reference ``np.exp`` call is silenced
        with ``np.errstate`` so the previous floating-point error state is
        restored even when the assertion inside the block fails.
        """
        self.setup()
        link = Log()
        lim_of_inf = _lim_val_exp
        np.testing.assert_almost_equal(np.exp(self.mid_f), link.transf(self.mid_f))
        assert np.isinf(np.exp(np.log(self.f_upper_lim)))
        # Check the clipping works
        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
        assert np.isfinite(link.transf(self.f_upper_lim))
        self.check_overflow(link, lim_of_inf)
        # Check that it would otherwise fail
        beyond_lim_of_inf = lim_of_inf + 10.0
        with np.errstate(over="ignore"):
            assert np.isinf(np.exp(beyond_lim_of_inf))
    def test_log_ex_1_overflow(self):
        """Check that the ``Log_ex_1`` link stays finite where a plain
        ``log1p(exp(f))`` would overflow.

        The overflow warning for the reference computation is silenced with
        ``np.errstate`` so the previous floating-point error state is restored
        even when the assertion inside the block fails.
        """
        self.setup()
        link = Log_ex_1()
        lim_of_inf = _lim_val_exp
        np.testing.assert_almost_equal(
            scipy.special.log1p(np.exp(self.mid_f)), link.transf(self.mid_f)
        )
        assert np.isinf(scipy.special.log1p(np.exp(np.log(self.f_upper_lim))))
        # Check the clipping works
        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
        # Need to look at most significant figures here rather than the decimals
        np.testing.assert_approx_equal(
            link.transf(self.f_upper_lim), scipy.special.log1p(_lim_val), significant=5
        )
        self.check_overflow(link, lim_of_inf)
        # Check that it would otherwise fail
        beyond_lim_of_inf = lim_of_inf + 10.0
        with np.errstate(over="ignore"):
            assert np.isinf(scipy.special.log1p(np.exp(beyond_lim_of_inf)))
    def test_log_gradients(self):
        """Gradient-check the ``Log`` link, including just below
        ``_lim_val_exp``."""
        # transf dtransf_df d2transf_df2 d3transf_df3
        self.setup()
        self.check_gradient(Log(), _lim_val_exp, test_lim=True)
    def test_identity_gradients(self):
        """Gradient-check the ``Identity`` link; the limit probe is skipped."""
        self.setup()
        # FIXME: Should be able to think of a way to test the limits of this
        self.check_gradient(Identity(), _lim_val, test_lim=False)
    def test_probit_gradients(self):
        """Gradient-check the ``Probit`` link, including just below
        ``_lim_val``."""
        self.setup()
        self.check_gradient(Probit(), _lim_val, test_lim=True)
    def test_scaledprobit_gradients(self):
        """Gradient-check ``ScaledProbit`` with a random scale ``nu`` drawn
        from ``random.random()``, including just below ``_lim_val``."""
        self.setup()
        # NOTE(review): the stdlib RNG is not seeded here, so nu differs per run.
        scale = random.random()
        self.check_gradient(ScaledProbit(nu=scale), _lim_val, test_lim=True)
    def test_Cloglog_gradients(self):
        """Gradient-check the ``Cloglog`` link, including just below
        ``_lim_val_exp``."""
        self.setup()
        self.check_gradient(Cloglog(), _lim_val_exp, test_lim=True)
    def test_Log_ex_1_gradients(self):
        """Gradient-check the ``Log_ex_1`` link up to ``_lim_val_exp`` and
        then verify it stays finite beyond that limit."""
        self.setup()
        softplus_link = Log_ex_1()
        upper = _lim_val_exp
        for check, extra in (
            (self.check_gradient, {"test_lim": True}),
            (self.check_overflow, {}),
        ):
            check(softplus_link, upper, **extra)
    def test_reciprocal_gradients(self):
        """Gradient-check the ``Reciprocal`` link, including just below
        ``_lim_val``."""
        self.setup()
        # Does not work with much smaller values, and values closer to zero than 1e-5
        self.check_gradient(Reciprocal(), _lim_val, test_lim=True)
--- a/GPy/testing/mapping_tests.py
+++ b/GPy/testing/mapping_tests.py
@ -1,10 +1,10 @@
 # Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import unittest
 import numpy as np
 import GPy
 class MappingGradChecker(GPy.core.Model):
    """
    This class has everything we need to check the gradient of a mapping. It
@ -12,63 +12,60 @@ class MappingGradChecker(GPy.core.Model):
    mapping. the gradients are checked against the parameters of the mapping
    and the input.
    """
-    def __init__(self, mapping, X, name='map_grad_check'):
+
    def __init__(self, mapping, X, name="map_grad_check"):
        super(MappingGradChecker, self).__init__(name)
        self.mapping = mapping
        self.link_parameter(self.mapping)
-        self.X = GPy.core.Param('X',X)
+        self.X = GPy.core.Param("X", X)
        self.link_parameter(self.X)
        self.dL_dY = np.random.randn(self.X.shape[0], self.mapping.output_dim)
    def log_likelihood(self):
        return np.sum(self.mapping.f(self.X) * self.dL_dY)
    def parameters_changed(self):
        self.X.gradient = self.mapping.gradients_X(self.dL_dY, self.X)
        self.mapping.update_gradients(self.dL_dY, self.X)
-class MappingTests(unittest.TestCase):
+class TestMapping:
    def test_kernelmapping(self):
-        X = np.random.randn(100,3)
+        X = np.random.randn(100, 3)
-        Z = np.random.randn(10,3)
+        Z = np.random.randn(10, 3)
        mapping = GPy.mappings.Kernel(3, 2, Z, GPy.kern.RBF(3))
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        assert MappingGradChecker(mapping, X).checkgrad()
    def test_linearmapping(self):
        mapping = GPy.mappings.Linear(3, 2)
-        X = np.random.randn(100,3)
+        X = np.random.randn(100, 3)
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        assert MappingGradChecker(mapping, X).checkgrad()
    def test_mlpmapping(self):
        mapping = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
-        X = np.random.randn(100,3)
+        X = np.random.randn(100, 3)
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        assert MappingGradChecker(mapping, X).checkgrad()
    def test_mlpextmapping(self):
        np.random.seed(42)
-        X = np.random.randn(100,3)
+        X = np.random.randn(100, 3)
-        for activation in ['tanh', 'relu', 'sigmoid']:
+        for activation in ["tanh", "relu", "sigmoid"]:
-            mapping = GPy.mappings.MLPext(input_dim=3, hidden_dims=[5,5], output_dim=2, activation=activation)
+            mapping = GPy.mappings.MLPext(
-            self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+                input_dim=3, hidden_dims=[5, 5], output_dim=2, activation=activation
            )
            assert MappingGradChecker(mapping, X).checkgrad()
    def test_addmapping(self):
        m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
        m2 = GPy.mappings.Linear(input_dim=3, output_dim=2)
        mapping = GPy.mappings.Additive(m1, m2)
-        X = np.random.randn(100,3)
+        X = np.random.randn(100, 3)
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        assert MappingGradChecker(mapping, X).checkgrad()
    def test_compoundmapping(self):
        m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
-        Z = np.random.randn(10,2)
+        Z = np.random.randn(10, 2)
        m2 = GPy.mappings.Kernel(2, 4, Z, GPy.kern.RBF(2))
        mapping = GPy.mappings.Compound(m1, m2)
-        X = np.random.randn(100,3)
+        X = np.random.randn(100, 3)
-        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+        assert MappingGradChecker(mapping, X).checkgrad()
 if __name__ == "__main__":
    print("Running unit tests, please be (very) patient...")
    unittest.main()
--- a/GPy/testing/test_meanfunc.py
+++ b/GPy/testing/test_meanfunc.py
@ -0,0 +1,90 @@
 # Copyright (c) 2015, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 import GPy
 class TestMF:
    """Gradient checks for GP models that are given an explicit mean function."""
    def test_simple_mean_function(self):
        """
        Parameter-free mean function: a bare Mapping whose ``f`` is ``np.sin``.
        """
        # The mapping has nothing to learn, so its gradient update is a no-op.
        sinusoid = GPy.core.Mapping(1, 1)
        sinusoid.f = np.sin
        sinusoid.update_gradients = lambda a, b: None
        inputs = np.linspace(0, 10, 50).reshape(-1, 1)
        noise = 0.1 * np.random.randn(*inputs.shape)
        targets = np.sin(inputs) + 0.5 * np.cos(3 * inputs) + noise
        model = GPy.core.GP(
            inputs,
            targets,
            kernel=GPy.kern.RBF(1),
            likelihood=GPy.likelihoods.Gaussian(),
            mean_function=sinusoid,
        )
        assert model.checkgrad()
    def test_parametric_mean_function(self):
        """
        A piecewise-linear mean function whose parameters are part of the model.
        """
        inputs = np.linspace(-1, 10, 50).reshape(-1, 1)
        tent = 3 - np.abs(inputs - 6)
        wiggle = 0.5 * np.cos(3 * inputs) + 0.3 * np.random.randn(*inputs.shape)
        targets = tent + wiggle
        mean = GPy.mappings.PiecewiseLinear(1, 1, [-1, 1], [9, 2])
        model = GPy.core.GP(
            inputs,
            targets,
            kernel=GPy.kern.RBF(1),
            likelihood=GPy.likelihoods.Gaussian(),
            mean_function=mean,
        )
        assert model.checkgrad()
    def test_parametric_mean_function_composition(self):
        """
        A mean function built as the Compound of a Linear and a Kernel mapping.
        """
        inputs = np.linspace(0, 10, 50).reshape(-1, 1)
        noise = 0.1 * np.random.randn(*inputs.shape)
        targets = np.sin(inputs) + 0.5 * np.cos(3 * inputs) + noise + 3 * inputs
        # Build the parts in the order the nested call would evaluate them.
        linear_part = GPy.mappings.Linear(1, 1)
        z_point = np.random.normal(0, 1, (1, 1))
        kernel_part = GPy.mappings.Kernel(1, 1, z_point, GPy.kern.RBF(1))
        mean = GPy.mappings.Compound(linear_part, kernel_part)
        model = GPy.core.GP(
            inputs,
            targets,
            kernel=GPy.kern.RBF(1),
            likelihood=GPy.likelihoods.Gaussian(),
            mean_function=mean,
        )
        assert model.checkgrad()
    def test_parametric_mean_function_additive(self):
        """
        A mean function built as a nested sum: Constant + (MLP + Identity).
        """
        inputs = np.linspace(0, 10, 50).reshape(-1, 1)
        noise = 0.1 * np.random.randn(*inputs.shape)
        targets = np.sin(inputs) + 0.5 * np.cos(3 * inputs) + noise + 3 * inputs
        offset = GPy.mappings.Constant(1, 1, 3)
        inner_sum = GPy.mappings.Additive(
            GPy.mappings.MLP(1, 1), GPy.mappings.Identity(1, 1)
        )
        mean = GPy.mappings.Additive(offset, inner_sum)
        model = GPy.core.GP(
            inputs,
            targets,
            kernel=GPy.kern.RBF(1),
            likelihood=GPy.likelihoods.Gaussian(),
            mean_function=mean,
        )
        assert model.checkgrad()
    def test_svgp_mean_function(self):
        """
        An SVGP model with a Linear mean function and a Bernoulli likelihood.
        """
        inputs = np.linspace(0, 10, 500).reshape(-1, 1)
        noise = 0.1 * np.random.randn(*inputs.shape)
        latent = np.sin(inputs) + 0.5 * np.cos(3 * inputs) + noise
        # Threshold at zero to turn the regression targets into 0/1 labels.
        labels = np.where(latent > 0, 1, 0)
        mean = GPy.mappings.Linear(1, 1)
        inducing_inputs = np.linspace(0, 10, 50).reshape(-1, 1)
        bernoulli = GPy.likelihoods.Bernoulli()
        kernel = GPy.kern.RBF(1) + GPy.kern.White(1, 1e-4)
        model = GPy.core.SVGP(
            inputs,
            labels,
            Z=inducing_inputs,
            kernel=kernel,
            likelihood=bernoulli,
            mean_function=mean,
        )
        assert model.checkgrad()
--- a/GPy/testing/test_minibatch.py
+++ b/GPy/testing/test_minibatch.py
@ -0,0 +1,416 @@
 """
 Created on 4 Sep 2015
@author: maxz
 """
 import pytest
 import numpy as np
 import GPy
 try:
    import climin
 except ImportError:
    climin = None
 class TestBGPLVM:
    """
    Compare BayesianGPLVMMiniBatch with the full BayesianGPLVM on synthetic data.
    """
    def setup(self):
        """Seed numpy, draw the synthetic data and build the reference model."""
        np.random.seed(12345)
        latent = np.random.normal(0, 1, (100, 6))
        weights = np.random.normal(0, 1, (6, 13))
        noise = np.random.normal(0, 0.1, (latent.shape[0], weights.shape[1]))
        observed = latent.dot(weights) + noise
        self.inan = np.random.binomial(1, 0.1, observed.shape).astype(bool)
        self.X, self.W, self.Y = latent, weights, observed
        self.Q = 3
        self.m_full = GPy.models.BayesianGPLVM(observed, self.Q)
    def _minibatch(self, **options):
        """Build a BayesianGPLVMMiniBatch on self.Y / self.Q with the given keywords."""
        return GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
            self.Y, self.Q, **options
        )
    def _assert_same_as_full(self, model):
        """Copy the full model's parameters into model, then compare likelihood and gradient."""
        model[:] = self.m_full[:]
        np.testing.assert_almost_equal(
            model.log_likelihood(), self.m_full.log_likelihood(), 7
        )
        np.testing.assert_allclose(model.gradient, self.m_full.gradient)
    def test_lik_comparisons_m1_s0(self):
        """missing_data=True, stochastic=False must reproduce the full model."""
        self.setup()
        model = self._minibatch(missing_data=True, stochastic=False)
        self._assert_same_as_full(model)
        assert model.checkgrad()
    def test_predict_missing_data(self):
        """Predictions of the minibatch model must agree with the full model."""
        self.setup()
        model = self._minibatch(
            missing_data=True, stochastic=True, batchsize=self.Y.shape[1]
        )
        self._assert_same_as_full(model)
        reference = self.m_full
        with pytest.raises(NotImplementedError):
            model.predict(model.X, full_cov=True)
        mean_mb, var_mb = model.predict(model.X, full_cov=False)
        mean_ref, var_ref = reference.predict(reference.X, full_cov=False)
        np.testing.assert_allclose(mean_mb, mean_ref)
        np.testing.assert_allclose(var_mb, var_ref)
        mean_mb, var_mb = model.predict(model.X.mean, full_cov=True)
        mean_ref, var_ref = reference.predict(reference.X.mean, full_cov=True)
        np.testing.assert_allclose(mean_mb, mean_ref)
        np.testing.assert_allclose(var_mb[:, :, 0], var_ref)
        mean_mb, var_mb = model.predict(model.X.mean, full_cov=False)
        mean_ref, var_ref = reference.predict(reference.X.mean, full_cov=False)
        np.testing.assert_allclose(mean_mb, mean_ref)
        np.testing.assert_allclose(var_mb[:, [0]], var_ref)
    def test_lik_comparisons_m0_s0(self):
        """missing_data=False, stochastic=False must reproduce the full model."""
        self.setup()
        model = self._minibatch(
            X_variance=self.m_full.X.variance.values,
            missing_data=False,
            stochastic=False,
        )
        self._assert_same_as_full(model)
        assert model.checkgrad()
    def test_lik_comparisons_m1_s1(self):
        """missing_data=True, stochastic=True with a full-width batch must reproduce the full model."""
        self.setup()
        model = self._minibatch(
            missing_data=True, stochastic=True, batchsize=self.Y.shape[1]
        )
        self._assert_same_as_full(model)
        assert model.checkgrad()
    def test_lik_comparisons_m0_s1(self):
        """missing_data=False, stochastic=True with a full-width batch must reproduce the full model."""
        self.setup()
        model = self._minibatch(
            missing_data=False, stochastic=True, batchsize=self.Y.shape[1]
        )
        self._assert_same_as_full(model)
        assert model.checkgrad()
    def test_gradients_missingdata(self):
        """Gradient check with missing_data=True and stochastic=False."""
        self.setup()
        model = self._minibatch(
            missing_data=True, stochastic=False, batchsize=self.Y.shape[1]
        )
        assert model.checkgrad()
    def test_gradients_missingdata_stochastics(self):
        """Gradient check with missing_data=True, stochastic=True for batch sizes 1 and 4."""
        self.setup()
        for size in (1, 4):
            model = self._minibatch(missing_data=True, stochastic=True, batchsize=size)
            assert model.checkgrad()
    def test_gradients_stochastics(self):
        """Gradient check with missing_data=False, stochastic=True for batch sizes 1 and 4."""
        self.setup()
        for size in (1, 4):
            model = self._minibatch(missing_data=False, stochastic=True, batchsize=size)
            assert model.checkgrad()
    def test_predict(self):
        """Same configuration and checks as test_lik_comparisons_m1_s1."""
        self.setup()
        model = self._minibatch(
            missing_data=True, stochastic=True, batchsize=self.Y.shape[1]
        )
        self._assert_same_as_full(model)
        assert model.checkgrad()
 class TestSparseGPMinibatch:
    """
    Compare the minibatch models (built with X_variance=False) with a full SparseGPLVM.
    """
    def setup(self):
        """Seed numpy, draw the synthetic data and build the reference SparseGPLVM."""
        np.random.seed(12345)
        latent = np.random.normal(0, 1, (100, 6))
        weights = np.random.normal(0, 1, (6, 13))
        noise = np.random.normal(0, 0.1, (latent.shape[0], weights.shape[1]))
        observed = latent.dot(weights) + noise
        self.inan = np.random.binomial(1, 0.1, observed.shape).astype(bool)
        self.X, self.W, self.Y = latent, weights, observed
        self.Q = 3
        self.m_full = GPy.models.SparseGPLVM(
            observed, self.Q, kernel=GPy.kern.RBF(self.Q, ARD=True)
        )
    def _minibatch(self, **options):
        """Build a BayesianGPLVMMiniBatch on self.Y / self.Q with X_variance=False."""
        return GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(
            self.Y, self.Q, X_variance=False, **options
        )
    def _assert_same_as_full(self, model):
        """Copy the full model's parameters into model, then compare likelihood and gradient."""
        model[:] = self.m_full[:]
        np.testing.assert_almost_equal(
            model.log_likelihood(), self.m_full.log_likelihood(), 7
        )
        np.testing.assert_allclose(model.gradient, self.m_full.gradient)
    def test_lik_comparisons_m1_s0(self):
        """missing_data=True, stochastic=False must reproduce the full model."""
        self.setup()
        model = self._minibatch(missing_data=True, stochastic=False)
        self._assert_same_as_full(model)
        assert model.checkgrad()
    @pytest.mark.skipif(climin is None, reason="climin not installed")
    def test_sparsegp_init(self):
        """Gradient checks before and after a short optimisation for four SparseGPMiniBatch setups."""
        self.setup()
        np.random.seed(1234)
        picked_rows = np.random.choice(self.X.shape[0], replace=False, size=10)
        Z = self.X[picked_rows].copy()
        input_dim = Z.shape[1]
        # (missing_data, stochastic, optimizer) in the order the cases are exercised.
        cases = (
            (True, False, "adadelta"),
            (True, True, "rprop"),
            (False, False, "rprop"),
            (False, True, "adadelta"),
        )
        for missing, stochastic, optimizer in cases:
            kernel = (
                GPy.kern.RBF(input_dim)
                + GPy.kern.Matern32(input_dim)
                + GPy.kern.Bias(input_dim)
            )
            model = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(
                self.X,
                self.Y,
                Z,
                kernel,
                GPy.likelihoods.Gaussian(),
                missing_data=missing,
                stochastic=stochastic,
            )
            assert model.checkgrad()
            model.optimize(optimizer, max_iters=10)
            assert model.checkgrad()
    def test_predict_missing_data(self):
        """Predictions of the minibatch model must agree with the full model."""
        self.setup()
        model = self._minibatch(
            missing_data=True, stochastic=True, batchsize=self.Y.shape[1]
        )
        self._assert_same_as_full(model)
        reference = self.m_full
        mean_mb, var_mb = model.predict(model.X, full_cov=False)
        mean_ref, var_ref = reference.predict(reference.X, full_cov=False)
        np.testing.assert_allclose(mean_mb, mean_ref)
        # Every column of the minibatch variance is compared with the reference variance.
        for col in range(var_mb.shape[1]):
            np.testing.assert_allclose(var_mb[:, [col]], var_ref)
        mean_mb, var_mb = model.predict(model.X, full_cov=True)
        mean_ref, var_ref = reference.predict(reference.X, full_cov=True)
        np.testing.assert_allclose(mean_mb, mean_ref)
        for layer in range(var_mb.shape[2]):
            np.testing.assert_allclose(var_mb[:, :, layer], var_ref)
    def test_lik_comparisons_m0_s0(self):
        """missing_data=False, stochastic=False must reproduce the full model."""
        self.setup()
        model = self._minibatch(missing_data=False, stochastic=False)
        self._assert_same_as_full(model)
        assert model.checkgrad()
    def test_lik_comparisons_m1_s1(self):
        """missing_data=True, stochastic=True with a full-width batch must reproduce the full model."""
        self.setup()
        model = self._minibatch(
            missing_data=True, stochastic=True, batchsize=self.Y.shape[1]
        )
        self._assert_same_as_full(model)
        assert model.checkgrad()
    def test_lik_comparisons_m0_s1(self):
        """missing_data=False, stochastic=True with a full-width batch must reproduce the full model."""
        self.setup()
        model = self._minibatch(
            missing_data=False, stochastic=True, batchsize=self.Y.shape[1]
        )
        self._assert_same_as_full(model)
        assert model.checkgrad()
    def test_gradients_missingdata(self):
        """Gradient check with missing_data=True and stochastic=False."""
        self.setup()
        model = self._minibatch(
            missing_data=True, stochastic=False, batchsize=self.Y.shape[1]
        )
        assert model.checkgrad()
    def test_gradients_missingdata_stochastics(self):
        """Gradient check with missing_data=True, stochastic=True for batch sizes 1 and 4."""
        self.setup()
        for size in (1, 4):
            model = self._minibatch(missing_data=True, stochastic=True, batchsize=size)
            assert model.checkgrad()
    def test_gradients_stochastics(self):
        """Gradient check with missing_data=False, stochastic=True for batch sizes 1 and 4."""
        self.setup()
        for size in (1, 4):
            model = self._minibatch(missing_data=False, stochastic=True, batchsize=size)
            assert model.checkgrad()
    def test_predict(self):
        """Same configuration and checks as test_lik_comparisons_m1_s1."""
        self.setup()
        model = self._minibatch(
            missing_data=True, stochastic=True, batchsize=self.Y.shape[1]
        )
        self._assert_same_as_full(model)
        assert model.checkgrad()
--- a/GPy/testing/misc_tests.py
+++ b/GPy/testing/misc_tests.py
@ -1,27 +1,28 @@
 from __future__ import print_function
 import numpy as np
 import scipy as sp
 import GPy
 import warnings
-class MiscTests(np.testing.TestCase):
+
 class TestMisc:
    """
    Testing some utilities of misc
    """
-    def setUp(self):
+
    def setup(self):
        self._lim_val = np.finfo(np.float64).max
        self._lim_val_exp = np.log(self._lim_val)
    def test_safe_exp_upper(self):
        self.setup()
        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter('always')  # always print
+            warnings.simplefilter("always")  # always print
            assert np.isfinite(np.exp(self._lim_val_exp))
            assert np.isinf(np.exp(self._lim_val_exp + 1))
            assert np.isfinite(GPy.util.misc.safe_exp(self._lim_val_exp + 1))
            print(w)
            print(len(w))
-            assert len(w)<=1 # should have one overflow warning
+            assert len(w) <= 1  # should have one overflow warning
    def test_safe_exp_lower(self):
        assert GPy.util.misc.safe_exp(1e-10) < np.inf
--- a/GPy/testing/test_model.py
+++ b/GPy/testing/test_model.py
--- a/GPy/testing/pep_tests.py
+++ b/GPy/testing/pep_tests.py
@ -1,94 +1,98 @@
 # Copyright (c) 2014, James Hensman, 2016, Thang Bui
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import unittest
 import numpy as np
 import GPy
-class PEPgradienttest(unittest.TestCase):
+
-    def setUp(self):
+class TestPEPgradient:
    def setup(self):
        ######################################
        # # 1 dimensional example
        np.random.seed(10)
        N = 20
        # sample inputs and outputs
-        self.X1D = np.random.uniform(-3., 3., (N, 1))
+        self.X1D = np.random.uniform(-3.0, 3.0, (N, 1))
        self.Y1D = np.sin(self.X1D) + np.random.randn(N, 1) * 0.05
        ######################################
        # # 2 dimensional example
        # sample inputs and outputs
-        self.X2D = np.random.uniform(-3., 3., (N, 2))
+        self.X2D = np.random.uniform(-3.0, 3.0, (N, 2))
-        self.Y2D = np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2]) + np.random.randn(N, 1) * 0.05
+        self.Y2D = (
            np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2])
            + np.random.randn(N, 1) * 0.05
        )
        #######################################
        # # more datapoints, check in alpha limits, the log marginal likelihood
        # # is consistent with FITC and VFE/Var_DTC
        M = 5
        np.random.seed(42)
-        self.X1 = np.c_[np.linspace(-1., 1., N)]
+        self.X1 = np.c_[np.linspace(-1.0, 1.0, N)]
        self.Y1 = np.sin(self.X1) + np.random.randn(N, 1) * 0.05
        self.kernel = GPy.kern.RBF(input_dim=1, lengthscale=0.5, variance=1)
        self.Z = np.random.uniform(-1, 1, (M, 1))
        self.lik_noise_var = 0.01
    def test_pep_1d_gradients(self):
        self.setup()
        m = GPy.models.SparseGPRegression(self.X1D, self.Y1D)
-        m.inference_method = GPy.inference.latent_function_inference.PEP(alpha=np.random.rand())
+        m.inference_method = GPy.inference.latent_function_inference.PEP(
-        self.assertTrue(m.checkgrad())
+            alpha=np.random.rand()
        )
        assert m.checkgrad()
    def test_pep_2d_gradients(self):
        self.setup()
        m = GPy.models.SparseGPRegression(self.X2D, self.Y2D)
-        m.inference_method = GPy.inference.latent_function_inference.PEP(alpha=np.random.rand())
+        m.inference_method = GPy.inference.latent_function_inference.PEP(
-        self.assertTrue(m.checkgrad())
+            alpha=np.random.rand()
        )
        assert m.checkgrad()
    def test_pep_vfe_consistency(self):
        self.setup()
        vfe_model = GPy.models.SparseGPRegression(
-            self.X1, 
+            self.X1, self.Y1, kernel=self.kernel, Z=self.Z
            self.Y1, 
            kernel=self.kernel, 
            Z=self.Z
        )
        vfe_model.inference_method = GPy.inference.latent_function_inference.VarDTC()
        vfe_model.Gaussian_noise.variance = self.lik_noise_var
        vfe_lml = vfe_model.log_likelihood()
        pep_model = GPy.models.SparseGPRegression(
-            self.X1, 
+            self.X1, self.Y1, kernel=self.kernel, Z=self.Z
-            self.Y1, 
+        )
-            kernel=self.kernel, 
+        pep_model.inference_method = GPy.inference.latent_function_inference.PEP(
-            Z=self.Z
+            alpha=1e-5
        )
        pep_model.inference_method = GPy.inference.latent_function_inference.PEP(alpha=1e-5)
        pep_model.Gaussian_noise.variance = self.lik_noise_var
        pep_lml = pep_model.log_likelihood()
-        self.assertAlmostEqual(vfe_lml[0, 0], pep_lml[0], delta=abs(0.01*pep_lml[0]))
+        np.testing.assert_almost_equal(
            vfe_lml[0, 0], pep_lml[0], decimal=abs(0.01 * pep_lml[0])
        )
    def test_pep_fitc_consistency(self):
        self.setup()
        fitc_model = GPy.models.SparseGPRegression(
-            self.X1D, 
+            self.X1D, self.Y1D, kernel=self.kernel, Z=self.Z
            self.Y1D, 
            kernel=self.kernel, 
            Z=self.Z
        )
        fitc_model.inference_method = GPy.inference.latent_function_inference.FITC()
        fitc_model.Gaussian_noise.variance = self.lik_noise_var
        fitc_lml = fitc_model.log_likelihood()
        pep_model = GPy.models.SparseGPRegression(
-            self.X1D, 
+            self.X1D, self.Y1D, kernel=self.kernel, Z=self.Z
-            self.Y1D, 
+        )
-            kernel=self.kernel, 
+        pep_model.inference_method = GPy.inference.latent_function_inference.PEP(
-            Z=self.Z
+            alpha=1
        )
        pep_model.inference_method = GPy.inference.latent_function_inference.PEP(alpha=1)
        pep_model.Gaussian_noise.variance = self.lik_noise_var
        pep_lml = pep_model.log_likelihood()
-        self.assertAlmostEqual(fitc_lml, pep_lml[0], delta=abs(0.001*pep_lml[0]))
+        np.testing.assert_almost_equal(
-
+            fitc_lml, pep_lml[0], decimal=abs(0.001 * pep_lml[0])
-
+        )
--- a/GPy/testing/test_pickle.py
+++ b/GPy/testing/test_pickle.py
@ -0,0 +1,133 @@
 """
 Created on 13 Mar 2014
@author: maxz
 """
 # import cPickle as pickle
 import pickle
 import pytest
 import numpy as np
 import tempfile
 from GPy.examples.dimensionality_reduction import mrd_simulation
 from GPy.core.parameterization.variational import NormalPosterior
 from GPy.models.gp_regression import GPRegression
 import GPy
 def toy_model():
    """Return a GPRegression model built on 50 samples of sin(x) for x in [0, 1]."""
    inputs = np.linspace(0, 1, 50).reshape(-1, 1)
    return GPRegression(X=inputs, Y=np.sin(inputs))
 class ListDictTestCase:
    """
    Assertion helpers for comparing dicts of sequences and lists of arrays.

    The class is a plain mix-in (it does not derive from unittest.TestCase),
    so the helpers rely only on ``assert`` and ``np.testing``.
    """
    def assertListDictEquals(self, d1, d2, msg=None):
        """
        Assert that, for every key of ``d1``, both dicts hold equal sequences.

        Values are converted with ``list`` before comparison, so a tuple and a
        list with the same items compare equal. Keys that exist only in ``d2``
        are not checked; a key missing from ``d2`` raises ``KeyError``.

        :param d1: mapping whose keys drive the comparison
        :param d2: mapping expected to hold equal sequences under the same keys
        :param msg: optional message used for the AssertionError
        :raises AssertionError: if any pair of sequences differs
        """
        for k, v in d1.items():
            left, right = list(v), list(d2[k])
            # unittest's assertListEqual is unavailable here, so compare directly.
            assert left == right, (
                msg
                if msg is not None
                else "lists differ for key %r: %r != %r" % (k, left, right)
            )
    def assertArrayListEquals(self, l1, l2):
        """
        Assert element-wise equality of the arrays paired up by ``zip(l1, l2)``.

        ``zip`` stops at the shorter input, so surplus items are not compared.
        """
        for a1, a2 in zip(l1, l2):
            np.testing.assert_array_equal(a1, a2)
 class TestPickleSupport(ListDictTestCase):
    """
    Check that models and parameter objects survive ``copy()`` and a pickle round trip.
    """
    # NOTE(review): no skip reason was recorded; the test needs a
    # "pickle_test.pickle" file located next to this module.
    @pytest.mark.skip(reason="")  # why is this test skipped?
    def test_load_pickle(self):
        """Load the stored pickle and check its gradients and log likelihood."""
        import os
        m = GPy.load(
            os.path.join(
                os.path.abspath(os.path.split(__file__)[0]), "pickle_test.pickle"
            )
        )
        assert m.checkgrad()
        # Compare against the expected value; a bare ``assert value, expected``
        # would only use the expected number as the failure message.
        np.testing.assert_allclose(m.log_likelihood(), -4.7351019830022087)
    def test_model(self):
        """A toy GPRegression model equals its copy and its unpickled version."""
        par = toy_model()
        pcopy = par.copy()
        assert par.param_array.tolist() == pcopy.param_array.tolist()
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        assert str(par) == str(pcopy)
        assert np.all(par.param_array == pcopy.param_array)
        assert np.all(par.gradient_full == pcopy.gradient_full)
        assert pcopy.checkgrad()
        assert np.any(pcopy.gradient != 0.0)
        # Round trip through an anonymous temporary file.
        with tempfile.TemporaryFile("w+b") as f:
            par.pickle(f)
            f.seek(0)
            pcopy = pickle.load(f)
        assert par.param_array.tolist() == pcopy.param_array.tolist()
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        assert str(par) == str(pcopy)
        assert pcopy.checkgrad()
    def test_modelrecreation(self):
        """A model rebuilt from copies of X, Y and the kernel matches the original."""
        par = toy_model()
        pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
        np.testing.assert_allclose(par.param_array, pcopy.param_array)
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        assert str(par) == str(pcopy)
        assert np.all(par.param_array == pcopy.param_array)
        assert np.all(par.gradient_full == pcopy.gradient_full)
        assert pcopy.checkgrad()
        assert np.any(pcopy.gradient != 0.0)
        np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)
        # Randomise the parameters before pickling, then compare after the round trip.
        par.randomize()
        with tempfile.TemporaryFile("w+b") as f:
            par.pickle(f)
            f.seek(0)
            pcopy = pickle.load(f)
        np.testing.assert_allclose(par.param_array, pcopy.param_array)
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full, atol=1e-6)
        assert str(par) == str(pcopy)
        assert pcopy.checkgrad()
    def test_posterior(self):
        """A NormalPosterior equals its copy and its unpickled version."""
        X = np.random.randn(3, 5)
        Xv = np.random.rand(*X.shape)
        par = NormalPosterior(X, Xv)
        par.gradient = 10
        pcopy = par.copy()
        pcopy.gradient = 10
        assert par.param_array.tolist() == pcopy.param_array.tolist()
        assert par.gradient_full.tolist() == pcopy.gradient_full.tolist()
        assert str(par) == str(pcopy)
        assert np.all(par.param_array == pcopy.param_array)
        assert np.all(par.gradient_full == pcopy.gradient_full)
        with tempfile.TemporaryFile("w+b") as f:
            par.pickle(f)
            f.seek(0)
            pcopy = pickle.load(f)
        assert par.param_array.tolist() == pcopy.param_array.tolist()
        # The gradient is assigned again on the unpickled object before comparing.
        pcopy.gradient = 10
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        np.testing.assert_allclose(pcopy.mean.gradient_full, 10)
        assert str(par) == str(pcopy)
    def test_model_concat(self):
        """The model returned by mrd_simulation equals its copy and its unpickled version."""
        par = mrd_simulation(optimize=0, plot=0, plot_sim=0)
        par.randomize()
        pcopy = par.copy()
        assert par.param_array.tolist() == pcopy.param_array.tolist()
        assert par.gradient_full.tolist() == pcopy.gradient_full.tolist()
        assert str(par) == str(pcopy)
        assert np.all(par.param_array == pcopy.param_array)
        assert np.all(par.gradient_full == pcopy.gradient_full)
        assert par.checkgrad()
        assert pcopy.checkgrad()
        assert np.any(pcopy.gradient != 0.0)
        with tempfile.TemporaryFile("w+b") as f:
            par.pickle(f)
            f.seek(0)
            pcopy = pickle.load(f)
        assert par.param_array.tolist() == pcopy.param_array.tolist()
        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        assert str(par) == str(pcopy)
        assert pcopy.checkgrad()
    def _callback(self, what, which):
        """Increment ``what.count``; ``which`` is ignored. Not called by the tests in this class."""
        what.count += 1
--- a/GPy/testing/test_plotting.py
+++ b/GPy/testing/test_plotting.py
@ -0,0 +1,703 @@
 # ===============================================================================
 # Copyright (c) 2015, Max Zwiessele
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 # * Redistributions of source code must retain the above copyright notice, this
 #   list of conditions and the following disclaimer.
 #
 # * Redistributions in binary form must reproduce the above copyright notice,
 #   this list of conditions and the following disclaimer in the documentation
 #   and/or other materials provided with the distribution.
 #
 # * Neither the name of GPy nor the names of its
 #   contributors may be used to endorse or promote products derived from
 #   this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # ===============================================================================
 # ===============================================================================
 # SKIPPING PLOTTING BECAUSE IT BEHAVES DIFFERENTLY ON DIFFERENT
 # SYSTEMS, AND WILL MISBEHAVE
 # raise SkipTest("Skipping Matplotlib testing")
 # ===============================================================================
 try:
    import matplotlib
    from matplotlib import pyplot as plt
    from matplotlib.testing.compare import compare_images
    matplotlib.use("agg")
 except ImportError:
    # matplotlib not installed
    matplotlib = None
 import pytest
 import numpy as np
 import GPy, os
 import logging
 from GPy.util.config import config
 from GPy.plotting import change_plotting_library, plotting_library
 class TestConfig:
    """Tests for switching the plotting library at runtime."""
    def teardown(self):
        """Switch the plotting library back to matplotlib."""
        change_plotting_library("matplotlib")
    @pytest.mark.skipif(matplotlib is None, reason="Matplotlib not installed")
    def test_change_plotting(self):
        """
        An unknown library name must raise ValueError, and after selecting
        "none" a call to ``plotting_library()`` must raise RuntimeError.
        """
        try:
            with pytest.raises(ValueError):
                change_plotting_library("not+in9names")
            change_plotting_library("none")
            with pytest.raises(RuntimeError):
                plotting_library()
        finally:
            # Restore the library even when an expectation above fails, so the
            # remaining tests do not run with the library set to "none".
            self.teardown()
 # Select matplotlib as the plotting library for the tests in this module.
 change_plotting_library("matplotlib")
 # File extensions the tests below hand to _image_comparison.
 extensions = ["npz"]
 # Directory of this file, expressed relative to the current working directory.
 basedir = os.path.dirname(os.path.relpath(os.path.abspath(__file__)))
 def _image_directories():
    """
    Return ``(baseline_dir, result_dir)``, both located below ``basedir``.
    The result directory is created when it does not exist yet; the baseline
    directory is only named here and never created.
    """
    expected_root, produced_root = (
        os.path.join(basedir, folder, ".") for folder in ("baseline", "testresult")
    )
    if os.path.exists(produced_root):
        return expected_root, produced_root
    os.makedirs(produced_root)
    return expected_root, produced_root
 baseline_dir, result_dir = _image_directories()
 # A missing baseline directory is recorded as None; the skipif decorators on
 # the plotting tests below check for exactly that value.
 if not os.path.exists(baseline_dir):
    baseline_dir = None
 def _image_comparison(
    baseline_images, extensions=["pdf", "svg", "png"], tol=11, rtol=1e-3, **kwargs
 ):
    """
    Save every open figure and yield one check callable per figure and extension.
    Open figures (``plt.get_fignums()``) are paired with *baseline_images* by
    ``zip``, so surplus figures or names are ignored. Each figure is first
    written to ``result_dir``: for "npz" the arrays from ``flatten_axis`` plus
    a png, for any other extension a ``savefig`` in that format. Afterwards a
    ``do_test`` callable is yielded for each pair, and all figures are closed
    once the generator is exhausted.
    :param baseline_images: base file names (without extension), one per figure
    :param extensions: file extensions to write and compare
    :param tol: tolerance handed to ``compare_images`` (non-npz extensions)
    :param rtol: relative tolerance for ``np.testing.assert_allclose`` (npz)
    :param kwargs: further keyword arguments for ``np.testing.assert_allclose``
    NOTE(review): ``do_test`` reads ``expected``/``actual``/``base`` when it is
    called, so it has to run before the generator advances to the next pair.
    """
    # First pass: write the current state of every figure to result_dir.
    for num, base in zip(plt.get_fignums(), baseline_images):
        for ext in extensions:
            fig = plt.figure(num)
            try:
                fig.canvas.draw()
            except Exception as e:
                # a failing draw is only logged; saving is still attempted
                logging.error(base)
                # raise SkipTest(e)
            # fig.axes[0].set_axis_off()
            # fig.set_frameon(False)
            if ext in ["npz"]:
                figdict = flatten_axis(fig)
                np.savez_compressed(
                    os.path.join(result_dir, "{}.{}".format(base, ext)), **figdict
                )
                # additionally try to write a png next to the npz file
                try:
                    fig.savefig(
                        os.path.join(result_dir, "{}.{}".format(base, "png")),
                        transparent=True,
                        edgecolor="none",
                        facecolor="none",
                        # bbox='tight'
                    )
                except:
                    logging.error(base)
                    # raise
            else:
                fig.savefig(
                    os.path.join(result_dir, "{}.{}".format(base, ext)),
                    transparent=True,
                    edgecolor="none",
                    facecolor="none",
                    # bbox='tight'
                )
    # Second pass: build one check callable per figure and extension.
    for num, base in zip(plt.get_fignums(), baseline_images):
        for ext in extensions:
            # plt.close(num)
            actual = os.path.join(result_dir, "{}.{}".format(base, ext))
            expected = os.path.join(baseline_dir, "{}.{}".format(base, ext))
            if ext == "npz":
                def do_test():
                    # NOTE(review): pytest.skip is used as a context manager
                    # here - confirm that it supports the with-statement.
                    with pytest.skip:
                        if not os.path.exists(expected):
                            import shutil
                            # seed the missing baseline from the fresh result,
                            # then report the missing file
                            shutil.copy2(actual, expected)
                            # shutil.copy2(os.path.join(result_dir, "{}.{}".format(base, 'png')), os.path.join(baseline_dir, "{}.{}".format(base, 'png')))
                            raise IOError(
                                "Baseline file {} not found, copying result {}".format(
                                    expected, actual
                                )
                            )
                        else:
                            exp_dict = dict(np.load(expected).items())
                            act_dict = dict(np.load(actual).items())
                            # only names present in both files are compared
                            for name in act_dict:
                                if name in exp_dict:
                                    try:
                                        np.testing.assert_allclose(
                                            exp_dict[name],
                                            act_dict[name],
                                            err_msg="Mismatch in {}.{}".format(
                                                base, name
                                            ),
                                            rtol=rtol,
                                            **kwargs
                                        )
                                    except AssertionError as e:
                                        # a mismatch is swallowed here, so
                                        # differing values do not fail the check
                                        pass
            else:
                def do_test():
                    err = compare_images(expected, actual, tol, in_decorator=True)
                    # a difference is only printed, it does not fail the check
                    if err:
                        print(
                            "Error between {} and {} is {:.5f}, which is bigger then the tolerance of {:.5f}".format(
                                actual, expected, err["rms"], tol
                            )
                        )
                        pass
            yield do_test
    plt.close("all")
 def flatten_axis(ax, prevname=""):
    """
    Collect numpy arrays reachable from the members of *ax* into a flat dict.
    Members are listed with ``inspect.getmembers``. An array member is stored
    under ``prevname + name``; a non-empty list member is walked element by
    element, with ``[i]`` appended to the key for every list level and
    ``.key`` for every dict level. Any other list element is flattened by a
    recursive call to this function. Empty arrays inside lists are skipped.
    :param ax: object to inspect (for example a matplotlib figure)
    :param prevname: prefix prepended to every key
    :returns: dict mapping key path to numpy array
    """
    import inspect
    members = inspect.getmembers(ax)
    arrays = {}
    def _flatten(l, pre):
        """Flatten one list element *l*; every key starts with *pre*."""
        arr = {}
        if isinstance(l, np.ndarray):
            if l.size:
                arr[pre] = np.asarray(l)
        elif isinstance(l, dict):
            # Descend into each value. Recursing on the dict itself, as was
            # done before, never terminates for a non-empty dict. Results go
            # into the local dict, exactly like in the list branch.
            for _n in l:
                arr.update(_flatten(l[_n], "{}.{}".format(pre, _n)))
        elif isinstance(l, list) and len(l) > 0:
            for i in range(len(l)):
                arr.update(_flatten(l[i], pre + "[{}]".format(i)))
        else:
            return flatten_axis(l, pre + ".")
        return arr
    for name, l in members:
        if isinstance(l, np.ndarray):
            arrays[prevname + name] = np.asarray(l)
        elif isinstance(l, list) and len(l) > 0:
            for i in range(len(l)):
                arrays.update(_flatten(l[i], prevname + name + "[{}]".format(i)))
    return arrays
 def _a(x, y, decimal):
    """Assert that *x* and *y* agree to *decimal* decimal places."""
    np.testing.assert_array_almost_equal(x, y, decimal)
 def compare_axis_dicts(x, y, decimal=6):
    """
    Compare two dicts of arrays and print, rather than raise, any mismatch.
    The dicts must have the same length, and every entry of *x* must agree
    with the entry of the same name in *y* to *decimal* decimal places. A
    name of *x* that is missing from *y* still raises ``KeyError``.
    :param x: dict mapping name to array
    :param y: dict mapping name to array
    :param decimal: number of decimal places that have to agree
    """
    try:
        assert len(x) == len(y)
        for name in x:
            _a(x[name], y[name], decimal)
    except AssertionError as e:
        # Python 3 exceptions have no ``message`` attribute; printing the
        # exception itself reports the mismatch without an AttributeError.
        print(e)
@pytest.mark.skipif(
    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
 )
 def test_figure():
    # Exercises the plotting-library interface directly: four canvases are
    # created (imshow_interact, annotation_heatmap_interact, fill_gradient and
    # a 3d line plot) and handed to _image_comparison as "coverage_*".
    # Skipped when matplotlib is missing or no baseline directory exists.
    # NOTE(review): generator-style test; pytest >= 4 does not run yielded
    # checks - confirm against the pytest version in use.
    np.random.seed(1239847)
    from GPy.plotting import plotting_library as pl
    # import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams["text.usetex"] = False
    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ax, _ = pl().new_canvas(num="imshow_interact")
        def test_func(x):
            return x[:, 0].reshape(3, 3)
        pl().imshow_interact(ax, test_func, extent=(-1, 1, -1, 1), resolution=3)
        ax, _ = pl().new_canvas()
        def test_func_2(x):
            y = x[:, 0].reshape(3, 3)
            anno = np.argmax(x, axis=1).reshape(3, 3)
            return y, anno
        # the heatmap is drawn twice on the same canvas, the second time with
        # explicit imshow keyword arguments
        pl().annotation_heatmap_interact(
            ax, test_func_2, extent=(-1, 1, -1, 1), resolution=3
        )
        pl().annotation_heatmap_interact(
            ax,
            test_func_2,
            extent=(-1, 1, -1, 1),
            resolution=3,
            imshow_kwargs=dict(interpolation="nearest"),
        )
        ax, _ = pl().new_canvas(figsize=(4, 3))
        x = np.linspace(0, 1, 100)
        y = [0, 1, 2]
        array = np.array([0.4, 0.5])
        cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
            "WhToColor", ("r", "b"), N=array.size
        )
        pl().fill_gradient(ax, x, y, facecolors=["r", "g"], array=array, cmap=cmap)
        ax, _ = pl().new_canvas(
            num="3d_plot",
            figsize=(4, 3),
            projection="3d",
            xlabel="x",
            ylabel="y",
            zlabel="z",
            title="awsome title",
            xlim=(-1, 1),
            ylim=(-1, 1),
            zlim=(-3, 3),
        )
        z = 2 - np.abs(np.linspace(-2, 2, (100))) + 1
        x, y = z * np.sin(np.linspace(-2 * np.pi, 2 * np.pi, (100))), z * np.cos(
            np.linspace(-np.pi, np.pi, (100))
        )
        pl().plot(ax, x, y, z, linewidth=2)
        for do_test in _image_comparison(
            baseline_images=[
                "coverage_{}".format(sub)
                for sub in [
                    "imshow_interact",
                    "annotation_interact",
                    "gradient",
                    "3d_plot",
                ]
            ],
            extensions=extensions,
        ):
            yield (do_test,)
@pytest.mark.skipif(
    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
 )
 def test_kernel():
    # Plots the ARD view and four covariance views of a combined kernel and
    # hands the figures to _image_comparison as "kern_*".
    # Skipped when matplotlib is missing or no baseline directory exists.
    # NOTE(review): generator-style test; pytest >= 4 does not run yielded
    # checks - confirm against the pytest version in use.
    np.random.seed(1239847)
    # import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams["text.usetex"] = False
    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        k = GPy.kern.RBF(5, ARD=True) * GPy.kern.Linear(
            3, active_dims=[0, 2, 4], ARD=True
        ) + GPy.kern.Bias(2)
        k.randomize()
        # k2 is the same kernel with an extra White term appended
        k2 = (
            GPy.kern.RBF(5, ARD=True)
            * GPy.kern.Linear(3, active_dims=[0, 2, 4], ARD=True)
            + GPy.kern.Bias(2)
            + GPy.kern.White(4)
        )
        # copy the randomized parameters of k into all but the last entry of k2
        k2[:-1] = k[:]
        k2.plot_ARD(["rbf", "linear", "bias"], legend=True)
        k2.plot_covariance(visible_dims=[0, 3], plot_limits=(-1, 3))
        k2.plot_covariance(visible_dims=[2], plot_limits=(-1, 3))
        k2.plot_covariance(
            visible_dims=[2, 4],
            plot_limits=((-1, 0), (5, 3)),
            projection="3d",
            rstride=10,
            cstride=10,
        )
        k2.plot_covariance(visible_dims=[1, 4])
        for do_test in _image_comparison(
            baseline_images=[
                "kern_{}".format(sub)
                for sub in ["ARD", "cov_2d", "cov_1d", "cov_3d", "cov_no_lim"]
            ],
            extensions=extensions,
        ):
            yield (do_test,)
@pytest.mark.skipif(
    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
 )
 def test_plot():
    # Builds a SparseGPRegression model on 40 one-dimensional inputs with input
    # variance and calls seven plot methods; the figures are handed to
    # _image_comparison as "gp_*".
    # Skipped when matplotlib is missing or no baseline directory exists.
    # NOTE(review): generator-style test; pytest >= 4 does not run yielded
    # checks - confirm against the pytest version in use.
    np.random.seed(111)
    import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams["text.usetex"] = False
    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        X = np.random.uniform(-2, 2, (40, 1))
        f = 0.2 * np.sin(1.3 * X) + 1.3 * np.cos(2 * X)
        Y = f + np.random.normal(0, 0.1, f.shape)
        m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X) * [0.06])
        # m.optimize()
        # the order of these calls has to match the baseline names below
        m.plot_data()
        m.plot_mean()
        m.plot_confidence()
        m.plot_density()
        m.plot_errorbars_trainset()
        m.plot_samples()
        m.plot_data_error()
    for do_test in _image_comparison(
        baseline_images=[
            "gp_{}".format(sub)
            for sub in [
                "data",
                "mean",
                "conf",
                "density",
                "out_error",
                "samples",
                "in_error",
            ]
        ],
        extensions=extensions,
    ):
        yield (do_test,)
@pytest.mark.skipif(
    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
 )
 def test_twod():
    # Builds a SparseGPRegression model on 40 two-dimensional inputs with
    # per-dimension input variance and calls four plot methods; the figures are
    # handed to _image_comparison as "gp_2d_*".
    # Skipped when matplotlib is missing or no baseline directory exists.
    # NOTE(review): generator-style test; pytest >= 4 does not run yielded
    # checks - confirm against the pytest version in use.
    np.random.seed(11111)
    import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams["text.usetex"] = False
    X = np.random.uniform(-2, 2, (40, 2))
    f = 0.2 * np.sin(1.3 * X[:, [0]]) + 1.3 * np.cos(2 * X[:, [1]])
    Y = f + np.random.normal(0, 0.1, f.shape)
    m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X) * [0.01, 0.2])
    # m.optimize()
    # the order of these calls has to match the baseline names below
    m.plot_data()
    m.plot_mean()
    m.plot_inducing(legend=False, marker="s")
    # m.plot_errorbars_trainset()
    m.plot_data_error()
    for do_test in _image_comparison(
        baseline_images=[
            "gp_2d_{}".format(sub)
            for sub in [
                "data",
                "mean",
                "inducing",
                #'out_error',
                "in_error",
            ]
        ],
        extensions=extensions,
    ):
        yield (do_test,)
@pytest.mark.skipif(
    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
 )
 def test_threed():
    # Builds a SparseGPRegression model on 40 two-dimensional inputs and draws
    # it with projection="3d"; the data, mean and inducing figures are handed
    # to _image_comparison as "gp_3d_*".
    # Skipped when matplotlib is missing or no baseline directory exists.
    # NOTE(review): generator-style test; pytest >= 4 does not run yielded
    # checks - confirm against the pytest version in use.
    np.random.seed(11111)
    import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams["text.usetex"] = False
    X = np.random.uniform(-2, 2, (40, 2))
    f = 0.2 * np.sin(1.3 * X[:, [0]]) + 1.3 * np.cos(2 * X[:, [1]])
    Y = f + np.random.normal(0, 0.1, f.shape)
    m = GPy.models.SparseGPRegression(X, Y)
    m.likelihood.variance = 0.1
    # m.optimize()
    # the sample plots are only called, not compared: their figures are closed
    # right away by plt.close("all")
    m.plot_samples(projection="3d", samples=1)
    m.plot_samples(projection="3d", plot_raw=False, samples=1)
    plt.close("all")
    m.plot_data(projection="3d")
    m.plot_mean(projection="3d", rstride=10, cstride=10)
    m.plot_inducing(projection="3d")
    # m.plot_errorbars_trainset(projection='3d')
    for do_test in _image_comparison(
        baseline_images=[
            "gp_3d_{}".format(sub)
            for sub in [
                "data",
                "mean",
                "inducing",
            ]
        ],
        extensions=extensions,
    ):
        yield (do_test,)
@pytest.mark.skipif(
    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
 )
 def test_sparse():
    # Builds a SparseGPRegression model with input variance and draws the data
    # and the data error on one shared axis; the figure is handed to
    # _image_comparison as "sparse_gp_data_error".
    # Skipped when matplotlib is missing or no baseline directory exists.
    # NOTE(review): generator-style test; pytest >= 4 does not run yielded
    # checks - confirm against the pytest version in use.
    np.random.seed(11111)
    import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams["text.usetex"] = False
    X = np.random.uniform(-2, 2, (40, 1))
    f = 0.2 * np.sin(1.3 * X) + 1.3 * np.cos(2 * X)
    Y = f + np.random.normal(0, 0.1, f.shape)
    m = GPy.models.SparseGPRegression(X, Y, X_variance=np.ones_like(X) * 0.1)
    # m.optimize()
    # m.plot_inducing()
    _, ax = plt.subplots()
    m.plot_data(ax=ax)
    m.plot_data_error(ax=ax)
    for do_test in _image_comparison(
        baseline_images=["sparse_gp_{}".format(sub) for sub in ["data_error"]],
        extensions=extensions,
    ):
        yield (do_test,)
@pytest.mark.skipif(
    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
 )
 def test_classification():
    # Builds a GPClassification model on labels ``Y > Y.mean()`` and draws it
    # on three separate axes with different plot_raw / apply_link settings; the
    # figures are handed to _image_comparison as "gp_class_*".
    # Skipped when matplotlib is missing or no baseline directory exists.
    # NOTE(review): generator-style test; pytest >= 4 does not run yielded
    # checks - confirm against the pytest version in use.
    np.random.seed(11111)
    import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams["text.usetex"] = False
    X = np.random.uniform(-2, 2, (40, 1))
    f = 0.2 * np.sin(1.3 * X) + 1.3 * np.cos(2 * X)
    Y = f + np.random.normal(0, 0.1, f.shape)
    m = GPy.models.GPClassification(X, Y > Y.mean())
    # m.optimize()
    _, ax = plt.subplots()
    m.plot(plot_raw=False, apply_link=False, ax=ax, samples=3)
    m.plot_errorbars_trainset(plot_raw=False, apply_link=False, ax=ax)
    _, ax = plt.subplots()
    m.plot(plot_raw=True, apply_link=False, ax=ax, samples=3)
    m.plot_errorbars_trainset(plot_raw=True, apply_link=False, ax=ax)
    _, ax = plt.subplots()
    m.plot(plot_raw=True, apply_link=True, ax=ax, samples=3)
    m.plot_errorbars_trainset(plot_raw=True, apply_link=True, ax=ax)
    for do_test in _image_comparison(
        baseline_images=[
            "gp_class_{}".format(sub) for sub in ["likelihood", "raw", "raw_link"]
        ],
        extensions=extensions,
    ):
        yield (do_test,)
@pytest.mark.skipif(
    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
 )
 def test_sparse_classification():
    # Builds a SparseGPClassification model on labels ``Y > Y.mean()`` and
    # plots it three times with different plot_raw / apply_link settings; the
    # figures are handed to _image_comparison as "sparse_gp_class_*" with a
    # relative tolerance of 2.
    # Skipped when matplotlib is missing or no baseline directory exists.
    # NOTE(review): generator-style test; pytest >= 4 does not run yielded
    # checks - confirm against the pytest version in use.
    np.random.seed(11111)
    import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams["text.usetex"] = False
    X = np.random.uniform(-2, 2, (40, 1))
    f = 0.2 * np.sin(1.3 * X) + 1.3 * np.cos(2 * X)
    Y = f + np.random.normal(0, 0.1, f.shape)
    m = GPy.models.SparseGPClassification(X, Y > Y.mean())
    # m.optimize()
    m.plot(plot_raw=False, apply_link=False, samples_likelihood=3)
    # the random state is reset before each of the remaining plots
    np.random.seed(111)
    m.plot(plot_raw=True, apply_link=False, samples=3)
    np.random.seed(111)
    m.plot(plot_raw=True, apply_link=True, samples=3)
    for do_test in _image_comparison(
        baseline_images=[
            "sparse_gp_class_{}".format(sub)
            for sub in ["likelihood", "raw", "raw_link"]
        ],
        extensions=extensions,
        rtol=2,
    ):
        yield (do_test,)
@pytest.mark.skipif(
    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
 )
 def test_gplvm():
    # Rebuilds a GPLVM from the arrays stored in "b-gplvm-save.npz" (keys "Y",
    # "Q", "labels", "gplvm_p") and draws four plots; the figures are handed to
    # _image_comparison as "gplvm_*" with tol=12.
    # Skipped when matplotlib is missing or no baseline directory exists.
    # NOTE(review): generator-style test; pytest >= 4 does not run yielded
    # checks - confirm against the pytest version in use.
    from GPy.models import GPLVM
    np.random.seed(12345)
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams["text.usetex"] = False
    # Q = 3
    # Define dataset
    # N = 60
    # k1 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,10,10,0.1,0.1]), ARD=True)
    # k2 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,0.1,10,0.1,10]), ARD=True)
    # k3 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[0.1,0.1,10,10,10]), ARD=True)
    # X = np.random.normal(0, 1, (N, 5))
    # A = np.random.multivariate_normal(np.zeros(N), k1.K(X), Q).T
    # B = np.random.multivariate_normal(np.zeros(N), k2.K(X), Q).T
    # C = np.random.multivariate_normal(np.zeros(N), k3.K(X), Q).T
    # Y = np.vstack((A,B,C))
    # labels = np.hstack((np.zeros(A.shape[0]), np.ones(B.shape[0]), np.ones(C.shape[0])*2))
    # k = RBF(Q, ARD=True, lengthscale=2)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
    pars = np.load(os.path.join(basedir, "b-gplvm-save.npz"))
    Y = pars["Y"]
    Q = pars["Q"]
    labels = pars["labels"]
    import warnings
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")  # always print
        m = GPLVM(Y, Q, initialize=False)
    # model updates are switched off while the stored parameters are loaded
    m.update_model(False)
    m.initialize_parameter()
    m[:] = pars["gplvm_p"]
    m.update_model(True)
    # m.optimize(messages=0)
    np.random.seed(111)
    m.plot_latent(labels=labels)
    np.random.seed(111)
    m.plot_scatter(projection="3d", labels=labels)
    np.random.seed(111)
    m.plot_magnification(labels=labels)
    m.plot_steepest_gradient_map(resolution=10, data_labels=labels)
    for do_test in _image_comparison(
        baseline_images=[
            "gplvm_{}".format(sub)
            for sub in ["latent", "latent_3d", "magnification", "gradient"]
        ],
        extensions=extensions,
        tol=12,
    ):
        yield (do_test,)
@pytest.mark.skipif(
    matplotlib is None or baseline_dir is None, reason="Matplotlib not installed"
 )
 def test_bayesian_gplvm():
    from ..models import BayesianGPLVM
    np.random.seed(12345)
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    # matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams["text.usetex"] = False
    # Q = 3
    # Define dataset
    # N = 10
    # k1 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,10,10,0.1,0.1]), ARD=True)
    # k2 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[10,0.1,10,0.1,10]), ARD=True)
    # k3 = GPy.kern.RBF(5, variance=1, lengthscale=1./np.random.dirichlet(np.r_[0.1,0.1,10,10,10]), ARD=True)
    # X = np.random.normal(0, 1, (N, 5))
    # A = np.random.multivariate_normal(np.zeros(N), k1.K(X), Q).T
    # B = np.random.multivariate_normal(np.zeros(N), k2.K(X), Q).T
    # C = np.random.multivariate_normal(np.zeros(N), k3.K(X), Q).T
    # Y = np.vstack((A,B,C))
    # labels = np.hstack((np.zeros(A.shape[0]), np.ones(B.shape[0]), np.ones(C.shape[0])*2))
    # k = RBF(Q, ARD=True, lengthscale=2)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
    pars = np.load(os.path.join(basedir, "b-gplvm-save.npz"))
    Y = pars["Y"]
    Q = pars["Q"]
    labels = pars["labels"]
    import warnings
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")  # always print
        m = BayesianGPLVM(Y, Q, initialize=False)
    m.update_model(False)
    m.initialize_parameter()
    m[:] = pars["bgplvm_p"]
    m.update_model(True)
    # m.optimize(messages=0)
    np.random.seed(111)
    m.plot_inducing(projection="2d")
    np.random.seed(111)
    m.plot_inducing(projection="3d")
    np.random.seed(111)
    m.plot_latent(projection="2d", labels=labels)
    np.random.seed(111)
    m.plot_scatter(projection="3d", labels=labels)
    np.random.seed(111)
    m.plot_magnification(labels=labels)
    np.random.seed(111)
    m.plot_steepest_gradient_map(resolution=10, data_labels=labels)
    for do_test in _image_comparison(
        baseline_images=[
            "bayesian_gplvm_{}".format(sub)
            for sub in [
                "inducing",
                "inducing_3d",
                "latent",
                "latent_3d",
                "magnification",
                "gradient",
            ]
        ],
        extensions=extensions,
    ):
        yield (do_test,)
--- a/GPy/testing/prior_tests.py
+++ b/GPy/testing/prior_tests.py
@ -1,17 +1,17 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
+import pytest
 import unittest
 import numpy as np
 import GPy
-class PriorTests(unittest.TestCase):
+
 class TestPrior:
    def test_studentT(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
+        y = b * X + C + 1 * np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        studentT = GPy.priors.StudentT(1, 2, 4)
@ -20,119 +20,123 @@ class PriorTests(unittest.TestCase):
        # setting a StudentT prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, studentT)
+
        with pytest.raises(AssertionError):
            m.rbf.set_prior(studentT)
        # The gradients need to be checked
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
        # Check the singleton pattern:
-        self.assertIs(studentT, GPy.priors.StudentT(1,2,4))
+        assert studentT is GPy.priors.StudentT(1, 2, 4)
-        self.assertIsNot(studentT, GPy.priors.StudentT(2,2,4))
+        assert studentT is not GPy.priors.StudentT(2, 2, 4)
    def test_lognormal(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
+        y = b * X + C + 1 * np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        lognormal = GPy.priors.LogGaussian(1, 2)
        m.rbf.set_prior(lognormal)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
    def test_Gamma(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
+        y = b * X + C + 1 * np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        Gamma = GPy.priors.Gamma(1, 1)
        m.rbf.set_prior(Gamma)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
    def test_InverseGamma(self):
        # Test that this prior object can be instantiated and performs its basic functions
        # in integration.
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
+        y = b * X + C + 1 * np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        InverseGamma = GPy.priors.InverseGamma(1, 1)
        m.rbf.set_prior(InverseGamma)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
    def test_incompatibility(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
+        y = b * X + C + 1 * np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        gaussian = GPy.priors.Gaussian(1, 1)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, gaussian)
+        with pytest.raises(AssertionError):
            m.rbf.set_prior(gaussian)
    def test_set_prior(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
+        y = b * X + C + 1 * np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        gaussian = GPy.priors.Gaussian(1, 1)
-        #m.rbf.set_prior(gaussian)
+        # m.rbf.set_prior(gaussian)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, gaussian)
+        with pytest.raises(AssertionError):
            m.rbf.set_prior(gaussian)
    def test_uniform(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
+        y = b * X + C + 1 * np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.SparseGPRegression(X, y)
        uniform = GPy.priors.Uniform(0, 2)
        m.rbf.set_prior(uniform)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
        m.Z.set_prior(uniform)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
        m.Z.unconstrain()
        uniform = GPy.priors.Uniform(-1, 10)
        m.Z.set_prior(uniform)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
        m.Z.constrain_negative()
        uniform = GPy.priors.Uniform(-1, 0)
        m.Z.set_prior(uniform)
        m.randomize()
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
    def test_set_gaussian_for_reals(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
+        y = b * X + C + 1 * np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.SparseGPRegression(X, y)
@ -140,16 +144,15 @@ class PriorTests(unittest.TestCase):
        m.Z.set_prior(gaussian)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        #self.assertRaises(AssertionError, m.Z.set_prior, gaussian)
+        # self.assertRaises(AssertionError, m.Z.set_prior, gaussian)
-        self.assertTrue(m.checkgrad())
+        assert m.checkgrad()
    def test_fixed_domain_check(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
+        y = b * X + C + 1 * np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
@ -157,14 +160,15 @@ class PriorTests(unittest.TestCase):
        gaussian = GPy.priors.Gaussian(1, 1)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, gaussian)
+        with pytest.raises(AssertionError):
            m.rbf.set_prior(gaussian)
    def test_fixed_domain_check1(self):
-        xmin, xmax = 1, 2.5*np.pi
+        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
+        y = b * X + C + 1 * np.sin(X)
-        y += 0.05*np.random.randn(len(X))
+        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
@ -172,8 +176,5 @@ class PriorTests(unittest.TestCase):
        gaussian = GPy.priors.Gaussian(1, 1)
        # setting a Gaussian prior on non-negative parameters
        # should raise an assertionerror.
-        self.assertRaises(AssertionError, m.rbf.set_prior, gaussian)
+        with pytest.raises(AssertionError):
-
+            m.rbf.set_prior(gaussian)
 if __name__ == "__main__":
    print("Running unit tests, please be (very) patient...")
    unittest.main()
--- a/GPy/testing/quadrature_tests.py
+++ b/GPy/testing/quadrature_tests.py
@ -1,23 +1,19 @@
 from __future__ import print_function, division
 import numpy as np
-import GPy
+from ..util.quad_integrate import quadgk_int, quadvgk
 import warnings
 from  ..util.quad_integrate import quadgk_int, quadvgk
-
+class TestQuad:
 class QuadTests(np.testing.TestCase):
    """
    test file for checking implementation of gaussian-kronrod quadrature.
    we will take a function which can be integrated analytically and check if quadgk result is similar or not!
    through this file we can test how numerically accurate quadrature implementation in native numpy or manual code is.
    """
    def setUp(self):
        pass
    def test_infinite_quad(self):
        def f(x):
-            return np.exp(-0.5*x**2)*np.power(x,np.arange(3)[:,None])
+            return np.exp(-0.5 * x**2) * np.power(x, np.arange(3)[:, None])
        quad_int_val = quadgk_int(f)
        real_val = np.sqrt(np.pi * 2)
        np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
@ -25,15 +21,18 @@ class QuadTests(np.testing.TestCase):
    def test_finite_quad(self):
        def f2(x):
            return x**2
-        quad_int_val = quadvgk(f2, 1.,2.)
+
-        real_val = 7/3.
+        quad_int_val = quadvgk(f2, 1.0, 2.0)
        real_val = 7 / 3.0
        np.testing.assert_almost_equal(real_val, quad_int_val, decimal=5)
-if __name__ == '__main__':
+
 if __name__ == "__main__":
    def f(x):
-        return np.exp(-0.5 * x ** 2) * np.power(x, np.arange(3)[:, None])
+        return np.exp(-0.5 * x**2) * np.power(x, np.arange(3)[:, None])
    quad_int_val = quadgk_int(f)
-    real_val = np.sqrt(np.pi*2)
+    real_val = np.sqrt(np.pi * 2)
    np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
    print(quadgk_int(f))
--- a/GPy/testing/test_rv_transformation.py
+++ b/GPy/testing/test_rv_transformation.py
@ -0,0 +1,84 @@
 # Written by Ilias Bilionis
 """
 Test if hyperparameters in models are properly transformed.
 """
 import pytest
 import numpy as np
 import scipy.stats as st
 import GPy
 class Model(GPy.core.Model):
    """
    Minimal GPy model with a single linked parameter called ``theta``.
    Its log likelihood is the constant 0.0.
    """
    def __init__(self, theta=1.0):
        """Name the model "test_model" and link ``theta`` as a GPy Param."""
        super().__init__("test_model")
        theta_param = GPy.core.Param("theta", theta)
        self.link_parameter(theta_param)
    def log_likelihood(self):
        """Return the constant log likelihood 0.0."""
        return 0.0
 class TestRVTransformation:
    """
    Tests for a LogGaussian prior placed on a parameter that is constrained
    through a GPy transformation (Logexp and Exponent).
    """
    def _test_trans(self, trans):
        """
        Compare the density derived from the model objective with a kernel
        density estimate of prior samples pushed through ``trans.finv``.
        """
        model = Model()
        log_gaussian = GPy.priors.LogGaussian(0.5, 0.1)
        model.theta.set_prior(log_gaussian)
        model.theta.unconstrain()
        model.theta.constrain(trans)
        # Density of the transformed variable, read off the model objective.
        def density_from_objective(value):
            return np.exp(-model._objective_grads(value)[0])
        # Empirical counterpart: sample the prior and map the samples
        # through the inverse of the transformation.
        prior_samples = log_gaussian.rvs(1e5)
        transformed_samples = trans.finv(prior_samples)
        # Kernel density estimate of the transformed samples.
        kde = st.gaussian_kde(transformed_samples)
        # Grid on which both densities are evaluated.
        grid = np.linspace(transformed_samples.min(), transformed_samples.max(), 100)
        model_pdf = np.array([density_from_objective(point) for point in grid])
        kde_pdf = kde(grid)
        # UNCOMMENT TO SEE GRAPHICAL COMPARISON
        # import matplotlib.pyplot as plt
        # fig, ax = plt.subplots()
        # ax.hist(transformed_samples, density=True, bins=100, alpha=0.25, label='Histogram')
        # ax.plot(grid, kde_pdf, '--', linewidth=2, label='Kernel Density Estimation')
        # ax.plot(grid, model_pdf, ':', linewidth=2, label='Transformed PDF')
        # ax.set_xlabel(r'transformed $\theta$', fontsize=16)
        # ax.set_ylabel('PDF', fontsize=16)
        # plt.legend(loc='best')
        # plt.show(block=True)
        # END OF PLOT
        # The comparison is against a sample-based estimate, so only a
        # loose relative tolerance is used.
        relative_error = np.linalg.norm(model_pdf - kde_pdf) / np.linalg.norm(kde_pdf)
        assert relative_error <= 1e-1
    def _test_grad(self, trans):
        """
        Run ``checkgrad`` on a 20-element ``theta`` with a LogGaussian prior
        and the given constraint, after randomizing the model.
        """
        np.random.seed(1234)
        initial_theta = np.random.uniform(0.5, 1.5, 20)
        model = Model(initial_theta)
        model.theta.set_prior(GPy.priors.LogGaussian(0.5, 0.1))
        model.theta.constrain(trans)
        model.randomize()
        print(model)
        assert model.checkgrad(1)
    def test_Logexp(self):
        """Density check with the Logexp constraint."""
        self._test_trans(GPy.constraints.Logexp())
    @pytest.mark.skip(
        "Gradient not checking right, @jameshensman what is going on here?"
    )
    def test_Logexp_grad(self):
        """Gradient check with the Logexp constraint (currently skipped)."""
        self._test_grad(GPy.constraints.Logexp())
    def test_Exponent(self):
        """Density check with the Exponent constraint."""
        self._test_trans(GPy.constraints.Exponent())
    @pytest.mark.skip(
        "Gradient not checking right, @jameshensman what is going on here?"
    )
    def test_Exponent_grad(self):
        """Gradient check with the Exponent constraint (currently skipped)."""
        self._test_grad(GPy.constraints.Exponent())
--- a/GPy/testing/test_serialization.py
+++ b/GPy/testing/test_serialization.py
@ -0,0 +1,440 @@
 """
 Created on 20 April 2017
@author: pgmoren
 """
 import numpy as np
 import GPy
 import os
 fixed_seed = 11
 class TestSerialization:
    def test_serialize_deserialize_kernels(self):
        """
        Round-trip kernels through ``to_dict`` / ``GPy.kern.Kern.from_dict``.
        Each restored kernel must have the same type, the same parameter
        values (``kk[:]``) and the same ``active_dims`` as the original.
        """
        # Ten base kernels, all with input_dim 2; several select or reorder
        # dimensions explicitly through active_dims.
        k1 = GPy.kern.RBF(2, variance=1.0, lengthscale=[1.0, 1.0], ARD=True)
        k2 = GPy.kern.RatQuad(
            2, variance=2.0, lengthscale=1.0, power=2.0, active_dims=[0, 1]
        )
        k3 = GPy.kern.Bias(2, variance=2.0, active_dims=[1, 0])
        k4 = GPy.kern.StdPeriodic(
            2, variance=2.0, lengthscale=1.0, period=1.0, active_dims=[1, 1]
        )
        k5 = GPy.kern.Linear(2, variances=[2.0, 1.0], ARD=True, active_dims=[1, 1])
        k6 = GPy.kern.Exponential(2, variance=1.0, lengthscale=2)
        k7 = GPy.kern.Matern32(
            2, variance=1.0, lengthscale=[1.0, 3.0], ARD=True, active_dims=[1, 1]
        )
        k8 = GPy.kern.Matern52(
            2, variance=2.0, lengthscale=[2.0, 1.0], ARD=True, active_dims=[1, 0]
        )
        k9 = GPy.kern.ExpQuad(
            2, variance=3.0, lengthscale=[1.0, 2.0], ARD=True, active_dims=[0, 1]
        )
        k10 = GPy.kern.OU(
            2, variance=2.0, lengthscale=[2.0, 1.0], ARD=True, active_dims=[1, 0]
        )
        # Composite kernels: sums, products and nested mixes of the above.
        k11 = k1 + k1.copy() + k2 + k3 + k4 + k5 + k6
        k12 = k1 * k2 * k2.copy() * k3 * k4 * k5
        k13 = (k1 + k2) * (k3 + k4 + k5)
        k14 = ((k1 + k2) * k3) + k4 + k5 * k7
        k15 = ((k1 + k2) * k3) + k4 * k5 + k8 * k10
        k16 = ((k1 * k2) * k3) + k4 * k5 + k8 + k9
        k_list = [k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k14, k15, k16]
        for kk in k_list:
            # Serialize to a dict and rebuild through the base-class factory.
            kk_dict = kk.to_dict()
            kk_r = GPy.kern.Kern.from_dict(kk_dict)
            assert type(kk) == type(kk_r)
            np.testing.assert_array_equal(kk[:], kk_r[:])
            np.testing.assert_array_equal(
                np.array(kk.active_dims), np.array(kk_r.active_dims)
            )
    def test_serialize_deserialize_mappings(self):
        """
        Round-trip Identity, Constant and Linear mappings through
        ``to_dict`` / ``Mapping.from_dict`` and compare the restored objects.
        """
        restore = GPy.core.mapping.Mapping.from_dict
        identity = GPy.mappings.Identity(3, 2)
        constant = GPy.mappings.Constant(3, 2, 1)
        constant_restored = restore(constant.to_dict())
        # The Constant mapping must keep its C values.
        np.testing.assert_array_equal(
            constant.C.values[:], constant_restored.C.values[:]
        )
        linear = GPy.mappings.Linear(3, 2)
        linear_restored = restore(linear.to_dict())
        # The Linear mapping must keep its A matrix.
        assert np.all(linear.A == linear_restored.A)
        # Every mapping must come back with the same class and with
        # input/output dimensions of the same Python type.
        for original in (identity, constant, linear):
            restored = restore(original.to_dict())
            assert type(original) == type(restored)
            assert type(original.input_dim) == type(restored.input_dim)
            assert type(original.output_dim) == type(restored.output_dim)
    def test_serialize_deserialize_likelihoods(self):
        """
        Round-trip a Gaussian and a Bernoulli likelihood through
        ``to_dict`` / ``Likelihood.from_dict``.
        """
        restore = GPy.likelihoods.likelihood.Likelihood.from_dict
        link_functions = GPy.likelihoods.link_functions
        gaussian = GPy.likelihoods.Gaussian(link_functions.Identity(), variance=3.0)
        gaussian_restored = restore(gaussian.to_dict())
        bernoulli = GPy.likelihoods.Bernoulli(link_functions.Probit())
        bernoulli_restored = restore(bernoulli.to_dict())
        # Gaussian: same class and same variance after the round trip.
        assert type(gaussian) == type(gaussian_restored)
        assert np.all(gaussian.variance == gaussian_restored.variance)
        # Bernoulli: only the class is compared.
        assert type(bernoulli) == type(bernoulli_restored)
    def test_serialize_deserialize_normalizers(self):
        """
        Round-trip a Standardize normalizer, fitted on 10 random values,
        through ``to_dict`` / ``_Norm.from_dict``.
        """
        standardizer = GPy.util.normalizer.Standardize()
        standardizer.scale_by(np.random.rand(10))
        serialized = standardizer.to_dict()
        restored = GPy.util.normalizer._Norm.from_dict(serialized)
        assert type(standardizer) == type(restored)
        # The fitted statistics must be preserved.
        assert np.all(standardizer.mean == restored.mean)
        assert np.all(standardizer.std == restored.std)
    def test_serialize_deserialize_link_functions(self):
        """
        Round-trip the Identity and Probit link functions through
        ``to_dict`` / ``GPTransformation.from_dict`` and compare their types.
        """
        link_functions = GPy.likelihoods.link_functions
        for link in (link_functions.Identity(), link_functions.Probit()):
            serialized = link.to_dict()
            restored = link_functions.GPTransformation.from_dict(serialized)
            assert type(link) == type(restored)
    def test_serialize_deserialize_inference_methods(self):
        """
        Round-trip EP, EPDTC and ExactGaussianInference objects through
        ``to_dict`` / ``LatentFunctionInference.from_dict``.
        For EP and EPDTC the internal approximation state is filled with
        random arrays by hand before serializing, and every piece of that
        state is compared after deserializing.
        """
        # --- EP: populate the state that to_dict is expected to carry ---
        e1 = GPy.inference.latent_function_inference.expectation_propagation.EP(
            ep_mode="nested"
        )
        e1.ga_approx_old = GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(
            np.random.rand(10), np.random.rand(10)
        )
        # _ep_approximation is built as a 4-element list:
        # [posteriorParams, gaussianApproximation, cavityParams, array].
        e1._ep_approximation = []
        e1._ep_approximation.append(
            GPy.inference.latent_function_inference.expectation_propagation.posteriorParams(
                np.random.rand(10), np.random.rand(100).reshape((10, 10))
            )
        )
        e1._ep_approximation.append(
            GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(
                np.random.rand(10), np.random.rand(10)
            )
        )
        e1._ep_approximation.append(
            GPy.inference.latent_function_inference.expectation_propagation.cavityParams(
                10
            )
        )
        # Overwrite the cavity parameters with random values so the
        # comparison below is not made on default contents.
        e1._ep_approximation[-1].v = np.random.rand(10)
        e1._ep_approximation[-1].tau = np.random.rand(10)
        e1._ep_approximation.append(np.random.rand(10))
        e1_r = (
            GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(
                e1.to_dict()
            )
        )
        # Scalar settings must survive the round trip.
        assert type(e1) == type(e1_r)
        assert e1.epsilon == e1_r.epsilon
        assert e1.eta == e1_r.eta
        assert e1.delta == e1_r.delta
        assert e1.always_reset == e1_r.always_reset
        assert e1.max_iters == e1_r.max_iters
        assert e1.ep_mode == e1_r.ep_mode
        assert e1.parallel_updates == e1_r.parallel_updates
        # Array-valued state must be restored exactly.
        np.testing.assert_array_equal(
            e1.ga_approx_old.tau[:], e1_r.ga_approx_old.tau[:]
        )
        np.testing.assert_array_equal(e1.ga_approx_old.v[:], e1_r.ga_approx_old.v[:])
        np.testing.assert_array_equal(
            e1._ep_approximation[0].mu[:], e1_r._ep_approximation[0].mu[:]
        )
        np.testing.assert_array_equal(
            e1._ep_approximation[0].Sigma[:], e1_r._ep_approximation[0].Sigma[:]
        )
        np.testing.assert_array_equal(
            e1._ep_approximation[1].tau[:], e1_r._ep_approximation[1].tau[:]
        )
        np.testing.assert_array_equal(
            e1._ep_approximation[1].v[:], e1_r._ep_approximation[1].v[:]
        )
        np.testing.assert_array_equal(
            e1._ep_approximation[2].tau[:], e1_r._ep_approximation[2].tau[:]
        )
        np.testing.assert_array_equal(
            e1._ep_approximation[2].v[:], e1_r._ep_approximation[2].v[:]
        )
        np.testing.assert_array_equal(
            e1._ep_approximation[3][:], e1_r._ep_approximation[3][:]
        )
        # --- EPDTC: same procedure with its own state layout ---
        e2 = GPy.inference.latent_function_inference.expectation_propagation.EPDTC(
            ep_mode="nested"
        )
        e2.ga_approx_old = GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(
            np.random.rand(10), np.random.rand(10)
        )
        # Here _ep_approximation is built as a 3-element list:
        # [posteriorParamsDTC, gaussianApproximation, float].
        e2._ep_approximation = []
        e2._ep_approximation.append(
            GPy.inference.latent_function_inference.expectation_propagation.posteriorParamsDTC(
                np.random.rand(10), np.random.rand(10)
            )
        )
        e2._ep_approximation.append(
            GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(
                np.random.rand(10), np.random.rand(10)
            )
        )
        e2._ep_approximation.append(100.0)
        e2_r = (
            GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(
                e2.to_dict()
            )
        )
        assert type(e2) == type(e2_r)
        assert e2.epsilon == e2_r.epsilon
        assert e2.eta == e2_r.eta
        assert e2.delta == e2_r.delta
        assert e2.always_reset == e2_r.always_reset
        assert e2.max_iters == e2_r.max_iters
        assert e2.ep_mode == e2_r.ep_mode
        assert e2.parallel_updates == e2_r.parallel_updates
        np.testing.assert_array_equal(
            e2.ga_approx_old.tau[:], e2_r.ga_approx_old.tau[:]
        )
        np.testing.assert_array_equal(e2.ga_approx_old.v[:], e2_r.ga_approx_old.v[:])
        np.testing.assert_array_equal(
            e2._ep_approximation[0].mu[:], e2_r._ep_approximation[0].mu[:]
        )
        np.testing.assert_array_equal(
            e2._ep_approximation[0].Sigma_diag[:],
            e2_r._ep_approximation[0].Sigma_diag[:],
        )
        np.testing.assert_array_equal(
            e2._ep_approximation[1].tau[:], e2_r._ep_approximation[1].tau[:]
        )
        np.testing.assert_array_equal(
            e2._ep_approximation[1].v[:], e2_r._ep_approximation[1].v[:]
        )
        # The last entry is a plain float, so == is sufficient.
        assert e2._ep_approximation[2] == e2_r._ep_approximation[2]
        # --- ExactGaussianInference: only the restored type is checked ---
        e3 = (
            GPy.inference.latent_function_inference.exact_gaussian_inference.ExactGaussianInference()
        )
        e3_r = (
            GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(
                e3.to_dict()
            )
        )
        assert type(e3) == type(e3_r)
    def test_serialize_deserialize_GP(self):
        """
        Save an optimized ``GPy.core.GP`` classifier (Bernoulli likelihood,
        nested EP) with and without its data, reload both archives and check
        that the first output of ``predict`` is unchanged.
        """
        np.random.seed(fixed_seed)
        n_points = 20
        half = int(n_points / 2)
        # Two clusters of inputs; the first half is labelled 1, the second 0.
        first_cluster = np.random.normal(5, 2, half)
        second_cluster = np.random.normal(10, 2, half)
        X = np.hstack([first_cluster, second_cluster])[:, None]
        Y = np.hstack([np.ones(half), np.zeros(half)])[:, None]
        model = GPy.core.GP(
            X=X,
            Y=Y,
            kernel=GPy.kern.RBF(1),
            likelihood=GPy.likelihoods.Bernoulli(),
            inference_method=GPy.inference.latent_function_inference.expectation_propagation.EP(
                ep_mode="nested"
            ),
            mean_function=None,
            normalizer=True,
            name="gp_classification",
        )
        model.optimize()
        # Write both archives, load them back, then delete the files.
        model.save_model("temp_test_gp_with_data.json", compress=True, save_data=True)
        model.save_model(
            "temp_test_gp_without_data.json", compress=True, save_data=False
        )
        loaded_with_data = GPy.core.GP.load_model("temp_test_gp_with_data.json.zip")
        loaded_without_data = GPy.core.GP.load_model(
            "temp_test_gp_without_data.json.zip", (X, Y)
        )
        for archive in (
            "temp_test_gp_with_data.json.zip",
            "temp_test_gp_without_data.json.zip",
        ):
            os.remove(archive)
        expected = np.array(model.predict(X)[0]).flatten()
        from_with_data = np.array(loaded_with_data.predict(X)[0]).flatten()
        from_without_data = np.array(loaded_without_data.predict(X)[0]).flatten()
        np.testing.assert_array_equal(expected, from_with_data)
        np.testing.assert_array_equal(expected, from_without_data)
    def test_serialize_deserialize_SparseGP(self):
        """
        Save an optimized ``GPy.core.SparseGP`` classifier (Bernoulli
        likelihood, nested EPDTC) with and without its data, reload both
        archives and check that the first output of ``predict`` is unchanged.
        """
        np.random.seed(fixed_seed)
        n_points = 20
        half = int(n_points / 2)
        # Two clusters of inputs; the first half is labelled 1, the second 0.
        first_cluster = np.random.normal(5, 2, half)
        second_cluster = np.random.normal(10, 2, half)
        X = np.hstack([first_cluster, second_cluster])[:, None]
        Y = np.hstack([np.ones(half), np.zeros(half)])[:, None]
        sparse_model = GPy.core.SparseGP(
            X=X,
            Y=Y,
            # The first 20 rows of X serve as inducing inputs.
            Z=X[0:20, :],
            kernel=GPy.kern.RBF(1),
            likelihood=GPy.likelihoods.Bernoulli(),
            inference_method=GPy.inference.latent_function_inference.expectation_propagation.EPDTC(
                ep_mode="nested"
            ),
            mean_function=None,
            normalizer=True,
            name="sparse_gp_classification",
        )
        sparse_model.optimize()
        # Write both archives, load them back, then delete the files.
        sparse_model.save_model(
            "temp_test_gp_with_data.json", compress=True, save_data=True
        )
        sparse_model.save_model(
            "temp_test_gp_without_data.json", compress=True, save_data=False
        )
        loaded_with_data = GPy.core.GP.load_model("temp_test_gp_with_data.json.zip")
        loaded_without_data = GPy.core.GP.load_model(
            "temp_test_gp_without_data.json.zip", (X, Y)
        )
        for archive in (
            "temp_test_gp_with_data.json.zip",
            "temp_test_gp_without_data.json.zip",
        ):
            os.remove(archive)
        expected = np.array(sparse_model.predict(X)[0]).flatten()
        from_with_data = np.array(loaded_with_data.predict(X)[0]).flatten()
        from_without_data = np.array(loaded_without_data.predict(X)[0]).flatten()
        np.testing.assert_array_equal(expected, from_with_data)
        np.testing.assert_array_equal(expected, from_without_data)
    def test_serialize_deserialize_GPRegressor(self):
        """
        Save an optimized GPRegression model with and without its data,
        reload both archives and check that the second output of ``predict``
        on fresh random inputs is unchanged.
        """
        np.random.seed(fixed_seed)
        n_train = 50
        n_new = 50
        n_outputs = 1
        X = np.random.uniform(-3.0, 3.0, (n_train, 1))
        Y = np.sin(X) + np.random.randn(n_train, n_outputs) * 0.05
        # Not used afterwards, but the draw advances the seeded random
        # stream, so it is kept to leave the later inputs unchanged.
        X_new = np.random.uniform(-3.0, 3.0, (n_new, 1))
        kernel = GPy.kern.RBF(input_dim=1, lengthscale=10)
        model = GPy.models.GPRegression(X, Y, kernel)
        model.optimize()
        # Write both archives, load them back, then delete the files.
        model.save_model(
            "temp_test_gp_regressor_with_data.json", compress=True, save_data=True
        )
        model.save_model(
            "temp_test_gp_regressor_without_data.json", compress=True, save_data=False
        )
        loaded_with_data = GPy.models.GPRegression.load_model(
            "temp_test_gp_regressor_with_data.json.zip"
        )
        loaded_without_data = GPy.models.GPRegression.load_model(
            "temp_test_gp_regressor_without_data.json.zip", (X, Y)
        )
        for archive in (
            "temp_test_gp_regressor_with_data.json.zip",
            "temp_test_gp_regressor_without_data.json.zip",
        ):
            os.remove(archive)
        # 1e5 uniform draws from [0, 1), rescaled by * 15 - 5.
        Xp = np.random.uniform(size=(int(1e5), 1)) * 15 - 5
        expected = model.predict(Xp)[1].flatten()
        from_with_data = loaded_with_data.predict(Xp)[1].flatten()
        from_without_data = loaded_without_data.predict(Xp)[1].flatten()
        np.testing.assert_array_equal(expected, from_with_data)
        np.testing.assert_array_equal(expected, from_without_data)
    def test_serialize_deserialize_GPClassification(self):
        """
        Save an optimized GPClassification model with and without its data,
        reload both archives and check the restored models.
        Both reloaded models must have the same type as the original and
        must reproduce the first output of ``predict`` exactly.
        """
        np.random.seed(fixed_seed)
        N = 50
        Nhalf = int(N / 2)
        # Two clusters of inputs; the first half is labelled 1, the second 0.
        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[
            :, None
        ]
        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
        kernel = GPy.kern.RBF(1)
        m = GPy.models.GPClassification(X, Y, kernel=kernel)
        m.optimize()
        m.save_model(
            "temp_test_gp_classifier_with_data.json", compress=True, save_data=True
        )
        m.save_model(
            "temp_test_gp_classifier_without_data.json", compress=True, save_data=False
        )
        m1_r = GPy.models.GPClassification.load_model(
            "temp_test_gp_classifier_with_data.json.zip"
        )
        assert type(m) == type(
            m1_r
        ), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m1_r))
        # The archive saved without data needs (X, Y) supplied on load.
        m2_r = GPy.models.GPClassification.load_model(
            "temp_test_gp_classifier_without_data.json.zip", (X, Y)
        )
        assert type(m) == type(
            m2_r
        ), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m2_r))
        os.remove("temp_test_gp_classifier_with_data.json.zip")
        os.remove("temp_test_gp_classifier_without_data.json.zip")
        var = m.predict(X)[0]
        var1_r = m1_r.predict(X)[0]
        var2_r = m2_r.predict(X)[0]
        np.testing.assert_array_equal(
            np.array(var).flatten(), np.array(var1_r).flatten()
        )
        # Compare against the model reloaded WITHOUT data as well; the
        # previous version re-checked var1_r here and never verified it.
        np.testing.assert_array_equal(
            np.array(var).flatten(), np.array(var2_r).flatten()
        )
    def test_serialize_deserialize_SparseGPClassification(self):
        """
        Save an optimized SparseGPClassification model (3 inducing points)
        with and without its data, reload both archives and check the
        restored models.
        Both reloaded models must have the same type as the original and
        must reproduce the first output of ``predict`` exactly.
        """
        np.random.seed(fixed_seed)
        N = 50
        Nhalf = int(N / 2)
        # Two clusters of inputs; the first half is labelled 1, the second 0.
        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[
            :, None
        ]
        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
        kernel = GPy.kern.RBF(1)
        m = GPy.models.SparseGPClassification(X, Y, num_inducing=3, kernel=kernel)
        m.optimize()
        m.save_model(
            "temp_test_sparse_gp_classifier_with_data.json",
            compress=True,
            save_data=True,
        )
        m.save_model(
            "temp_test_sparse_gp_classifier_without_data.json",
            compress=True,
            save_data=False,
        )
        m1_r = GPy.models.SparseGPClassification.load_model(
            "temp_test_sparse_gp_classifier_with_data.json.zip"
        )
        assert type(m) == type(
            m1_r
        ), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m1_r))
        # The archive saved without data needs (X, Y) supplied on load.
        m2_r = GPy.models.SparseGPClassification.load_model(
            "temp_test_sparse_gp_classifier_without_data.json.zip", (X, Y)
        )
        assert type(m) == type(
            m2_r
        ), "Incorrect model type. Expected: {} Actual: {}".format(type(m), type(m2_r))
        os.remove("temp_test_sparse_gp_classifier_with_data.json.zip")
        os.remove("temp_test_sparse_gp_classifier_without_data.json.zip")
        var = m.predict(X)[0]
        var1_r = m1_r.predict(X)[0]
        var2_r = m2_r.predict(X)[0]
        np.testing.assert_array_equal(
            np.array(var).flatten(), np.array(var1_r).flatten()
        )
        # Compare against the model reloaded WITHOUT data as well; the
        # previous version re-checked var1_r here and left var2_r unused.
        np.testing.assert_array_equal(
            np.array(var).flatten(), np.array(var2_r).flatten()
        )
--- a/GPy/testing/test_svgp.py
+++ b/GPy/testing/test_svgp.py
@ -0,0 +1,63 @@
 import numpy as np
 import GPy
 class TestSVGP_nonconvex:
    """
    Gradient check for an SVGP model with a Student-T likelihood.
    """
    def setup(self):
        """Build the SVGP model on noisy sine data and store it as ``self.m``."""
        inputs = np.linspace(0, 10, 100)[:, None]
        inducing = np.linspace(0, 10, 10)[:, None]
        targets = np.sin(inputs) + np.random.randn(*inputs.shape) * 0.1
        # Shift one observation by 3.
        targets[50] += 3
        kernel = GPy.kern.RBF(1, lengthscale=5.0) + GPy.kern.White(1, 1e-6)
        likelihood = GPy.likelihoods.StudentT(deg_free=2)
        self.m = GPy.core.SVGP(
            inputs, targets, Z=inducing, likelihood=likelihood, kernel=kernel
        )
    def test_grad(self):
        """The gradients of the model must pass ``checkgrad``."""
        self.setup()
        gradients_ok = self.m.checkgrad(step=1e-4)
        assert gradients_ok
 class TestSVGP_classification:
    """
    Gradient check for an SVGP model with a Bernoulli likelihood.
    """
    def setup(self):
        """Build the SVGP model on 0/1 labels and store it as ``self.m``."""
        inputs = np.linspace(0, 10, 100)[:, None]
        inducing = np.linspace(0, 10, 10)[:, None]
        # Labels are 1 where the noisy sine is positive and 0 elsewhere.
        noisy_sine = np.sin(inputs) + np.random.randn(*inputs.shape) * 0.1
        labels = np.where(noisy_sine > 0, 1, 0)
        kernel = GPy.kern.RBF(1, lengthscale=5.0) + GPy.kern.White(1, 1e-6)
        likelihood = GPy.likelihoods.Bernoulli()
        self.m = GPy.core.SVGP(
            inputs, labels, Z=inducing, likelihood=likelihood, kernel=kernel
        )
    def test_grad(self):
        """The gradients of the model must pass ``checkgrad``."""
        self.setup()
        gradients_ok = self.m.checkgrad(step=1e-4)
        assert gradients_ok
 class TestSVGP_Poisson_with_meanfunction:
    """
    Gradient check for an SVGP model with a Poisson likelihood and a
    linear mean function.
    """
    def setup(self):
        """Build the SVGP model on Poisson counts and store it as ``self.m``."""
        inputs = np.linspace(0, 10, 100)[:, None]
        inducing = np.linspace(0, 10, 10)[:, None]
        # NOTE(review): the exponent multiplies the two terms
        # (0.1 * X * 0.05 * X**2); confirm a sum was not intended.
        rates = np.exp(0.1 * inputs * 0.05 * inputs**2)
        # One Poisson draw per rate, stacked into a column.
        counts = np.array(
            [np.random.poisson(rate) for rate in rates.flatten()]
        ).reshape(-1, 1)
        mean_function = GPy.mappings.Linear(1, 1)
        likelihood = GPy.likelihoods.Poisson()
        kernel = GPy.kern.RBF(1, lengthscale=5.0) + GPy.kern.White(1, 1e-6)
        self.m = GPy.core.SVGP(
            inputs,
            counts,
            Z=inducing,
            likelihood=likelihood,
            kernel=kernel,
            mean_function=mean_function,
        )
    def test_grad(self):
        """The gradients of the model must pass ``checkgrad``."""
        self.setup()
        gradients_ok = self.m.checkgrad(step=1e-4)
        assert gradients_ok
--- a/GPy/testing/tp_tests.py
+++ b/GPy/testing/tp_tests.py
@ -1,29 +1,30 @@
-'''
+"""
 Created on 14 Jul 2017, based on gp_tests
@author: javdrher
-'''
+"""
-import unittest
+import numpy as np
-import numpy as np, GPy
+import GPy
-class Test(unittest.TestCase):
+class TestTP:
-    def setUp(self):
+    def setup(self):
        np.random.seed(12345)
        self.N = 20
        self.N_new = 50
        self.D = 1
-        self.X = np.random.uniform(-3., 3., (self.N, 1))
+        self.X = np.random.uniform(-3.0, 3.0, (self.N, 1))
        self.Y = np.sin(self.X) + np.random.randn(self.N, self.D) * 0.05
-        self.X_new = np.random.uniform(-3., 3., (self.N_new, 1))
+        self.X_new = np.random.uniform(-3.0, 3.0, (self.N_new, 1))
    def test_setxy_gp(self):
        self.setup()
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        m = GPy.models.TPRegression(self.X, self.Y, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X.copy()
        m.set_XY(m.X[:10], m.Y[:10])
-        assert (m.checkgrad(tolerance=1e-2))
+        assert m.checkgrad(tolerance=1e-2)
        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)
        np.testing.assert_allclose(mu, mu2)
@ -33,10 +34,12 @@ class Test(unittest.TestCase):
        from GPy.core.parameterization.param import Param
        from GPy.core.mapping import Mapping
        self.setup()
        class Parabola(Mapping):
-            def __init__(self, variance, degree=2, name='parabola'):
+            def __init__(self, variance, degree=2, name="parabola"):
                super(Parabola, self).__init__(1, 1, name)
-                self.variance = Param('variance', np.ones(degree + 1) * variance)
+                self.variance = Param("variance", np.ones(degree + 1) * variance)
                self.degree = degree
                self.link_parameter(self.variance)
@ -59,21 +62,28 @@ class Test(unittest.TestCase):
        X = np.linspace(-2, 2, 100)[:, None]
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        k.randomize()
-        p = Parabola(.3)
+        p = Parabola(0.3)
        p.randomize()
-        Y = p.f(X) + np.random.multivariate_normal(np.zeros(X.shape[0]), k.K(X) + np.eye(X.shape[0]) * 1e-8)[:,
+        Y = (
-                     None] + np.random.normal(0, .1, (X.shape[0], 1))
+            p.f(X)
            + np.random.multivariate_normal(
                np.zeros(X.shape[0]), k.K(X) + np.eye(X.shape[0]) * 1e-8
            )[:, None]
            + np.random.normal(0, 0.1, (X.shape[0], 1))
        )
        m = GPy.models.TPRegression(X, Y, kernel=k, mean_function=p)
-        assert (m.checkgrad(tolerance=2e-1))
+        assert m.checkgrad(tolerance=2e-1)
        _ = m.predict(m.X)
    def test_normalizer(self):
        self.setup()
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        Y = self.Y
        mu, std = Y.mean(0), Y.std(0)
        m = GPy.models.TPRegression(self.X, Y, kernel=k, normalizer=True)
        m.optimize()
-        assert (m.checkgrad())
+        assert m.checkgrad()
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        m2 = GPy.models.TPRegression(self.X, (Y - mu) / std, kernel=k, normalizer=False)
        m2[:] = m[:]
@ -81,13 +91,13 @@ class Test(unittest.TestCase):
        mu1, var1 = m.predict(m.X, full_cov=True)
        mu2, var2 = m2.predict(m2.X, full_cov=True)
        np.testing.assert_allclose(mu1, (mu2 * std) + mu)
-        np.testing.assert_allclose(var1, var2 * std ** 2)
+        np.testing.assert_allclose(var1, var2 * std**2)
        mu1, var1 = m.predict(m.X, full_cov=False)
        mu2, var2 = m2.predict(m2.X, full_cov=False)
        np.testing.assert_allclose(mu1, (mu2 * std) + mu)
-        np.testing.assert_allclose(var1, var2 * std ** 2)
+        np.testing.assert_allclose(var1, var2 * std**2)
        q50n = m.predict_quantiles(m.X, (50,))
        q50 = m2.predict_quantiles(m2.X, (50,))
@ -102,10 +112,15 @@ class Test(unittest.TestCase):
        q95 = m2.predict_quantiles(self.X[[c]], qs)
        mu, var = m2.predict(self.X[[c]])
        from scipy.stats import t
-        np.testing.assert_allclose((mu + (t.ppf(qs / 100., m2.nu + m2.num_data) * np.sqrt(var))).flatten(),
+
-                                   np.array(q95).flatten())
+        np.testing.assert_allclose(
            (mu + (t.ppf(qs / 100.0, m2.nu + m2.num_data) * np.sqrt(var))).flatten(),
            np.array(q95).flatten(),
        )
    def test_predict_equivalence(self):
        self.setup()
        k = GPy.kern.RBF(1) + GPy.kern.White(1)
        m = GPy.models.TPRegression(self.X, self.Y, kernel=k)
        m.optimize()
@ -124,10 +139,12 @@ class Test(unittest.TestCase):
        mu3, var3 = m2._raw_predict(m.X)
        np.testing.assert_allclose(mu1, mu2)
        np.testing.assert_allclose(var1, var2)
-        self.assertFalse(np.allclose(mu1, mu3))
+        assert not np.allclose(mu1, mu3)
-        self.assertFalse(np.allclose(var1, var3))
+        assert not np.allclose(var1, var3)
    def test_gp_equivalence(self):
        self.setup()
        k = GPy.kern.RBF(1)
        m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
        m.optimize()
@ -139,7 +156,3 @@ class Test(unittest.TestCase):
        mu2, var2 = m2.predict(self.X)
        np.testing.assert_allclose(mu1, mu2)
        np.testing.assert_allclose(var1, var2)
 if __name__ == "__main__":
    unittest.main()
--- a/GPy/testing/test_util.py
+++ b/GPy/testing/test_util.py
@ -0,0 +1,284 @@
 # ===============================================================================
 # Copyright (c) 2016, Max Zwiessele, Alan Saul
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 # * Redistributions of source code must retain the above copyright notice, this
 #   list of conditions and the following disclaimer.
 #
 # * Redistributions in binary form must reproduce the above copyright notice,
 #   this list of conditions and the following disclaimer in the documentation
 #   and/or other materials provided with the distribution.
 #
 # * Neither the name of GPy.testing.util_tests nor the names of its
 #   contributors may be used to endorse or promote products derived from
 #   this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # ===============================================================================
 import numpy as np
 import GPy
 class UtilTest:
    """Tests for assorted helpers in GPy.util and GPy.plotting.matplot_dep.util.

    NOTE(review): pytest's default class-collection prefix is ``Test``; unless
    the project configuration overrides ``python_classes``, a class named
    ``UtilTest`` is presumably not collected -- confirm these tests really run.
    """
    def test_checkFinite(self):
        """checkFinite is truthy for an all-finite array and falsy once NaNs are present."""
        from GPy.util.debug import checkFinite
        array = np.random.normal(0, 1, 100).reshape(25, 4)
        assert checkFinite(array, name="test")
        # Overwrite a random ~30% of the entries with NaN.
        array[np.random.binomial(1, 0.3, array.shape).astype(bool)] = np.nan
        assert not checkFinite(array)
    def test_checkFullRank(self):
        """checkFullRank is falsy for tdot of a (25, 4) array and truthy for tdot of a (25, 25) array."""
        from GPy.util.debug import checkFullRank
        from GPy.util.linalg import tdot
        array = np.random.normal(0, 1, 100).reshape(25, 4)
        assert not checkFullRank(tdot(array), name="test")
        array = np.random.normal(0, 1, (25, 25))
        assert checkFullRank(tdot(array))
    def test_fixed_inputs_median(self):
        """fixed_inputs with fix_routine="median" pairs each fixed column with its median."""
        from GPy.plotting.matplot_dep.util import fixed_inputs
        X = np.random.randn(10, 3)
        Y = np.sin(X) + np.random.randn(10, 3) * 1e-3
        m = GPy.models.GPRegression(X, Y)
        fixed = fixed_inputs(m, [1], fix_routine="median", as_list=True, X_all=False)
        assert (0, np.median(X[:, 0])) in fixed
        assert (2, np.median(X[:, 2])) in fixed
        assert (
            len([t for t in fixed if t[0] == 1]) == 0
        )  # Unfixed input should not be in fixed
    def test_fixed_inputs_mean(self):
        """fixed_inputs with fix_routine="mean" pairs each fixed column with its mean."""
        from GPy.plotting.matplot_dep.util import fixed_inputs
        X = np.random.randn(10, 3)
        Y = np.sin(X) + np.random.randn(10, 3) * 1e-3
        m = GPy.models.GPRegression(X, Y)
        fixed = fixed_inputs(m, [1], fix_routine="mean", as_list=True, X_all=False)
        assert (0, np.mean(X[:, 0])) in fixed
        assert (2, np.mean(X[:, 2])) in fixed
        assert (
            len([t for t in fixed if t[0] == 1]) == 0
        )  # Unfixed input should not be in fixed
    def test_fixed_inputs_zero(self):
        """fixed_inputs with fix_routine="zero" pairs each fixed column with 0.0."""
        from GPy.plotting.matplot_dep.util import fixed_inputs
        X = np.random.randn(10, 3)
        Y = np.sin(X) + np.random.randn(10, 3) * 1e-3
        m = GPy.models.GPRegression(X, Y)
        fixed = fixed_inputs(m, [1], fix_routine="zero", as_list=True, X_all=False)
        assert (0, 0.0) in fixed
        assert (2, 0.0) in fixed
        assert (
            len([t for t in fixed if t[0] == 1]) == 0
        )  # Unfixed input should not be in fixed
    def test_fixed_inputs_uncertain(self):
        """fixed_inputs on a BayesianGPLVM uses the median of the latent means."""
        from GPy.plotting.matplot_dep.util import fixed_inputs
        from GPy.core.parameterization.variational import NormalPosterior
        X_mu = np.random.randn(10, 3)
        # NOTE(review): randn draws can be negative although they are passed
        # on as variances -- confirm this is intended.
        X_var = np.random.randn(10, 3)
        X = NormalPosterior(X_mu, X_var)
        Y = np.sin(X_mu) + np.random.randn(10, 3) * 1e-3
        m = GPy.models.BayesianGPLVM(Y, X=X_mu, X_variance=X_var, input_dim=3)
        fixed = fixed_inputs(m, [1], fix_routine="median", as_list=True, X_all=False)
        assert (0, np.median(X.mean.values[:, 0])) in fixed
        assert (2, np.median(X.mean.values[:, 2])) in fixed
        assert (
            len([t for t in fixed if t[0] == 1]) == 0
        )  # Unfixed input should not be in fixed
    def test_DSYR(self):
        """DSYR(A, b, alpha) must leave A equal to the reference matrix R.

        With b a vector of ones and alpha = 1, R is the original A with one
        added to every entry.
        """
        from GPy.util.linalg import DSYR
        A = np.arange(9.0).reshape(3, 3)
        A = np.dot(A.T, A)
        b = np.ones(3, dtype=float)
        alpha = 1.0
        DSYR(A, b, alpha)
        R = np.array([[46, 55, 64], [55, 67, 79], [64, 79, 94]])
        # Sum the absolute element-wise errors: summing the signed errors
        # would let positive and negative deviations cancel each other out.
        assert np.sum(np.abs(A - R)) < 1e-12
    def test_subarray(self):
        """common_subarrays groups the indices of identical columns of X."""
        X = np.zeros((3, 6), dtype=bool)
        X[[1, 1, 1], [0, 4, 5]] = 1
        X[1:, [2, 3]] = 1
        d = GPy.util.subarray_and_sorting.common_subarrays(X, axis=1)
        assert len(d) == 3
        # The returned index lists must be usable for fancy indexing.
        X[:, d[tuple(X[:, 0])]]
        assert d[tuple(X[:, 4])] == d[tuple(X[:, 0])] == [0, 4, 5]
        assert d[tuple(X[:, 1])] == [1]
    def test_offset_cluster(self):
        """cluster_with_offset.cluster groups series 1 and 2 together and series 0 and 3 together."""
        # Tests the GPy.util.cluster_with_offset.cluster utility with a small
        # test data set. Not using random noise just in case it occasionally
        # causes it not to cluster correctly.
        # groundtruth cluster identifiers are: [0,1,1,0]
        # data contains a list of the four sets of time series (3 per data point)
        data = [
            np.array(
                [
                    [2.18094245, 1.96529789, 2.00265523, 2.18218742, 2.06795428],
                    [1.62254829, 1.75748448, 1.83879347, 1.87531326, 1.52503496],
                    [1.54589609, 1.61607914, 2.00463192, 1.48771394, 1.63339218],
                ]
            ),
            np.array(
                [
                    [2.86766106, 2.97953437, 2.91958876, 2.92510506, 3.03239241],
                    [2.57368423, 2.59954886, 3.10000395, 2.75806125, 2.89865704],
                    [2.58916318, 2.53698259, 2.63858411, 2.63102504, 2.51853901],
                ]
            ),
            np.array(
                [
                    [2.77834168, 2.9618564, 2.88482141, 3.24259745, 2.9716821],
                    [2.60675576, 2.67095624, 2.94824436, 2.80520631, 2.87247516],
                    [2.49543562, 2.5492281, 2.6505866, 2.65015308, 2.59738616],
                ]
            ),
            np.array(
                [
                    [1.76783086, 2.21666738, 2.07939706, 1.9268263, 2.23360121],
                    [1.94305547, 1.94648592, 2.1278921, 2.09481457, 2.08575238],
                    [1.69336013, 1.72285186, 1.6339506, 1.61212022, 1.39198698],
                ]
            ),
        ]
        # inputs contains their associated X values
        inputs = [
            np.array([[0.0], [0.68040097], [1.20316795], [1.798749], [2.14891733]]),
            np.array([[0.0], [0.51910637], [0.98259352], [1.57442965], [1.82515098]]),
            np.array([[0.0], [0.66645478], [1.59464591], [1.69769551], [1.80932752]]),
            np.array([[0.0], [0.87512108], [1.71881079], [2.67162871], [3.23761907]]),
        ]
        # try doing the clustering
        active = GPy.util.cluster_with_offset.cluster(data, inputs)
        # check to see that the clustering has correctly clustered the time series.
        clusters = {frozenset(cluster) for cluster in active}
        assert {1, 2} in clusters, "Offset Clustering algorithm failed"
        assert {0, 3} in clusters, "Offset Clustering algorithm failed"
 class TestUnivariateGaussian:
    """Compares GPy.util.univariate_Gaussian functions against stored reference values."""
    def setup(self):
        """Store the evaluation points; every test calls this explicitly."""
        self.zz = [-5.0, -0.8, 0.0, 0.5, 2.0, 10.0]
    def test_logPdfNormal(self):
        """Accumulated absolute error of logPdfNormal over self.zz stays below 1e-10."""
        from GPy.util.univariate_Gaussian import logPdfNormal
        self.setup()
        expected = [
            -13.4189385332,
            -1.2389385332,
            -0.918938533205,
            -1.0439385332,
            -2.9189385332,
            -50.9189385332,
        ]
        # Fold the per-point absolute errors, starting from 0.0.
        total_error = sum(
            (abs(logPdfNormal(z) - ref) for z, ref in zip(self.zz, expected)), 0.0
        )
        assert total_error < 1e-10
    def test_cdfNormal(self):
        """Accumulated absolute error of cdfNormal over self.zz stays below 1e-10."""
        from GPy.util.univariate_Gaussian import cdfNormal
        self.setup()
        expected = [
            2.86651571879e-07,
            0.211855398583,
            0.5,
            0.691462461274,
            0.977249868052,
            1.0,
        ]
        total_error = sum(
            (abs(cdfNormal(z) - ref) for z, ref in zip(self.zz, expected)), 0.0
        )
        assert total_error < 1e-10
    def test_logCdfNormal(self):
        """Accumulated absolute error of logCdfNormal over self.zz stays below 1e-10."""
        from GPy.util.univariate_Gaussian import logCdfNormal
        self.setup()
        expected = [
            -15.064998394,
            -1.55185131919,
            -0.69314718056,
            -0.368946415289,
            -0.023012909329,
            0.0,
        ]
        total_error = sum(
            (abs(logCdfNormal(z) - ref) for z, ref in zip(self.zz, expected)), 0.0
        )
        assert total_error < 1e-10
    def test_derivLogCdfNormal(self):
        """Accumulated absolute error of derivLogCdfNormal over self.zz stays below 1e-8."""
        from GPy.util.univariate_Gaussian import derivLogCdfNormal
        self.setup()
        expected = [
            5.18650396941,
            1.3674022693,
            0.79788456081,
            0.50916043387,
            0.0552478626962,
            0.0,
        ]
        total_error = sum(
            (abs(derivLogCdfNormal(z) - ref) for z, ref in zip(self.zz, expected)), 0.0
        )
        # This check uses a looser tolerance than the other three.
        assert total_error < 1e-8
 class TestStandardize:
    """Shape check for GPy.util.normalizer.Standardize.inverse_covariance."""
    def setup(self):
        """Create a Standardize normalizer and call scale_by on random two-column data."""
        self.normalizer = GPy.util.normalizer.Standardize()
        first_column = np.random.randn(10)
        second_column = 2 * np.random.randn(10)
        targets = np.stack([first_column, second_column], axis=1)
        self.normalizer.scale_by(targets)
    def test_inverse_covariance(self):
        """
        inverse_covariance of a (100, 100) matrix has shape (100, 100, 2)
        """
        self.setup()
        cov = np.random.rand(100, 100)
        result = self.normalizer.inverse_covariance(cov)
        expected_shape = (100, 100, 2)
        assert result.shape == expected_shape
--- a/GPy/testing/variational_tests.py
+++ b/GPy/testing/variational_tests.py
@ -1,4 +1,4 @@
-'''
+"""
 Copyright (c) 2015, Max Zwiessele
 All rights reserved.
@ -26,38 +26,35 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-'''
+"""
 import unittest
 import GPy, numpy as np
 class KLGrad(GPy.core.Model):
-            def __init__(self, Xvar, kl):   
+    def __init__(self, Xvar, kl):
-                super(KLGrad, self).__init__(name="klgrad")     
+        super(KLGrad, self).__init__(name="klgrad")
-                self.kl = kl
+        self.kl = kl
-                self.link_parameter(Xvar)
+        self.link_parameter(Xvar)
-                self.Xvar = Xvar
+        self.Xvar = Xvar
-                self._obj = 0
+        self._obj = 0
            def parameters_changed(self):
                self.Xvar.gradient[:] = 0
                self.kl.update_gradients_KL(self.Xvar)
                self._obj = self.kl.KL_divergence(self.Xvar)
            def objective_function(self):
                return self._obj
-class Test(unittest.TestCase):
+    def parameters_changed(self):
        self.Xvar.gradient[:] = 0
        self.kl.update_gradients_KL(self.Xvar)
        self._obj = self.kl.KL_divergence(self.Xvar)
-    def setUp(self):
+    def objective_function(self):
        return self._obj
 class TestVariational:
    def setup(self):
        np.random.seed(12345)
        self.Xvar = GPy.core.parameterization.variational.NormalPosterior(
-            np.random.uniform(0,1,(10,3)), 
+            np.random.uniform(0, 1, (10, 3)), np.random.uniform(1e-5, 0.01, (10, 3))
-            np.random.uniform(1e-5,.01, (10,3))
+        )
            )
-
+    def test_normal(self):
-    def testNormal(self):
+        self.setup()
        klgrad = KLGrad(self.Xvar, GPy.core.parameterization.variational.NormalPrior())
        np.testing.assert_(klgrad.checkgrad())
 if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.testNormal']
    unittest.main()
--- a/GPy/testing/todo.md
+++ b/GPy/testing/todo.md
@ -0,0 +1,14 @@
 As of now, I have been through all of the tests once and the basic migration is done.
 Now, fix the below things and todos before starting to get the tests running using pytest
 + update test script names according to pytest conversion
 + check for TODOs
 + + there are many associated with "iscloseto" functions from np.testing. Will have to figure out how to handle these
 + + some tests are not that clear to me tbh
 + check nomenclature of test files and test classes and test functions
 + ChatGPT says that I should replace delta with decimal, but a delta of 1e-4 should be decimal=4. Not sure about this yet, but that is something I need to fix later on
 --> this gives more content to it: https://docs.python.org/3/library/unittest.html#unittest.TestCase.assertAlmostEqual
 I need to write a custom function that behaves accordingly as in some cases, np.testing.assert_almost_equal won't be applicable, https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html
 or how about this: `np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)`
--- a/GPy/testing/util_tests.py
+++ b/GPy/testing/util_tests.py
@ -1,242 +0,0 @@
 #===============================================================================
 # Copyright (c) 2016, Max Zwiessele, Alan Saul
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 # * Redistributions of source code must retain the above copyright notice, this
 #   list of conditions and the following disclaimer.
 #
 # * Redistributions in binary form must reproduce the above copyright notice,
 #   this list of conditions and the following disclaimer in the documentation
 #   and/or other materials provided with the distribution.
 #
 # * Neither the name of GPy.testing.util_tests nor the names of its
 #   contributors may be used to endorse or promote products derived from
 #   this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #===============================================================================
 import unittest
 import numpy as np
 import GPy
 class TestDebug(unittest.TestCase):
    def test_checkFinite(self):
        from GPy.util.debug import checkFinite
        array = np.random.normal(0, 1, 100).reshape(25,4)
        self.assertTrue(checkFinite(array, name='test'))
        array[np.random.binomial(1, .3, array.shape).astype(bool)] = np.nan
        self.assertFalse(checkFinite(array))
    def test_checkFullRank(self):
        from GPy.util.debug import checkFullRank
        from GPy.util.linalg import tdot
        array = np.random.normal(0, 1, 100).reshape(25,4)
        self.assertFalse(checkFullRank(tdot(array), name='test'))
        array = np.random.normal(0, 1, (25,25))
        self.assertTrue(checkFullRank(tdot(array)))
    def test_fixed_inputs_median(self):
        """ test fixed_inputs convenience function """
        from GPy.plotting.matplot_dep.util import fixed_inputs
        import GPy
        X = np.random.randn(10, 3)
        Y = np.sin(X) + np.random.randn(10, 3)*1e-3
        m = GPy.models.GPRegression(X, Y)
        fixed = fixed_inputs(m, [1], fix_routine='median', as_list=True, X_all=False)
        self.assertTrue((0, np.median(X[:,0])) in fixed)
        self.assertTrue((2, np.median(X[:,2])) in fixed)
        self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
    def test_fixed_inputs_mean(self):
        from GPy.plotting.matplot_dep.util import fixed_inputs
        import GPy
        X = np.random.randn(10, 3)
        Y = np.sin(X) + np.random.randn(10, 3)*1e-3
        m = GPy.models.GPRegression(X, Y)
        fixed = fixed_inputs(m, [1], fix_routine='mean', as_list=True, X_all=False)
        self.assertTrue((0, np.mean(X[:,0])) in fixed)
        self.assertTrue((2, np.mean(X[:,2])) in fixed)
        self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
    def test_fixed_inputs_zero(self):
        from GPy.plotting.matplot_dep.util import fixed_inputs
        import GPy
        X = np.random.randn(10, 3)
        Y = np.sin(X) + np.random.randn(10, 3)*1e-3
        m = GPy.models.GPRegression(X, Y)
        fixed = fixed_inputs(m, [1], fix_routine='zero', as_list=True, X_all=False)
        self.assertTrue((0, 0.0) in fixed)
        self.assertTrue((2, 0.0) in fixed)
        self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
    def test_fixed_inputs_uncertain(self):
        from GPy.plotting.matplot_dep.util import fixed_inputs
        import GPy
        from GPy.core.parameterization.variational import NormalPosterior
        X_mu = np.random.randn(10, 3)
        X_var = np.random.randn(10, 3)
        X = NormalPosterior(X_mu, X_var)
        Y = np.sin(X_mu) + np.random.randn(10, 3)*1e-3
        m = GPy.models.BayesianGPLVM(Y, X=X_mu, X_variance=X_var, input_dim=3)
        fixed = fixed_inputs(m, [1], fix_routine='median', as_list=True, X_all=False)
        self.assertTrue((0, np.median(X.mean.values[:,0])) in fixed)
        self.assertTrue((2, np.median(X.mean.values[:,2])) in fixed)
        self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
    def test_DSYR(self):
        from GPy.util.linalg import DSYR, DSYR_numpy
        A = np.arange(9.0).reshape(3,3)
        A = np.dot(A.T, A)
        b = np.ones(3, dtype=float)
        alpha = 1.0
        DSYR(A, b, alpha)
        R = np.array([
            [46, 55, 64],
            [55, 67, 79],
            [64, 79, 94]]
            )
        self.assertTrue(abs(np.sum(A - R)) < 1e-12)
    def test_subarray(self):
        import GPy
        X = np.zeros((3,6), dtype=bool)
        X[[1,1,1],[0,4,5]] = 1
        X[1:,[2,3]] = 1
        d = GPy.util.subarray_and_sorting.common_subarrays(X,axis=1)
        self.assertTrue(len(d) == 3)
        X[:, d[tuple(X[:,0])]]
        self.assertTrue(d[tuple(X[:,4])] == d[tuple(X[:,0])] == [0, 4, 5])
        self.assertTrue(d[tuple(X[:,1])] == [1])
    def test_offset_cluster(self):
        #Tests the GPy.util.cluster_with_offset.cluster utility with a small
        #test data set. Not using random noise just in case it occasionally
        #causes it not to cluster correctly.
        #groundtruth cluster identifiers are: [0,1,1,0]
        #data contains a list of the four sets of time series (3 per data point)
        data = [np.array([[ 2.18094245,  1.96529789,  2.00265523,  2.18218742,  2.06795428],
                [ 1.62254829,  1.75748448,  1.83879347,  1.87531326,  1.52503496],
                [ 1.54589609,  1.61607914,  2.00463192,  1.48771394,  1.63339218]]),
         np.array([[ 2.86766106,  2.97953437,  2.91958876,  2.92510506,  3.03239241],
                [ 2.57368423,  2.59954886,  3.10000395,  2.75806125,  2.89865704],
                [ 2.58916318,  2.53698259,  2.63858411,  2.63102504,  2.51853901]]),
         np.array([[ 2.77834168,  2.9618564 ,  2.88482141,  3.24259745,  2.9716821 ],
                [ 2.60675576,  2.67095624,  2.94824436,  2.80520631,  2.87247516],
                [ 2.49543562,  2.5492281 ,  2.6505866 ,  2.65015308,  2.59738616]]),
         np.array([[ 1.76783086,  2.21666738,  2.07939706,  1.9268263 ,  2.23360121],
                [ 1.94305547,  1.94648592,  2.1278921 ,  2.09481457,  2.08575238],
                [ 1.69336013,  1.72285186,  1.6339506 ,  1.61212022,  1.39198698]])]
        #inputs contains their associated X values
        inputs = [np.array([[ 0.        ],
                [ 0.68040097],
                [ 1.20316795],
                [ 1.798749  ],
                [ 2.14891733]]), np.array([[ 0.        ],
                [ 0.51910637],
                [ 0.98259352],
                [ 1.57442965],
                [ 1.82515098]]), np.array([[ 0.        ],
                [ 0.66645478],
                [ 1.59464591],
                [ 1.69769551],
                [ 1.80932752]]), np.array([[ 0.        ],
                [ 0.87512108],
                [ 1.71881079],
                [ 2.67162871],
                [ 3.23761907]])]
        #try doing the clustering
        active = GPy.util.cluster_with_offset.cluster(data,inputs)
        #check to see that the clustering has correctly clustered the time series.
        clusters = set([frozenset(cluster) for cluster in active])
        assert set([1,2]) in clusters, "Offset Clustering algorithm failed"
        assert set([0,3]) in clusters, "Offset Clustering algoirthm failed"
 class TestUnivariateGaussian(unittest.TestCase):
    def setUp(self):
        self.zz = [-5.0, -0.8, 0.0, 0.5, 2.0, 10.0]
    def test_logPdfNormal(self):
        from GPy.util.univariate_Gaussian import logPdfNormal
        pySols = [-13.4189385332,
            -1.2389385332,
            -0.918938533205,
            -1.0439385332,
            -2.9189385332,
            -50.9189385332]
        diff = 0.0
        for i in range(len(pySols)):
            diff += abs(logPdfNormal(self.zz[i]) - pySols[i])
        self.assertTrue(diff  < 1e-10)
    def test_cdfNormal(self):
        from GPy.util.univariate_Gaussian import cdfNormal
        pySols = [2.86651571879e-07,
          0.211855398583,
          0.5,
          0.691462461274,
          0.977249868052,
          1.0]
        diff = 0.0
        for i in range(len(pySols)):
            diff += abs(cdfNormal(self.zz[i]) - pySols[i])
        self.assertTrue(diff  < 1e-10)
    def test_logCdfNormal(self):
        from GPy.util.univariate_Gaussian import logCdfNormal
        pySols = [-15.064998394,
          -1.55185131919,
          -0.69314718056,
          -0.368946415289,
          -0.023012909329,
          0.0]
        diff = 0.0
        for i in range(len(pySols)):
            diff += abs(logCdfNormal(self.zz[i]) - pySols[i])
        self.assertTrue(diff  < 1e-10)
    def test_derivLogCdfNormal(self):
        from GPy.util.univariate_Gaussian import derivLogCdfNormal
        pySols = [5.18650396941,
          1.3674022693,
          0.79788456081,
          0.50916043387,
          0.0552478626962,
          0.0]
        diff = 0.0
        for i in range(len(pySols)):
          diff += abs(derivLogCdfNormal(self.zz[i]) - pySols[i])
        self.assertTrue(diff  < 1e-8)
 class TestStandardize(unittest.TestCase):
    def setUp(self):
        self.normalizer = GPy.util.normalizer.Standardize()
        y = np.stack([np.random.randn(10), 2*np.random.randn(10)], axis=1)
        self.normalizer.scale_by(y)
    def test_inverse_covariance(self):
        """
        Test inverse covariance outputs correct size
        """
        covariance = np.random.rand(100, 100)
        output = self.normalizer.inverse_covariance(covariance)
        self.assertTrue(output.shape == (100, 100, 2))
--- a/GPy/util/choleskies_cython.c
+++ b/GPy/util/choleskies_cython.c
--- a/GPy/util/classification.py
+++ b/GPy/util/classification.py
@ -2,7 +2,8 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
-def conf_matrix(p,labels,names=['1','0'],threshold=.5,show=True):
+
 def conf_matrix(p, labels, names=["1", "0"], threshold=0.5, show=True):
    """
    Returns error rate and true/false positives in a binary classification problem
    - Actual classes are displayed by column.
@ -16,18 +17,18 @@ def conf_matrix(p,labels,names=['1','0'],threshold=.5,show=True):
    :type show: False|True
    """
    assert p.size == labels.size, "Arrays p and labels have different dimensions."
-    decision = np.ones((labels.size,1))
+    decision = np.ones((labels.size, 1))
-    decision[p<threshold] = 0
+    decision[p < threshold] = 0
    diff = decision - labels
    false_0 = diff[diff == -1].size
    false_1 = diff[diff == 1].size
-    true_1 = np.sum(decision[diff ==0])
+    true_1 = np.sum(decision[diff == 0])
    true_0 = labels.size - true_1 - false_0 - false_1
-    error = (false_1 + false_0)/np.float(labels.size)
+    error = (false_1 + false_0) / float(labels.size)
    if show:
-        print(100. - error * 100,'% instances correctly classified')
+        print(100.0 - error * 100, "% instances correctly classified")
-        print('%-10s|  %-10s|  %-10s| ' % ('',names[0],names[1]))
+        print("%-10s|  %-10s|  %-10s| " % ("", names[0], names[1]))
-        print('----------|------------|------------|')
+        print("----------|------------|------------|")
-        print('%-10s|  %-10s|  %-10s| ' % (names[0],true_1,false_0))
+        print("%-10s|  %-10s|  %-10s| " % (names[0], true_1, false_0))
-        print('%-10s|  %-10s|  %-10s| ' % (names[1],false_1,true_0))
+        print("%-10s|  %-10s|  %-10s| " % (names[1], false_1, true_0))
-    return error,true_1, false_1, true_0, false_0
+    return error, true_1, false_1, true_0, false_0
--- a/GPy/util/config.py
+++ b/GPy/util/config.py
@ -2,31 +2,46 @@
 # This loads the configuration
 #
 import os
 try:
-    #Attempt Python 2 ConfigParser setup
+    # Attempt Python 2 ConfigParser setup
    import ConfigParser
    config = ConfigParser.ConfigParser()
    from ConfigParser import NoOptionError
 except ImportError:
-    #Attempt Python 3 ConfigParser setup
+    # Attempt Python 3 ConfigParser setup
    import configparser
    config = configparser.ConfigParser()
    from configparser import NoOptionError
 # This is the default configuration file that always needs to be present.
-default_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'defaults.cfg'))
+default_file = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "defaults.cfg")
 )
 # These files are optional
 # This specifies configurations that are typically specific to the machine (it is found alongside the GPy installation).
-local_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'installation.cfg'))
+local_file = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "installation.cfg")
 )
 # This specifies configurations specific to the user (it is found in the user home directory)
-home = os.getenv('HOME') or os.getenv('USERPROFILE') or ''
+home = os.getenv("HOME") or os.getenv("USERPROFILE") or ""
-user_file = os.path.join(home,'.config','GPy', 'user.cfg')
+user_file = os.path.join(home, ".config", "GPy", "user.cfg")
 # Read in the given files.
-config.readfp(open(default_file))
+config.read_file(open(default_file))
 config.read([local_file, user_file])
 if not config:
-    raise ValueError("No configuration file found at either " + user_file + " or " + local_file + " or " + default_file + ".")
+    raise ValueError(
        "No configuration file found at either "
        + user_file
        + " or "
        + local_file
        + " or "
        + default_file
        + "."
    )
--- a/GPy/util/initialization.py
+++ b/GPy/util/initialization.py
@ -1,31 +1,59 @@
-'''
+"""
 Created on 24 Feb 2014
@author: maxz
-'''
+"""
 import numpy as np
 import warnings
 from ..util.pca import PCA
 def initialize_latent(init, input_dim, Y):
    """
    :param init: initialization method for the latent space, 'PCA' or 'random'
    """
    Xr = np.asfortranarray(np.random.normal(0, 1, (Y.shape[0], input_dim)))
-    if 'PCA' in init:
+    if "PCA" == init:
        p = PCA(Y)
        PC = p.project(Y, min(input_dim, Y.shape[1]))
-        Xr[:PC.shape[0], :PC.shape[1]] = PC
+        Xr[: PC.shape[0], : PC.shape[1]] = PC
-        var = .1*p.fracs[:input_dim]
+        var = 0.1 * p.fracs[:input_dim]
-    elif init in 'empirical_samples':
+    elif init == "empirical_samples":
        # dealing with depcrecated initialization method
        # should be remove along the next major release
        warnings.warn(
            "Deprecated initialization method 'empirical_samples'. "
            "Use 'random' instead.",
            DeprecationWarning,
        )
        from ..util.linalg import tdot
        from ..util import diag
        YYT = tdot(Y)
        diag.add(YYT, 1e-6)
-        EMP = np.asfortranarray(np.random.multivariate_normal(np.zeros(Y.shape[0]), YYT, min(input_dim, Y.shape[1])).T)
+        EMP = np.asfortranarray(
-        Xr[:EMP.shape[0], :EMP.shape[1]] = EMP
+            np.random.multivariate_normal(
                np.zeros(Y.shape[0]), YYT, min(input_dim, Y.shape[1])
            ).T
        )
        Xr[: EMP.shape[0], : EMP.shape[1]] = EMP
        var = np.random.uniform(0.5, 1.5, input_dim)
-    else:
+    elif init == "random":
        var = Xr.var(0)
    else:
        # dealing with depcrecated initialization method
        # should be remove along the next major release
        warnings.warn(
            f"{init} is not a valid initialization method."
            "Supoprt for anything else than 'PCA' or 'random' will be removed in the next major release.",
            DeprecationWarning,
        )
        var = Xr.var(0)
    Xr -= Xr.mean(0)
    Xr /= Xr.std(0)
-    return Xr, var/var.max()
+    return Xr, var / var.max()
--- a/GPy/util/linalg_cython.c
+++ b/GPy/util/linalg_cython.c
--- a/GPy/util/multioutput.py
+++ b/GPy/util/multioutput.py
@ -2,6 +2,7 @@ import numpy as np
 import warnings
 import GPy
 def index_to_slices(index):
    """
    take a numpy array of integers (index) and return a  nested list of slices such that the slices describe the start, stop points for each integer in the index.
@ -16,28 +17,35 @@ def index_to_slices(index):
    returns
    >>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
    """
-    if len(index)==0:
+    if len(index) == 0:
-        return[]
+        return []
-    #contruct the return structure
+    # contruct the return structure
-    ind = np.asarray(index,dtype=np.int)
+    ind = np.asarray(index, dtype=int)
-    ret = [[] for i in range(ind.max()+1)]
+    ret = [[] for i in range(ind.max() + 1)]
-    #find the switchpoints
+    # find the switchpoints
-    ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
+    ind_ = np.hstack((ind, ind[0] + ind[-1] + 1))
-    switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
+    switchpoints = np.nonzero(ind_ - np.roll(ind_, +1))[0]
-    [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
+    [
        ret[ind_i].append(slice(*indexes_i))
        for ind_i, indexes_i in zip(
            ind[switchpoints[:-1]], zip(switchpoints, switchpoints[1:])
        )
    ]
    return ret
 def get_slices(input_list):
    num_outputs = len(input_list)
-    _s = [0] + [ _x.shape[0] for _x in input_list ]
+    _s = [0] + [_x.shape[0] for _x in input_list]
    _s = np.cumsum(_s)
-    slices = [slice(a,b) for a,b in zip(_s[:-1],_s[1:])]
+    slices = [slice(a, b) for a, b in zip(_s[:-1], _s[1:])]
    return slices
-def build_XY(input_list,output_list=None,index=None):
+
 def build_XY(input_list, output_list=None, index=None):
    num_outputs = len(input_list)
    if output_list is not None:
        assert num_outputs == len(output_list)
@ -47,27 +55,35 @@ def build_XY(input_list,output_list=None,index=None):
    if index is not None:
        assert len(index) == num_outputs
-        I = np.hstack( [np.repeat(j,_x.shape[0]) for _x,j in zip(input_list,index)] )
+        I = np.hstack([np.repeat(j, _x.shape[0]) for _x, j in zip(input_list, index)])
    else:
-        I = np.hstack( [np.repeat(j,_x.shape[0]) for _x,j in zip(input_list,range(num_outputs))] )
+        I = np.hstack(
            [np.repeat(j, _x.shape[0]) for _x, j in zip(input_list, range(num_outputs))]
        )
    X = np.vstack(input_list)
-    X = np.hstack([X,I[:,None]])
+    X = np.hstack([X, I[:, None]])
-    return X,Y,I[:,None]#slices
+    return X, Y, I[:, None]  # slices
-def build_likelihood(Y_list,noise_index,likelihoods_list=None):
+
 def build_likelihood(Y_list, noise_index, likelihoods_list=None):
    Ny = len(Y_list)
    if likelihoods_list is None:
-       likelihoods_list = [GPy.likelihoods.Gaussian(name="Gaussian_noise_%s" %j) for y,j in zip(Y_list,range(Ny))]
+        likelihoods_list = [
            GPy.likelihoods.Gaussian(name="Gaussian_noise_%s" % j)
            for y, j in zip(Y_list, range(Ny))
        ]
    else:
        assert len(likelihoods_list) == Ny
-    #likelihood = GPy.likelihoods.mixed_noise.MixedNoise(likelihoods_list=likelihoods_list, noise_index=noise_index)
+    # likelihood = GPy.likelihoods.mixed_noise.MixedNoise(likelihoods_list=likelihoods_list, noise_index=noise_index)
-    likelihood = GPy.likelihoods.mixed_noise.MixedNoise(likelihoods_list=likelihoods_list)
+    likelihood = GPy.likelihoods.mixed_noise.MixedNoise(
        likelihoods_list=likelihoods_list
    )
    return likelihood
-def ICM(input_dim, num_outputs, kernel, W_rank=1,W=None,kappa=None,name='ICM'):
+def ICM(input_dim, num_outputs, kernel, W_rank=1, W=None, kappa=None, name="ICM"):
    """
    Builds a kernel for an Intrinsic Coregionalization Model
@ -80,13 +96,26 @@ def ICM(input_dim, num_outputs, kernel, W_rank=1,W=None,kappa=None,name='ICM'):
    """
    if kernel.input_dim != input_dim:
        kernel.input_dim = input_dim
-        warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
+        warnings.warn(
            "kernel's input dimension overwritten to fit input_dim parameter."
        )
-    K = kernel.prod(GPy.kern.Coregionalize(1, num_outputs, active_dims=[input_dim], rank=W_rank,W=W,kappa=kappa,name='B'),name=name)
+    K = kernel.prod(
        GPy.kern.Coregionalize(
            1,
            num_outputs,
            active_dims=[input_dim],
            rank=W_rank,
            W=W,
            kappa=kappa,
            name="B",
        ),
        name=name,
    )
    return K
-def LCM(input_dim, num_outputs, kernels_list, W_rank=1,name='ICM'):
+def LCM(input_dim, num_outputs, kernels_list, W_rank=1, name="ICM"):
    """
    Builds a kernel for a Linear Coregionalization Model
@ -98,15 +127,15 @@ def LCM(input_dim, num_outputs, kernels_list, W_rank=1,name='ICM'):
    :type W_rank: integer
    """
    Nk = len(kernels_list)
-    K = ICM(input_dim,num_outputs,kernels_list[0],W_rank,name='%s%s' %(name,0))
+    K = ICM(input_dim, num_outputs, kernels_list[0], W_rank, name="%s%s" % (name, 0))
    j = 1
    for kernel in kernels_list[1:]:
-        K += ICM(input_dim,num_outputs,kernel,W_rank,name='%s%s' %(name,j))
+        K += ICM(input_dim, num_outputs, kernel, W_rank, name="%s%s" % (name, j))
        j += 1
    return K
-def Private(input_dim, num_outputs, kernel, output, kappa=None,name='X'):
+def Private(input_dim, num_outputs, kernel, output, kappa=None, name="X"):
    """
    Builds a kernel for an Intrinsic Coregionalization Model
@ -117,7 +146,7 @@ def Private(input_dim, num_outputs, kernel, output, kappa=None,name='X'):
    :param W_rank: number tuples of the coregionalization parameters 'W'
    :type W_rank: integer
    """
-    K = ICM(input_dim,num_outputs,kernel,W_rank=1,kappa=kappa,name=name)
+    K = ICM(input_dim, num_outputs, kernel, W_rank=1, kappa=kappa, name=name)
    K.B.W.fix(0)
    _range = range(num_outputs)
    _range.pop(output)
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -3,7 +3,6 @@ include doc/source/conf.py
 include doc/source/index.rst
 include doc/source/tuto*.rst
 include README.md
 include README.rst
 include AUTHORS.txt
 # Data and config
--- a/README.md
+++ b/README.md
@ -79,7 +79,7 @@ If that is the case, it is best to clean the repo and reinstall.
 [<img src="https://upload.wikimedia.org/wikipedia/commons/8/8e/OS_X-Logo.svg" height=40px>](http://www.apple.com/osx/)
 [<img src="https://upload.wikimedia.org/wikipedia/commons/3/35/Tux.svg" height=40px>](https://en.wikipedia.org/wiki/List_of_Linux_distributions)
-Python 3.5 and higher
+Python 3.9 and higher
 ## Citation
@ -129,7 +129,7 @@ If you're having trouble installing GPy via `pip install GPy` here is a probable
    cd GPy
    git checkout devel
    python setup.py build_ext --inplace
-    nosetests GPy/testing
+    pytest .
 ### Direct downloads
@ -171,13 +171,13 @@ print(m_load)
 New way of running tests is using coverage:
-Ensure nose and coverage is installed:
+Ensure pytest and coverage are installed:
-    pip install nose coverage
+    pip install pytest
 Run the tests from the root directory of the repository:
-    coverage run travis_tests.py
+    python travis_tests.py
 Create coverage report in htmlcov/
--- a/appveyor.yml
+++ b/appveyor.yml
@ -1,92 +0,0 @@
 environment:
  pip_access:
    secure: 8/ZjXFwtd1S7ixd7PJOpptupKKEDhm2da/q3unabJ00=
  COVERALLS_REPO_TOKEN:
    secure: d3Luic/ESkGaWnZrvWZTKrzO+xaVwJWaRCEP0F+K/9DQGPSRZsJ/Du5g3s4XF+tS
  gpy_version: 1.12.0
  matrix:
    - PYTHON_VERSION: 3.6
      MINICONDA: C:\Miniconda3-x64
      MPL_VERSION: 3.3.4
    - PYTHON_VERSION: 3.7
      MINICONDA: C:\Miniconda3-x64
      MPL_VERSION: 3.3.4
    - PYTHON_VERSION: 3.8
      MINICONDA: C:\Miniconda3-x64
      MPL_VERSION: 3.3.4
    - PYTHON_VERSION: 3.9
      MINICONDA: C:\Miniconda3-x64
      MPL_VERSION: 3.3.4
 #configuration:
 #  - Debug
 #  - Release
 install:
 - "set PATH=%MINICONDA%;%MINICONDA%\\Scripts;%PATH%"
 - conda config --set always_yes yes --set changeps1 no
 - conda update -q conda
 - conda info -a
 # github issue #955: freeze build version of matplotlib
 - "conda create -q -n build-environment python=%PYTHON_VERSION% numpy scipy matplotlib=%MPL_VERSION%"
 - activate build-environment
 # We need wheel installed to build wheels
 - python -m pip install wheel
 # GPy needs paramz
 - python -m pip install paramz
 - python -m pip install nose-show-skipped
 - python -m pip install coverage
 - python -m pip install coveralls
 - python -m pip install codecov
 - python -m pip install twine
 - "python setup.py develop"
 build: off
 test_script:
  # Put your test command here.
  # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4,
  # you can remove "build.cmd" from the front of the command, as it's
  # only needed to support those cases.
  # Note that you must use the environment variable %PYTHON% to refer to
  # the interpreter you're using - Appveyor does not do anything special
  # to put the Python version you want to use on PATH.
  #- "build.cmd %PYTHON%\\python.exe setup.py test"
  - "coverage run travis_tests.py"
 after_test:
  # This step builds your wheels.
  - "python setup.py bdist_wheel"
  - codecov
 artifacts:
  # bdist_wheel puts your built wheel in the dist directory
  - path: dist\*
 deploy_script:
 - echo [distutils] > %USERPROFILE%\\.pypirc
 - echo index-servers = >> %USERPROFILE%\\.pypirc
 - echo     pypi >> %USERPROFILE%\\.pypirc
 - echo     test >> %USERPROFILE%\\.pypirc
 - echo[
 - echo [pypi] >> %USERPROFILE%\\.pypirc
 - echo username = maxz >> %USERPROFILE%\\.pypirc
 - echo password = %pip_access% >> %USERPROFILE%\\.pypirc
 - echo[
 - echo [test] >> %USERPROFILE%\\.pypirc
 - echo repository = https://testpypi.python.org/pypi >> %USERPROFILE%\\.pypirc
 - echo username = maxz >> %USERPROFILE%\\.pypirc
 - echo password = %pip_access% >> %USERPROFILE%\\.pypirc
 - .appveyor_twine_upload.bat
 # deploy:
 #   - provider: GitHub
 #     release: GPy-v$(gpy_version)
 #     description: 'GPy windows install'
 #     artifact: dist/*.exe               # upload wininst to GitHub
 #     draft: false
 #     prerelease: false
 #     on:
 #         branch: deploy                 # release from deploy branch only
 #         appveyor_repo_tag: true        # deploy on tag push only
--- a/benchmarks/regression/evaluation.py
+++ b/benchmarks/regression/evaluation.py
@ -4,6 +4,7 @@
 import abc
 import numpy as np
 class Evaluation(object):
    __metaclass__ = abc.ABCMeta
@ -12,10 +13,10 @@ class Evaluation(object):
        """Compute a scalar for assessing the performance"""
        return None
 class RMSE(Evaluation):
    "Root Mean Square Error"
-    name = 'RMSE'
+    name = "RMSE"
    def evaluate(self, gt, pred):
-        return np.sqrt(np.square(gt-pred).astype(np.float).mean())
+        return np.sqrt(np.square(gt - pred).astype(float).mean())
--- a/doc/source/requirements.txt
+++ b/doc/source/requirements.txt
@ -7,4 +7,4 @@ paramz
 cython
 mock
 sympy
-nose
+pytest
--- a/Show more
+++ b/Show more
`@ -1 +1 @@`
	`__version__ = "1.12.0"`	`__version__ = "1.13.2"`
		`@ -1 +1 @@`
			`nosetests . --with-coverage --logging-level=INFO --cover-html --cover-html-dir=coverage --cover-package=GPy --cover-erase`				`pytest .`