Merge branch 'feature-multioutput-grad-obs' of git://github.com/esiivola/GPy into esiivola-feature-multioutput-grad-obs

2026-05-18 13:55:14 +02:00 · 2019-07-22 10:54:28 +01:00 · 2019-07-22 10:54:28 +01:00 · 22ce7ad207
commit 22ce7ad207
parent c8255bdca2 081d447bc2
24 changed files with 795 additions and 53 deletions
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -581,3 +581,53 @@ def warped_gp_cubic_sine(max_iters=100):
    m.plot(title="Standard GP")
    warp_m.plot_warping()
    pb.show()
 def multioutput_gp_with_derivative_observations():
    def plot_gp_vs_real(m, x, yreal, size_inputs, title, fixed_input=1, xlim=[0,11], ylim=[-1.5,3]):
        fig, ax = pb.subplots()
        ax.set_title(title)
        pb.plot(x, yreal, "r", label='Real function')
        rows = slice(0, size_inputs[0]) if fixed_input == 0 else slice(size_inputs[0], size_inputs[0]+size_inputs[1])
        m.plot(fixed_inputs=[(1, fixed_input)], which_data_rows=rows, xlim=xlim, ylim=ylim, ax=ax)
    f = lambda x: np.sin(x)+0.1*(x-2.)**2-0.005*x**3
    fd = lambda x: np.cos(x)+0.2*(x-2.)-0.015*x**2
    N=10 # Number of observations
    M=10 # Number of derivative observations
    Npred=100 # Number of prediction points
    sigma = 0.05 # Noise of observations
    sigma_der = 0.05 # Noise of derivative observations
    x = np.array([np.linspace(1,10,N)]).T 
    y = f(x) + np.array(sigma*np.random.normal(0,1,(N,1)))
    xd = np.array([np.linspace(2,8,M)]).T
    yd = fd(xd) + np.array(sigma_der*np.random.normal(0,1,(M,1)))
    xpred = np.array([np.linspace(0,11,Npred)]).T
    ypred_true = f(xpred)
    ydpred_true = fd(xpred)
    # squared exponential kernel:
    se = GPy.kern.RBF(input_dim = 1, lengthscale=1.5, variance=0.2)
    # We need to generate separate kernel for the derivative observations and give the created kernel as an input:
    se_der = GPy.kern.DiffKern(se, 0)
    #Then 
    gauss = GPy.likelihoods.Gaussian(variance=sigma**2)
    gauss_der = GPy.likelihoods.Gaussian(variance=sigma_der**2)
    # Then create the model, we give everything in lists, the order of the inputs indicates the order of the outputs
    # Now we have the regular observations first and derivative observations second, meaning that the kernels and
    # the likelihoods must follow the same order. Crosscovariances are automatically taken car of
    m = GPy.models.MultioutputGP(X_list=[x, xd], Y_list=[y, yd], kernel_list=[se, se_der], likelihood_list = [gauss, gauss])
    # Optimize the model
    m.optimize(messages=0, ipython_notebook=False)
    #Plot the model, the syntax is same as for multioutput models:
    plot_gp_vs_real(m, xpred, ydpred_true, [x.shape[0], xd.shape[0]], title='Latent function derivatives', fixed_input=1, xlim=[0,11], ylim=[-1.5,3])
    plot_gp_vs_real(m, xpred, ypred_true, [x.shape[0], xd.shape[0]], title='Latent function', fixed_input=0, xlim=[0,11], ylim=[-1.5,3])
    #making predictions for the values:
    mu, var = m.predict_noiseless(Xnew=[xpred, np.empty((0,1))])
--- a/GPy/kern/init.py
+++ b/GPy/kern/init.py
@ -44,3 +44,5 @@ from .src.sde_static import sde_White, sde_Bias
 from .src.sde_stationary import sde_RBF,sde_Exponential,sde_RatQuad
 from .src.sde_brownian import sde_Brownian
 from .src.multioutput_kern import MultioutputKern
 from .src.multioutput_derivative_kern import MultioutputDerivativeKern
 from .src.diff_kern import DiffKern
--- a/GPy/kern/src/ODE_UY.py
+++ b/GPy/kern/src/ODE_UY.py
@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 from .kern import Kern
-from .independent_outputs import index_to_slices
+from ...util.multioutput import index_to_slices
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
 import numpy as np
--- a/GPy/kern/src/ODE_UYC.py
+++ b/GPy/kern/src/ODE_UYC.py
@ -5,7 +5,7 @@ from .kern import Kern
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
 import numpy as np
-from .independent_outputs import index_to_slices
+from ...util.multioutput import index_to_slices
 class ODE_UYC(Kern):
    def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., ubias =1. ,active_dims=None, name='ode_uyc'):
--- a/GPy/kern/src/ODE_st.py
+++ b/GPy/kern/src/ODE_st.py
@ -4,7 +4,7 @@ from .kern import Kern
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
 import numpy as np
-from .independent_outputs import index_to_slices
+from ...util.multioutput import index_to_slices
 class ODE_st(Kern):
--- a/GPy/kern/src/ODE_t.py
+++ b/GPy/kern/src/ODE_t.py
@ -2,7 +2,7 @@ from .kern import Kern
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
 import numpy as np
-from .independent_outputs import index_to_slices
+from ...util.multioutput import index_to_slices
 class ODE_t(Kern):
--- a/GPy/kern/src/diff_kern.py
+++ b/GPy/kern/src/diff_kern.py
@ -0,0 +1,88 @@
 # Copyright (c) 2018, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 from .kern import Kern
 import numpy as np
 from paramz.caching import Cache_this
 class DiffKern(Kern):
    """
    Diff kernel is a thin wrapper for using partial derivatives of kernels as kernels. Eg. in combination with
    Multioutput kernel this allows the user to train GPs with observations of latent function and latent
    function derivatives. NOTE: DiffKern only works when used with Multioutput kernel. Do not use the kernel as standalone
    The parameters the kernel needs are:
    -'base_kern': a member of Kernel class that is used for observations
    -'dimension': integer that indigates in which dimensions the partial derivative observations are
    """
    def __init__(self, base_kern, dimension):
        super(DiffKern, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='DiffKern')
        self.base_kern = base_kern
        self.dimension = dimension
    def parameters_changed(self):
        self.base_kern.parameters_changed()
    @Cache_this(limit=3, ignore_args=())
    def K(self, X, X2=None, dimX2 = None): #X in dimension self.dimension
        if X2 is None:
            X2 = X
        if dimX2 is None:
            dimX2 = self.dimension
        return self.base_kern.dK2_dXdX2(X,X2, self.dimension, dimX2)
    @Cache_this(limit=3, ignore_args=())
    def Kdiag(self, X):
        return np.diag(self.base_kern.dK2_dXdX2(X,X, self.dimension, self.dimension))
    @Cache_this(limit=3, ignore_args=())
    def dK_dX_wrap(self, X, X2): #X in dimension self.dimension
        return self.base_kern.dK_dX(X,X2, self.dimension)
    @Cache_this(limit=3, ignore_args=())
    def dK_dX2_wrap(self, X, X2): #X in dimension self.dimension
        return self.base_kern.dK_dX2(X,X2, self.dimension)
    def reset_gradients(self):
        self.base_kern.reset_gradients()
    @property
    def gradient(self):
        return self.base_kern.gradient
    @gradient.setter
    def gradient(self, gradient):
        self.base_kern.gradient = gradient
    def update_gradients_full(self, dL_dK, X, X2=None, dimX2=None):
        if dimX2 is None:
            dimX2 = self.dimension
        gradients = self.base_kern.dgradients2_dXdX2(X,X2,self.dimension,dimX2)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
    def update_gradients_diag(self, dL_dK_diag, X):
        gradients = self.base_kern.dgradients2_dXdX2(X,X, self.dimension, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK_diag, gradient, f=np.diag) for gradient in gradients])
    def update_gradients_dK_dX(self, dL_dK, X, X2=None):
        if X2 is None:
            X2 = X
        gradients = self.base_kern.dgradients_dX(X,X2, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
    def update_gradients_dK_dX2(self, dL_dK, X, X2=None):
        gradients = self.base_kern.dgradients_dX2(X,X2, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
    def gradients_X(self, dL_dK, X, X2):
        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,:, self.dimension]
        return np.sum(tmp, axis=1)
    def gradients_X2(self, dL_dK, X, X2):
        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:, :, self.dimension, :]
        return np.sum(tmp, axis=1)
    def _convert_gradients(self, l,g, f = lambda x:x):
        if type(g) is np.ndarray:
            return np.sum(f(l)*f(g))
        else:
            return np.array([np.sum(f(l)*f(gi)) for gi in g])
--- a/GPy/kern/src/independent_outputs.py
+++ b/GPy/kern/src/independent_outputs.py
@ -5,34 +5,8 @@
 from .kern import CombinationKernel
 import numpy as np
 import itertools
 from ...util.multioutput import index_to_slices
 def index_to_slices(index):
    """
    take a numpy array of integers (index) and return a  nested list of slices such that the slices describe the start, stop points for each integer in the index.
    e.g.
    >>> index = np.asarray([0,0,0,1,1,1,2,2,2])
    returns
    >>> [[slice(0,3,None)],[slice(3,6,None)],[slice(6,9,None)]]
    or, a more complicated example
    >>> index = np.asarray([0,0,1,1,0,2,2,2,1,1])
    returns
    >>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
    """
    if len(index)==0:
        return[]
    #contruct the return structure
    ind = np.asarray(index,dtype=np.int)
    ret = [[] for i in range(ind.max()+1)]
    #find the switchpoints
    ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
    switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
    [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
    return ret
 class IndependentOutputs(CombinationKernel):
    """
--- a/GPy/kern/src/kern.py
+++ b/GPy/kern/src/kern.py
@ -116,6 +116,12 @@ class Kern(Parameterized):
        except:
            return X[:, self._all_dims_active]
    def _project_dim(self, dim):
        try:
            return np.where(self._all_dims_active == dim)[0][0]
        except:
            return None
    def K(self, X, X2):
        """
        Compute the kernel function.
--- a/GPy/kern/src/kernel_slice_operations.py
+++ b/GPy/kern/src/kernel_slice_operations.py
@ -19,16 +19,26 @@ def put_clean(dct, name, func):
 class KernCallsViaSlicerMeta(ParametersChangedMeta):
    def __new__(cls, name, bases, dct):
        put_clean(dct, 'K', _slice_K)
        put_clean(dct, 'dK_dX', _slice_dK_dX)
        put_clean(dct, 'dK_dX2', _slice_dK_dX)
        put_clean(dct, 'dK2_dXdX2', _slice_dK2_dXdX2)
        put_clean(dct, 'Kdiag', _slice_Kdiag)
        put_clean(dct, 'phi', _slice_Kdiag)
        put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
        put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
        put_clean(dct, 'update_gradients_dK_dX', _slice_update_gradients_full)
        put_clean(dct, 'update_gradients_dK_dX2', _slice_update_gradients_full)
        put_clean(dct, 'gradients_X', _slice_gradients_X)
        put_clean(dct, 'gradients_X2', _slice_gradients_X)
        put_clean(dct, 'gradients_X_X2', _slice_gradients_X)
        put_clean(dct, 'gradients_XX', _slice_gradients_XX)
        put_clean(dct, 'gradients_XX_diag', _slice_gradients_XX_diag)
        put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)
        put_clean(dct, 'dgradients_dX',_slice_partial_gradients_list_X)
        put_clean(dct, 'dgradients_dX2',_slice_partial_gradients_list_X)
        put_clean(dct, 'dgradients2_dXdX2',_slice_partial_gradients_list_XX)
        put_clean(dct, 'psi0', _slice_psi)
        put_clean(dct, 'psi1', _slice_psi)
        put_clean(dct, 'psi2', _slice_psi)
@ -79,6 +89,20 @@ class _Slice_wrap(object):
            return ret
        return return_val
    def handle_return_list(self, return_val):
        if self.ret:
            ret = [np.zeros(self.shape) for _ in range(len(return_val))]
            if len(self.shape) == 3:
                for i in range(len(return_val)):
                    ret[i][self.k._all_dims_active, :, :] = return_val[i]
                #[ret[i].__setitem__((:, :, self.k._all_dims_active), return_val[i])  for i in range(len(return_val))]             
            elif len(self.shape) == 4:
                for i in range(len(return_val)):
                    ret[i][np.ix_(self.k._all_dims_active, self.k._all_dims_active)] = return_val[i]
                #[ret[i].__setitem__((:, :, self.k._all_dims_active, self.k._all_dims_active), return_val[i])  for i in range(len(return_val))]             
            return ret
        return return_val
 def _slice_K(f):
    @wraps(f)
    def wrap(self, X, X2 = None, *a, **kw):
@ -97,15 +121,15 @@ def _slice_Kdiag(f):
 def _slice_update_gradients_full(f):
    @wraps(f)
-    def wrap(self, dL_dK, X, X2=None):
+    def wrap(self, dL_dK, X, X2=None, *a, **kw):
        with _Slice_wrap(self, X, X2) as s:
-            ret = f(self, dL_dK, s.X, s.X2)
+            ret = f(self, dL_dK, s.X, s.X2, *a, **kw)
        return ret
    return wrap
 def _slice_update_gradients_diag(f):
    @wraps(f)
-    def wrap(self, dL_dKdiag, X):
+    def wrap(self, dL_dKdiag, X, *a, **kw):
        with _Slice_wrap(self, X, None) as s:
            ret = f(self, dL_dKdiag, s.X)
        return ret
@ -119,6 +143,99 @@ def _slice_gradients_X(f):
        return ret
    return wrap
 def _slice_dK_dX(f):
    @wraps(f)
    def wrap(self, X, X2, dim, *a, **kw):
        with _Slice_wrap(self, X, X2) as s:
            d = s.k._project_dim(dim)
            if d is None:
                ret = np.zeros((X.shape[0], X2.shape[0]))
            else:
                ret = f(self, s.X, s.X2, d, *a, **kw)
        return ret
    return wrap
 def _slice_dK2_dXdX2(f):
    @wraps(f)
    def wrap(self, X, X2, dimX, dimX2, *a, **kw):
        with _Slice_wrap(self, X, X2) as s:
            d = s.k._project_dim(dimX)
            d2 = s.k._project_dim(dimX2)
            if (d is None) or (d2 is None):
                ret = np.zeros((X.shape[0], X2.shape[0]))
            else:
                ret = f(self, s.X, s.X2, d, d2, *a, **kw)
        return ret
    return wrap
 def _slice_partial_gradients_X(f):
    @wraps(f)
    def wrap(self, X, X2, dim):
        if X2 is None:
            N, M = X.shape[0], X.shape[0]
        else:
            N, M = X.shape[0], X2.shape[0]
        Q1 = X.shape[1]
        with _Slice_wrap(self, X, X2, ret_shape=(N, M, Q1)) as s:
            ret = s.handle_return_array(f(self, s.X, s.X2, dim))
        return ret
    return wrap
 def _slice_partial_gradients_list_X(f):
    @wraps(f)
    def wrap(self, X, X2, dim):
        if X2 is None:
            N, M = X.shape[0], X.shape[0]
        else:
            N, M = X.shape[0], X2.shape[0]
        with _Slice_wrap(self, X, X2, ret_shape=(N, M)) as s:
            d = s.k._project_dim(dim)
            if d is None:
                ret = [np.zeros((N, M)) for i in range(s.k.size)]
            else:            
                ret = f(self, s.X, s.X2, d)
        return ret
    return wrap
 def _slice_partial_gradients_XX(f):
    @wraps(f)
    def wrap(self, X, X2, dim, dimX2):
        if X2 is None:
            N, M = X.shape[0], X.shape[0]
            Q1 = X.shape[1]
        else:
            N, M = X.shape[0], X2.shape[0]
            Q1, Q2 = X.shape[1], X2.shape[1]
        with _Slice_wrap(self, X, X2, ret_shape=(N, M, Q1, Q2)) as s:
            ret = s.handle_return_array(f(self, s.X, s.X2, dim, dimX2))
        return ret
    return wrap
 def _slice_partial_gradients_list_XX(f):
    @wraps(f)
    def wrap(self, X, X2, dimX, dimX2):
        if X2 is None:
            N, M = X.shape[0], X.shape[0]
        else:
            N, M = X.shape[0], X2.shape[0]
        with _Slice_wrap(self, X, X2, ret_shape=(N, M)) as s:
            d = s.k._project_dim(dimX)
            d2 = s.k._project_dim(dimX2)
            if (d is None) or (d2 is None):
                ret = [np.zeros((N, M)) for i in range(s.k.size)]
            else:
                ret = f(self, s.X, s.X2, d, d2)
        return ret
    return wrap
 def _slice_gradient_derivatives_X(f):
    @wraps(f)
    def wrap(self, dL_dK, X, X2=None):
        with _Slice_wrap(self, X, X2) as s:
            ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
        return ret
    return wrap
 def _slice_gradients_X_diag(f):
    @wraps(f)
    def wrap(self, dL_dKdiag, X):
--- a/GPy/kern/src/multioutput_derivative_kern.py
+++ b/GPy/kern/src/multioutput_derivative_kern.py
@ -0,0 +1,84 @@
 # Copyright (c) 2018, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 from .kern import Kern, CombinationKernel
 from .multioutput_kern import MultioutputKern, ZeroKern
 import numpy as np
 from functools import partial
 class KernWrapper(Kern):
    def __init__(self, fk, fug, fg, base_kern):
        self.fk = fk
        self.fug = fug
        self.fg = fg
        self.base_kern = base_kern
        super(KernWrapper, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='KernWrapper',useGPU=False)
    def K(self, X, X2=None):
        return self.fk(X,X2=X2)
    def update_gradients_full(self,dL_dK, X, X2=None):
        return self.fug(dL_dK, X, X2=X2)
    def gradients_X(self,dL_dK, X, X2=None):
        return self.fg(dL_dK, X, X2=X2)
    @property
    def gradient(self):
        return self.base_kern.gradient
    @gradient.setter
    def gradient(self, gradient):
        self.base_kern.gradient = gradient
 class MultioutputDerivativeKern(MultioutputKern):
    """
    Multioutput derivative kernel is a meta class for combining different kernels for multioutput GPs.
    Multioutput derivative kernel is only a thin wrapper for Multioutput kernel for user not having to define
    cross covariances.
    """
    def __init__(self, kernels, cross_covariances={}, name='MultioutputDerivativeKern'):
        #kernels contains a list of kernels as input, 
        if not isinstance(kernels, list):
            self.single_kern = True
            self.kern = kernels
            kernels = [kernels]
        else:
            self.single_kern = False
            self.kern = kernels
        # The combination kernel ALLWAYS puts the extra dimension last.
        # Thus, the index dimension of this kernel is always the last dimension
        # after slicing. This is why the index_dim is just the last column:
        self.index_dim = -1
        super(MultioutputKern, self).__init__(kernels=kernels, extra_dims=[self.index_dim], name=name, link_parameters=False)
        nl = len(kernels)
        #build covariance structure
        covariance = [[None for i in range(nl)] for j in range(nl)]
        linked = []
        for i in range(0,nl):
            unique=True
            for j in range(0,nl):
                if i==j or (kernels[i] is kernels[j]):
                    kern = kernels[i]
                    if i>j:
                        unique=False
                elif cross_covariances.get((i,j)) is not None: #cross covariance is given
                    kern = cross_covariances.get((i,j))
                elif kernels[i].name == 'DiffKern' and kernels[i].base_kern == kernels[j]: # one is derivative of other
                    kern = KernWrapper(kernels[i].dK_dX_wrap,kernels[i].update_gradients_dK_dX,kernels[i].gradients_X, kernels[j])
                    unique=False
                elif kernels[j].name == 'DiffKern' and kernels[j].base_kern == kernels[i]: # one is derivative of other
                    kern = KernWrapper(kernels[j].dK_dX2_wrap,kernels[j].update_gradients_dK_dX2,kernels[j].gradients_X2, kernels[i])
                elif kernels[i].name == 'DiffKern' and kernels[j].name == 'DiffKern' and kernels[i].base_kern == kernels[j].base_kern: #both are partial derivatives
                    kern = KernWrapper(partial(kernels[i].K, dimX2=kernels[j].dimension), partial(kernels[i].update_gradients_full, dimX2=kernels[j].dimension),None, kernels[i].base_kern)
                    if i>j:
                        unique=False
                else:
                    kern = ZeroKern()
                covariance[i][j] = kern
            if unique is True:
                linked.append(i)
        self.covariance = covariance
        self.link_parameters(*[kernels[i] for i in linked])
--- a/GPy/kern/src/multioutput_kern.py
+++ b/GPy/kern/src/multioutput_kern.py
@ -1,7 +1,10 @@
 # Copyright (c) 2018, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 from .kern import Kern, CombinationKernel
 import numpy as np
 from functools import reduce, partial
-from .independent_outputs import index_to_slices
+from ...util.multioutput import index_to_slices
 from paramz.caching import Cache_this
 class ZeroKern(Kern):
@ -18,6 +21,13 @@ class ZeroKern(Kern):
    def gradients_X(self,dL_dK, X, X2=None):
        return np.zeros((X.shape[0],X.shape[1]))
    @property
    def gradient(self):
        return np.empty((1,0))
    @gradient.setter
    def gradient(self, gradient):
        pass
 class MultioutputKern(CombinationKernel):
    """
@ -62,14 +72,13 @@ class MultioutputKern(CombinationKernel):
            unique=True
            for j in range(0,nl):
                if i==j or (kernels[i] is kernels[j]):
-                    covariance[i][j] = {'kern': kernels[i], 'K': kernels[i].K, 'update_gradients_full': kernels[i].update_gradients_full, 'gradients_X': kernels[i].gradients_X}
+                    covariance[i][j] = kernels[i]
                    if i>j:
                        unique=False
                elif cross_covariances.get((i,j)) is not None: #cross covariance is given
                    covariance[i][j] = cross_covariances.get((i,j))
                else: # zero covariance structure
-                    kern = ZeroKern()
+                    covariance[i][j] = ZeroKern()
                    covariance[i][j] = {'kern': kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X}       
            if unique is True:
                linked.append(i)
        self.covariance = covariance
@ -82,7 +91,7 @@ class MultioutputKern(CombinationKernel):
        slices = index_to_slices(X[:,self.index_dim])
        slices2 = index_to_slices(X2[:,self.index_dim])
        target =  np.zeros((X.shape[0], X2.shape[0]))
-        [[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j]['K'](X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))]  
+        [[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j].K(X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))]  
        return target
    @Cache_this(limit=3, ignore_args=())
@ -93,10 +102,10 @@ class MultioutputKern(CombinationKernel):
        [[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)]
        return target
-    def _update_gradients_full_wrapper(self, cov_struct, dL_dK, X, X2):
+    def _update_gradients_full_wrapper(self, kern, dL_dK, X, X2):
-        gradient = cov_struct['kern'].gradient.copy()
+        gradient = kern.gradient.copy()
-        cov_struct['update_gradients_full'](dL_dK, X, X2)
+        kern.update_gradients_full(dL_dK, X, X2)
-        cov_struct['kern'].gradient += gradient
+        kern.gradient += gradient
    def _update_gradients_diag_wrapper(self, kern, dL_dKdiag, X):
        gradient = kern.gradient.copy()
@ -118,14 +127,14 @@ class MultioutputKern(CombinationKernel):
    def update_gradients_diag(self, dL_dKdiag, X):
        self.reset_gradients()
        slices = index_to_slices(X[:,self.index_dim])
-        [[ self._update_gradients_diag_wrapper(self.covariance[i][i]['kern'], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))]
+        [[ self._update_gradients_diag_wrapper(self.covariance[i][i], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))]
    def gradients_X(self,dL_dK, X, X2=None):
        slices = index_to_slices(X[:,self.index_dim])
        target = np.zeros((X.shape[0], X.shape[1]) )
        if X2 is not None:
            slices2 = index_to_slices(X2[:,self.index_dim])
-            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j]['gradients_X'](dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) ) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
+            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) ) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
        else:
-            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j]['gradients_X'](dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], (None if (i==j and k==l) else X[slices[j][l],:] )) ) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
+            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], (None if (i==j and k==l) else X[slices[j][l],:] )) ) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
        return target
--- a/GPy/kern/src/rbf.py
+++ b/GPy/kern/src/rbf.py
@ -6,6 +6,7 @@ import numpy as np
 from .stationary import Stationary
 from .psi_comp import PSICOMP_RBF, PSICOMP_RBF_GPU
 from ...core import Param
 from paramz.caching import Cache_this
 from paramz.transformations import Logexp
 from .grid_kerns import GridRBF
@ -50,6 +51,27 @@ class RBF(Stationary):
    def K_of_r(self, r):
        return self.variance * np.exp(-0.5 * r**2)
    @Cache_this(limit=3, ignore_args=())
    def dK_dX(self, X, X2, dimX):
        r = self._scaled_dist(X, X2)
        K = self.K_of_r(r)
        dist = X[:,None,dimX]-X2[None,:,dimX]
        lengthscale2inv = (np.ones((X.shape[1]))/(self.lengthscale**2))[dimX]
        return -1.*K*dist*lengthscale2inv
    @Cache_this(limit=3, ignore_args=())
    def dK_dX2(self, X, X2, dimX2):
        return -self.dK_dX(X,X2, dimX2)
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX2(self, X, X2, dimX, dimX2):
        r = self._scaled_dist(X, X2)
        K = self.K_of_r(r)
        if X2 is None:
            X2=X
        dist = X[:,None,:]-X2[None,:,:]
        lengthscale2inv = np.ones((X.shape[1]))/(self.lengthscale**2)
        return -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX]*lengthscale2inv[dimX2] + (dimX==dimX2)*K*lengthscale2inv[dimX]
    def dK_dr(self, r):
        return -r*self.K_of_r(r)
@ -59,6 +81,74 @@ class RBF(Stationary):
    def dK2_drdr_diag(self):
        return -self.variance # as the diagonal of r is always filled with zeros
    @Cache_this(limit=3, ignore_args=())
    def dK_dvariance(self,X,X2):
        return self.K(X,X2)/self.variance
    @Cache_this(limit=3, ignore_args=())
    def dK2_dvariancedX(self, X, X2, dim):
        return self.dK_dX(X,X2, dim)/self.variance
    @Cache_this(limit=3, ignore_args=())
    def dK2_dvariancedX2(self, X, X2, dim):
        return self.dK_dX2(X,X2, dim)/self.variance
    @Cache_this(limit=3, ignore_args=())
    def dK3_dvariancedXdX2(self, X, X2, dim, dimX2):
        return self.dK2_dXdX2(X, X2, dim, dimX2)/self.variance
    @Cache_this(limit=3, ignore_args=())
    def dK2_dlengthscaledX(self, X, X2, dimX):
        r = self._scaled_dist(X, X2)
        K = self.K_of_r(r)
        if X2 is None:
            X2=X
        dist = X[:,None,:]-X2[None,:,:]
        lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
        if self.ARD:
            g = []
            for diml in range(X.shape[1]):
                g += [-1.*K*dist[:,:,dimX]*(dist[:,:,diml]**2)*(lengthscaleinv[dimX]**2)*(lengthscaleinv[diml]**3) + 2.*dist[:,:,dimX]*(lengthscaleinv[diml]**3)*K*(dimX == diml)]
        else:
            g = -1.*K*dist[:,:,dimX]*np.sum(dist**2, axis=2)*(lengthscaleinv[dimX]**5) + 2.*dist[:,:,dimX]*(lengthscaleinv[dimX]**3)*K
        return g
    @Cache_this(limit=3, ignore_args=())
    def dK2_dlengthscaledX2(self, X, X2, dimX2):
        tmp = self.dK2_dlengthscaledX(X, X2, dimX2)
        if self.ARD:
            return [-1.*g for g in tmp]
        else:
            return -1*tmp
    @Cache_this(limit=3, ignore_args=())
    def dK3_dlengthscaledXdX2(self, X, X2, dimX, dimX2):
        r = self._scaled_dist(X, X2)
        K = self.K_of_r(r)
        if X2 is None:
            X2=X
        dist = X[:,None,:]-X2[None,:,:]
        lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
        lengthscale2inv = lengthscaleinv**2
        if self.ARD:
            g = []
            for diml in range(X.shape[1]):
                tmp = -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*(dist[:,:,diml]**2)*lengthscale2inv[dimX]*lengthscale2inv[dimX2]*(lengthscaleinv[diml]**3)
                if dimX == dimX2:
                    tmp += K*lengthscale2inv[dimX]*(lengthscaleinv[diml]**3)*(dist[:,:,diml]**2)
                if diml == dimX:
                    tmp += 2.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX2]*(lengthscaleinv[dimX]**3)
                if diml == dimX2:
                    tmp += 2.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX]*(lengthscaleinv[dimX2]**3)
                    if dimX == dimX2:
                        tmp += -2.*K*(lengthscaleinv[dimX]**3)
                g += [tmp]
        else:
            g = -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*np.sum(dist**2, axis=2)*(lengthscaleinv[dimX]**7) +4*K*dist[:,:,dimX]*dist[:,:,dimX2]*(lengthscaleinv[dimX]**5)
            if dimX == dimX2:
                g += -2.*K*(lengthscaleinv[dimX]**3) + K*(lengthscaleinv[dimX]**5)*np.sum(dist**2, axis=2)
        return g
    def __getstate__(self):
        dc = super(RBF, self).__getstate__()
        if self.useGPU:
--- a/GPy/kern/src/splitKern.py
+++ b/GPy/kern/src/splitKern.py
@ -4,7 +4,7 @@ A new kernel
 import numpy as np
 from .kern import Kern, CombinationKernel
-from .independent_outputs import index_to_slices
+from ...util.multioutput import index_to_slices
 import itertools
 class DEtime(Kern):
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@ -212,7 +212,6 @@ class Stationary(Kern):
            r = self._scaled_dist(X, X2)
            self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale
    def update_gradients_direct(self, dL_dVar, dL_dLen):
        """
        Specially intended for the Grid regression case.
@ -308,6 +307,21 @@ class Stationary(Kern):
        return dL_dK_diag * (np.eye(X.shape[1]) * -self.dK2_drdr_diag()/(l4))[None, :,:]# np.zeros(X.shape+(X.shape[1],))
        #return np.ones(X.shape) * d2L_dK * self.variance/self.lengthscale**2 # np.zeros(X.shape)
    def dgradients_dX(self, X, X2, dimX):
        g1 = self.dK2_dvariancedX(X, X2, dimX)
        g2 = self.dK2_dlengthscaledX(X, X2, dimX)
        return [g1, g2]
    def dgradients_dX2(self, X, X2, dimX2):
        g1 = self.dK2_dvariancedX2(X, X2, dimX2)
        g2 = self.dK2_dlengthscaledX2(X, X2, dimX2)
        return [g1, g2]
    def dgradients2_dXdX2(self, X, X2, dimX, dimX2):
        g1 = self.dK3_dvariancedXdX2(X, X2, dimX, dimX2)
        g2 = self.dK3_dlengthscaledXdX2(X, X2, dimX, dimX2)
        return [g1, g2]
    def _gradients_X_pure(self, dL_dK, X, X2=None):
        invdist = self._inv_dist(X, X2)
        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
--- a/GPy/likelihoods/binomial.py
+++ b/GPy/likelihoods/binomial.py
@ -66,7 +66,6 @@ class Binomial(Likelihood):
        np.testing.assert_array_equal(N.shape, y.shape)
        nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1)
        Ny = N-y
        t1 = np.zeros(y.shape)
        t2 = np.zeros(y.shape)
@ -177,6 +176,41 @@ class Binomial(Likelihood):
    def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
        pass
    def moments_match_ep(self,obs,tau,v,Y_metadata_i=None):
        """
        Calculation of moments using quadrature
        :param obs: observed output
        :param tau: cavity distribution 1st natural parameter (precision)
        :param v: cavity distribution 2nd natural paramenter (mu*precision)
        """
        #Compute first integral for zeroth moment.
        #NOTE constant np.sqrt(2*pi/tau) added at the end of the function
        if (isinstance(self.gp_link, link_functions.Probit) or isinstance(self.gp_link, link_functions.ScaledProbit)) and (Y_metadata_i is None or int(Y_metadata_i.get('trials', 1)) == int(1)): #Special case for probit likelihood. Can be found from Riihimaki et Vehtari 2010
            if isinstance(self.gp_link, link_functions.ScaledProbit):
                nu = self.gp_link.nu
            else:
                nu = 1.0
            nu = self.gp_link.nu
            mu = v/tau
            sigma2 = 1./tau
            t = np.asarray(1 + sigma2*(nu**2))
            t[t<1e-20] = 1e-20
            a = np.sqrt(t)
            z = obs*mu/a
            normc_z = max(self.gp_link.transf(z), 1e-20)
            m0 = normc_z
            normp_z = self.gp_link.dtransf_df(z)
            m1 = mu + (obs*sigma2*normp_z)/(normc_z*a)
            #print('tau: {}, v: {}, nu: {}, z: {}, normc_z: {}, normp_z: {}'.format(tau, v, nu.values, z, normc_z, normp_z))
            m2 = sigma2 - ((sigma2**2)*normp_z)/((1./(nu**2)+sigma2)*normc_z)*(z + normp_z/(nu**2)/normc_z)
            #print("m0: {}, m1: {}, m2: {}".format(m0,m1,m2))
            #m0a, m1a, m2a =  super(Binomial, self).moments_match_ep(obs,tau,v,Y_metadata_i)
            #print("m0a: {}, m1a: {}, m2a: {}".format(m0a,m1a,m2a))
            return m0, m1, m2
        else:
            return super(Binomial, self).moments_match_ep(obs,tau,v,Y_metadata_i)
    def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
        if isinstance(self.gp_link, link_functions.Probit):
--- a/GPy/likelihoods/link_functions.py
+++ b/GPy/likelihoods/link_functions.py
@ -138,6 +138,38 @@ class Probit(GPTransformation):
        input_dict["class"] = "GPy.likelihoods.link_functions.Probit"
        return input_dict
 class ScaledProbit(Probit):
    """
    .. math::
        g(f) = \\Phi^{-1} (nu*mu)
    """
    def __init__(self, nu=1.):
        self.nu = float(nu)
    def transf(self,f):
        return std_norm_cdf(f*self.nu)
    def dtransf_df(self,f):
        return std_norm_pdf(f*self.nu)*self.nu
    def d2transf_df2(self,f):
        return -(f*self.nu) * std_norm_pdf(f*self.nu)*(self.nu**2)
    def d3transf_df3(self,f):
        return (safe_square(f*self.nu)-1.)*std_norm_pdf(f*self.nu)*(self.nu**3)
    def to_dict(self):
        """
        Convert the object into a json serializable dictionary.
        Note: It uses the private method _save_to_input_dict of the parent.
        :return dict: json serializable dictionary containing the needed information to instantiate the object
        """
        input_dict = super(ScaledProbit, self)._save_to_input_dict()
        input_dict["class"] = "GPy.likelihoods.link_functions.ScaledProbit"
        return input_dict
 class Cloglog(GPTransformation):
    """
--- a/GPy/likelihoods/multioutput_likelihood.py
+++ b/GPy/likelihoods/multioutput_likelihood.py
@ -14,7 +14,7 @@ from .gaussian import Gaussian
 from ..core.parameterization import Param
 from paramz.transformations import Logexp
 from ..core.parameterization import Parameterized
-from ..kern.src.independent_outputs import index_to_slices
+from ..util.multioutput import index_to_slices
 import itertools
 class MultioutputLikelihood(MixedNoise):
--- a/GPy/models/init.py
+++ b/GPy/models/init.py
@ -30,3 +30,4 @@ from .gp_grid_regression import GPRegressionGrid
 from .gp_multiout_regression import GPMultioutRegression
 from .gp_multiout_regression_md import GPMultioutRegressionMD
 from .tp_regression import TPRegression
 from .multioutput_gp import MultioutputGP
--- a/GPy/models/multioutput_gp.py
+++ b/GPy/models/multioutput_gp.py
@ -0,0 +1,147 @@
 # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 import itertools
 from ..core.model import Model
 from ..core.parameterization.variational import VariationalPosterior
 from ..core.mapping import Mapping
 from .. import likelihoods
 from ..likelihoods.gaussian import Gaussian
 from .. import kern
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
 from ..util.normalizer import Standardize
 from .. import util
 from paramz import ObsAr
 from ..core.gp import GP
 from ..util.multioutput import index_to_slices
 import logging
 import warnings
 logger = logging.getLogger("GP")
 class MultioutputGP(GP):
    """
    Gaussian process model for using observations from multiple likelihoods and different kernels
    :param X_list: input observations in a list for each likelihood
    :param Y: output observations in a list for each likelihood
    :param kernel_list: kernels in a list for each likelihood
    :param likelihood_list: likelihoods in a list
    :param kernel_cross_covariances: Cross covariances between different likelihoods. See class MultioutputKern for more
    :param inference_method: The :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference method to use for this GP
    """
    def __init__(self, X_list, Y_list, kernel_list, likelihood_list, name='multioutputgp', kernel_cross_covariances={}, inference_method=None):
        #Input and Output
        X,Y,self.output_index = util.multioutput.build_XY(X_list,Y_list)
        Ny = len(Y_list)
        assert isinstance(kernel_list, list)
        kernel = kern.MultioutputDerivativeKern(kernels=kernel_list, cross_covariances=kernel_cross_covariances)
        assert isinstance(likelihood_list, list)
        likelihood = likelihoods.MultioutputLikelihood(likelihood_list)
        if inference_method is None:
            if all([isinstance(l, Gaussian) for l in likelihood_list]):
                inference_method = exact_gaussian_inference.ExactGaussianInference()
            else:
                inference_method = expectation_propagation.EP() 
        super(MultioutputGP, self).__init__(X,Y,kernel,likelihood, Y_metadata={'output_index':self.output_index, 'trials':np.ones(self.output_index.shape)}, inference_method = inference_method)
    def predict_noiseless(self,  Xnew, full_cov=False, Y_metadata=None, kern=None):
        if isinstance(Xnew, list):
            Xnew, _, ind  = util.multioutput.build_XY(Xnew,None)
            if Y_metadata is None:
                Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
        return super(MultioutputGP, self).predict_noiseless(Xnew, full_cov, Y_metadata, kern)
    def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, likelihood=None, include_likelihood=True):
        if isinstance(Xnew, list):
            Xnew, _, ind = util.multioutput.build_XY(Xnew,None)
            if Y_metadata is None:
                Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
        return super(MultioutputGP, self).predict(Xnew, full_cov, Y_metadata, kern, likelihood, include_likelihood)
    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, kern=None, likelihood=None):
        if isinstance(X, list):
            X, _, ind  = util.multioutput.build_XY(X,None)
            if Y_metadata is None:
                Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
        return super(MultioutputGP, self).predict_quantiles(X, quantiles, Y_metadata, kern, likelihood)
    def predictive_gradients(self, Xnew, kern=None):
        if isinstance(Xnew, list):
            Xnew, _, ind  = util.multioutput.build_XY(Xnew, None)
            #if Y_metadata is None:
                #Y_metadata={'output_index': ind}
        return super(MultioutputGP, self).predictive_gradients(Xnew, kern)
    def predictive_gradients(self, Xnew, kern=None): #XNEW IS NOT A LIST!!
        """
        Compute the derivatives of the predicted latent function with respect to X*
        Given a set of points at which to predict X* (size [N*,Q]), compute the
        derivatives of the mean and variance. Resulting arrays are sized:
         dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
        Note that this is not the same as computing the mean and variance of the derivative of the function!
         dv_dX*  -- [N*, Q],    (since all outputs have the same variance)
        :param X: The points at which to get the predictive gradients
        :type X: np.ndarray (Xnew x self.input_dim)
        :returns: dmu_dX, dv_dX
        :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q) ]
        """
        if isinstance(Xnew, list):
            Xnew, _, ind  = util.multioutput.build_XY(Xnew, None)
        slices = index_to_slices(Xnew[:,-1])
        for i in range(len(slices)):
            if ((self.kern.kern[i].name == 'diffKern' ) and len(slices[i])>0):
                assert 0, "It is not (yet) possible to predict gradients of gradient observations, sorry :)"
        if kern is None:
            kern = self.kern
        mean_jac = np.empty((Xnew.shape[0],Xnew.shape[1]-1,self.output_dim))
        for i in range(self.output_dim):
            mean_jac[:,:,i] = kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self._predictive_variable)[:,0:-1]
        # gradients wrt the diagonal part k_{xx}
        dv_dX = kern.gradients_X(np.eye(Xnew.shape[0]), Xnew)[:,0:-1]
        #grads wrt 'Schur' part K_{xf}K_{ff}^{-1}K_{fx}
        if self.posterior.woodbury_inv.ndim == 3:
            tmp = np.empty(dv_dX.shape + (self.posterior.woodbury_inv.shape[2],))
            tmp[:] = dv_dX[:,:,None]
            for i in range(self.posterior.woodbury_inv.shape[2]):
                alpha = -2.*np.dot(kern.K(Xnew, self._predictive_variable), self.posterior.woodbury_inv[:, :, i])
                tmp[:, :, i] += kern.gradients_X(alpha, Xnew, self._predictive_variable)
        else:
            tmp = dv_dX
            alpha = -2.*np.dot(kern.K(Xnew, self._predictive_variable), self.posterior.woodbury_inv)
            tmp += kern.gradients_X(alpha, Xnew, self._predictive_variable)[:,0:-1]
        return mean_jac, tmp
    def log_predictive_density(self, x_test, y_test, Y_metadata=None):
        if isinstance(x_test, list):
            x_test, y_test, ind  = util.multioutput.build_XY(x_test, y_test)
            if Y_metadata is None:
                Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
        return super(MultioutputGP, self).log_predictive_density(x_test, y_test, Y_metadata)
    def set_XY(self, X=None, Y=None):
        if isinstance(X, list):
            X, _, self.output_index  = util.multioutput.build_XY(X, None)
        if isinstance(Y, list):
            _, Y, self.output_index  = util.multioutput.build_XY(Y, Y)      
        self.update_model(False)
        if Y is not None:
            self.Y = ObsAr(Y)
            self.Y_normalized = self.Y
        if X is not None:
            self.X = ObsAr(X)
        self.Y_metadata={'output_index': self.output_index, 'trials': np.ones(self.output_index.shape)}
        if isinstance(self.inference_method, expectation_propagation.EP):      
            self.inference_method.reset()
        self.update_model(True)
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@ -7,7 +7,7 @@ from unittest.case import skip
 import GPy
 from GPy.core.parameterization.param import Param
 import numpy as np
-
+import random
 from ..util.config import config
--- a/GPy/testing/link_function_tests.py
+++ b/GPy/testing/link_function_tests.py
@ -2,11 +2,12 @@ import numpy as np
 import scipy
 from scipy.special import cbrt
 from GPy.models import GradientChecker
 import random
 _lim_val = np.finfo(np.float64).max
 _lim_val_exp = np.log(_lim_val)
 _lim_val_square = np.sqrt(_lim_val)
 _lim_val_cube = cbrt(_lim_val)
-from GPy.likelihoods.link_functions import Identity, Probit, Cloglog, Log, Log_ex_1, Reciprocal, Heaviside
+from GPy.likelihoods.link_functions import Identity, Probit, Cloglog, Log, Log_ex_1, Reciprocal, Heaviside, ScaledProbit
 class LinkFunctionTests(np.testing.TestCase):
    def setUp(self):
@ -124,6 +125,11 @@ class LinkFunctionTests(np.testing.TestCase):
        lim_of_inf = _lim_val
        self.check_gradient(link, lim_of_inf, test_lim=True)
    def test_scaledprobit_gradients(self):
        link = ScaledProbit(nu=random.random())
        lim_of_inf = _lim_val
        self.check_gradient(link, lim_of_inf, test_lim=True)
    def test_Cloglog_gradients(self):
        link = Cloglog()
        lim_of_inf = _lim_val_exp
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@ -1181,6 +1181,67 @@ class GradientTests(np.testing.TestCase):
        with self.assertRaises(RuntimeError):
            m.posterior_covariance_between_points(np.array([[1], [2]]), np.array([[3], [4]]))
    def test_multioutput_model_with_derivative_observations(self):
        f = lambda x: np.sin(x)+0.1*(x-2.)**2-0.005*x**3
        fd = lambda x: np.cos(x)+0.2*(x-2.)-0.015*x**2
        N=10
        M=10
        sigma=0.05
        sigmader=0.05
        x = np.array([np.linspace(1,10,N)]).T
        y = f(x) + np.array(sigma*np.random.normal(0,1,(N,1)))
        xd = np.array([np.linspace(2,8,M)]).T
        yd = fd(xd) + np.array(sigmader*np.random.normal(0,1,(M,1)))
        # squared exponential kernel:
        se = GPy.kern.RBF(input_dim = 1, lengthscale=1.5, variance=0.2)
        # We need to generate separate kernel for the derivative observations and give the created kernel as an input:
        se_der = GPy.kern.DiffKern(se, 0)
        #Then 
        gauss = GPy.likelihoods.Gaussian(variance=sigma**2)
        gauss = GPy.likelihoods.Gaussian(variance=0.1)
        gauss_der = GPy.likelihoods.Gaussian(variance=sigma**2)
        # Then create the model, we give everything in lists, the order of the inputs indicates the order of the outputs
        # Now we have the regular observations first and derivative observations second, meaning that the kernels and
        # the likelihoods must follow the same order
        m = GPy.models.MultioutputGP(X_list=[x, xd], Y_list=[y, yd], kernel_list=[se, se_der], likelihood_list = [gauss, gauss])
        m.randomize()
        self.assertTrue(m.checkgrad())
        m.optimize(messages=0, ipython_notebook=False)
        self.assertTrue(m.checkgrad())
    def test_multioutput_model_with_ep(self):
        f = lambda x: np.sin(x)+0.1*(x-2.)**2-0.005*x**3
        fd = lambda x: np.cos(x)+0.2*(x-2.)-0.015*x**2
        N=10
        sigma=0.05
        sigmader=0.05
        x = np.array([np.linspace(1,10,N)]).T
        y = f(x) + np.array(sigma*np.random.normal(0,1,(N,1)))
        M=7
        xd = np.array([np.linspace(2,8,M)]).T
        yd = 2*(fd(xd)>0) -1
        # squared exponential kernel:
        se = GPy.kern.RBF(input_dim = 1, lengthscale=1.5, variance=0.2)
        # We need to generate separate kernel for the derivative observations and give the created kernel as an input:
        se_der = GPy.kern.DiffKern(se, 0)
        #Then 
        gauss = GPy.likelihoods.Gaussian(variance=sigma**2)
        probit = GPy.likelihoods.Binomial(gp_link = GPy.likelihoods.link_functions.ScaledProbit(nu=100))
        # Then create the model, we give everything in lists
        m = GPy.models.MultioutputGP(X_list=[x, xd], Y_list=[y, yd], kernel_list=[se, se_der], likelihood_list = [gauss, probit], inference_method=GPy.inference.latent_function_inference.EP(ep_mode="nested"))
        self.assertTrue(m.checkgrad())       
 def _create_missing_data_model(kernel, Q):
    D1, D2, D3, N, num_inducing = 13, 5, 8, 400, 3
    _, _, Ylist = GPy.examples.dimensionality_reduction._simulate_matern(D1, D2, D3, N, num_inducing, False)
--- a/GPy/util/multioutput.py
+++ b/GPy/util/multioutput.py
@ -2,6 +2,33 @@ import numpy as np
 import warnings
 import GPy
 def index_to_slices(index):
    """
    take a numpy array of integers (index) and return a  nested list of slices such that the slices describe the start, stop points for each integer in the index.
    e.g.
    >>> index = np.asarray([0,0,0,1,1,1,2,2,2])
    returns
    >>> [[slice(0,3,None)],[slice(3,6,None)],[slice(6,9,None)]]
    or, a more complicated example
    >>> index = np.asarray([0,0,1,1,0,2,2,2,1,1])
    returns
    >>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
    """
    if len(index)==0:
        return[]
    #contruct the return structure
    ind = np.asarray(index,dtype=np.int)
    ret = [[] for i in range(ind.max()+1)]
    #find the switchpoints
    ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
    switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
    [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
    return ret
 def get_slices(input_list):
    num_outputs = len(input_list)