Merge pull request #788 from SheffieldML/devel

Release 1.9.9
2026-07-08 16:12:15 +02:00 · 2019-10-17 08:55:24 +01:00 · 2019-10-17 08:55:24 +01:00 · 92f2e87e7b
commit 92f2e87e7b
parent e4848ce3ad 2c8245de40
60 changed files with 846 additions and 145 deletions
--- a/GPy/version.py
+++ b/GPy/version.py
@ -1 +1 @@
-__version__ = "1.9.8"
+__version__ = "1.9.9"
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -581,3 +581,56 @@ def warped_gp_cubic_sine(max_iters=100):
    m.plot(title="Standard GP")
    warp_m.plot_warping()
    pb.show()
+    return warp_m
+
+
+
+def multioutput_gp_with_derivative_observations():
+    def plot_gp_vs_real(m, x, yreal, size_inputs, title, fixed_input=1, xlim=[0,11], ylim=[-1.5,3]):
+        fig, ax = pb.subplots()
+        ax.set_title(title)
+        pb.plot(x, yreal, "r", label='Real function')
+        rows = slice(0, size_inputs[0]) if fixed_input == 0 else slice(size_inputs[0], size_inputs[0]+size_inputs[1])
+        m.plot(fixed_inputs=[(1, fixed_input)], which_data_rows=rows, xlim=xlim, ylim=ylim, ax=ax)
+    f = lambda x: np.sin(x)+0.1*(x-2.)**2-0.005*x**3
+    fd = lambda x: np.cos(x)+0.2*(x-2.)-0.015*x**2
+    N=10 # Number of observations
+    M=10 # Number of derivative observations
+    Npred=100 # Number of prediction points
+    sigma = 0.05 # Noise of observations
+    sigma_der = 0.05 # Noise of derivative observations
+    x = np.array([np.linspace(1,10,N)]).T
+    y = f(x) + np.array(sigma*np.random.normal(0,1,(N,1)))
+
+    xd = np.array([np.linspace(2,8,M)]).T
+    yd = fd(xd) + np.array(sigma_der*np.random.normal(0,1,(M,1)))
+
+    xpred = np.array([np.linspace(0,11,Npred)]).T
+    ypred_true = f(xpred)
+    ydpred_true = fd(xpred)
+
+    # squared exponential kernel:
+    se = GPy.kern.RBF(input_dim = 1, lengthscale=1.5, variance=0.2)
+    # We need to generate separate kernel for the derivative observations and give the created kernel as an input:
+    se_der = GPy.kern.DiffKern(se, 0)
+
+    #Then
+    gauss = GPy.likelihoods.Gaussian(variance=sigma**2)
+    gauss_der = GPy.likelihoods.Gaussian(variance=sigma_der**2)
+
+    # Then create the model, we give everything in lists, the order of the inputs indicates the order of the outputs
+    # Now we have the regular observations first and derivative observations second, meaning that the kernels and
+    # the likelihoods must follow the same order. Crosscovariances are automatically taken car of
+    m = GPy.models.MultioutputGP(X_list=[x, xd], Y_list=[y, yd], kernel_list=[se, se_der], likelihood_list = [gauss, gauss])
+
+    # Optimize the model
+    m.optimize(messages=0, ipython_notebook=False)
+
+    #Plot the model, the syntax is same as for multioutput models:
+    plot_gp_vs_real(m, xpred, ydpred_true, [x.shape[0], xd.shape[0]], title='Latent function derivatives', fixed_input=1, xlim=[0,11], ylim=[-1.5,3])
+    plot_gp_vs_real(m, xpred, ypred_true, [x.shape[0], xd.shape[0]], title='Latent function', fixed_input=0, xlim=[0,11], ylim=[-1.5,3])
+
+    #making predictions for the values:
+    mu, var = m.predict_noiseless(Xnew=[xpred, np.empty((0,1))])
+
+    return m
--- a/GPy/kern/init.py
+++ b/GPy/kern/init.py
@ -43,4 +43,6 @@ from .src.sde_standard_periodic import sde_StdPeriodic
 from .src.sde_static import sde_White, sde_Bias
 from .src.sde_stationary import sde_RBF,sde_Exponential,sde_RatQuad
 from .src.sde_brownian import sde_Brownian
-from .src.multioutput_kern import MultioutputKern
+from .src.multioutput_kern import MultioutputKern
+from .src.multioutput_derivative_kern import MultioutputDerivativeKern
+from .src.diff_kern import DiffKern
--- a/GPy/kern/src/ODE_UY.py
+++ b/GPy/kern/src/ODE_UY.py
@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 from .kern import Kern
-from .independent_outputs import index_to_slices
+from ...util.multioutput import index_to_slices
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
 import numpy as np
--- a/GPy/kern/src/ODE_UYC.py
+++ b/GPy/kern/src/ODE_UYC.py
@ -5,7 +5,7 @@ from .kern import Kern
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
 import numpy as np
-from .independent_outputs import index_to_slices
+from ...util.multioutput import index_to_slices

 class ODE_UYC(Kern):
    def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., ubias =1. ,active_dims=None, name='ode_uyc'):
--- a/GPy/kern/src/ODE_st.py
+++ b/GPy/kern/src/ODE_st.py
@ -4,7 +4,7 @@ from .kern import Kern
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
 import numpy as np
-from .independent_outputs import index_to_slices
+from ...util.multioutput import index_to_slices


 class ODE_st(Kern):
--- a/GPy/kern/src/ODE_t.py
+++ b/GPy/kern/src/ODE_t.py
@ -2,7 +2,7 @@ from .kern import Kern
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
 import numpy as np
-from .independent_outputs import index_to_slices
+from ...util.multioutput import index_to_slices


 class ODE_t(Kern):
--- a/GPy/kern/src/diff_kern.py
+++ b/GPy/kern/src/diff_kern.py
@ -0,0 +1,88 @@
+# Copyright (c) 2018, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+from .kern import Kern
+import numpy as np
+from paramz.caching import Cache_this
+
+class DiffKern(Kern):
+    """
+    Diff kernel is a thin wrapper for using partial derivatives of kernels as kernels. Eg. in combination with
+    Multioutput kernel this allows the user to train GPs with observations of latent function and latent
+    function derivatives. NOTE: DiffKern only works when used with Multioutput kernel. Do not use the kernel as standalone
+    
+    The parameters the kernel needs are:
+    -'base_kern': a member of Kernel class that is used for observations
+    -'dimension': integer that indigates in which dimensions the partial derivative observations are
+    """
+    def __init__(self, base_kern, dimension):
+        super(DiffKern, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='DiffKern')
+        self.base_kern = base_kern
+        self.dimension = dimension
+
+    def parameters_changed(self):
+        self.base_kern.parameters_changed()
+
+    @Cache_this(limit=3, ignore_args=())
+    def K(self, X, X2=None, dimX2 = None): #X in dimension self.dimension
+        if X2 is None:
+            X2 = X
+        if dimX2 is None:
+            dimX2 = self.dimension
+        return self.base_kern.dK2_dXdX2(X,X2, self.dimension, dimX2)
+ 
+    @Cache_this(limit=3, ignore_args=())
+    def Kdiag(self, X):
+        return np.diag(self.base_kern.dK2_dXdX2(X,X, self.dimension, self.dimension))
+    
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dX_wrap(self, X, X2): #X in dimension self.dimension
+        return self.base_kern.dK_dX(X,X2, self.dimension)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dX2_wrap(self, X, X2): #X in dimension self.dimension
+        return self.base_kern.dK_dX2(X,X2, self.dimension)
+
+    def reset_gradients(self):
+        self.base_kern.reset_gradients()
+
+    @property
+    def gradient(self):
+        return self.base_kern.gradient
+
+    @gradient.setter
+    def gradient(self, gradient):
+        self.base_kern.gradient = gradient
+
+    def update_gradients_full(self, dL_dK, X, X2=None, dimX2=None):
+        if dimX2 is None:
+            dimX2 = self.dimension
+        gradients = self.base_kern.dgradients2_dXdX2(X,X2,self.dimension,dimX2)
+        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
+
+    def update_gradients_diag(self, dL_dK_diag, X):
+        gradients = self.base_kern.dgradients2_dXdX2(X,X, self.dimension, self.dimension)
+        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK_diag, gradient, f=np.diag) for gradient in gradients])
+
+    def update_gradients_dK_dX(self, dL_dK, X, X2=None):
+        if X2 is None:
+            X2 = X
+        gradients = self.base_kern.dgradients_dX(X,X2, self.dimension)
+        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
+
+    def update_gradients_dK_dX2(self, dL_dK, X, X2=None):
+        gradients = self.base_kern.dgradients_dX2(X,X2, self.dimension)
+        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
+
+    def gradients_X(self, dL_dK, X, X2):
+        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,:, self.dimension]
+        return np.sum(tmp, axis=1)
+    
+    def gradients_X2(self, dL_dK, X, X2):
+        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:, :, self.dimension, :]
+        return np.sum(tmp, axis=1)
+
+    def _convert_gradients(self, l,g, f = lambda x:x):
+        if type(g) is np.ndarray:
+            return np.sum(f(l)*f(g))
+        else:
+            return np.array([np.sum(f(l)*f(gi)) for gi in g])
--- a/GPy/kern/src/independent_outputs.py
+++ b/GPy/kern/src/independent_outputs.py
@ -5,34 +5,8 @@
 from .kern import CombinationKernel
 import numpy as np
 import itertools
+from ...util.multioutput import index_to_slices

-def index_to_slices(index):
-    """
-    take a numpy array of integers (index) and return a  nested list of slices such that the slices describe the start, stop points for each integer in the index.
-
-    e.g.
-    >>> index = np.asarray([0,0,0,1,1,1,2,2,2])
-    returns
-    >>> [[slice(0,3,None)],[slice(3,6,None)],[slice(6,9,None)]]
-
-    or, a more complicated example
-    >>> index = np.asarray([0,0,1,1,0,2,2,2,1,1])
-    returns
-    >>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
-    """
-    if len(index)==0:
-        return[]
-
-    #contruct the return structure
-    ind = np.asarray(index,dtype=np.int)
-    ret = [[] for i in range(ind.max()+1)]
-
-    #find the switchpoints
-    ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
-    switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
-
-    [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
-    return ret

 class IndependentOutputs(CombinationKernel):
    """
--- a/GPy/kern/src/kern.py
+++ b/GPy/kern/src/kern.py
@ -116,6 +116,12 @@ class Kern(Parameterized):
        except:
            return X[:, self._all_dims_active]

+    def _project_dim(self, dim):
+        try:
+            return np.where(self._all_dims_active == dim)[0][0]
+        except:
+            return None
+
    def K(self, X, X2):
        """
        Compute the kernel function.
--- a/GPy/kern/src/kernel_slice_operations.py
+++ b/GPy/kern/src/kernel_slice_operations.py
@ -19,16 +19,26 @@ def put_clean(dct, name, func):
 class KernCallsViaSlicerMeta(ParametersChangedMeta):
    def __new__(cls, name, bases, dct):
        put_clean(dct, 'K', _slice_K)
+        put_clean(dct, 'dK_dX', _slice_dK_dX)
+        put_clean(dct, 'dK_dX2', _slice_dK_dX)
+        put_clean(dct, 'dK2_dXdX2', _slice_dK2_dXdX2)
        put_clean(dct, 'Kdiag', _slice_Kdiag)
        put_clean(dct, 'phi', _slice_Kdiag)
        put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
        put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
+        put_clean(dct, 'update_gradients_dK_dX', _slice_update_gradients_full)
+        put_clean(dct, 'update_gradients_dK_dX2', _slice_update_gradients_full)
        put_clean(dct, 'gradients_X', _slice_gradients_X)
+        put_clean(dct, 'gradients_X2', _slice_gradients_X)
        put_clean(dct, 'gradients_X_X2', _slice_gradients_X)
        put_clean(dct, 'gradients_XX', _slice_gradients_XX)
        put_clean(dct, 'gradients_XX_diag', _slice_gradients_XX_diag)
        put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)

+        put_clean(dct, 'dgradients_dX',_slice_partial_gradients_list_X)
+        put_clean(dct, 'dgradients_dX2',_slice_partial_gradients_list_X)
+        put_clean(dct, 'dgradients2_dXdX2',_slice_partial_gradients_list_XX)
+
        put_clean(dct, 'psi0', _slice_psi)
        put_clean(dct, 'psi1', _slice_psi)
        put_clean(dct, 'psi2', _slice_psi)
@ -79,6 +89,20 @@ class _Slice_wrap(object):
            return ret
        return return_val

+    def handle_return_list(self, return_val):
+        if self.ret:
+            ret = [np.zeros(self.shape) for _ in range(len(return_val))]
+            if len(self.shape) == 3:
+                for i in range(len(return_val)):
+                    ret[i][self.k._all_dims_active, :, :] = return_val[i]
+                #[ret[i].__setitem__((:, :, self.k._all_dims_active), return_val[i])  for i in range(len(return_val))]             
+            elif len(self.shape) == 4:
+                for i in range(len(return_val)):
+                    ret[i][np.ix_(self.k._all_dims_active, self.k._all_dims_active)] = return_val[i]
+                #[ret[i].__setitem__((:, :, self.k._all_dims_active, self.k._all_dims_active), return_val[i])  for i in range(len(return_val))]             
+            return ret
+        return return_val
+
 def _slice_K(f):
    @wraps(f)
    def wrap(self, X, X2 = None, *a, **kw):
@ -97,15 +121,15 @@ def _slice_Kdiag(f):

 def _slice_update_gradients_full(f):
    @wraps(f)
-    def wrap(self, dL_dK, X, X2=None):
+    def wrap(self, dL_dK, X, X2=None, *a, **kw):
        with _Slice_wrap(self, X, X2) as s:
-            ret = f(self, dL_dK, s.X, s.X2)
+            ret = f(self, dL_dK, s.X, s.X2, *a, **kw)
        return ret
    return wrap

 def _slice_update_gradients_diag(f):
    @wraps(f)
-    def wrap(self, dL_dKdiag, X):
+    def wrap(self, dL_dKdiag, X, *a, **kw):
        with _Slice_wrap(self, X, None) as s:
            ret = f(self, dL_dKdiag, s.X)
        return ret
@ -119,6 +143,99 @@ def _slice_gradients_X(f):
        return ret
    return wrap

+def _slice_dK_dX(f):
+    @wraps(f)
+    def wrap(self, X, X2, dim, *a, **kw):
+        with _Slice_wrap(self, X, X2) as s:
+            d = s.k._project_dim(dim)
+            if d is None:
+                ret = np.zeros((X.shape[0], X2.shape[0]))
+            else:
+                ret = f(self, s.X, s.X2, d, *a, **kw)
+        return ret
+    return wrap
+
+def _slice_dK2_dXdX2(f):
+    @wraps(f)
+    def wrap(self, X, X2, dimX, dimX2, *a, **kw):
+        with _Slice_wrap(self, X, X2) as s:
+            d = s.k._project_dim(dimX)
+            d2 = s.k._project_dim(dimX2)
+            if (d is None) or (d2 is None):
+                ret = np.zeros((X.shape[0], X2.shape[0]))
+            else:
+                ret = f(self, s.X, s.X2, d, d2, *a, **kw)
+        return ret
+    return wrap
+
+def _slice_partial_gradients_X(f):
+    @wraps(f)
+    def wrap(self, X, X2, dim):
+        if X2 is None:
+            N, M = X.shape[0], X.shape[0]
+        else:
+            N, M = X.shape[0], X2.shape[0]
+        Q1 = X.shape[1]
+        with _Slice_wrap(self, X, X2, ret_shape=(N, M, Q1)) as s:
+            ret = s.handle_return_array(f(self, s.X, s.X2, dim))
+        return ret
+    return wrap
+
+def _slice_partial_gradients_list_X(f):
+    @wraps(f)
+    def wrap(self, X, X2, dim):
+        if X2 is None:
+            N, M = X.shape[0], X.shape[0]
+        else:
+            N, M = X.shape[0], X2.shape[0]
+        with _Slice_wrap(self, X, X2, ret_shape=(N, M)) as s:
+            d = s.k._project_dim(dim)
+            if d is None:
+                ret = [np.zeros((N, M)) for i in range(s.k.size)]
+            else:            
+                ret = f(self, s.X, s.X2, d)
+        return ret
+    return wrap
+
+def _slice_partial_gradients_XX(f):
+    @wraps(f)
+    def wrap(self, X, X2, dim, dimX2):
+        if X2 is None:
+            N, M = X.shape[0], X.shape[0]
+            Q1 = X.shape[1]
+        else:
+            N, M = X.shape[0], X2.shape[0]
+            Q1, Q2 = X.shape[1], X2.shape[1]
+        with _Slice_wrap(self, X, X2, ret_shape=(N, M, Q1, Q2)) as s:
+            ret = s.handle_return_array(f(self, s.X, s.X2, dim, dimX2))
+        return ret
+    return wrap
+
+def _slice_partial_gradients_list_XX(f):
+    @wraps(f)
+    def wrap(self, X, X2, dimX, dimX2):
+        if X2 is None:
+            N, M = X.shape[0], X.shape[0]
+        else:
+            N, M = X.shape[0], X2.shape[0]
+        with _Slice_wrap(self, X, X2, ret_shape=(N, M)) as s:
+            d = s.k._project_dim(dimX)
+            d2 = s.k._project_dim(dimX2)
+            if (d is None) or (d2 is None):
+                ret = [np.zeros((N, M)) for i in range(s.k.size)]
+            else:
+                ret = f(self, s.X, s.X2, d, d2)
+        return ret
+    return wrap
+
+def _slice_gradient_derivatives_X(f):
+    @wraps(f)
+    def wrap(self, dL_dK, X, X2=None):
+        with _Slice_wrap(self, X, X2) as s:
+            ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
+        return ret
+    return wrap
+
 def _slice_gradients_X_diag(f):
    @wraps(f)
    def wrap(self, dL_dKdiag, X):
--- a/GPy/kern/src/multioutput_derivative_kern.py
+++ b/GPy/kern/src/multioutput_derivative_kern.py
@ -0,0 +1,84 @@
+# Copyright (c) 2018, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from .kern import Kern, CombinationKernel
+from .multioutput_kern import MultioutputKern, ZeroKern
+import numpy as np
+from functools import partial
+
+class KernWrapper(Kern):
+    def __init__(self, fk, fug, fg, base_kern):
+        self.fk = fk
+        self.fug = fug
+        self.fg = fg
+        self.base_kern = base_kern
+        super(KernWrapper, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='KernWrapper',useGPU=False)
+
+    def K(self, X, X2=None):
+        return self.fk(X,X2=X2)
+    
+    def update_gradients_full(self,dL_dK, X, X2=None):
+        return self.fug(dL_dK, X, X2=X2)
+    
+    def gradients_X(self,dL_dK, X, X2=None):
+        return self.fg(dL_dK, X, X2=X2)
+
+    @property
+    def gradient(self):
+        return self.base_kern.gradient
+
+    @gradient.setter
+    def gradient(self, gradient):
+        self.base_kern.gradient = gradient
+
+class MultioutputDerivativeKern(MultioutputKern):
+    """
+    Multioutput derivative kernel is a meta class for combining different kernels for multioutput GPs.
+    Multioutput derivative kernel is only a thin wrapper for Multioutput kernel for user not having to define
+    cross covariances.
+    """
+    def __init__(self, kernels, cross_covariances={}, name='MultioutputDerivativeKern'):
+        #kernels contains a list of kernels as input, 
+        if not isinstance(kernels, list):
+            self.single_kern = True
+            self.kern = kernels
+            kernels = [kernels]
+        else:
+            self.single_kern = False
+            self.kern = kernels
+        # The combination kernel ALLWAYS puts the extra dimension last.
+        # Thus, the index dimension of this kernel is always the last dimension
+        # after slicing. This is why the index_dim is just the last column:
+        self.index_dim = -1
+        super(MultioutputKern, self).__init__(kernels=kernels, extra_dims=[self.index_dim], name=name, link_parameters=False)
+
+        nl = len(kernels)
+         
+        #build covariance structure
+        covariance = [[None for i in range(nl)] for j in range(nl)]
+        linked = []
+        for i in range(0,nl):
+            unique=True
+            for j in range(0,nl):
+                if i==j or (kernels[i] is kernels[j]):
+                    kern = kernels[i]
+                    if i>j:
+                        unique=False
+                elif cross_covariances.get((i,j)) is not None: #cross covariance is given
+                    kern = cross_covariances.get((i,j))
+                elif kernels[i].name == 'DiffKern' and kernels[i].base_kern == kernels[j]: # one is derivative of other
+                    kern = KernWrapper(kernels[i].dK_dX_wrap,kernels[i].update_gradients_dK_dX,kernels[i].gradients_X, kernels[j])
+                    unique=False
+                elif kernels[j].name == 'DiffKern' and kernels[j].base_kern == kernels[i]: # one is derivative of other
+                    kern = KernWrapper(kernels[j].dK_dX2_wrap,kernels[j].update_gradients_dK_dX2,kernels[j].gradients_X2, kernels[i])
+                elif kernels[i].name == 'DiffKern' and kernels[j].name == 'DiffKern' and kernels[i].base_kern == kernels[j].base_kern: #both are partial derivatives
+                    kern = KernWrapper(partial(kernels[i].K, dimX2=kernels[j].dimension), partial(kernels[i].update_gradients_full, dimX2=kernels[j].dimension),None, kernels[i].base_kern)
+                    if i>j:
+                        unique=False
+                else:
+                    kern = ZeroKern()
+                covariance[i][j] = kern
+            if unique is True:
+                linked.append(i)
+        self.covariance = covariance
+        self.link_parameters(*[kernels[i] for i in linked])
--- a/GPy/kern/src/multioutput_kern.py
+++ b/GPy/kern/src/multioutput_kern.py
@ -1,7 +1,10 @@
+# Copyright (c) 2018, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
 from .kern import Kern, CombinationKernel
 import numpy as np
 from functools import reduce, partial
-from .independent_outputs import index_to_slices
+from ...util.multioutput import index_to_slices
 from paramz.caching import Cache_this

 class ZeroKern(Kern):
@ -18,6 +21,13 @@ class ZeroKern(Kern):
    
    def gradients_X(self,dL_dK, X, X2=None):
        return np.zeros((X.shape[0],X.shape[1]))
+    @property
+    def gradient(self):
+        return np.empty((1,0))
+
+    @gradient.setter
+    def gradient(self, gradient):
+        pass
        
 class MultioutputKern(CombinationKernel):
    """
@ -62,14 +72,13 @@ class MultioutputKern(CombinationKernel):
            unique=True
            for j in range(0,nl):
                if i==j or (kernels[i] is kernels[j]):
-                    covariance[i][j] = {'kern': kernels[i], 'K': kernels[i].K, 'update_gradients_full': kernels[i].update_gradients_full, 'gradients_X': kernels[i].gradients_X}
+                    covariance[i][j] = kernels[i]
                    if i>j:
                        unique=False
                elif cross_covariances.get((i,j)) is not None: #cross covariance is given
                    covariance[i][j] = cross_covariances.get((i,j))
                else: # zero covariance structure
-                    kern = ZeroKern()
-                    covariance[i][j] = {'kern': kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X}       
+                    covariance[i][j] = ZeroKern()
            if unique is True:
                linked.append(i)
        self.covariance = covariance
@ -82,7 +91,7 @@ class MultioutputKern(CombinationKernel):
        slices = index_to_slices(X[:,self.index_dim])
        slices2 = index_to_slices(X2[:,self.index_dim])
        target =  np.zeros((X.shape[0], X2.shape[0]))
-        [[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j]['K'](X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))]  
+        [[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j].K(X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))]  
        return target

    @Cache_this(limit=3, ignore_args=())
@ -93,10 +102,10 @@ class MultioutputKern(CombinationKernel):
        [[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)]
        return target
    
-    def _update_gradients_full_wrapper(self, cov_struct, dL_dK, X, X2):
-        gradient = cov_struct['kern'].gradient.copy()
-        cov_struct['update_gradients_full'](dL_dK, X, X2)
-        cov_struct['kern'].gradient += gradient
+    def _update_gradients_full_wrapper(self, kern, dL_dK, X, X2):
+        gradient = kern.gradient.copy()
+        kern.update_gradients_full(dL_dK, X, X2)
+        kern.gradient += gradient
    
    def _update_gradients_diag_wrapper(self, kern, dL_dKdiag, X):
        gradient = kern.gradient.copy()
@ -118,14 +127,14 @@ class MultioutputKern(CombinationKernel):
    def update_gradients_diag(self, dL_dKdiag, X):
        self.reset_gradients()
        slices = index_to_slices(X[:,self.index_dim])
-        [[ self._update_gradients_diag_wrapper(self.covariance[i][i]['kern'], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))]
+        [[ self._update_gradients_diag_wrapper(self.covariance[i][i], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))]
    
    def gradients_X(self,dL_dK, X, X2=None):
        slices = index_to_slices(X[:,self.index_dim])
        target = np.zeros((X.shape[0], X.shape[1]) )
        if X2 is not None:
            slices2 = index_to_slices(X2[:,self.index_dim])
-            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j]['gradients_X'](dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) ) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
+            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) ) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
        else:
-            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j]['gradients_X'](dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], (None if (i==j and k==l) else X[slices[j][l],:] )) ) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
+            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], (None if (i==j and k==l) else X[slices[j][l],:] )) ) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
        return target
--- a/GPy/kern/src/rbf.py
+++ b/GPy/kern/src/rbf.py
@ -6,6 +6,7 @@ import numpy as np
 from .stationary import Stationary
 from .psi_comp import PSICOMP_RBF, PSICOMP_RBF_GPU
 from ...core import Param
+from paramz.caching import Cache_this
 from paramz.transformations import Logexp
 from .grid_kerns import GridRBF

@ -50,6 +51,27 @@ class RBF(Stationary):
    def K_of_r(self, r):
        return self.variance * np.exp(-0.5 * r**2)

+    @Cache_this(limit=3, ignore_args=())
+    def dK_dX(self, X, X2, dimX):
+        r = self._scaled_dist(X, X2)
+        K = self.K_of_r(r)
+        dist = X[:,None,dimX]-X2[None,:,dimX]
+        lengthscale2inv = (np.ones((X.shape[1]))/(self.lengthscale**2))[dimX]
+        return -1.*K*dist*lengthscale2inv
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dX2(self, X, X2, dimX2):
+        return -self.dK_dX(X,X2, dimX2)
+    
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dXdX2(self, X, X2, dimX, dimX2):
+        r = self._scaled_dist(X, X2)
+        K = self.K_of_r(r)
+        if X2 is None:
+            X2=X
+        dist = X[:,None,:]-X2[None,:,:]
+        lengthscale2inv = np.ones((X.shape[1]))/(self.lengthscale**2)
+        return -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX]*lengthscale2inv[dimX2] + (dimX==dimX2)*K*lengthscale2inv[dimX]
+
    def dK_dr(self, r):
        return -r*self.K_of_r(r)

@ -58,6 +80,74 @@ class RBF(Stationary):

    def dK2_drdr_diag(self):
        return -self.variance # as the diagonal of r is always filled with zeros
+    
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dvariance(self,X,X2):
+        return self.K(X,X2)/self.variance
+    
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dvariancedX(self, X, X2, dim):
+        return self.dK_dX(X,X2, dim)/self.variance
+    
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dvariancedX2(self, X, X2, dim):
+        return self.dK_dX2(X,X2, dim)/self.variance
+    
+    @Cache_this(limit=3, ignore_args=())
+    def dK3_dvariancedXdX2(self, X, X2, dim, dimX2):
+        return self.dK2_dXdX2(X, X2, dim, dimX2)/self.variance
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dlengthscaledX(self, X, X2, dimX):
+        r = self._scaled_dist(X, X2)
+        K = self.K_of_r(r)
+        if X2 is None:
+            X2=X
+        dist = X[:,None,:]-X2[None,:,:]
+        lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
+        if self.ARD:
+            g = []
+            for diml in range(X.shape[1]):
+                g += [-1.*K*dist[:,:,dimX]*(dist[:,:,diml]**2)*(lengthscaleinv[dimX]**2)*(lengthscaleinv[diml]**3) + 2.*dist[:,:,dimX]*(lengthscaleinv[diml]**3)*K*(dimX == diml)]
+        else:
+            g = -1.*K*dist[:,:,dimX]*np.sum(dist**2, axis=2)*(lengthscaleinv[dimX]**5) + 2.*dist[:,:,dimX]*(lengthscaleinv[dimX]**3)*K
+        return g
+    
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dlengthscaledX2(self, X, X2, dimX2):
+        tmp = self.dK2_dlengthscaledX(X, X2, dimX2)
+        if self.ARD:
+            return [-1.*g for g in tmp]
+        else:
+            return -1*tmp
+    
+    @Cache_this(limit=3, ignore_args=())
+    def dK3_dlengthscaledXdX2(self, X, X2, dimX, dimX2):
+        r = self._scaled_dist(X, X2)
+        K = self.K_of_r(r)
+        if X2 is None:
+            X2=X
+        dist = X[:,None,:]-X2[None,:,:]
+        lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
+        lengthscale2inv = lengthscaleinv**2
+        if self.ARD:
+            g = []
+            for diml in range(X.shape[1]):
+                tmp = -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*(dist[:,:,diml]**2)*lengthscale2inv[dimX]*lengthscale2inv[dimX2]*(lengthscaleinv[diml]**3)
+                if dimX == dimX2:
+                    tmp += K*lengthscale2inv[dimX]*(lengthscaleinv[diml]**3)*(dist[:,:,diml]**2)
+                if diml == dimX:
+                    tmp += 2.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX2]*(lengthscaleinv[dimX]**3)
+                if diml == dimX2:
+                    tmp += 2.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX]*(lengthscaleinv[dimX2]**3)
+                    if dimX == dimX2:
+                        tmp += -2.*K*(lengthscaleinv[dimX]**3)
+                g += [tmp]
+        else:
+            g = -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*np.sum(dist**2, axis=2)*(lengthscaleinv[dimX]**7) +4*K*dist[:,:,dimX]*dist[:,:,dimX2]*(lengthscaleinv[dimX]**5)
+            if dimX == dimX2:
+                g += -2.*K*(lengthscaleinv[dimX]**3) + K*(lengthscaleinv[dimX]**5)*np.sum(dist**2, axis=2)
+        return g

    def __getstate__(self):
        dc = super(RBF, self).__getstate__()
--- a/GPy/kern/src/splitKern.py
+++ b/GPy/kern/src/splitKern.py
@ -4,7 +4,7 @@ A new kernel

 import numpy as np
 from .kern import Kern, CombinationKernel
-from .independent_outputs import index_to_slices
+from ...util.multioutput import index_to_slices
 import itertools

 class DEtime(Kern):
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@ -212,7 +212,6 @@ class Stationary(Kern):
            r = self._scaled_dist(X, X2)
            self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale

-
    def update_gradients_direct(self, dL_dVar, dL_dLen):
        """
        Specially intended for the Grid regression case.
@ -307,6 +306,21 @@ class Stationary(Kern):
        l4 =  np.ones(X.shape[1])*self.lengthscale**2
        return dL_dK_diag * (np.eye(X.shape[1]) * -self.dK2_drdr_diag()/(l4))[None, :,:]# np.zeros(X.shape+(X.shape[1],))
        #return np.ones(X.shape) * d2L_dK * self.variance/self.lengthscale**2 # np.zeros(X.shape)
+    
+    def dgradients_dX(self, X, X2, dimX):
+        g1 = self.dK2_dvariancedX(X, X2, dimX)
+        g2 = self.dK2_dlengthscaledX(X, X2, dimX)
+        return [g1, g2]
+
+    def dgradients_dX2(self, X, X2, dimX2):
+        g1 = self.dK2_dvariancedX2(X, X2, dimX2)
+        g2 = self.dK2_dlengthscaledX2(X, X2, dimX2)
+        return [g1, g2]
+
+    def dgradients2_dXdX2(self, X, X2, dimX, dimX2):
+        g1 = self.dK3_dvariancedXdX2(X, X2, dimX, dimX2)
+        g2 = self.dK3_dlengthscaledXdX2(X, X2, dimX, dimX2)
+        return [g1, g2]

    def _gradients_X_pure(self, dL_dK, X, X2=None):
        invdist = self._inv_dist(X, X2)
--- a/GPy/likelihoods/binomial.py
+++ b/GPy/likelihoods/binomial.py
@ -66,7 +66,6 @@ class Binomial(Likelihood):
        np.testing.assert_array_equal(N.shape, y.shape)

        nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1)
-        
        Ny = N-y
        t1 = np.zeros(y.shape)
        t2 = np.zeros(y.shape)
@ -177,6 +176,41 @@ class Binomial(Likelihood):

    def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
        pass
+    
+    def moments_match_ep(self,obs,tau,v,Y_metadata_i=None):
+        """
+        Calculation of moments using quadrature
+        :param obs: observed output
+        :param tau: cavity distribution 1st natural parameter (precision)
+        :param v: cavity distribution 2nd natural paramenter (mu*precision)
+        """
+        #Compute first integral for zeroth moment.
+        #NOTE constant np.sqrt(2*pi/tau) added at the end of the function
+        if (isinstance(self.gp_link, link_functions.Probit) or isinstance(self.gp_link, link_functions.ScaledProbit)) and (Y_metadata_i is None or int(Y_metadata_i.get('trials', 1)) == int(1)): #Special case for probit likelihood. Can be found from Riihimaki et Vehtari 2010
+            if isinstance(self.gp_link, link_functions.ScaledProbit):
+                nu = self.gp_link.nu
+            else:
+                nu = 1.0
+            nu = self.gp_link.nu
+            mu = v/tau
+            sigma2 = 1./tau
+            t = np.asarray(1 + sigma2*(nu**2))
+            t[t<1e-20] = 1e-20
+            a = np.sqrt(t)
+            z = obs*mu/a
+            normc_z = max(self.gp_link.transf(z), 1e-20)
+            m0 = normc_z
+            normp_z = self.gp_link.dtransf_df(z)
+            m1 = mu + (obs*sigma2*normp_z)/(normc_z*a)
+            #print('tau: {}, v: {}, nu: {}, z: {}, normc_z: {}, normp_z: {}'.format(tau, v, nu.values, z, normc_z, normp_z))
+            m2 = sigma2 - ((sigma2**2)*normp_z)/((1./(nu**2)+sigma2)*normc_z)*(z + normp_z/(nu**2)/normc_z)
+            #print("m0: {}, m1: {}, m2: {}".format(m0,m1,m2))
+            #m0a, m1a, m2a =  super(Binomial, self).moments_match_ep(obs,tau,v,Y_metadata_i)
+            #print("m0a: {}, m1a: {}, m2a: {}".format(m0a,m1a,m2a))
+            return m0, m1, m2
+        else:
+            return super(Binomial, self).moments_match_ep(obs,tau,v,Y_metadata_i)
+    
    def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
        if isinstance(self.gp_link, link_functions.Probit):

--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@ -296,7 +296,7 @@ class Likelihood(Parameterized):
            elif quad_mode == 'gh':
                f = partial(self.integrate_gh)
                quads = zip(*map(f, Y.flatten(), mu.flatten(), np.sqrt(sigma2.flatten())))
-                quads = np.hstack(quads)
+                quads = np.hstack(list(quads))
                quads = quads.T
            else:
                raise Exception("no other quadrature mode available")
--- a/GPy/likelihoods/link_functions.py
+++ b/GPy/likelihoods/link_functions.py
@ -138,6 +138,38 @@ class Probit(GPTransformation):
        input_dict["class"] = "GPy.likelihoods.link_functions.Probit"
        return input_dict

+class ScaledProbit(Probit):
+    """
+    .. math::
+        g(f) = \\Phi^{-1} (nu*mu)
+    """
+    def __init__(self, nu=1.):
+        self.nu = float(nu)
+        
+    def transf(self,f):
+        return std_norm_cdf(f*self.nu)
+
+    def dtransf_df(self,f):
+        return std_norm_pdf(f*self.nu)*self.nu
+
+    def d2transf_df2(self,f):
+        return -(f*self.nu) * std_norm_pdf(f*self.nu)*(self.nu**2)
+
+    def d3transf_df3(self,f):
+        return (safe_square(f*self.nu)-1.)*std_norm_pdf(f*self.nu)*(self.nu**3)
+    
+    def to_dict(self):
+        """
+        Convert the object into a json serializable dictionary.
+
+        Note: It uses the private method _save_to_input_dict of the parent.
+
+        :return dict: json serializable dictionary containing the needed information to instantiate the object
+        """
+
+        input_dict = super(ScaledProbit, self)._save_to_input_dict()
+        input_dict["class"] = "GPy.likelihoods.link_functions.ScaledProbit"
+        return input_dict

 class Cloglog(GPTransformation):
    """
--- a/GPy/likelihoods/multioutput_likelihood.py
+++ b/GPy/likelihoods/multioutput_likelihood.py
@ -14,7 +14,7 @@ from .gaussian import Gaussian
 from ..core.parameterization import Param
 from paramz.transformations import Logexp
 from ..core.parameterization import Parameterized
-from ..kern.src.independent_outputs import index_to_slices
+from ..util.multioutput import index_to_slices
 import itertools

 class MultioutputLikelihood(MixedNoise):
--- a/GPy/models/init.py
+++ b/GPy/models/init.py
@ -30,3 +30,4 @@ from .gp_grid_regression import GPRegressionGrid
 from .gp_multiout_regression import GPMultioutRegression
 from .gp_multiout_regression_md import GPMultioutRegressionMD
 from .tp_regression import TPRegression
+from .multioutput_gp import MultioutputGP
--- a/GPy/models/multioutput_gp.py
+++ b/GPy/models/multioutput_gp.py
@ -0,0 +1,147 @@
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import itertools
+from ..core.model import Model
+from ..core.parameterization.variational import VariationalPosterior
+from ..core.mapping import Mapping
+from .. import likelihoods
+from ..likelihoods.gaussian import Gaussian
+from .. import kern
+from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
+from ..util.normalizer import Standardize
+from .. import util
+from paramz import ObsAr
+from ..core.gp import GP
+
+from ..util.multioutput import index_to_slices
+import logging
+import warnings
+logger = logging.getLogger("GP")
+
+class MultioutputGP(GP):
+    """
+    Gaussian process model for using observations from multiple likelihoods and different kernels
+    :param X_list: input observations in a list for each likelihood
+    :param Y: output observations in a list for each likelihood
+    :param kernel_list: kernels in a list for each likelihood
+    :param likelihood_list: likelihoods in a list
+    :param kernel_cross_covariances: Cross covariances between different likelihoods. See class MultioutputKern for more
+    :param inference_method: The :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference method to use for this GP
+    """
+    def __init__(self, X_list, Y_list, kernel_list, likelihood_list, name='multioutputgp', kernel_cross_covariances={}, inference_method=None):
+        #Input and Output
+        X,Y,self.output_index = util.multioutput.build_XY(X_list,Y_list)
+        Ny = len(Y_list)
+        
+        assert isinstance(kernel_list, list)
+        kernel = kern.MultioutputDerivativeKern(kernels=kernel_list, cross_covariances=kernel_cross_covariances)
+
+        assert isinstance(likelihood_list, list)
+        likelihood = likelihoods.MultioutputLikelihood(likelihood_list)
+        
+        if inference_method is None:
+            if all([isinstance(l, Gaussian) for l in likelihood_list]):
+                inference_method = exact_gaussian_inference.ExactGaussianInference()
+            else:
+                inference_method = expectation_propagation.EP() 
+        
+        super(MultioutputGP, self).__init__(X,Y,kernel,likelihood, Y_metadata={'output_index':self.output_index, 'trials':np.ones(self.output_index.shape)}, inference_method = inference_method)
+
+    def predict_noiseless(self,  Xnew, full_cov=False, Y_metadata=None, kern=None):
+        if isinstance(Xnew, list):
+            Xnew, _, ind  = util.multioutput.build_XY(Xnew,None)
+            if Y_metadata is None:
+                Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
+        return super(MultioutputGP, self).predict_noiseless(Xnew, full_cov, Y_metadata, kern)
+    
+    def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, likelihood=None, include_likelihood=True):
+        if isinstance(Xnew, list):
+            Xnew, _, ind = util.multioutput.build_XY(Xnew,None)
+            if Y_metadata is None:
+                Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
+        return super(MultioutputGP, self).predict(Xnew, full_cov, Y_metadata, kern, likelihood, include_likelihood)
+    
+    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, kern=None, likelihood=None):
+        if isinstance(X, list):
+            X, _, ind  = util.multioutput.build_XY(X,None)
+            if Y_metadata is None:
+                Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
+        return super(MultioutputGP, self).predict_quantiles(X, quantiles, Y_metadata, kern, likelihood)
+    
+    def predictive_gradients(self, Xnew, kern=None):
+        if isinstance(Xnew, list):
+            Xnew, _, ind  = util.multioutput.build_XY(Xnew, None)
+            #if Y_metadata is None:
+                #Y_metadata={'output_index': ind}
+        return super(MultioutputGP, self).predictive_gradients(Xnew, kern)
+
+    def predictive_gradients(self, Xnew, kern=None): #XNEW IS NOT A LIST!!
+        """
+        Compute the derivatives of the predicted latent function with respect to X*
+        Given a set of points at which to predict X* (size [N*,Q]), compute the
+        derivatives of the mean and variance. Resulting arrays are sized:
+         dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
+        Note that this is not the same as computing the mean and variance of the derivative of the function!
+         dv_dX*  -- [N*, Q],    (since all outputs have the same variance)
+        :param X: The points at which to get the predictive gradients
+        :type X: np.ndarray (Xnew x self.input_dim)
+        :returns: dmu_dX, dv_dX
+        :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q) ]
+        """
+        
+        if isinstance(Xnew, list):
+            Xnew, _, ind  = util.multioutput.build_XY(Xnew, None)
+        
+        slices = index_to_slices(Xnew[:,-1])
+        
+        for i in range(len(slices)):
+            if ((self.kern.kern[i].name == 'diffKern' ) and len(slices[i])>0):
+                assert 0, "It is not (yet) possible to predict gradients of gradient observations, sorry :)"
+ 
+        if kern is None:
+            kern = self.kern
+        mean_jac = np.empty((Xnew.shape[0],Xnew.shape[1]-1,self.output_dim))
+        for i in range(self.output_dim):
+            mean_jac[:,:,i] = kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self._predictive_variable)[:,0:-1]
+
+        # gradients wrt the diagonal part k_{xx}
+        dv_dX = kern.gradients_X(np.eye(Xnew.shape[0]), Xnew)[:,0:-1]
+        #grads wrt 'Schur' part K_{xf}K_{ff}^{-1}K_{fx}
+        if self.posterior.woodbury_inv.ndim == 3:
+            tmp = np.empty(dv_dX.shape + (self.posterior.woodbury_inv.shape[2],))
+            tmp[:] = dv_dX[:,:,None]
+            for i in range(self.posterior.woodbury_inv.shape[2]):
+                alpha = -2.*np.dot(kern.K(Xnew, self._predictive_variable), self.posterior.woodbury_inv[:, :, i])
+                tmp[:, :, i] += kern.gradients_X(alpha, Xnew, self._predictive_variable)
+        else:
+            tmp = dv_dX
+            alpha = -2.*np.dot(kern.K(Xnew, self._predictive_variable), self.posterior.woodbury_inv)
+            tmp += kern.gradients_X(alpha, Xnew, self._predictive_variable)[:,0:-1]
+        return mean_jac, tmp
+    
+    def log_predictive_density(self, x_test, y_test, Y_metadata=None):
+        if isinstance(x_test, list):
+            x_test, y_test, ind  = util.multioutput.build_XY(x_test, y_test)
+            if Y_metadata is None:
+                Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
+        return super(MultioutputGP, self).log_predictive_density(x_test, y_test, Y_metadata)
+    
+    def set_XY(self, X=None, Y=None):
+        if isinstance(X, list):
+            X, _, self.output_index  = util.multioutput.build_XY(X, None)
+        if isinstance(Y, list):
+            _, Y, self.output_index  = util.multioutput.build_XY(Y, Y)      
+                
+        self.update_model(False)
+        if Y is not None:
+            self.Y = ObsAr(Y)
+            self.Y_normalized = self.Y
+        if X is not None:
+            self.X = ObsAr(X)
+            
+        self.Y_metadata={'output_index': self.output_index, 'trials': np.ones(self.output_index.shape)}
+        if isinstance(self.inference_method, expectation_propagation.EP):      
+            self.inference_method.reset()
+        self.update_model(True)
--- a/GPy/plotting/matplot_dep/base_plots.py
+++ b/GPy/plotting/matplot_dep/base_plots.py
@ -3,6 +3,8 @@
 from matplotlib import pyplot as plt
 import numpy as np

+from .util import align_subplot_array, align_subplots
+
 def ax_default(fignum, ax):
    if ax is None:
        fig = plt.figure(fignum)
@ -50,73 +52,73 @@ def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
        kwargs['linewidth'] = 0.5
    if not 'alpha' in kwargs.keys():
        kwargs['alpha'] = 1./(len(percentiles))
-    
+
    # pop where from kwargs
    where = kwargs.pop('where') if 'where' in kwargs else None
    # pop interpolate, which we actually do not do here!
    if 'interpolate' in kwargs: kwargs.pop('interpolate')
-    
+
    def pairwise(inlist):
        l = len(inlist)
        for i in range(int(np.ceil(l/2.))):
            yield inlist[:][i], inlist[:][(l-1)-i]
-    
+
    polycol = []
    for y1, y2 in pairwise(percentiles):
        import matplotlib.mlab as mlab
        # Handle united data, such as dates
        ax._process_unit_info(xdata=x, ydata=y1)
        ax._process_unit_info(ydata=y2)
-    
+
        # Convert the arrays so we can work with them
        from numpy import ma
        x = ma.masked_invalid(ax.convert_xunits(x))
        y1 = ma.masked_invalid(ax.convert_yunits(y1))
        y2 = ma.masked_invalid(ax.convert_yunits(y2))
-    
+
        if y1.ndim == 0:
            y1 = np.ones_like(x) * y1
        if y2.ndim == 0:
            y2 = np.ones_like(x) * y2
-    
+
        if where is None:
            where = np.ones(len(x), np.bool)
        else:
            where = np.asarray(where, np.bool)
-    
+
        if not (x.shape == y1.shape == y2.shape == where.shape):
            raise ValueError("Argument dimensions are incompatible")
-    
+
        mask = reduce(ma.mask_or, [ma.getmask(a) for a in (x, y1, y2)])
        if mask is not ma.nomask:
            where &= ~mask
-        
+
        polys = []
        for ind0, ind1 in mlab.contiguous_regions(where):
            xslice = x[ind0:ind1]
            y1slice = y1[ind0:ind1]
            y2slice = y2[ind0:ind1]
-    
+
            if not len(xslice):
                continue
-    
+
            N = len(xslice)
            X = np.zeros((2 * N + 2, 2), np.float)
-    
+
            # the purpose of the next two lines is for when y2 is a
            # scalar like 0 and we want the fill to go all the way
            # down to 0 even if none of the y1 sample points do
            start = xslice[0], y2slice[0]
            end = xslice[-1], y2slice[-1]
-    
+
            X[0] = start
            X[N + 1] = end
-    
+
            X[1:N + 1, 0] = xslice
            X[1:N + 1, 1] = y1slice
            X[N + 2:, 0] = xslice[::-1]
            X[N + 2:, 1] = y2slice[::-1]
-    
+
            polys.append(X)
        polycol.extend(polys)
    from matplotlib.collections import PolyCollection
@ -167,65 +169,6 @@ def fewerXticks(ax=None,divideby=2):
    ax = ax or plt.gca()
    ax.set_xticks(ax.get_xticks()[::divideby])

-def align_subplots(N,M,xlim=None, ylim=None):
-    """make all of the subplots have the same limits, turn off unnecessary ticks"""
-    #find sensible xlim,ylim
-    if xlim is None:
-        xlim = [np.inf,-np.inf]
-        for i in range(N*M):
-            plt.subplot(N,M,i+1)
-            xlim[0] = min(xlim[0],plt.xlim()[0])
-            xlim[1] = max(xlim[1],plt.xlim()[1])
-    if ylim is None:
-        ylim = [np.inf,-np.inf]
-        for i in range(N*M):
-            plt.subplot(N,M,i+1)
-            ylim[0] = min(ylim[0],plt.ylim()[0])
-            ylim[1] = max(ylim[1],plt.ylim()[1])
-
-    for i in range(N*M):
-        plt.subplot(N,M,i+1)
-        plt.xlim(xlim)
-        plt.ylim(ylim)
-        if (i)%M:
-            plt.yticks([])
-        else:
-            removeRightTicks()
-        if i<(M*(N-1)):
-            plt.xticks([])
-        else:
-            removeUpperTicks()
-
-def align_subplot_array(axes,xlim=None, ylim=None):
-    """
-    Make all of the axes in the array hae the same limits, turn off unnecessary ticks
-    use plt.subplots() to get an array of axes
-    """
-    #find sensible xlim,ylim
-    if xlim is None:
-        xlim = [np.inf,-np.inf]
-        for ax in axes.flatten():
-            xlim[0] = min(xlim[0],ax.get_xlim()[0])
-            xlim[1] = max(xlim[1],ax.get_xlim()[1])
-    if ylim is None:
-        ylim = [np.inf,-np.inf]
-        for ax in axes.flatten():
-            ylim[0] = min(ylim[0],ax.get_ylim()[0])
-            ylim[1] = max(ylim[1],ax.get_ylim()[1])
-
-    N,M = axes.shape
-    for i,ax in enumerate(axes.flatten()):
-        ax.set_xlim(xlim)
-        ax.set_ylim(ylim)
-        if (i)%M:
-            ax.set_yticks([])
-        else:
-            removeRightTicks(ax)
-        if i<(M*(N-1)):
-            ax.set_xticks([])
-        else:
-            removeUpperTicks(ax)
-
 def x_frame1D(X,plot_limits=None,resolution=None):
    """
    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
--- a/GPy/plotting/plotly_dep/defaults.py
+++ b/GPy/plotting/plotly_dep/defaults.py
@ -44,7 +44,7 @@ it gives back an empty default, when defaults are not defined.

 # Data plots:
 data_1d = dict(marker_kwargs=dict(), marker='x', color='black')
-data_2d = dict(marker='o', cmap='Hot', marker_kwargs=dict(opacity=1., size='5', line=Line(width=.5, color='black')))
+data_2d = dict(marker='o', cmap='Hot', marker_kwargs=dict(opacity=1., size=5, line=Line(width=.5, color='black')))
 inducing_1d = dict(color=Tango.colorsHex['darkRed'])
 inducing_2d = dict(marker_kwargs=dict(size='5', opacity=.7, line=Line(width=.5, color='black')), opacity=.7, color='white', marker='star-triangle-up')
 inducing_3d = dict(marker_kwargs=dict(symbol='diamond', size='5', opacity=.7, line=Line(width=.1, color='black')), color='#F5F5F5')
--- a/GPy/plotting/plotly_dep/plot_definitions.py
+++ b/GPy/plotting/plotly_dep/plot_definitions.py
@ -39,7 +39,7 @@ from plotly.graph_objs import Scatter, Scatter3d, Line,\
 from plotly.exceptions import PlotlyDictKeyError

 SYMBOL_MAP = {
-    'o': 'dot',
+    'o': 'circle-dot',
    'v': 'triangle-down',
    '^': 'triangle-up',
    '<': 'triangle-left',
@ -47,7 +47,7 @@ SYMBOL_MAP = {
    's': 'square',
    '+': 'cross',
    'x': 'x',
-    '*': 'x',  # no star yet in plotly!!
+    '*': 'star',
    'D': 'diamond',
    'd': 'diamond',
 }
--- a/GPy/testing/baseline/bayesian_gplvm_gradient.npz
+++ b/GPy/testing/baseline/bayesian_gplvm_gradient.npz
--- a/GPy/testing/baseline/bayesian_gplvm_inducing.npz
+++ b/GPy/testing/baseline/bayesian_gplvm_inducing.npz
--- a/GPy/testing/baseline/bayesian_gplvm_inducing_3d.npz
+++ b/GPy/testing/baseline/bayesian_gplvm_inducing_3d.npz
--- a/GPy/testing/baseline/bayesian_gplvm_latent.npz
+++ b/GPy/testing/baseline/bayesian_gplvm_latent.npz
--- a/GPy/testing/baseline/bayesian_gplvm_latent_3d.npz
+++ b/GPy/testing/baseline/bayesian_gplvm_latent_3d.npz
--- a/GPy/testing/baseline/bayesian_gplvm_magnification.npz
+++ b/GPy/testing/baseline/bayesian_gplvm_magnification.npz
--- a/GPy/testing/baseline/coverage_3d_plot.npz
+++ b/GPy/testing/baseline/coverage_3d_plot.npz
--- a/GPy/testing/baseline/gp_2d_data.npz
+++ b/GPy/testing/baseline/gp_2d_data.npz
--- a/GPy/testing/baseline/gp_2d_inducing.npz
+++ b/GPy/testing/baseline/gp_2d_inducing.npz
--- a/GPy/testing/baseline/gp_2d_mean.npz
+++ b/GPy/testing/baseline/gp_2d_mean.npz
--- a/GPy/testing/baseline/gp_3d_data.npz
+++ b/GPy/testing/baseline/gp_3d_data.npz
--- a/GPy/testing/baseline/gp_3d_inducing.npz
+++ b/GPy/testing/baseline/gp_3d_inducing.npz
--- a/GPy/testing/baseline/gp_3d_mean.npz
+++ b/GPy/testing/baseline/gp_3d_mean.npz
--- a/GPy/testing/baseline/gp_class_likelihood.npz
+++ b/GPy/testing/baseline/gp_class_likelihood.npz
--- a/GPy/testing/baseline/gp_class_raw_link.npz
+++ b/GPy/testing/baseline/gp_class_raw_link.npz
--- a/GPy/testing/baseline/gp_data.npz
+++ b/GPy/testing/baseline/gp_data.npz
--- a/GPy/testing/baseline/gp_samples.npz
+++ b/GPy/testing/baseline/gp_samples.npz
--- a/GPy/testing/baseline/gplvm_gradient.npz
+++ b/GPy/testing/baseline/gplvm_gradient.npz
--- a/GPy/testing/baseline/gplvm_latent.npz
+++ b/GPy/testing/baseline/gplvm_latent.npz
--- a/GPy/testing/baseline/gplvm_latent_3d.npz
+++ b/GPy/testing/baseline/gplvm_latent_3d.npz
--- a/GPy/testing/baseline/gplvm_magnification.npz
+++ b/GPy/testing/baseline/gplvm_magnification.npz
--- a/GPy/testing/baseline/kern_cov_2d.npz
+++ b/GPy/testing/baseline/kern_cov_2d.npz
--- a/GPy/testing/baseline/kern_cov_3d.npz
+++ b/GPy/testing/baseline/kern_cov_3d.npz
--- a/GPy/testing/baseline/kern_cov_no_lim.npz
+++ b/GPy/testing/baseline/kern_cov_no_lim.npz
--- a/GPy/testing/baseline/sparse_gp_class_likelihood.npz
+++ b/GPy/testing/baseline/sparse_gp_class_likelihood.npz
--- a/GPy/testing/baseline/sparse_gp_class_raw.npz
+++ b/GPy/testing/baseline/sparse_gp_class_raw.npz
--- a/GPy/testing/baseline/sparse_gp_class_raw_link.npz
+++ b/GPy/testing/baseline/sparse_gp_class_raw_link.npz
--- a/GPy/testing/baseline/sparse_gp_data_error.npz
+++ b/GPy/testing/baseline/sparse_gp_data_error.npz
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@ -7,7 +7,7 @@ from unittest.case import skip
 import GPy
 from GPy.core.parameterization.param import Param
 import numpy as np
-
+import random
 from ..util.config import config


--- a/GPy/testing/link_function_tests.py
+++ b/GPy/testing/link_function_tests.py
@ -2,11 +2,12 @@ import numpy as np
 import scipy
 from scipy.special import cbrt
 from GPy.models import GradientChecker
+import random
 _lim_val = np.finfo(np.float64).max
 _lim_val_exp = np.log(_lim_val)
 _lim_val_square = np.sqrt(_lim_val)
 _lim_val_cube = cbrt(_lim_val)
-from GPy.likelihoods.link_functions import Identity, Probit, Cloglog, Log, Log_ex_1, Reciprocal, Heaviside
+from GPy.likelihoods.link_functions import Identity, Probit, Cloglog, Log, Log_ex_1, Reciprocal, Heaviside, ScaledProbit

 class LinkFunctionTests(np.testing.TestCase):
    def setUp(self):
@ -123,6 +124,11 @@ class LinkFunctionTests(np.testing.TestCase):
        link = Probit()
        lim_of_inf = _lim_val
        self.check_gradient(link, lim_of_inf, test_lim=True)
+        
+    def test_scaledprobit_gradients(self):
+        link = ScaledProbit(nu=random.random())
+        lim_of_inf = _lim_val
+        self.check_gradient(link, lim_of_inf, test_lim=True)

    def test_Cloglog_gradients(self):
        link = Cloglog()
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@ -1180,6 +1180,67 @@ class GradientTests(np.testing.TestCase):

        with self.assertRaises(RuntimeError):
            m.posterior_covariance_between_points(np.array([[1], [2]]), np.array([[3], [4]]))
+    
+    def test_multioutput_model_with_derivative_observations(self):
+        f = lambda x: np.sin(x)+0.1*(x-2.)**2-0.005*x**3
+        fd = lambda x: np.cos(x)+0.2*(x-2.)-0.015*x**2
+        N=10
+        M=10
+        sigma=0.05
+        sigmader=0.05
+        x = np.array([np.linspace(1,10,N)]).T
+        y = f(x) + np.array(sigma*np.random.normal(0,1,(N,1)))
+
+        xd = np.array([np.linspace(2,8,M)]).T
+        yd = fd(xd) + np.array(sigmader*np.random.normal(0,1,(M,1)))
+
+        # squared exponential kernel:
+        se = GPy.kern.RBF(input_dim = 1, lengthscale=1.5, variance=0.2)
+        # We need to generate separate kernel for the derivative observations and give the created kernel as an input:
+        se_der = GPy.kern.DiffKern(se, 0)
+
+        #Then 
+        gauss = GPy.likelihoods.Gaussian(variance=sigma**2)
+        gauss = GPy.likelihoods.Gaussian(variance=0.1)
+        gauss_der = GPy.likelihoods.Gaussian(variance=sigma**2)
+
+        # Then create the model, we give everything in lists, the order of the inputs indicates the order of the outputs
+        # Now we have the regular observations first and derivative observations second, meaning that the kernels and
+        # the likelihoods must follow the same order
+        m = GPy.models.MultioutputGP(X_list=[x, xd], Y_list=[y, yd], kernel_list=[se, se_der], likelihood_list = [gauss, gauss])
+        m.randomize()
+        self.assertTrue(m.checkgrad())
+
+        m.optimize(messages=0, ipython_notebook=False)
+
+        self.assertTrue(m.checkgrad())
+
+    def test_multioutput_model_with_ep(self):
+        f = lambda x: np.sin(x)+0.1*(x-2.)**2-0.005*x**3
+        fd = lambda x: np.cos(x)+0.2*(x-2.)-0.015*x**2
+        N=10
+        sigma=0.05
+        sigmader=0.05
+        x = np.array([np.linspace(1,10,N)]).T
+        y = f(x) + np.array(sigma*np.random.normal(0,1,(N,1)))
+
+        M=7
+        xd = np.array([np.linspace(2,8,M)]).T
+        yd = 2*(fd(xd)>0) -1
+
+        # squared exponential kernel:
+        se = GPy.kern.RBF(input_dim = 1, lengthscale=1.5, variance=0.2)
+        # We need to generate separate kernel for the derivative observations and give the created kernel as an input:
+        se_der = GPy.kern.DiffKern(se, 0)
+
+        #Then 
+        gauss = GPy.likelihoods.Gaussian(variance=sigma**2)
+        probit = GPy.likelihoods.Binomial(gp_link = GPy.likelihoods.link_functions.ScaledProbit(nu=100))
+
+        # Then create the model, we give everything in lists
+        m = GPy.models.MultioutputGP(X_list=[x, xd], Y_list=[y, yd], kernel_list=[se, se_der], likelihood_list = [gauss, probit], inference_method=GPy.inference.latent_function_inference.EP(ep_mode="nested"))
+        
+        self.assertTrue(m.checkgrad())       

 def _create_missing_data_model(kernel, Q):
    D1, D2, D3, N, num_inducing = 13, 5, 8, 400, 3
--- a/GPy/testing/plotting_tests.py
+++ b/GPy/testing/plotting_tests.py
@ -38,7 +38,7 @@ from nose import SkipTest

 try:
    import matplotlib
-    # matplotlib.use('agg')
+    matplotlib.use('agg', warn=False)
 except ImportError:
    # matplotlib not installed
    from nose import SkipTest
@ -48,6 +48,7 @@ from unittest.case import TestCase

 import numpy as np
 import GPy, os
+import logging

 from GPy.util.config import config
 from GPy.plotting import change_plotting_library, plotting_library
@ -98,18 +99,26 @@ def _image_comparison(baseline_images, extensions=['pdf','svg','png'], tol=11, r
    for num, base in zip(plt.get_fignums(), baseline_images):
        for ext in extensions:
            fig = plt.figure(num)
-            fig.canvas.draw()
+            try:
+                fig.canvas.draw()
+            except Exception as e:
+                logging.error(base)
+                #raise SkipTest(e)
            #fig.axes[0].set_axis_off()
            #fig.set_frameon(False)
            if ext in ['npz']:
                figdict = flatten_axis(fig)
                np.savez_compressed(os.path.join(result_dir, "{}.{}".format(base, ext)), **figdict)
-                fig.savefig(os.path.join(result_dir, "{}.{}".format(base, 'png')),
-                            transparent=True,
-                            edgecolor='none',
-                            facecolor='none',
-                            #bbox='tight'
-                            )
+                try:
+                    fig.savefig(os.path.join(result_dir, "{}.{}".format(base, 'png')),
+                                transparent=True,
+                                edgecolor='none',
+                                facecolor='none',
+                                #bbox='tight'
+                                )
+                except:
+                    logging.error(base)
+                    # raise
            else:
                fig.savefig(os.path.join(result_dir, "{}.{}".format(base, ext)),
                            transparent=True,
@ -331,10 +340,9 @@ def test_threed():
    m.plot_mean(projection='3d', rstride=10, cstride=10)
    m.plot_inducing(projection='3d')
    #m.plot_errorbars_trainset(projection='3d')
-    for do_test in _image_comparison(baseline_images=['gp_3d_{}'.format(sub) for sub in ["data", "mean", 'inducing',
-                                                                                         #'error',
-                                                                                          #"samples", "samples_lik"
-                                                                                          ]], extensions=extensions):
+    for do_test in _image_comparison(baseline_images=[
+        'gp_3d_{}'.format(sub) for sub in ["data", "mean", 'inducing',
+    ]], extensions=extensions):
        yield (do_test, )

 def test_sparse():
--- a/GPy/util/multioutput.py
+++ b/GPy/util/multioutput.py
@ -2,6 +2,33 @@ import numpy as np
 import warnings
 import GPy

+def index_to_slices(index):
+    """
+    take a numpy array of integers (index) and return a  nested list of slices such that the slices describe the start, stop points for each integer in the index.
+
+    e.g.
+    >>> index = np.asarray([0,0,0,1,1,1,2,2,2])
+    returns
+    >>> [[slice(0,3,None)],[slice(3,6,None)],[slice(6,9,None)]]
+
+    or, a more complicated example
+    >>> index = np.asarray([0,0,1,1,0,2,2,2,1,1])
+    returns
+    >>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
+    """
+    if len(index)==0:
+        return[]
+
+    #contruct the return structure
+    ind = np.asarray(index,dtype=np.int)
+    ret = [[] for i in range(ind.max()+1)]
+
+    #find the switchpoints
+    ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
+    switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
+
+    [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
+    return ret

 def get_slices(input_list):
    num_outputs = len(input_list)
--- a/setup.py
+++ b/setup.py
@ -40,6 +40,11 @@ import sys
 from setuptools import setup, Extension
 import codecs

+try:
+    ModuleNotFoundError
+except NameError:
+    ModuleNotFoundError = ImportError
+
 def read(fname):
    with codecs.open(fname, 'r', 'latin') as f:
        return f.read()
--- a/travis_tests.py
+++ b/travis_tests.py
@ -31,7 +31,7 @@

 #!/usr/bin/env python
 import matplotlib
-matplotlib.use('agg')
+matplotlib.use('agg', warn=False)

 import nose, warnings
 with warnings.catch_warnings():