Enhance multioutput grad obs (#995)

* multiplied RBF kernels can now be used with gradient observations * standard periodic kernels can now be used with gradient observations * predictive gradients (derivatives of posterior means and variances) can now be calculated when using gradient observations * simplified and commented RBF & StdP kernel derivatives * updated kernel slicing and commented prod kernel derivatives * removed caching from stdp kern, as it breaks optimization for some reason * fixed hyperparameter optimization for prod kernel * improved code readability * added unit tests for gradient observing MultioutputGP models * added predictions check to unit tests * bugfix for multioutput_kern * improved testing coverage * reduced size of some tests; led to an issue in an unrelated test * updated testing * added gradient MultioutputGP prod kernel example * added keywords and plotting to example
2026-05-18 13:55:14 +02:00 · 2023-09-21 16:53:42 +03:00 · 2023-09-21 16:53:42 +03:00 · 9c1db7aa34
commit 9c1db7aa34
parent 2c22f1e9c5
11 changed files with 1494 additions and 164 deletions
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -771,3 +771,117 @@ def multioutput_gp_with_derivative_observations(plot=True):
    mu, var = m.predict_noiseless(Xnew=[xpred, np.empty((0, 1))])
    return m
 def multioutput_gp_with_derivative_observations_2D(optimize=True, plot=False):
    '''
    This in an example on how to use a MultioutputGP model with gradient
    observations and multiple single-dimensional kernels of differing types.
    '''
    period = 3
    w = 2*np.pi/period # angular frequency
    bounds = (-period, period)
    # latent function and gradient
    f = lambda x: (np.exp(-x[:,0]**2) + np.cos(w*x[:,1]))[:,None]
    df = lambda x: np.array([-2*np.exp(-x[:,0]**2)*x[:,0], -w*np.sin(w*x[:,1])]).T
    # 2D input grid
    ppa = 25 # points per axis
    x = np.linspace(*bounds, ppa)
    xx, yy = np.meshgrid(x, x)
    grid = np.array([xx.reshape(-1), yy.reshape(-1)]).T
    fgrid = f(grid)
    dfgrid = df(grid)
    # 10 random training points generated with a space-filling sobol sequence
    X = np.array([
        [ 0.50421399,  2.1331483 ],
        [-2.15717152, -1.70295936],
        [-1.46704334,  1.37111521],
        [ 2.79064536, -0.9649018 ],
        [ 1.60728264,  0.27702713],
        [-0.30712366, -0.57372129],
        [-2.6140632 ,  2.49192488],
        [ 0.89078772, -2.85873686],
        [ 1.15813136,  0.96910322],
        [-2.83307021, -1.38155383]
    ])
    # Note!
    # This example uses the same inputs for function and gradient observations.
    noise_std = 1e-2
    # function observations
    Y = f(X) + np.random.normal(scale=noise_std, size=(len(X), 1))
    # gradient observations
    dY = df(X) + np.random.normal(scale=noise_std, size=(len(X), 2))
    # gather inputs and observations into lists
    X_list = [X, X, X]
    # once for function observations, and once for each partial derivative
    # make sure all arrays are of shape (N x dims), where N is # of training points
    Y_list = [Y, dY[:,0,None], dY[:,1,None]]
    # create a kernel that is the product of two one-dimensional kernels
    # the first kernel is an RBF kernel
    kern0 = GPy.kern.RBF(input_dim=1, active_dims=[0])
    # as the function is periodic in the second dimension, we use a StdP kernel
    kern1 = GPy.kern.StdPeriodic(input_dim=1, active_dims=[1], period=period)
    kern1.period.constrain_fixed()
    # the kernels can be multiplied together into a product kernel
    kern = kern0 * kern1
    # with gradient observations, we need to define a DiffKern for each dimension
    # the DiffKern is given the main kernel as a base kernel
    diffkern0 = GPy.kern.DiffKern(kern, 0)
    diffkern1 = GPy.kern.DiffKern(kern, 1)
    # gather the main kernel and diffkerns into a list
    kern_list = [kern, diffkern0, diffkern1]
    # define a likelihood and repeat it in a list
    likelihood_list = [GPy.likelihoods.Gaussian(variance=noise_std**2)]*3
    # create the MultioutputGP model and optimize
    model = GPy.models.MultioutputGP(X_list, Y_list, kern_list, likelihood_list)
    model.likelihood.constrain_fixed()
    if optimize:
        model.optimize()
    # make function predictions
    Xnew, _, ind = GPy.util.multioutput.build_XY([grid], index=[0])
    Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
    mu, var = model.predict(Xnew, Y_metadata=Y_metadata)
    # make gradient predictions
    Xnew, _, ind = GPy.util.multioutput.build_XY([grid]*2, index=[1, 2])
    Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
    mu_d, var_d = model.predict(Xnew, Y_metadata=Y_metadata)
    mu_d = np.array([mu_d[:len(grid)], mu_d[len(grid):]]).T[0]
    var_d = np.array([var_d[:len(grid)], var_d[len(grid):]]).T[0]
    if plot and MPL_AVAILABLE:
        fig, axs = plt.subplots(1, 3)
        for ax in axs: ax.set_box_aspect(1)
        axs[0].set_title('true f')
        axs[0].contourf(xx, yy, fgrid.reshape(ppa, ppa), levels=25)
        axs[1].set_title('true df1')
        axs[1].contourf(xx, yy, dfgrid[:,0].reshape(ppa, ppa), levels=25)
        axs[2].set_title('true df2')
        axs[2].contourf(xx, yy, dfgrid[:,1].reshape(ppa, ppa), levels=25)
        fig, axs = plt.subplots(1, 3)
        for ax in axs: ax.set_box_aspect(1)
        axs[0].set_title('pred f')
        axs[0].contourf(xx, yy, mu.reshape(ppa, ppa), levels=25)
        axs[1].set_title('pred df1')
        axs[1].contourf(xx, yy, mu_d[:,0].reshape(ppa, ppa), levels=25)
        axs[2].set_title('pred df2')
        axs[2].contourf(xx, yy, mu_d[:,1].reshape(ppa, ppa), levels=25)
    return model
--- a/GPy/kern/src/diff_kern.py
+++ b/GPy/kern/src/diff_kern.py
@ -23,24 +23,42 @@ class DiffKern(Kern):
        self.base_kern.parameters_changed()
    @Cache_this(limit=3, ignore_args=())
-    def K(self, X, X2=None, dimX2 = None): #X in dimension self.dimension
+    def K(self, X, X2=None, dimX2=None): #X in dimension self.dimension
        if X2 is None:
            X2 = X
        if dimX2 is None:
            dimX2 = self.dimension
-        return self.base_kern.dK2_dXdX2(X,X2, self.dimension, dimX2)
+        return self.base_kern.dK2_dXdX2(X, X2, self.dimension, dimX2)
- 
+
    @Cache_this(limit=3, ignore_args=())
    def dK_dX(self, X, X2, dimX, dimX2=None):
        if dimX2 is None:
            dimX2 = self.dimension
        return self.base_kern.dK3_dXdXdX2(X, X2, dimX, self.dimension, dimX2)
    @Cache_this(limit=3, ignore_args=())
    def Kdiag(self, X):
-        return np.diag(self.base_kern.dK2_dXdX2(X,X, self.dimension, self.dimension))
+        return self.base_kern.dK2_dXdX2diag(X, self.dimension, self.dimension)
-    
+
    @Cache_this(limit=3, ignore_args=())
    def dK_dXdiag(self, X, dimX):
        return self.base_kern.dK3_dXdXdX2diag(X, dimX, self.dimension, self.dimension)
    @Cache_this(limit=3, ignore_args=())
    def dK_dX_wrap(self, X, X2): #X in dimension self.dimension
-        return self.base_kern.dK_dX(X,X2, self.dimension)
+        return self.base_kern.dK_dX(X, X2, self.dimension)
    @Cache_this(limit=3, ignore_args=())
    def dK_dX2_wrap(self, X, X2): #X in dimension self.dimension
-        return self.base_kern.dK_dX2(X,X2, self.dimension)
+        return self.base_kern.dK_dX2(X, X2, self.dimension)
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX2_wrap(self, X, X2, dimX):
        return self.base_kern.dK2_dXdX2(X, X2, dimX, self.dimension)
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX_wrap(self, X, X2, dimX):
        return self.base_kern.dK2_dXdX(X, X2, dimX, self.dimension)
    def reset_gradients(self):
        self.base_kern.reset_gradients()
@ -56,33 +74,33 @@ class DiffKern(Kern):
    def update_gradients_full(self, dL_dK, X, X2=None, dimX2=None):
        if dimX2 is None:
            dimX2 = self.dimension
-        gradients = self.base_kern.dgradients2_dXdX2(X,X2,self.dimension,dimX2)
+        gradients = self.base_kern.dgradients2_dXdX2(X, X2, self.dimension, dimX2)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
    def update_gradients_diag(self, dL_dK_diag, X):
-        gradients = self.base_kern.dgradients2_dXdX2(X,X, self.dimension, self.dimension)
+        gradients = self.base_kern.dgradients2_dXdX2(X, X, self.dimension, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK_diag, gradient, f=np.diag) for gradient in gradients])
    def update_gradients_dK_dX(self, dL_dK, X, X2=None):
        if X2 is None:
            X2 = X
-        gradients = self.base_kern.dgradients_dX(X,X2, self.dimension)
+        gradients = self.base_kern.dgradients_dX(X, X2, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
    def update_gradients_dK_dX2(self, dL_dK, X, X2=None):
-        gradients = self.base_kern.dgradients_dX2(X,X2, self.dimension)
+        gradients = self.base_kern.dgradients_dX2(X, X2, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])
    def gradients_X(self, dL_dK, X, X2):
-        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,:, self.dimension]
+        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,:,self.dimension]
        return np.sum(tmp, axis=1)
    def gradients_X2(self, dL_dK, X, X2):
-        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:, :, self.dimension, :]
+        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,self.dimension,:]
        return np.sum(tmp, axis=1)
-    def _convert_gradients(self, l,g, f = lambda x:x):
+    def _convert_gradients(self, l, g, f=lambda x:x):
        if type(g) is np.ndarray:
            return np.sum(f(l)*f(g))
        else:
-            return np.array([np.sum(f(l)*f(gi)) for gi in g])
+            return np.array([np.sum(f(l)*f(gi)) for gi in g])
--- a/GPy/kern/src/kernel_slice_operations.py
+++ b/GPy/kern/src/kernel_slice_operations.py
@ -22,7 +22,14 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
        put_clean(dct, 'dK_dX', _slice_dK_dX)
        put_clean(dct, 'dK_dX2', _slice_dK_dX)
        put_clean(dct, 'dK2_dXdX2', _slice_dK2_dXdX2)
        put_clean(dct, 'dK2_dXdX', _slice_dK2_dXdX2)
        put_clean(dct, 'dK3_dXdXdX2', _slice_dK3_dXdXdX2)
        put_clean(dct, 'Kdiag', _slice_Kdiag)
        put_clean(dct, 'dK_dXdiag', _slice_dK_dXdiag)
        put_clean(dct, 'dK_dX2diag', _slice_dK_dXdiag)
        put_clean(dct, 'dK2_dXdX2diag', _slice_dK2_dXdX2diag)
        put_clean(dct, 'dK2_dXdXdiag', _slice_dK2_dXdX2diag)
        put_clean(dct, 'dK3_dXdXdX2diag', _slice_dK3_dXdXdX2diag)
        put_clean(dct, 'phi', _slice_Kdiag)
        put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
        put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
@ -35,9 +42,10 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
        put_clean(dct, 'gradients_XX_diag', _slice_gradients_XX_diag)
        put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)
-        put_clean(dct, 'dgradients_dX',_slice_partial_gradients_list_X)
+        put_clean(dct, 'dgradients', _slice_partial_gradients_list)
-        put_clean(dct, 'dgradients_dX2',_slice_partial_gradients_list_X)
+        put_clean(dct, 'dgradients_dX', _slice_partial_gradients_list_X)
-        put_clean(dct, 'dgradients2_dXdX2',_slice_partial_gradients_list_XX)
+        put_clean(dct, 'dgradients_dX2', _slice_partial_gradients_list_X)
        put_clean(dct, 'dgradients2_dXdX2', _slice_partial_gradients_list_XX)
        put_clean(dct, 'psi0', _slice_psi)
        put_clean(dct, 'psi1', _slice_psi)
@ -155,6 +163,18 @@ def _slice_dK_dX(f):
        return ret
    return wrap
 def _slice_dK_dXdiag(f):
    @wraps(f)
    def wrap(self, X, dim, *a, **kw):
        with _Slice_wrap(self, X, None) as s:
            d = s.k._project_dim(dim)
            if d is None:
                ret = np.zeros(X.shape[0])
            else:
                ret = f(self, s.X, dim, *a, **kw)
        return ret
    return wrap
 def _slice_dK2_dXdX2(f):
    @wraps(f)
    def wrap(self, X, X2, dimX, dimX2, *a, **kw):
@ -168,6 +188,59 @@ def _slice_dK2_dXdX2(f):
        return ret
    return wrap
 def _slice_dK2_dXdX2diag(f):
    @wraps(f)
    def wrap(self, X, dimX, dimX2, *a, **kw):
        with _Slice_wrap(self, X, None) as s:
            d = s.k._project_dim(dimX)
            d2 = s.k._project_dim(dimX2)
            if (d is None) or (d2 is None):
                ret = np.zeros(X.shape[0])
            else:
                ret = f(self, s.X, d, d2, *a, **kw)
        return ret
    return wrap
 def _slice_dK3_dXdXdX2(f):
    @wraps(f)
    def wrap(self, X, X2, dim, dimX, dimX2, *a, **kw):
        with _Slice_wrap(self, X, X2) as s:
            D = s.k._project_dim(dim)
            d = s.k._project_dim(dimX)
            d2 = s.k._project_dim(dimX2)
            if (D is None) or (d is None) or (d2 is None):
                ret = np.zeros((X.shape[0], X2.shape[0]))
            else:
                ret = f(self, s.X, s.X2, D, d, d2, *a, **kw)
        return ret
    return wrap
 def _slice_dK3_dXdXdX2diag(f):
    @wraps(f)
    def wrap(self, X, dim, dimX, dimX2, *a, **kw):
        with _Slice_wrap(self, X, None) as s:
            D = s.k._project_dim(dim)
            d = s.k._project_dim(dimX)
            d2 = s.k._project_dim(dimX2)
            if (D is None) or (d is None) or (d2 is None):
                ret = np.zeros(X.shape[0])
            else:
                ret = f(self, s.X, D, d, d2, *a, **kw)
        return ret
    return wrap
 def _slice_partial_gradients_list(f):
    @wraps(f)
    def wrap(self, X, X2):
        if X2 is None:
            N, M = X.shape[0], X.shape[0]
        else:
            N, M = X.shape[0], X2.shape[0]
        with _Slice_wrap(self, X, X2, ret_shape=(N, M)) as s:
            ret = f(self, s.X, s.X2)
        return ret
    return wrap
 def _slice_partial_gradients_X(f):
    @wraps(f)
    def wrap(self, X, X2, dim):
--- a/GPy/kern/src/multioutput_derivative_kern.py
+++ b/GPy/kern/src/multioutput_derivative_kern.py
@ -7,20 +7,24 @@ import numpy as np
 from functools import partial
 class KernWrapper(Kern):
-    def __init__(self, fk, fug, fg, base_kern):
+    def __init__(self, fk, fdk, fug, fg, base_kern):
        self.fk = fk
        self.fdk = fdk
        self.fug = fug
        self.fg = fg
        self.base_kern = base_kern
-        super(KernWrapper, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='KernWrapper',useGPU=False)
+        super(KernWrapper, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='KernWrapper', useGPU=False)
    def K(self, X, X2=None):
-        return self.fk(X,X2=X2)
+        return self.fk(X, X2=X2)
    def dK_dX(self, X, X2, dimX):
        return self.fdk(X, X2, dimX)
-    def update_gradients_full(self,dL_dK, X, X2=None):
+    def update_gradients_full(self, dL_dK, X, X2=None):
        return self.fug(dL_dK, X, X2=X2)
-    def gradients_X(self,dL_dK, X, X2=None):
+    def gradients_X(self, dL_dK, X, X2=None):
        return self.fg(dL_dK, X, X2=X2)
    @property
@ -57,28 +61,46 @@ class MultioutputDerivativeKern(MultioutputKern):
        #build covariance structure
        covariance = [[None for i in range(nl)] for j in range(nl)]
        linked = []
-        for i in range(0,nl):
+        for i in range(0, nl):
-            unique=True
+            unique = True
-            for j in range(0,nl):
+            for j in range(0, nl):
-                if i==j or (kernels[i] is kernels[j]):
+                if (i == j) or (kernels[i] is kernels[j]):
                    kern = kernels[i]
-                    if i>j:
+                    if i > j:
-                        unique=False
+                        unique = False
                elif cross_covariances.get((i,j)) is not None: #cross covariance is given
                    kern = cross_covariances.get((i,j))
-                elif kernels[i].name == 'DiffKern' and kernels[i].base_kern == kernels[j]: # one is derivative of other
+                elif (kernels[i].name == 'DiffKern') and (kernels[i].base_kern == kernels[j]): # one is derivative of other
-                    kern = KernWrapper(kernels[i].dK_dX_wrap,kernels[i].update_gradients_dK_dX,kernels[i].gradients_X, kernels[j])
+                    kern = KernWrapper(
                        kernels[i].dK_dX_wrap,
                        kernels[i].dK2_dXdX_wrap,
                        kernels[i].update_gradients_dK_dX,
                        kernels[i].gradients_X,
                        kernels[j]
                        )
                    unique=False
-                elif kernels[j].name == 'DiffKern' and kernels[j].base_kern == kernels[i]: # one is derivative of other
+                elif (kernels[j].name == 'DiffKern') and (kernels[j].base_kern == kernels[i]): # one is derivative of other
-                    kern = KernWrapper(kernels[j].dK_dX2_wrap,kernels[j].update_gradients_dK_dX2,kernels[j].gradients_X2, kernels[i])
+                    kern = KernWrapper(
-                elif kernels[i].name == 'DiffKern' and kernels[j].name == 'DiffKern' and kernels[i].base_kern == kernels[j].base_kern: #both are partial derivatives
+                        kernels[j].dK_dX2_wrap,
-                    kern = KernWrapper(partial(kernels[i].K, dimX2=kernels[j].dimension), partial(kernels[i].update_gradients_full, dimX2=kernels[j].dimension),None, kernels[i].base_kern)
+                        kernels[j].dK2_dXdX2_wrap,
-                    if i>j:
+                        kernels[j].update_gradients_dK_dX2,
-                        unique=False
+                        kernels[j].gradients_X2,
                        kernels[i]
                        )
                elif (kernels[i].name == 'DiffKern') and (kernels[j].name == 'DiffKern') and (kernels[i].base_kern == kernels[j].base_kern): #both are partial derivatives
                    kern = KernWrapper(
                        partial(kernels[i].K, dimX2=kernels[j].dimension),
                        partial(kernels[i].dK_dX, dimX2=kernels[j].dimension),
                        partial(kernels[i].update_gradients_full, dimX2=kernels[j].dimension),
                        None,
                        kernels[i].base_kern
                        )
                    if i > j:
                        unique = False
                else:
                    kern = ZeroKern()
                covariance[i][j] = kern
            if unique is True:
                linked.append(i)
        self.covariance = covariance
-        self.link_parameters(*[kernels[i] for i in linked])
+        self.link_parameters(*[kernels[i] for i in linked])
--- a/GPy/kern/src/multioutput_kern.py
+++ b/GPy/kern/src/multioutput_kern.py
@ -85,21 +85,63 @@ class MultioutputKern(CombinationKernel):
        self.link_parameters(*[kernels[i] for i in linked])
    @Cache_this(limit=3, ignore_args=())
-    def K(self, X ,X2=None):
+    def K(self, X, X2=None):
        if X2 is None:
            X2 = X
        slices = index_to_slices(X[:,self.index_dim])
        slices2 = index_to_slices(X2[:,self.index_dim])
        target =  np.zeros((X.shape[0], X2.shape[0]))
-        [[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j].K(X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))]  
+        for j in range(len(slices2)):
            for i in range(len(slices)):
                for l in range(len(slices2[j])):
                    for k in range(len(slices[i])):
                        cov_K = self.covariance[i][j].K(X[slices[i][k],:], X2[slices2[j][l],:])
                        target.__setitem__((slices[i][k], slices2[j][l]), cov_K)
        return target
    @Cache_this(limit=3, ignore_args=())
-    def Kdiag(self,X):
+    def Kdiag(self, X):
        slices = index_to_slices(X[:,self.index_dim])
        kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
        target = np.zeros(X.shape[0])
-        [[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)]
+        for kern, slices_i in zip(kerns, slices):
            for s in slices_i:
                np.copyto(target[s], kern.Kdiag(X[s]))
        return target
    @Cache_this(limit=3, ignore_args=())
    def dK_dX(self, X, X2, dimX):
        """
        Compute the derivative of K with respect to:
            dimension dimX of set X.
        """
        if X2 is None:
            X2 = X
        slices = index_to_slices(X[:,self.index_dim])
        slices2 = index_to_slices(X2[:,self.index_dim])
        target =  np.zeros((X.shape[0], X2.shape[0]))
        for j in range(len(slices2)):
            for i in range(len(slices)):
                for l in range(len(slices2[j])):
                    for k in range(len(slices[i])):
                        cov_dK_dX = self.covariance[i][j].dK_dX(X[slices[i][k],:], X2[slices2[j][l],:], dimX)
                        target.__setitem__((slices[i][k], slices2[j][l]), cov_dK_dX)
        return target
    @Cache_this(limit=3, ignore_args=())
    def dK_dXdiag(self, X, dimX):
        """
        Compute the derivative of K with respect to:
            dimension dimX of set X.
        """
        slices = index_to_slices(X[:,self.index_dim])
        kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
        target = np.zeros(X.shape[0])
        for kern, slices_i in zip(kerns, slices):
            for s in slices_i:
                np.copyto(target[s], kern.dK_dXdiag(X[s], dimX))
        return target
    def _update_gradients_full_wrapper(self, kern, dL_dK, X, X2):
@ -115,19 +157,35 @@ class MultioutputKern(CombinationKernel):
    def reset_gradients(self):
        for kern in self.kern: kern.reset_gradients()
-    def update_gradients_full(self,dL_dK, X, X2=None):
+    def update_gradients_full(self, dL_dK, X, X2=None):
-        self.reset_gradients()
+        if X2 is None:
            X2 = X
        slices = index_to_slices(X[:,self.index_dim])
-        if X2 is not None:
+        slices2 = index_to_slices(X2[:,self.index_dim])
-            slices2 = index_to_slices(X2[:,self.index_dim])
+
-            [[[[ self._update_gradients_full_wrapper(self.covariance[i][j], dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
+        self.reset_gradients()
-        else:
+        for j in range(len(slices2)):
-            [[[[ self._update_gradients_full_wrapper(self.covariance[i][j], dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], X[slices[j][l],:]) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
+            for i in range(len(slices)):
-            
+                for l in range(len(slices2[j])):
                    for k in range(len(slices[i])):
                        self._update_gradients_full_wrapper(
                            self.covariance[i][j],
                            dL_dK[slices[i][k],slices2[j][l]],
                            X[slices[i][k],:],
                            X2[slices2[j][l],:]
                            )
    def update_gradients_diag(self, dL_dKdiag, X):
        self.reset_gradients()
        slices = index_to_slices(X[:,self.index_dim])
-        [[ self._update_gradients_diag_wrapper(self.covariance[i][i], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))]
+
        self.reset_gradients()
        for i in range(len(slices)):
            for k in range(len(slices[i])):
                self._update_gradients_diag_wrapper(
                    self.covariance[i][i],
                    dL_dKdiag[slices[i][k]],
                    X[slices[i][k],:]
                    )
    def gradients_X(self,dL_dK, X, X2=None):
        slices = index_to_slices(X[:,self.index_dim])
@ -137,4 +195,4 @@ class MultioutputKern(CombinationKernel):
            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) ) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
        else:
            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], (None if (i==j and k==l) else X[slices[j][l],:] )) ) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
-        return target
+        return target
--- a/GPy/kern/src/prod.py
+++ b/GPy/kern/src/prod.py
@ -70,6 +70,310 @@ class Prod(CombinationKernel):
            which_parts = self.parts
        return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))
    def reset_gradients(self):
        for part in self.parts:
            part.reset_gradients()
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK_dX(self, X, X2, dimX, which_parts=None):
        """
        Compute the derivative of K with respect to:
            dimension dimX of set X.
        """
        if which_parts is None:
            which_parts = self.parts
        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
        for combination in itertools.combinations(which_parts, len(which_parts) - 1):
            if len(combination) > 0:
                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
            else:
                prod = np.ones(prod_sum.shape)
            to_update = list(set(which_parts) - set(combination))[0]
            prod_sum += prod*to_update.dK_dX(X, X2, dimX)
        return prod_sum
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK_dXdiag(self, X, dimX, which_parts=None):
        """
        Compute the derivative of K with respect to:
            dimension dimX of set X.
        Returns only diagonal elements.
        """
        if which_parts is None:
            which_parts = self.parts
        prod_sum = np.zeros(X.shape[0])
        for combination in itertools.combinations(which_parts, len(which_parts) - 1):
            if len(combination) > 0:
                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination])
            else:
                prod = np.ones(prod_sum.shape)
            to_update = list(set(which_parts) - set(combination))[0]
            prod_sum += prod*to_update.dK_dXdiag(X, dimX)
        return prod_sum
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK_dX2(self, X, X2, dimX2, which_parts=None):
        """
        Compute the derivative of K with respect to:
            dimension dimX2 of set X2.
        """
        if which_parts is None:
            which_parts = self.parts
        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
        for combination in itertools.combinations(which_parts, len(which_parts) - 1):
            if len(combination) > 0:
                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
            else:
                prod = np.ones(prod_sum.shape)
            to_update = list(set(which_parts) - set(combination))[0]
            prod_sum += prod*to_update.dK_dX2(X, X2, dimX2)
        return prod_sum
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK2_dXdX2(self, X, X2, dimX, dimX2, which_parts=None):
        """
        Compute the second derivative of K with respect to:
            dimension dimX of set X, and
            dimension dimX2 of set X2.
        """
        if which_parts is None:
            which_parts = self.parts
        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
            if len(combination1) > 0:
                prod = reduce(np.multiply, [p.K(X, X2) for p in combination1])
            else:
                prod = np.ones(prod_sum.shape)
            to_update1 = list(set(which_parts) - set(combination1))[0]
            prod_sum += prod*to_update1.dK2_dXdX2(X, X2, dimX, dimX2)
            if len(which_parts) > 1:
                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
                    if len(combination2) > 0:
                        prod = reduce(np.multiply, [p.K(X, X2) for p in combination2])
                    else:
                        prod = np.ones(prod_sum.shape)
                    to_update2 = list(set(combination1) - set(combination2))[0]
                    prod_sum += prod*to_update1.dK_dX(X, X2, dimX)*to_update2.dK_dX2(X, X2, dimX2)
        return prod_sum
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK2_dXdX2diag(self, X, dimX, dimX2, which_parts=None):
        """
        Compute the second derivative of K with respect to:
            dimension dimX of set X, and
            dimension dimX2 of set X2.
        Returns only diagonal elements.
        """
        if which_parts is None:
            which_parts = self.parts
        prod_sum = np.zeros(X.shape[0])
        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
            if len(combination1) > 0:
                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination1])
            else:
                prod = np.ones(prod_sum.shape)
            to_update1 = list(set(which_parts) - set(combination1))[0]
            prod_sum += prod*to_update1.dK2_dXdX2diag(X, dimX, dimX2)
            if len(which_parts) > 1:
                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
                    if len(combination2) > 0:
                        prod = reduce(np.multiply, [p.Kdiag(X) for p in combination2])
                    else:
                        prod = np.ones(prod_sum.shape)
                    to_update2 = list(set(combination1) - set(combination2))[0]
                    prod_sum += prod*to_update1.dK_dXdiag(X, dimX)*to_update2.dK_dX2diag(X, dimX)
        return prod_sum
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK2_dXdX(self, X, X2, dimX_0, dimX_1, which_parts=None):
        """
        Compute the second derivative of K with respect to:
            dimension dimX_0 of set X, and
            dimension dimX_1 of set X.
        """
        if which_parts is None:
            which_parts = self.parts
        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
            if len(combination1) > 0:
                prod = reduce(np.multiply, [p.K(X, X2) for p in combination1])
            else:
                prod = np.ones(prod_sum.shape)
            to_update1 = list(set(which_parts) - set(combination1))[0]
            prod_sum += prod*to_update1.dK2_dXdX(X, X2, dimX_0, dimX_1)
            if len(which_parts) > 1:
                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
                    if len(combination2) > 0:
                        prod = reduce(np.multiply, [p.K(X, X2) for p in combination2])
                    else:
                        prod = np.ones(prod_sum.shape)
                    to_update2 = list(set(combination1) - set(combination2))[0]
                    prod_sum += prod*to_update1.dK_dX(X, X2, dimX_0)*to_update2.dK_dX(X, X2, dimX_1)
        return prod_sum
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK3_dXdXdX2(self, X, X2, dimX_0, dimX_1, dimX2, which_parts=None):
        """
        Compute the third derivative of K with respect to:
            dimension dimX_0 of set X,
            dimension dimX_1 of set X, and
            dimension dimX2 of set X2.
        """
        if which_parts is None:
            which_parts = self.parts
        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
            if len(combination1) > 0:
                prod = reduce(np.multiply, [p.K(X, X2) for p in combination1])
            else:
                prod = np.ones(prod_sum.shape)
            to_update1 = list(set(which_parts) - set(combination1))[0]
            prod_sum += prod*to_update1.dK3_dXdXdX2(X, X2, dimX_0, dimX_1, dimX2)
            if len(which_parts) > 1:
                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
                    if len(combination2) > 0:
                        prod = reduce(np.multiply, [p.K(X, X2) for p in combination2])
                    else:
                        prod = np.ones(prod_sum.shape)
                    to_update2 = list(set(combination1) - set(combination2))[0]
                    prod_sum += prod*to_update1.dK2_dXdX2(X, X2, dimX_0, dimX2)*to_update2.dK_dX(X, X2, dimX_1)
                    prod_sum += prod*to_update1.dK2_dXdX(X, X2, dimX_0, dimX_1)*to_update2.dK_dX2(X, X2, dimX2)
                    prod_sum += prod*to_update1.dK_dX(X, X2, dimX_0)*to_update2.dK2_dXdX2(X, X2, dimX_1, dimX2)
                    if len(which_parts) > 2:
                        for combination3 in itertools.combinations(combination2, len(combination2) - 1):
                            if len(combination3) > 0:
                                prod = reduce(np.multiply, [p.K(X, X2) for p in combination3])
                            else:
                                prod = np.ones(prod_sum.shape)
                            to_update3 = list(set(combination2) - set(combination3))[0]
                            prod_sum += prod*to_update1.dK_dX(X, X2, dimX_0)*to_update2.dK_dX2(X, X2, dimX2)*to_update3.dK_dX(X, X2, dimX_1)
        return prod_sum
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def dK3_dXdXdX2diag(self, X, dimX_0, dimX_1, dimX2, which_parts=None):
        """
        Compute the third derivative of K with respect to:
            dimension dimX_0 of set X,
            dimension dimX_1 of set X, and
            dimension dimX2 of set X2.
        Returns only diagonal elements of the covariance matrix.
        """
        if which_parts is None:
            which_parts = self.parts
        prod_sum = np.zeros(X.shape[0])
        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
            if len(combination1) > 0:
                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination1])
            else:
                prod = np.ones(prod_sum.shape)
            to_update1 = list(set(which_parts) - set(combination1))[0]
            prod_sum += prod*to_update1.dK3_dXdXdX2diag(X, dimX_0, dimX_1, dimX2)
            if len(which_parts) > 1:
                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
                    if len(combination2) > 0:
                        prod = reduce(np.multiply, [p.Kdiag(X) for p in combination2])
                    else:
                        prod = np.ones(prod_sum.shape)
                    to_update2 = list(set(combination1) - set(combination2))[0]
                    prod_sum += prod*to_update1.dK2_dXdX2diag(X, dimX_0, dimX2)*to_update2.dK_dXdiag(X, dimX_1)
                    prod_sum += prod*to_update1.dK2_dXdXdiag(X, dimX_0, dimX_1)*to_update2.dK_dX2diag(X, dimX2)
                    prod_sum += prod*to_update1.dK_dXdiag(X, dimX_0)*to_update2.dK2_dXdX2diag(X, dimX_1, dimX2)
                    if len(which_parts) > 2:
                        for combination3 in itertools.combinations(combination2, len(combination2) - 1):
                            if len(combination3) > 0:
                                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination3])
                            else:
                                prod = np.ones(prod_sum.shape)
                            to_update3 = list(set(combination2) - set(combination3))[0]
                            prod_sum += prod*to_update1.dK_dXdiag(X, dimX_0)*to_update2.dK_dX2diag(X, dimX2)*to_update3.dK_dXdiag(X, dimX_1)
        return prod_sum
    def update_gradients_direct(self, *args):
        for i, (g,p) in enumerate(zip(args, self.parts)):
            p.update_gradients_direct(*g)
    def dgradients_dX(self, X, X2, dimX, parts=None):
        """
        Compute the hyperparameter gradients of:
            the derivative of K with respect to dimension dimX of set X
            ("dK_dX").
        """
        if parts is None:
            parts = self.parts
        gradients = []
        for part in parts:
            neq_parts = [p for p in parts if p is not part]
            if len(neq_parts) > 0:
                K = self.K(X, X2, which_parts=neq_parts)
                K_dx = self.dK_dX(X, X2, dimX, which_parts=neq_parts)
            else:
                K = np.ones((X.shape[0], X2.shape[0]))
                K_dx = np.zeros((X.shape[0], X2.shape[0]))
            g = part.dgradients(X, X2)
            g_dx = part.dgradients_dX(X, X2, dimX)
            gradients += [[(g_i*K_dx + g_dx_i*K) for (g_i, g_dx_i) in zip(g, g_dx)]]
        return gradients
    def dgradients_dX2(self, X, X2, dimX2, parts=None):
        """
        Compute the hyperparameter gradients of:
            the derivative of K with respect to dimension dimX2 of set X2
            ("dK_dX2").
        """
        if parts is None:
            parts = self.parts
        gradients = []
        for part in parts:
            neq_parts = [p for p in parts if p is not part]
            if len(neq_parts) > 0:
                K = self.K(X, X2, which_parts=neq_parts)
                K_dx2 = self.dK_dX2(X, X2, dimX2, which_parts=neq_parts)
            else:
                K = np.ones((X.shape[0], X2.shape[0]))
                K_dx2 = np.zeros((X.shape[0], X2.shape[0]))
            g = part.dgradients(X, X2)
            g_dx2 = part.dgradients_dX2(X, X2, dimX2)
            gradients += [[(g_i*K_dx2 + g_dx2_i*K) for (g_i, g_dx2_i) in zip(g, g_dx2)]]
        return gradients
    def dgradients2_dXdX2(self, X, X2, dimX, dimX2, parts=None):
        """
        Compute the hyperparameter gradients of:
            the second derivative of K with respect to:
                dimension dimX of set X, and
                dimension dimX2 of set X2
            ("dK2_dXdX2").
        """
        if parts is None:
            parts = self.parts
        gradients = []
        for part in parts:
            neq_parts = [p for p in parts if p is not part]
            K = self.K(X, X2, which_parts=neq_parts)
            K_dx = self.dK_dX(X, X2, dimX, which_parts=neq_parts)
            K_dx2 = self.dK_dX2(X, X2, dimX2, which_parts=neq_parts)
            K_dxdx2 = self.dK2_dXdX2(X, X2, dimX, dimX2, which_parts=neq_parts)
            g = part.dgradients(X, X2)
            g_dx = part.dgradients_dX(X, X2, dimX)
            g_dx2 = part.dgradients_dX2(X, X2, dimX2)
            g_dxdx2 = part.dgradients2_dXdX2(X, X2, dimX, dimX2)
            gradients += [[(g_i*K_dxdx2 + g_dx_i*K_dx2 + g_dx2_i*K_dx + g_dxdx2_i*K) for (g_i, g_dx_i, g_dx2_i, g_dxdx2_i) in zip(g, g_dx, g_dx2, g_dxdx2)]]
        return gradients
    def update_gradients_full(self, dL_dK, X, X2=None):
        if len(self.parts)==2:
            self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
--- a/GPy/kern/src/rbf.py
+++ b/GPy/kern/src/rbf.py
@ -53,24 +53,126 @@ class RBF(Stationary):
    @Cache_this(limit=3, ignore_args=())
    def dK_dX(self, X, X2, dimX):
-        r = self._scaled_dist(X, X2)
+        """
-        K = self.K_of_r(r)
+        Compute the derivative of K with respect to:
-        dist = X[:,None,dimX]-X2[None,:,dimX]
+            dimension dimX of set X.
-        lengthscale2inv = (np.ones((X.shape[1]))/(self.lengthscale**2))[dimX]
+        """
-        return -1.*K*dist*lengthscale2inv
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
        dist = X[:,None,dimX] - X2[None,:,dimX]
        return -dist*(lengthscaleinv**2)*self._clean_K(X, X2)
    @Cache_this(limit=3, ignore_args=())
    def dK_dXdiag(self, X, dimX):
        """
        Compute the derivative of K with respect to:
            dimension dimX of set X.
        Returns only diagonal elements.
        """
        return np.zeros(X.shape[0])
    @Cache_this(limit=3, ignore_args=())
    def dK_dX2(self, X, X2, dimX2):
-        return -self.dK_dX(X,X2, dimX2)
+        """
-    
+        Compute the derivative of K with respect to:
            dimension dimX2 of set X2.
        """
        return -self._clean_dK_dX(X, X2, dimX2)
    @Cache_this(limit=3, ignore_args=())
    def dK_dX2diag(self, X, dimX2):
        """
        Compute the derivative of K with respect to:
            dimension dimX2 of set X2.
        Returns only diagonal elements.
        """
        return np.zeros(X.shape[0])
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX2(self, X, X2, dimX, dimX2):
-        r = self._scaled_dist(X, X2)
+        """
-        K = self.K_of_r(r)
+        Compute the second derivative of K with respect to:
-        if X2 is None:
+            dimension dimX of set X, and
-            X2=X
+            dimension dimX2 of set X2.
-        dist = X[:,None,:]-X2[None,:,:]
+        """
-        lengthscale2inv = np.ones((X.shape[1]))/(self.lengthscale**2)
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
-        return -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX]*lengthscale2inv[dimX2] + (dimX==dimX2)*K*lengthscale2inv[dimX]
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
        term = dist[dimX]*(lengthscaleinv[dimX]**2)
        term *= dist[dimX2]*(lengthscaleinv[dimX2]**2)
        if dimX == dimX2:
            term -= (lengthscaleinv[dimX]**2)
        return -term*self._clean_K(X, X2)
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX2diag(self, X, dimX, dimX2):
        """
        Compute the second derivative of K with respect to:
            dimension dimX of set X, and
            dimension dimX2 of set X2.
        Returns only diagonal elements.
        """
        if dimX == dimX2:
            lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
            return np.ones(X.shape[0])*(lengthscaleinv[dimX]**2)*self.variance
        else:
            return np.zeros(X.shape[0])
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX(self, X, X2, dimX_0, dimX_1):
        """
        Compute the second derivative of K with respect to:
            dimension dimX_0 of set X, and
            dimension dimX_1 of set X.
        """
        return -self._clean_dK2_dXdX2(X, X2, dimX_0, dimX_1)
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdXdiag(self, X, dimX_0, dimX_1):
        """
        Compute the second derivative of K with respect to:
            dimension dimX_0 of set X, and
            dimension dimX_1 of set X.
        Returns only diagonal elements.
        """
        return -self._clean_dK2_dXdX2diag(X, dimX_0, dimX_1)
    @Cache_this(limit=3, ignore_args=())
    def dK3_dXdXdX2(self, X, X2, dimX_0, dimX_1, dimX2):
        """
        Compute the third derivative of K with respect to:
            dimension dimX_0 of set X,
            dimension dimX_1 of set X, and
            dimension dimX2 of set X2.
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
        term = dist[dimX_0]*(lengthscaleinv[dimX_0]**2)
        term *= dist[dimX_1]*(lengthscaleinv[dimX_1]**2)
        term *= dist[dimX2]*(lengthscaleinv[dimX2]**2)
        if dimX_0 == dimX_1:
            term -= dist[dimX2]*(lengthscaleinv[dimX2]**2)*(lengthscaleinv[dimX_0]**2)
        if dimX_0 == dimX2:
            term -= dist[dimX_1]*(lengthscaleinv[dimX_1]**2)*(lengthscaleinv[dimX_0]**2)
        if dimX_1 == dimX2:
            term -= dist[dimX_0]*(lengthscaleinv[dimX_0]**2)*(lengthscaleinv[dimX_1]**2)
        return term*self._clean_K(X, X2)
    @Cache_this(limit=3, ignore_args=())
    def dK3_dXdXdX2diag(self, X, dimX_0, dimX_1, dimX2):
        """
        Compute the third derivative of K with respect to:
            dimension dimX_0 of set X,
            dimension dimX_1 of set X, and
            dimension dimX2 of set X2.
        Returns only diagonal elements of the covariance matrix.
        """
        return np.zeros(X.shape[0])
    def dK_dr(self, r):
        return -r*self.K_of_r(r)
@ -80,73 +182,132 @@ class RBF(Stationary):
    def dK2_drdr_diag(self):
        return -self.variance # as the diagonal of r is always filled with zeros
-    
+
    @Cache_this(limit=3, ignore_args=())
-    def dK_dvariance(self,X,X2):
+    def dK_dvariance(self, X, X2):
-        return self.K(X,X2)/self.variance
+        """
-    
+        Compute the derivative of K with respect to variance.
        """
        return self._clean_K(X, X2)/self.variance
    @Cache_this(limit=3, ignore_args=())
-    def dK2_dvariancedX(self, X, X2, dim):
+    def dK_dlengthscale(self, X, X2):
-        return self.dK_dX(X,X2, dim)/self.variance
+        """
-    
+        Compute the derivative(s) of K with respect to lengthscale(s).
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
        K = self._clean_K(X, X2)
        if self.ARD:
            g = []
            for diml in range(self.input_dim):
                g += [(dist[diml]**2)*(lengthscaleinv[diml]**3)*K]
        else:
            g = (lengthscaleinv[0]**3)*np.sum(dist**2, axis=0)*K
        return g
    @Cache_this(limit=3, ignore_args=())
-    def dK2_dvariancedX2(self, X, X2, dim):
+    def dK2_dvariancedX(self, X, X2, dimX):
-        return self.dK_dX2(X,X2, dim)/self.variance
+        """
-    
+        Compute the second derivative of K with respect to:
            variance, and
            dimension dimX of set X.
        """
        return self._clean_dK_dX(X, X2, dimX)/self.variance
    @Cache_this(limit=3, ignore_args=())
-    def dK3_dvariancedXdX2(self, X, X2, dim, dimX2):
+    def dK2_dvariancedX2(self, X, X2, dimX2):
-        return self.dK2_dXdX2(X, X2, dim, dimX2)/self.variance
+        """
        Compute the second derivative of K with respect to:
            variance, and
            dimension dimX2 of set X2.
        """
        return -self.dK2_dvariancedX(X, X2, dimX2)
    @Cache_this(limit=3, ignore_args=())
    def dK2_dlengthscaledX(self, X, X2, dimX):
-        r = self._scaled_dist(X, X2)
+        """
-        K = self.K_of_r(r)
+        Compute the second derivative(s) of K with respect to:
-        if X2 is None:
+            lengthscale(s), and
-            X2=X
+            dimension dimX of set X.
-        dist = X[:,None,:]-X2[None,:,:]
+        """
-        lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
        dK_dX = self._clean_dK_dX(X, X2, dimX)
        dK_dl = self.dK_dlengthscale(X, X2)
        if self.ARD:
            g = []
-            for diml in range(X.shape[1]):
+            for diml in range(self.input_dim):
-                g += [-1.*K*dist[:,:,dimX]*(dist[:,:,diml]**2)*(lengthscaleinv[dimX]**2)*(lengthscaleinv[diml]**3) + 2.*dist[:,:,dimX]*(lengthscaleinv[diml]**3)*K*(dimX == diml)]
+                term = -dist[dimX]*(lengthscaleinv[dimX]**2)*dK_dl[diml]
                if diml == dimX:
                    term -= 2*lengthscaleinv[dimX]*dK_dX
                g += [term]
        else:
-            g = -1.*K*dist[:,:,dimX]*np.sum(dist**2, axis=2)*(lengthscaleinv[dimX]**5) + 2.*dist[:,:,dimX]*(lengthscaleinv[dimX]**3)*K
+            term = -dist[dimX]*(lengthscaleinv[0]**2)*dK_dl
            term -= 2*lengthscaleinv[0]*dK_dX
            g = term
        return g
-    
+
    @Cache_this(limit=3, ignore_args=())
    def dK2_dlengthscaledX2(self, X, X2, dimX2):
-        tmp = self.dK2_dlengthscaledX(X, X2, dimX2)
+        """
        Compute the second derivative(s) of K with respect to:
            lengthscale(s), and
            dimension dimX2 of set X2.
        """
        dK2_dlengthscaledX = self.dK2_dlengthscaledX(X, X2, dimX2)
        if self.ARD:
-            return [-1.*g for g in tmp]
+            return [-1.*g for g in dK2_dlengthscaledX]
        else:
-            return -1*tmp
+            return -1*dK2_dlengthscaledX
    @Cache_this(limit=3, ignore_args=())
    def dK3_dvariancedXdX2(self, X, X2, dimX, dimX2):
        """
        Compute the third derivative of K with respect to:
            variance,
            dimension dimX of set X, and
            dimension dimX2 of set X2.
        """
        return self._clean_dK2_dXdX2(X, X2, dimX, dimX2)/self.variance
    @Cache_this(limit=3, ignore_args=())
    def dK3_dlengthscaledXdX2(self, X, X2, dimX, dimX2):
-        r = self._scaled_dist(X, X2)
+        """
-        K = self.K_of_r(r)
+        Compute the third derivative(s) of K with respect to:
-        if X2 is None:
+            lengthscale(s),
-            X2=X
+            dimension dimX of set X, and
-        dist = X[:,None,:]-X2[None,:,:]
+            dimension dimX2 of set X2.
-        lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
+        """
-        lengthscale2inv = lengthscaleinv**2
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
        K = self._clean_K(X, X2)
        dK_dX = self._clean_dK_dX(X, X2, dimX)
        dK_dX2 = self._clean_dK_dX(X, X2, dimX2)
        dK2_dXdX2 = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
        if self.ARD:
            g = []
-            for diml in range(X.shape[1]):
+            for diml in range(self.input_dim):
-                tmp = -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*(dist[:,:,diml]**2)*lengthscale2inv[dimX]*lengthscale2inv[dimX2]*(lengthscaleinv[diml]**3)
+                term = (dist[diml]**2)*(lengthscaleinv[diml]**3)*dK2_dXdX2
                if dimX == dimX2:
                    tmp += K*lengthscale2inv[dimX]*(lengthscaleinv[diml]**3)*(dist[:,:,diml]**2)
                if diml == dimX:
-                    tmp += 2.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX2]*(lengthscaleinv[dimX]**3)
+                    term -= 2*dist[dimX]*(lengthscaleinv[dimX]**3)*dK_dX2
                if diml == dimX2:
-                    tmp += 2.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX]*(lengthscaleinv[dimX2]**3)
+                    term -= 2*dist[dimX2]*(lengthscaleinv[dimX2]**3)*dK_dX
-                    if dimX == dimX2:
+                if diml == dimX == dimX2:
-                        tmp += -2.*K*(lengthscaleinv[dimX]**3)
+                    term -= 2*(lengthscaleinv[dimX]**3)*K
-                g += [tmp]
+                g += [term]
        else:
-            g = -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*np.sum(dist**2, axis=2)*(lengthscaleinv[dimX]**7) +4*K*dist[:,:,dimX]*dist[:,:,dimX2]*(lengthscaleinv[dimX]**5)
+            term = np.sum(dist**2, axis=0)*dK2_dXdX2
            term -= 4*dist[dimX2]*dK_dX
            if dimX == dimX2:
-                g += -2.*K*(lengthscaleinv[dimX]**3) + K*(lengthscaleinv[dimX]**5)*np.sum(dist**2, axis=2)
+                term -= 2*K
            g = (lengthscaleinv[0]**3)*term
        return g
    def __getstate__(self):
--- a/GPy/kern/src/standard_periodic.py
+++ b/GPy/kern/src/standard_periodic.py
@ -122,7 +122,6 @@ class StdPeriodic(Kern):
        pass
    def K(self, X, X2=None):
        """Compute the covariance matrix between X and X2."""
        if X2 is None:
@ -133,13 +132,372 @@ class StdPeriodic(Kern):
        return self.variance * exp_dist
    def Kdiag(self, X):
        """Compute the diagonal of the covariance matrix associated to X."""
        ret = np.empty(X.shape[0])
        ret[:] = self.variance
        return ret
    def dK_dX(self, X, X2, dimX):
        """
        Compute the derivative of K with respect to:
            dimension dimX of set X.
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
        periodinv = (np.ones(X.shape[1])/(self.period))[dimX]
        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
        dist = X[:,None,dimX] - X2[None,:,dimX]
        base = np.pi*periodinv*dist
        return -F*np.sin(2*base)*self._clean_K(X, X2)
    def dK_dXdiag(self, X, dimX):
        """
        Compute the derivative of K with respect to:
            dimension dimX of set X.
        Returns only diagonal elements.
        """
        return np.zeros(X.shape[0])
    def dK_dX2(self, X, X2, dimX2):
        """
        Compute the derivative of K with respect to:
            dimension dimX2 of set X2.
        """
        return -self._clean_dK_dX(X, X2, dimX2)
    def dK_dX2diag(self, X, dimX2):
        """
        Compute the derivative of K with respect to:
            dimension dimX2 of set X2.
        Returns only diagonal elements.
        """
        return np.zeros(X.shape[0])
    def dK2_dXdX2(self, X, X2, dimX, dimX2):
        """
        Compute the second derivative of K with respect to:
            dimension dimX of set X, and
            dimension dimX2 of set X2.
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
        dist = X[:,None,dimX2] - X2[None,:,dimX2]
        base = np.pi*periodinv*dist
        term = np.sin(2*base)*self._clean_dK_dX(X, X2, dimX)
        if dimX == dimX2:
            term += 2*np.pi*periodinv*np.cos(2*base)*self._clean_K(X, X2)
        return F*term
    def dK2_dXdX2diag(self, X, dimX, dimX2):
        """
        Compute the second derivative of K with respect to:
            dimension dimX of set X, and
            dimension dimX2 of set X2.
        Returns only diagonal elements.
        """
        if dimX == dimX2:
            lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
            periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
            return (np.pi**2)*(lengthscaleinv**2)*(periodinv**2)*self.variance*np.ones(X.shape[0])
        else:
            return np.zeros(X.shape[0])
    def dK2_dXdX(self, X, X2, dimX_0, dimX_1):
        """
        Compute the second derivative of K with respect to:
            dimension dimX_0 of set X, and
            dimension dimX_1 of set X.
        """
        return -self._clean_dK2_dXdX2(X, X2, dimX_0, dimX_1)
    def dK2_dXdXdiag(self, X, dimX_0, dimX_1):
        """
        Compute the second derivative of K with respect to:
            dimension dimX_0 of set X, and
            dimension dimX_1 of set X.
        Returns only diagonal elements.
        """
        return -self._clean_dK2_dXdX2diag(X, dimX_0, dimX_1)
    def dK3_dXdXdX2(self, X, X2, dimX_0, dimX_1, dimX2):
        """
        Compute the third derivative of K with respect to:
            dimension dimX_0 of set X,
            dimension dimX_1 of set X, and
            dimension dimX2 of set X2.
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
        dist = X[:,None,dimX2] - X2[None,:,dimX2]
        base = np.pi*periodinv*dist
        term = np.sin(2*base)*self._clean_dK2_dXdX(X, X2, dimX_0, dimX_1)
        if dimX_0 == dimX2:
            term += 2*np.pi*periodinv*np.cos(2*base)*self._clean_dK_dX(X, X2, dimX_1)
        if dimX_1 == dimX2:
            term += 2*np.pi*periodinv*np.cos(2*base)*self._clean_dK_dX(X, X2, dimX_0)
        if dimX_0 == dimX_1 == dimX2:
            term -= 4*(np.pi**2)*(periodinv**2)*np.sin(2*base)*self._clean_K(X, X2)
        return F*term
    def dK3_dXdXdX2diag(self, X, dimX_0, dimX_1, dimX2):
        """
        Compute the third derivative of K with respect to:
            dimension dimX_0 of set X,
            dimension dimX_1 of set X, and
            dimension dimX2 of set X2.
        Returns only diagonal elements of the covariance matrix.
        """
        return np.zeros(X.shape[0])
    def dK_dvariance(self, X, X2):
        """
        Compute the derivative of K with respect to variance.
        """
        return self._clean_K(X, X2)/self.variance
    def dK_dlengthscale(self, X, X2):
        """
        Compute the derivative(s) of K with respect to lengthscale(s).
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
        periodinv = (np.ones(X.shape[1])/(self.period))
        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
        base = np.pi*periodinv[:,None,None]*dist
        K = self._clean_K(X, X2)
        if self.ARD2:
            g = []
            for diml in range(self.input_dim):
                g += [(lengthscaleinv[diml]**3)*np.square(np.sin(base[diml]))*K]
        else:
            g = (lengthscaleinv[0]**3)*np.sum(np.square(np.sin(base)), axis=0)*K
        return g
    def dK_dperiod(self, X, X2):
        """
        Compute the derivative(s) of K with respect to period(s).
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
        periodinv = (np.ones(X.shape[1])/(self.period))
        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
        base = np.pi*periodinv[:,None,None]*dist
        K = self._clean_K(X, X2)
        if self.ARD1:
            g = []
            for diml in range(self.input_dim):
                g += [0.5*base[diml]*(lengthscaleinv[diml]**2)*periodinv[diml]*np.sin(2*base[diml])*K]
        else:
            g = 0.5*periodinv[0]*np.sum(base*(lengthscaleinv**2)[:,None,None]*np.sin(2*base), axis=0)*K
        return g
    def dK2_dvariancedX(self, X, X2, dimX):
        """
        Compute the second derivative of K with respect to:
            variance, and
            dimension dimX of set X.
        """
        return self._clean_dK_dX(X, X2, dimX)/self.variance
    def dK2_dvariancedX2(self, X, X2, dimX2):
        """
        Compute the second derivative of K with respect to:
            variance, and
            dimension dimX2 of set X2.
        """
        return -self.dK2_dvariancedX(X, X2, dimX2)
    def dK2_dlengthscaledX(self, X, X2, dimX):
        """
        Compute the second derivative(s) of K with respect to:
            lengthscale(s), and
            dimension dimX of set X.
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
        periodinv = (np.ones(X.shape[1])/(self.period))[dimX]
        dist = X[:,None,dimX] - X2[None,:,dimX]
        base = np.pi*periodinv*dist
        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
        K = self._clean_K(X, X2)
        dK_dl = self.dK_dlengthscale(X, X2)
        if self.ARD2:
            g = []
            for diml in range(self.input_dim):
                term = dK_dl[diml]
                if diml == dimX:
                    term -= 2*lengthscaleinv*K
                g += [-F*np.sin(2*base)*term]
        else:
            g = -F*np.sin(2*base)*(dK_dl - 2*lengthscaleinv*K)
        return g
    def dK2_dlengthscaledX2(self, X, X2, dimX2):
        """
        Compute the second derivative(s) of K with respect to:
            lengthscale(s), and
            dimension dimX2 of set X2.
        """
        dK2_dldX = self.dK2_dlengthscaledX(X, X2, dimX2)
        if self.ARD2:
            return [-1*g for g in dK2_dldX]
        else:
            return -1*dK2_dldX
    def dK2_dperioddX(self, X, X2, dimX):
        """
        Compute the second derivative(s) of K with respect to:
            period(s), and
            dimension dimX of set X.
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
        periodinv = (np.ones(X.shape[1])/(self.period))[dimX]
        dist = X[:,None,dimX] - X2[None,:,dimX]
        base = np.pi*periodinv*dist
        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
        K = self._clean_K(X, X2)
        dK_dT = self.dK_dperiod(X, X2)
        if self.ARD1:
            g = []
            for dimT in range(self.input_dim):
                term = np.sin(2*base)*dK_dT[dimT]
                if dimT == dimX:
                    term -= periodinv*(np.sin(2*base)+2*base*np.cos(2*base))*K
                g += [-F*term]
        else:
            term = np.sin(2*base)*dK_dT
            term -= periodinv*(np.sin(2*base)+2*base*np.cos(2*base))*K
            g = -F*term
        return g
    def dK2_dperioddX2(self, X, X2, dimX2):
        """
        Compute the second derivative(s) of K with respect to:
            period(s), and
            dimension dimX2 of set X2.
        """
        dK2_dperioddX = self.dK2_dperioddX(X, X2, dimX2)
        if self.ARD1:
            return [-1*g for g in dK2_dperioddX]
        else:
            return -1*dK2_dperioddX
    def dK3_dvariancedXdX2(self, X, X2, dimX, dimX2):
        """
        Compute the third derivative of K with respect to:
            variance,
            dimension dimX of set X, and
            dimension dimX2 of set X2.
        """
        return self._clean_dK2_dXdX2(X, X2, dimX, dimX2)/self.variance
    def dK3_dlengthscaledXdX2(self, X, X2, dimX, dimX2):
        """
        Compute the third derivative(s) of K with respect to:
            lengthscale(s),
            dimension dimX of set X, and
            dimension dimX2 of set X2.
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
        dist = X[:,None,dimX2] - X2[None,:,dimX2]
        base = np.pi*periodinv*dist
        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
        dK2_dXdX2 = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
        dK_dl = self.dK_dlengthscale(X, X2)
        dK2_dldX = self.dK2_dlengthscaledX(X, X2, dimX)
        if self.ARD2:
            g = []
            for diml in range(self.input_dim):
                term = np.sin(2*base)*dK2_dldX[diml]
                if dimX == dimX2:
                    term += 2*np.pi*periodinv*np.cos(2*base)*dK_dl[diml]
                term *= F
                if diml == dimX2:
                    term -= 2*lengthscaleinv*dK2_dXdX2
                g += [term]
        else:
            term = np.sin(2*base)*dK2_dldX
            if dimX == dimX2:
                term += 2*np.pi*periodinv*np.cos(2*base)*dK_dl
            term *= F
            term -= 2*lengthscaleinv*dK2_dXdX2
            g = term
        return g
    def dK3_dperioddXdX2(self, X, X2, dimX, dimX2):
        """
        Compute the third derivative(s) of K with respect to:
            period(s),
            dimension dimX of set X, and
            dimension dimX2 of set X2.
        """
        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
        dist = X[:,None,dimX2] - X2[None,:,dimX2]
        base = np.pi*periodinv*dist
        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
        K = self._clean_K(X, X2)
        dK_dX = self._clean_dK_dX(X, X2, dimX)
        dK2_dXdX2 = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
        dK_dT = self.dK_dperiod(X, X2)
        dK2_dTdX = self.dK2_dperioddX(X, X2, dimX)
        if self.ARD1:
            g = []
            for dimT in range(self.input_dim):
                term = np.sin(2*base)*dK2_dTdX[dimT]
                if dimT == dimX2:
                    term -= 2*periodinv*np.cos(2*base)*base*dK_dX
                if dimX == dimX2:
                    term += 2*np.pi*periodinv*np.cos(2*base)*dK_dT[dimT]
                if dimX == dimX2 == dimT:
                    term += 2*np.pi*(periodinv**2)*(2*base*np.sin(2*base)-np.cos(2*base))*K
                term *= F
                if dimT == dimX2:
                    term -= periodinv*dK2_dXdX2
                g += [term]
        else:
            term = np.sin(2*base)*dK2_dTdX-2*periodinv*base*np.cos(2*base)*dK_dX
            if dimX == dimX2:
                term += 2*np.pi*periodinv*(np.cos(2*base)*dK_dT+periodinv*(2*base*np.sin(2*base)-np.cos(2*base))*K)
            g = F*term-periodinv*dK2_dXdX2
        return g
    def update_gradients_full(self, dL_dK, X, X2=None):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None:
@ -167,12 +525,52 @@ class StdPeriodic(Kern):
        else: # same lengthscales
            self.lengthscale.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)
    def update_gradients_direct(self, dL_dVar, dL_dPer, dL_dLen):
        self.variance.gradient = dL_dVar
        self.period.gradient = dL_dPer
        self.lengthscale.gradient = dL_dLen
    def reset_gradients(self):
        self.variance.gradient = 0.
        if not self.ARD1:
            self.period.gradient = 0.
        else:
            self.period.gradient = np.zeros(self.input_dim)
        if not self.ARD2:
            self.lengthscale.gradient = 0.
        else:
            self.lengthscale.gradient = np.zeros(self.input_dim)
    def update_gradients_diag(self, dL_dKdiag, X):
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
        self.variance.gradient = np.sum(dL_dKdiag)
        self.period.gradient = 0
        self.lengthscale.gradient = 0
    def dgradients(self, X, X2):
        g1 = self.dK_dvariance(X, X2)
        g2 = self.dK_dperiod(X, X2)
        g3 = self.dK_dlengthscale(X, X2)
        return [g1, g2, g3]
    def dgradients_dX(self, X, X2, dimX):
        g1 = self.dK2_dvariancedX(X, X2, dimX)
        g2 = self.dK2_dperioddX(X, X2, dimX)
        g3 = self.dK2_dlengthscaledX(X, X2, dimX)
        return [g1, g2, g3]
    def dgradients_dX2(self, X, X2, dimX2):
        g1 = self.dK2_dvariancedX2(X, X2, dimX2)
        g2 = self.dK2_dperioddX2(X, X2, dimX2)
        g3 = self.dK2_dlengthscaledX2(X, X2, dimX2)
        return [g1, g2, g3]
    def dgradients2_dXdX2(self, X, X2, dimX, dimX2):
        g1 = self.dK3_dvariancedXdX2(X, X2, dimX, dimX2)
        g2 = self.dK3_dperioddXdX2(X, X2, dimX, dimX2)
        g3 = self.dK3_dlengthscaledXdX2(X, X2, dimX, dimX2)
        return [g1, g2, g3]
    def gradients_X(self, dL_dK, X, X2=None):
        K = self.K(X, X2)
        if X2 is None:
@ -185,4 +583,4 @@ class StdPeriodic(Kern):
        return np.zeros(X.shape)
    def input_sensitivity(self, summarize=True):
-        return self.variance*np.ones(self.input_dim)/self.lengthscale**2
+        return self.variance*np.ones(self.input_dim)/self.lengthscale**2
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@ -306,7 +306,12 @@ class Stationary(Kern):
        l4 =  np.ones(X.shape[1])*self.lengthscale**2
        return dL_dK_diag * (np.eye(X.shape[1]) * -self.dK2_drdr_diag()/(l4))[None, :,:]# np.zeros(X.shape+(X.shape[1],))
        #return np.ones(X.shape) * d2L_dK * self.variance/self.lengthscale**2 # np.zeros(X.shape)
-    
+
    def dgradients(self, X, X2):
        g1 = self.dK_dvariance(X, X2)
        g2 = self.dK_dlengthscale(X, X2)
        return [g1, g2]
    def dgradients_dX(self, X, X2, dimX):
        g1 = self.dK2_dvariancedX(X, X2, dimX)
        g2 = self.dK2_dlengthscaledX(X, X2, dimX)
--- a/GPy/models/multioutput_gp.py
+++ b/GPy/models/multioutput_gp.py
@ -9,6 +9,7 @@ from ..core.mapping import Mapping
 from .. import likelihoods
 from ..likelihoods.gaussian import Gaussian
 from .. import kern
 from ..kern import DiffKern
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
 from ..util.normalizer import Standardize
 from .. import util
@ -69,39 +70,80 @@ class MultioutputGP(GP):
            if Y_metadata is None:
                Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
        return super(MultioutputGP, self).predict_quantiles(X, quantiles, Y_metadata, kern, likelihood)
    def predictive_gradients(self, Xnew, kern=None):
        if isinstance(Xnew, list):
            Xnew, _, ind  = util.multioutput.build_XY(Xnew, None)
            #if Y_metadata is None:
                #Y_metadata={'output_index': ind}
        return super(MultioutputGP, self).predictive_gradients(Xnew, kern)
-    def predictive_gradients(self, Xnew, kern=None): #XNEW IS NOT A LIST!!
+    def predictive_gradients(self, Xnew, kern=None):
        """
-        Compute the derivatives of the predicted latent function with respect to X*
+        Compute the derivatives of the predicted latent function with respect
        to X*
        Given a set of points at which to predict X* (size [N*,Q]), compute the
        derivatives of the mean and variance. Resulting arrays are sized:
-         dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
+            dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP
-        Note that this is not the same as computing the mean and variance of the derivative of the function!
+            (usually one).
        Note that this is not the same as computing the mean and variance of
        the derivative of the function!
         dv_dX*  -- [N*, Q],    (since all outputs have the same variance)
        :param X: The points at which to get the predictive gradients
        :type X: np.ndarray (Xnew x self.input_dim)
        :returns: dmu_dX, dv_dX
        :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q) ]
        """
        if isinstance(Xnew, list):
            Xnew, _, ind  = util.multioutput.build_XY(Xnew, None)
        slices = index_to_slices(Xnew[:,-1])
-        
+
        for i in range(len(slices)):
            if ((self.kern.kern[i].name == 'diffKern' ) and len(slices[i])>0):
                assert 0, "It is not (yet) possible to predict gradients of gradient observations, sorry :)"
        if kern is None:
            kern = self.kern
        if all([(isinstance(k, DiffKern)) for k in self.kern.kern[1:]]):
            """
            Compute the gradients of the predicted latent function and predicted
            partial derivatives with respect to X*.
            This works only for models that observe the gradient of the latent function.
            Xnew is given as a list of arrays, where each array X*_i (size [N_i*, Q])
            contains points at which to compute gradients for each predicted latent
            function or partial derivative.
            Resulting arrays are sized [sum_i^D : N_i*, Q]
            Passing a list of only one array [X*] returns only gradients of
            the predicted latent function and does not compute gradients of
            predicted partial derivatives.
            In this case the resulting arrays are sized [N*, Q].
            :param Xnew: points at which to compute predictive gradients
            :type Xnew: list
            :type Xnew[i]: np.darray (sum_i^D : N_i*, Q)
            :returns: dmu_dX, dv_dX
            :rtype: (np.ndarray (sum_i^D : N_i*, Q), np.ndarray (sum_i^D : N_i*, Q))
            """
            dims = Xnew.shape[1] - 1
            mean_jac = np.empty((Xnew.shape[0], dims))
            var_jac = np.empty((Xnew.shape[0], dims))
            X = self._predictive_variable
            alpha = self.posterior.woodbury_vector
            Wi = self.posterior.woodbury_inv
            k = kern.K(Xnew, X)
            for dimX in range(dims):
                dk_dx = kern.dK_dX(Xnew, X, dimX)
                dk_dxdiag = kern.dK_dXdiag(Xnew, dimX)
                mean_jac[:,dimX] = np.dot(dk_dx, alpha).flatten()
                var_jac[:,dimX] = dk_dxdiag - 2*(np.dot(k, Wi)*dk_dx).sum(-1)
            return mean_jac, var_jac
        mean_jac = np.empty((Xnew.shape[0],Xnew.shape[1]-1,self.output_dim))
        for i in range(self.output_dim):
            mean_jac[:,:,i] = kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self._predictive_variable)[:,0:-1]
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@ -5,6 +5,8 @@ from __future__ import division
 import unittest
 import numpy as np
 import GPy
 from GPy.models import GradientChecker
 from functools import reduce
 class MiscTests(unittest.TestCase):
    def setUp(self):
@ -1208,40 +1210,6 @@ class GradientTests(np.testing.TestCase):
        with self.assertRaises(RuntimeError):
            m._raw_posterior_covariance_between_points(np.array([[1], [2]]), np.array([[3], [4]]))
    def test_multioutput_model_with_derivative_observations(self):
        f = lambda x: np.sin(x)+0.1*(x-2.)**2-0.005*x**3
        fd = lambda x: np.cos(x)+0.2*(x-2.)-0.015*x**2
        N=10
        M=10
        sigma=0.05
        sigmader=0.05
        x = np.array([np.linspace(1,10,N)]).T
        y = f(x) + np.array(sigma*np.random.normal(0,1,(N,1)))
        xd = np.array([np.linspace(2,8,M)]).T
        yd = fd(xd) + np.array(sigmader*np.random.normal(0,1,(M,1)))
        # squared exponential kernel:
        se = GPy.kern.RBF(input_dim = 1, lengthscale=1.5, variance=0.2)
        # We need to generate separate kernel for the derivative observations and give the created kernel as an input:
        se_der = GPy.kern.DiffKern(se, 0)
        #Then 
        gauss = GPy.likelihoods.Gaussian(variance=sigma**2)
        gauss = GPy.likelihoods.Gaussian(variance=0.1)
        gauss_der = GPy.likelihoods.Gaussian(variance=sigma**2)
        # Then create the model, we give everything in lists, the order of the inputs indicates the order of the outputs
        # Now we have the regular observations first and derivative observations second, meaning that the kernels and
        # the likelihoods must follow the same order
        m = GPy.models.MultioutputGP(X_list=[x, xd], Y_list=[y, yd], kernel_list=[se, se_der], likelihood_list = [gauss, gauss])
        m.randomize()
        self.assertTrue(m.checkgrad())
        m.optimize(messages=0, ipython_notebook=False)
        self.assertTrue(m.checkgrad())
    def test_multioutput_model_with_ep(self):
        f = lambda x: np.sin(x)+0.1*(x-2.)**2-0.005*x**3
@ -1308,6 +1276,173 @@ class GradientTests(np.testing.TestCase):
        c2 = model.predict(x, full_cov=True)[1]
        np.testing.assert_allclose(c1,c2)
 class GradientMultioutputGPModelTests(np.testing.TestCase):
    def setUp(self):
        # standard test function
        self.period = 3
        self.w = 2*np.pi/self.period
        self.f = lambda x: np.sum(np.square(np.sin(self.w*x)), axis=1)
        self.df = lambda x: self.w*np.sin(2*self.w*x)
        self.noise_std = 1e-2
        self.bounds = (-self.period, self.period)
        self.train_points = 5
        self.test_points = 25
    def approximate_predictive_gradients(self, model, x_test, D, step=1e-6):
        '''
        Approximates gradients of predicted posterior means and variances.
        This function is used as the frameworks for GradientChecker and
        MultioutputGP do not easily combine when checking gradients of predicted
        partial derivative posteriors.
        '''
        dmdx_aprx = np.zeros((x_test.shape[0]*(D + 1), D))
        dvdx_aprx = np.zeros((x_test.shape[0]*(D + 1), D))
        for d in range(D):
            x_over = x_test.copy()
            x_over[:,d] += step
            x_undr = x_test.copy()
            x_undr[:,d] -= step
            m_over, v_over = model.predict([x_over]*(D + 1))
            m_undr, v_undr = model.predict([x_undr]*(D + 1))
            dmdx_aprx[:,d,None] = (m_over - m_undr)/(2*step)
            dvdx_aprx[:,d,None] = (v_over - v_undr)/(2*step)
        return dmdx_aprx, dvdx_aprx
    def check_model(self, kern):
        '''
        Checks predictions, hyperparameter gradients, and gradients of predicted
        posterior means and variances for MultioutputGP models that incorporate
        observed latent function gradient information.
        '''
        D = kern.input_dim
        X_list = []
        Y_list = []
        for i in range(D + 1):
            # sample inputs for either latent function or partial derivatives
            X_i = np.random.uniform(*self.bounds, size=(self.train_points, D))
            # output of latent function or partial derivatives
            Y_i = (self.f(X_i) if (i == 0) else self.df(X_i)[:,i - 1])[:,None]
            # noisy observations
            Y_i += np.random.normal(scale=self.noise_std, size=Y_i.shape)
            X_list.append(X_i)
            Y_list.append(Y_i)
        # the kernel is accompanied with derivative kernels, one for each dimension
        kernel_list = [kern] + [GPy.kern.DiffKern(kern, d) for d in range(D)]
        # create model and check its hyperparameter gradient
        likelihood_list = [GPy.likelihoods.Gaussian(variance=self.noise_std**2)]*(D + 1)
        model = GPy.models.MultioutputGP(X_list, Y_list, kernel_list, likelihood_list)
        model.likelihood.constrain_fixed()
        self.assertTrue(model.checkgrad(step=1e-3))
        # optimize the model, and check its hyperparameter gradient again
        model.optimize()
        self.assertTrue(model.checkgrad(step=1e-3))
        # check predictions
        np.testing.assert_allclose(model.predict(X_list)[0], model.Y, atol=3*self.noise_std)
        # test inputs for checking predictive gradients
        x_test = np.random.uniform(*self.bounds, size=(self.test_points, D))
        # predictive gradients
        dmdx, dvdx = model.predictive_gradients([x_test]*(D + 1))
        # approximated predictive gradients
        dmdx_aprx, dvdx_aprx = self.approximate_predictive_gradients(model, x_test, D, step=1e-3)
        # check predictive gradients
        np.testing.assert_allclose(dmdx, dmdx_aprx, atol=3*self.noise_std)
        np.testing.assert_allclose(dvdx, dvdx_aprx, atol=3*self.noise_std)
    def test_MultioutputGP_gradobs_RBF(self):
        '''
        Testing gradient observing MultioutputGP model with an RBF kernel.
        '''
        for D in range(1, 4):
            kern = GPy.kern.RBF(input_dim=D)
            kern.randomize()
            self.check_model(kern)
    def test_MultioutputGP_gradobs_RBF_ARD(self):
        '''
        Testing gradient observing MultioutputGP model with an RBF (ARD) kernel.
        '''
        for D in range(1, 4):
            kern = GPy.kern.RBF(input_dim=D, ARD=True)
            kern.randomize()
            self.check_model(kern)
    def test_MultioutputGP_gradobs_StdP(self):
        '''
        Testing gradient observing MultioutputGP model with a StdP kernel.
        '''
        for D in range(1, 4):
            kern = GPy.kern.StdPeriodic(input_dim=D, period=self.period)
            kern.period.constrain_fixed()
            kern.randomize()
            self.check_model(kern)
    def test_MultioutputGP_gradobs_StdP_ARD(self):
        '''
        Testing gradient observing MultioutputGP model with a StdP (ARD) kernel.
        '''
        for D in range(1, 4):
            kern = GPy.kern.StdPeriodic(input_dim=D, period=[self.period]*D, ARD1=True, ARD2=True)
            kern.period.constrain_fixed()
            kern.randomize()
            self.check_model(kern)
    def test_MultioutputGP_gradobs_prod_RBF(self):
        '''
        Testing gradient observing MultioutputGP model with several RBF kernels.
        '''
        for D in range(2, 4):
            kerns = [GPy.kern.RBF(input_dim=1) for d in range(D)]
            kern = reduce(lambda k0, k1: k0 * k1, kerns)
            kern.randomize()
            self.check_model(kern)
    def test_MultioutputGP_gradobs_prod_StdP(self):
        '''
        Testing gradient observing MultioutputGP model with several StdP kernels.
        '''
        for D in range(2, 4):
            kerns = [GPy.kern.StdPeriodic(input_dim=1, period=self.period) for d in range(D)]
            kern = reduce(lambda k0, k1: k0 * k1, kerns)
            [k.period.constrain_fixed() for k in kern.parts]
            kern.randomize()
            self.check_model(kern)
    def test_MultioutputGP_gradobs_prod_mix(self):
        '''
        Testing gradient observing MultioutputGP model with a mix of kernel types.
        '''
        for D in range(2, 4):
            kerns = []
            for d in range(D):
                if d % 2 == 0:
                    k = GPy.kern.RBF(input_dim=1)
                else:
                    k = GPy.kern.StdPeriodic(input_dim=1, period=self.period)
                    k.period.constrain_fixed()
                kerns.append(k)
            kern = reduce(lambda k0, k1: k0 * k1, kerns)
            kern.randomize()
            self.check_model(kern)
 def _create_missing_data_model(kernel, Q):
    D1, D2, D3, N, num_inducing = 13, 5, 8, 400, 3