Enhance multioutput grad obs (#995)

* multiplied RBF kernels can now be used with gradient observations * standard periodic kernels can now be used with gradient observations * predictive gradients (derivatives of posterior means and variances) can now be calculated when using gradient observations * simplified and commented RBF & StdP kernel derivatives * updated kernel slicing and commented prod kernel derivatives * removed caching from stdp kern, as it breaks optimization for some reason * fixed hyperparameter optimization for prod kernel * improved code readability * added unit tests for gradient observing MultioutputGP models * added predictions check to unit tests * bugfix for multioutput_kern * improved testing coverage * reduced size of some tests; led to an issue in an unrelated test * updated testing * added gradient MultioutputGP prod kernel example * added keywords and plotting to example
2026-05-21 14:05:14 +02:00 · 2023-09-21 16:53:42 +03:00 · 2023-09-21 16:53:42 +03:00 · 9c1db7aa34
commit 9c1db7aa34
parent 2c22f1e9c5
11 changed files with 1494 additions and 164 deletions
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -771,3 +771,117 @@ def multioutput_gp_with_derivative_observations(plot=True):
    mu, var = m.predict_noiseless(Xnew=[xpred, np.empty((0, 1))])

    return m
+
+def multioutput_gp_with_derivative_observations_2D(optimize=True, plot=False):
+    '''
+    This in an example on how to use a MultioutputGP model with gradient
+    observations and multiple single-dimensional kernels of differing types.
+    '''
+
+    period = 3
+    w = 2*np.pi/period # angular frequency
+    bounds = (-period, period)
+
+    # latent function and gradient
+    f = lambda x: (np.exp(-x[:,0]**2) + np.cos(w*x[:,1]))[:,None]
+    df = lambda x: np.array([-2*np.exp(-x[:,0]**2)*x[:,0], -w*np.sin(w*x[:,1])]).T
+
+    # 2D input grid
+    ppa = 25 # points per axis
+    x = np.linspace(*bounds, ppa)
+    xx, yy = np.meshgrid(x, x)
+    grid = np.array([xx.reshape(-1), yy.reshape(-1)]).T
+
+    fgrid = f(grid)
+    dfgrid = df(grid)
+
+    # 10 random training points generated with a space-filling sobol sequence
+    X = np.array([
+        [ 0.50421399,  2.1331483 ],
+        [-2.15717152, -1.70295936],
+        [-1.46704334,  1.37111521],
+        [ 2.79064536, -0.9649018 ],
+        [ 1.60728264,  0.27702713],
+        [-0.30712366, -0.57372129],
+        [-2.6140632 ,  2.49192488],
+        [ 0.89078772, -2.85873686],
+        [ 1.15813136,  0.96910322],
+        [-2.83307021, -1.38155383]
+    ])
+
+    # Note!
+    # This example uses the same inputs for function and gradient observations.
+
+    noise_std = 1e-2
+    # function observations
+    Y = f(X) + np.random.normal(scale=noise_std, size=(len(X), 1))
+    # gradient observations
+    dY = df(X) + np.random.normal(scale=noise_std, size=(len(X), 2))
+
+    # gather inputs and observations into lists
+    X_list = [X, X, X]
+    # once for function observations, and once for each partial derivative
+    # make sure all arrays are of shape (N x dims), where N is # of training points
+    Y_list = [Y, dY[:,0,None], dY[:,1,None]]
+
+    # create a kernel that is the product of two one-dimensional kernels
+    # the first kernel is an RBF kernel
+    kern0 = GPy.kern.RBF(input_dim=1, active_dims=[0])
+    # as the function is periodic in the second dimension, we use a StdP kernel
+    kern1 = GPy.kern.StdPeriodic(input_dim=1, active_dims=[1], period=period)
+    kern1.period.constrain_fixed()
+    # the kernels can be multiplied together into a product kernel
+    kern = kern0 * kern1
+
+    # with gradient observations, we need to define a DiffKern for each dimension
+    # the DiffKern is given the main kernel as a base kernel
+    diffkern0 = GPy.kern.DiffKern(kern, 0)
+    diffkern1 = GPy.kern.DiffKern(kern, 1)
+
+    # gather the main kernel and diffkerns into a list
+    kern_list = [kern, diffkern0, diffkern1]
+
+    # define a likelihood and repeat it in a list
+    likelihood_list = [GPy.likelihoods.Gaussian(variance=noise_std**2)]*3
+
+    # create the MultioutputGP model and optimize
+    model = GPy.models.MultioutputGP(X_list, Y_list, kern_list, likelihood_list)
+    model.likelihood.constrain_fixed()
+    if optimize:
+        model.optimize()
+
+    # make function predictions
+    Xnew, _, ind = GPy.util.multioutput.build_XY([grid], index=[0])
+    Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
+
+    mu, var = model.predict(Xnew, Y_metadata=Y_metadata)
+
+    # make gradient predictions
+    Xnew, _, ind = GPy.util.multioutput.build_XY([grid]*2, index=[1, 2])
+    Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
+
+    mu_d, var_d = model.predict(Xnew, Y_metadata=Y_metadata)
+
+    mu_d = np.array([mu_d[:len(grid)], mu_d[len(grid):]]).T[0]
+    var_d = np.array([var_d[:len(grid)], var_d[len(grid):]]).T[0]
+
+    if plot and MPL_AVAILABLE:
+        fig, axs = plt.subplots(1, 3)
+        for ax in axs: ax.set_box_aspect(1)
+        axs[0].set_title('true f')
+        axs[0].contourf(xx, yy, fgrid.reshape(ppa, ppa), levels=25)
+        axs[1].set_title('true df1')
+        axs[1].contourf(xx, yy, dfgrid[:,0].reshape(ppa, ppa), levels=25)
+        axs[2].set_title('true df2')
+        axs[2].contourf(xx, yy, dfgrid[:,1].reshape(ppa, ppa), levels=25)
+
+        fig, axs = plt.subplots(1, 3)
+        for ax in axs: ax.set_box_aspect(1)
+        axs[0].set_title('pred f')
+        axs[0].contourf(xx, yy, mu.reshape(ppa, ppa), levels=25)
+        axs[1].set_title('pred df1')
+        axs[1].contourf(xx, yy, mu_d[:,0].reshape(ppa, ppa), levels=25)
+        axs[2].set_title('pred df2')
+        axs[2].contourf(xx, yy, mu_d[:,1].reshape(ppa, ppa), levels=25)
+
+    return model
--- a/GPy/kern/src/diff_kern.py
+++ b/GPy/kern/src/diff_kern.py
@ -23,24 +23,42 @@ class DiffKern(Kern):
        self.base_kern.parameters_changed()

    @Cache_this(limit=3, ignore_args=())
-    def K(self, X, X2=None, dimX2 = None): #X in dimension self.dimension
+    def K(self, X, X2=None, dimX2=None): #X in dimension self.dimension
        if X2 is None:
            X2 = X
        if dimX2 is None:
            dimX2 = self.dimension
-        return self.base_kern.dK2_dXdX2(X,X2, self.dimension, dimX2)
- 
+        return self.base_kern.dK2_dXdX2(X, X2, self.dimension, dimX2)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dX(self, X, X2, dimX, dimX2=None):
+        if dimX2 is None:
+            dimX2 = self.dimension
+        return self.base_kern.dK3_dXdXdX2(X, X2, dimX, self.dimension, dimX2)
+
    @Cache_this(limit=3, ignore_args=())
    def Kdiag(self, X):
-        return np.diag(self.base_kern.dK2_dXdX2(X,X, self.dimension, self.dimension))
-    
+        return self.base_kern.dK2_dXdX2diag(X, self.dimension, self.dimension)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dXdiag(self, X, dimX):
+        return self.base_kern.dK3_dXdXdX2diag(X, dimX, self.dimension, self.dimension)
+
    @Cache_this(limit=3, ignore_args=())
    def dK_dX_wrap(self, X, X2): #X in dimension self.dimension
-        return self.base_kern.dK_dX(X,X2, self.dimension)
+        return self.base_kern.dK_dX(X, X2, self.dimension)

    @Cache_this(limit=3, ignore_args=())
    def dK_dX2_wrap(self, X, X2): #X in dimension self.dimension
-        return self.base_kern.dK_dX2(X,X2, self.dimension)
+        return self.base_kern.dK_dX2(X, X2, self.dimension)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dXdX2_wrap(self, X, X2, dimX):
+        return self.base_kern.dK2_dXdX2(X, X2, dimX, self.dimension)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dXdX_wrap(self, X, X2, dimX):
+        return self.base_kern.dK2_dXdX(X, X2, dimX, self.dimension)

    def reset_gradients(self):
        self.base_kern.reset_gradients()
@ -56,33 +74,33 @@ class DiffKern(Kern):
    def update_gradients_full(self, dL_dK, X, X2=None, dimX2=None):
        if dimX2 is None:
            dimX2 = self.dimension
-        gradients = self.base_kern.dgradients2_dXdX2(X,X2,self.dimension,dimX2)
+        gradients = self.base_kern.dgradients2_dXdX2(X, X2, self.dimension, dimX2)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])

    def update_gradients_diag(self, dL_dK_diag, X):
-        gradients = self.base_kern.dgradients2_dXdX2(X,X, self.dimension, self.dimension)
+        gradients = self.base_kern.dgradients2_dXdX2(X, X, self.dimension, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK_diag, gradient, f=np.diag) for gradient in gradients])

    def update_gradients_dK_dX(self, dL_dK, X, X2=None):
        if X2 is None:
            X2 = X
-        gradients = self.base_kern.dgradients_dX(X,X2, self.dimension)
+        gradients = self.base_kern.dgradients_dX(X, X2, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])

    def update_gradients_dK_dX2(self, dL_dK, X, X2=None):
-        gradients = self.base_kern.dgradients_dX2(X,X2, self.dimension)
+        gradients = self.base_kern.dgradients_dX2(X, X2, self.dimension)
        self.base_kern.update_gradients_direct(*[self._convert_gradients(dL_dK, gradient) for gradient in gradients])

    def gradients_X(self, dL_dK, X, X2):
-        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,:, self.dimension]
+        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,:,self.dimension]
        return np.sum(tmp, axis=1)
    
    def gradients_X2(self, dL_dK, X, X2):
-        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:, :, self.dimension, :]
+        tmp = self.base_kern.gradients_XX(dL_dK, X, X2)[:,:,self.dimension,:]
        return np.sum(tmp, axis=1)

-    def _convert_gradients(self, l,g, f = lambda x:x):
+    def _convert_gradients(self, l, g, f=lambda x:x):
        if type(g) is np.ndarray:
            return np.sum(f(l)*f(g))
        else:
-            return np.array([np.sum(f(l)*f(gi)) for gi in g])
+            return np.array([np.sum(f(l)*f(gi)) for gi in g])
--- a/GPy/kern/src/kernel_slice_operations.py
+++ b/GPy/kern/src/kernel_slice_operations.py
@ -22,7 +22,14 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
        put_clean(dct, 'dK_dX', _slice_dK_dX)
        put_clean(dct, 'dK_dX2', _slice_dK_dX)
        put_clean(dct, 'dK2_dXdX2', _slice_dK2_dXdX2)
+        put_clean(dct, 'dK2_dXdX', _slice_dK2_dXdX2)
+        put_clean(dct, 'dK3_dXdXdX2', _slice_dK3_dXdXdX2)
        put_clean(dct, 'Kdiag', _slice_Kdiag)
+        put_clean(dct, 'dK_dXdiag', _slice_dK_dXdiag)
+        put_clean(dct, 'dK_dX2diag', _slice_dK_dXdiag)
+        put_clean(dct, 'dK2_dXdX2diag', _slice_dK2_dXdX2diag)
+        put_clean(dct, 'dK2_dXdXdiag', _slice_dK2_dXdX2diag)
+        put_clean(dct, 'dK3_dXdXdX2diag', _slice_dK3_dXdXdX2diag)
        put_clean(dct, 'phi', _slice_Kdiag)
        put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
        put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
@ -35,9 +42,10 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
        put_clean(dct, 'gradients_XX_diag', _slice_gradients_XX_diag)
        put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)

-        put_clean(dct, 'dgradients_dX',_slice_partial_gradients_list_X)
-        put_clean(dct, 'dgradients_dX2',_slice_partial_gradients_list_X)
-        put_clean(dct, 'dgradients2_dXdX2',_slice_partial_gradients_list_XX)
+        put_clean(dct, 'dgradients', _slice_partial_gradients_list)
+        put_clean(dct, 'dgradients_dX', _slice_partial_gradients_list_X)
+        put_clean(dct, 'dgradients_dX2', _slice_partial_gradients_list_X)
+        put_clean(dct, 'dgradients2_dXdX2', _slice_partial_gradients_list_XX)

        put_clean(dct, 'psi0', _slice_psi)
        put_clean(dct, 'psi1', _slice_psi)
@ -155,6 +163,18 @@ def _slice_dK_dX(f):
        return ret
    return wrap

+def _slice_dK_dXdiag(f):
+    @wraps(f)
+    def wrap(self, X, dim, *a, **kw):
+        with _Slice_wrap(self, X, None) as s:
+            d = s.k._project_dim(dim)
+            if d is None:
+                ret = np.zeros(X.shape[0])
+            else:
+                ret = f(self, s.X, dim, *a, **kw)
+        return ret
+    return wrap
+
 def _slice_dK2_dXdX2(f):
    @wraps(f)
    def wrap(self, X, X2, dimX, dimX2, *a, **kw):
@ -168,6 +188,59 @@ def _slice_dK2_dXdX2(f):
        return ret
    return wrap

+def _slice_dK2_dXdX2diag(f):
+    @wraps(f)
+    def wrap(self, X, dimX, dimX2, *a, **kw):
+        with _Slice_wrap(self, X, None) as s:
+            d = s.k._project_dim(dimX)
+            d2 = s.k._project_dim(dimX2)
+            if (d is None) or (d2 is None):
+                ret = np.zeros(X.shape[0])
+            else:
+                ret = f(self, s.X, d, d2, *a, **kw)
+        return ret
+    return wrap
+
+def _slice_dK3_dXdXdX2(f):
+    @wraps(f)
+    def wrap(self, X, X2, dim, dimX, dimX2, *a, **kw):
+        with _Slice_wrap(self, X, X2) as s:
+            D = s.k._project_dim(dim)
+            d = s.k._project_dim(dimX)
+            d2 = s.k._project_dim(dimX2)
+            if (D is None) or (d is None) or (d2 is None):
+                ret = np.zeros((X.shape[0], X2.shape[0]))
+            else:
+                ret = f(self, s.X, s.X2, D, d, d2, *a, **kw)
+        return ret
+    return wrap
+
+def _slice_dK3_dXdXdX2diag(f):
+    @wraps(f)
+    def wrap(self, X, dim, dimX, dimX2, *a, **kw):
+        with _Slice_wrap(self, X, None) as s:
+            D = s.k._project_dim(dim)
+            d = s.k._project_dim(dimX)
+            d2 = s.k._project_dim(dimX2)
+            if (D is None) or (d is None) or (d2 is None):
+                ret = np.zeros(X.shape[0])
+            else:
+                ret = f(self, s.X, D, d, d2, *a, **kw)
+        return ret
+    return wrap
+
+def _slice_partial_gradients_list(f):
+    @wraps(f)
+    def wrap(self, X, X2):
+        if X2 is None:
+            N, M = X.shape[0], X.shape[0]
+        else:
+            N, M = X.shape[0], X2.shape[0]
+        with _Slice_wrap(self, X, X2, ret_shape=(N, M)) as s:
+            ret = f(self, s.X, s.X2)
+        return ret
+    return wrap
+
 def _slice_partial_gradients_X(f):
    @wraps(f)
    def wrap(self, X, X2, dim):
--- a/GPy/kern/src/multioutput_derivative_kern.py
+++ b/GPy/kern/src/multioutput_derivative_kern.py
@ -7,20 +7,24 @@ import numpy as np
 from functools import partial

 class KernWrapper(Kern):
-    def __init__(self, fk, fug, fg, base_kern):
+    def __init__(self, fk, fdk, fug, fg, base_kern):
        self.fk = fk
+        self.fdk = fdk
        self.fug = fug
        self.fg = fg
        self.base_kern = base_kern
-        super(KernWrapper, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='KernWrapper',useGPU=False)
+        super(KernWrapper, self).__init__(base_kern.active_dims.size, base_kern.active_dims, name='KernWrapper', useGPU=False)

    def K(self, X, X2=None):
-        return self.fk(X,X2=X2)
+        return self.fk(X, X2=X2)
+
+    def dK_dX(self, X, X2, dimX):
+        return self.fdk(X, X2, dimX)
    
-    def update_gradients_full(self,dL_dK, X, X2=None):
+    def update_gradients_full(self, dL_dK, X, X2=None):
        return self.fug(dL_dK, X, X2=X2)
    
-    def gradients_X(self,dL_dK, X, X2=None):
+    def gradients_X(self, dL_dK, X, X2=None):
        return self.fg(dL_dK, X, X2=X2)

    @property
@ -57,28 +61,46 @@ class MultioutputDerivativeKern(MultioutputKern):
        #build covariance structure
        covariance = [[None for i in range(nl)] for j in range(nl)]
        linked = []
-        for i in range(0,nl):
-            unique=True
-            for j in range(0,nl):
-                if i==j or (kernels[i] is kernels[j]):
+        for i in range(0, nl):
+            unique = True
+            for j in range(0, nl):
+                if (i == j) or (kernels[i] is kernels[j]):
                    kern = kernels[i]
-                    if i>j:
-                        unique=False
+                    if i > j:
+                        unique = False
                elif cross_covariances.get((i,j)) is not None: #cross covariance is given
                    kern = cross_covariances.get((i,j))
-                elif kernels[i].name == 'DiffKern' and kernels[i].base_kern == kernels[j]: # one is derivative of other
-                    kern = KernWrapper(kernels[i].dK_dX_wrap,kernels[i].update_gradients_dK_dX,kernels[i].gradients_X, kernels[j])
+                elif (kernels[i].name == 'DiffKern') and (kernels[i].base_kern == kernels[j]): # one is derivative of other
+                    kern = KernWrapper(
+                        kernels[i].dK_dX_wrap,
+                        kernels[i].dK2_dXdX_wrap,
+                        kernels[i].update_gradients_dK_dX,
+                        kernels[i].gradients_X,
+                        kernels[j]
+                        )
                    unique=False
-                elif kernels[j].name == 'DiffKern' and kernels[j].base_kern == kernels[i]: # one is derivative of other
-                    kern = KernWrapper(kernels[j].dK_dX2_wrap,kernels[j].update_gradients_dK_dX2,kernels[j].gradients_X2, kernels[i])
-                elif kernels[i].name == 'DiffKern' and kernels[j].name == 'DiffKern' and kernels[i].base_kern == kernels[j].base_kern: #both are partial derivatives
-                    kern = KernWrapper(partial(kernels[i].K, dimX2=kernels[j].dimension), partial(kernels[i].update_gradients_full, dimX2=kernels[j].dimension),None, kernels[i].base_kern)
-                    if i>j:
-                        unique=False
+                elif (kernels[j].name == 'DiffKern') and (kernels[j].base_kern == kernels[i]): # one is derivative of other
+                    kern = KernWrapper(
+                        kernels[j].dK_dX2_wrap,
+                        kernels[j].dK2_dXdX2_wrap,
+                        kernels[j].update_gradients_dK_dX2,
+                        kernels[j].gradients_X2,
+                        kernels[i]
+                        )
+                elif (kernels[i].name == 'DiffKern') and (kernels[j].name == 'DiffKern') and (kernels[i].base_kern == kernels[j].base_kern): #both are partial derivatives
+                    kern = KernWrapper(
+                        partial(kernels[i].K, dimX2=kernels[j].dimension),
+                        partial(kernels[i].dK_dX, dimX2=kernels[j].dimension),
+                        partial(kernels[i].update_gradients_full, dimX2=kernels[j].dimension),
+                        None,
+                        kernels[i].base_kern
+                        )
+                    if i > j:
+                        unique = False
                else:
                    kern = ZeroKern()
                covariance[i][j] = kern
            if unique is True:
                linked.append(i)
        self.covariance = covariance
-        self.link_parameters(*[kernels[i] for i in linked])
+        self.link_parameters(*[kernels[i] for i in linked])
--- a/GPy/kern/src/multioutput_kern.py
+++ b/GPy/kern/src/multioutput_kern.py
@ -85,21 +85,63 @@ class MultioutputKern(CombinationKernel):
        self.link_parameters(*[kernels[i] for i in linked])
        
    @Cache_this(limit=3, ignore_args=())
-    def K(self, X ,X2=None):
+    def K(self, X, X2=None):
        if X2 is None:
            X2 = X
        slices = index_to_slices(X[:,self.index_dim])
        slices2 = index_to_slices(X2[:,self.index_dim])
+
        target =  np.zeros((X.shape[0], X2.shape[0]))
-        [[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j].K(X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))]  
+        for j in range(len(slices2)):
+            for i in range(len(slices)):
+                for l in range(len(slices2[j])):
+                    for k in range(len(slices[i])):
+                        cov_K = self.covariance[i][j].K(X[slices[i][k],:], X2[slices2[j][l],:])
+                        target.__setitem__((slices[i][k], slices2[j][l]), cov_K)
        return target

    @Cache_this(limit=3, ignore_args=())
-    def Kdiag(self,X):
+    def Kdiag(self, X):
        slices = index_to_slices(X[:,self.index_dim])
        kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
        target = np.zeros(X.shape[0])
-        [[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)]
+        for kern, slices_i in zip(kerns, slices):
+            for s in slices_i:
+                np.copyto(target[s], kern.Kdiag(X[s]))
+        return target
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dX(self, X, X2, dimX):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+        """
+        if X2 is None:
+            X2 = X
+        slices = index_to_slices(X[:,self.index_dim])
+        slices2 = index_to_slices(X2[:,self.index_dim])
+
+        target =  np.zeros((X.shape[0], X2.shape[0]))
+        for j in range(len(slices2)):
+            for i in range(len(slices)):
+                for l in range(len(slices2[j])):
+                    for k in range(len(slices[i])):
+                        cov_dK_dX = self.covariance[i][j].dK_dX(X[slices[i][k],:], X2[slices2[j][l],:], dimX)
+                        target.__setitem__((slices[i][k], slices2[j][l]), cov_dK_dX)
+        return target
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dXdiag(self, X, dimX):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+        """
+        slices = index_to_slices(X[:,self.index_dim])
+        kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
+        target = np.zeros(X.shape[0])
+        for kern, slices_i in zip(kerns, slices):
+            for s in slices_i:
+                np.copyto(target[s], kern.dK_dXdiag(X[s], dimX))
        return target
    
    def _update_gradients_full_wrapper(self, kern, dL_dK, X, X2):
@ -115,19 +157,35 @@ class MultioutputKern(CombinationKernel):
    def reset_gradients(self):
        for kern in self.kern: kern.reset_gradients()

-    def update_gradients_full(self,dL_dK, X, X2=None):
-        self.reset_gradients()
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        if X2 is None:
+            X2 = X
        slices = index_to_slices(X[:,self.index_dim])
-        if X2 is not None:
-            slices2 = index_to_slices(X2[:,self.index_dim])
-            [[[[ self._update_gradients_full_wrapper(self.covariance[i][j], dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
-        else:
-            [[[[ self._update_gradients_full_wrapper(self.covariance[i][j], dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], X[slices[j][l],:]) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
-            
+        slices2 = index_to_slices(X2[:,self.index_dim])
+
+        self.reset_gradients()
+        for j in range(len(slices2)):
+            for i in range(len(slices)):
+                for l in range(len(slices2[j])):
+                    for k in range(len(slices[i])):
+                        self._update_gradients_full_wrapper(
+                            self.covariance[i][j],
+                            dL_dK[slices[i][k],slices2[j][l]],
+                            X[slices[i][k],:],
+                            X2[slices2[j][l],:]
+                            )
+
    def update_gradients_diag(self, dL_dKdiag, X):
-        self.reset_gradients()
        slices = index_to_slices(X[:,self.index_dim])
-        [[ self._update_gradients_diag_wrapper(self.covariance[i][i], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))]
+
+        self.reset_gradients()
+        for i in range(len(slices)):
+            for k in range(len(slices[i])):
+                self._update_gradients_diag_wrapper(
+                    self.covariance[i][i],
+                    dL_dKdiag[slices[i][k]],
+                    X[slices[i][k],:]
+                    )
    
    def gradients_X(self,dL_dK, X, X2=None):
        slices = index_to_slices(X[:,self.index_dim])
@ -137,4 +195,4 @@ class MultioutputKern(CombinationKernel):
            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) ) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
        else:
            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j].gradients_X(dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], (None if (i==j and k==l) else X[slices[j][l],:] )) ) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
-        return target
+        return target
--- a/GPy/kern/src/prod.py
+++ b/GPy/kern/src/prod.py
@ -70,6 +70,310 @@ class Prod(CombinationKernel):
            which_parts = self.parts
        return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))

+    def reset_gradients(self):
+        for part in self.parts:
+            part.reset_gradients()
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK_dX(self, X, X2, dimX, which_parts=None):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
+        for combination in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination) > 0:
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update = list(set(which_parts) - set(combination))[0]
+            prod_sum += prod*to_update.dK_dX(X, X2, dimX)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK_dXdiag(self, X, dimX, which_parts=None):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+
+        Returns only diagonal elements.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros(X.shape[0])
+        for combination in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination) > 0:
+                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update = list(set(which_parts) - set(combination))[0]
+            prod_sum += prod*to_update.dK_dXdiag(X, dimX)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK_dX2(self, X, X2, dimX2, which_parts=None):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX2 of set X2.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
+        for combination in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination) > 0:
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update = list(set(which_parts) - set(combination))[0]
+            prod_sum += prod*to_update.dK_dX2(X, X2, dimX2)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK2_dXdX2(self, X, X2, dimX, dimX2, which_parts=None):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
+        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination1) > 0:
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination1])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update1 = list(set(which_parts) - set(combination1))[0]
+            prod_sum += prod*to_update1.dK2_dXdX2(X, X2, dimX, dimX2)
+            if len(which_parts) > 1:
+                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
+                    if len(combination2) > 0:
+                        prod = reduce(np.multiply, [p.K(X, X2) for p in combination2])
+                    else:
+                        prod = np.ones(prod_sum.shape)
+                    to_update2 = list(set(combination1) - set(combination2))[0]
+                    prod_sum += prod*to_update1.dK_dX(X, X2, dimX)*to_update2.dK_dX2(X, X2, dimX2)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK2_dXdX2diag(self, X, dimX, dimX2, which_parts=None):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros(X.shape[0])
+        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination1) > 0:
+                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination1])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update1 = list(set(which_parts) - set(combination1))[0]
+            prod_sum += prod*to_update1.dK2_dXdX2diag(X, dimX, dimX2)
+            if len(which_parts) > 1:
+                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
+                    if len(combination2) > 0:
+                        prod = reduce(np.multiply, [p.Kdiag(X) for p in combination2])
+                    else:
+                        prod = np.ones(prod_sum.shape)
+                    to_update2 = list(set(combination1) - set(combination2))[0]
+                    prod_sum += prod*to_update1.dK_dXdiag(X, dimX)*to_update2.dK_dX2diag(X, dimX)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK2_dXdX(self, X, X2, dimX_0, dimX_1, which_parts=None):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX_0 of set X, and
+            dimension dimX_1 of set X.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
+        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination1) > 0:
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination1])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update1 = list(set(which_parts) - set(combination1))[0]
+            prod_sum += prod*to_update1.dK2_dXdX(X, X2, dimX_0, dimX_1)
+            if len(which_parts) > 1:
+                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
+                    if len(combination2) > 0:
+                        prod = reduce(np.multiply, [p.K(X, X2) for p in combination2])
+                    else:
+                        prod = np.ones(prod_sum.shape)
+                    to_update2 = list(set(combination1) - set(combination2))[0]
+                    prod_sum += prod*to_update1.dK_dX(X, X2, dimX_0)*to_update2.dK_dX(X, X2, dimX_1)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK3_dXdXdX2(self, X, X2, dimX_0, dimX_1, dimX2, which_parts=None):
+        """
+        Compute the third derivative of K with respect to:
+            dimension dimX_0 of set X,
+            dimension dimX_1 of set X, and
+            dimension dimX2 of set X2.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros((X.shape[0], X2.shape[0]))
+        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination1) > 0:
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination1])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update1 = list(set(which_parts) - set(combination1))[0]
+            prod_sum += prod*to_update1.dK3_dXdXdX2(X, X2, dimX_0, dimX_1, dimX2)
+            if len(which_parts) > 1:
+                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
+                    if len(combination2) > 0:
+                        prod = reduce(np.multiply, [p.K(X, X2) for p in combination2])
+                    else:
+                        prod = np.ones(prod_sum.shape)
+                    to_update2 = list(set(combination1) - set(combination2))[0]
+                    prod_sum += prod*to_update1.dK2_dXdX2(X, X2, dimX_0, dimX2)*to_update2.dK_dX(X, X2, dimX_1)
+                    prod_sum += prod*to_update1.dK2_dXdX(X, X2, dimX_0, dimX_1)*to_update2.dK_dX2(X, X2, dimX2)
+                    prod_sum += prod*to_update1.dK_dX(X, X2, dimX_0)*to_update2.dK2_dXdX2(X, X2, dimX_1, dimX2)
+                    if len(which_parts) > 2:
+                        for combination3 in itertools.combinations(combination2, len(combination2) - 1):
+                            if len(combination3) > 0:
+                                prod = reduce(np.multiply, [p.K(X, X2) for p in combination3])
+                            else:
+                                prod = np.ones(prod_sum.shape)
+                            to_update3 = list(set(combination2) - set(combination3))[0]
+                            prod_sum += prod*to_update1.dK_dX(X, X2, dimX_0)*to_update2.dK_dX2(X, X2, dimX2)*to_update3.dK_dX(X, X2, dimX_1)
+        return prod_sum
+
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
+    def dK3_dXdXdX2diag(self, X, dimX_0, dimX_1, dimX2, which_parts=None):
+        """
+        Compute the third derivative of K with respect to:
+            dimension dimX_0 of set X,
+            dimension dimX_1 of set X, and
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements of the covariance matrix.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        prod_sum = np.zeros(X.shape[0])
+        for combination1 in itertools.combinations(which_parts, len(which_parts) - 1):
+            if len(combination1) > 0:
+                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination1])
+            else:
+                prod = np.ones(prod_sum.shape)
+            to_update1 = list(set(which_parts) - set(combination1))[0]
+            prod_sum += prod*to_update1.dK3_dXdXdX2diag(X, dimX_0, dimX_1, dimX2)
+            if len(which_parts) > 1:
+                for combination2 in itertools.combinations(combination1, len(combination1) - 1):
+                    if len(combination2) > 0:
+                        prod = reduce(np.multiply, [p.Kdiag(X) for p in combination2])
+                    else:
+                        prod = np.ones(prod_sum.shape)
+                    to_update2 = list(set(combination1) - set(combination2))[0]
+                    prod_sum += prod*to_update1.dK2_dXdX2diag(X, dimX_0, dimX2)*to_update2.dK_dXdiag(X, dimX_1)
+                    prod_sum += prod*to_update1.dK2_dXdXdiag(X, dimX_0, dimX_1)*to_update2.dK_dX2diag(X, dimX2)
+                    prod_sum += prod*to_update1.dK_dXdiag(X, dimX_0)*to_update2.dK2_dXdX2diag(X, dimX_1, dimX2)
+                    if len(which_parts) > 2:
+                        for combination3 in itertools.combinations(combination2, len(combination2) - 1):
+                            if len(combination3) > 0:
+                                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination3])
+                            else:
+                                prod = np.ones(prod_sum.shape)
+                            to_update3 = list(set(combination2) - set(combination3))[0]
+                            prod_sum += prod*to_update1.dK_dXdiag(X, dimX_0)*to_update2.dK_dX2diag(X, dimX2)*to_update3.dK_dXdiag(X, dimX_1)
+        return prod_sum
+
+    def update_gradients_direct(self, *args):
+        for i, (g,p) in enumerate(zip(args, self.parts)):
+            p.update_gradients_direct(*g)
+
+    def dgradients_dX(self, X, X2, dimX, parts=None):
+        """
+        Compute the hyperparameter gradients of:
+            the derivative of K with respect to dimension dimX of set X
+            ("dK_dX").
+        """
+        if parts is None:
+            parts = self.parts
+        gradients = []
+        for part in parts:
+            neq_parts = [p for p in parts if p is not part]
+
+            if len(neq_parts) > 0:
+                K = self.K(X, X2, which_parts=neq_parts)
+                K_dx = self.dK_dX(X, X2, dimX, which_parts=neq_parts)
+            else:
+                K = np.ones((X.shape[0], X2.shape[0]))
+                K_dx = np.zeros((X.shape[0], X2.shape[0]))
+
+            g = part.dgradients(X, X2)
+            g_dx = part.dgradients_dX(X, X2, dimX)
+
+            gradients += [[(g_i*K_dx + g_dx_i*K) for (g_i, g_dx_i) in zip(g, g_dx)]]
+            
+        return gradients
+
+    def dgradients_dX2(self, X, X2, dimX2, parts=None):
+        """
+        Compute the hyperparameter gradients of:
+            the derivative of K with respect to dimension dimX2 of set X2
+            ("dK_dX2").
+        """
+        if parts is None:
+            parts = self.parts
+        gradients = []
+        for part in parts:
+            neq_parts = [p for p in parts if p is not part]
+
+            if len(neq_parts) > 0:
+                K = self.K(X, X2, which_parts=neq_parts)
+                K_dx2 = self.dK_dX2(X, X2, dimX2, which_parts=neq_parts)
+            else:
+                K = np.ones((X.shape[0], X2.shape[0]))
+                K_dx2 = np.zeros((X.shape[0], X2.shape[0]))
+
+            g = part.dgradients(X, X2)
+            g_dx2 = part.dgradients_dX2(X, X2, dimX2)
+
+            gradients += [[(g_i*K_dx2 + g_dx2_i*K) for (g_i, g_dx2_i) in zip(g, g_dx2)]]
+            
+        return gradients
+
+    def dgradients2_dXdX2(self, X, X2, dimX, dimX2, parts=None):
+        """
+        Compute the hyperparameter gradients of:
+            the second derivative of K with respect to:
+                dimension dimX of set X, and
+                dimension dimX2 of set X2
+            ("dK2_dXdX2").
+        """
+        if parts is None:
+            parts = self.parts
+        gradients = []
+        for part in parts:
+            neq_parts = [p for p in parts if p is not part]
+
+            K = self.K(X, X2, which_parts=neq_parts)
+            K_dx = self.dK_dX(X, X2, dimX, which_parts=neq_parts)
+            K_dx2 = self.dK_dX2(X, X2, dimX2, which_parts=neq_parts)
+            K_dxdx2 = self.dK2_dXdX2(X, X2, dimX, dimX2, which_parts=neq_parts)
+
+            g = part.dgradients(X, X2)
+            g_dx = part.dgradients_dX(X, X2, dimX)
+            g_dx2 = part.dgradients_dX2(X, X2, dimX2)
+            g_dxdx2 = part.dgradients2_dXdX2(X, X2, dimX, dimX2)
+
+            gradients += [[(g_i*K_dxdx2 + g_dx_i*K_dx2 + g_dx2_i*K_dx + g_dxdx2_i*K) for (g_i, g_dx_i, g_dx2_i, g_dxdx2_i) in zip(g, g_dx, g_dx2, g_dxdx2)]]
+        return gradients
+
    def update_gradients_full(self, dL_dK, X, X2=None):
        if len(self.parts)==2:
            self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
--- a/GPy/kern/src/rbf.py
+++ b/GPy/kern/src/rbf.py
@ -53,24 +53,126 @@ class RBF(Stationary):

    @Cache_this(limit=3, ignore_args=())
    def dK_dX(self, X, X2, dimX):
-        r = self._scaled_dist(X, X2)
-        K = self.K_of_r(r)
-        dist = X[:,None,dimX]-X2[None,:,dimX]
-        lengthscale2inv = (np.ones((X.shape[1]))/(self.lengthscale**2))[dimX]
-        return -1.*K*dist*lengthscale2inv
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
+        dist = X[:,None,dimX] - X2[None,:,dimX]
+        return -dist*(lengthscaleinv**2)*self._clean_K(X, X2)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dXdiag(self, X, dimX):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+
+        Returns only diagonal elements.
+        """
+        return np.zeros(X.shape[0])
+
    @Cache_this(limit=3, ignore_args=())
    def dK_dX2(self, X, X2, dimX2):
-        return -self.dK_dX(X,X2, dimX2)
-    
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX2 of set X2.
+        """
+        return -self._clean_dK_dX(X, X2, dimX2)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dX2diag(self, X, dimX2):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements.
+        """
+        return np.zeros(X.shape[0])
+
    @Cache_this(limit=3, ignore_args=())
    def dK2_dXdX2(self, X, X2, dimX, dimX2):
-        r = self._scaled_dist(X, X2)
-        K = self.K_of_r(r)
-        if X2 is None:
-            X2=X
-        dist = X[:,None,:]-X2[None,:,:]
-        lengthscale2inv = np.ones((X.shape[1]))/(self.lengthscale**2)
-        return -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX]*lengthscale2inv[dimX2] + (dimX==dimX2)*K*lengthscale2inv[dimX]
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+
+        term = dist[dimX]*(lengthscaleinv[dimX]**2)
+        term *= dist[dimX2]*(lengthscaleinv[dimX2]**2)
+        if dimX == dimX2:
+            term -= (lengthscaleinv[dimX]**2)
+        return -term*self._clean_K(X, X2)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dXdX2diag(self, X, dimX, dimX2):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements.
+        """
+        if dimX == dimX2:
+            lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
+            return np.ones(X.shape[0])*(lengthscaleinv[dimX]**2)*self.variance
+        else:
+            return np.zeros(X.shape[0])
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dXdX(self, X, X2, dimX_0, dimX_1):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX_0 of set X, and
+            dimension dimX_1 of set X.
+        """
+        return -self._clean_dK2_dXdX2(X, X2, dimX_0, dimX_1)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_dXdXdiag(self, X, dimX_0, dimX_1):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX_0 of set X, and
+            dimension dimX_1 of set X.
+
+        Returns only diagonal elements.
+        """
+        return -self._clean_dK2_dXdX2diag(X, dimX_0, dimX_1)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK3_dXdXdX2(self, X, X2, dimX_0, dimX_1, dimX2):
+        """
+        Compute the third derivative of K with respect to:
+            dimension dimX_0 of set X,
+            dimension dimX_1 of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+
+        term = dist[dimX_0]*(lengthscaleinv[dimX_0]**2)
+        term *= dist[dimX_1]*(lengthscaleinv[dimX_1]**2)
+        term *= dist[dimX2]*(lengthscaleinv[dimX2]**2)
+        if dimX_0 == dimX_1:
+            term -= dist[dimX2]*(lengthscaleinv[dimX2]**2)*(lengthscaleinv[dimX_0]**2)
+        if dimX_0 == dimX2:
+            term -= dist[dimX_1]*(lengthscaleinv[dimX_1]**2)*(lengthscaleinv[dimX_0]**2)
+        if dimX_1 == dimX2:
+            term -= dist[dimX_0]*(lengthscaleinv[dimX_0]**2)*(lengthscaleinv[dimX_1]**2)
+        return term*self._clean_K(X, X2)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK3_dXdXdX2diag(self, X, dimX_0, dimX_1, dimX2):
+        """
+        Compute the third derivative of K with respect to:
+            dimension dimX_0 of set X,
+            dimension dimX_1 of set X, and
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements of the covariance matrix.
+        """
+        return np.zeros(X.shape[0])

    def dK_dr(self, r):
        return -r*self.K_of_r(r)
@ -80,73 +182,132 @@ class RBF(Stationary):

    def dK2_drdr_diag(self):
        return -self.variance # as the diagonal of r is always filled with zeros
-    
+
    @Cache_this(limit=3, ignore_args=())
-    def dK_dvariance(self,X,X2):
-        return self.K(X,X2)/self.variance
-    
+    def dK_dvariance(self, X, X2):
+        """
+        Compute the derivative of K with respect to variance.
+        """
+        return self._clean_K(X, X2)/self.variance
+
    @Cache_this(limit=3, ignore_args=())
-    def dK2_dvariancedX(self, X, X2, dim):
-        return self.dK_dX(X,X2, dim)/self.variance
-    
+    def dK_dlengthscale(self, X, X2):
+        """
+        Compute the derivative(s) of K with respect to lengthscale(s).
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+
+        K = self._clean_K(X, X2)
+
+        if self.ARD:
+            g = []
+            for diml in range(self.input_dim):
+                g += [(dist[diml]**2)*(lengthscaleinv[diml]**3)*K]
+        else:
+            g = (lengthscaleinv[0]**3)*np.sum(dist**2, axis=0)*K
+        return g
+
    @Cache_this(limit=3, ignore_args=())
-    def dK2_dvariancedX2(self, X, X2, dim):
-        return self.dK_dX2(X,X2, dim)/self.variance
-    
+    def dK2_dvariancedX(self, X, X2, dimX):
+        """
+        Compute the second derivative of K with respect to:
+            variance, and
+            dimension dimX of set X.
+        """
+        return self._clean_dK_dX(X, X2, dimX)/self.variance
+
    @Cache_this(limit=3, ignore_args=())
-    def dK3_dvariancedXdX2(self, X, X2, dim, dimX2):
-        return self.dK2_dXdX2(X, X2, dim, dimX2)/self.variance
+    def dK2_dvariancedX2(self, X, X2, dimX2):
+        """
+        Compute the second derivative of K with respect to:
+            variance, and
+            dimension dimX2 of set X2.
+        """
+        return -self.dK2_dvariancedX(X, X2, dimX2)

    @Cache_this(limit=3, ignore_args=())
    def dK2_dlengthscaledX(self, X, X2, dimX):
-        r = self._scaled_dist(X, X2)
-        K = self.K_of_r(r)
-        if X2 is None:
-            X2=X
-        dist = X[:,None,:]-X2[None,:,:]
-        lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
+        """
+        Compute the second derivative(s) of K with respect to:
+            lengthscale(s), and
+            dimension dimX of set X.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+
+        dK_dX = self._clean_dK_dX(X, X2, dimX)
+        dK_dl = self.dK_dlengthscale(X, X2)
+
        if self.ARD:
            g = []
-            for diml in range(X.shape[1]):
-                g += [-1.*K*dist[:,:,dimX]*(dist[:,:,diml]**2)*(lengthscaleinv[dimX]**2)*(lengthscaleinv[diml]**3) + 2.*dist[:,:,dimX]*(lengthscaleinv[diml]**3)*K*(dimX == diml)]
+            for diml in range(self.input_dim):
+                term = -dist[dimX]*(lengthscaleinv[dimX]**2)*dK_dl[diml]
+                if diml == dimX:
+                    term -= 2*lengthscaleinv[dimX]*dK_dX
+                g += [term]
        else:
-            g = -1.*K*dist[:,:,dimX]*np.sum(dist**2, axis=2)*(lengthscaleinv[dimX]**5) + 2.*dist[:,:,dimX]*(lengthscaleinv[dimX]**3)*K
+            term = -dist[dimX]*(lengthscaleinv[0]**2)*dK_dl
+            term -= 2*lengthscaleinv[0]*dK_dX
+            g = term
        return g
-    
+
    @Cache_this(limit=3, ignore_args=())
    def dK2_dlengthscaledX2(self, X, X2, dimX2):
-        tmp = self.dK2_dlengthscaledX(X, X2, dimX2)
+        """
+        Compute the second derivative(s) of K with respect to:
+            lengthscale(s), and
+            dimension dimX2 of set X2.
+        """
+        dK2_dlengthscaledX = self.dK2_dlengthscaledX(X, X2, dimX2)
        if self.ARD:
-            return [-1.*g for g in tmp]
+            return [-1.*g for g in dK2_dlengthscaledX]
        else:
-            return -1*tmp
+            return -1*dK2_dlengthscaledX
    
+    @Cache_this(limit=3, ignore_args=())
+    def dK3_dvariancedXdX2(self, X, X2, dimX, dimX2):
+        """
+        Compute the third derivative of K with respect to:
+            variance,
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        return self._clean_dK2_dXdX2(X, X2, dimX, dimX2)/self.variance
+
    @Cache_this(limit=3, ignore_args=())
    def dK3_dlengthscaledXdX2(self, X, X2, dimX, dimX2):
-        r = self._scaled_dist(X, X2)
-        K = self.K_of_r(r)
-        if X2 is None:
-            X2=X
-        dist = X[:,None,:]-X2[None,:,:]
-        lengthscaleinv = np.ones((X.shape[1]))/(self.lengthscale)
-        lengthscale2inv = lengthscaleinv**2
+        """
+        Compute the third derivative(s) of K with respect to:
+            lengthscale(s),
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+
+        K = self._clean_K(X, X2)
+        dK_dX = self._clean_dK_dX(X, X2, dimX)
+        dK_dX2 = self._clean_dK_dX(X, X2, dimX2)
+        dK2_dXdX2 = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
+
        if self.ARD:
            g = []
-            for diml in range(X.shape[1]):
-                tmp = -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*(dist[:,:,diml]**2)*lengthscale2inv[dimX]*lengthscale2inv[dimX2]*(lengthscaleinv[diml]**3)
-                if dimX == dimX2:
-                    tmp += K*lengthscale2inv[dimX]*(lengthscaleinv[diml]**3)*(dist[:,:,diml]**2)
+            for diml in range(self.input_dim):
+                term = (dist[diml]**2)*(lengthscaleinv[diml]**3)*dK2_dXdX2
                if diml == dimX:
-                    tmp += 2.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX2]*(lengthscaleinv[dimX]**3)
+                    term -= 2*dist[dimX]*(lengthscaleinv[dimX]**3)*dK_dX2
                if diml == dimX2:
-                    tmp += 2.*K*dist[:,:,dimX]*dist[:,:,dimX2]*lengthscale2inv[dimX]*(lengthscaleinv[dimX2]**3)
-                    if dimX == dimX2:
-                        tmp += -2.*K*(lengthscaleinv[dimX]**3)
-                g += [tmp]
+                    term -= 2*dist[dimX2]*(lengthscaleinv[dimX2]**3)*dK_dX
+                if diml == dimX == dimX2:
+                    term -= 2*(lengthscaleinv[dimX]**3)*K
+                g += [term]
        else:
-            g = -1.*K*dist[:,:,dimX]*dist[:,:,dimX2]*np.sum(dist**2, axis=2)*(lengthscaleinv[dimX]**7) +4*K*dist[:,:,dimX]*dist[:,:,dimX2]*(lengthscaleinv[dimX]**5)
+            term = np.sum(dist**2, axis=0)*dK2_dXdX2
+            term -= 4*dist[dimX2]*dK_dX
            if dimX == dimX2:
-                g += -2.*K*(lengthscaleinv[dimX]**3) + K*(lengthscaleinv[dimX]**5)*np.sum(dist**2, axis=2)
+                term -= 2*K
+            g = (lengthscaleinv[0]**3)*term
        return g

    def __getstate__(self):
--- a/GPy/kern/src/standard_periodic.py
+++ b/GPy/kern/src/standard_periodic.py
@ -122,7 +122,6 @@ class StdPeriodic(Kern):

        pass

-
    def K(self, X, X2=None):
        """Compute the covariance matrix between X and X2."""
        if X2 is None:
@ -133,13 +132,372 @@ class StdPeriodic(Kern):

        return self.variance * exp_dist

-
    def Kdiag(self, X):
        """Compute the diagonal of the covariance matrix associated to X."""
        ret = np.empty(X.shape[0])
        ret[:] = self.variance
        return ret

+    def dK_dX(self, X, X2, dimX):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX]
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        dist = X[:,None,dimX] - X2[None,:,dimX]
+        base = np.pi*periodinv*dist
+
+        return -F*np.sin(2*base)*self._clean_K(X, X2)
+
+    def dK_dXdiag(self, X, dimX):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX of set X.
+
+        Returns only diagonal elements.
+        """
+        return np.zeros(X.shape[0])
+
+    def dK_dX2(self, X, X2, dimX2):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX2 of set X2.
+        """
+        return -self._clean_dK_dX(X, X2, dimX2)
+
+    def dK_dX2diag(self, X, dimX2):
+        """
+        Compute the derivative of K with respect to:
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements.
+        """
+        return np.zeros(X.shape[0])
+    
+    def dK2_dXdX2(self, X, X2, dimX, dimX2):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        dist = X[:,None,dimX2] - X2[None,:,dimX2]
+        base = np.pi*periodinv*dist
+
+        term = np.sin(2*base)*self._clean_dK_dX(X, X2, dimX)
+        if dimX == dimX2:
+            term += 2*np.pi*periodinv*np.cos(2*base)*self._clean_K(X, X2)
+        return F*term
+
+    def dK2_dXdX2diag(self, X, dimX, dimX2):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements.
+        """
+        if dimX == dimX2:
+            lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
+            periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
+            return (np.pi**2)*(lengthscaleinv**2)*(periodinv**2)*self.variance*np.ones(X.shape[0])
+        else:
+            return np.zeros(X.shape[0])
+
+    def dK2_dXdX(self, X, X2, dimX_0, dimX_1):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX_0 of set X, and
+            dimension dimX_1 of set X.
+        """
+        return -self._clean_dK2_dXdX2(X, X2, dimX_0, dimX_1)
+
+    def dK2_dXdXdiag(self, X, dimX_0, dimX_1):
+        """
+        Compute the second derivative of K with respect to:
+            dimension dimX_0 of set X, and
+            dimension dimX_1 of set X.
+
+        Returns only diagonal elements.
+        """
+        return -self._clean_dK2_dXdX2diag(X, dimX_0, dimX_1)
+
+    def dK3_dXdXdX2(self, X, X2, dimX_0, dimX_1, dimX2):
+        """
+        Compute the third derivative of K with respect to:
+            dimension dimX_0 of set X,
+            dimension dimX_1 of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        dist = X[:,None,dimX2] - X2[None,:,dimX2]
+        base = np.pi*periodinv*dist
+
+        term = np.sin(2*base)*self._clean_dK2_dXdX(X, X2, dimX_0, dimX_1)
+        if dimX_0 == dimX2:
+            term += 2*np.pi*periodinv*np.cos(2*base)*self._clean_dK_dX(X, X2, dimX_1)
+        if dimX_1 == dimX2:
+            term += 2*np.pi*periodinv*np.cos(2*base)*self._clean_dK_dX(X, X2, dimX_0)
+        if dimX_0 == dimX_1 == dimX2:
+            term -= 4*(np.pi**2)*(periodinv**2)*np.sin(2*base)*self._clean_K(X, X2)
+        return F*term
+
+    def dK3_dXdXdX2diag(self, X, dimX_0, dimX_1, dimX2):
+        """
+        Compute the third derivative of K with respect to:
+            dimension dimX_0 of set X,
+            dimension dimX_1 of set X, and
+            dimension dimX2 of set X2.
+
+        Returns only diagonal elements of the covariance matrix.
+        """
+        return np.zeros(X.shape[0])
+
+    def dK_dvariance(self, X, X2):
+        """
+        Compute the derivative of K with respect to variance.
+        """
+        return self._clean_K(X, X2)/self.variance
+
+    def dK_dlengthscale(self, X, X2):
+        """
+        Compute the derivative(s) of K with respect to lengthscale(s).
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        periodinv = (np.ones(X.shape[1])/(self.period))
+
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+        base = np.pi*periodinv[:,None,None]*dist
+
+        K = self._clean_K(X, X2)
+
+        if self.ARD2:
+            g = []
+            for diml in range(self.input_dim):
+                g += [(lengthscaleinv[diml]**3)*np.square(np.sin(base[diml]))*K]
+        else:
+            g = (lengthscaleinv[0]**3)*np.sum(np.square(np.sin(base)), axis=0)*K
+        return g
+
+    def dK_dperiod(self, X, X2):
+        """
+        Compute the derivative(s) of K with respect to period(s).
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))
+        periodinv = (np.ones(X.shape[1])/(self.period))
+
+        dist = np.rollaxis(X[:,None,:] - X2[None,:,:], 2, 0)
+        base = np.pi*periodinv[:,None,None]*dist
+
+        K = self._clean_K(X, X2)
+
+        if self.ARD1:
+            g = []
+            for diml in range(self.input_dim):
+                g += [0.5*base[diml]*(lengthscaleinv[diml]**2)*periodinv[diml]*np.sin(2*base[diml])*K]
+        else:
+            g = 0.5*periodinv[0]*np.sum(base*(lengthscaleinv**2)[:,None,None]*np.sin(2*base), axis=0)*K
+        return g
+
+    def dK2_dvariancedX(self, X, X2, dimX):
+        """
+        Compute the second derivative of K with respect to:
+            variance, and
+            dimension dimX of set X.
+        """
+        return self._clean_dK_dX(X, X2, dimX)/self.variance
+
+    def dK2_dvariancedX2(self, X, X2, dimX2):
+        """
+        Compute the second derivative of K with respect to:
+            variance, and
+            dimension dimX2 of set X2.
+        """
+        return -self.dK2_dvariancedX(X, X2, dimX2)
+
+    def dK2_dlengthscaledX(self, X, X2, dimX):
+        """
+        Compute the second derivative(s) of K with respect to:
+            lengthscale(s), and
+            dimension dimX of set X.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX]
+
+        dist = X[:,None,dimX] - X2[None,:,dimX]
+        base = np.pi*periodinv*dist
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        K = self._clean_K(X, X2)
+        dK_dl = self.dK_dlengthscale(X, X2)
+
+        if self.ARD2:
+            g = []
+            for diml in range(self.input_dim):
+                term = dK_dl[diml]
+                if diml == dimX:
+                    term -= 2*lengthscaleinv*K
+                g += [-F*np.sin(2*base)*term]
+        else:
+            g = -F*np.sin(2*base)*(dK_dl - 2*lengthscaleinv*K)
+        return g
+
+    def dK2_dlengthscaledX2(self, X, X2, dimX2):
+        """
+        Compute the second derivative(s) of K with respect to:
+            lengthscale(s), and
+            dimension dimX2 of set X2.
+        """
+        dK2_dldX = self.dK2_dlengthscaledX(X, X2, dimX2)
+        if self.ARD2:
+            return [-1*g for g in dK2_dldX]
+        else:
+            return -1*dK2_dldX
+
+    def dK2_dperioddX(self, X, X2, dimX):
+        """
+        Compute the second derivative(s) of K with respect to:
+            period(s), and
+            dimension dimX of set X.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX]
+
+        dist = X[:,None,dimX] - X2[None,:,dimX]
+        base = np.pi*periodinv*dist
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        K = self._clean_K(X, X2)
+        dK_dT = self.dK_dperiod(X, X2)
+
+        if self.ARD1:
+            g = []
+            for dimT in range(self.input_dim):
+                term = np.sin(2*base)*dK_dT[dimT]
+                if dimT == dimX:
+                    term -= periodinv*(np.sin(2*base)+2*base*np.cos(2*base))*K
+                g += [-F*term]
+        else:
+            term = np.sin(2*base)*dK_dT
+            term -= periodinv*(np.sin(2*base)+2*base*np.cos(2*base))*K
+            g = -F*term
+        return g
+
+    def dK2_dperioddX2(self, X, X2, dimX2):
+        """
+        Compute the second derivative(s) of K with respect to:
+            period(s), and
+            dimension dimX2 of set X2.
+        """
+        dK2_dperioddX = self.dK2_dperioddX(X, X2, dimX2)
+        if self.ARD1:
+            return [-1*g for g in dK2_dperioddX]
+        else:
+            return -1*dK2_dperioddX
+
+    def dK3_dvariancedXdX2(self, X, X2, dimX, dimX2):
+        """
+        Compute the third derivative of K with respect to:
+            variance,
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        return self._clean_dK2_dXdX2(X, X2, dimX, dimX2)/self.variance
+
+    def dK3_dlengthscaledXdX2(self, X, X2, dimX, dimX2):
+        """
+        Compute the third derivative(s) of K with respect to:
+            lengthscale(s),
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
+
+        dist = X[:,None,dimX2] - X2[None,:,dimX2]
+        base = np.pi*periodinv*dist
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        dK2_dXdX2 = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
+        dK_dl = self.dK_dlengthscale(X, X2)
+        dK2_dldX = self.dK2_dlengthscaledX(X, X2, dimX)
+
+        if self.ARD2:
+            g = []
+            for diml in range(self.input_dim):
+                term = np.sin(2*base)*dK2_dldX[diml]
+                if dimX == dimX2:
+                    term += 2*np.pi*periodinv*np.cos(2*base)*dK_dl[diml]
+                term *= F
+                if diml == dimX2:
+                    term -= 2*lengthscaleinv*dK2_dXdX2
+                g += [term]
+        else:
+            term = np.sin(2*base)*dK2_dldX
+            if dimX == dimX2:
+                term += 2*np.pi*periodinv*np.cos(2*base)*dK_dl
+            term *= F
+            term -= 2*lengthscaleinv*dK2_dXdX2
+            g = term
+        return g
+
+    def dK3_dperioddXdX2(self, X, X2, dimX, dimX2):
+        """
+        Compute the third derivative(s) of K with respect to:
+            period(s),
+            dimension dimX of set X, and
+            dimension dimX2 of set X2.
+        """
+        lengthscaleinv = (np.ones(X.shape[1])/(self.lengthscale))[dimX2]
+        periodinv = (np.ones(X.shape[1])/(self.period))[dimX2]
+
+        dist = X[:,None,dimX2] - X2[None,:,dimX2]
+        base = np.pi*periodinv*dist
+
+        F = 0.5*np.pi*(lengthscaleinv**2)*periodinv # multiplicative factor
+
+        K = self._clean_K(X, X2)
+        dK_dX = self._clean_dK_dX(X, X2, dimX)
+        dK2_dXdX2 = self._clean_dK2_dXdX2(X, X2, dimX, dimX2)
+        dK_dT = self.dK_dperiod(X, X2)
+        dK2_dTdX = self.dK2_dperioddX(X, X2, dimX)
+
+        if self.ARD1:
+            g = []
+            for dimT in range(self.input_dim):
+                term = np.sin(2*base)*dK2_dTdX[dimT]
+                if dimT == dimX2:
+                    term -= 2*periodinv*np.cos(2*base)*base*dK_dX
+                if dimX == dimX2:
+                    term += 2*np.pi*periodinv*np.cos(2*base)*dK_dT[dimT]
+                if dimX == dimX2 == dimT:
+                    term += 2*np.pi*(periodinv**2)*(2*base*np.sin(2*base)-np.cos(2*base))*K
+                term *= F
+                if dimT == dimX2:
+                    term -= periodinv*dK2_dXdX2
+                g += [term]
+        else:
+            term = np.sin(2*base)*dK2_dTdX-2*periodinv*base*np.cos(2*base)*dK_dX
+            if dimX == dimX2:
+                term += 2*np.pi*periodinv*(np.cos(2*base)*dK_dT+periodinv*(2*base*np.sin(2*base)-np.cos(2*base))*K)
+            g = F*term-periodinv*dK2_dXdX2
+        return g
+
    def update_gradients_full(self, dL_dK, X, X2=None):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None:
@ -167,12 +525,52 @@ class StdPeriodic(Kern):
        else: # same lengthscales
            self.lengthscale.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)

+    def update_gradients_direct(self, dL_dVar, dL_dPer, dL_dLen):
+        self.variance.gradient = dL_dVar
+        self.period.gradient = dL_dPer
+        self.lengthscale.gradient = dL_dLen
+
+    def reset_gradients(self):
+        self.variance.gradient = 0.
+        if not self.ARD1:
+            self.period.gradient = 0.
+        else:
+            self.period.gradient = np.zeros(self.input_dim)
+        if not self.ARD2:
+            self.lengthscale.gradient = 0.
+        else:
+            self.lengthscale.gradient = np.zeros(self.input_dim)
+
    def update_gradients_diag(self, dL_dKdiag, X):
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
        self.variance.gradient = np.sum(dL_dKdiag)
        self.period.gradient = 0
        self.lengthscale.gradient = 0

+    def dgradients(self, X, X2):
+        g1 = self.dK_dvariance(X, X2)
+        g2 = self.dK_dperiod(X, X2)
+        g3 = self.dK_dlengthscale(X, X2)
+        return [g1, g2, g3]
+
+    def dgradients_dX(self, X, X2, dimX):
+        g1 = self.dK2_dvariancedX(X, X2, dimX)
+        g2 = self.dK2_dperioddX(X, X2, dimX)
+        g3 = self.dK2_dlengthscaledX(X, X2, dimX)
+        return [g1, g2, g3]
+
+    def dgradients_dX2(self, X, X2, dimX2):
+        g1 = self.dK2_dvariancedX2(X, X2, dimX2)
+        g2 = self.dK2_dperioddX2(X, X2, dimX2)
+        g3 = self.dK2_dlengthscaledX2(X, X2, dimX2)
+        return [g1, g2, g3]
+
+    def dgradients2_dXdX2(self, X, X2, dimX, dimX2):
+        g1 = self.dK3_dvariancedXdX2(X, X2, dimX, dimX2)
+        g2 = self.dK3_dperioddXdX2(X, X2, dimX, dimX2)
+        g3 = self.dK3_dlengthscaledXdX2(X, X2, dimX, dimX2)
+        return [g1, g2, g3]
+
    def gradients_X(self, dL_dK, X, X2=None):
        K = self.K(X, X2)
        if X2 is None:
@ -185,4 +583,4 @@ class StdPeriodic(Kern):
        return np.zeros(X.shape)
    
    def input_sensitivity(self, summarize=True):
-        return self.variance*np.ones(self.input_dim)/self.lengthscale**2
+        return self.variance*np.ones(self.input_dim)/self.lengthscale**2
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@ -306,7 +306,12 @@ class Stationary(Kern):
        l4 =  np.ones(X.shape[1])*self.lengthscale**2
        return dL_dK_diag * (np.eye(X.shape[1]) * -self.dK2_drdr_diag()/(l4))[None, :,:]# np.zeros(X.shape+(X.shape[1],))
        #return np.ones(X.shape) * d2L_dK * self.variance/self.lengthscale**2 # np.zeros(X.shape)
-    
+
+    def dgradients(self, X, X2):
+        g1 = self.dK_dvariance(X, X2)
+        g2 = self.dK_dlengthscale(X, X2)
+        return [g1, g2]
+
    def dgradients_dX(self, X, X2, dimX):
        g1 = self.dK2_dvariancedX(X, X2, dimX)
        g2 = self.dK2_dlengthscaledX(X, X2, dimX)
--- a/GPy/models/multioutput_gp.py
+++ b/GPy/models/multioutput_gp.py
@ -9,6 +9,7 @@ from ..core.mapping import Mapping
 from .. import likelihoods
 from ..likelihoods.gaussian import Gaussian
 from .. import kern
+from ..kern import DiffKern
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
 from ..util.normalizer import Standardize
 from .. import util
@ -69,39 +70,80 @@ class MultioutputGP(GP):
            if Y_metadata is None:
                Y_metadata={'output_index': ind, 'trials': np.ones(ind.shape)}
        return super(MultioutputGP, self).predict_quantiles(X, quantiles, Y_metadata, kern, likelihood)
-    
-    def predictive_gradients(self, Xnew, kern=None):
-        if isinstance(Xnew, list):
-            Xnew, _, ind  = util.multioutput.build_XY(Xnew, None)
-            #if Y_metadata is None:
-                #Y_metadata={'output_index': ind}
-        return super(MultioutputGP, self).predictive_gradients(Xnew, kern)

-    def predictive_gradients(self, Xnew, kern=None): #XNEW IS NOT A LIST!!
+    def predictive_gradients(self, Xnew, kern=None):
        """
-        Compute the derivatives of the predicted latent function with respect to X*
+        Compute the derivatives of the predicted latent function with respect
+        to X*
+        
        Given a set of points at which to predict X* (size [N*,Q]), compute the
        derivatives of the mean and variance. Resulting arrays are sized:
-         dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
-        Note that this is not the same as computing the mean and variance of the derivative of the function!
+            dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP
+            (usually one).
+        
+        Note that this is not the same as computing the mean and variance of
+        the derivative of the function!
+
         dv_dX*  -- [N*, Q],    (since all outputs have the same variance)
        :param X: The points at which to get the predictive gradients
        :type X: np.ndarray (Xnew x self.input_dim)
        :returns: dmu_dX, dv_dX
        :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q) ]
+
        """
        
        if isinstance(Xnew, list):
            Xnew, _, ind  = util.multioutput.build_XY(Xnew, None)
        
        slices = index_to_slices(Xnew[:,-1])
-        
-        for i in range(len(slices)):
-            if ((self.kern.kern[i].name == 'diffKern' ) and len(slices[i])>0):
-                assert 0, "It is not (yet) possible to predict gradients of gradient observations, sorry :)"
- 
+
        if kern is None:
            kern = self.kern
+
+        if all([(isinstance(k, DiffKern)) for k in self.kern.kern[1:]]):
+            """
+            Compute the gradients of the predicted latent function and predicted
+            partial derivatives with respect to X*.
+
+            This works only for models that observe the gradient of the latent function.
+
+            Xnew is given as a list of arrays, where each array X*_i (size [N_i*, Q])
+            contains points at which to compute gradients for each predicted latent
+            function or partial derivative.
+
+            Resulting arrays are sized [sum_i^D : N_i*, Q]
+
+            Passing a list of only one array [X*] returns only gradients of
+            the predicted latent function and does not compute gradients of
+            predicted partial derivatives.
+
+            In this case the resulting arrays are sized [N*, Q].
+            
+            :param Xnew: points at which to compute predictive gradients
+            :type Xnew: list
+            :type Xnew[i]: np.darray (sum_i^D : N_i*, Q)
+            :returns: dmu_dX, dv_dX
+            :rtype: (np.ndarray (sum_i^D : N_i*, Q), np.ndarray (sum_i^D : N_i*, Q))
+            """
+
+            dims = Xnew.shape[1] - 1
+
+            mean_jac = np.empty((Xnew.shape[0], dims))
+            var_jac = np.empty((Xnew.shape[0], dims))
+
+            X = self._predictive_variable
+            alpha = self.posterior.woodbury_vector
+            Wi = self.posterior.woodbury_inv
+
+            k = kern.K(Xnew, X)
+            for dimX in range(dims):
+                dk_dx = kern.dK_dX(Xnew, X, dimX)
+                dk_dxdiag = kern.dK_dXdiag(Xnew, dimX)
+
+                mean_jac[:,dimX] = np.dot(dk_dx, alpha).flatten()
+                var_jac[:,dimX] = dk_dxdiag - 2*(np.dot(k, Wi)*dk_dx).sum(-1)
+            return mean_jac, var_jac
+
        mean_jac = np.empty((Xnew.shape[0],Xnew.shape[1]-1,self.output_dim))
        for i in range(self.output_dim):
            mean_jac[:,:,i] = kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self._predictive_variable)[:,0:-1]
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@ -5,6 +5,8 @@ from __future__ import division
 import unittest
 import numpy as np
 import GPy
+from GPy.models import GradientChecker
+from functools import reduce

 class MiscTests(unittest.TestCase):
    def setUp(self):
@ -1208,40 +1210,6 @@ class GradientTests(np.testing.TestCase):

        with self.assertRaises(RuntimeError):
            m._raw_posterior_covariance_between_points(np.array([[1], [2]]), np.array([[3], [4]]))
-    
-    def test_multioutput_model_with_derivative_observations(self):
-        f = lambda x: np.sin(x)+0.1*(x-2.)**2-0.005*x**3
-        fd = lambda x: np.cos(x)+0.2*(x-2.)-0.015*x**2
-        N=10
-        M=10
-        sigma=0.05
-        sigmader=0.05
-        x = np.array([np.linspace(1,10,N)]).T
-        y = f(x) + np.array(sigma*np.random.normal(0,1,(N,1)))
-
-        xd = np.array([np.linspace(2,8,M)]).T
-        yd = fd(xd) + np.array(sigmader*np.random.normal(0,1,(M,1)))
-
-        # squared exponential kernel:
-        se = GPy.kern.RBF(input_dim = 1, lengthscale=1.5, variance=0.2)
-        # We need to generate separate kernel for the derivative observations and give the created kernel as an input:
-        se_der = GPy.kern.DiffKern(se, 0)
-
-        #Then 
-        gauss = GPy.likelihoods.Gaussian(variance=sigma**2)
-        gauss = GPy.likelihoods.Gaussian(variance=0.1)
-        gauss_der = GPy.likelihoods.Gaussian(variance=sigma**2)
-
-        # Then create the model, we give everything in lists, the order of the inputs indicates the order of the outputs
-        # Now we have the regular observations first and derivative observations second, meaning that the kernels and
-        # the likelihoods must follow the same order
-        m = GPy.models.MultioutputGP(X_list=[x, xd], Y_list=[y, yd], kernel_list=[se, se_der], likelihood_list = [gauss, gauss])
-        m.randomize()
-        self.assertTrue(m.checkgrad())
-
-        m.optimize(messages=0, ipython_notebook=False)
-
-        self.assertTrue(m.checkgrad())

    def test_multioutput_model_with_ep(self):
        f = lambda x: np.sin(x)+0.1*(x-2.)**2-0.005*x**3
@ -1308,6 +1276,173 @@ class GradientTests(np.testing.TestCase):
        c2 = model.predict(x, full_cov=True)[1]
        np.testing.assert_allclose(c1,c2)

+class GradientMultioutputGPModelTests(np.testing.TestCase):
+    def setUp(self):
+
+        # standard test function
+        self.period = 3
+        self.w = 2*np.pi/self.period
+        self.f = lambda x: np.sum(np.square(np.sin(self.w*x)), axis=1)
+        self.df = lambda x: self.w*np.sin(2*self.w*x)
+
+        self.noise_std = 1e-2
+
+        self.bounds = (-self.period, self.period)
+
+        self.train_points = 5
+        self.test_points = 25
+
+    def approximate_predictive_gradients(self, model, x_test, D, step=1e-6):
+        '''
+        Approximates gradients of predicted posterior means and variances.
+
+        This function is used as the frameworks for GradientChecker and
+        MultioutputGP do not easily combine when checking gradients of predicted
+        partial derivative posteriors.
+        '''
+
+        dmdx_aprx = np.zeros((x_test.shape[0]*(D + 1), D))
+        dvdx_aprx = np.zeros((x_test.shape[0]*(D + 1), D))
+
+        for d in range(D):
+
+            x_over = x_test.copy()
+            x_over[:,d] += step
+            x_undr = x_test.copy()
+            x_undr[:,d] -= step
+
+            m_over, v_over = model.predict([x_over]*(D + 1))
+            m_undr, v_undr = model.predict([x_undr]*(D + 1))
+
+            dmdx_aprx[:,d,None] = (m_over - m_undr)/(2*step)
+            dvdx_aprx[:,d,None] = (v_over - v_undr)/(2*step)
+
+        return dmdx_aprx, dvdx_aprx
+
+    def check_model(self, kern):
+        '''
+        Checks predictions, hyperparameter gradients, and gradients of predicted
+        posterior means and variances for MultioutputGP models that incorporate
+        observed latent function gradient information.
+        '''
+
+        D = kern.input_dim
+
+        X_list = []
+        Y_list = []
+        for i in range(D + 1):
+            # sample inputs for either latent function or partial derivatives
+            X_i = np.random.uniform(*self.bounds, size=(self.train_points, D))
+            # output of latent function or partial derivatives
+            Y_i = (self.f(X_i) if (i == 0) else self.df(X_i)[:,i - 1])[:,None]
+            # noisy observations
+            Y_i += np.random.normal(scale=self.noise_std, size=Y_i.shape)
+
+            X_list.append(X_i)
+            Y_list.append(Y_i)
+
+        # the kernel is accompanied with derivative kernels, one for each dimension
+        kernel_list = [kern] + [GPy.kern.DiffKern(kern, d) for d in range(D)]
+
+        # create model and check its hyperparameter gradient
+        likelihood_list = [GPy.likelihoods.Gaussian(variance=self.noise_std**2)]*(D + 1)
+        model = GPy.models.MultioutputGP(X_list, Y_list, kernel_list, likelihood_list)
+        model.likelihood.constrain_fixed()
+        self.assertTrue(model.checkgrad(step=1e-3))
+
+        # optimize the model, and check its hyperparameter gradient again
+        model.optimize()
+        self.assertTrue(model.checkgrad(step=1e-3))
+
+        # check predictions
+        np.testing.assert_allclose(model.predict(X_list)[0], model.Y, atol=3*self.noise_std)
+
+        # test inputs for checking predictive gradients
+        x_test = np.random.uniform(*self.bounds, size=(self.test_points, D))
+
+        # predictive gradients
+        dmdx, dvdx = model.predictive_gradients([x_test]*(D + 1))
+        # approximated predictive gradients
+        dmdx_aprx, dvdx_aprx = self.approximate_predictive_gradients(model, x_test, D, step=1e-3)
+        # check predictive gradients
+        np.testing.assert_allclose(dmdx, dmdx_aprx, atol=3*self.noise_std)
+        np.testing.assert_allclose(dvdx, dvdx_aprx, atol=3*self.noise_std)
+
+    def test_MultioutputGP_gradobs_RBF(self):
+        '''
+        Testing gradient observing MultioutputGP model with an RBF kernel.
+        '''
+        for D in range(1, 4):
+            kern = GPy.kern.RBF(input_dim=D)
+            kern.randomize()
+            self.check_model(kern)
+
+    def test_MultioutputGP_gradobs_RBF_ARD(self):
+        '''
+        Testing gradient observing MultioutputGP model with an RBF (ARD) kernel.
+        '''
+        for D in range(1, 4):
+            kern = GPy.kern.RBF(input_dim=D, ARD=True)
+            kern.randomize()
+            self.check_model(kern)
+
+    def test_MultioutputGP_gradobs_StdP(self):
+        '''
+        Testing gradient observing MultioutputGP model with a StdP kernel.
+        '''
+        for D in range(1, 4):
+            kern = GPy.kern.StdPeriodic(input_dim=D, period=self.period)
+            kern.period.constrain_fixed()
+            kern.randomize()
+            self.check_model(kern)
+
+    def test_MultioutputGP_gradobs_StdP_ARD(self):
+        '''
+        Testing gradient observing MultioutputGP model with a StdP (ARD) kernel.
+        '''
+        for D in range(1, 4):
+            kern = GPy.kern.StdPeriodic(input_dim=D, period=[self.period]*D, ARD1=True, ARD2=True)
+            kern.period.constrain_fixed()
+            kern.randomize()
+            self.check_model(kern)
+
+    def test_MultioutputGP_gradobs_prod_RBF(self):
+        '''
+        Testing gradient observing MultioutputGP model with several RBF kernels.
+        '''
+        for D in range(2, 4):
+            kerns = [GPy.kern.RBF(input_dim=1) for d in range(D)]
+            kern = reduce(lambda k0, k1: k0 * k1, kerns)
+            kern.randomize()
+            self.check_model(kern)
+
+    def test_MultioutputGP_gradobs_prod_StdP(self):
+        '''
+        Testing gradient observing MultioutputGP model with several StdP kernels.
+        '''
+        for D in range(2, 4):
+            kerns = [GPy.kern.StdPeriodic(input_dim=1, period=self.period) for d in range(D)]
+            kern = reduce(lambda k0, k1: k0 * k1, kerns)
+            [k.period.constrain_fixed() for k in kern.parts]
+            kern.randomize()
+            self.check_model(kern)
+
+    def test_MultioutputGP_gradobs_prod_mix(self):
+        '''
+        Testing gradient observing MultioutputGP model with a mix of kernel types.
+        '''
+        for D in range(2, 4):
+            kerns = []
+            for d in range(D):
+                if d % 2 == 0:
+                    k = GPy.kern.RBF(input_dim=1)
+                else:
+                    k = GPy.kern.StdPeriodic(input_dim=1, period=self.period)
+                    k.period.constrain_fixed()
+                kerns.append(k)
+            kern = reduce(lambda k0, k1: k0 * k1, kerns)
+            kern.randomize()
+            self.check_model(kern)

 def _create_missing_data_model(kernel, Q):
    D1, D2, D3, N, num_inducing = 13, 5, 8, 400, 3