diff --git a/GPy/kern/src/add.py b/GPy/kern/src/add.py
index ec09f157..2ad515dd 100644
--- a/GPy/kern/src/add.py
+++ b/GPy/kern/src/add.py
@@ -85,13 +85,20 @@ class Add(CombinationKernel):
         [target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts]
         return target

-    def gradients_XX(self, dL_dK, X, X2):
-        if X2 is None:
-            target = np.zeros((X.shape[0], X.shape[0], X.shape[1]))
-        else:
-            target = np.zeros((X.shape[0], X2.shape[0], X.shape[1]))
-        [target.__iadd__(p.gradients_XX(dL_dK, X, X2)) for p in self.parts]
-        return target
+    # def gradients_XX(self, dL_dK, X, X2, cov=True):
+    #     if cov:  # full covariance
+    #         if X2 is None:
+    #             target = np.zeros((X.shape[0], X.shape[0], X.shape[1], X.shape[1]))
+    #         else:
+    #             target = np.zeros((X.shape[0], X2.shape[0], X.shape[1], X.shape[1]))
+    #     else:  # diagonal covariance
+    #         if X2 is None:
+    #             target = np.zeros((X.shape[0], X.shape[0], X.shape[1]))
+    #         else:
+    #             target = np.zeros((X.shape[0], X2.shape[0], X.shape[1]))
+
+    #     [target.__iadd__(p.gradients_XX(dL_dK, X, X2, cov=cov)) for p in self.parts]
+    #     return target

     def gradients_XX_diag(self, dL_dKdiag, X):
         target = np.zeros(X.shape)
diff --git a/GPy/kern/src/kern.py b/GPy/kern/src/kern.py
index 393cf1c2..b64d145b 100644
--- a/GPy/kern/src/kern.py
+++ b/GPy/kern/src/kern.py
@@ -132,7 +132,7 @@ class Kern(Parameterized):
         raise NotImplementedError
     def gradients_X_X2(self, dL_dK, X, X2):
         return self.gradients_X(dL_dK, X, X2), self.gradients_X(dL_dK.T, X2, X)
-    def gradients_XX(self, dL_dK, X, X2):
+    def gradients_XX(self, dL_dK, X, X2, cov=True):
        """
        .. math::
diff --git a/GPy/kern/src/kernel_slice_operations.py b/GPy/kern/src/kernel_slice_operations.py
index 57b34de9..c0d46c0f 100644
--- a/GPy/kern/src/kernel_slice_operations.py
+++ b/GPy/kern/src/kernel_slice_operations.py
@@ -24,7 +24,7 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
         put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
         put_clean(dct, 'gradients_X', _slice_gradients_X)
         put_clean(dct, 'gradients_X_X2', _slice_gradients_X)
-        put_clean(dct, 'gradients_XX', _slice_gradients_XX)
+        # put_clean(dct, 'gradients_XX', _slice_gradients_XX)
         put_clean(dct, 'gradients_XX_diag', _slice_gradients_X_diag)
         put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)

@@ -112,18 +112,23 @@ def _slice_gradients_X(f):
         return ret
     return wrap

-def _slice_gradients_XX(f):
-    @wraps(f)
-    def wrap(self, dL_dK, X, X2=None):
-        if X2 is None:
-            N, M = X.shape[0], X.shape[0]
-        else:
-            N, M = X.shape[0], X2.shape[0]
-        with _Slice_wrap(self, X, X2, ret_shape=(N, M, X.shape[1])) as s:
-            #with _Slice_wrap(self, X, X2, ret_shape=None) as s:
-            ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
-        return ret
-    return wrap
+# def _slice_gradients_XX(f):
+#     @wraps(f)
+#     def wrap(self, dL_dK, X, X2=None, cov=True):
+#         if X2 is None:
+#             N, M = X.shape[0], X.shape[0]
+#         else:
+#             N, M = X.shape[0], X2.shape[0]
+#         if cov:  # full covariance
+#             with _Slice_wrap(self, X, X2, ret_shape=(N, M, X.shape[1], X.shape[1])) as s:
+#                 ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
+#         else:  # diagonal covariance
+#             with _Slice_wrap(self, X, X2, ret_shape=(N, M, X.shape[1])) as s:
+#                 ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
+#         return ret
+#     return wrap

 def _slice_gradients_X_diag(f):
     @wraps(f)
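A minimal standalone sketch of the accumulation the commented-out Add.gradients_XX above would perform once re-enabled; add_gradients_XX is a hypothetical helper name used only for illustration, not part of the patch:

import numpy as np

def add_gradients_XX(parts, dL_dK, X, X2=None, cov=True):
    # Sum the second-derivative contributions of the parts of an additive kernel.
    M = X.shape[0] if X2 is None else X2.shape[0]
    if cov:
        # full covariance over input dimensions: a QxQ block per pair of points
        target = np.zeros((X.shape[0], M, X.shape[1], X.shape[1]))
    else:
        # diagonal only: a length-Q vector per pair of points
        target = np.zeros((X.shape[0], M, X.shape[1]))
    for p in parts:
        target += p.gradients_XX(dL_dK, X, X2, cov=cov)
    return target

Because differentiation is linear, the second derivative of a sum kernel is simply the sum of the parts' second derivatives, so only the accumulator shape depends on the cov flag.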
diff --git a/GPy/kern/src/stationary.py b/GPy/kern/src/stationary.py
index 5e137abb..90aa4297 100644
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@@ -218,45 +218,60 @@ class Stationary(Kern):
         else:
             return self._gradients_X_pure(dL_dK, X, X2)

-    def gradients_XX(self, dL_dK, X, X2=None):
+    def gradients_XX(self, dL_dK, X, X2=None, cov=True):
         """
         Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2:

+        cov=True:  returns the full [QxQ] covariance matrix over the input dimensions for each pair of vectors
+        cov=False: returns only the diagonal of that [QxQ] covariance matrix for each pair of vectors
+                   (computationally cheaper when the full covariance matrix is not needed)
+
         ..math:
-            \frac{\partial^2 K}{\partial X\partial X2}
+            \frac{\partial^2 K}{\partial X2^2} = - \frac{\partial^2 K}{\partial X\partial X2}

         ..returns:
-            dL2_dXdX2: NxMxQ, for X [NxQ] and X2[MxQ] (X2 is X if, X2 is None)
-            Thus, we return the second derivative in X2.
+            dL2_dXdX2: [NxMxQ] when cov=False, or [NxMxQxQ] when cov=True,
+            for X [NxQ] and X2 [MxQ] (X2 is X if X2 is None).
+            Thus, we return the second derivative in X2.
         """
-        # The off diagonals in Q are always zero, this should also be true for the Linear kernel...
         # According to multivariable chain rule, we can chain the second derivative through r:
         # d2K_dXdX2 = dK_dr*d2r_dXdX2 + d2K_drdr * dr_dX * dr_dX2:
         invdist = self._inv_dist(X, X2)
         invdist2 = invdist**2
-
-        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
+        dL_dr = self.dK_dr_via_X(X, X2)  # the product with dL_dK is performed later
         tmp1 = dL_dr * invdist
-
-        dL_drdr = self.dK2_drdr_via_X(X, X2) * dL_dK
+        dL_drdr = self.dK2_drdr_via_X(X, X2)  # the product with dL_dK is performed later
         tmp2 = dL_drdr * invdist2
-
-        l2 = np.ones(X.shape[1]) * self.lengthscale**2
+        l2 = np.ones(X.shape[1])*self.lengthscale**2
         if X2 is None:
             X2 = X
             tmp1 -= np.eye(X.shape[0])*self.variance
         else:
-            tmp1[X==X2.T] -= self.variance
-
-        grad = np.empty((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
-        #grad = np.empty(X.shape, dtype=np.float64)
-        for q in range(self.input_dim):
-            tmpdist2 = (X[:,[q]]-X2[:,[q]].T) ** 2
-            grad[:, :, q] = ((tmp1*invdist2 - tmp2)*tmpdist2/l2[q] - tmp1)/l2[q]
-            #grad[:, :, q] = ((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q]
-            #np.sum(((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q], axis=1, out=grad[:,q])
-            #np.sum( - (tmp2*(tmpdist**2)), axis=1, out=grad[:,q])
+            # Old version, to be removed: tmp1[X==X2.T] -= self.variance
+            # (seems buggy: the mask does not reliably pick out coincident input points)
+            tmp1[invdist2==0.] -= self.variance
+
+        if cov:  # full covariance: [N, M, Q, Q]
+            grad = np.empty((X.shape[0], X2.shape[0], X2.shape[1], X.shape[1]), dtype=np.float64)
+            for q in range(self.input_dim):
+                for r in range(self.input_dim):
+                    # cross-dimension product of distances, (X_q - X2_q)(X_r - X2_r)
+                    tmpdist2 = (X[:,[q]]-X2[:,[q]].T)*(X[:,[r]]-X2[:,[r]].T)
+                    if r == q:
+                        grad[:, :, q, r] = dL_dK*(((tmp1*invdist2 - tmp2)*tmpdist2/l2[r] - tmp1)/l2[q])
+                    else:
+                        grad[:, :, q, r] = dL_dK*(((tmp1*invdist2 - tmp2)*tmpdist2/l2[r])/l2[q])
+        else:  # diagonal covariance: [N, M, Q]
+            grad = np.empty((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
+            for q in range(self.input_dim):
+                tmpdist2 = (X[:,[q]]-X2[:,[q]].T) ** 2
+                grad[:, :, q] = dL_dK*(((tmp1*invdist2 - tmp2)*tmpdist2/l2[q] - tmp1)/l2[q])
         return grad

     def gradients_XX_diag(self, dL_dK, X):
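A usage sketch of the patched Stationary.gradients_XX, assuming an RBF kernel (a Stationary subclass in GPy) and the cov-aware signature introduced above; the sizes N, M, Q and the consistency check are illustrative only, not a test shipped with the patch:

import numpy as np
import GPy

N, M, Q = 5, 4, 3
X = np.random.randn(N, Q)
X2 = np.random.randn(M, Q)
dL_dK = np.random.randn(N, M)

k = GPy.kern.RBF(input_dim=Q, ARD=True)
full = k.gradients_XX(dL_dK, X, X2, cov=True)    # expected shape (N, M, Q, Q)
diag = k.gradients_XX(dL_dK, X, X2, cov=False)   # expected shape (N, M, Q)

assert full.shape == (N, M, Q, Q)
assert diag.shape == (N, M, Q)
# cov=False computes only the q==r terms, so it should match the per-pair
# diagonal of the full QxQ blocks up to floating-point error.
assert np.allclose(np.einsum('nmqq->nmq', full), diag)

With cov=False the double loop over input dimensions collapses to a single loop, which is the computational saving the docstring refers to.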