diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 6b1c9cb4..303921ea 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -377,7 +377,7 @@ class GP(Model):
         if full_cov:
             dK2_dXdX = kern.gradients_XX(one, Xnew)
         else:
-            dK2_dXdX = -kern.gradients_XX(one, Xnew).sum(0)
+            dK2_dXdX = kern.gradients_XX(one, Xnew).sum(0)
         #dK2_dXdX = np.zeros((Xnew.shape[0], Xnew.shape[1], Xnew.shape[1]))
         #for i in range(Xnew.shape[0]):
         #    dK2_dXdX[i:i+1,:,:] = kern.gradients_XX(one, Xnew[i:i+1,:])
diff --git a/GPy/kern/src/add.py b/GPy/kern/src/add.py
index 5ac773c9..99fe809b 100644
--- a/GPy/kern/src/add.py
+++ b/GPy/kern/src/add.py
@@ -85,23 +85,22 @@ class Add(CombinationKernel):
         [target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts]
         return target
 
-    def gradients_XX(self, dL_dK, X, X2, cov=True):
-        if cov: # full covarance
-            if X2 is None:
-                target = np.zeros((X.shape[0], X.shape[0], X.shape[1], X.shape[1]))
-            else:
-                target = np.zeros((X.shape[0], X2.shape[0], X.shape[1], X.shape[1]))
-        else: # diagonal covariance
-            if X2 is None:
-                target = np.zeros((X.shape[0], X.shape[0], X.shape[1]))
-            else:
-                target = np.zeros((X.shape[0], X2.shape[0], X.shape[1]))
-        [target.__iadd__(p.gradients_XX(dL_dK, X, X2, cov=cov)) for p in self.parts]
+    def gradients_XX(self, dL_dK, X, X2):
+        if X2 is None:
+            target = np.zeros((X.shape[0], X.shape[0], X.shape[1], X.shape[1]))
+        else:
+            target = np.zeros((X.shape[0], X2.shape[0], X.shape[1], X.shape[1]))
+        #else: # diagonal covariance
+        #    if X2 is None:
+        #        target = np.zeros((X.shape[0], X.shape[0], X.shape[1]))
+        #    else:
+        #        target = np.zeros((X.shape[0], X2.shape[0], X.shape[1]))
+        [target.__iadd__(p.gradients_XX(dL_dK, X, X2)) for p in self.parts]
         return target
 
-    def gradients_XX_diag(self, dL_dKdiag, X, cov=True):
+    def gradients_XX_diag(self, dL_dKdiag, X):
         target = np.zeros(X.shape+(X.shape[1],))
-        [target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X, cov=cov)) for p in self.parts]
+        [target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts]
         return target
 
     @Cache_this(limit=3, force_kwargs=['which_parts'])
@@ -188,7 +187,7 @@ class Add(CombinationKernel):
 
     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         tmp = dL_dpsi2.sum(0)+ dL_dpsi2.sum(1) if len(dL_dpsi2.shape)==2 else dL_dpsi2.sum(2)+ dL_dpsi2.sum(1)
-        
+        if not self._exact_psicomp: return Kern.update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)
         from .static import White, Bias
         for p1 in self.parts:
@@ -200,9 +199,9 @@ class Add(CombinationKernel):
                 if isinstance(p2, White):
                     continue
                 elif isinstance(p2, Bias):
-                    eff_dL_dpsi1 += tmp * p2.variance 
+                    eff_dL_dpsi1 += tmp * p2.variance
                 else:# np.setdiff1d(p1._all_dims_active, ar2, assume_unique): # TODO: Careful, not correct for overlapping _all_dims_active
-                    eff_dL_dpsi1 += tmp * p2.psi1(Z, variational_posterior) 
+                    eff_dL_dpsi1 += tmp * p2.psi1(Z, variational_posterior)
             p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
 
     def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
@@ -219,7 +218,7 @@ class Add(CombinationKernel):
                 if isinstance(p2, White):
                     continue
                 elif isinstance(p2, Bias):
-                    eff_dL_dpsi1 += tmp * p2.variance 
+                    eff_dL_dpsi1 += tmp * p2.variance
                 else:
                     eff_dL_dpsi1 += tmp * p2.psi1(Z, variational_posterior)
             target += p1.gradients_Z_expectations(dL_psi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
@@ -227,7 +226,7 @@ class Add(CombinationKernel):
     def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         tmp = dL_dpsi2.sum(0)+ dL_dpsi2.sum(1) if len(dL_dpsi2.shape)==2 else dL_dpsi2.sum(2)+ dL_dpsi2.sum(1)
-        
+        if not self._exact_psicomp: return Kern.gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)
         from .static import White, Bias
         target_grads = [np.zeros(v.shape) for v in variational_posterior.parameters]
@@ -240,9 +239,9 @@ class Add(CombinationKernel):
                 if isinstance(p2, White):
                     continue
                 elif isinstance(p2, Bias):
-                    eff_dL_dpsi1 += tmp * p2.variance 
+                    eff_dL_dpsi1 += tmp * p2.variance
                 else:
-                    eff_dL_dpsi1 += tmp * p2.psi1(Z, variational_posterior) 
+                    eff_dL_dpsi1 += tmp * p2.psi1(Z, variational_posterior)
             grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
             [np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))]
         return target_grads
@@ -255,7 +254,7 @@ class Add(CombinationKernel):
     #            other.unlink_parameter(p)
     #        parts.extend(other.parts)
     #        #self.link_parameters(*other_params)
-    #        
+    #
     #    else:
     #        #self.link_parameter(other)
     #        parts.append(other)
@@ -271,7 +270,7 @@ class Add(CombinationKernel):
         else:
             return super(Add, self).input_sensitivity(summarize)
-    
+
     def sde_update_gradient_full(self, gradients):
         """
         Update gradient in the order in which parameters are represented in the
@@ -283,12 +282,12 @@ class Add(CombinationKernel):
             part_param_num = len(p.param_array) # number of parameters in the part
             p.sde_update_gradient_full(gradients[part_start_param_index:(part_start_param_index+part_param_num)])
             part_start_param_index += part_param_num
-    
+
     def sde(self):
         """
         Support adding kernels for sde representation
         """
-        
+
         import scipy.linalg as la
 
         F = None
@@ -312,51 +311,51 @@ class Add(CombinationKernel):
             L = la.block_diag(L,Lt) if (L is not None) else Lt
             Qc = la.block_diag(Qc,Qct) if (Qc is not None) else Qct
             H = np.hstack((H,Ht)) if (H is not None) else Ht
-            
+
             Pinf = la.block_diag(Pinf,Pinft) if (Pinf is not None) else Pinft
             P0 = la.block_diag(P0,P0t) if (P0 is not None) else P0t
-            
+
             if dF is not None:
                 dF = np.pad(dF,((0,dFt.shape[0]),(0,dFt.shape[1]),(0,dFt.shape[2])), 'constant', constant_values=0)
                 dF[-dFt.shape[0]:,-dFt.shape[1]:,-dFt.shape[2]:] = dFt
             else:
                 dF = dFt
-            
+
             if dQc is not None:
                 dQc = np.pad(dQc,((0,dQct.shape[0]),(0,dQct.shape[1]),(0,dQct.shape[2])), 'constant', constant_values=0)
                 dQc[-dQct.shape[0]:,-dQct.shape[1]:,-dQct.shape[2]:] = dQct
             else:
                 dQc = dQct
-            
+
             if dPinf is not None:
                 dPinf = np.pad(dPinf,((0,dPinft.shape[0]),(0,dPinft.shape[1]),(0,dPinft.shape[2])), 'constant', constant_values=0)
                 dPinf[-dPinft.shape[0]:,-dPinft.shape[1]:,-dPinft.shape[2]:] = dPinft
             else:
                 dPinf = dPinft
-            
+
             if dP0 is not None:
                 dP0 = np.pad(dP0,((0,dP0t.shape[0]),(0,dP0t.shape[1]),(0,dP0t.shape[2])), 'constant', constant_values=0)
                 dP0[-dP0t.shape[0]:,-dP0t.shape[1]:,-dP0t.shape[2]:] = dP0t
             else:
                 dP0 = dP0t
-            
+
             n += Ft.shape[0]
             nq += Qct.shape[0]
             nd += dFt.shape[2]
-        
+
         assert (F.shape[0] == n and F.shape[1]==n), "SDE add: Check of F Dimensions failed"
         assert (L.shape[0] == n and L.shape[1]==nq), "SDE add: Check of L Dimensions failed"
         assert (Qc.shape[0] == nq and Qc.shape[1]==nq), "SDE add: Check of Qc Dimensions failed"
         assert (H.shape[0] == 1 and H.shape[1]==n), "SDE add: Check of H Dimensions failed"
         assert (Pinf.shape[0] == n and Pinf.shape[1]==n), "SDE add: Check of Pinf Dimensions failed"
-        assert (P0.shape[0] == n and P0.shape[1]==n), "SDE add: Check of P0 Dimensions failed" 
+        assert (P0.shape[0] == n and P0.shape[1]==n), "SDE add: Check of P0 Dimensions failed"
         assert (dF.shape[0] == n and dF.shape[1]==n and dF.shape[2]==nd), "SDE add: Check of dF Dimensions failed"
         assert (dQc.shape[0] == nq and dQc.shape[1]==nq and dQc.shape[2]==nd), "SDE add: Check of dQc Dimensions failed"
         assert (dPinf.shape[0] == n and dPinf.shape[1]==n and dPinf.shape[2]==nd), "SDE add: Check of dPinf Dimensions failed"
         assert (dP0.shape[0] == n and dP0.shape[1]==n and dP0.shape[2]==nd), "SDE add: Check of dP0 Dimensions failed"
-        
+
         return (F,L,Qc,H,Pinf,P0,dF,dQc,dPinf,dP0)
diff --git a/GPy/kern/src/stationary.py b/GPy/kern/src/stationary.py
index 22613fa2..141a1347 100644
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@@ -266,11 +266,11 @@ class Stationary(Kern):
         ..returns:
             dL2_dXdX: [NxQxQ]
         """
-        dL_dK_diag = dL_dK_diag.reshape(-1, 1, 1)
+        dL_dK_diag = dL_dK_diag.copy().reshape(-1, 1, 1)
         assert dL_dK_diag.size == X.shape[0], "dL_dK_diag has to be given as row [N] or column vector [Nx1]"
 
-        l2 = np.ones(X.shape[1])*self.lengthscale**2
-        return (dL_dK_diag * self.variance/(l2[:,None]*l2[None,:]))# np.zeros(X.shape+(X.shape[1],))
+        l4 = np.ones(X.shape[1])*self.lengthscale**2
+        return dL_dK_diag * (np.eye(X.shape[1]) * self.variance/(l4))[None, :,:]# np.zeros(X.shape+(X.shape[1],))
         #return np.ones(X.shape) * d2L_dK * self.variance/self.lengthscale**2 # np.zeros(X.shape)
 
     def _gradients_X_pure(self, dL_dK, X, X2=None):
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index 9971e6d6..99951eb1 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -135,10 +135,13 @@ class Kern_check_d2Kdiag_dXdX(Kern_check_model):
         self.Xc = X.copy()
 
     def log_likelihood(self):
-        return np.sum(self.kernel.gradients_X_diag(self.dL_dK.diagonal(), self.X))
+        l = 0.
+        for i in range(self.X.shape[0]):
+            l += self.kernel.gradients_X(self.dL_dK[[i],[i]], self.X[[i]], self.Xc[[i]]).sum()
+        return l
 
     def parameters_changed(self):
-        grads = self.kernel.gradients_XX_diag(self.dL_dK.diagonal(), self.X)
+        grads = -self.kernel.gradients_XX_diag(self.dL_dK.diagonal(), self.X)
         self.X.gradient[:] = grads.sum(-1)
 
 def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verbose=False, fixed_X_dims=None):
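
Notes (not part of the patch). The three Python sketches below were written for this review to illustrate what the hunks above change; they are not code from the repository. They assume numpy plus a GPy checkout with this patch applied, and every shape, helper name, and number in them is invented for illustration.

First, the shape contract implied by the new Add.gradients_XX / Add.gradients_XX_diag signatures: with the cov keyword gone, gradients_XX always returns the full [N x M x Q x Q] block of second derivatives (one QxQ block per pair of input rows), and gradients_XX_diag returns the [N x Q x Q] diagonal blocks.

import numpy as np
import GPy

N, M, Q = 5, 3, 2
X = np.random.randn(N, Q)
X2 = np.random.randn(M, Q)
k = GPy.kern.RBF(Q) + GPy.kern.Matern32(Q)   # '+' builds an Add kernel

# Cross second derivatives: one QxQ block per pair of rows of X and X2.
print(k.gradients_XX(np.ones((N, M)), X, X2).shape)    # (5, 3, 2, 2)
print(k.gradients_XX(np.ones((N, N)), X, None).shape)  # (5, 5, 2, 2) when X2 is None
# Diagonal variant: one QxQ block per row of X.
print(k.gradients_XX_diag(np.ones(N), X).shape)        # (5, 2, 2)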
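
Second, the kernel_tests.py hunk rewrites Kern_check_d2Kdiag_dXdX so that log_likelihood evaluates sum_i dL_dK[i,i] * dK(x_i, xc_i)/dx_i with the copy Xc held fixed, and parameters_changed reports -gradients_XX_diag(...).sum(-1) as its gradient. A hand-rolled version of the same finite-difference comparison (the helper ll and all sizes are made up; in GPy the comparison is driven by GradientChecker instead):

import numpy as np
import GPy

def ll(kern, X, Xc, dL_dK):
    # The quantity the new log_likelihood computes: per-point gradients_X
    # evaluated against a fixed copy Xc of the inputs.
    return sum(kern.gradients_X(dL_dK[[i], [i]], X[[i]], Xc[[i]]).sum()
               for i in range(X.shape[0]))

N, Q, eps = 4, 3, 1e-6
X = np.random.randn(N, Q)
dL_dK = np.ones((N, N))
kern = GPy.kern.RBF(Q)

# Analytic gradient, including the sign flip introduced in parameters_changed.
analytic = -kern.gradients_XX_diag(dL_dK.diagonal(), X).sum(-1)

# Central finite differences of ll w.r.t. X, holding Xc at the original X.
numeric = np.zeros_like(X)
for n in range(N):
    for q in range(Q):
        Xp, Xm = X.copy(), X.copy()
        Xp[n, q] += eps
        Xm[n, q] -= eps
        numeric[n, q] = (ll(kern, Xp, X, dL_dK) - ll(kern, Xm, X, dL_dK)) / (2 * eps)

print(np.allclose(analytic, numeric, atol=1e-4))  # expected: True

Since dL_dK.diagonal() is a read-only view in recent numpy, the .copy() added in the stationary.py hunk is presumably what keeps this call path from tripping over that view.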
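
Third, the rewritten Stationary.gradients_XX_diag is diagonal across input dimensions: per point n it returns dL_dK_diag[n] * diag(variance / lengthscale**2), replacing the previous dense l2[:,None]*l2[None,:] scaling (note the variable is named l4 but holds lengthscale**2). A direct check of that closed form for an ARD RBF, with arbitrary concrete numbers:

import numpy as np
import GPy

N, Q = 4, 3
X = np.random.randn(N, Q)
dL = np.random.randn(N)
ls = np.array([0.5, 1.0, 2.0])
k = GPy.kern.RBF(Q, variance=2.0, lengthscale=ls, ARD=True)

got = k.gradients_XX_diag(dL, X)   # shape (N, Q, Q)
# The closed form the new return line implements.
expected = dL[:, None, None] * (np.eye(Q) * 2.0 / ls**2)[None, :, :]
print(np.allclose(got, expected))  # expected: True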