diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index decca6b8..9cbe3daf 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -337,8 +337,7 @@ class GP(Model):
         dv_dX += kern.gradients_X(alpha, Xnew, self._predictive_variable)
         return mean_jac, dv_dX
 
-
-    def predict_jacobian(self, Xnew, kern=None, full_cov=True):
+    def predict_jacobian(self, Xnew, kern=None, full_cov=False):
         """
         Compute the derivatives of the posterior of the GP.
 
@@ -356,15 +355,11 @@ class GP(Model):
         :param X: The points at which to get the predictive gradients.
         :type X: np.ndarray (Xnew x self.input_dim)
         :param kern: The kernel to compute the jacobian for.
-        :param boolean full_cov: whether to return the full covariance of the jacobian.
+        :param boolean full_cov: whether to return the cross-covariance terms between
+        the N* Jacobian vectors
 
         :returns: dmu_dX, dv_dX
         :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q,(D)) ]
-
-        Note: We always return sum in input_dim gradients, as the off-diagonals
-        in the input_dim are not needed for further calculations.
-        This is a compromise for increase in speed. Mathematically the jacobian would
-        have another dimension in Q.
         """
         if kern is None:
             kern = self.kern
@@ -383,24 +378,26 @@ class GP(Model):
             dK2_dXdX = kern.gradients_XX(one, Xnew)
         else:
             dK2_dXdX = kern.gradients_XX_diag(one, Xnew)
+            #dK2_dXdX = np.zeros((Xnew.shape[0], Xnew.shape[1], Xnew.shape[1]))
+            #for i in range(Xnew.shape[0]):
+            #    dK2_dXdX[i:i+1,:,:] = kern.gradients_XX(one, Xnew[i:i+1,:])
 
         def compute_cov_inner(wi):
             if full_cov:
-                # full covariance gradients:
-                var_jac = dK2_dXdX - np.einsum('qnm,miq->niq', dK_dXnew_full.T.dot(wi), dK_dXnew_full)
+                var_jac = dK2_dXdX - np.einsum('qnm,msr->nsqr', dK_dXnew_full.T.dot(wi), dK_dXnew_full) # n,s = Xnew.shape[0], m = pred_var.shape[0]
             else:
-                var_jac = dK2_dXdX - np.einsum('qim,miq->iq', dK_dXnew_full.T.dot(wi), dK_dXnew_full)
+                var_jac = dK2_dXdX - np.einsum('qnm,mnr->nqr', dK_dXnew_full.T.dot(wi), dK_dXnew_full)
             return var_jac
 
         if self.posterior.woodbury_inv.ndim == 3: # Missing data:
             if full_cov:
-                var_jac = np.empty((Xnew.shape[0],Xnew.shape[0],Xnew.shape[1],self.output_dim))
+                var_jac = np.empty((Xnew.shape[0],Xnew.shape[0],Xnew.shape[1],Xnew.shape[1],self.output_dim))
+                for d in range(self.posterior.woodbury_inv.shape[2]):
+                    var_jac[:, :, :, :, d] = compute_cov_inner(self.posterior.woodbury_inv[:, :, d])
+            else:
+                var_jac = np.empty((Xnew.shape[0],Xnew.shape[1],Xnew.shape[1],self.output_dim))
                 for d in range(self.posterior.woodbury_inv.shape[2]):
                     var_jac[:, :, :, d] = compute_cov_inner(self.posterior.woodbury_inv[:, :, d])
-            else:
-                var_jac = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim))
-                for d in range(self.posterior.woodbury_inv.shape[2]):
-                    var_jac[:, :, d] = compute_cov_inner(self.posterior.woodbury_inv[:, :, d])
         else:
             var_jac = compute_cov_inner(self.posterior.woodbury_inv)
         return mean_jac, var_jac
@@ -424,10 +421,11 @@ class GP(Model):
         mu_jac, var_jac = self.predict_jacobian(Xnew, kern, full_cov=False)
         mumuT = np.einsum('iqd,ipd->iqp', mu_jac, mu_jac)
         Sigma = np.zeros(mumuT.shape)
-        if var_jac.ndim == 3:
-            Sigma[(slice(None), )+np.diag_indices(Xnew.shape[1], 2)] = var_jac.sum(-1)
+        if var_jac.ndim == 4: # Missing data
+            Sigma = var_jac.sum(-1)
         else:
-            Sigma[(slice(None), )+np.diag_indices(Xnew.shape[1], 2)] = self.output_dim*var_jac
+            Sigma = self.output_dim*var_jac
+
         G = 0.
         if mean:
             G += mumuT
@@ -451,7 +449,7 @@ class GP(Model):
         :param bool covariance: whether to include the covariance of the wishart embedding.
         :param array-like dimensions: which dimensions of the input space to use [defaults to self.get_most_significant_input_dimensions()[:2]]
         """
-        G = self.predict_wishard_embedding(Xnew, kern, mean, covariance)
+        G = self.predict_wishart_embedding(Xnew, kern, mean, covariance)
         if dimensions is None:
             dimensions = self.get_most_significant_input_dimensions()[:2]
         G = G[:, dimensions][:,:,dimensions]
diff --git a/GPy/kern/src/add.py b/GPy/kern/src/add.py
index ec09f157..99fe809b 100644
--- a/GPy/kern/src/add.py
+++ b/GPy/kern/src/add.py
@@ -87,14 +87,19 @@ class Add(CombinationKernel):
 
     def gradients_XX(self, dL_dK, X, X2):
         if X2 is None:
-            target = np.zeros((X.shape[0], X.shape[0], X.shape[1]))
+            target = np.zeros((X.shape[0], X.shape[0], X.shape[1], X.shape[1]))
         else:
-            target = np.zeros((X.shape[0], X2.shape[0], X.shape[1]))
+            target = np.zeros((X.shape[0], X2.shape[0], X.shape[1], X.shape[1]))
+        #else: # diagonal covariance
+        #    if X2 is None:
+        #        target = np.zeros((X.shape[0], X.shape[0], X.shape[1]))
+        #    else:
+        #        target = np.zeros((X.shape[0], X2.shape[0], X.shape[1]))
         [target.__iadd__(p.gradients_XX(dL_dK, X, X2)) for p in self.parts]
         return target
 
     def gradients_XX_diag(self, dL_dKdiag, X):
-        target = np.zeros(X.shape)
+        target = np.zeros(X.shape+(X.shape[1],))
         [target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts]
         return target
 
@@ -182,7 +187,7 @@ class Add(CombinationKernel):
 
     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         tmp = dL_dpsi2.sum(0)+ dL_dpsi2.sum(1) if len(dL_dpsi2.shape)==2 else dL_dpsi2.sum(2)+ dL_dpsi2.sum(1)
-        
+
         if not self._exact_psicomp: return Kern.update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)
         from .static import White, Bias
         for p1 in self.parts:
@@ -194,9 +199,9 @@ class Add(CombinationKernel):
                 if isinstance(p2, White):
                     continue
                 elif isinstance(p2, Bias):
-                    eff_dL_dpsi1 += tmp * p2.variance 
+                    eff_dL_dpsi1 += tmp * p2.variance
                 else:# np.setdiff1d(p1._all_dims_active, ar2, assume_unique): # TODO: Careful, not correct for overlapping _all_dims_active
-                    eff_dL_dpsi1 += tmp * p2.psi1(Z, variational_posterior) 
+                    eff_dL_dpsi1 += tmp * p2.psi1(Z, variational_posterior)
             p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
 
     def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
@@ -213,7 +218,7 @@ class Add(CombinationKernel):
                 if isinstance(p2, White):
                     continue
                 elif isinstance(p2, Bias):
-                    eff_dL_dpsi1 += tmp * p2.variance 
+                    eff_dL_dpsi1 += tmp * p2.variance
                 else:
                     eff_dL_dpsi1 += tmp * p2.psi1(Z, variational_posterior)
             target += p1.gradients_Z_expectations(dL_psi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
@@ -221,7 +226,7 @@ class Add(CombinationKernel):
 
     def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         tmp = dL_dpsi2.sum(0)+ dL_dpsi2.sum(1) if len(dL_dpsi2.shape)==2 else dL_dpsi2.sum(2)+ dL_dpsi2.sum(1)
-        
+
         if not self._exact_psicomp: return Kern.gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)
         from .static import White, Bias
         target_grads = [np.zeros(v.shape) for v in variational_posterior.parameters]
@@ -234,9 +239,9 @@ class Add(CombinationKernel):
                 if isinstance(p2, White):
                     continue
                 elif isinstance(p2, Bias):
-                    eff_dL_dpsi1 += tmp * p2.variance 
+                    eff_dL_dpsi1 += tmp * p2.variance
                 else:
-                    eff_dL_dpsi1 += tmp * p2.psi1(Z, variational_posterior) 
+                    eff_dL_dpsi1 += tmp * p2.psi1(Z, variational_posterior)
             grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
             [np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))]
         return target_grads
@@ -249,7 +254,7 @@ class Add(CombinationKernel):
     #            other.unlink_parameter(p)
     #        parts.extend(other.parts)
     #        #self.link_parameters(*other_params)
-    #        
+    #
     #    else:
     #        #self.link_parameter(other)
     #        parts.append(other)
@@ -265,7 +270,7 @@ class Add(CombinationKernel):
         else:
 
             return super(Add, self).input_sensitivity(summarize)
-            
+
     def sde_update_gradient_full(self, gradients):
         """
         Update gradient in the order in which parameters are represented in the
@@ -277,12 +282,12 @@ class Add(CombinationKernel):
                 part_param_num = len(p.param_array) # number of parameters in the part
                 p.sde_update_gradient_full(gradients[part_start_param_index:(part_start_param_index+part_param_num)])
                 part_start_param_index += part_param_num
-    
+
     def sde(self):
         """
         Support adding kernels for sde representation
         """
-        
+
         import scipy.linalg as la
 
         F     = None
@@ -306,51 +311,51 @@ class Add(CombinationKernel):
             L = la.block_diag(L,Lt) if (L is not None) else Lt
             Qc = la.block_diag(Qc,Qct) if (Qc is not None) else Qct
             H = np.hstack((H,Ht)) if (H is not None) else Ht
-             
+
             Pinf = la.block_diag(Pinf,Pinft) if (Pinf is not None) else Pinft
             P0 = la.block_diag(P0,P0t) if (P0 is not None) else P0t
-            
+
             if dF is not None:
                 dF = np.pad(dF,((0,dFt.shape[0]),(0,dFt.shape[1]),(0,dFt.shape[2])),
                         'constant', constant_values=0)
                 dF[-dFt.shape[0]:,-dFt.shape[1]:,-dFt.shape[2]:] = dFt
             else:
                 dF = dFt
-             
+
             if dQc is not None:
                 dQc = np.pad(dQc,((0,dQct.shape[0]),(0,dQct.shape[1]),(0,dQct.shape[2])),
                         'constant', constant_values=0)
                 dQc[-dQct.shape[0]:,-dQct.shape[1]:,-dQct.shape[2]:] = dQct
             else:
                 dQc = dQct
-             
+
             if dPinf is not None:
                 dPinf = np.pad(dPinf,((0,dPinft.shape[0]),(0,dPinft.shape[1]),(0,dPinft.shape[2])),
                         'constant', constant_values=0)
                 dPinf[-dPinft.shape[0]:,-dPinft.shape[1]:,-dPinft.shape[2]:] = dPinft
             else:
                 dPinf = dPinft
-                
+
             if dP0 is not None:
                 dP0 = np.pad(dP0,((0,dP0t.shape[0]),(0,dP0t.shape[1]),(0,dP0t.shape[2])),
                         'constant', constant_values=0)
                 dP0[-dP0t.shape[0]:,-dP0t.shape[1]:,-dP0t.shape[2]:] = dP0t
             else:
                 dP0 = dP0t
-                
+
             n += Ft.shape[0]
             nq += Qct.shape[0]
             nd += dFt.shape[2]
-        
+
         assert (F.shape[0] == n and F.shape[1]==n), "SDE add: Check of F Dimensions failed"
         assert (L.shape[0] == n and L.shape[1]==nq), "SDE add: Check of L Dimensions failed"
         assert (Qc.shape[0] == nq and Qc.shape[1]==nq), "SDE add: Check of Qc Dimensions failed"
         assert (H.shape[0] == 1 and H.shape[1]==n), "SDE add: Check of H Dimensions failed"
         assert (Pinf.shape[0] == n and Pinf.shape[1]==n), "SDE add: Check of Pinf Dimensions failed"
-        assert (P0.shape[0] == n and P0.shape[1]==n), "SDE add: Check of P0 Dimensions failed"        
+        assert (P0.shape[0] == n and P0.shape[1]==n), "SDE add: Check of P0 Dimensions failed"
         assert (dF.shape[0] == n and dF.shape[1]==n and dF.shape[2]==nd), "SDE add: Check of dF Dimensions failed"
         assert (dQc.shape[0] == nq and dQc.shape[1]==nq and dQc.shape[2]==nd), "SDE add: Check of dQc Dimensions failed"
         assert (dPinf.shape[0] == n and dPinf.shape[1]==n and dPinf.shape[2]==nd), "SDE add: Check of dPinf Dimensions failed"
         assert (dP0.shape[0] == n and dP0.shape[1]==n and dP0.shape[2]==nd), "SDE add: Check of dP0 Dimensions failed"
-        
+
         return (F,L,Qc,H,Pinf,P0,dF,dQc,dPinf,dP0)
diff --git a/GPy/kern/src/integral.py b/GPy/kern/src/integral.py
index d2827390..971a48a8 100644
--- a/GPy/kern/src/integral.py
+++ b/GPy/kern/src/integral.py
@@ -10,10 +10,10 @@ class Integral(Kern): #todo do I need to inherit from Stationary
     """
     Integral kernel between...
     """
-    
+
     def __init__(self, input_dim, variances=None, lengthscale=None, ARD=False, active_dims=None, name='integral'):
         super(Integral, self).__init__(input_dim, active_dims, name)
-        
+
         if lengthscale is None:
             lengthscale = np.ones(1)
         else:
@@ -22,7 +22,7 @@ class Integral(Kern): #todo do I need to inherit from Stationary
         self.lengthscale = Param('lengthscale', lengthscale, Logexp()) #Logexp - transforms to allow positive only values...
         self.variances = Param('variances', variances, Logexp()) #and here.
         self.link_parameters(self.variances, self.lengthscale) #this just takes a list of parameters we need to optimise.
-    
+
     def h(self, z):
         return 0.5 * z * np.sqrt(math.pi) * math.erf(z) + np.exp(-(z**2))
 
@@ -36,13 +36,13 @@ class Integral(Kern): #todo do I need to inherit from Stationary
             for i,x in enumerate(X):
                 for j,x2 in enumerate(X):
                     dK_dl[i,j] = self.variances[0]*self.dk_dl(x[0],x2[0],self.lengthscale[0]) #TODO Multiple length scales
-                    dK_dv[i,j] = self.k_xx(x[0],x2[0],self.lengthscale[0])  #the gradient wrt the variance is k_xx.              
+                    dK_dv[i,j] = self.k_xx(x[0],x2[0],self.lengthscale[0])  #the gradient wrt the variance is k_xx.
             self.lengthscale.gradient = np.sum(dK_dl * dL_dK)
             self.variances.gradient = np.sum(dK_dv * dL_dK)
             #print "V%0.5f" % self.variances.gradient
             #print "L%0.5f" % self.lengthscale.gradient
-        else:     #we're finding dK_xf/Dtheta                
-            print "NEED TO HANDLE TODO!"
+        else:     #we're finding dK_xf/Dtheta
+            print("NEED TO HANDLE TODO!")
 
     #useful little function to help calculate the covariances.
     def g(self,z):
@@ -52,15 +52,15 @@ class Integral(Kern): #todo do I need to inherit from Stationary
     def k_xx(self,t,tprime,l):
         return 0.5 * (l**2) * ( self.g(t/l) - self.g((t - tprime)/l) + self.g(tprime/l) - 1)
 
-    def k_ff(self,t,tprime,l): 
+    def k_ff(self,t,tprime,l):
         return np.exp(-((t-tprime)**2)/(l**2)) #rbf
-        
+
     #covariance between the gradient and the actual value
     def k_xf(self,t,tprime,l):
         return 0.5 * np.sqrt(math.pi) * l * (math.erf((t-tprime)/l) + math.erf(tprime/l))
 
     def K(self, X, X2=None):
-        if X2 is None:         
+        if X2 is None:
             K_xx = np.zeros([X.shape[0],X.shape[0]])
             for i,x in enumerate(X):
                 for j,x2 in enumerate(X):
@@ -73,7 +73,7 @@ class Integral(Kern): #todo do I need to inherit from Stationary
                     K_xf[i,j] = self.k_xf(x[0],x2[0],self.lengthscale[0])
             #print self.variances[0]
             return K_xf * self.variances[0]
-            
+
     def Kdiag(self, X):
         """I've used the fact that we call this method for K_ff when finding the covariance as a hack so
         I know if I should return K_ff or K_xx. In this case we're returning K_ff!!
diff --git a/GPy/kern/src/integral_limits.py b/GPy/kern/src/integral_limits.py
index a1b60859..7006ee6f 100644
--- a/GPy/kern/src/integral_limits.py
+++ b/GPy/kern/src/integral_limits.py
@@ -10,10 +10,10 @@ class Integral_Limits(Kern): #todo do I need to inherit from Stationary
     """
     Integral kernel, can include limits on each integral value.
     """
-    
+
     def __init__(self, input_dim, variances=None, lengthscale=None, ARD=False, active_dims=None, name='integral'):
         super(Integral_Limits, self).__init__(input_dim, active_dims, name)
-        
+
         if lengthscale is None:
             lengthscale = np.ones(1)
         else:
@@ -22,27 +22,27 @@ class Integral_Limits(Kern): #todo do I need to inherit from Stationary
         self.lengthscale = Param('lengthscale', lengthscale, Logexp()) #Logexp - transforms to allow positive only values...
         self.variances = Param('variances', variances, Logexp()) #and here.
         self.link_parameters(self.variances, self.lengthscale) #this just takes a list of parameters we need to optimise.
-    
+
     def h(self, z):
         return 0.5 * z * np.sqrt(math.pi) * math.erf(z) + np.exp(-(z**2))
 
     def dk_dl(self, t, tprime, s, sprime, l): #derivative of the kernel wrt lengthscale
         return l * ( self.h((t-sprime)/l) - self.h((t - tprime)/l) + self.h((tprime-s)/l) - self.h((s-sprime)/l))
 
-    def update_gradients_full(self, dL_dK, X, X2=None):        
+    def update_gradients_full(self, dL_dK, X, X2=None):
         if X2 is None:  #we're finding dK_xx/dTheta
             dK_dl = np.zeros([X.shape[0],X.shape[0]])
             dK_dv = np.zeros([X.shape[0],X.shape[0]])
             for i,x in enumerate(X):
                 for j,x2 in enumerate(X):
                     dK_dl[i,j] = self.variances[0]*self.dk_dl(x[0],x2[0],x[1],x2[1],self.lengthscale[0])
-                    dK_dv[i,j] = self.k_xx(x[0],x2[0],x[1],x2[1],self.lengthscale[0])  #the gradient wrt the variance is k_xx.              
+                    dK_dv[i,j] = self.k_xx(x[0],x2[0],x[1],x2[1],self.lengthscale[0])  #the gradient wrt the variance is k_xx.
             self.lengthscale.gradient = np.sum(dK_dl * dL_dK)
             self.variances.gradient = np.sum(dK_dv * dL_dK)
             #print "V%0.5f" % self.variances.gradient
             #print "L%0.5f" % self.lengthscale.gradient
-        else:     #we're finding dK_xf/Dtheta                
-            print "NEED TO HANDLE TODO!"
+        else:     #we're finding dK_xf/Dtheta
+            print("NEED TO HANDLE TODO!")
 
     #useful little function to help calculate the covariances.
     def g(self,z):
@@ -50,28 +50,28 @@ class Integral_Limits(Kern): #todo do I need to inherit from Stationary
 
     def k_xx(self,t,tprime,s,sprime,l):
         """Covariance between observed values.
-        
+
         s and t are one domain of the integral (i.e. the integral between s and t)
         sprime and tprime are another domain of the integral (i.e. the integral between sprime and tprime)
-        
+
         We're interested in how correlated these two integrals are.
-        
+
         Note: We've not multiplied by the variance, this is done in K."""
         return 0.5 * (l**2) * ( self.g((t-sprime)/l) + self.g((tprime-s)/l) - self.g((t - tprime)/l) - self.g((s-sprime)/l))
 
-    def k_ff(self,t,tprime,l): 
+    def k_ff(self,t,tprime,l):
         """Doesn't need s or sprime as we're looking at the 'derivatives', so no domains over which to integrate are required"""
         return np.exp(-((t-tprime)**2)/(l**2)) #rbf
-        
+
     def k_xf(self,t,tprime,s,l):
         """Covariance between the gradient (latent value) and the actual (observed) value.
-        
+
         Note that sprime isn't actually used in this expression, presumably because the 'primes' are the gradient (latent) values which don't
         involve an integration, and thus there is no domain over which they're integrated, just a single value that we want."""
         return 0.5 * np.sqrt(math.pi) * l * (math.erf((t-tprime)/l) + math.erf((tprime-s)/l))
 
     def K(self, X, X2=None):
-        if X2 is None:         
+        if X2 is None:
             K_xx = np.zeros([X.shape[0],X.shape[0]])
             for i,x in enumerate(X):
                 for j,x2 in enumerate(X):
@@ -83,7 +83,7 @@ class Integral_Limits(Kern): #todo do I need to inherit from Stationary
                 for j,x2 in enumerate(X2):
                     K_xf[i,j] = self.k_xf(x[0],x2[0],x[1],self.lengthscale[0]) #x2[1] unused, see k_xf docstring for explanation.
             return K_xf * self.variances[0]
-            
+
     def Kdiag(self, X):
         """I've used the fact that we call this method for K_ff when finding the covariance as a hack so
         I know if I should return K_ff or K_xx. In this case we're returning K_ff!!
diff --git a/GPy/kern/src/kern.py b/GPy/kern/src/kern.py
index 393cf1c2..4379fb71 100644
--- a/GPy/kern/src/kern.py
+++ b/GPy/kern/src/kern.py
@@ -15,10 +15,10 @@ class Kern(Parameterized):
     # This adds input slice support. The rather ugly code for slicing can be
     # found in kernel_slice_operations
     # __meataclass__ is ignored in Python 3 - needs to be put in the function definiton
-    #__metaclass__ = KernCallsViaSlicerMeta
-    #Here, we use the Python module six to support Py3 and Py2 simultaneously
+    # __metaclass__ = KernCallsViaSlicerMeta
+    # Here, we use the Python module six to support Py3 and Py2 simultaneously
     #===========================================================================
-    _support_GPU=False
+    _support_GPU = False
     def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw):
         """
         The base class for a kernel: a positive definite function
@@ -62,7 +62,7 @@ class Kern(Parameterized):
         self.psicomp = PSICOMP_GH()
 
     def __setstate__(self, state):
-        self._all_dims_active = np.arange(0, max(state['active_dims'])+1)
+        self._all_dims_active = np.arange(0, max(state['active_dims']) + 1)
         super(Kern, self).__setstate__(state)
 
     @property
@@ -132,18 +132,18 @@ class Kern(Parameterized):
         raise NotImplementedError
     def gradients_X_X2(self, dL_dK, X, X2):
         return self.gradients_X(dL_dK, X, X2), self.gradients_X(dL_dK.T, X2, X)
-    def gradients_XX(self, dL_dK, X, X2):
+    def gradients_XX(self, dL_dK, X, X2, cov=True):
         """
         .. math::
 
             \\frac{\partial^2 L}{\partial X\partial X_2} = \\frac{\partial L}{\partial K}\\frac{\partial^2 K}{\partial X\partial X_2}
         """
-        raise(NotImplementedError, "This is the second derivative of K wrt X and X2, and not implemented for this kernel")
-    def gradients_XX_diag(self, dL_dKdiag, X):
+        raise NotImplementedError("This is the second derivative of K wrt X and X2, and not implemented for this kernel")
+    def gradients_XX_diag(self, dL_dKdiag, X, cov=True):
         """
         The diagonal of the second derivative w.r.t. X and X2
         """
-        raise(NotImplementedError, "This is the diagonal of the second derivative of K wrt X and X2, and not implemented for this kernel")
+        raise NotImplementedError("This is the diagonal of the second derivative of K wrt X and X2, and not implemented for this kernel")
     def gradients_X_diag(self, dL_dKdiag, X):
         """
         The diagonal of the derivative w.r.t. X
@@ -292,11 +292,11 @@ class Kern(Parameterized):
         """
         assert isinstance(other, Kern), "only kernels can be multiplied to kernels..."
         from .prod import Prod
-        #kernels = []
-        #if isinstance(self, Prod): kernels.extend(self.parameters)
-        #else: kernels.append(self)
-        #if isinstance(other, Prod): kernels.extend(other.parameters)
-        #else: kernels.append(other)
+        # kernels = []
+        # if isinstance(self, Prod): kernels.extend(self.parameters)
+        # else: kernels.append(self)
+        # if isinstance(other, Prod): kernels.extend(other.parameters)
+        # else: kernels.append(other)
         return Prod([self, other], name)
 
     def _check_input_dim(self, X):
diff --git a/GPy/kern/src/kernel_slice_operations.py b/GPy/kern/src/kernel_slice_operations.py
index 57b34de9..c4a64518 100644
--- a/GPy/kern/src/kernel_slice_operations.py
+++ b/GPy/kern/src/kernel_slice_operations.py
@@ -13,7 +13,7 @@ from paramz.parameterized import ParametersChangedMeta
 
 def put_clean(dct, name, func):
     if name in dct:
-        #dct['_clean_{}'.format(name)] = dct[name]
+        dct['_clean_{}'.format(name)] = dct[name]
         dct[name] = func(dct[name])
 
 class KernCallsViaSlicerMeta(ParametersChangedMeta):
@@ -25,7 +25,7 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
         put_clean(dct, 'gradients_X', _slice_gradients_X)
         put_clean(dct, 'gradients_X_X2', _slice_gradients_X)
         put_clean(dct, 'gradients_XX', _slice_gradients_XX)
-        put_clean(dct, 'gradients_XX_diag', _slice_gradients_X_diag)
+        put_clean(dct, 'gradients_XX_diag', _slice_gradients_XX_diag)
         put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)
 
         put_clean(dct, 'psi0', _slice_psi)
@@ -38,15 +38,16 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
         return super(KernCallsViaSlicerMeta, cls).__new__(cls, name, bases, dct)
 
 class _Slice_wrap(object):
-    def __init__(self, k, X, X2=None, ret_shape=None):
+    def __init__(self, k, X, X2=None, diag=False, ret_shape=None):
         self.k = k
+        self.diag = diag
         if ret_shape is None:
             self.shape = X.shape
         else:
             self.shape = ret_shape
-        assert X.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X.shape={!s}".format(X.shape)
+        assert X.ndim == 2, "need at least column vectors as inputs to kernels for now, given X.shape={!s}".format(X.shape)
         if X2 is not None:
-            assert X2.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X2.shape={!s}".format(X2.shape)
+            assert X2.ndim == 2, "need at least column vectors as inputs to kernels for now, given X2.shape={!s}".format(X2.shape)
         if (self.k._all_dims_active is not None) and (self.k._sliced_X == 0):
             self.k._check_active_dims(X)
             self.X = self.k._slice_X(X)
@@ -67,8 +68,13 @@ class _Slice_wrap(object):
             ret = np.zeros(self.shape)
             if len(self.shape) == 2:
                 ret[:, self.k._all_dims_active] = return_val
-            elif len(self.shape) == 3:
-                ret[:, :, self.k._all_dims_active] = return_val
+            elif len(self.shape) == 3: # derivative for X2!=None
+                if self.diag:
+                    ret.T[np.ix_(self.k._all_dims_active, self.k._all_dims_active)] = return_val.T
+                else:
+                    ret[:, :, self.k._all_dims_active] = return_val
+            elif len(self.shape) == 4: # second order derivative
+                ret.T[np.ix_(self.k._all_dims_active, self.k._all_dims_active)] = return_val.T
             return ret
         return return_val
 
@@ -112,23 +118,34 @@ def _slice_gradients_X(f):
         return ret
     return wrap
 
+def _slice_gradients_X_diag(f):
+    @wraps(f)
+    def wrap(self, dL_dKdiag, X):
+        with _Slice_wrap(self, X, None) as s:
+            ret = s.handle_return_array(f(self, dL_dKdiag, s.X))
+        return ret
+    return wrap
+
 def _slice_gradients_XX(f):
     @wraps(f)
     def wrap(self, dL_dK, X, X2=None):
         if X2 is None:
             N, M = X.shape[0], X.shape[0]
+            Q1 = Q2 = X.shape[1]
         else:
             N, M = X.shape[0], X2.shape[0]
-        with _Slice_wrap(self, X, X2, ret_shape=(N, M, X.shape[1])) as s:
+            Q1, Q2 = X.shape[1], X2.shape[1]
         #with _Slice_wrap(self, X, X2, ret_shape=None) as s:
+        with _Slice_wrap(self, X, X2, ret_shape=(N, M, Q1, Q2)) as s:
             ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
         return ret
     return wrap
 
-def _slice_gradients_X_diag(f):
+def _slice_gradients_XX_diag(f):
     @wraps(f)
     def wrap(self, dL_dKdiag, X):
-        with _Slice_wrap(self, X, None) as s:
+        N, Q = X.shape
+        with _Slice_wrap(self, X, None, diag=True, ret_shape=(N, Q, Q)) as s:
             ret = s.handle_return_array(f(self, dL_dKdiag, s.X))
         return ret
     return wrap
diff --git a/GPy/kern/src/linear.py b/GPy/kern/src/linear.py
index fa412c1d..e7089fe1 100644
--- a/GPy/kern/src/linear.py
+++ b/GPy/kern/src/linear.py
@@ -102,17 +102,39 @@ class Linear(Kern):
             return dL_dK.dot(X2)*self.variances #np.einsum('jq,q,ij->iq', X2, self.variances, dL_dK)
 
     def gradients_XX(self, dL_dK, X, X2=None):
-        if X2 is None: dL_dK = (dL_dK+dL_dK.T)/2
+        """
+        Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2:
+
+        returns the full covariance matrix [QxQ] of the input dimensionfor each pair or vectors, thus
+        the returned array is of shape [NxNxQxQ].
+
+        ..math:
+            \frac{\partial^2 K}{\partial X2 ^2} = - \frac{\partial^2 K}{\partial X\partial X2}
+
+        ..returns:
+            dL2_dXdX2:  [NxMxQxQ] for X [NxQ] and X2[MxQ] (X2 is X if, X2 is None)
+                        Thus, we return the second derivative in X2.
+        """
         if X2 is None:
-            return 2*np.ones(X.shape)*self.variances
-        else:
-            return np.ones(X.shape)*self.variances
+            X2 = X
+        return np.zeros((X.shape[0], X2.shape[0], X.shape[1], X.shape[1]))
+        #if X2 is None: dL_dK = (dL_dK+dL_dK.T)/2
+        #if X2 is None:
+        #    return np.ones(np.repeat(X.shape, 2)) * (self.variances[None,:] + self.variances[:, None])[None, None, :, :]
+        #else:
+        #    return np.ones((X.shape[0], X2.shape[0], X.shape[1], X.shape[1])) * (self.variances[None,:] + self.variances[:, None])[None, None, :, :]
+
 
     def gradients_X_diag(self, dL_dKdiag, X):
         return 2.*self.variances*dL_dKdiag[:,None]*X
 
     def gradients_XX_diag(self, dL_dKdiag, X):
-        return 2*np.ones(X.shape)*self.variances
+        return np.zeros((X.shape[0], X.shape[1], X.shape[1]))
+
+        #dims = X.shape
+        #if cov:
+        #    dims += (X.shape[1],)
+        #return 2*np.ones(dims)*self.variances
 
     def input_sensitivity(self, summarize=True):
         return np.ones(self.input_dim) * self.variances
diff --git a/GPy/kern/src/multidimensional_integral_limits.py b/GPy/kern/src/multidimensional_integral_limits.py
index 91983f53..0f473742 100644
--- a/GPy/kern/src/multidimensional_integral_limits.py
+++ b/GPy/kern/src/multidimensional_integral_limits.py
@@ -10,10 +10,10 @@ class Multidimensional_Integral_Limits(Kern): #todo do I need to inherit from St
     """
     Integral kernel, can include limits on each integral value.
     """
-    
+
     def __init__(self, input_dim, variances=None, lengthscale=None, ARD=False, active_dims=None, name='integral'):
         super(Multidimensional_Integral_Limits, self).__init__(input_dim, active_dims, name)
-        
+
         if lengthscale is None:
             lengthscale = np.ones(1)
         else:
@@ -22,38 +22,38 @@ class Multidimensional_Integral_Limits(Kern): #todo do I need to inherit from St
         self.lengthscale = Param('lengthscale', lengthscale, Logexp()) #Logexp - transforms to allow positive only values...
         self.variances = Param('variances', variances, Logexp()) #and here.
         self.link_parameters(self.variances, self.lengthscale) #this just takes a list of parameters we need to optimise.
-    
+
     def h(self, z):
         return 0.5 * z * np.sqrt(math.pi) * math.erf(z) + np.exp(-(z**2))
 
     def dk_dl(self, t, tprime, s, sprime, l): #derivative of the kernel wrt lengthscale
         return l * ( self.h((t-sprime)/l) - self.h((t - tprime)/l) + self.h((tprime-s)/l) - self.h((s-sprime)/l))
 
-    def update_gradients_full(self, dL_dK, X, X2=None):        
+    def update_gradients_full(self, dL_dK, X, X2=None):
         #print self.variances
         if X2 is None:  #we're finding dK_xx/dTheta
             dK_dl_term = np.zeros([X.shape[0],X.shape[0],self.lengthscale.shape[0]])
-            k_term = np.zeros([X.shape[0],X.shape[0],self.lengthscale.shape[0]])            
-            dK_dl = np.zeros([X.shape[0],X.shape[0],self.lengthscale.shape[0]])            
+            k_term = np.zeros([X.shape[0],X.shape[0],self.lengthscale.shape[0]])
+            dK_dl = np.zeros([X.shape[0],X.shape[0],self.lengthscale.shape[0]])
             dK_dv = np.zeros([X.shape[0],X.shape[0]])
             for il,l in enumerate(self.lengthscale):
                 idx = il*2
                 for i,x in enumerate(X):
                     for j,x2 in enumerate(X):
                         dK_dl_term[i,j,il] = self.dk_dl(x[idx],x2[idx],x[idx+1],x2[idx+1],l)
-                        k_term[i,j,il] = self.k_xx(x[idx],x2[idx],x[idx+1],x2[idx+1],l) 
-            for il,l in enumerate(self.lengthscale):                               
+                        k_term[i,j,il] = self.k_xx(x[idx],x2[idx],x[idx+1],x2[idx+1],l)
+            for il,l in enumerate(self.lengthscale):
                 dK_dl = self.variances[0] * dK_dl_term[:,:,il]
                 for jl, l in enumerate(self.lengthscale):
                     if jl!=il:
                         dK_dl *= k_term[:,:,jl]
                         #dK_dl = np.dot(dK_dl,k_term[:,:,il])
                         #print k_term[:,:,il]
-                self.lengthscale.gradient[il] = np.sum(dK_dl * dL_dK)  
-            dK_dv = self.calc_K_xx_wo_variance(X) #the gradient wrt the variance is k_xx. 
-            self.variances.gradient = np.sum(dK_dv * dL_dK)           
-        else:     #we're finding dK_xf/Dtheta                
-            print "NEED TO HANDLE TODO!"
+                self.lengthscale.gradient[il] = np.sum(dK_dl * dL_dK)
+            dK_dv = self.calc_K_xx_wo_variance(X) #the gradient wrt the variance is k_xx.
+            self.variances.gradient = np.sum(dK_dv * dL_dK)
+        else:     #we're finding dK_xf/Dtheta
+            print("NEED TO HANDLE TODO!")
         #print self.variances[0],self.lengthscale[0],self.lengthscale[1] #np.sum(dK_dv*dL_dK)
 
 
@@ -63,38 +63,38 @@ class Multidimensional_Integral_Limits(Kern): #todo do I need to inherit from St
 
     def k_xx(self,t,tprime,s,sprime,l):
         """Covariance between observed values.
-        
+
         s and t are one domain of the integral (i.e. the integral between s and t)
         sprime and tprime are another domain of the integral (i.e. the integral between sprime and tprime)
-        
+
         We're interested in how correlated these two integrals are.
-        
+
         Note: We've not multiplied by the variance, this is done in K."""
         return 0.5 * (l**2) * ( self.g((t-sprime)/l) + self.g((tprime-s)/l) - self.g((t - tprime)/l) - self.g((s-sprime)/l))
 
-    def k_ff(self,t,tprime,l): 
+    def k_ff(self,t,tprime,l):
         """Doesn't need s or sprime as we're looking at the 'derivatives', so no domains over which to integrate are required"""
         return np.exp(-((t-tprime)**2)/(l**2)) #rbf
-        
+
     def k_xf(self,t,tprime,s,l):
         """Covariance between the gradient (latent value) and the actual (observed) value.
-        
+
         Note that sprime isn't actually used in this expression, presumably because the 'primes' are the gradient (latent) values which don't
         involve an integration, and thus there is no domain over which they're integrated, just a single value that we want."""
         return 0.5 * np.sqrt(math.pi) * l * (math.erf((t-tprime)/l) + math.erf((tprime-s)/l))
 
     def calc_K_xx_wo_variance(self,X):
         """Calculates K_xx without the variance term"""
-        K_xx = np.ones([X.shape[0],X.shape[0]]) #ones now as a product occurs over each dimension            
+        K_xx = np.ones([X.shape[0],X.shape[0]]) #ones now as a product occurs over each dimension
         for i,x in enumerate(X):
             for j,x2 in enumerate(X):
                 for il,l in enumerate(self.lengthscale):
                     idx = il*2 #each pair of input dimensions describe the limits on one actual dimension in the data
                     K_xx[i,j] *= self.k_xx(x[idx],x2[idx],x[idx+1],x2[idx+1],l)
         return K_xx
-           
+
     def K(self, X, X2=None):
-        if X2 is None:         
+        if X2 is None:
             #print "X x X"
             K_xx = self.calc_K_xx_wo_variance(X)
             return K_xx * self.variances[0]
@@ -107,7 +107,7 @@ class Multidimensional_Integral_Limits(Kern): #todo do I need to inherit from St
                         idx = il*2
                         K_xf[i,j] *= self.k_xf(x[idx],x2[idx],x[idx+1],l)
             return K_xf * self.variances[0]
-            
+
     def Kdiag(self, X):
         """I've used the fact that we call this method for K_ff when finding the covariance as a hack so
         I know if I should return K_ff or K_xx. In this case we're returning K_ff!!
diff --git a/GPy/kern/src/rbf.py b/GPy/kern/src/rbf.py
index ff86561d..7a15abe8 100644
--- a/GPy/kern/src/rbf.py
+++ b/GPy/kern/src/rbf.py
@@ -39,6 +39,8 @@ class RBF(Stationary):
     def dK2_drdr(self, r):
         return (r**2-1)*self.K_of_r(r)
 
+    def dK2_drdr_diag(self):
+        return -self.variance # as the diagonal of r is always filled with zeros
     def __getstate__(self):
         dc = super(RBF, self).__getstate__()
         if self.useGPU:
diff --git a/GPy/kern/src/static.py b/GPy/kern/src/static.py
index 3ce0dc0a..5cf4a1c9 100644
--- a/GPy/kern/src/static.py
+++ b/GPy/kern/src/static.py
@@ -6,6 +6,7 @@ from .kern import Kern
 import numpy as np
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
+from paramz.caching import Cache_this
 
 class Static(Kern):
     def __init__(self, input_dim, variance, active_dims, name):
@@ -24,12 +25,13 @@ class Static(Kern):
     def gradients_X_diag(self, dL_dKdiag, X):
         return np.zeros(X.shape)
 
-    def gradients_XX(self, dL_dK, X, X2):
+    def gradients_XX(self, dL_dK, X, X2=None):
         if X2 is None:
             X2 = X
-        return np.zeros((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
-    def gradients_XX_diag(self, dL_dKdiag, X):
-        return np.zeros(X.shape)
+        return np.zeros((X.shape[0], X2.shape[0], X.shape[1], X.shape[1]), dtype=np.float64)
+
+    def gradients_XX_diag(self, dL_dKdiag, X, cov=False):
+        return np.zeros((X.shape[0], X.shape[1], X.shape[1]), dtype=np.float64)
 
     def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         return np.zeros(Z.shape)
@@ -172,13 +174,19 @@ class Fixed(Static):
         super(Fixed, self).__init__(input_dim, variance, active_dims, name)
         self.fixed_K = covariance_matrix
     def K(self, X, X2):
-        return self.variance * self.fixed_K
+        if X2 is None:
+            return self.variance * self.fixed_K
+        else:
+            return np.zeros((X.shape[0], X2.shape[0]))
 
     def Kdiag(self, X):
         return self.variance * self.fixed_K.diagonal()
 
     def update_gradients_full(self, dL_dK, X, X2=None):
-        self.variance.gradient = np.einsum('ij,ij', dL_dK, self.fixed_K)
+        if X2 is None:
+            self.variance.gradient = np.einsum('ij,ij', dL_dK, self.fixed_K)
+        else:
+            self.variance.gradient = 0
 
     def update_gradients_diag(self, dL_dKdiag, X):
         self.variance.gradient = np.einsum('i,i', dL_dKdiag, np.diagonal(self.fixed_K))
@@ -224,21 +232,26 @@ class Precomputed(Fixed):
         :param variance: the variance of the kernel
         :type variance: float
         """
+        assert input_dim==1, "Precomputed only implemented in one dimension. Use multiple Precomputed kernels to have more dimensions by making use of active_dims"
         super(Precomputed, self).__init__(input_dim, covariance_matrix, variance, active_dims, name)
-    def K(self, X, X2=None):
+
+    @Cache_this(limit=2)
+    def _index(self, X, X2):
         if X2 is None:
-            return self.variance * self.fixed_K[X[:,0].astype('int')][:,X[:,0].astype('int')]
+            i1 = i2 = X.astype('int').flat
         else:
-            return self.variance * self.fixed_K[X[:,0].astype('int')][:,X2[:,0].astype('int')]
+            i1, i2 = X.astype('int').flat, X2.astype('int').flat
+        return self.fixed_K[i1,:][:,i2]
+
+    def K(self, X, X2=None):
+        return self.variance * self._index(X, X2)
 
     def Kdiag(self, X):
-        return self.variance * self.fixed_K[X[:,0].astype('int')][:,X[:,0].astype('int')].diagonal()
+        return self.variance * self._index(X,None).diagonal()
 
     def update_gradients_full(self, dL_dK, X, X2=None):
-        if X2 is None:
-            self.variance.gradient = np.einsum('ij,ij', dL_dK, self.fixed_K[X[:,0].astype('int')][:,X[:,0].astype('int')])
-        else:
-            self.variance.gradient = np.einsum('ij,ij', dL_dK, self.fixed_K[X[:,0].astype('int')][:,X2[:,0].astype('int')])
+        self.variance.gradient = np.einsum('ij,ij', dL_dK, self._index(X, X2))
 
     def update_gradients_diag(self, dL_dKdiag, X):
-        self.variance.gradient = np.einsum('i,ii', dL_dKdiag, self.fixed_K[X[:,0].astype('int')][:,X[:,0].astype('int')])
\ No newline at end of file
+        self.variance.gradient = np.einsum('i,ii', dL_dKdiag, self._index(X, None))
+
diff --git a/GPy/kern/src/stationary.py b/GPy/kern/src/stationary.py
index 5e137abb..1ce8084f 100644
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@@ -85,6 +85,11 @@ class Stationary(Kern):
     def dK2_drdr(self, r):
         raise NotImplementedError("implement second derivative of covariance wrt r to use this method")
 
+    @Cache_this(limit=3, ignore_args=())
+    def dK2_drdr_diag(self):
+        "Second order derivative of K in r_{i,i}. The diagonal entries are always zero, so we do not give it here."
+        raise NotImplementedError("implement second derivative of covariance wrt r_diag to use this method")
+
     @Cache_this(limit=3, ignore_args=())
     def K(self, X, X2=None):
         """
@@ -222,54 +227,57 @@ class Stationary(Kern):
         """
         Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2:
 
+        returns the full covariance matrix [QxQ] of the input dimensionfor each pair or vectors, thus
+        the returned array is of shape [NxNxQxQ].
+
         ..math:
-          \frac{\partial^2 K}{\partial X\partial X2}
+            \frac{\partial^2 K}{\partial X2 ^2} = - \frac{\partial^2 K}{\partial X\partial X2}
 
         ..returns:
-            dL2_dXdX2: NxMxQ, for X [NxQ] and X2[MxQ] (X2 is X if, X2 is None)
-            Thus, we return the second derivative in X2.
+            dL2_dXdX2:  [NxMxQxQ] in the cov=True case, or [NxMxQ] in the cov=False case,
+                        for X [NxQ] and X2[MxQ] (X2 is X if, X2 is None)
+                        Thus, we return the second derivative in X2.
         """
-        # The off diagonals in Q are always zero, this should also be true for the Linear kernel...
         # According to multivariable chain rule, we can chain the second derivative through r:
         # d2K_dXdX2 = dK_dr*d2r_dXdX2 + d2K_drdr * dr_dX * dr_dX2:
         invdist = self._inv_dist(X, X2)
         invdist2 = invdist**2
-
-        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
+        dL_dr = self.dK_dr_via_X(X, X2) #* dL_dK # we perform this product later
         tmp1 = dL_dr * invdist
-
-        dL_drdr = self.dK2_drdr_via_X(X, X2) * dL_dK
-        tmp2 = dL_drdr * invdist2
-
-        l2 = np.ones(X.shape[1]) * self.lengthscale**2
+        dL_drdr = self.dK2_drdr_via_X(X, X2) #* dL_dK # we perofrm this product later
+        tmp2 = dL_drdr*invdist2
+        l2 =  np.ones(X.shape[1])*self.lengthscale**2 #np.multiply(np.ones(X.shape[1]) ,self.lengthscale**2)
 
         if X2 is None:
             X2 = X
             tmp1 -= np.eye(X.shape[0])*self.variance
         else:
-            tmp1[X==X2.T] -= self.variance
+            tmp1[invdist2==0.] -= self.variance
 
-        grad = np.empty((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
-        #grad = np.empty(X.shape, dtype=np.float64)
-        for q in range(self.input_dim):
-            tmpdist2 = (X[:,[q]]-X2[:,[q]].T) ** 2
-            grad[:, :, q] = ((tmp1*invdist2 - tmp2)*tmpdist2/l2[q] - tmp1)/l2[q]
-            #grad[:, :, q] = ((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q]
-            #np.sum(((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q], axis=1, out=grad[:,q])
-            #np.sum( - (tmp2*(tmpdist**2)), axis=1, out=grad[:,q])
+        #grad = np.empty((X.shape[0], X2.shape[0], X2.shape[1], X.shape[1]), dtype=np.float64)
+        dist = X[:,None,:] - X2[None,:,:]
+        dist = (dist[:,:,:,None]*dist[:,:,None,:])
+        I = np.ones((X.shape[0], X2.shape[0], X2.shape[1], X.shape[1]))*np.eye((X2.shape[1]))
+        grad = (((dL_dK*(tmp1*invdist2 - tmp2))[:,:,None,None] * dist)/l2[None,None,:,None]
+                - (dL_dK*tmp1)[:,:,None,None] * I)/l2[None,None,None,:]
         return grad
 
-    def gradients_XX_diag(self, dL_dK, X):
+    def gradients_XX_diag(self, dL_dK_diag, X):
         """
-        Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2:
+        Given the derivative of the objective dL_dK, compute the second derivative of K wrt X:
 
         ..math:
-          \frac{\partial^2 K}{\partial X\partial X2}
+          \frac{\partial^2 K}{\partial X\partial X}
 
         ..returns:
-            dL2_dXdX2: NxMxQ, for X [NxQ] and X2[MxQ]
+            dL2_dXdX: [NxQxQ]
         """
-        return np.ones(X.shape) * self.variance/self.lengthscale**2
+        dL_dK_diag = dL_dK_diag.copy().reshape(-1, 1, 1)
+        assert (dL_dK_diag.size == X.shape[0]) or (dL_dK_diag.size == 1), "dL_dK_diag has to be given as row [N] or column vector [Nx1]"
+
+        l4 =  np.ones(X.shape[1])*self.lengthscale**2
+        return dL_dK_diag * (np.eye(X.shape[1]) * -self.dK2_drdr_diag()/(l4))[None, :,:]# np.zeros(X.shape+(X.shape[1],))
+        #return np.ones(X.shape) * d2L_dK * self.variance/self.lengthscale**2 # np.zeros(X.shape)
 
     def _gradients_X_pure(self, dL_dK, X, X2=None):
         invdist = self._inv_dist(X, X2)
@@ -320,18 +328,18 @@ class Exponential(Stationary):
     def dK_dr(self, r):
         return -self.K_of_r(r)
 
-#    def sde(self): 
-#        """ 
-#        Return the state space representation of the covariance. 
-#        """ 
-#        F  = np.array([[-1/self.lengthscale]]) 
-#        L  = np.array([[1]]) 
-#        Qc = np.array([[2*self.variance/self.lengthscale]]) 
-#        H = np.array([[1]]) 
-#        Pinf = np.array([[self.variance]]) 
-#        # TODO: return the derivatives as well 
-#        
-#        return (F, L, Qc, H, Pinf)  
+#    def sde(self):
+#        """
+#        Return the state space representation of the covariance.
+#        """
+#        F  = np.array([[-1/self.lengthscale]])
+#        L  = np.array([[1]])
+#        Qc = np.array([[2*self.variance/self.lengthscale]])
+#        H = np.array([[1]])
+#        Pinf = np.array([[self.variance]])
+#        # TODO: return the derivatives as well
+#
+#        return (F, L, Qc, H, Pinf)
 
 
 
@@ -400,41 +408,41 @@ class Matern32(Stationary):
         F1lower = np.array([f(lower) for f in F1])[:, None]
         return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. / self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T))
 
-    def sde(self): 
-        """ 
-        Return the state space representation of the covariance. 
-        """ 
+    def sde(self):
+        """
+        Return the state space representation of the covariance.
+        """
         variance = float(self.variance.values)
         lengthscale = float(self.lengthscale.values)
-        foo  = np.sqrt(3.)/lengthscale 
-        F    = np.array([[0, 1], [-foo**2, -2*foo]]) 
-        L    = np.array([[0], [1]]) 
-        Qc   = np.array([[12.*np.sqrt(3) / lengthscale**3 * variance]]) 
-        H    = np.array([[1, 0]]) 
-        Pinf = np.array([[variance, 0],  
-        [0,              3.*variance/(lengthscale**2)]]) 
-        # Allocate space for the derivatives 
+        foo  = np.sqrt(3.)/lengthscale
+        F    = np.array([[0, 1], [-foo**2, -2*foo]])
+        L    = np.array([[0], [1]])
+        Qc   = np.array([[12.*np.sqrt(3) / lengthscale**3 * variance]])
+        H    = np.array([[1, 0]])
+        Pinf = np.array([[variance, 0],
+        [0,              3.*variance/(lengthscale**2)]])
+        # Allocate space for the derivatives
         dF    = np.empty([F.shape[0],F.shape[1],2])
-        dQc   = np.empty([Qc.shape[0],Qc.shape[1],2]) 
-        dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2]) 
-        # The partial derivatives 
-        dFvariance       = np.zeros([2,2]) 
-        dFlengthscale    = np.array([[0,0], 
-        [6./lengthscale**3,2*np.sqrt(3)/lengthscale**2]]) 
-        dQcvariance      = np.array([12.*np.sqrt(3)/lengthscale**3]) 
-        dQclengthscale   = np.array([-3*12*np.sqrt(3)/lengthscale**4*variance]) 
-        dPinfvariance    = np.array([[1,0],[0,3./lengthscale**2]]) 
-        dPinflengthscale = np.array([[0,0], 
-        [0,-6*variance/lengthscale**3]]) 
-        # Combine the derivatives 
-        dF[:,:,0]    = dFvariance 
-        dF[:,:,1]    = dFlengthscale 
-        dQc[:,:,0]   = dQcvariance 
-        dQc[:,:,1]   = dQclengthscale 
-        dPinf[:,:,0] = dPinfvariance 
-        dPinf[:,:,1] = dPinflengthscale 
+        dQc   = np.empty([Qc.shape[0],Qc.shape[1],2])
+        dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2])
+        # The partial derivatives
+        dFvariance       = np.zeros([2,2])
+        dFlengthscale    = np.array([[0,0],
+        [6./lengthscale**3,2*np.sqrt(3)/lengthscale**2]])
+        dQcvariance      = np.array([12.*np.sqrt(3)/lengthscale**3])
+        dQclengthscale   = np.array([-3*12*np.sqrt(3)/lengthscale**4*variance])
+        dPinfvariance    = np.array([[1,0],[0,3./lengthscale**2]])
+        dPinflengthscale = np.array([[0,0],
+        [0,-6*variance/lengthscale**3]])
+        # Combine the derivatives
+        dF[:,:,0]    = dFvariance
+        dF[:,:,1]    = dFlengthscale
+        dQc[:,:,0]   = dQcvariance
+        dQc[:,:,1]   = dQclengthscale
+        dPinf[:,:,0] = dPinfvariance
+        dPinf[:,:,1] = dPinflengthscale
 
-        return (F, L, Qc, H, Pinf, dF, dQc, dPinf)  
+        return (F, L, Qc, H, Pinf, dF, dQc, dPinf)
 
 class Matern52(Stationary):
     """
diff --git a/GPy/plotting/__init__.py b/GPy/plotting/__init__.py
index 067f5580..359a841a 100644
--- a/GPy/plotting/__init__.py
+++ b/GPy/plotting/__init__.py
@@ -50,6 +50,8 @@ def inject_plotting():
         GP.plot_samples = gpy_plot.gp_plots.plot_samples
         GP.plot = gpy_plot.gp_plots.plot
         GP.plot_f = gpy_plot.gp_plots.plot_f
+        GP.plot_latent = gpy_plot.gp_plots.plot_f
+        GP.plot_noiseless = gpy_plot.gp_plots.plot_f
         GP.plot_magnification = gpy_plot.latent_plots.plot_magnification
 
         from ..models import StateSpace
@@ -62,7 +64,9 @@ def inject_plotting():
         StateSpace.plot_samples = gpy_plot.gp_plots.plot_samples
         StateSpace.plot = gpy_plot.gp_plots.plot
         StateSpace.plot_f = gpy_plot.gp_plots.plot_f
-
+        StateSpace.plot_latent = gpy_plot.gp_plots.plot_f
+        StateSpace.plot_noiseless = gpy_plot.gp_plots.plot_f
+        
         from ..core import SparseGP
         SparseGP.plot_inducing = gpy_plot.data_plots.plot_inducing
 
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index b834ba9f..99951eb1 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -104,7 +104,45 @@ class Kern_check_dKdiag_dX(Kern_check_dK_dX):
     def parameters_changed(self):
         self.X.gradient[:] =  self.kernel.gradients_X_diag(self.dL_dK.diagonal(), self.X)
 
+class Kern_check_d2K_dXdX(Kern_check_model):
+    """This class allows gradient checks for the secondderivative of a kernel with respect to X. """
+    def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
+        Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
+        self.X = Param('X',X.copy())
+        self.link_parameter(self.X)
+        self.Xc = X.copy()
 
+    def log_likelihood(self):
+        if self.X2 is None:
+            return self.kernel.gradients_X(self.dL_dK, self.X, self.Xc).sum()
+        return self.kernel.gradients_X(self.dL_dK, self.X, self.X2).sum()
+
+    def parameters_changed(self):
+        #if self.kernel.name == 'rbf':
+        #    import ipdb;ipdb.set_trace()
+        if self.X2 is None:
+            grads = -self.kernel.gradients_XX(self.dL_dK, self.X).sum(1).sum(1)
+        else:
+            grads = -self.kernel.gradients_XX(self.dL_dK.T, self.X2, self.X).sum(0).sum(1)
+        self.X.gradient[:] = grads
+
+class Kern_check_d2Kdiag_dXdX(Kern_check_model):
+    """This class allows gradient checks for the second derivative of a kernel with respect to X. """
+    def __init__(self, kernel=None, dL_dK=None, X=None):
+        Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X)
+        self.X = Param('X',X)
+        self.link_parameter(self.X)
+        self.Xc = X.copy()
+
+    def log_likelihood(self):
+        l = 0.
+        for i in range(self.X.shape[0]):
+            l += self.kernel.gradients_X(self.dL_dK[[i],[i]], self.X[[i]], self.Xc[[i]]).sum()
+        return l
+
+    def parameters_changed(self):
+        grads = -self.kernel.gradients_XX_diag(self.dL_dK.diagonal(), self.X)
+        self.X.gradient[:] = grads.sum(-1)
 
 def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verbose=False, fixed_X_dims=None):
     """
@@ -242,6 +280,66 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb
         assert(result)
         return False
 
+    if verbose:
+        print("Checking gradients of dK(X, X2) wrt X2 with full cov in dimensions")
+    try:
+        testmodel = Kern_check_d2K_dXdX(kern, X=X, X2=X2)
+        if fixed_X_dims is not None:
+            testmodel.X[:,fixed_X_dims].fix()
+        result = testmodel.checkgrad(verbose=verbose)
+    except NotImplementedError:
+        result=True
+        if verbose:
+            print(("gradients_X not implemented for " + kern.name))
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print(("Gradient of dK(X, X2) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:"))
+        testmodel.checkgrad(verbose=True)
+        assert(result)
+        pass_checks = False
+        return False
+
+    if verbose:
+        print("Checking gradients of dK(X, X) wrt X with full cov in dimensions")
+    try:
+        testmodel = Kern_check_d2K_dXdX(kern, X=X, X2=None)
+        if fixed_X_dims is not None:
+            testmodel.X[:,fixed_X_dims].fix()
+        result = testmodel.checkgrad(verbose=verbose)
+    except NotImplementedError:
+        result=True
+        if verbose:
+            print(("gradients_X not implemented for " + kern.name))
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print(("Gradient of dK(X, X) wrt X with full cov in dimensions failed for " + kern.name + " covariance function. Gradient values as follows:"))
+        testmodel.checkgrad(verbose=True)
+        assert(result)
+        pass_checks = False
+        return False
+
+    if verbose:
+        print("Checking gradients of dKdiag(X, X) wrt X with cov in dimensions")
+    try:
+        testmodel = Kern_check_d2Kdiag_dXdX(kern, X=X)
+        if fixed_X_dims is not None:
+            testmodel.X[:,fixed_X_dims].fix()
+        result = testmodel.checkgrad(verbose=verbose)
+    except NotImplementedError:
+        result=True
+        if verbose:
+            print(("gradients_X not implemented for " + kern.name))
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print(("Gradient of dKdiag(X, X) wrt X with cov in dimensions failed for " + kern.name + " covariance function. Gradient values as follows:"))
+        testmodel.checkgrad(verbose=True)
+        assert(result)
+        pass_checks = False
+        return False
+
     return pass_checks
 
 
@@ -249,8 +347,8 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb
 class KernelGradientTestsContinuous(unittest.TestCase):
     def setUp(self):
         self.N, self.D = 10, 5
-        self.X = np.random.randn(self.N,self.D)
-        self.X2 = np.random.randn(self.N+10,self.D)
+        self.X = np.random.randn(self.N,self.D+1)
+        self.X2 = np.random.randn(self.N+10,self.D+1)
 
         continuous_kerns = ['RBF', 'Linear']
         self.kernclasses = [getattr(GPy.kern, s) for s in continuous_kerns]
@@ -299,7 +397,7 @@ class KernelGradientTestsContinuous(unittest.TestCase):
     def test_Add_dims(self):
         k = GPy.kern.Matern32(2, active_dims=[2,self.D]) + GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D)
         k.randomize()
-        self.assertRaises(IndexError, k.K, self.X)
+        self.assertRaises(IndexError, k.K, self.X[:, :self.D])
         k = GPy.kern.Matern32(2, active_dims=[2,self.D-1]) + GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D)
         k.randomize()
         # assert it runs:
@@ -314,7 +412,7 @@ class KernelGradientTestsContinuous(unittest.TestCase):
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
 
     def test_RBF(self):
-        k = GPy.kern.RBF(self.D, ARD=True)
+        k = GPy.kern.RBF(self.D-1, ARD=True)
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
 
@@ -329,9 +427,8 @@ class KernelGradientTestsContinuous(unittest.TestCase):
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
 
     def test_Fixed(self):
-        Xall = np.concatenate([self.X, self.X])
-        cov = np.dot(Xall, Xall.T)
-        X = np.arange(self.N).reshape(1,self.N)
+        cov = np.dot(self.X, self.X.T)
+        X = np.arange(self.N).reshape(self.N, 1)
         k = GPy.kern.Fixed(1, cov)
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=X, X2=None, verbose=verbose))
@@ -354,11 +451,11 @@ class KernelGradientTestsContinuous(unittest.TestCase):
     def test_Precomputed(self):
         Xall = np.concatenate([self.X, self.X2])
         cov = np.dot(Xall, Xall.T)
-        X = np.arange(self.N).reshape(1,self.N)
-        X2 = np.arange(self.N,2*self.N+10).reshape(1,self.N+10)
+        X = np.arange(self.N).reshape(self.N, 1)
+        X2 = np.arange(self.N,2*self.N+10).reshape(self.N+10, 1)
         k = GPy.kern.Precomputed(1, cov)
         k.randomize()
-        self.assertTrue(check_kernel_gradient_functions(k, X=X, X2=X2, verbose=verbose))
+        self.assertTrue(check_kernel_gradient_functions(k, X=X, X2=X2, verbose=verbose, fixed_X_dims=[0]))
 
 class KernelTestsMiscellaneous(unittest.TestCase):
     def setUp(self):