diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 2ef07fa5..a6551e11 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -289,9 +289,11 @@ class kern(parameterised):
         assert X.shape[1] == self.D
         slices1, slices2 = self._process_slices(slices1, slices2)
         if X2 is None:
-            X2 = X
-        target = np.zeros((X.shape[0], X2.shape[0]))
-        [p.K(X[s1, i_s], X2[s2, i_s], target=target[s1, s2]) for p, i_s, s1, s2 in zip(self.parts, self.input_slices, slices1, slices2)]
+            target = np.zeros((X.shape[0], X.shape[0]))
+            [p.K(X[s1, i_s], None, target=target[s1, s2]) for p, i_s, s1, s2 in zip(self.parts, self.input_slices, slices1, slices2)]
+        else:
+            target = np.zeros((X.shape[0], X2.shape[0]))
+            [p.K(X[s1, i_s], X2[s2, i_s], target=target[s1, s2]) for p, i_s, s1, s2 in zip(self.parts, self.input_slices, slices1, slices2)]
         return target
 
     def dK_dtheta(self, dL_dK, X, X2=None, slices1=None, slices2=None):
@@ -308,10 +310,12 @@ class kern(parameterised):
         """
         assert X.shape[1] == self.D
         slices1, slices2 = self._process_slices(slices1, slices2)
-        if X2 is None:
-            X2 = X
         target = np.zeros(self.Nparam)
-        [p.dK_dtheta(dL_dK[s1, s2], X[s1, i_s], X2[s2, i_s], target[ps]) for p, i_s, ps, s1, s2 in zip(self.parts, self.input_slices, self.param_slices, slices1, slices2)]
+        if X2 is None:
+            [p.dK_dtheta(dL_dK[s1, s2], X[s1, i_s], None, target[ps]) for p, i_s, ps, s1, s2 in zip(self.parts, self.input_slices,self.param_slices, slices1, slices2)]
+        else:
+            [p.dK_dtheta(dL_dK[s1, s2], X[s1, i_s], X2[s2, i_s], target[ps]) for p, i_s, ps, s1, s2 in zip(self.parts, self.input_slices,self.param_slices, slices1, slices2)]
+
 
         return self._transform_gradients(target)
 
diff --git a/GPy/kern/linear.py b/GPy/kern/linear.py
index 78a8732a..78dbdf01 100644
--- a/GPy/kern/linear.py
+++ b/GPy/kern/linear.py
@@ -4,6 +4,7 @@
 
 from kernpart import kernpart
 import numpy as np
+from ..util.linalg import tdot
 
 class linear(kernpart):
     """
@@ -65,8 +66,11 @@ class linear(kernpart):
     def K(self,X,X2,target):
         if self.ARD:
             XX = X*np.sqrt(self.variances)
-            XX2 = X2*np.sqrt(self.variances)
-            target += np.dot(XX, XX2.T)
+            if X2 is None:
+                target += tdot(XX)
+            else:
+                XX2 = X2*np.sqrt(self.variances)
+                target += np.dot(XX, XX2.T)
         else:
             self._K_computations(X, X2)
             target += self.variances * self._dot_product
@@ -76,8 +80,11 @@ class linear(kernpart):
 
     def dK_dtheta(self,dL_dK,X,X2,target):
         if self.ARD:
-            product = X[:,None,:]*X2[None,:,:]
-            target += (dL_dK[:,:,None]*product).sum(0).sum(0)
+            if X2 is None:
+                [np.add(target[i:i+1],np.sum(dL_dK*tdot(X[:,i:i+1])),target[i:i+1]) for i in range(self.D)]
+            else:
+                product = X[:,None,:]*X2[None,:,:]
+                target += (dL_dK[:,:,None]*product).sum(0).sum(0)
         else:
             self._K_computations(X, X2)
             target += np.sum(self._dot_product*dL_dK)
@@ -133,9 +140,9 @@ class linear(kernpart):
         returns N,M,M matrix
         """
         self._psi_computations(Z,mu,S)
-        psi2 = self.ZZ*np.square(self.variances)*self.mu2_S[:, None, None, :]
-        target += psi2.sum(-1)
-        #TODO: this could be faster using np.tensordot
+        #psi2 = self.ZZ*np.square(self.variances)*self.mu2_S[:, None, None, :]
+        #target += psi2.sum(-1)
+        target += np.tensordot(self.ZZ[None,:,:,:]*np.square(self.variances),self.mu2_S[:, None, None, :],((3),(3))).squeeze().T
 
     def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target):
         self._psi_computations(Z,mu,S)
@@ -156,28 +163,30 @@ class linear(kernpart):
         self._psi_computations(Z,mu,S)
         mu2_S = np.sum(self.mu2_S,0)# Q,
         target += (dL_dpsi2[:,:,:,None] * (self.mu2_S[:,None,None,:]*(Z*np.square(self.variances)[None,:])[None,None,:,:])).sum(0).sum(1)
+        #TODO: tensordot would gain some time here
 
     #---------------------------------------#
     #            Precomputations            #
     #---------------------------------------#
 
     def _K_computations(self,X,X2):
-        if X2 is None:
-            X2 = X
-        if not (np.all(X==self._Xcache) and np.all(X2==self._X2cache)):
-            self._Xcache = X
-            self._X2cache = X2
-            self._dot_product = np.dot(X,X2.T)
-        else:
-            # print "Cache hit!"
-            pass # TODO: insert debug message here (logging framework)
+        if not (np.array_equal(X, self._Xcache) and np.array_equal(X2, self._X2cache)):
+            self._Xcache = X.copy()
+            if X2 is None:
+                self._dot_product = tdot(X)
+                self._X2cache = None
+            else:
+                self._X2cache = X2.copy()
+                self._dot_product = np.dot(X,X2.T)
 
     def _psi_computations(self,Z,mu,S):
         #here are the "statistics" for psi1 and psi2
         if not np.all(Z==self._Z):
             #Z has changed, compute Z specific stuff
-            self.ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
-            self._Z = Z
+            #self.ZZ = Z[:,None,:]*Z[None,:,:] # M,M,Q
+            self.ZZ = np.empty((Z.shape[0],Z.shape[0],Z.shape[1]),order='F')
+            [tdot(Z[:,i:i+1],self.ZZ[:,:,i].T) for i in xrange(Z.shape[1])]
+            self._Z = Z.copy()
         if not (np.all(mu==self._mu) and np.all(S==self._S)):
             self.mu2_S = np.square(mu)+S
-            self._mu, self._S = mu, S
+            self._mu, self._S = mu.copy(), S.copy()
diff --git a/GPy/kern/rbf.py b/GPy/kern/rbf.py
index 9ff7a93e..027e5e9e 100644
--- a/GPy/kern/rbf.py
+++ b/GPy/kern/rbf.py
@@ -6,6 +6,7 @@ from kernpart import kernpart
 import numpy as np
 import hashlib
 from scipy import weave
+from ..util.linalg import tdot
 
 class rbf(kernpart):
     """
@@ -74,11 +75,8 @@ class rbf(kernpart):
             return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
 
     def K(self,X,X2,target):
-        if X2 is None:
-            X2 = X
-
         self._K_computations(X,X2)
-        np.add(self.variance*self._K_dvar, target,target)
+        target += self.variance*self._K_dvar
 
     def Kdiag(self,X,target):
         np.add(target,self.variance,target)
@@ -87,6 +85,7 @@ class rbf(kernpart):
         self._K_computations(X,X2)
         target[0] += np.sum(self._K_dvar*dL_dK)
         if self.ARD:
+            if X2 is None: X2 = X
             [np.add(target[1+q:2+q],(self.variance/self.lengthscale[q]**3)*np.sum(self._K_dvar*dL_dK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.D)]
         else:
             target[1] += (self.variance/self.lengthscale)*np.sum(self._K_dvar*self._K_dist2*dL_dK)
@@ -182,29 +181,31 @@ class rbf(kernpart):
     #---------------------------------------#
 
     def _K_computations(self,X,X2):
-        if not (np.all(X==self._X) and np.all(X2==self._X2) and np.all(self._params == self._get_params())):
+        if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())):
             self._X = X.copy()
-            self._X2 = X2.copy()
             self._params == self._get_params().copy()
-            if X2 is None: X2 = X
-            #never do this: self._K_dist = X[:,None,:]-X2[None,:,:] # this can be computationally heavy
-            #_K_dist = X[:,None,:]-X2[None,:,:]
-            #_K_dist2 = np.square(_K_dist/self.lengthscale)
-            X = X/self.lengthscale
-            X2 = X2/self.lengthscale
-            self._K_dist2 = (-2.*np.dot(X, X2.T) + np.sum(np.square(X),1)[:,None] + np.sum(np.square(X2),1)[None,:])
+            if X2 is None:
+                self._X2 = None
+                X = X/self.lengthscale
+                Xsquare = np.sum(np.square(X),1)
+                self._K_dist2 = (-2.*tdot(X) + Xsquare[:,None] + Xsquare[None,:])
+            else:
+                self._X2 = X2.copy()
+                X = X/self.lengthscale
+                X2 = X2/self.lengthscale
+                self._K_dist2 = (-2.*np.dot(X, X2.T) + np.sum(np.square(X),1)[:,None] + np.sum(np.square(X2),1)[None,:])
             self._K_dvar = np.exp(-0.5*self._K_dist2)
 
     def _psi_computations(self,Z,mu,S):
         #here are the "statistics" for psi1 and psi2
-        if not np.all(Z==self._Z):
+        if not np.array_equal(Z, self._Z):
             #Z has changed, compute Z specific stuff
             self._psi2_Zhat = 0.5*(Z[:,None,:] +Z[None,:,:]) # M,M,Q
             self._psi2_Zdist = 0.5*(Z[:,None,:]-Z[None,:,:]) # M,M,Q
             self._psi2_Zdist_sq = np.square(self._psi2_Zdist/self.lengthscale) # M,M,Q
             self._Z = Z
 
-        if not (np.all(Z==self._Z) and np.all(mu==self._mu) and np.all(S==self._S)):
+        if not (np.array_equal(Z, self._Z) and np.array_equal(mu, self._mu) and np.array_equal(S, self._S)):
             #something's changed. recompute EVERYTHING
 
             #psi1
diff --git a/GPy/kern/white.py b/GPy/kern/white.py
index f5d6894a..be6aad45 100644
--- a/GPy/kern/white.py
+++ b/GPy/kern/white.py
@@ -30,17 +30,15 @@ class white(kernpart):
         return ['variance']
 
     def K(self,X,X2,target):
-        if X.shape==X2.shape:
-            if np.all(X==X2):
-                np.add(target,np.eye(X.shape[0])*self.variance,target)
+        if X2 is None:
+            target += np.eye(X.shape[0])*self.variance
 
     def Kdiag(self,X,target):
         target += self.variance
 
     def dK_dtheta(self,dL_dK,X,X2,target):
-        if X.shape==X2.shape:
-            if np.all(X==X2):
-                target += np.trace(dL_dK)
+        if X2 is None:
+            target += np.trace(dL_dK)
 
     def dKdiag_dtheta(self,dL_dKdiag,X,target):
         target += np.sum(dL_dKdiag)
diff --git a/GPy/likelihoods/Gaussian.py b/GPy/likelihoods/Gaussian.py
index 25d12491..d3696fa6 100644
--- a/GPy/likelihoods/Gaussian.py
+++ b/GPy/likelihoods/Gaussian.py
@@ -30,7 +30,7 @@ class Gaussian(likelihood):
             self.trYYT = np.trace(self.YYT)
         else:
             self.YYT = None
-            self.trYYT = None
+            self.trYYT = np.sum(np.square(self.Y))
 
     def _get_params(self):
         return np.asarray(self._variance)
diff --git a/GPy/models/Bayesian_GPLVM.py b/GPy/models/Bayesian_GPLVM.py
index 0d4cf91e..6333fb1c 100644
--- a/GPy/models/Bayesian_GPLVM.py
+++ b/GPy/models/Bayesian_GPLVM.py
@@ -33,7 +33,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
             X = self.initialise_latent(init, Q, Y)
 
         if X_variance is None:
-            X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0, 1)
+            X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1)
 
         if Z is None:
             Z = np.random.permutation(X.copy())[:M]
diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py
index dc77e795..697a9978 100644
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 import pylab as pb
-from ..util.linalg import mdot, jitchol, chol_inv, pdinv, trace_dot
+from ..util.linalg import mdot, jitchol, chol_inv, pdinv, trace_dot, tdot
 from ..util.plot import gpplot
 from .. import kern
 from GP import GP
@@ -50,9 +50,6 @@ class sparse_GP(GP):
             self.has_uncertain_inputs=True
             self.X_variance = X_variance
 
-        if not self.likelihood.is_heteroscedastic:
-            self.likelihood.trYYT = np.trace(np.dot(self.likelihood.Y, self.likelihood.Y.T)) # TODO: something more elegant here?
-
         GP.__init__(self, X, likelihood, kernel=kernel, normalize_X=normalize_X, Xslices=Xslices)
 
         #normalize X uncertainty also
@@ -86,13 +83,15 @@ class sparse_GP(GP):
                 self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision.flatten().reshape(self.N,1,1)/sf2)).sum(0)
             else:
                 tmp = self.psi1*(np.sqrt(self.likelihood.precision.flatten().reshape(1,self.N))/sf)
-                self.psi2_beta_scaled = np.dot(tmp,tmp.T)
+                #self.psi2_beta_scaled = np.dot(tmp,tmp.T)
+                self.psi2_beta_scaled = tdot(tmp)
         else:
             if self.has_uncertain_inputs:
                 self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision/sf2)).sum(0)
             else:
                 tmp = self.psi1*(np.sqrt(self.likelihood.precision)/sf)
-                self.psi2_beta_scaled = np.dot(tmp,tmp.T)
+                #self.psi2_beta_scaled = np.dot(tmp,tmp.T)
+                self.psi2_beta_scaled = tdot(tmp)
 
         self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
 
@@ -110,10 +109,11 @@ class sparse_GP(GP):
         self.psi1V = np.dot(self.psi1, self.V)
         tmp = linalg.lapack.flapack.dtrtrs(self.Lm,np.asfortranarray(self.Bi),lower=1,trans=1)[0]
         self.C = linalg.lapack.flapack.dtrtrs(self.Lm,np.asfortranarray(tmp.T),lower=1,trans=1)[0]
+        #TODO: can we multiply in C by forwardsubstitution?
         self.Cpsi1V = np.dot(self.C,self.psi1V)
         self.Cpsi1VVpsi1 = np.dot(self.Cpsi1V,self.psi1V.T)
-        #self.E = np.dot(self.Cpsi1VVpsi1,self.C)/sf2
-        self.E = np.dot(self.Cpsi1V/sf,self.Cpsi1V.T/sf)
+        #self.E = np.dot(self.Cpsi1V/sf,self.Cpsi1V.T/sf)
+        self.E = tdot(self.Cpsi1V/sf)
 
         # Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertin inputs case
         self.dL_dpsi0 = - 0.5 * self.D * (self.likelihood.precision * np.ones([self.N,1])).flatten()
@@ -166,9 +166,9 @@ class sparse_GP(GP):
             #self.partial_for_likelihood += -np.diag(np.dot((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) , self.psi1VVpsi1 ))*self.likelihood.precision #dD
         else:
             #likelihood is not heterscedatic
-            self.partial_for_likelihood =   - 0.5 * self.N*self.D*self.likelihood.precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self.likelihood.precision**2
+            self.partial_for_likelihood =   - 0.5 * self.N*self.D*self.likelihood.precision + 0.5 * self.likelihood.trYYT*self.likelihood.precision**2
             self.partial_for_likelihood += 0.5 * self.D * (self.psi0.sum()*self.likelihood.precision**2 - np.trace(self.A)*self.likelihood.precision*sf2)
-            self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self.likelihood.precision # TODO: unstable?
+            self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self.likelihood.precision
             self.partial_for_likelihood += self.likelihood.precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1))
 
 
diff --git a/GPy/testing/unit_tests.py b/GPy/testing/unit_tests.py
index 55a1fb65..ee8368ac 100644
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@@ -112,6 +112,16 @@ class GradientTests(unittest.TestCase):
         bias = GPy.kern.bias(2)
         self.check_model_with_white(bias, model_type='GP_regression', dimension=2)
 
+    def test_GP_regression_linear_kern_1D_ARD(self):
+        ''' Testing the GP regression with linear kernel on 1d data '''
+        linear = GPy.kern.linear(1,ARD=True)
+        self.check_model_with_white(linear, model_type='GP_regression', dimension=1)
+
+    def test_GP_regression_linear_kern_2D_ARD(self):
+        ''' Testing the GP regression with linear kernel on 2d data '''
+        linear = GPy.kern.linear(2,ARD=True)
+        self.check_model_with_white(linear, model_type='GP_regression', dimension=2)
+
     def test_GP_regression_linear_kern_1D(self):
         ''' Testing the GP regression with linear kernel on 1d data '''
         linear = GPy.kern.linear(1)
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 34e30dca..b19aa2b6 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -196,7 +196,7 @@ def tdot_blas(mat, out=None):
     if (mat.dtype != 'float64') or (len(mat.shape) != 2):
         return np.dot(mat, mat.T)
     nn = mat.shape[0]
-    if not out:
+    if out is None:
         out = np.zeros((nn,nn))
     else:
         assert(out.dtype == 'float64')
@@ -211,7 +211,7 @@ def tdot_blas(mat, out=None):
     # could avoid the copy. I also thought swapping to cblas API would allow use
     # of C order. However, I tried that and had errors with large matrices:
     # http://homepages.inf.ed.ac.uk/imurray2/code/tdot/tdot_broken.py
-    mat = mat.copy(order='F')
+    mat = np.asfortranarray(mat)
     TRANS = c_char('n')
     N = c_int(mat.shape[0])
     K = c_int(mat.shape[1])
@@ -225,7 +225,7 @@ def tdot_blas(mat, out=None):
     _blaslib.dsyrk_(byref(UPLO), byref(TRANS), byref(N), byref(K),
             byref(ALPHA), A, byref(LDA), byref(BETA), C, byref(LDC))
 
-    symmetrify(out.T)
+    symmetrify(out,upper=True)
 
     return out
 
@@ -235,7 +235,7 @@ def tdot(*args, **kwargs):
     else:
         return tdot_numpy(*args,**kwargs)
 
-def symmetrify(A):
+def symmetrify(A,upper=False):
     """
     Take the square matrix A and make it symmetrical by copting elements from the lower half to the upper
 
@@ -257,9 +257,13 @@ def symmetrify(A):
       }
     }
     """
-    if A.flags['C_CONTIGUOUS']:
+    if A.flags['C_CONTIGUOUS'] and upper:
+        weave.inline(f_contig_code,['A','N'])
+    elif A.flags['C_CONTIGUOUS'] and not upper:
         weave.inline(c_contig_code,['A','N'])
-    elif A.flags['F_CONTIGUOUS']:
+    elif A.flags['F_CONTIGUOUS'] and upper:
+        weave.inline(c_contig_code,['A','N'])
+    elif A.flags['F_CONTIGUOUS'] and not upper:
         weave.inline(f_contig_code,['A','N'])
     else:
         tmp = np.tril(A)