diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index df3160f1..74a16894 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -54,11 +54,11 @@ class SparseGP(GPBase): self.Kmm = self.kern.K(self.Z) if self.has_uncertain_inputs: self.psi0 = self.kern.psi0(self.Z, self.X, self.X_variance) - self.psi1 = self.kern.psi1(self.Z, self.X, self.X_variance).T + self.psi1 = self.kern.psi1(self.Z, self.X, self.X_variance) self.psi2 = self.kern.psi2(self.Z, self.X, self.X_variance) else: self.psi0 = self.kern.Kdiag(self.X) - self.psi1 = self.kern.K(self.Z, self.X) + self.psi1 = self.kern.K(self.X, self.Z) self.psi2 = None def _computations(self): @@ -75,12 +75,13 @@ class SparseGP(GPBase): evals, evecs = linalg.eigh(psi2_beta) clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable tmp = evecs * np.sqrt(clipped_evals) + tmp = tmp.T else: if self.likelihood.is_heteroscedastic: - tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.num_data))) + tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(self.num_data,1))) else: tmp = self.psi1 * (np.sqrt(self.likelihood.precision)) - tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1) + tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(tmp.T), lower=1) self.A = tdot(tmp) @@ -89,7 +90,7 @@ class SparseGP(GPBase): self.LB = jitchol(self.B) # TODO: make a switch for either first compute psi1V, or VV.T - self.psi1V = np.dot(self.psi1, self.likelihood.V) + self.psi1V = np.dot(self.psi1.T, self.likelihood.V) # back substutue C into psi1V tmp, info1 = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0) @@ -107,14 +108,14 @@ class SparseGP(GPBase): # Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertain inputs case self.dL_dpsi0 = -0.5 * self.output_dim * (self.likelihood.precision * np.ones([self.num_data, 1])).flatten() - self.dL_dpsi1 = np.dot(self.Cpsi1V, 
self.likelihood.V.T) + self.dL_dpsi1 = np.dot(self.Cpsi1V, self.likelihood.V.T).T dL_dpsi2_beta = 0.5 * backsub_both_sides(self.Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi) if self.likelihood.is_heteroscedastic: if self.has_uncertain_inputs: self.dL_dpsi2 = self.likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :] else: - self.dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, self.psi1 * self.likelihood.precision.reshape(1, self.num_data)) + self.dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (self.psi1 * self.likelihood.precision.reshape(self.num_data, 1)).T).T self.dL_dpsi2 = None else: dL_dpsi2 = self.likelihood.precision * dL_dpsi2_beta @@ -123,7 +124,7 @@ class SparseGP(GPBase): self.dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], self.num_data, axis=0) else: # subsume back into psi1 (==Kmn) - self.dL_dpsi1 += 2.*np.dot(dL_dpsi2, self.psi1) + self.dL_dpsi1 += 2.*np.dot(self.psi1, dL_dpsi2) self.dL_dpsi2 = None @@ -179,10 +180,10 @@ class SparseGP(GPBase): Kmmi = tdot(Lmi.T) diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)]) - self.likelihood.fit_FITC(self.Kmm, self.psi1, diag_tr_psi2Kmmi) # This uses the fit_FITC code, but does not perfomr a FITC-EP.#TODO solve potential confusion + self.likelihood.fit_FITC(self.Kmm, self.psi1.T, diag_tr_psi2Kmmi) # This uses the fit_FITC code, but does not perform a FITC-EP.#TODO solve potential confusion # raise NotImplementedError, "EP approximation not implemented for uncertain inputs" else: - self.likelihood.fit_DTC(self.Kmm, self.psi1) + self.likelihood.fit_DTC(self.Kmm, self.psi1.T) # self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0) self._set_params(self._get_params()) # update the GP @@ -196,10 +197,10 @@ class SparseGP(GPBase): dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z) if self.has_uncertain_inputs: dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X, self.X_variance) - dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T, 
self.Z, self.X, self.X_variance) + dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X, self.X_variance) dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X, self.X_variance) else: - dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.Z, self.X) + dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X, self.Z) dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) return dL_dtheta @@ -213,7 +214,7 @@ class SparseGP(GPBase): dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance) dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance) else: - dL_dZ += self.kern.dK_dX(self.dL_dpsi1, self.Z, self.X) + dL_dZ += self.kern.dK_dX(self.dL_dpsi1.T, self.Z, self.X) return dL_dZ def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): @@ -233,9 +234,9 @@ class SparseGP(GPBase): Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts) var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0) else: - # assert which_parts=='all', "swithching out parts of variational kernels is not implemented" Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts - mu = np.dot(Kx, self.Cpsi1V) + mu = np.dot(Kx.T, self.Cpsi1V) if full_cov: raise NotImplementedError, "TODO" else: diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py index 4a05783b..76a0d99e 100644 --- a/GPy/kern/kern.py +++ b/GPy/kern/kern.py @@ -360,9 +360,7 @@ class kern(Parameterised): raise NotImplementedError("We don't handle linear/linear cross-terms") tmp = np.zeros((mu.shape[0], Z.shape[0])) p1.psi1(Z, mu, S, tmp) - tmp2 = np.zeros_like(target) - p2.dpsi1_dZ((tmp[:, None, :] * dL_dpsi2).sum(1).T, Z, mu, S, tmp2) - target += tmp2 + p2.dpsi1_dZ((tmp[:, None, :] * dL_dpsi2).sum(1), Z, mu, S, target) return target * 2 @@ -378,7 +376,7 @@ class kern(Parameterised): tmp = np.zeros((mu.shape[0], 
Z.shape[0])) p1.psi1(Z, mu, S, tmp) - p2.dpsi1_dmuS((tmp[:, None, :] * dL_dpsi2).sum(1).T * 2., Z, mu, S, target_mu, target_S) + p2.dpsi1_dmuS((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target_mu, target_S) return target_mu, target_S diff --git a/GPy/kern/linear.py b/GPy/kern/linear.py index b2aa29f9..55582114 100644 --- a/GPy/kern/linear.py +++ b/GPy/kern/linear.py @@ -98,7 +98,7 @@ class linear(Kernpart): target += tmp.sum() def dK_dX(self, dL_dK, X, X2, target): - target += (((X2[:, None, :] * self.variances)) * dL_dK[:, :, None]).sum(0) + target += (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1) #---------------------------------------# # PSI statistics # @@ -131,7 +131,7 @@ class linear(Kernpart): def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S): """Do nothing for S, it does not affect psi1""" self._psi_computations(Z, mu, S) - target_mu += (dL_dpsi1.T[:, :, None] * (Z * self.variances)).sum(1) + target_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1) def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target): self.dK_dX(dL_dpsi1.T, Z, mu, target) diff --git a/GPy/kern/rbf.py b/GPy/kern/rbf.py index d1e0797c..5686d7bd 100644 --- a/GPy/kern/rbf.py +++ b/GPy/kern/rbf.py @@ -178,13 +178,13 @@ class rbf(Kernpart): self._psi_computations(Z, mu, S) denominator = (self.lengthscale2 * (self._psi1_denom)) dpsi1_dZ = -self._psi1[:, :, None] * ((self._psi1_dist / denominator)) - target += np.sum(dL_dpsi1.T[:, :, None] * dpsi1_dZ, 0) + target += np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0) def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S): self._psi_computations(Z, mu, S) tmp = self._psi1[:, :, None] / self.lengthscale2 / self._psi1_denom - target_mu += np.sum(dL_dpsi1.T[:, :, None] * tmp * self._psi1_dist, 1) - target_S += np.sum(dL_dpsi1.T[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1) + target_mu += np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1) + target_S += np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * 
(self._psi1_dist_sq - 1), 1) def psi2(self, Z, mu, S, target): self._psi_computations(Z, mu, S) diff --git a/GPy/models/sparse_gplvm.py b/GPy/models/sparse_gplvm.py index 76fe65f1..ea2f8013 100644 --- a/GPy/models/sparse_gplvm.py +++ b/GPy/models/sparse_gplvm.py @@ -43,7 +43,7 @@ class SparseGPLVM(SparseGPRegression, GPLVM): def dL_dX(self): dL_dX = self.kern.dKdiag_dX(self.dL_dpsi0, self.X) - dL_dX += self.kern.dK_dX(self.dL_dpsi1.T, self.X, self.Z) + dL_dX += self.kern.dK_dX(self.dL_dpsi1, self.X, self.Z) return dL_dX