diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py
index a06c6c80..47a90fd2 100644
--- a/GPy/inference/latent_function_inference/var_dtc_gpu.py
+++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py
@@ -70,10 +70,6 @@ class VarDTC_GPU(object):
                              'dL_dpsi0_gpu'         :gpuarray.empty((self.batchsize,),np.float64,order='F'),
                              'dL_dpsi1_gpu'         :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
                              'dL_dpsi2_gpu'         :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
-                             'dL_dthetaL_gpu'       :gpuarray.empty((self.batchsize,),np.float64,order='F'),
-                             'betapsi1_gpu'         :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
-                             'thetaL_t_gpu'         :gpuarray.empty((self.batchsize,),np.float64,order='F'),
-                             'betaYT2_gpu'          :gpuarray.empty((output_dim,self.batchsize),np.float64,order='F'),
                              'psi0p_gpu'            :gpuarray.empty((self.batchsize,),np.float64,order='F'),
                              'psi1p_gpu'            :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
                              'psi2p_gpu'            :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
@@ -377,13 +373,12 @@ class VarDTC_GPU(object):
         # Compute dL_dthetaL for uncertian input and non-heter noise
         #======================================================================        
         
-        if uncertain_inputs and not het_noise:
-            dL_dthetaL = (YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta)/2.
-            dL_dthetaL += -beta*cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2_gpu.gpudata,1)
-            dL_dthetaL += -beta*cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2_gpu.gpudata,1)
-            dL_dthetaL += traceDot(v_gpu, psi1Y_gpu, num_inducing, output_dim)
-            self.midRes['dL_dthetaL'] = dL_dthetaL
-
+        if not het_noise:
+            dL_dthetaL = (YRY_full + output_dim*psi0_full - num_data*output_dim)/-2.
+            dL_dthetaL += cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2_gpu.gpudata,1)
+            dL_dthetaL += cublas.cublasDdot(self.cublas_handle,v_gpu.size, v_gpu.gpudata,1,psi1Y_gpu.gpudata,1)
+            self.midRes['dL_dthetaL'] = -beta*dL_dthetaL
+            
         return logL, dL_dKmm_gpu.get(), post
 
     def inference_minibatch(self, kern, X, Z, likelihood, Y):
@@ -419,22 +414,22 @@ class VarDTC_GPU(object):
             beta = beta[n_start] # nSlice==1
         
         if kern.useGPU:
-            if uncertain_inputs:
-                psi0p_gpu = kern.psi0(Z, X_slice)
-                psi1p_gpu = kern.psi1(Z, X_slice)
-                psi2p_gpu = kern.psi2(Z, X_slice)
-            else:
+            if not uncertain_inputs:
                 psi0p_gpu = kern.Kdiag(X_slice)
                 psi1p_gpu = kern.K(X_slice, Z)
                 psi2p_gpu = self.gpuCache['psi2p_gpu']
-        else:
-            if uncertain_inputs:
+            elif het_noise:
+                psi0p_gpu = kern.psi0(Z, X_slice)
+                psi1p_gpu = kern.psi1(Z, X_slice)
+                psi2p_gpu = kern.psi2(Z, X_slice)
+        elif not uncertain_inputs or het_noise:
+            if not uncertain_inputs:
+                psi0 = kern.Kdiag(X_slice)
+                psi1 = kern.K(X_slice, Z)
+            elif het_noise:
                 psi0 = kern.psi0(Z, X_slice)
                 psi1 = kern.psi1(Z, X_slice)
                 psi2 = kern.psi2(Z, X_slice)
-            else:
-                psi0 = kern.Kdiag(X_slice)
-                psi1 = kern.K(X_slice, Z)
 
             psi0p_gpu = self.gpuCache['psi0p_gpu']
             psi1p_gpu = self.gpuCache['psi1p_gpu']
@@ -446,43 +441,23 @@ class VarDTC_GPU(object):
             psi1p_gpu.set(np.asfortranarray(psi1))
             if uncertain_inputs:
                 psi2p_gpu.set(np.asfortranarray(psi2))
-                
-        #======================================================================
-        # Prepare gpu memory
-        #======================================================================
-        
-        dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu']
-        v_gpu = self.gpuCache['v_gpu']        
-        betaYT_gpu = self.gpuCache['betaYT_gpu']
-        beta_gpu = self.gpuCache['beta_gpu']
-        dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu']
-        dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
-        dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
-        dL_dthetaL_gpu = self.gpuCache['dL_dthetaL_gpu']
-        psi2R_gpu = self.gpuCache['psi2_t_gpu'][:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
-        betapsi1_gpu = self.gpuCache['betapsi1_gpu']
-        thetaL_t_gpu = self.gpuCache['thetaL_t_gpu']
-        betaYT2_gpu = self.gpuCache['betaYT2_gpu']
-        
-        betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
-        beta_gpu_slice = beta_gpu[n_start:n_end]
-
-        # Adjust to the batch size
-        if dL_dpsi0_gpu.shape[0] > nSlice:
-            betaYT2_gpu = betaYT2_gpu[:,:nSlice]
-            dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice]
-            dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
-            dL_dpsi2_gpu = dL_dpsi2_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
-            dL_dthetaL_gpu = dL_dthetaL_gpu.ravel()[:nSlice]
-            psi2R_gpu = psi2R_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
-            thetaL_t_gpu = thetaL_t_gpu.ravel()[:nSlice]
-            betapsi1_gpu = betapsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
-        
-        mul_bcast(betapsi1_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size)
-
+                            
         #======================================================================
         # Compute dL_dpsi
         #======================================================================
+
+        dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu']
+        v_gpu = self.gpuCache['v_gpu']        
+        dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu']
+        dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
+        dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
+        betaYT_gpu = self.gpuCache['betaYT_gpu']
+        betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
+        
+        # Adjust to the batch size
+        if dL_dpsi0_gpu.shape[0] > nSlice:
+            dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice]
+            dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
         
         dL_dpsi0_gpu.fill(-output_dim *beta/2.)
         
@@ -490,52 +465,18 @@ class VarDTC_GPU(object):
         
         if uncertain_inputs:
             cublas.cublasDcopy(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, dL_dpsi2_gpu.gpudata, 1)
-            cublas.cublasDscal(self.cublas_handle, dL_dpsi2_gpu.szie, beta, dL_dpsi2_gpu.gpudata, 1)
+            cublas.cublasDscal(self.cublas_handle, dL_dpsi2_gpu.size, beta, dL_dpsi2_gpu.gpudata, 1)
         else:
-            cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, 1.0, betapsi1_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice)
-            
+            cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, beta, psi1p_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice)
+
         #======================================================================
         # Compute dL_dthetaL
         #======================================================================
-        
-        if uncertain_inputs and not het_noise:
-            if isEnd:
-                dL_dthetaL = self.midRes['dL_dthetaL']
-            else:
-                dL_dthetaL = 0
-        
-#         if not uncertain_inputs:
-#             cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, nSlice, beta, psi1p_gpu.gpudata, nSlice, psi1p_gpu.gpudata, nSlice, 1.0, psi2p_gpu.gpudata, num_inducing)
-#            join_prod(psi2p_gpu,psi1p_gpu,psi1p_gpu,nSlice,num_inducing)
-
-#             psiR = cublas.cublasDdot(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, psi2p_gpu.gpudata, 1)
-#     #        mul_bcast_first(psi2R_gpu,dL_dpsi2R_gpu,psi2p_gpu,nSlice)
-#     
-#             dL_dthetaL_gpu.fill(0.)
-#             dL_dthetaL = 0.
-#             
-#             dL_dthetaL += cublas.cublasDdot(self.cublas_handle, betaYT_gpu_slice.size, betaYT_gpu_slice.gpudata,1,betaYT_gpu_slice.gpudata,1)
-#             
-#             cublas.cublasDcopy(self.cublas_handle, betaYT_gpu_slice.size, betaYT_gpu_slice.gpudata, 1, betaYT2_gpu.gpudata, 1)
-#             mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT2_gpu,betaYT2_gpu.size)
-#             cublas.cublasDscal(self.cublas_handle, betaYT2_gpu.size, 0.5, betaYT2_gpu.gpudata, 1)
-#             sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim)
-#             
-#             cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/(-2.0), beta_gpu_slice.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
-#             cublas.cublasDcopy(self.cublas_handle, beta_gpu_slice.size, beta_gpu_slice.gpudata, 1, thetaL_t_gpu.gpudata, 1)
-#             mul_bcast(thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu.size)
-#             mul_bcast(thetaL_t_gpu,thetaL_t_gpu,psi0p_gpu,thetaL_t_gpu.size)
-#             cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/2.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
-#             
-#             thetaL_t_gpu.fill(0.)
-#             sum_axis(thetaL_t_gpu, psi2R_gpu, nSlice, num_inducing*num_inducing)
-#             mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size)
-#             mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size)
-#             cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, -1.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
-#             
-#             cublas.cublasDgemm(self.cublas_handle, 'T', 'T', output_dim, nSlice, num_inducing, -1.0, v_gpu.gpudata, num_inducing, betapsi1_gpu.gpudata, nSlice, 0.0, betaYT2_gpu.gpudata, output_dim)
-#             mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size)
-#             sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim)
+        if het_noise:
+            betaY = betaYT_gpu_slice.get()
+            dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0p_gpu.get())-output_dim*beta)/2.
+            dL_dthetaL += -beta*beta*cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2p_gpu.gpudata,1)
+            dL_dthetaL += -beta*(betaY*np.dot(psi1p_gpu.get(),v_gpu.get())).sum(axis=-1)
 
         if kern.useGPU:
             dL_dpsi0 = dL_dpsi0_gpu
@@ -548,10 +489,11 @@ class VarDTC_GPU(object):
                 dL_dpsi2 = dL_dpsi2_gpu
             else:
                 dL_dpsi2 = dL_dpsi2_gpu.get()
-        if het_noise:
-            dL_dthetaL = dL_dthetaL_gpu.get()
-        else:
-            dL_dthetaL = gpuarray.sum(dL_dthetaL_gpu).get()
+        if not het_noise:
+            if isEnd:
+                dL_dthetaL = self.midRes['dL_dthetaL']
+            else:
+                dL_dthetaL = 0.
         if uncertain_inputs:
             grad_dict = {'dL_dpsi0':dL_dpsi0,
                          'dL_dpsi1':dL_dpsi1,
diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py
index da7e67cb..f2a3c3fd 100644
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@@ -116,7 +116,7 @@ class VarDTC_minibatch(object):
                 if het_noise:
                     psi2_full += beta_slice*np.outer(psi1,psi1)
                 else:
-                    psi2_full += np.outer(psi1.T,psi1.T)
+                    psi2_full += np.dot(psi1.T,psi1)
                 
         if not het_noise:
             psi0_full *= beta
@@ -176,7 +176,7 @@ class VarDTC_minibatch(object):
         # Compute dL_dthetaL for uncertian input and non-heter noise
         #======================================================================        
         
-        if uncertain_inputs and not het_noise:
+        if not het_noise:
             dL_dthetaL = (YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2_full).sum() - beta*(v.T*psi1Y_full).sum()
             self.midRes['dL_dthetaL'] = dL_dthetaL
 
@@ -265,14 +265,10 @@ class VarDTC_minibatch(object):
             
             dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1)
         else:
-            if uncertain_inputs:
-                if isEnd:
-                    dL_dthetaL = self.midRes['dL_dthetaL']
-                else:
-                    dL_dthetaL = 0.
+            if isEnd:
+                dL_dthetaL = self.midRes['dL_dthetaL']
             else:
-                psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)
-                dL_dthetaL = ((np.square(betaY)).sum() + beta*beta*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - beta*beta*psiR- (betaY*np.dot(betapsi1,v)).sum()
+                dL_dthetaL = 0.
                 
         if uncertain_inputs:
             grad_dict = {'dL_dpsi0':dL_dpsi0,
diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index dba12700..7eff3f89 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -42,7 +42,8 @@ class SSGPLVM(SparseGP):
             X_variance = np.random.uniform(0,.1,X.shape)
             
         gamma = np.empty_like(X, order='F') # The posterior probabilities of the binary variable in the variational approximation
-        gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim)
+        #gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim)
+        gamma[:] = 0.5
         
         if group_spike:
             gamma[:] = gamma.mean(axis=0)
diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py
index cd656dd3..cf6d483e 100644
--- a/GPy/util/linalg_gpu.py
+++ b/GPy/util/linalg_gpu.py
@@ -19,8 +19,8 @@ try:
     
     strideSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?x[i]:0", arguments="double *x, int step")
     
-    # np.trace(np.dot(A,B)) (also equivalent to (A*B.T).sum() ) A - n x m, B - m x n
-    traceDot = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="A[i]*B[(i%n)*m+i/n]", arguments="double *A, double *B, int n, int m")
+    # np.trace(np.dot(A,B)) (also equivalent to (A*B.T).sum() ) A - a1 x a2, B - a2 x a1
+    traceDot = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="A[i]*B[(i%a1)*a2+i/a1]", arguments="double *A, double *B, int a1, int a2")
     
     #=======================================================================================
     # Element-wise functions