diff --git a/GPy/examples/oil_flow_demo.py b/GPy/examples/oil_flow_demo.py
index eee74461..f977e18d 100644
--- a/GPy/examples/oil_flow_demo.py
+++ b/GPy/examples/oil_flow_demo.py
@@ -20,26 +20,28 @@ def plot_oil(X, theta, labels, label):
     plt.plot(flow_type[:,0], flow_type[:,1], 'bx')
     plt.title(label)
 
-data = pickle.load(open('../util/datasets/oil_flow_3classes.pickle', 'r'))
+data = pickle.load(open('../../../GPy_assembla/datasets/oil_flow_3classes.pickle', 'r'))
 Y = data['DataTrn']
 N, D = Y.shape
-selected = np.random.permutation(N)[:200]
+selected = np.random.permutation(N)#[:200]
 labels = data['DataTrnLbls'][selected]
 Y = Y[selected]
 N, D = Y.shape
 Y -= Y.mean(axis=0)
-Y /= Y.std(axis=0)
+#Y /= Y.std(axis=0)
 
-Q = 2
-m1 = GPy.models.sparse_GPLVM(Y, Q, M = 15)
-m1.constrain_positive('(rbf|bias|noise)')
-m1.constrain_bounded('white', 1e-6, 1.0)
+Q = 7
+k = GPy.kern.rbf_ARD(Q) + GPy.kern.white(Q)
+m = GPy.models.Bayesian_GPLVM(Y, Q, kernel = k, M = 12)
+m.constrain_positive('(rbf|bias|S|white|noise)')
+# m.constrain_bounded('white', 1e-6, 100.0)
+# m.constrain_bounded('noise', 1e-4, 1000.0)
 
-plot_oil(m1.X, np.array([1,1]), labels, 'PCA initialization')
+plot_oil(m.X, np.array([1,1]), labels, 'PCA initialization')
 # m.optimize(messages = True)
-m1.optimize('bfgs', messages = True)
-plot_oil(m1.X, np.array([1,1]), labels, 'sparse GPLVM')
+m.optimize('tnc', messages = True)
+plot_oil(m.X, m.kern.parts[0].lengthscales, labels, 'B-GPLVM')
 # pb.figure()
 # m.plot()
 # pb.title('PCA initialisation')
 
@@ -47,7 +49,7 @@ plot_oil(m1.X, np.array([1,1]), labels, 'sparse GPLVM')
 # m.optimize(messages = 1)
 # m.plot()
 # pb.title('After optimisation')
-m = GPy.models.GPLVM(Y, Q)
-m.constrain_positive('(white|rbf|bias|noise)')
-m.optimize()
-plot_oil(m.X, np.array([1,1]), labels, 'GPLVM')
+# m = GPy.models.GPLVM(Y, Q)
+# m.constrain_positive('(white|rbf|bias|noise)')
+# m.optimize()
+# plot_oil(m.X, np.array([1,1]), labels, 'GPLVM')
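Note on the demo: since it now drives the new Bayesian_GPLVM end to end, a cheap sanity test before the full 'tnc' run is a numerical gradient check on a small subsample. A minimal sketch, assuming the usual GPy model.checkgrad() helper is inherited by Bayesian_GPLVM (the subsample size and M=5 are arbitrary choices, not part of this patch):

    Y_small = Y[:50]                                   # small subsample keeps the check fast
    k_small = GPy.kern.rbf_ARD(Q) + GPy.kern.white(Q)
    m_small = GPy.models.Bayesian_GPLVM(Y_small, Q, kernel=k_small, M=5)
    m_small.constrain_positive('(rbf|bias|S|white|noise)')
    m_small.checkgrad()                                # compares analytic vs numerical gradients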
diff --git a/GPy/kern/rbf.py b/GPy/kern/rbf.py
index 0dcd3ea1..c22e68c7 100644
--- a/GPy/kern/rbf.py
+++ b/GPy/kern/rbf.py
@@ -95,7 +95,7 @@ class rbf(kernpart):
         target += self.variance
 
     def dpsi0_dtheta(self,partial,Z,mu,S,target):
-        target[0] += 1.
+        target[0] += np.sum(partial)
 
     def dpsi0_dmuS(self,partial,Z,mu,S,target_mu,target_S):
         pass
@@ -113,7 +113,9 @@ class rbf(kernpart):
 
     def dpsi1_dZ(self,partial,Z,mu,S,target):
         self._psi_computations(Z,mu,S)
-        target += np.sum(partial[:,:,None]*-self._psi1[:,:,None]*self._psi1_dist/self.lengthscale2/self._psi1_denom,0)
+        denominator = self.lengthscale2 * self._psi1_denom
+        dpsi1_dZ = -self._psi1[:,:,None] * (self._psi1_dist / denominator)
+        target += np.sum(partial.T[:,:,None] * dpsi1_dZ, 0)
 
     def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S):
         self._psi_computations(Z,mu,S)
@@ -132,13 +134,14 @@ class rbf(kernpart):
         d_length = self._psi2[:,:,:,None]*(0.5*self._psi2_Zdist_sq*self._psi2_denom + 2.*self._psi2_mudist_sq + 2.*S[:,None,None,:]/self.lengthscale2)/(self.lengthscale*self._psi2_denom)
         d_length = d_length.sum(0)
         target[0] += np.sum(partial*d_var)
-        target[1] += np.sum(d_length*partial)
+        target[1] += np.sum(d_length*partial[:,:,None])
 
     def dpsi2_dZ(self,partial,Z,mu,S,target):
-        """Returns shape N,M,M,Q"""
         self._psi_computations(Z,mu,S)
-        dZ = self._psi2[:,:,:,None]/self.lengthscale2*(-0.5*self._psi2_Zdist + self._psi2_mudist/self._psi2_denom)
-        target += np.sum(partial[None,:,:,None]*dZ,0).sum(1)
+        term1 = 0.5*self._psi2_Zdist/self.lengthscale2 # M, M, Q
+        term2 = self._psi2_mudist/self._psi2_denom/self.lengthscale2 # N, M, M, Q
+        dZ = self._psi2[:,:,:,None] * (term1[None] + term2)
+        target += (partial[None,:,:,None]*dZ).sum(0).sum(0)
 
     def dpsi2_dmuS(self,partial,Z,mu,S,target_mu,target_S):
         """Think N,M,M,Q """
@@ -175,4 +178,3 @@ class rbf(kernpart):
         self._psi2 = np.square(self.variance)*np.exp(self._psi2_exponent) # N,M,M
 
         self._Z, self._mu, self._S = Z, mu,S
-
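The dpsi0_dtheta change above follows from the chain rule: for the RBF kernel psi0_n = variance for every data point n, so dL/dvariance = sum_n partial_n * dpsi0_n/dvariance = sum(partial); the old `target[0] += 1.` dropped the partial factors entirely. A self-contained finite-difference check of this identity (toy shapes, `eps` arbitrary):

    import numpy as np
    N, variance, eps = 4, 0.7, 1e-6
    partial = np.random.randn(N)                    # dL/dpsi0_n, as supplied by the model
    L = lambda v: np.dot(partial, v * np.ones(N))   # psi0_n = variance for each n
    fd = (L(variance + eps) - L(variance - eps)) / (2 * eps)
    assert np.allclose(fd, np.sum(partial))         # matches target[0] += np.sum(partial)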
diff --git a/GPy/kern/rbf_ARD.py b/GPy/kern/rbf_ARD.py
index 7a5a3368..79c6ff58 100644
--- a/GPy/kern/rbf_ARD.py
+++ b/GPy/kern/rbf_ARD.py
@@ -74,7 +74,7 @@ class rbf_ARD(kernpart):
         target += self.variance
 
     def dpsi0_dtheta(self,partial,Z,mu,S,target):
-        target[0] += 1.
+        target[0] += np.sum(partial)
 
     def dpsi0_dmuS(self,partial,Z,mu,S,target_mu,target_S):
         pass
@@ -93,7 +93,9 @@ class rbf_ARD(kernpart):
     def dpsi1_dZ(self,partial,Z,mu,S,target):
         self._psi_computations(Z,mu,S)
         # np.add(target,-self._psi1[:,:,None]*self._psi1_dist/self.lengthscales2/self._psi1_denom,target)
-        target += np.sum(partial[:,:,None]*-self._psi1[:,:,None]*self._psi1_dist/self.lengthscales2/self._psi1_denom,0)
+        denominator = self.lengthscales2 * self._psi1_denom
+        dpsi1_dZ = -self._psi1[:,:,None] * (self._psi1_dist / denominator)
+        target += np.sum(partial.T[:,:,None] * dpsi1_dZ, 0)
 
     def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S):
         """return shapes are N,M,Q"""
@@ -118,8 +120,10 @@ class rbf_ARD(kernpart):
     def dpsi2_dZ(self,partial,Z,mu,S,target):
         """Returns shape N,M,M,Q"""
         self._psi_computations(Z,mu,S)
-        dZ = self._psi2[:,:,:,None]/self.lengthscales2*(-0.5*self._psi2_Zdist + self._psi2_mudist/self._psi2_denom)
-        target += np.sum(partial[None,:,:,None]*dZ,0).sum(1)
+        term1 = 0.5*self._psi2_Zdist/self.lengthscales2 # M, M, Q
+        term2 = self._psi2_mudist/self._psi2_denom/self.lengthscales2 # N, M, M, Q
+        dZ = self._psi2[:,:,:,None] * (term1[None] + term2)
+        target += (partial[None,:,:,None]*dZ).sum(0).sum(0)
 
     def dpsi2_dmuS(self,partial,Z,mu,S,target_mu,target_S):
         """Think N,M,M,Q """
@@ -247,5 +251,3 @@ if __name__=='__main__':
         return f,df
     print "psi2_theta_test"
     checkgrad(psi2_theta_test,np.random.rand(1+Q),args=(k,))
-
-
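The tests in this file's __main__ block all follow the same pattern: each test function returns (objective, analytic gradient) and is passed to a checkgrad routine together with a starting point. For reference, a minimal standalone version of that pattern (the helper body and tolerances here are an illustrative sketch, not the repository's implementation):

    import numpy as np
    def checkgrad(f_df, x0, args=(), eps=1e-6, tol=1e-4):
        # compare the analytic gradient against central finite differences
        f0, df = f_df(x0, *args)
        df_fd = np.zeros_like(x0)
        for i in range(x0.size):
            step = np.zeros_like(x0)
            step[i] = eps
            df_fd[i] = (f_df(x0 + step, *args)[0] - f_df(x0 - step, *args)[0]) / (2. * eps)
        return np.allclose(df, df_fd, rtol=tol, atol=tol)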
diff --git a/GPy/models/BGPLVM.py b/GPy/models/BGPLVM.py
new file mode 100644
index 00000000..2c760ec3
--- /dev/null
+++ b/GPy/models/BGPLVM.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import pylab as pb
+import sys, pdb
+from GPLVM import GPLVM
+from sparse_GP_regression import sparse_GP_regression
+from GPy.util.linalg import pdinv
+
+class Bayesian_GPLVM(sparse_GP_regression, GPLVM):
+    """
+    Bayesian Gaussian Process Latent Variable Model
+
+    :param Y: observed data
+    :type Y: np.ndarray
+    :param Q: latent dimensionality
+    :type Q: int
+    :param init: initialisation method for the latent space
+    :type init: 'PCA'|'random'
+
+    """
+    def __init__(self, Y, Q, init='PCA', **kwargs):
+        X = self.initialise_latent(init, Q, Y)
+        S = np.ones_like(X) * 1e-2
+        sparse_GP_regression.__init__(self, X, Y, X_uncertainty=S, **kwargs)
+
+    def get_param_names(self):
+        X_names = sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
+        S_names = sum([['S_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[])
+        return (X_names + S_names + sparse_GP_regression.get_param_names(self))
+
+    def get_param(self):
+        """
+        Horizontally stacks the parameters in order to present them to the optimizer.
+        The resulting 1-D array has this structure:
+
+        ===============================================================
+        |       mu       |        S        |    Z    | beta |  theta  |
+        ===============================================================
+
+        """
+        return np.hstack((self.X.flatten(), self.X_uncertainty.flatten(), sparse_GP_regression.get_param(self)))
+
+    def set_param(self,x):
+        N, Q = self.N, self.Q
+        self.X = x[:self.X.size].reshape(N,Q).copy()
+        self.X_uncertainty = x[(N*Q):(2*N*Q)].reshape(N,Q).copy()
+        sparse_GP_regression.set_param(self, x[(2*N*Q):])
+
+    def dL_dmuS(self):
+        dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.dL_dpsi0,self.Z,self.X,self.X_uncertainty)
+        dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.dL_dpsi1,self.Z,self.X,self.X_uncertainty)
+        dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.dL_dpsi2,self.Z,self.X,self.X_uncertainty)
+        dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2
+        dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2
+
+        return np.hstack((dL_dmu.flatten(), dL_dS.flatten()))
+
+
+    def log_likelihood_gradients(self):
+        return np.hstack((self.dL_dmuS().flatten(), sparse_GP_regression.log_likelihood_gradients(self)))
+
+    def plot(self):
+        GPLVM.plot(self)
+        # passing Z without a small amount of jitter will induce the white kernel where we don't want it!
+        mu, var = sparse_GP_regression.predict(self, self.Z + np.random.randn(*self.Z.shape)*0.0001)
+        pb.plot(mu[:, 0], mu[:, 1], 'ko')
diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py
index 692d77d1..8caf38b1 100644
--- a/GPy/models/sparse_GP_regression.py
+++ b/GPy/models/sparse_GP_regression.py
@@ -37,7 +37,7 @@ class sparse_GP_regression(GP_regression):
     """
     def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
-        self.scale_factor = 10.0
+        self.scale_factor = 1000.0
        self.beta = beta
         if Z is None:
             self.Z = np.random.permutation(X.copy())[:M]
@@ -70,7 +70,6 @@ class sparse_GP_regression(GP_regression):
             self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
             self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
             self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
-            # raise NotImplementedError, "scale psi2 (in kern?)"
             self.psi2_beta_scaled = self.psi2*(self.beta/self.scale_factor**2)
         else:
             self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
@@ -163,10 +162,10 @@ class sparse_GP_regression(GP_regression):
         """
         The derivative of the bound wrt the inducing inputs Z
         """
-        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
+        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z) # factor of two because of vertical and horizontal 'stripes' in dKmm_dZ
         if self.has_uncertain_inputs:
-            dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
-            dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty)
+            dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_uncertainty)
+            dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # 'stripes'
         else:
             #re-cast computations in psi2 back to psi1:
             dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
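On the new factor of 2 in the dpsi2_dZ term: psi2[n,m,m'] is symmetric in its two inducing-point indices, so each Z_m enters the bound through both index positions, exactly as with the vertical and horizontal 'stripes' of dKmm_dZ noted in the comment above. A toy numpy illustration of the pattern, using a symmetric psi2-like matrix (the function A, shapes, and names are illustrative only):

    import numpy as np
    M, eps = 3, 1e-6
    Z = np.random.randn(M)
    A = lambda Z: np.exp(-np.subtract.outer(Z, Z)**2)   # A[m,mp] symmetric in (m,mp), like psi2
    partial = np.random.randn(M, M)
    partial = 0.5 * (partial + partial.T)               # dL/dA, symmetric like dL_dpsi2
    dA_dZ0 = -2. * np.subtract.outer(Z, Z) * A(Z)       # derivative wrt the first index only
    dL_dZ = 2. * (partial * dA_dZ0).sum(1)              # factor 2: Z_m also enters via the second index
    fd = np.zeros(M)                                    # central finite differences for comparison
    for i in range(M):
        step = np.zeros(M)
        step[i] = eps
        fd[i] = ((partial * A(Z + step)).sum() - (partial * A(Z - step)).sum()) / (2. * eps)
    assert np.allclose(dL_dZ, fd)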