diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 898c7b58..5ca86d9e 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -244,7 +244,7 @@ class GP(Model):
             mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)
 
         # now push through likelihood
-        mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata)
+        mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata)
         return mean, var
 
     def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
@@ -261,7 +261,7 @@ class GP(Model):
         m, v = self._raw_predict(X, full_cov=False)
         if self.normalizer is not None:
             m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
-        return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
+        return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)
 
     def predictive_gradients(self, Xnew):
         """
@@ -331,7 +331,7 @@ class GP(Model):
         :returns: Ysim: set of simulations, a Numpy array (N x samples).
         """
         fsim = self.posterior_samples_f(X, size, full_cov=full_cov)
-        Ysim = self.likelihood.samples(fsim, Y_metadata)
+        Ysim = self.likelihood.samples(fsim, Y_metadata=Y_metadata)
         return Ysim
 
     def plot_f(self, plot_limits=None, which_data_rows='all',
@@ -473,16 +473,16 @@ class GP(Model):
             self.inference_method.on_optimization_end()
             raise
 
-    def infer_newX(self, Y_new, optimize=True, ):
+    def infer_newX(self, Y_new, optimize=True):
         """
-        Infer the distribution of X for the new observed data *Y_new*.
+        Infer X for the new observed data *Y_new*.
 
         :param Y_new: the new observed data for inference
         :type Y_new: numpy.ndarray
         :param optimize: whether to optimize the location of new X (True by default)
         :type optimize: boolean
         :return: a tuple containing the posterior estimation of X and the model that optimize X
-        :rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` or numpy.ndarray, :class:`~GPy.core.model.Model`)
+        :rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` and numpy.ndarray, :class:`~GPy.core.model.Model`)
         """
         from ..inference.latent_function_inference.inferenceX import infer_newX
         return infer_newX(self, Y_new, optimize=optimize)
diff --git a/GPy/core/parameterization/priors.py b/GPy/core/parameterization/priors.py
index c748f0df..83c83dfd 100644
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@@ -522,16 +522,9 @@ class DGPLVM(Prior):
 
     """
     domain = _REAL
-    # _instances = []
-    # def __new__(cls, mu, sigma): # Singleton:
-    #     if cls._instances:
-    #         cls._instances[:] = [instance for instance in cls._instances if instance()]
-    #         for instance in cls._instances:
-    #             if instance().mu == mu and instance().sigma == sigma:
-    #                 return instance()
-    #     o = super(Prior, cls).__new__(cls, mu, sigma)
-    #     cls._instances.append(weakref.ref(o))
-    #     return cls._instances[-1]()
+
+    def __new__(cls, sigma2, lbl, x_shape):
+        return super(Prior, cls).__new__(cls, sigma2, lbl, x_shape)
 
     def __init__(self, sigma2, lbl, x_shape):
         self.sigma2 = sigma2
@@ -843,7 +836,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
 
     # Calculating beta and Bi for Sb
     def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all):
-        import pdb
+        # import pdb
         # pdb.set_trace()
         B_i = np.zeros((self.classnum, self.dim))
         Sig_beta_B_i_all = np.zeros((self.datanum, self.dim))
@@ -909,8 +902,8 @@ class DGPLVM_Lamda(Prior, Parameterized):
         Sw = self.compute_Sw(cls, M_i)
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
         #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
         return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
 
     # This function calculates derivative of the log of prior function
@@ -933,8 +926,8 @@ class DGPLVM_Lamda(Prior, Parameterized):
         # Calculating inverse of Sb and its transpose and minus
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
         #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
         Sb_inv_N_trans = np.transpose(Sb_inv_N)
         Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
         Sw_trans = np.transpose(Sw)
diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index d4518f24..3aa022e0 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -217,9 +217,8 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
     elif model_type == 'FITC':
         m = GPy.models.FITCClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
         m['.*len'] = 3.
-
     if optimize:
-        m.pseudo_EM()
+        m.optimize()
 
     if plot:
         m.plot()
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index cccb2496..745c2c24 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -215,6 +215,7 @@ def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40
     return m
 
 def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
+    """Simulate some data drawn from a matern covariance and a periodic exponential for use in MRD demos."""
     Q_signal = 4
     import GPy
     import numpy as np
@@ -254,6 +255,7 @@ def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
     return slist, [S1, S2, S3], Ylist
 
 def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
+    """Simulate some data drawn from sine and cosine for use in demos of MRD"""
    _np.random.seed(1234)
 
     x = _np.linspace(0, 4 * _np.pi, N)[:, None]
@@ -402,7 +404,8 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
     from GPy.models import MRD
 
     D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
-    _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
+    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim)
+    # Ylist = [Ylist[0]]
 
     k = kern.Linear(Q, ARD=True)
 
@@ -585,6 +588,7 @@ def robot_wireless(optimize=True, verbose=True, plot=True):
     return m
 
 def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
+    """Interactive visualisation of the Stick Man data from Ohio State University with the Bayesian GPLVM."""
     from GPy.models import BayesianGPLVM
     from matplotlib import pyplot as plt
     import numpy as np
@@ -613,7 +617,8 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
         data_show = GPy.plotting.matplot_dep.visualize.stick_show(y, connect=data['connect'])
         dim_select = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
         fig.canvas.draw()
-        fig.canvas.show()
+        # Canvas.show doesn't work on OSX.
+        #fig.canvas.show()
         raw_input('Press enter to finish')
     return m
 
diff --git a/GPy/inference/latent_function_inference/inferenceX.py b/GPy/inference/latent_function_inference/inferenceX.py
index a8ed2d09..19013b06 100644
--- a/GPy/inference/latent_function_inference/inferenceX.py
+++ b/GPy/inference/latent_function_inference/inferenceX.py
@@ -27,12 +27,19 @@ def infer_newX(model, Y_new, optimize=True, init='L2'):
 
 class InferenceX(Model):
     """
-    The class for inference of new X with given new Y. (do_test_latent)
+    The model class for inference of new X with given new Y. (replacing the "do_test_latent" in Bayesian GPLVM)
+    It is a tiny inference model created from the original GP model. The kernel, likelihood (only Gaussian is supported at the moment)
+    and posterior distribution are taken from the original model.
+    For Regression models and GPLVM, a point estimate of the latent variable X will be inferred.
+    For Bayesian GPLVM, the variational posterior of X will be inferred.
+    X is inferred through a gradient optimization of the inference model.
 
     :param model: the GPy model used in inference
     :type model: GPy.core.Model
     :param Y: the new observed data for inference
     :type Y: numpy.ndarray
+    :param init: the distance metric of Y for initializing X with the nearest neighbour.
+    :type init: 'L2', 'NCC' and 'rand'
     """
     def __init__(self, model, Y, name='inferenceX', init='L2'):
         if np.isnan(Y).any() or getattr(model, 'missing_data', False):
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index aefc82ac..902d5fff 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -139,10 +139,6 @@ class Laplace(LatentFunctionInference):
 
         f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
 
-        self.f_hat = f_hat
-        #self.Ki_fhat = Ki_fhat
-        #self.K = K.copy()
-
         #Compute hessian and other variables at mode
         log_marginal, woodbury_inv, dL_dK, dL_dthetaL = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, kern, Y_metadata)
 
@@ -298,6 +294,11 @@ class Laplace(LatentFunctionInference):
         else:
             dL_dthetaL = np.zeros(likelihood.size)
 
+        #Cache some things for speedy LOO
+        self.Ki_W_i = Ki_W_i
+        self.K = K
+        self.W = W
+        self.f_hat = f_hat
         return log_marginal, K_Wi_i, dL_dK, dL_dthetaL
 
     def _compute_B_statistics(self, K, W, log_concave, *args, **kwargs):
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index d41468ab..67a6a3a3 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -298,13 +298,8 @@ class Likelihood(Parameterized):
             return self.conditional_mean(f)*p
         scaled_mean = [quad(int_mean, fmin, fmax,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
         mean = np.array(scaled_mean)[:,None] / np.sqrt(2*np.pi*(variance))
-
         return mean
 
-    def _conditional_mean(self, f):
-        """Quadrature calculation of the conditional mean: E(Y_star|f)"""
-        raise NotImplementedError("implement this function to make predictions")
-
     def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
         """
         Approximation to the predictive variance: V(Y_star)
@@ -608,23 +603,30 @@ class Likelihood(Parameterized):
         :param full_cov: whether to use the full covariance or just the diagonal
         :type full_cov: Boolean
         """
-
-        pred_mean = self.predictive_mean(mu, var, Y_metadata)
-        pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata)
+        try:
+            pred_mean = self.predictive_mean(mu, var, Y_metadata=Y_metadata)
+            pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata=Y_metadata)
+        except NotImplementedError:
+            print "Finding predictive mean and variance via sampling rather than quadrature"
+            Nf_samp = 300
+            Ny_samp = 1
+            s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
+            ss_y = self.samples(s, Y_metadata, samples=Ny_samp)
+            pred_mean = np.mean(ss_y, axis=1)[:, None]
+            pred_var = np.var(ss_y, axis=1)[:, None]
         return pred_mean, pred_var
 
     def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
         #compute the quantiles by sampling!!!
-        N_samp = 500
-        s = np.random.randn(mu.shape[0], N_samp)*np.sqrt(var) + mu
-        #ss_f = s.flatten()
-        #ss_y = self.samples(ss_f, Y_metadata)
-        #ss_y = self.samples(s, Y_metadata, samples=100)
-        ss_y = self.samples(s, Y_metadata)
-        #ss_y = ss_y.reshape(mu.shape[0], N_samp)
+        Nf_samp = 300
+        Ny_samp = 1
+        s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
+        ss_y = self.samples(s, Y_metadata, samples=Ny_samp)
+        #ss_y = ss_y.reshape(mu.shape[0], mu.shape[1], Nf_samp*Ny_samp)
 
-        return [np.percentile(ss_y ,q, axis=1)[:,None] for q in quantiles]
+        pred_quantiles = [np.percentile(ss_y, q, axis=1)[:,None] for q in quantiles]
+        return pred_quantiles
 
     def samples(self, gp, Y_metadata=None, samples=1):
         """
diff --git a/GPy/models/gp_var_gauss.py b/GPy/models/gp_var_gauss.py
index cd688360..729b6bb8 100644
--- a/GPy/models/gp_var_gauss.py
+++ b/GPy/models/gp_var_gauss.py
@@ -9,13 +9,14 @@ from ..core.parameterization import ObsAr
 from .. import kern
 from ..core.parameterization.param import Param
 from ..util.linalg import pdinv
+from ..likelihoods import Gaussian
 
 log_2_pi = np.log(2*np.pi)
 
 
 class GPVariationalGaussianApproximation(Model):
     """
-    The Variational Gaussian Approximation revisited implementation for regression
+    The Variational Gaussian Approximation revisited
 
     @article{Opper:2009,
         title = {The Variational Gaussian Approximation Revisited},
@@ -25,44 +26,29 @@ class GPVariationalGaussianApproximation(Model):
         pages = {786--792},
     }
     """
-    def __init__(self, X, Y, kernel=None):
-        Model.__init__(self,'Variational GP classification')
+    def __init__(self, X, Y, kernel, likelihood=None, Y_metadata=None):
+        Model.__init__(self,'Variational GP')
+        if likelihood is None:
+            likelihood = Gaussian()
 
         # accept the construction arguments
         self.X = ObsAr(X)
-        if kernel is None:
-            kernel = kern.RBF(X.shape[1]) + kern.White(X.shape[1], 0.01)
-        self.kern = kernel
-        self.link_parameter(self.kern)
 
+        self.Y = Y
         self.num_data, self.input_dim = self.X.shape
+        self.Y_metadata = Y_metadata
 
-        self.alpha = Param('alpha', np.zeros(self.num_data))
+        self.kern = kernel
+        self.likelihood = likelihood
+        self.link_parameter(self.kern)
+        self.link_parameter(self.likelihood)
+
+        self.alpha = Param('alpha', np.zeros((self.num_data,1))) # only one latent fn for now.
         self.beta = Param('beta', np.ones(self.num_data))
         self.link_parameter(self.alpha)
         self.link_parameter(self.beta)
 
-        self.gh_x, self.gh_w = np.polynomial.hermite.hermgauss(20)
-        self.Ysign = np.where(Y==1, 1, -1).flatten()
 
     def log_likelihood(self):
-        """
-        Marginal log likelihood evaluation
-        """
         return self._log_lik
 
-    def likelihood_quadrature(self, m, v):
-        """
-        Perform Gauss-Hermite quadrature over the log of the likelihood, with a fixed weight
-        """
-        # assume probit for now.
-        X = self.gh_x[None, :]*np.sqrt(2.*v[:, None]) + (m*self.Ysign)[:, None]
-        p = stats.norm.cdf(X)
-        N = stats.norm.pdf(X)
-        F = np.log(p).dot(self.gh_w)
-        NoverP = N/p
-        dF_dm = (NoverP*self.Ysign[:,None]).dot(self.gh_w)
-        dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(self.gh_w)
-        return F, dF_dm, dF_dv
-
     def parameters_changed(self):
         K = self.kern.K(self.X)
         m = K.dot(self.alpha)
@@ -71,13 +57,14 @@ class GPVariationalGaussianApproximation(Model):
         A = np.eye(self.num_data) + BKB
         Ai, LA, _, Alogdet = pdinv(A)
         Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :] # posterior coavairance: need full matrix for gradients
-        var = np.diag(Sigma)
+        var = np.diag(Sigma).reshape(-1,1)
 
-        F, dF_dm, dF_dv = self.likelihood_quadrature(m, var)
+        F, dF_dm, dF_dv, dF_dthetaL = self.likelihood.variational_expectations(self.Y, m, var, Y_metadata=self.Y_metadata)
+        self.likelihood.gradient = dF_dthetaL.sum(1).sum(1)
         dF_da = np.dot(K, dF_dm)
         SigmaB = Sigma*self.beta
-        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv)).dot(SigmaB))*2
-        KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + m.dot(self.alpha))
+        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv.flatten())).dot(SigmaB))*2
+        KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + np.sum(m*self.alpha))
         dKL_da = m
         A_A2 = Ai - Ai.dot(Ai)
         dKL_db = np.diag(np.dot(KB.T, A_A2))
@@ -86,12 +73,12 @@ class GPVariationalGaussianApproximation(Model):
         self.beta.gradient = dF_db - dKL_db
 
         # K-gradients
-        dKL_dK = 0.5*(self.alpha[None, :]*self.alpha[:, None] + self.beta[:, None]*self.beta[None, :]*A_A2)
+        dKL_dK = 0.5*(self.alpha*self.alpha.T + self.beta[:, None]*self.beta[None, :]*A_A2)
         tmp = Ai*self.beta[:, None]/self.beta[None, :]
-        dF_dK = self.alpha[:, None]*dF_dm[None, :] + np.dot(tmp*dF_dv, tmp.T)
+        dF_dK = self.alpha*dF_dm.T + np.dot(tmp*dF_dv, tmp.T)
         self.kern.update_gradients_full(dF_dK - dKL_dK, self.X)
 
-    def predict(self, Xnew):
+    def _raw_predict(self, Xnew):
         """
         Predict the function(s) at the new point(s) Xnew.
 
@@ -105,4 +92,4 @@ class GPVariationalGaussianApproximation(Model):
         Kxx = self.kern.Kdiag(Xnew)
         var = Kxx - np.sum(WiKux*Kux, 0)
 
-        return 0.5*(1+erf(mu/np.sqrt(2.*(var+1))))
+        return mu, var.reshape(-1,1)
diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py
index 77c42825..9f841372 100644
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@@ -107,11 +107,13 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
             upper = m + 2*np.sqrt(v)
         else:
             if isinstance(model,GPCoregionalizedRegression) or isinstance(model,SparseGPCoregionalizedRegression):
-                meta = {'output_index': Xgrid[:,-1:].astype(np.int)}
-            else:
-                meta = None
-            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=meta, **predict_kw)
-            lower, upper = model.predict_quantiles(Xgrid, Y_metadata=meta)
+                extra_data = Xgrid[:,-1:].astype(np.int)
+                if Y_metadata is None:
+                    Y_metadata = {'output_index': extra_data}
+                else:
+                    Y_metadata['output_index'] = extra_data
+            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
+            lower, upper = model.predict_quantiles(Xgrid, Y_metadata=Y_metadata)
 
         for d in which_data_ycols:
 
@@ -120,7 +122,9 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
         #optionally plot some samples
         if samples: #NOTE not tested with fixed_inputs
-            Ysim = model.posterior_samples(Xgrid, samples)
+            Ysim = model.posterior_samples(Xgrid, samples, Y_metadata=Y_metadata)
+            print Ysim.shape
+            print Xnew.shape
             for yi in Ysim.T:
                 plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
                 #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
@@ -185,10 +189,12 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
             m, _ = model._raw_predict(Xgrid, **predict_kw)
         else:
             if isinstance(model,GPCoregionalizedRegression) or isinstance(model,SparseGPCoregionalizedRegression):
-                meta = {'output_index': Xgrid[:,-1:].astype(np.int)}
-            else:
-                meta = None
-            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=meta, **predict_kw)
+                extra_data = Xgrid[:,-1:].astype(np.int)
+                if Y_metadata is None:
+                    Y_metadata = {'output_index': extra_data}
+                else:
+                    Y_metadata['output_index'] = extra_data
+            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
         for d in which_data_ycols:
             m_d = m[:,d].reshape(resolution, resolution).T
             plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
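
For reference, below is a minimal standalone sketch of the Monte Carlo fallback that Likelihood.predictive_values and Likelihood.predictive_quantiles above now share: draw Nf_samp latent-function values per test point from N(mu, var), push them through the likelihood, and take empirical moments and percentiles over the sample axis. The helper name sampled_quantiles and the probit-Bernoulli push-through (standing in for likelihood.samples) are illustrative assumptions, not part of this patch.

import numpy as np
from scipy.stats import norm

def sampled_quantiles(mu, var, quantiles=(2.5, 97.5), Nf_samp=300):
    # mu, var: (N, 1) arrays of latent-function means and variances at the test points
    # draw Nf_samp latent function samples per test point
    s = np.random.randn(mu.shape[0], Nf_samp) * np.sqrt(var) + mu
    # push the latent samples through an assumed probit-Bernoulli likelihood
    # (in the patch this step is likelihood.samples(s, Y_metadata, samples=Ny_samp))
    ss_y = np.random.binomial(1, norm.cdf(s))
    # empirical mean, variance and percentiles over the sample axis
    pred_mean = np.mean(ss_y, axis=1)[:, None]
    pred_var = np.var(ss_y, axis=1)[:, None]
    pred_quantiles = [np.percentile(ss_y, q, axis=1)[:, None] for q in quantiles]
    return pred_mean, pred_var, pred_quantiles

mu, var = np.zeros((5, 1)), np.ones((5, 1))
mean, variance, (lower, upper) = sampled_quantiles(mu, var)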