diff --git a/GPy/models/gp_var_gauss.py b/GPy/models/gp_var_gauss.py
index cd688360..ccaab305 100644
--- a/GPy/models/gp_var_gauss.py
+++ b/GPy/models/gp_var_gauss.py
@@ -15,7 +15,7 @@ log_2_pi = np.log(2*np.pi)
 
 class GPVariationalGaussianApproximation(Model):
     """
-    The Variational Gaussian Approximation revisited implementation for regression
+    The Variational Gaussian Approximation revisited
 
     @article{Opper:2009,
         title = {The Variational Gaussian Approximation Revisited},
@@ -25,44 +25,27 @@ class GPVariationalGaussianApproximation(Model):
         pages = {786--792},
     }
     """
-    def __init__(self, X, Y, kernel=None):
+    def __init__(self, X, Y, kernel, likelihood,Y_metadata=None):  # kernel and likelihood are required; no default kernel is built here
         Model.__init__(self,'Variational GP classification')
         # accept the construction arguments
         self.X = ObsAr(X)
-        if kernel is None:
-            kernel = kern.RBF(X.shape[1]) + kern.White(X.shape[1], 0.01)
-        self.kern = kernel
-        self.link_parameter(self.kern)
+        self.Y = Y  # stored as given; handed to likelihood.variational_expectations in parameters_changed
         self.num_data, self.input_dim = self.X.shape
+        self.Y_metadata = Y_metadata  # forwarded unchanged to the likelihood in parameters_changed
 
-        self.alpha = Param('alpha', np.zeros(self.num_data))
+        self.kern = kernel
+        self.likelihood = likelihood
+        self.link_parameter(self.kern)  # kernel gradients are set via update_gradients_full in parameters_changed
+        self.link_parameter(self.likelihood)  # likelihood.gradient is assigned in parameters_changed
+
+        self.alpha = Param('alpha', np.zeros((self.num_data,1))) # column of shape (num_data, 1): only one latent function for now.
         self.beta = Param('beta', np.ones(self.num_data))
         self.link_parameter(self.alpha)
         self.link_parameter(self.beta)
 
-        self.gh_x, self.gh_w = np.polynomial.hermite.hermgauss(20)
-        self.Ysign = np.where(Y==1, 1, -1).flatten()
-
     def log_likelihood(self):
-        """
-        Marginal log likelihood evaluation
-        """
         return self._log_lik
 
-    def likelihood_quadrature(self, m, v):
-        """
-        Perform Gauss-Hermite quadrature over the log of the likelihood, with a fixed weight
-        """
-        # assume probit for now.
-        X = self.gh_x[None, :]*np.sqrt(2.*v[:, None]) + (m*self.Ysign)[:, None]
-        p = stats.norm.cdf(X)
-        N = stats.norm.pdf(X)
-        F = np.log(p).dot(self.gh_w)
-        NoverP = N/p
-        dF_dm = (NoverP*self.Ysign[:,None]).dot(self.gh_w)
-        dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(self.gh_w)
-        return F, dF_dm, dF_dv
-
     def parameters_changed(self):
         K = self.kern.K(self.X)
         m = K.dot(self.alpha)
@@ -71,13 +54,14 @@ class GPVariationalGaussianApproximation(Model):
         A = np.eye(self.num_data) + BKB
         Ai, LA, _, Alogdet = pdinv(A)
         Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :]  # posterior coavairance: need full matrix for gradients
-        var = np.diag(Sigma)
+        var = np.diag(Sigma).reshape(-1,1)  # column, same 2-D layout as m = K.dot(alpha)
 
-        F, dF_dm, dF_dv = self.likelihood_quadrature(m, var)
+        F, dF_dm, dF_dv, dF_dthetaL = self.likelihood.variational_expectations(self.Y, m, var, Y_metadata=self.Y_metadata)
+        self.likelihood.gradient = dF_dthetaL.sum(1).sum(1)  # collapse axes 1 and 2, leaving one entry per leading index
         dF_da = np.dot(K, dF_dm)
         SigmaB = Sigma*self.beta
-        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv)).dot(SigmaB))*2
-        KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + m.dot(self.alpha))
+        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv.flatten())).dot(SigmaB))*2  # np.diag needs a 1-D vector to build the diagonal matrix
+        KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + np.sum(m*self.alpha))  # elementwise product + sum: m and alpha are both 2-D here
         dKL_da = m
         A_A2 = Ai - Ai.dot(Ai)
         dKL_db = np.diag(np.dot(KB.T, A_A2))
@@ -86,12 +70,12 @@ class GPVariationalGaussianApproximation(Model):
         self.beta.gradient = dF_db - dKL_db
 
         # K-gradients
-        dKL_dK = 0.5*(self.alpha[None, :]*self.alpha[:, None] + self.beta[:, None]*self.beta[None, :]*A_A2)
+        dKL_dK = 0.5*(self.alpha*self.alpha.T + self.beta[:, None]*self.beta[None, :]*A_A2)  # alpha*alpha.T is the outer product of the column with itself
         tmp = Ai*self.beta[:, None]/self.beta[None, :]
-        dF_dK = self.alpha[:, None]*dF_dm[None, :] + np.dot(tmp*dF_dv, tmp.T)
+        dF_dK = self.alpha*dF_dm.T + np.dot(tmp*dF_dv.T, tmp.T)  # dF_dv.T scales the columns of tmp (sum_n tmp[a,n]*dF_dv[n]*tmp[b,n]); a bare column dF_dv would scale rows instead
         self.kern.update_gradients_full(dF_dK - dKL_dK, self.X)
 
-    def predict(self, Xnew):
+    def _raw_predict(self, Xnew):  # returns the (mu, var) pair directly; no link function is applied in this method
         """
         Predict the function(s) at the new point(s) Xnew.
 
@@ -105,4 +89,4 @@ class GPVariationalGaussianApproximation(Model):
         Kxx = self.kern.Kdiag(Xnew)
         var = Kxx - np.sum(WiKux*Kux, 0)
 
-        return 0.5*(1+erf(mu/np.sqrt(2.*(var+1))))
+        return mu, var.reshape(-1,1)  # var is reshaped to a single column