diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index aeee34ac..ddef0647 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -183,7 +183,7 @@ class GP(Model):
         """
         return self._log_marginal_likelihood
 
-    def _raw_predict(self, _Xnew, full_cov=False, kern=None):
+    def _raw_predict(self, Xnew, full_cov=False, kern=None):
         """
         For making predictions, does not account for normalization or likelihood
 
@@ -199,24 +199,30 @@ class GP(Model):
         if kern is None:
             kern = self.kern
 
-        Kx = kern.K(_Xnew, self.X).T
-        WiKx = np.dot(self.posterior.woodbury_inv, Kx)
+        Kx = kern.K(self.X, Xnew)
         mu = np.dot(Kx.T, self.posterior.woodbury_vector)
         if full_cov:
-            Kxx = kern.K(_Xnew)
-            var = Kxx - np.dot(Kx.T, WiKx)
+            Kxx = kern.K(Xnew)
+            if self.posterior.woodbury_inv.ndim == 2:
+                var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
+            elif self.posterior.woodbury_inv.ndim == 3:
+                var = np.empty((Kxx.shape[0], Kxx.shape[1], self.posterior.woodbury_inv.shape[2]))
+                for i in range(var.shape[2]):
+                    var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
+            var = var
         else:
-            Kxx = kern.Kdiag(_Xnew)
-            var = Kxx - np.sum(WiKx*Kx, 0)
-            var = var.reshape(-1, 1)
-            var[var<0.] = 0.
+            Kxx = kern.Kdiag(Xnew)
+            if self.posterior.woodbury_inv.ndim == 2:
+                var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:, None]
+            elif self.posterior.woodbury_inv.ndim == 3:
+                var = np.empty((Kxx.shape[0], self.posterior.woodbury_inv.shape[2]))
+                for i in range(var.shape[1]):
+                    var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
+            var = var
+        #add in the mean function
+        if self.mean_function is not None:
+            mu += self.mean_function.f(Xnew)
+
-        #force mu to be a column vector
-        if len(mu.shape)==1: mu = mu[:,None]
-
-        #add the mean function in
-        if not self.mean_function is None:
-            mu += self.mean_function.f(_Xnew)
         return mu, var
 
     def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py
index f927aba4..7bdd6ff2 100644
--- a/GPy/inference/latent_function_inference/__init__.py
+++ b/GPy/inference/latent_function_inference/__init__.py
@@ -69,7 +69,7 @@
 from .expectation_propagation_dtc import EPDTC
 from .dtc import DTC
 from .fitc import FITC
 from .var_dtc_parallel import VarDTC_minibatch
-#from .svgp import SVGP
+from .var_gauss import VarGauss
 # class FullLatentFunctionData(object):
 #
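
Note on the _raw_predict change above: when woodbury_inv is 3-dimensional (one slice per output), the diagonal-variance branch loops over outputs. The following is a standalone NumPy sketch, not part of the patch and not the GPy API (array names are stand-ins), checking that the patch's loop agrees with an equivalent einsum:

# Stand-in arrays, not the GPy API: verify the looped diagonal-variance
# computation for a 3-d woodbury_inv against a vectorised einsum.
import numpy as np

N, M, P = 5, 3, 2                       # training points, test points, outputs
rng = np.random.default_rng(0)
Kx = rng.normal(size=(N, M))            # stand-in for kern.K(self.X, Xnew)
Kxx = rng.normal(size=M) ** 2           # stand-in for kern.Kdiag(Xnew)
Wi = rng.normal(size=(N, N, P))         # stand-in for posterior.woodbury_inv

# looped form, as in the patch
var_loop = np.empty((M, P))
for i in range(P):
    var_loop[:, i] = Kxx - np.sum(np.dot(Wi[:, :, i].T, Kx) * Kx, 0)

# vectorised equivalent: var[m, p] = Kxx[m] - sum_{n,k} Kx[n,m] Wi[n,k,p] Kx[k,m]
var_einsum = Kxx[:, None] - np.einsum('nm,nkp,km->mp', Kx, Wi, Kx)
assert np.allclose(var_loop, var_einsum)
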
diff --git a/GPy/models/gp_var_gauss.py b/GPy/models/gp_var_gauss.py
index 9f2229f0..ca68ed14 100644
--- a/GPy/models/gp_var_gauss.py
+++ b/GPy/models/gp_var_gauss.py
@@ -2,19 +2,16 @@
 # Distributed under the terms of the GNU General public License, see LICENSE.txt
 
 import numpy as np
-from scipy import stats
-from scipy.special import erf
-from ..core.model import Model
+from ..core import GP
 from ..core.parameterization import ObsAr
 from .. import kern
 from ..core.parameterization.param import Param
-from ..util.linalg import pdinv
-from ..likelihoods import Gaussian
+from ..inference.latent_function_inference import VarGauss
 
 log_2_pi = np.log(2*np.pi)
 
-class GPVariationalGaussianApproximation(Model):
+class GPVariationalGaussianApproximation(GP):
     """
     The Variational Gaussian Approximation revisited
 
@@ -26,71 +23,15 @@ class GPVariationalGaussianApproximation(Model):
         pages = {786--792},
         }
     """
-    def __init__(self, X, Y, kernel, likelihood=None, Y_metadata=None):
-        Model.__init__(self,'Variational GP')
-        if likelihood is None:
-            likelihood = Gaussian()
-        # accept the construction arguments
-        self.X = ObsAr(X)
-        self.Y = Y
-        self.num_data, self.input_dim = self.X.shape
-        self.Y_metadata = Y_metadata
+    def __init__(self, X, Y, kernel, likelihood, Y_metadata=None):
 
-        self.kern = kernel
-        self.likelihood = likelihood
-        self.link_parameter(self.kern)
-        self.link_parameter(self.likelihood)
+        num_data = Y.shape[0]
+        self.alpha = Param('alpha', np.zeros((num_data,1))) # only one latent fn for now.
+        self.beta = Param('beta', np.ones(num_data))
+
+        inf = VarGauss(self.alpha, self.beta)
+        super(GPVariationalGaussianApproximation, self).__init__(X, Y, kernel, likelihood, name='VarGP', inference_method=inf)
 
-        self.alpha = Param('alpha', np.zeros((self.num_data,1))) # only one latent fn for now.
-        self.beta = Param('beta', np.ones(self.num_data))
         self.link_parameter(self.alpha)
         self.link_parameter(self.beta)
 
-    def log_likelihood(self):
-        return self._log_lik
-
-    def parameters_changed(self):
-        K = self.kern.K(self.X)
-        m = K.dot(self.alpha)
-        KB = K*self.beta[:, None]
-        BKB = KB*self.beta[None, :]
-        A = np.eye(self.num_data) + BKB
-        Ai, LA, _, Alogdet = pdinv(A)
-        Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :] # posterior coavairance: need full matrix for gradients
-        var = np.diag(Sigma).reshape(-1,1)
-
-        F, dF_dm, dF_dv, dF_dthetaL = self.likelihood.variational_expectations(self.Y, m, var, Y_metadata=self.Y_metadata)
-        if dF_dthetaL is not None:
-            self.likelihood.gradient = dF_dthetaL.sum(1).sum(1)
-        dF_da = np.dot(K, dF_dm)
-        SigmaB = Sigma*self.beta
-        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv.flatten())).dot(SigmaB))*2
-        KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + np.sum(m*self.alpha))
-        dKL_da = m
-        A_A2 = Ai - Ai.dot(Ai)
-        dKL_db = np.diag(np.dot(KB.T, A_A2))
-        self._log_lik = F.sum() - KL
-        self.alpha.gradient = dF_da - dKL_da
-        self.beta.gradient = dF_db - dKL_db
-
-        # K-gradients
-        dKL_dK = 0.5*(self.alpha*self.alpha.T + self.beta[:, None]*self.beta[None, :]*A_A2)
-        tmp = Ai*self.beta[:, None]/self.beta[None, :]
-        dF_dK = self.alpha*dF_dm.T + np.dot(tmp*dF_dv, tmp.T)
-        self.kern.update_gradients_full(dF_dK - dKL_dK, self.X)
-
-    def _raw_predict(self, Xnew):
-        """
-        Predict the function(s) at the new point(s) Xnew.
-
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        """
-        Wi, _, _, _ = pdinv(self.kern.K(self.X) + np.diag(self.beta**-2))
-        Kux = self.kern.K(self.X, Xnew)
-        mu = np.dot(Kux.T, self.alpha)
-        WiKux = np.dot(Wi, Kux)
-        Kxx = self.kern.Kdiag(Xnew)
-        var = Kxx - np.sum(WiKux*Kux, 0)
-
-        return mu, var.reshape(-1,1)
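
The parameters_changed and _raw_predict maths removed here moves into the VarGauss inference object; the model keeps only its variational parameters alpha and beta. For reference, a standalone sketch, not part of the patch, of the Opper and Archambeau covariance identity the removed code relied on: with B = diag(beta), Sigma = (K^-1 + B^2)^-1 equals diag(beta)^-2 - B^-1 A^-1 B^-1 where A = I + B K B, which avoids inverting K directly. Names below are stand-ins:

# Stand-in names, plain NumPy: check the stable A = I + B K B form of the
# posterior covariance against the direct (less stable) inverse.
import numpy as np

N = 6
rng = np.random.default_rng(1)
X = rng.normal(size=(N, 1))
K = np.exp(-0.5 * (X - X.T) ** 2) + 1e-6 * np.eye(N)   # toy RBF Gram matrix
beta = rng.uniform(0.5, 2.0, size=N)

A = np.eye(N) + beta[:, None] * K * beta[None, :]      # I + B K B
Ai = np.linalg.inv(A)
Sigma = np.diag(beta ** -2) - Ai / beta[:, None] / beta[None, :]

Sigma_direct = np.linalg.inv(np.linalg.inv(K) + np.diag(beta ** 2))
assert np.allclose(Sigma, Sigma_direct)
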
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index f7cacb13..06e197f8 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -509,7 +509,8 @@ class GradientTests(np.testing.TestCase):
         X = X[:, None]
         Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None]
         kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
-        m = GPy.models.GPVariationalGaussianApproximation(X, Y, kern)
+        lik = GPy.likelihoods.Gaussian()
+        m = GPy.models.GPVariationalGaussianApproximation(X, Y, kernel=kern, likelihood=lik)
         self.assertTrue(m.checkgrad())
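
Construction after this patch mirrors the updated test: the likelihood is passed explicitly, and because the model now subclasses GP it inherits the standard optimize() and predict() machinery. A usage sketch, assuming VarGauss populates a posterior object that the base GP._raw_predict shown above can consume:

# Usage sketch (not from the patch): regression with an explicit Gaussian likelihood.
import numpy as np
import GPy

X = np.linspace(0, 10, 50)[:, None]
Y = np.sin(X) + 0.1 * np.random.randn(50, 1)

kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
lik = GPy.likelihoods.Gaussian()
m = GPy.models.GPVariationalGaussianApproximation(X, Y, kernel=kern, likelihood=lik)
m.optimize()                                            # inherited from the GP/Model base
mu, var = m.predict(np.linspace(0, 10, 100)[:, None])   # inherited GP.predict
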