diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index 1fb781d5..cce26783 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -18,3 +18,5 @@ from gp_coregionalized_regression import GPCoregionalizedRegression
 from sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
 from gp_heteroscedastic_regression import GPHeteroscedasticRegression
 from ss_mrd import SSMRD
+from gp_kronecker_gaussian_regression import GPKroneckerGaussianRegression
+from gp_var_gauss import GPVariationalGaussianApproximation
diff --git a/GPy/models/gp_kronecker_gaussian_regression.py b/GPy/models/gp_kronecker_gaussian_regression.py
new file mode 100644
index 00000000..0e8dab81
--- /dev/null
+++ b/GPy/models/gp_kronecker_gaussian_regression.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2014, James Hensman, Alan Saul
+# Distributed under the terms of the GNU General Public License, see LICENSE.txt
+
+import numpy as np
+from ..core.model import Model
+from ..core.parameterization import ObsAr
+from .. import likelihoods
+
+class GPKroneckerGaussianRegression(Model):
+    """
+    Kronecker GP regression
+
+    Take two kernels computed on separate spaces K1(X1), K2(X2), and a data
+    matrix Y which is of size (N1, N2).
+
+    The effective covariance is np.kron(K2, K1).
+    The effective data is vec(Y) = Y.flatten(order='F').
+
+    The noise must be iid Gaussian.
+
+    See Stegle et al.
+    @inproceedings{stegle2011efficient,
+        title={Efficient inference in matrix-variate Gaussian models with iid observation noise},
+        author={Stegle, Oliver and Lippert, Christoph and Mooij, Joris M and Lawrence, Neil D and Borgwardt, Karsten M},
+        booktitle={Advances in Neural Information Processing Systems},
+        pages={630--638},
+        year={2011}
+    }
+
+    """
+    def __init__(self, X1, X2, Y, kern1, kern2, noise_var=1., name='KGPR'):
+        Model.__init__(self, name=name)
+        # accept the construction arguments
+        self.X1 = ObsAr(X1)
+        self.X2 = ObsAr(X2)
+        self.Y = Y
+        self.kern1, self.kern2 = kern1, kern2
+        self.add_parameter(self.kern1)
+        self.add_parameter(self.kern2)
+
+        self.likelihood = likelihoods.Gaussian()
+        self.likelihood.variance = noise_var
+        self.add_parameter(self.likelihood)
+
+        self.num_data1, self.input_dim1 = self.X1.shape
+        self.num_data2, self.input_dim2 = self.X2.shape
+
+        assert kern1.input_dim == self.input_dim1
+        assert kern2.input_dim == self.input_dim2
+        assert Y.shape == (self.num_data1, self.num_data2)
+
+    def log_likelihood(self):
+        return self._log_marginal_likelihood
+
+    def parameters_changed(self):
+        (N1, D1), (N2, D2) = self.X1.shape, self.X2.shape
+        K1, K2 = self.kern1.K(self.X1), self.kern2.K(self.X2)
+
+        # eigendecomposition of each factor: we never build the N1*N2 matrix
+        S1, U1 = np.linalg.eigh(K1)
+        S2, U2 = np.linalg.eigh(K2)
+        W = np.kron(S2, S1) + self.likelihood.variance  # eigenvalues of kron(K2, K1) + sigma^2 I
+
+        Y_ = U1.T.dot(self.Y).dot(U2)  # rotate the data into the joint eigenbasis
+
+        # store these quantities: needed for prediction
+        Wi = 1./W
+        Ytilde = Y_.flatten(order='F')*Wi
+
+        self._log_marginal_likelihood = -0.5*self.num_data1*self.num_data2*np.log(2*np.pi)\
+                                        -0.5*np.sum(np.log(W))\
+                                        -0.5*np.dot(Y_.flatten(order='F'), Ytilde)
+
+        # gradients for data fit part
+        Yt_reshaped = Ytilde.reshape(N1, N2, order='F')
+        tmp = U1.dot(Yt_reshaped)
+        dL_dK1 = .5*(tmp*S2).dot(tmp.T)
+        tmp = U2.dot(Yt_reshaped.T)
+        dL_dK2 = .5*(tmp*S1).dot(tmp.T)
+
+        # gradients for logdet
+        Wi_reshaped = Wi.reshape(N1, N2, order='F')
+        tmp = np.dot(Wi_reshaped, S2)
+        dL_dK1 += -0.5*(U1*tmp).dot(U1.T)
+        tmp = np.dot(Wi_reshaped.T, S1)
+        dL_dK2 += -0.5*(U2*tmp).dot(U2.T)
+
+        self.kern1.update_gradients_full(dL_dK1, self.X1)
+        self.kern2.update_gradients_full(dL_dK2, self.X2)
+
+        # gradients for noise variance
+        dL_dsigma2 = -0.5*Wi.sum() + 0.5*np.sum(np.square(Ytilde))
+        self.likelihood.variance.gradient = dL_dsigma2
+
+        # store these quantities for prediction:
+        self.Wi, self.Ytilde, self.U1, self.U2 = Wi, Ytilde, U1, U2
+
+    def predict(self, X1new, X2new):
+        """
+        Return the predictive mean and variance on the N1new x N2new grid formed by X1new and X2new, vec'd in column-major ('F') order.
+        Only returns the diagonal of the predictive variance, for now.
+
+        :param X1new: The points at which to make a prediction
+        :type X1new: np.ndarray, N1new x self.input_dim1
+        :param X2new: The points at which to make a prediction
+        :type X2new: np.ndarray, N2new x self.input_dim2
+
+        """
+        k1xf = self.kern1.K(X1new, self.X1)
+        k2xf = self.kern2.K(X2new, self.X2)
+        A = k1xf.dot(self.U1)
+        B = k2xf.dot(self.U2)
+        mu = A.dot(self.Ytilde.reshape(self.num_data1, self.num_data2, order='F')).dot(B.T).flatten(order='F')
+        k1xx = self.kern1.Kdiag(X1new)
+        k2xx = self.kern2.Kdiag(X2new)
+        BA = np.kron(B, A)
+        var = np.kron(k2xx, k1xx) - np.sum(BA**2*self.Wi, 1) + self.likelihood.variance
+
+        return mu[:, None], var[:, None]
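Note: a minimal usage sketch for the new Kronecker model. The random data and kernel choices are illustrative only, and optimize() is the standard GPy Model method rather than anything added by this patch:

    import numpy as np
    import GPy

    X1, X2 = np.random.randn(10, 1), np.random.randn(8, 1)  # row and column inputs
    Y = np.random.randn(10, 8)                               # one observation per grid point
    m = GPy.models.GPKroneckerGaussianRegression(X1, X2, Y,
                                                 GPy.kern.RBF(1), GPy.kern.RBF(1))
    m.optimize()
    # predict returns mean/variance for all 5*5 = 25 test-grid pairs,
    # vec'd in column-major ('F') order
    mu, var = m.predict(np.random.randn(5, 1), np.random.randn(5, 1))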
diff --git a/GPy/models/gp_var_gauss.py b/GPy/models/gp_var_gauss.py
new file mode 100644
index 00000000..68b62443
--- /dev/null
+++ b/GPy/models/gp_var_gauss.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2014, James Hensman, Alan Saul
+# Distributed under the terms of the GNU General Public License, see LICENSE.txt
+
+import numpy as np
+from scipy import stats
+from scipy.special import erf
+from ..core.model import Model
+from ..core.parameterization import ObsAr
+from .. import kern
+from ..core.parameterization.param import Param
+from ..util.linalg import pdinv

+
+log_2_pi = np.log(2*np.pi)
+
+
+class GPVariationalGaussianApproximation(Model):
+    """
+    An implementation of 'The Variational Gaussian Approximation Revisited' for binary classification (probit likelihood)
+
+    @article{Opper:2009,
+        title = {The Variational Gaussian Approximation Revisited},
+        author = {Opper, Manfred and Archambeau, C{\'e}dric},
+        journal = {Neural Computation},
+        year = {2009},
+        pages = {786--792},
+    }
+    """
+    def __init__(self, X, Y, kernel=None):
+        Model.__init__(self, 'Variational GP classification')
+        # accept the construction arguments
+        self.X = ObsAr(X)
+        if kernel is None:
+            kernel = kern.RBF(X.shape[1]) + kern.White(X.shape[1], 0.01)
+        self.kern = kernel
+        self.add_parameter(self.kern)
+        self.num_data, self.input_dim = self.X.shape
+
+        self.alpha = Param('alpha', np.zeros(self.num_data))
+        self.beta = Param('beta', np.ones(self.num_data))
+        self.add_parameter(self.alpha)
+        self.add_parameter(self.beta)
+
+        self.gh_x, self.gh_w = np.polynomial.hermite.hermgauss(20)
+        self.Ysign = np.where(Y==1, 1, -1).flatten()  # labels mapped to {-1, +1}
+
+    def log_likelihood(self):
+        """
+        Evaluation of the variational lower bound on the marginal log likelihood
+        """
+        return self._log_lik
+
+    def likelihood_quadrature(self, m, v):
+        """
+        Gauss-Hermite quadrature of the log likelihood and its derivatives w.r.t. the mean m and variance v, with fixed nodes and weights
+        """
+        # assume a probit likelihood for now
+        X = self.gh_x[None, :]*np.sqrt(2.*v[:, None]) + (m*self.Ysign)[:, None]
+        p = stats.norm.cdf(X)
+        N = stats.norm.pdf(X)
+        F = np.log(p).dot(self.gh_w)
+        NoverP = N/p
+        dF_dm = (NoverP*self.Ysign[:, None]).dot(self.gh_w)
+        dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(self.gh_w)
+        return F, dF_dm, dF_dv
+
+    def parameters_changed(self):
+        K = self.kern.K(self.X)
+        m = K.dot(self.alpha)
+        KB = K*self.beta[:, None]
+        BKB = KB*self.beta[None, :]
+        A = np.eye(self.num_data) + BKB
+        Ai, LA, _, Alogdet = pdinv(A)
+        Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :]  # posterior covariance: need full matrix for gradients
+        var = np.diag(Sigma)
+
+        F, dF_dm, dF_dv = self.likelihood_quadrature(m, var)
+        dF_da = np.dot(K, dF_dm)
+        SigmaB = Sigma*self.beta
+        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv)).dot(SigmaB))*2
+        KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + m.dot(self.alpha))
+        dKL_da = m
+        A_A2 = Ai - Ai.dot(Ai)
+        dKL_db = np.diag(np.dot(KB.T, A_A2))
+        self._log_lik = F.sum() - KL
+        self.alpha.gradient = dF_da - dKL_da
+        self.beta.gradient = dF_db - dKL_db
+
+        # K-gradients
+        dKL_dK = 0.5*(self.alpha[None, :]*self.alpha[:, None] + self.beta[:, None]*self.beta[None, :]*A_A2)
+        tmp = Ai*self.beta[:, None]/self.beta[None, :]
+        dF_dK = self.alpha[:, None]*dF_dm[None, :] + np.dot(tmp*dF_dv, tmp.T)
+        self.kern.update_gradients_full(dF_dK - dKL_dK, self.X)
+
+    def predict(self, Xnew):
+        """
+        Predict the probability of class 1 at the new point(s) Xnew.
+
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.input_dim
+        """
+        Wi, _, _, _ = pdinv(self.kern.K(self.X) + np.diag(self.beta**-2))
+        Kux = self.kern.K(self.X, Xnew)
+        mu = np.dot(Kux.T, self.alpha)
+        WiKux = np.dot(Wi, Kux)
+        Kxx = self.kern.Kdiag(Xnew)
+        var = Kxx - np.sum(WiKux*Kux, 0)
+
+        return 0.5*(1+erf(mu/np.sqrt(2.*(var+1))))
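Note: a minimal usage sketch for the variational classifier. The synthetic labels are illustrative; per Ysign in __init__ above, labels equal to 1 are the positive class and anything else is negative, and optimize() is the standard GPy Model method:

    import numpy as np
    import GPy

    X = np.random.randn(40, 1)
    Y = (X[:, 0] > 0).astype(int)[:, None]  # binary labels in {0, 1}
    m = GPy.models.GPVariationalGaussianApproximation(X, Y)  # default kernel: RBF + White
    m.optimize()
    p = m.predict(np.linspace(-2., 2., 50)[:, None])  # probit probability of class 1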
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 7b19538b..451ab757 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -423,6 +423,45 @@ class GradientTests(np.testing.TestCase):
         m = GPy.models.GPHeteroscedasticRegression(X,Y,kern)
         self.assertTrue(m.checkgrad())
 
+    def test_gp_kronecker_gaussian(self):
+        N1, N2 = 30, 20
+        X1 = np.random.randn(N1, 1)
+        X2 = np.random.randn(N2, 1)
+        X1.sort(0); X2.sort(0)
+        k1 = GPy.kern.RBF(1)  # + GPy.kern.White(1)
+        k2 = GPy.kern.RBF(1)  # + GPy.kern.White(1)
+        Y = np.random.randn(N1, N2)
+        m = GPy.models.GPKroneckerGaussianRegression(X1, X2, Y, k1, k2)
+
+        # build the same model the naive way, on the full N1*N2 grid
+        assert (N1*N2 < 1000), "too much data for standard GPs!"
+        yy, xx = np.meshgrid(X2, X1)
+        Xgrid = np.vstack((xx.flatten(order='F'), yy.flatten(order='F'))).T
+        kg = GPy.kern.RBF(1, active_dims=[0]) * GPy.kern.RBF(1, active_dims=[1])
+        mm = GPy.models.GPRegression(Xgrid, Y.reshape(-1, 1, order='F'), kernel=kg)
+
+        m.randomize()
+        mm[:] = m[:]  # share parameters; the two models should now agree exactly
+        assert np.allclose(m.log_likelihood(), mm.log_likelihood())
+        assert np.allclose(m.gradient, mm.gradient)
+        X1test = np.random.randn(100, 1)
+        X2test = np.random.randn(100, 1)
+        mean1, var1 = m.predict(X1test, X2test)
+        yy, xx = np.meshgrid(X2test, X1test)
+        Xgrid = np.vstack((xx.flatten(order='F'), yy.flatten(order='F'))).T
+        mean2, var2 = mm.predict(Xgrid)
+        assert np.allclose(mean1, mean2)
+        assert np.allclose(var1, var2)
+
+    def test_gp_VGPC(self):
+        num_obs = 25
+        X = np.random.randint(0, 140, num_obs)
+        X = X[:, None]
+        Y = 25. + np.sin(X/20.) * 2. + np.random.rand(num_obs)[:, None]
+        kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
+        m = GPy.models.GPVariationalGaussianApproximation(X, Y, kern)
+        self.assertTrue(m.checkgrad())
+
 if __name__ == "__main__":
     print "Running unit tests, please be (very) patient..."
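Note: the identity that both the Kronecker model and the equivalence test above rely on is that the eigendecomposition of a Kronecker product factorizes, so adding iid noise only shifts eigenvalues and parameters_changed can work with N1- and N2-sized decompositions instead of an N1*N2-sized one. A standalone numpy check (not part of the patch; sizes are arbitrary):

    import numpy as np

    N1, N2 = 5, 4
    A, B = np.random.randn(N1, N1), np.random.randn(N2, N2)
    K1, K2 = A.dot(A.T) + np.eye(N1), B.dot(B.T) + np.eye(N2)  # two SPD 'kernel' matrices
    S1, U1 = np.linalg.eigh(K1)
    S2, U2 = np.linalg.eigh(K2)
    U = np.kron(U2, U1)

    # kron(K2, K1) has eigenvectors kron(U2, U1) and eigenvalues kron(S2, S1)
    assert np.allclose(np.kron(K2, K1), (U*np.kron(S2, S1)).dot(U.T))

    # the rotation Y_ = U1.T Y U2 is the vec identity (U2 kron U1).T vec(Y)
    Y = np.random.randn(N1, N2)
    assert np.allclose(U1.T.dot(Y).dot(U2).flatten(order='F'),
                       U.T.dot(Y.flatten(order='F')))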