diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py
index 32b6c02d..e9e049b0 100644
--- a/GPy/core/__init__.py
+++ b/GPy/core/__init__.py
@@ -4,6 +4,7 @@
 from model import *
 from parameterised import *
 import priors
-from GPy.core.gp import GP
-from GPy.core.sparse_gp import SparseGP
+from gp import GP
+from sparse_gp import SparseGP
 from fitc import FITC
+from svigp import SVIGP
diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index 0799e0c2..b82f3298 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -29,6 +29,7 @@ class GPBase(Model):
         self._Xscale = np.ones((1, self.input_dim))
 
         super(GPBase, self).__init__()
+        #Model.__init__(self)
 
         # All leaf nodes should call self._set_params(self._get_params()) at
         # the end
diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py
new file mode 100644
index 00000000..1db0e26f
--- /dev/null
+++ b/GPy/core/svigp.py
@@ -0,0 +1,466 @@
+# Copyright (c) 2012, James Hensman and Nicolo' Fusi
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import pylab as pb
+from .. import kern
+from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs, jitchol, backsub_both_sides
+from ..likelihoods import EP
+from gp_base import GPBase
+from model import Model
+import time
+import sys
+
+
+class SVIGP(GPBase):
+    """
+    Stochastic variational inference in a Gaussian process.
+
+    :param X: inputs
+    :type X: np.ndarray (N x Q)
+    :param likelihood: a likelihood object carrying the observed data Y (N x D)
+    :param kernel: the kernel (covariance function), a GPy kernel object
+    :param Z: inducing inputs
+    :type Z: np.ndarray (M x Q)
+    :param q_u: canonical parameters of the distribution q(u), squashed into a 1D array
+    :type q_u: np.ndarray
+    :param batchsize: the size of each mini-batch
+    :type batchsize: int
+    :param X_variance: the uncertainty in the measurements of X (Gaussian variance), optional
+    :type X_variance: np.ndarray (N x Q) | None
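+
+    A rough usage sketch (illustrative only; the shapes, values and the rbf+white
+    kernel choice below are assumptions, not defaults of this class)::
+
+        X = np.random.rand(500, 1)
+        Y = np.sin(X) + np.random.randn(500, 1) * 0.1
+        Z = np.random.rand(20, 1)
+        likelihood = GPy.likelihoods.Gaussian(Y, normalize=False)
+        kernel = GPy.kern.rbf(1) + GPy.kern.white(1, 1e-3)
+        m = SVIGP(X, likelihood, kernel, Z, batchsize=10)
+        m.ensure_default_constraints()
+        m.optimize(iterations=1000)
+
+    For regression with sensible defaults, see the SVIGPRegression wrapper in
+    GPy.models.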
+    """
+
+    def __init__(self, X, likelihood, kernel, Z, q_u=None, batchsize=10, X_variance=None):
+        GPBase.__init__(self, X, likelihood, kernel, normalize_X=False)
+        self.batchsize = batchsize
+        self.Y = self.likelihood.Y.copy()
+        self.Z = Z
+        self.num_inducing = Z.shape[0]
+
+        self.batchcounter = 0
+        self.epochs = 0
+        self.iterations = 0
+
+        self.vb_steplength = 0.05
+        self.param_steplength = 1e-5
+        self.momentum = 0.9
+
+        if X_variance is None:
+            self.has_uncertain_inputs = False
+        else:
+            self.has_uncertain_inputs = True
+            self.X_variance = X_variance
+
+        if q_u is None:
+            q_u = np.hstack((np.random.randn(self.num_inducing*self.output_dim), -.5*np.eye(self.num_inducing).flatten()))
+        self.set_vb_param(q_u)
+
+        self._permutation = np.random.permutation(self.num_data)
+        self.load_batch()
+
+        self._param_trace = []
+        self._ll_trace = []
+        self._grad_trace = []
+
+        # set the adaptive steplength parameters
+        self.hbar_t = 0.0
+        self.tau_t = 100.0
+        self.gbar_t = 0.0
+        self.gbar_t1 = 0.0
+        self.gbar_t2 = 0.0
+        self.hbar_tp = 0.0
+        self.tau_tp = 10000.0
+        self.gbar_tp = 0.0
+        self.adapt_param_steplength = True
+        self.adapt_vb_steplength = True
+        self._param_steplength_trace = []
+        self._vb_steplength_trace = []
+
+    def _compute_kernel_matrices(self):
+        # kernel computations, using BGPLVM notation
+        self.Kmm = self.kern.K(self.Z)
+        if self.has_uncertain_inputs:
+            self.psi0 = self.kern.psi0(self.Z, self.X_batch, self.X_variance_batch)
+            self.psi1 = self.kern.psi1(self.Z, self.X_batch, self.X_variance_batch)
+            self.psi2 = self.kern.psi2(self.Z, self.X_batch, self.X_variance_batch)
+        else:
+            self.psi0 = self.kern.Kdiag(self.X_batch)
+            self.psi1 = self.kern.K(self.X_batch, self.Z)
+            self.psi2 = None
+
+    def dL_dtheta(self):
+        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z)
+        if self.has_uncertain_inputs:
+            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch)
+            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch)
+            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch)
+        else:
+            dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X_batch, self.Z)
+            dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch)
+        return dL_dtheta
+
+    def _set_params(self, p, computations=True):
+        self.kern._set_params_transformed(p[:self.kern.num_params])
+        self.likelihood._set_params(p[self.kern.num_params:])
+        if computations:
+            self._compute_kernel_matrices()
+            self._computations()
+
+    def _get_params(self):
+        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
+
+    def _get_param_names(self):
+        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
+
+    def load_batch(self):
+        """
+        Load a batch of data (set self.X_batch and self.likelihood.Y from self.X, self.Y).
+        """
+        # if we've seen all the data, start again with them in a new random order
+        if self.batchcounter + self.batchsize > self.num_data:
+            self.batchcounter = 0
+            self.epochs += 1
+            self._permutation = np.random.permutation(self.num_data)
+
+        this_perm = self._permutation[self.batchcounter:self.batchcounter+self.batchsize]
+
+        self.X_batch = self.X[this_perm]
+        self.likelihood.set_data(self.Y[this_perm])
+        if self.has_uncertain_inputs:
+            self.X_variance_batch = self.X_variance[this_perm]
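+
+        # advance the batch pointer; data_prop is the fraction of the full data set
+        # held in this mini-batch, and is used below to rescale batch sums into
+        # estimates of full-data quantities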
+        self.batchcounter += self.batchsize
+
+        self.data_prop = float(self.batchsize)/self.num_data
+
+        self._compute_kernel_matrices()
+        self._computations()
+
+    def _computations(self, do_Kmm=True, do_Kmm_grad=True):
+        """
+        All of the computations needed. Some are optional, see kwargs.
+        """
+        if do_Kmm:
+            self.Lm = jitchol(self.Kmm)
+
+        # The rather complex computations of self.A
+        if self.has_uncertain_inputs:
+            if self.likelihood.is_heteroscedastic:
+                psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0)
+            else:
+                psi2_beta = self.psi2.sum(0) * self.likelihood.precision
+            evals, evecs = np.linalg.eigh(psi2_beta)
+            clipped_evals = np.clip(evals, 0., 1e6)  # TODO: make clipping configurable
+            tmp = evecs * np.sqrt(clipped_evals)
+        else:
+            if self.likelihood.is_heteroscedastic:
+                tmp = self.psi1.T * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.batchsize)))
+            else:
+                tmp = self.psi1.T * (np.sqrt(self.likelihood.precision))
+        tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
+        self.A = tdot(tmp)
+
+        self.V = self.likelihood.precision*self.likelihood.Y
+        self.VmT = np.dot(self.V, self.q_u_expectation[0].T)
+        self.psi1V = np.dot(self.psi1.T, self.V)
+
+        self.B = np.eye(self.num_inducing)*self.data_prop + self.A
+        self.Lambda = backsub_both_sides(self.Lm, self.B.T)
+        self.LQL = backsub_both_sides(self.Lm, self.q_u_expectation[1].T, transpose='right')
+
+        self.trace_K = self.psi0.sum() - np.trace(self.A)/self.likelihood.precision
+        self.Kmmi_m, _ = dpotrs(self.Lm, self.q_u_expectation[0], lower=1)
+        self.projected_mean = np.dot(self.psi1, self.Kmmi_m)
+
+        # Compute dL_dpsi
+        self.dL_dpsi0 = -0.5 * self.output_dim * self.likelihood.precision * np.ones(self.batchsize)
+        self.dL_dpsi1, _ = dpotrs(self.Lm, np.asfortranarray(self.VmT.T), lower=1)
+        self.dL_dpsi1 = self.dL_dpsi1.T
+
+        dL_dpsi2 = -0.5 * self.likelihood.precision * backsub_both_sides(self.Lm, self.LQL - self.output_dim * np.eye(self.num_inducing))
+        if self.has_uncertain_inputs:
+            self.dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], self.batchsize, axis=0)
+        else:
+            self.dL_dpsi1 += 2.*np.dot(dL_dpsi2, self.psi1.T).T
+            self.dL_dpsi2 = None
+
+        # Compute dL_dKmm
+        if do_Kmm_grad:
+            tmp = np.dot(self.LQL, self.A) - backsub_both_sides(self.Lm, np.dot(self.q_u_expectation[0], self.psi1V.T), transpose='right')
+            tmp += tmp.T
+            tmp += -self.output_dim*self.B
+            tmp += self.data_prop*self.LQL
+            self.dL_dKmm = 0.5*backsub_both_sides(self.Lm, tmp)
+
+        # Compute the gradient of the log likelihood wrt noise variance
+        self.partial_for_likelihood = -0.5*(self.batchsize*self.output_dim - np.sum(self.A*self.LQL))*self.likelihood.precision
+        self.partial_for_likelihood += (0.5*self.output_dim*self.trace_K + 0.5*self.likelihood.trYYT - np.sum(self.likelihood.Y*self.projected_mean))*self.likelihood.precision**2
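+
+    # Note on scaling: every term of the bound below is computed on the current
+    # mini-batch only, and the total is divided by data_prop (= batchsize/num_data),
+    # which rescales the batch terms into an estimate of the full-data bound. The
+    # terms that are multiplied by data_prop inside the sum therefore appear
+    # exactly once after the division.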
+    def log_likelihood(self):
+        """
+        As for the uncollapsed sparse GP, but accounting for the proportion of the
+        data we're looking at right now.
+
+        NB. self.batchsize is the size of the batch, not the size of X_all.
+        """
+        assert not self.likelihood.is_heteroscedastic
+        A = -0.5*self.batchsize*self.output_dim*(np.log(2.*np.pi) - np.log(self.likelihood.precision))
+        B = -0.5*self.likelihood.precision*self.output_dim*self.trace_K
+        Kmm_logdet = 2.*np.sum(np.log(np.diag(self.Lm)))
+        C = -0.5*self.output_dim*self.data_prop*(Kmm_logdet - self.q_u_logdet - self.num_inducing)
+        C += -0.5*np.sum(self.LQL * self.B)
+        D = -0.5*self.likelihood.precision*self.likelihood.trYYT
+        E = np.sum(self.V*self.projected_mean)
+        return (A+B+C+D+E)/self.data_prop
+
+    def _log_likelihood_gradients(self):
+        return np.hstack((self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))/self.data_prop
+
+    def vb_grad_natgrad(self):
+        """
+        Compute the gradients of the lower bound wrt the canonical and
+        expectation parameters of u.
+
+        Note that the natural gradient in either parameterisation is given by the
+        gradient in the other (see Hensman et al. 2012, "Fast variational inference
+        in the conjugate exponential family").
+        """
+        # gradient for eta
+        dL_dmmT_S = -0.5*self.Lambda/self.data_prop + 0.5*self.q_u_prec
+        Kmmipsi1V, _ = dpotrs(self.Lm, self.psi1V, lower=1)
+        dL_dm = (Kmmipsi1V - np.dot(self.Lambda, self.q_u_mean))/self.data_prop
+
+        # gradients for theta
+        S = self.q_u_cov
+        Si = self.q_u_prec
+        m = self.q_u_mean
+        dL_dSi = -mdot(S, dL_dmmT_S, S)
+
+        dL_dmhSi = -2*dL_dSi
+        dL_dSim = np.dot(dL_dSi, m) + np.dot(Si, dL_dm)
+
+        return np.hstack((dL_dm.flatten(), dL_dmmT_S.flatten())), np.hstack((dL_dSim.flatten(), dL_dmhSi.flatten()))
+
+    def optimize(self, iterations, print_interval=10, callback=lambda: None, callback_interval=5):
+
+        param_step = 0.
+
+        # iterate!
+        for i in range(iterations):
+
+            # store the current configuration for plotting later
+            self._param_trace.append(self._get_params())
+            self._ll_trace.append(self.log_likelihood() + self.log_prior())
+
+            # load a batch
+            self.load_batch()
+
+            # compute the (stochastic) gradient
+            natgrads = self.vb_grad_natgrad()
+            grads = self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
+            self._grad_trace.append(grads)
+
+            # compute the steps in all parameters
+            vb_step = self.vb_steplength*natgrads[0]
+            if self.epochs >= 1:  # only move the parameters after the first epoch
+                param_step = self.momentum*param_step + self.param_steplength*grads
+            else:
+                param_step = 0.
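+
+            # q(u) is updated with a natural-gradient step of size vb_steplength,
+            # while the kernel and likelihood parameters take a momentum stochastic
+            # gradient step of size param_steplength in the transformed space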
+            self.set_vb_param(self.get_vb_param() + vb_step)
+            # Note: don't recompute everything here, wait until the next iteration when we have a new batch
+            self._set_params(self._untransform_params(self._get_params_transformed() + param_step), computations=False)
+
+            # print messages if desired
+            if i and (not i % print_interval):
+                print i, np.mean(self._ll_trace[-print_interval:])  # , self.log_likelihood()
+                print np.round(np.mean(self._grad_trace[-print_interval:], 0), 3)
+                sys.stdout.flush()
+
+            # callback
+            if i and not i % callback_interval:
+                callback()
+                time.sleep(0.1)
+
+            if self.epochs > 10:
+                self._adapt_steplength()
+
+            self.iterations += 1
+
+    def _adapt_steplength(self):
+        if self.adapt_vb_steplength:
+            # self._adaptive_vb_steplength()
+            self._adaptive_vb_steplength_KL()
+            self._vb_steplength_trace.append(self.vb_steplength)
+            assert self.vb_steplength > 0
+
+        if self.adapt_param_steplength:
+            # self._adaptive_param_steplength()
+            # self._adaptive_param_steplength_log()
+            self._adaptive_param_steplength_from_vb()
+            self._param_steplength_trace.append(self.param_steplength)
+
+    def _adaptive_param_steplength(self):
+        decr_factor = 0.1
+        g_tp = self._transform_gradients(self._log_likelihood_gradients())
+        self.gbar_tp = (1-1/self.tau_tp)*self.gbar_tp + 1/self.tau_tp * g_tp
+        self.hbar_tp = (1-1/self.tau_tp)*self.hbar_tp + 1/self.tau_tp * np.dot(g_tp.T, g_tp)
+        new_param_steplength = np.dot(self.gbar_tp.T, self.gbar_tp) / self.hbar_tp
+        # - hack
+        new_param_steplength *= decr_factor
+        self.param_steplength = (self.param_steplength + new_param_steplength)/2
+        # -
+        self.tau_tp = self.tau_tp*(1-self.param_steplength) + 1
+
+    def _adaptive_param_steplength_log(self):
+        stp = np.logspace(np.log(0.0001), np.log(1e-6), base=np.e, num=18000)
+        self.param_steplength = stp[self.iterations]
+
+    def _adaptive_param_steplength_log2(self):
+        self.param_steplength = (self.iterations + 0.001)**-0.5
+
+    def _adaptive_param_steplength_from_vb(self):
+        self.param_steplength = self.vb_steplength * 0.01
+
+    def _adaptive_vb_steplength(self):
+        decr_factor = 0.1
+        g_t = self.vb_grad_natgrad()[0]
+        self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
+        self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
+        new_vb_steplength = np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t
+        # - hack
+        new_vb_steplength *= decr_factor
+        self.vb_steplength = (self.vb_steplength + new_vb_steplength)/2
+        # -
+        self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
+
+    def _adaptive_vb_steplength_KL(self):
+        decr_factor = 1  # 0.1
+        natgrad = self.vb_grad_natgrad()
+        g_t1 = natgrad[0]
+        g_t2 = natgrad[1]
+        self.gbar_t1 = (1-1/self.tau_t)*self.gbar_t1 + 1/self.tau_t * g_t1
+        self.gbar_t2 = (1-1/self.tau_t)*self.gbar_t2 + 1/self.tau_t * g_t2
+        self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t1.T, g_t2)
+        self.vb_steplength = np.dot(self.gbar_t1.T, self.gbar_t2) / self.hbar_t
+        self.vb_steplength *= decr_factor
+        self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
+
+    def _raw_predict(self, X_new, X_variance_new=None, which_parts='all', full_cov=False):
+        """Internal helper function for making predictions, does not account for normalization."""
+        # TODO: make this more efficient!
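+        # the predictive density integrates u out under q(u) = N(m, S), giving
+        #   mean = Kxm Kmm^{-1} m
+        #   var  = Kxx - Kxm (Kmm^{-1} - Kmm^{-1} S Kmm^{-1}) Kmx
+        # which is what the code below computes (m and S are set in set_vb_param)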
+        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
+        tmp = self.Kmmi - mdot(self.Kmmi, self.q_u_cov, self.Kmmi)
+
+        if X_variance_new is None:
+            Kx = self.kern.K(X_new, self.Z)
+            mu = np.dot(Kx, self.Kmmi_m)
+            if full_cov:
+                Kxx = self.kern.K(X_new)
+                var = Kxx - mdot(Kx, tmp, Kx.T)
+            else:
+                Kxx = self.kern.Kdiag(X_new)
+                var = (Kxx - np.sum(Kx*np.dot(Kx, tmp), 1))[:, None]
+            return mu, var
+        else:
+            assert X_variance_new.shape == X_new.shape
+            Kx = self.kern.psi1(self.Z, X_new, X_variance_new)
+            mu = np.dot(Kx, self.Kmmi_m)
+            Kxx = self.kern.psi0(self.Z, X_new, X_variance_new)
+            psi2 = self.kern.psi2(self.Z, X_new, X_variance_new)
+            diag_var = Kxx - np.sum(np.sum(psi2*tmp[None, :, :], 1), 1)
+            if full_cov:
+                raise NotImplementedError
+            else:
+                return mu, diag_var[:, None]
+
+    def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
+        # normalize X values
+        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
+        if X_variance_new is not None:
+            X_variance_new = X_variance_new / self._Xscale ** 2
+
+        # here's the actual prediction by the GP model
+        mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
+
+        # now push through likelihood
+        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
+
+        return mean, var, _025pm, _975pm
+
+    def set_vb_param(self, vb_param):
+        """Set the distribution q(u) from its canonical parameters."""
+        self.q_u_canonical_flat = vb_param.copy()
+        self.q_u_canonical = self.q_u_canonical_flat[:self.num_inducing*self.output_dim].reshape(self.num_inducing, self.output_dim), self.q_u_canonical_flat[self.num_inducing*self.output_dim:].reshape(self.num_inducing, self.num_inducing)
+
+        self.q_u_prec = -2.*self.q_u_canonical[1]
+        self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec)
+        self.q_u_Li = q_u_Li
+        self.q_u_logdet = -tmp
+        self.q_u_mean, _ = dpotrs(q_u_Li, np.asfortranarray(self.q_u_canonical[0]), lower=1)
+
+        self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean, self.q_u_mean.T) + self.q_u_cov*self.output_dim)
+
+    def get_vb_param(self):
+        """
+        Return the canonical parameters of the distribution q(u).
+        """
+        return self.q_u_canonical_flat
+
+    def plot(self, ax=None, fignum=None, Z_height=None, **kwargs):
+
+        if ax is None:
+            fig = pb.figure(num=fignum)
+            ax = fig.add_subplot(111)
+
+        # horrible hack here:
+        data = self.likelihood.data.copy()
+        self.likelihood.data = self.Y
+        GPBase.plot(self, ax=ax, **kwargs)
+        self.likelihood.data = data
+
+        Zu = self.Z * self._Xscale + self._Xoffset
+        if self.input_dim == 1:
+            ax.plot(self.X_batch, self.likelihood.data, 'gx', mew=2)
+            if Z_height is None:
+                Z_height = ax.get_ylim()[0]
+            ax.plot(Zu, np.zeros_like(Zu) + Z_height, 'r|', mew=1.5, markersize=12)
+
+        if self.input_dim == 2:
+            ax.scatter(self.X_all[:, 0], self.X_all[:, 1], 20., self.Y[:, 0], linewidth=0, cmap=pb.cm.jet)
+            ax.plot(Zu[:, 0], Zu[:, 1], 'w^')
+
+    def plot_traces(self):
+        pb.figure()
+        t = np.array(self._param_trace)
+        pb.subplot(2, 1, 1)
+        for l, ti in zip(self._get_param_names(), t.T):
+            if not l[:3] == 'iip':
+                pb.plot(ti, label=l)
+        pb.legend(loc=0)
+
+        pb.subplot(2, 1, 2)
+        pb.plot(np.asarray(self._ll_trace), label='stochastic likelihood')
+        pb.legend(loc=0)
diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py
index 0d73daff..2f74858a 100644
--- a/GPy/examples/__init__.py
+++ b/GPy/examples/__init__.py
@@ -5,3 +5,4 @@ import classification
 import regression
 import dimensionality_reduction
 import tutorials
+import stochastic
diff --git a/GPy/examples/stochastic.py b/GPy/examples/stochastic.py
new file mode 100644
index 00000000..45418d5c
--- /dev/null
+++ b/GPy/examples/stochastic.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import pylab as pb
+import numpy as np
+import GPy
+
+def toy_1d():
+    N = 2000
+    M = 20
+
+    # create data
+    X = np.linspace(0, 32, N)[:, None]
+    Z = np.linspace(0, 32, M)[:, None]
+    Y = np.sin(X) + np.cos(0.3*X) + np.random.randn(*X.shape)/np.sqrt(50.)
+
+    m = GPy.models.SVIGPRegression(X, Y, batchsize=10, Z=Z)
+    m.ensure_default_constraints()
+    m.constrain_bounded('noise_variance', 1e-3, 1e-1)
+
+    m.param_steplength = 1e-4
+
+    fig = pb.figure()
+    ax = fig.add_subplot(111)
+
+    def cb():
+        ax.cla()
+        m.plot(ax=ax, Z_height=-3)
+        ax.set_ylim(-3, 3)
+        fig.canvas.draw()
+
+    m.optimize(500, callback=cb, callback_interval=1)
+
+    m.plot_traces()
+    return m
diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index f18e89db..885372a1 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -4,6 +4,7 @@
 from gp_regression import GPRegression
 from gp_classification import GPClassification
 from sparse_gp_regression import SparseGPRegression
+from svigp_regression import SVIGPRegression
 from sparse_gp_classification import SparseGPClassification
 from fitc_classification import FITCClassification
 from gplvm import GPLVM
diff --git a/GPy/models/svigp_regression.py b/GPy/models/svigp_regression.py
new file mode 100644
index 00000000..8448bf37
--- /dev/null
+++ b/GPy/models/svigp_regression.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2012, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from ..core import SVIGP
+from .. import likelihoods
+from .. import kern
+
+class SVIGPRegression(SVIGP):
+    """
+    Gaussian process model for regression, using stochastic variational inference.
+
+    This is a thin wrapper around the SVIGP class, with a set of sensible defaults.
+
+    :param X: input observations
+    :param Y: observed values
+    :param kernel: a GPy kernel, defaults to rbf+white
+    :param Z: inducing inputs, defaults to a random subset of X
+    :type Z: np.ndarray (M x Q) | None
+    :param num_inducing: number of inducing points used when Z is not given (default 10)
+    :type num_inducing: int
+    :param q_u: canonical parameters of the distribution q(u), squashed into a 1D array (optional)
+    :param batchsize: the size of each mini-batch (default 10)
+    :type batchsize: int
+    :rtype: model object
+
+    .. Note:: Multiple independent outputs are allowed using columns of Y
+
+    """
+
+    def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, q_u=None, batchsize=10):
+        # kern defaults to rbf (plus white for stability)
+        if kernel is None:
+            kernel = kern.rbf(X.shape[1], variance=1., lengthscale=4.) + kern.white(X.shape[1], 1e-3)
+
+        # Z defaults to a subset of the data
+        if Z is None:
+            i = np.random.permutation(X.shape[0])[:num_inducing]
+            Z = X[i].copy()
+        else:
+            assert Z.shape[1] == X.shape[1]
+
+        # likelihood defaults to Gaussian
+        likelihood = likelihoods.Gaussian(Y, normalize=False)
+
+        SVIGP.__init__(self, X, likelihood, kernel, Z, q_u=q_u, batchsize=batchsize)
+        self.load_batch()
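+
+# A minimal usage sketch (illustrative only; the data below are made up and the
+# number of iterations is arbitrary):
+#
+#     X = np.random.rand(500, 1)
+#     Y = np.sin(6*X) + np.random.randn(500, 1)*0.1
+#     m = SVIGPRegression(X, Y, num_inducing=10, batchsize=20)
+#     m.ensure_default_constraints()
+#     m.optimize(1000)
+#     mean, var, lower, upper = m.predict(X)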