diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py
index 25651827..fb40a9e0 100644
--- a/GPy/core/__init__.py
+++ b/GPy/core/__init__.py
@@ -8,5 +8,4 @@
 from parameterization.observable_array import ObsAr
 from gp import GP
 from sparse_gp import SparseGP
-from svigp import SVIGP
 from mapping import *
diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py
deleted file mode 100644
index f78618eb..00000000
--- a/GPy/core/svigp.py
+++ /dev/null
@@ -1,434 +0,0 @@
-# Copyright (c) 2012, James Hensman and Nicolo' Fusi
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs, jitchol, backsub_both_sides
-from gp import GP
-import time
-import sys
-
-
-class SVIGP(GP):
-    """
-
-    Stochastic Variational inference in a Gaussian Process
-
-    :param X: inputs
-    :type X: np.ndarray (N x Q)
-    :param Y: observed data
-    :type Y: np.ndarray of observations (N x D)
-    :param batchsize: the size of a h
-
-    Additional kwargs are used as for a sparse GP. They include:
-
-    :param q_u: canonical parameters of the distribution sqehd into a 1D array
-    :type q_u: np.ndarray
-    :param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
-    :type M: int
-    :param kernel: the kernel/covariance function. See link kernels
-    :type kernel: a GPy kernel
-    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (M x Q) | None
-    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
-    :type X_uncertainty: np.ndarray (N x Q) | None
-    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
-    :param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
-    :type M: int
-    :param beta: noise precision. TODO: ignore beta if doing EP
-    :type beta: float
-    :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
-    :type normalize_(X|Y): bool
-
-    """
-
-
-    def __init__(self, X, Y, kernel, Z, q_u=None, batchsize=10):
-        raise NotImplementedError, "This is a work in progress, see github issue "
-        GP.__init__(self, X, Y, kernel)
-        self.batchsize=batchsize
-        self.Z = Z
-        self.num_inducing = Z.shape[0]
-
-        self.batchcounter = 0
-        self.epochs = 0
-        self.iterations = 0
-
-        self.vb_steplength = 0.05
-        self.param_steplength = 1e-5
-        self.momentum = 0.9
-
-        if q_u is None:
-            q_u = np.hstack((np.random.randn(self.num_inducing*self.output_dim),-.5*np.eye(self.num_inducing).flatten()))
-
-        self.set_vb_param(q_u)
-
-        self._permutation = np.random.permutation(self.num_data)
-        self.load_batch()
-
-        self._param_trace = []
-        self._ll_trace = []
-        self._grad_trace = []
-
-        #set the adaptive steplength parameters
-        #self.hbar_t = 0.0
-        #self.tau_t = 100.0
-        #self.gbar_t = 0.0
-        #self.gbar_t1 = 0.0
-        #self.gbar_t2 = 0.0
-        #self.hbar_tp = 0.0
-        #self.tau_tp = 10000.0
-        #self.gbar_tp = 0.0
-        #self.adapt_param_steplength = True
-        #self.adapt_vb_steplength = True
-        #self._param_steplength_trace = []
-        #self._vb_steplength_trace = []
-
-    def _compute_kernel_matrices(self):
-        # kernel computations, using BGPLVM notation
-        self.Kmm = self.kern.K(self.Z)
-        self.psi0 = self.kern.Kdiag(self.X_batch)
-        self.psi1 = self.kern.K(self.X_batch, self.Z)
-        self.psi2 = None
-
-    def dL_dtheta(self):
-        dL_dtheta = self.kern._param_grad_helper(self.dL_dKmm, self.Z)
-        if self.has_uncertain_inputs:
-            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch)
-            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch)
-            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch)
-        else:
-            dL_dtheta += self.kern._param_grad_helper(self.dL_dpsi1, self.X_batch, self.Z)
-            dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch)
-        return dL_dtheta
-
-    def _set_params(self, p, computations=True):
-        self.kern._set_params_transformed(p[:self.kern.num_params])
-        self.likelihood._set_params(p[self.kern.num_params:])
-        if computations:
-            self._compute_kernel_matrices()
-            self._computations()
-
-    def _get_params(self):
-        return np.hstack((self.kern._get_params_transformed() , self.likelihood._get_params()))
-
-    def _get_param_names(self):
-        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
-
-    def load_batch(self):
-        """
-        load a batch of data (set self.X_batch and self.Y_batch from self.X, self.Y)
-        """
-
-        #if we've seen all the data, start again with them in a new random order
-        if self.batchcounter+self.batchsize > self.num_data:
-            self.batchcounter = 0
-            self.epochs += 1
-            self._permutation = np.random.permutation(self.num_data)
-
-        this_perm = self._permutation[self.batchcounter:self.batchcounter+self.batchsize]
-
-        self.X_batch = self.X[this_perm]
-        self.Y_batch = self.Y[this_perm]
-
-        self.batchcounter += self.batchsize
-
-        self.data_prop = float(self.batchsize)/self.num_data
-
-        self._compute_kernel_matrices()
-        self._computations()
-
-    def _computations(self,do_Kmm=True, do_Kmm_grad=True):
-        """
-        All of the computations needed. Some are optional, see kwargs.
-        """
-
-        if do_Kmm:
-            self.Lm = jitchol(self.Kmm)
-
-        # The rather complex computations of self.A
-        if self.has_uncertain_inputs:
-            if self.likelihood.is_heteroscedastic:
-                psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0)
-            else:
-                psi2_beta = self.psi2.sum(0) * self.likelihood.precision
-            evals, evecs = np.linalg.eigh(psi2_beta)
-            clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
-            tmp = evecs * np.sqrt(clipped_evals)
-        else:
-            if self.likelihood.is_heteroscedastic:
-                tmp = self.psi1.T * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.batchsize)))
-            else:
-                tmp = self.psi1.T * (np.sqrt(self.likelihood.precision))
-        tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
-        self.A = tdot(tmp)
-
-        self.V = self.likelihood.precision*self.likelihood.Y
-        self.VmT = np.dot(self.V,self.q_u_expectation[0].T)
-        self.psi1V = np.dot(self.psi1.T, self.V)
-
-        self.B = np.eye(self.num_inducing)*self.data_prop + self.A
-        self.Lambda = backsub_both_sides(self.Lm, self.B.T)
-        self.LQL = backsub_both_sides(self.Lm,self.q_u_expectation[1].T,transpose='right')
-
-        self.trace_K = self.psi0.sum() - np.trace(self.A)/self.likelihood.precision
-        self.Kmmi_m, _ = dpotrs(self.Lm, self.q_u_expectation[0], lower=1)
-        self.projected_mean = np.dot(self.psi1,self.Kmmi_m)
-
-        # Compute dL_dpsi
-        self.dL_dpsi0 = - 0.5 * self.output_dim * self.likelihood.precision * np.ones(self.batchsize)
-        self.dL_dpsi1, _ = dpotrs(self.Lm,np.asfortranarray(self.VmT.T),lower=1)
-        self.dL_dpsi1 = self.dL_dpsi1.T
-
-        dL_dpsi2 = -0.5 * self.likelihood.precision * backsub_both_sides(self.Lm, self.LQL - self.output_dim * np.eye(self.num_inducing))
-        if self.has_uncertain_inputs:
-            self.dL_dpsi2 = np.repeat(dL_dpsi2[None,:,:],self.batchsize,axis=0)
-        else:
-            self.dL_dpsi1 += 2.*np.dot(dL_dpsi2,self.psi1.T).T
-            self.dL_dpsi2 = None
-
-        # Compute dL_dKmm
-        if do_Kmm_grad:
-            tmp = np.dot(self.LQL,self.A) - backsub_both_sides(self.Lm,np.dot(self.q_u_expectation[0],self.psi1V.T),transpose='right')
-            tmp += tmp.T
-            tmp += -self.output_dim*self.B
-            tmp += self.data_prop*self.LQL
-            self.dL_dKmm = 0.5*backsub_both_sides(self.Lm,tmp)
-
-        #Compute the gradient of the log likelihood wrt noise variance
-        self.partial_for_likelihood = -0.5*(self.batchsize*self.output_dim - np.sum(self.A*self.LQL))*self.likelihood.precision
-        self.partial_for_likelihood += (0.5*self.output_dim*self.trace_K + 0.5 * self.likelihood.trYYT - np.sum(self.likelihood.Y*self.projected_mean))*self.likelihood.precision**2
-
-
-    def log_likelihood(self):
-        """
-        As for uncollapsed sparse GP, but account for the proportion of data we're looking at right now.
-
-        NB. self.batchsize is the size of the batch, not the size of X_all
-        """
-        assert not self.likelihood.is_heteroscedastic
-        A = -0.5*self.batchsize*self.output_dim*(np.log(2.*np.pi) - np.log(self.likelihood.precision))
-        B = -0.5*self.likelihood.precision*self.output_dim*self.trace_K
-        Kmm_logdet = 2.*np.sum(np.log(np.diag(self.Lm)))
-        C = -0.5*self.output_dim*self.data_prop*(Kmm_logdet-self.q_u_logdet - self.num_inducing)
-        C += -0.5*np.sum(self.LQL * self.B)
-        D = -0.5*self.likelihood.precision*self.likelihood.trYYT
-        E = np.sum(self.V*self.projected_mean)
-        return (A+B+C+D+E)/self.data_prop
-
-    def _log_likelihood_gradients(self):
-        return np.hstack((self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))/self.data_prop
-
-    def vb_grad_natgrad(self):
-        """
-        Compute the gradients of the lower bound wrt the canonical and
-        Expectation parameters of u.
-
-        Note that the natural gradient in either is given by the gradient in
-        the other (See Hensman et al 2012 Fast Variational inference in the
-        conjugate exponential Family)
-        """
-
-        # Gradient for eta
-        dL_dmmT_S = -0.5*self.Lambda/self.data_prop + 0.5*self.q_u_prec
-        Kmmipsi1V,_ = dpotrs(self.Lm,self.psi1V,lower=1)
-        dL_dm = (Kmmipsi1V - np.dot(self.Lambda,self.q_u_mean))/self.data_prop
-
-        # Gradients for theta
-        S = self.q_u_cov
-        Si = self.q_u_prec
-        m = self.q_u_mean
-        dL_dSi = -mdot(S,dL_dmmT_S, S)
-
-        dL_dmhSi = -2*dL_dSi
-        dL_dSim = np.dot(dL_dSi,m) + np.dot(Si, dL_dm)
-
-        return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten())) , np.hstack((dL_dSim.flatten(), dL_dmhSi.flatten()))
-
-
-    def optimize(self, iterations, print_interval=10, callback=lambda:None, callback_interval=5):
-
-        param_step = 0.
-
-        #Iterate!
-        for i in range(iterations):
-
-            #store the current configuration for plotting later
-            self._param_trace.append(self._get_params())
-            self._ll_trace.append(self.log_likelihood() + self.log_prior())
-
-            #load a batch
-            self.load_batch()
-
-            #compute the (stochastic) gradient
-            natgrads = self.vb_grad_natgrad()
-            grads = self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
-            self._grad_trace.append(grads)
-
-            #compute the steps in all parameters
-            vb_step = self.vb_steplength*natgrads[0]
-            if (self.epochs>=1):#only move the parameters after the first epoch
-                param_step = self.momentum*param_step + self.param_steplength*grads
-            else:
-                param_step = 0.
-
-            self.set_vb_param(self.get_vb_param() + vb_step)
-            #Note: don't recompute everything here, wait until the next iteration when we have a new batch
-            self._set_params(self._untransform_params(self._get_params_transformed() + param_step), computations=False)
-
-            #print messages if desired
-            if i and (not i%print_interval):
-                print i, np.mean(self._ll_trace[-print_interval:]) #, self.log_likelihood()
-                print np.round(np.mean(self._grad_trace[-print_interval:],0),3)
-                sys.stdout.flush()
-
-            #callback
-            if i and not i%callback_interval:
-                callback(self) # Change this to callback()
-                time.sleep(0.01)
-
-            if self.epochs > 10:
-                self._adapt_steplength()
-
-            self.iterations += 1
-
-
-    def _adapt_steplength(self):
-        if self.adapt_vb_steplength:
-            # self._adaptive_vb_steplength()
-            self._adaptive_vb_steplength_KL()
-            self._vb_steplength_trace.append(self.vb_steplength)
-            assert self.vb_steplength > 0
-
-        if self.adapt_param_steplength:
-            self._adaptive_param_steplength()
-            # self._adaptive_param_steplength_log()
-            # self._adaptive_param_steplength_from_vb()
-            self._param_steplength_trace.append(self.param_steplength)
-
-    def _adaptive_param_steplength(self):
-        decr_factor = 0.02
-        g_tp = self._transform_gradients(self._log_likelihood_gradients())
-        self.gbar_tp = (1-1/self.tau_tp)*self.gbar_tp + 1/self.tau_tp * g_tp
-        self.hbar_tp = (1-1/self.tau_tp)*self.hbar_tp + 1/self.tau_tp * np.dot(g_tp.T, g_tp)
-        new_param_steplength = np.dot(self.gbar_tp.T, self.gbar_tp) / self.hbar_tp
-        #- hack
-        new_param_steplength *= decr_factor
-        self.param_steplength = (self.param_steplength + new_param_steplength)/2
-        #-
-        self.tau_tp = self.tau_tp*(1-self.param_steplength) + 1
-
-    def _adaptive_param_steplength_log(self):
-        stp = np.logspace(np.log(0.0001), np.log(1e-6), base=np.e, num=18000)
-        self.param_steplength = stp[self.iterations]
-
-    def _adaptive_param_steplength_log2(self):
-        self.param_steplength = (self.iterations + 0.001)**-0.5
-
-    def _adaptive_param_steplength_from_vb(self):
-        self.param_steplength = self.vb_steplength * 0.01
-
-    def _adaptive_vb_steplength(self):
-        decr_factor = 0.1
-        g_t = self.vb_grad_natgrad()[0]
-        self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
-        self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
-        new_vb_steplength = np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t
-        #- hack
-        new_vb_steplength *= decr_factor
-        self.vb_steplength = (self.vb_steplength + new_vb_steplength)/2
-        #-
-        self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
-
-    def _adaptive_vb_steplength_KL(self):
-        decr_factor = 0.1
-        natgrad = self.vb_grad_natgrad()
-        g_t1 = natgrad[0]
-        g_t2 = natgrad[1]
-        self.gbar_t1 = (1-1/self.tau_t)*self.gbar_t1 + 1/self.tau_t * g_t1
-        self.gbar_t2 = (1-1/self.tau_t)*self.gbar_t2 + 1/self.tau_t * g_t2
-        self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t1.T, g_t2)
-        self.vb_steplength = np.dot(self.gbar_t1.T, self.gbar_t2) / self.hbar_t
-        self.vb_steplength *= decr_factor
-        self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
-
-    def _raw_predict(self, X_new, X_variance_new=None, which_parts='all',full_cov=False):
-        """Internal helper function for making predictions, does not account for normalization"""
-
-        #TODO: make this more efficient!
-        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
-        tmp = self.Kmmi- mdot(self.Kmmi,self.q_u_cov,self.Kmmi)
-
-        if X_variance_new is None:
-            Kx = self.kern.K(X_new,self.Z)
-            mu = np.dot(Kx,self.Kmmi_m)
-            if full_cov:
-                Kxx = self.kern.K(X_new)
-                var = Kxx - mdot(Kx,tmp,Kx.T)
-            else:
-                Kxx = self.kern.Kdiag(X_new)
-                var = (Kxx - np.sum(Kx*np.dot(Kx,tmp),1))[:,None]
-            return mu, var
-        else:
-            assert X_variance_new.shape == X_new.shape
-            Kx = self.kern.psi1(self.Z,X_new, X_variance_new)
-            mu = np.dot(Kx,self.Kmmi_m)
-            Kxx = self.kern.psi0(self.Z,X_new,X_variance_new)
-            psi2 = self.kern.psi2(self.Z,X_new,X_variance_new)
-            diag_var = Kxx - np.sum(np.sum(psi2*tmp[None,:,:],1),1)
-            if full_cov:
-                raise NotImplementedError
-            else:
-                return mu, diag_var[:,None]
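For reference, the variational distribution q(u) = N(m, S) that this class removes is stored through its canonical parameters, eta1 = S^-1 m and eta2 = -0.5 S^-1 (see set_vb_param further down in the deleted file). A minimal NumPy-only sketch of that conversion, for illustration only (names are mine, not part of the patch):

import numpy as np

def canonical_to_moments(eta1, eta2):
    # mirrors q_u_prec, q_u_cov and q_u_mean in SVIGP.set_vb_param
    prec = -2. * eta2
    cov = np.linalg.inv(prec)
    mean = cov.dot(eta1)
    return mean, cov

As the vb_grad_natgrad docstring notes (Hensman et al., 2012), the natural gradient with respect to the canonical parameters equals the ordinary gradient with respect to the expectation parameters, and vice versa, which is why the optimize loop can step directly along natgrads[0].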
-
-    def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
-        # normalize X values
-        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
-        if X_variance_new is not None:
-            X_variance_new = X_variance_new / self._Xscale ** 2
-
-        # here's the actual prediction by the GP model
-        mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
-
-        # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
-
-        return mean, var, _025pm, _975pm
-
-
-    def set_vb_param(self,vb_param):
-        """set the distribution q(u) from the canonical parameters"""
-        self.q_u_canonical_flat = vb_param.copy()
-        self.q_u_canonical = self.q_u_canonical_flat[:self.num_inducing*self.output_dim].reshape(self.num_inducing,self.output_dim),self.q_u_canonical_flat[self.num_inducing*self.output_dim:].reshape(self.num_inducing,self.num_inducing)
-
-        self.q_u_prec = -2.*self.q_u_canonical[1]
-        self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec)
-        self.q_u_Li = q_u_Li
-        self.q_u_logdet = -tmp
-        self.q_u_mean, _ = dpotrs(q_u_Li, np.asfortranarray(self.q_u_canonical[0]),lower=1)
-
-        self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov*self.output_dim)
-
-
-    def get_vb_param(self):
-        """
-        Return the canonical parameters of the distribution q(u)
-        """
-        return self.q_u_canonical_flat
-
-
-    def plot(self, *args, **kwargs):
-        """
-        See GPy.plotting.matplot_dep.svig_plots.plot
-        """
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from ..plotting.matplot_dep import svig_plots
-        svig_plots.plot(self,*args,**kwargs)
-
-
-    def plot_traces(self):
-        """
-        See GPy.plotting.matplot_dep.svig_plots.plot_traces
-        """
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from ..plotting.matplot_dep import svig_plots
-        svig_plots.plot_traces(self)
diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py
index c575bb33..93994175 100644
--- a/GPy/examples/__init__.py
+++ b/GPy/examples/__init__.py
@@ -4,6 +4,4 @@
 import classification
 import regression
 import dimensionality_reduction
-import tutorials
-import stochastic
 import non_gaussian
diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index cf71be24..b9d488d6 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -28,13 +28,13 @@ def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
     m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, num_inducing=num_inducing)
 
     # Contrain all parameters to be positive
-    m.tie_params('.*len')
+    #m.tie_params('.*len')
     m['.*len'] = 10.
-    m.update_likelihood_approximation()
 
     # Optimize
     if optimize:
-        m.optimize(max_iters=max_iters)
+        for _ in range(5):
+            m.optimize(max_iters=int(max_iters/5))
     print(m)
 
     #Test
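The updated examples drop the explicit update_likelihood_approximation() call and instead run several short optimization bouts, letting the EP approximation be refreshed between runs. A minimal sketch of the same pattern; the synthetic data, kernel and iteration counts here are illustrative assumptions, not taken from the repository:

import numpy as np
import GPy

X = np.random.rand(100, 1)
Y = (np.sin(6. * X) > 0.).astype(float)
m = GPy.models.SparseGPClassification(X, Y, kernel=GPy.kern.RBF(1), num_inducing=10)
max_iters = 100
for _ in range(5):
    m.optimize(max_iters=int(max_iters / 5))   # several short bouts, as in the patched examples
print(m)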
@@ -150,7 +150,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
     print m
     return m
 
-def toy_heaviside(seed=default_seed, optimize=True, plot=True):
+def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
     """
     Simple 1D classification example using a heavy side gp transformation
 
@@ -166,16 +166,18 @@ def toy_heaviside(seed=default_seed, optimize=True, plot=True):
     Y[Y.flatten() == -1] = 0
 
     # Model definition
-    noise_model = GPy.likelihoods.bernoulli(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
-    likelihood = GPy.likelihoods.EP(Y, noise_model)
-    m = GPy.models.GPClassification(data['X'], likelihood=likelihood)
+    kernel = GPy.kern.RBF(1)
+    likelihood = GPy.likelihoods.Bernoulli(gp_link=GPy.likelihoods.link_functions.Heaviside())
+    ep = GPy.inference.latent_function_inference.expectation_propagation.EP()
+    m = GPy.core.GP(X=data['X'], Y=Y, kernel=kernel, likelihood=likelihood, inference_method=ep, name='gp_classification_heaviside')
+    #m = GPy.models.GPClassification(data['X'], likelihood=likelihood)
 
     # Optimize
     if optimize:
-        m.update_likelihood_approximation()
         # Parameters optimization:
-        m.optimize()
-        #m.pseudo_EM()
+        for _ in range(5):
+            m.optimize(max_iters=int(max_iters/5))
+    print m
 
     # Plot
     if plot:
diff --git a/GPy/examples/non_gaussian.py b/GPy/examples/non_gaussian.py
index 1e2be93b..57c841e4 100644
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@@ -59,7 +59,7 @@ def student_t_approx(optimize=True, plot=True):
     t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
     laplace_inf = GPy.inference.latent_function_inference.Laplace()
     m3 = GPy.core.GP(X, Y.copy(), kernel3, likelihood=t_distribution, inference_method=laplace_inf)
-    m3['.*t_noise'].constrain_bounded(1e-6, 10.)
+    m3['.*t_scale2'].constrain_bounded(1e-6, 10.)
     m3['.*white'].constrain_fixed(1e-5)
     m3.randomize()
 
@@ -67,7 +67,7 @@ def student_t_approx(optimize=True, plot=True):
     t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
     laplace_inf = GPy.inference.latent_function_inference.Laplace()
     m4 = GPy.core.GP(X, Yc.copy(), kernel4, likelihood=t_distribution, inference_method=laplace_inf)
-    m4['.*t_noise'].constrain_bounded(1e-6, 10.)
+    m4['.*t_scale2'].constrain_bounded(1e-6, 10.)
     m4['.*white'].constrain_fixed(1e-5)
     m4.randomize()
     print m4
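These hunks track the rename of the Student-t likelihood's scale parameter from t_noise to t_scale2. A small sketch of the construction pattern they modify (explicit likelihood and inference objects passed to GPy.core.GP); the toy data and deg_free value are assumptions for illustration:

import numpy as np
import GPy

X = np.linspace(0., 1., 40)[:, None]
Y = np.sin(2. * np.pi * X) + 0.05 * np.random.randn(40, 1)
t_distribution = GPy.likelihoods.StudentT(deg_free=5, sigma2=0.05)
laplace_inf = GPy.inference.latent_function_inference.Laplace()
m = GPy.core.GP(X, Y, GPy.kern.RBF(1), likelihood=t_distribution, inference_method=laplace_inf)
m['.*t_scale2'].constrain_bounded(1e-6, 10.)   # new parameter name
m.optimize()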
@@ -124,6 +124,7 @@ def student_t_approx(optimize=True, plot=True):
     return m1, m2, m3, m4
 
 def boston_example(optimize=True, plot=True):
+    raise NotImplementedError("Needs updating")
     import sklearn
     from sklearn.cross_validation import KFold
     optimizer='bfgs'
@@ -152,8 +153,8 @@ def boston_example(optimize=True, plot=True):
         noise = 1e-1 #np.exp(-2)
         rbf_len = 0.5
         data_axis_plot = 4
-        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
-        kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+        kernelstu = GPy.kern.RBF(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+        kernelgp = GPy.kern.RBF(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
 
         #Baseline
         score_folds[0, n] = rmse(Y_test, np.mean(Y_train))
@@ -162,8 +163,8 @@ def boston_example(optimize=True, plot=True):
         print "Gauss GP"
         mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
         mgp.constrain_fixed('.*white', 1e-5)
-        mgp['rbf_len'] = rbf_len
-        mgp['noise'] = noise
+        mgp['.*len'] = rbf_len
+        mgp['.*noise'] = noise
         print mgp
         if optimize:
             mgp.optimize(optimizer=optimizer, messages=messages)
@@ -198,9 +199,9 @@ def boston_example(optimize=True, plot=True):
         stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
         mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
         mstu_t.constrain_fixed('.*white', 1e-5)
-        mstu_t.constrain_bounded('.*t_noise', 0.0001, 1000)
+        mstu_t.constrain_bounded('.*t_scale2', 0.0001, 1000)
         mstu_t['rbf_len'] = rbf_len
-        mstu_t['.*t_noise'] = noise
+        mstu_t['.*t_scale2'] = noise
         print mstu_t
         if optimize:
             mstu_t.optimize(optimizer=optimizer, messages=messages)
diff --git a/GPy/examples/stochastic.py b/GPy/examples/stochastic.py
deleted file mode 100644
index cc365cae..00000000
--- a/GPy/examples/stochastic.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-try:
-    import pylab as pb
-except:
-    pass
-import numpy as np
-import GPy
-
-def toy_1d(optimize=True, plot=True):
-    N = 2000
-    M = 20
-
-    #create data
-    X = np.linspace(0,32,N)[:,None]
-    Z = np.linspace(0,32,M)[:,None]
-    Y = np.sin(X) + np.cos(0.3*X) + np.random.randn(*X.shape)/np.sqrt(50.)
-
-    m = GPy.models.SVIGPRegression(X,Y, batchsize=10, Z=Z)
-    m.constrain_bounded('noise_variance',1e-3,1e-1)
-    m.constrain_bounded('white_variance',1e-3,1e-1)
-
-    m.param_steplength = 1e-4
-
-    if plot:
-        fig = pb.figure()
-        ax = fig.add_subplot(111)
-        def cb(foo):
-            ax.cla()
-            m.plot(ax=ax,Z_height=-3)
-            ax.set_ylim(-3,3)
-            fig.canvas.draw()
-
-    if optimize:
-        m.optimize(500, callback=cb, callback_interval=1)
-
-    if plot:
-        m.plot_traces()
-    return m
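In the boston_example hunks above, exact parameter names (mgp['rbf_len'], mgp['noise']) give way to regex matches on parameter names. A minimal sketch of that indexing style; the model, kernel and values are illustrative assumptions rather than code from the repository:

import numpy as np
import GPy

X = np.random.rand(30, 2)
Y = np.sin(X[:, :1]) + 0.1 * np.random.randn(30, 1)
m = GPy.models.GPRegression(X, Y, GPy.kern.RBF(2))
m['.*len'] = 0.5     # matches every lengthscale parameter by regex
m['.*noise'] = 0.1   # matches the Gaussian noise variance
print(m)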
diff --git a/GPy/examples/tutorials.py b/GPy/examples/tutorials.py
deleted file mode 100644
index b3afd02b..00000000
--- a/GPy/examples/tutorials.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-"""
-Code of Tutorials
-"""
-
-try:
-    import pylab as pb
-    pb.ion()
-except:
-    pass
-import numpy as np
-import GPy
-
-def tuto_GP_regression(optimize=True, plot=True):
-    """The detailed explanations of the commands used in this file can be found in the tutorial section"""
-
-    X = np.random.uniform(-3.,3.,(20,1))
-    Y = np.sin(X) + np.random.randn(20,1)*0.05
-
-    kernel = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
-
-    m = GPy.models.GPRegression(X, Y, kernel)
-
-    print m
-    if plot:
-        m.plot()
-
-    m.constrain_positive('')
-
-    m.unconstrain('') # may be used to remove the previous constrains
-    m.constrain_positive('.*rbf_variance')
-    m.constrain_bounded('.*lengthscale',1.,10. )
-    m.constrain_fixed('.*noise',0.0025)
-
-    if optimize:
-        m.optimize()
-        m.optimize_restarts(num_restarts = 10)
-
-    #######################################################
-    #######################################################
-    # sample inputs and outputs
-    X = np.random.uniform(-3.,3.,(50,2))
-    Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(50,1)*0.05
-
-    # define kernel
-    ker = GPy.kern.Matern52(2,ARD=True) + GPy.kern.white(2)
-
-    # create simple GP model
-    m = GPy.models.GPRegression(X, Y, ker)
-
-    # contrain all parameters to be positive
-    m.constrain_positive('')
-
-    # optimize and plot
-    if optimize:
-        m.optimize('tnc', max_f_eval = 1000)
-    if plot:
-        m.plot()
-
-    print m
-    return(m)
-
-def tuto_kernel_overview(optimize=True, plot=True):
-    """The detailed explanations of the commands used in this file can be found in the tutorial section"""
-    ker1 = GPy.kern.rbf(1) # Equivalent to ker1 = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
-    ker2 = GPy.kern.rbf(input_dim=1, variance = .75, lengthscale=2.)
-    ker3 = GPy.kern.rbf(1, .5, .5)
-
-    print ker2
-
-    if plot:
-        ker1.plot()
-        ker2.plot()
-        ker3.plot()
-
-    k1 = GPy.kern.rbf(1,1.,2.)
-    k2 = GPy.kern.Matern32(1, 0.5, 0.2)
-
-    # Product of kernels
-    k_prod = k1.prod(k2) # By default, tensor=False
-    k_prodtens = k1.prod(k2,tensor=True)
-
-    # Sum of kernels
-    k_add = k1.add(k2) # By default, tensor=False
-    k_addtens = k1.add(k2,tensor=True)
-
-    k1 = GPy.kern.rbf(1,1.,2)
-    k2 = GPy.kern.periodic_Matern52(1,variance=1e3, lengthscale=1, period = 1.5, lower=-5., upper = 5)
-
-    k = k1 * k2 # equivalent to k = k1.prod(k2)
-    print k
-
-    # Simulate sample paths
-    X = np.linspace(-5,5,501)[:,None]
-    Y = np.random.multivariate_normal(np.zeros(501),k.K(X),1)
-
-    k1 = GPy.kern.rbf(1)
-    k2 = GPy.kern.Matern32(1)
-    k3 = GPy.kern.white(1)
-
-    k = k1 + k2 + k3
-    print k
-
-    k.constrain_positive('.*var')
-    k.constrain_fixed(np.array([1]),1.75)
-    k.tie_params('.*len')
-    k.unconstrain('white')
-    k.constrain_bounded('white',lower=1e-5,upper=.5)
-    print k
-
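The deleted tutorial predates the renamed kernel classes used elsewhere in this patch (GPy.kern.RBF, GPy.kern.Bias, as in model_interaction below). A rough sketch of the same add/multiply kernel arithmetic under the current class names; the parameter values are illustrative assumptions:

import GPy

k1 = GPy.kern.RBF(1, variance=1., lengthscale=2.)
k2 = GPy.kern.Matern32(1)
k3 = GPy.kern.Bias(1)

k_sum = k1 + k2 + k3   # additive kernel
k_prod = k1 * k2       # product kernel
print(k_sum)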
-    k_cst = GPy.kern.bias(1,variance=1.)
-    k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3)
-    Kanova = (k_cst + k_mat).prod(k_cst + k_mat,tensor=True)
-    print Kanova
-
-    # sample inputs and outputs
-    X = np.random.uniform(-3.,3.,(40,2))
-    Y = 0.5*X[:,:1] + 0.5*X[:,1:] + 2*np.sin(X[:,:1]) * np.sin(X[:,1:])
-
-    # Create GP regression model
-    m = GPy.models.GPRegression(X, Y, Kanova)
-
-    if plot:
-        fig = pb.figure(figsize=(5,5))
-        ax = fig.add_subplot(111)
-        m.plot(ax=ax)
-
-        pb.figure(figsize=(20,3))
-        pb.subplots_adjust(wspace=0.5)
-        axs = pb.subplot(1,5,1)
-        m.plot(ax=axs)
-        pb.subplot(1,5,2)
-        pb.ylabel("= ",rotation='horizontal',fontsize='30')
-        axs = pb.subplot(1,5,3)
-        m.plot(ax=axs, which_parts=[False,True,False,False])
-        pb.ylabel("cst +",rotation='horizontal',fontsize='30')
-        axs = pb.subplot(1,5,4)
-        m.plot(ax=axs, which_parts=[False,False,True,False])
-        pb.ylabel("+ ",rotation='horizontal',fontsize='30')
-        axs = pb.subplot(1,5,5)
-        pb.ylabel("+ ",rotation='horizontal',fontsize='30')
-        m.plot(ax=axs, which_parts=[False,False,False,True])
-
-    return(m)
-
-
-def model_interaction(optimize=True, plot=True):
-    X = np.random.randn(20,1)
-    Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5.
-    k = GPy.kern.RBF(1) + GPy.kern.Bias(1)
-    m = GPy.models.GPRegression(X, Y, kernel=k)
-    return m
-
diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index cce26783..aa3a6478 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -4,7 +4,6 @@
 from gp_regression import GPRegression
 from gp_classification import GPClassification
 from sparse_gp_regression import SparseGPRegression, SparseGPRegressionUncertainInput
-from svigp_regression import SVIGPRegression
 from sparse_gp_classification import SparseGPClassification
 from gplvm import GPLVM
 from bcgplvm import BCGPLVM
diff --git a/GPy/models/svigp_regression.py b/GPy/models/svigp_regression.py
deleted file mode 100644
index 3397e31e..00000000
--- a/GPy/models/svigp_regression.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2012, James Hensman
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-from ..core import SVIGP
-from .. import likelihoods
-from .. import kern
-
-class SVIGPRegression(SVIGP):
-    """
-    Gaussian Process model for regression
-
-    This is a thin wrapper around the SVIGP class, with a set of sensible defalts
-
-    :param X: input observations
-    :param Y: observed values
-    :param kernel: a GPy kernel, defaults to rbf+white
-    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_X: False|True
-    :param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_Y: False|True
-    :rtype: model object
-
-    .. Note:: Multiple independent outputs are allowed using columns of Y
-
-    """
-
-    def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, q_u=None, batchsize=10, normalize_Y=False):
-        # kern defaults to rbf (plus white for stability)
-        if kernel is None:
-            kernel = kern.rbf(X.shape[1], variance=1., lengthscale=4.) + kern.white(X.shape[1], 1e-3)
-
-        # Z defaults to a subset of the data
-        if Z is None:
-            i = np.random.permutation(X.shape[0])[:num_inducing]
-            Z = X[i].copy()
-        else:
-            assert Z.shape[1] == X.shape[1]
-
-        # likelihood defaults to Gaussian
-        likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
-
-        SVIGP.__init__(self, X, likelihood, kernel, Z, q_u=q_u, batchsize=batchsize)
-        self.load_batch()
-
diff --git a/GPy/plotting/matplot_dep/__init__.py b/GPy/plotting/matplot_dep/__init__.py
index f493513a..4c4402ce 100644
--- a/GPy/plotting/matplot_dep/__init__.py
+++ b/GPy/plotting/matplot_dep/__init__.py
@@ -6,7 +6,6 @@
 import models_plots
 import priors_plots
 import variational_plots
 import kernel_plots
-import svig_plots
 import dim_reduction_plots
 import mapping_plots
 import Tango
diff --git a/GPy/testing/examples_tests.py b/GPy/testing/examples_tests.py
index c468a0b0..be26fff6 100644
--- a/GPy/testing/examples_tests.py
+++ b/GPy/testing/examples_tests.py
@@ -36,7 +36,7 @@ def flatten_nested(lst):
            result.append(element)
    return result
 
-#@nottest
+@nottest
def test_models():
    optimize=False
    plot=True
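The deleted SVIGPRegression wrapper chose its inducing inputs as a random subset of the training rows when Z was not supplied, so that default is easy to reproduce by hand. A standalone NumPy-only sketch (the function name is mine, for illustration):

import numpy as np

def default_inducing_inputs(X, num_inducing=10):
    # mirror the removed wrapper: pick a random subset of the rows of X as Z
    idx = np.random.permutation(X.shape[0])[:num_inducing]
    return X[idx].copy()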