diff --git a/GPy/models/uncollapsed_sparse_GP.py b/GPy/models/uncollapsed_sparse_GP.py
deleted file mode 100644
index d2638784..00000000
--- a/GPy/models/uncollapsed_sparse_GP.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright (c) 2012 James Hensman
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-import pylab as pb
-from ..util.linalg import mdot, jitchol, chol_inv, pdinv
-from .. import kern
-from ..likelihoods import likelihood
-from sparse_GP import sparse_GP
-
-class uncollapsed_sparse_GP(sparse_GP):
-    """
-    Variational sparse GP model (Regression), where the approximating distribution q(u) is represented explicitly
-
-    :param X: inputs
-    :type X: np.ndarray (N x Q)
-    :param likelihood: GPy likelihood class, containing observed data
-    :param q_u: canonical parameters of the distribution squasehd into a 1D array
-    :type q_u: np.ndarray
-    :param kernel : the kernel/covariance function. See link kernels
-    :type kernel: a GPy kernel
-    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (M x Q) | None
-    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
-    :param normalize_X : whether to normalize the data before computing (predictions will be in original scales)
-    :type normalize_X: bool
-    """
-
-    def __init__(self, X, likelihood, kernel, Z, q_u=None, **kwargs):
-        self.M = Z.shape[0]
-        if q_u is None:
-            q_u = np.hstack((np.random.randn(self.M*likelihood.D),-0.5*np.eye(self.M).flatten()))
-            self.likelihood = likelihood
-        self.set_vb_param(q_u)
-        sparse_GP.__init__(self, X, likelihood, kernel, Z, **kwargs)
-
-    def _computations(self):
-        # kernel computations, using BGPLVM notation
-        self.Kmm = self.kern.K(self.Z)
-        if self.has_uncertain_inputs:
-            raise NotImplementedError
-        else:
-            self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)
-            self.psi1 = self.kern.K(self.Z,self.X)
-            if self.likelihood.is_heteroscedastic:
-                raise NotImplementedError
-            else:
-                tmp = self.psi1*(np.sqrt(self.likelihood.precision)/sf)
-            self.psi2_beta_scaled = np.dot(tmp,tmp.T)
-            self.psi2 = self.psi1.T[:,:,None]*self.psi1.T[:,None,:]
-
-
-        self.V = self.likelihood.precision*self.Y
-        self.VmT = np.dot(self.V,self.q_u_expectation[0].T)
-        self.psi1V = np.dot(self.psi1, self.V)
-        self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
-        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
-        self.A = mdot(self.Lmi, self.beta*self.psi2, self.Lmi.T)
-        self.B = np.eye(self.M) + self.A
-        self.Lambda = mdot(self.Lmi.T,self.B,self.Lmi)
-        self.trace_K = self.psi0 - np.trace(self.A)/self.beta
-        self.projected_mean = mdot(self.psi1.T,self.Kmmi,self.q_u_expectation[0])
-
-        # Compute dL_dpsi
-        self.dL_dpsi0 = - 0.5 * self.likelihood.D * self.beta * np.ones(self.N)
-        self.dL_dpsi1 = np.dot(self.VmT,self.Kmmi).T # This is the correct term for E I think...
-        self.dL_dpsi2 = 0.5 * self.beta * self.likelihood.D * (self.Kmmi - mdot(self.Kmmi,self.q_u_expectation[1],self.Kmmi))
-
-        # Compute dL_dKmm
-        tmp = self.beta*mdot(self.psi2,self.Kmmi,self.q_u_expectation[1]) -np.dot(self.q_u_expectation[0],self.psi1V.T)
-        tmp += tmp.T
-        tmp += self.likelihood.D*(-self.beta*self.psi2 - self.Kmm + self.q_u_expectation[1])
-        self.dL_dKmm = 0.5*mdot(self.Kmmi,tmp,self.Kmmi)
-
-        #Compute the gradient of the log likelihood wrt noise variance
-        #TODO: suport heteroscedatic noise
-        dbeta =   0.5 * self.N*self.likelihood.D/self.beta
-        dbeta += - 0.5 * self.likelihood.D * self.trace_K
-        dbeta += - 0.5 * self.likelihood.D * np.sum(self.q_u_expectation[1]*mdot(self.Kmmi,self.psi2,self.Kmmi))
-        dbeta += - 0.5 * self.trYYT
-        dbeta += np.sum(np.dot(self.Y.T,self.projected_mean))
-        self.partial_for_likelihood = -dbeta*self.likelihood.precision**2
-
-    def log_likelihood(self):
-        """
-        Compute the (lower bound on the) log marginal likelihood
-        """
-        A = -0.5*self.N*self.likelihood.D*(np.log(2.*np.pi) - np.log(self.beta))
-        B = -0.5*self.beta*self.likelihood.D*self.trace_K
-        C = -0.5*self.likelihood.D *(self.Kmm_logdet-self.q_u_logdet + np.sum(self.Lambda * self.q_u_expectation[1]) - self.M)
-        D = -0.5*self.beta*self.trYYT
-        E = np.sum(np.dot(self.V.T,self.projected_mean))
-        return A+B+C+D+E
-
-    def _raw_predict(self, Xnew, slices,full_cov=False):
-        """Internal helper function for making predictions, does not account for normalization"""
-        Kx = self.kern.K(Xnew,self.Z)
-        mu = mdot(Kx,self.Kmmi,self.q_u_expectation[0])
-
-        tmp = self.Kmmi- mdot(self.Kmmi,self.q_u_cov,self.Kmmi)
-        if full_cov:
-            Kxx = self.kern.K(Xnew)
-            var = Kxx - mdot(Kx,tmp,Kx.T)
-        else:
-            Kxx = self.kern.Kdiag(Xnew)
-            var = (Kxx - np.sum(Kx*np.dot(Kx,tmp),1))[:,None]
-        return mu,var
-
-
-    def set_vb_param(self,vb_param):
-        """set the distribution q(u) from the canonical parameters"""
-        self.q_u_prec = -2.*vb_param[-self.M**2:].reshape(self.M, self.M)
-        self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec)
-        self.q_u_logdet = -tmp
-        self.q_u_mean = np.dot(self.q_u_cov,vb_param[:self.M*self.likelihood.D].reshape(self.M,self.likelihood.D))
-
-        self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov*self.likelihood.D)
-
-        self.q_u_canonical = (np.dot(self.q_u_prec, self.q_u_mean),-0.5*self.q_u_prec)
-        #TODO: computations now?
-
-    def get_vb_param(self):
-        """
-        Return the canonical parameters of the distribution q(u)
-        """
-        return np.hstack([e.flatten() for e in self.q_u_canonical])
-
-    def vb_grad_natgrad(self):
-        """
-        Compute the gradients of the lower bound wrt the canonical and
-        Expectation parameters of u.
-
-        Note that the natural gradient in either is given by the gradient in the other (See Hensman et al 2012 Fast Variational inference in the conjugate exponential Family)
-        """
-        dL_dmmT_S = -0.5*self.Lambda-self.q_u_canonical[1]
-        dL_dm = np.dot(self.Kmmi,self.psi1V) - np.dot(self.Lambda,self.q_u_mean)
-
-        #dL_dSim =
-        #dL_dmhSi =
-
-        return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten()))  # natgrad only, grad TODO
-
-
-    def plot(self, *args, **kwargs):
-        """
-        add the distribution q(u) to the plot from sparse_GP
-        """
-        sparse_GP.plot(self,*args,**kwargs)
-        if self.Q==1:
-            pb.errorbar(self.Z[:,0],self.q_u_expectation[0][:,0],yerr=2.*np.sqrt(np.diag(self.q_u_cov)),fmt=None,ecolor='b')
-