diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 060b617a..031ed16e 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -55,9 +55,9 @@ class GP(Model):
         self.add_parameter(self.kern)
         self.add_parameter(self.likelihood)
 
-        self.parameters_changed()
+    def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y)
         self._dL_dK = grad_dict['dL_dK']
 
@@ -65,9 +65,6 @@ class GP(Model):
     def log_likelihood(self):
         return self._log_marginal_likelihood
 
-    def dL_dtheta_K(self):
-        return self.kern.dK_dtheta(self.posterior.dL_dK, self.X)
-
     def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
         """
         Internal helper function for making predictions, does not account
diff --git a/GPy/core/model.py b/GPy/core/model.py
index f4de0405..dc108641 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -325,7 +325,6 @@ class Model(Parameterized):
             if self._fail_count >= self._allowed_failures:
                 raise e
             self._fail_count += 1
-            import ipdb;ipdb.set_trace()
         obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
         return obj_grads
 
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 5a04fbfd..c2025202 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -74,6 +74,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         self.size = sum(p.size for p in self._parameters_)
         if not self._has_fixes():
             self._fixes_ = None
+        self._param_slices_ = []
         self._connect_parameters()
         self._added_names_ = set()
         del self._in_init_
@@ -213,7 +214,6 @@ class Parameterized(Constrainable, Pickleable, Observable):
             return
         i = 0
         sizes = [0]
-        self._param_slices_ = []
        for p in self._parameters_:
             p._direct_parent_ = self
             p._highest_parent_ = self
@@ -315,7 +315,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
         return n
 
     def _get_params(self): # don't overwrite this anymore!
-        return numpy.hstack([x._get_params() for x in self._parameters_])
+        return numpy.hstack([x._get_params() for x in self._parameters_ if x.size>0])
 
     def _set_params(self, params, update=True): # don't overwrite this anymore!
         [p._set_params(params[s], update=update) for p,s in itertools.izip(self._parameters_,self._param_slices_)]
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index ab1f3bf0..089f2b38 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -42,8 +42,8 @@ class SparseGP(GP):
                 raise NotImplementedError, "what to do what to do?"
print "defaulting to ", inference_method, "for latent function inference" + self.Z = Param('inducing inputs', Z) - self.Z = Z self.num_inducing = Z.shape[0] if not (X_variance is None): @@ -52,10 +52,8 @@ class SparseGP(GP): GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name) - self.Z = Param('inducing inputs', self.Z) self.add_parameter(self.Z, index=0) - self.add_parameter(self.kern) - self.add_parameter(self.likelihood) + def parameters_changed(self): self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py index a6d3c9b5..9a66d3b6 100644 --- a/GPy/inference/latent_function_inference/laplace.py +++ b/GPy/inference/latent_function_inference/laplace.py @@ -11,7 +11,8 @@ #http://gaussianprocess.org/gpml/code. import numpy as np -from ...util.linalg import mdot, jitchol, pddet, dpotrs +from ...util.linalg import mdot, jitchol, pddet, dpotrs, dtrtrs +from ...util.misc import param_to_array from functools import partial as partial_func from posterior import Posterior import warnings @@ -27,7 +28,6 @@ class LaplaceInference(object): (using Newton-Raphson) of the unnormalised posterior """ - self.NORMAL_CONST = (0.5 * np.log(2 * np.pi)) self._mode_finding_tolerance = 1e-7 self._mode_finding_max_iter = 40 @@ -39,58 +39,133 @@ class LaplaceInference(object): Returns a Posterior class containing essential quantities of the posterior """ + #make Y a normal array! + Y = param_to_array(Y) + # Compute K K = kern.K(X) #Find mode if self.bad_fhat: - Ki_f_init = np.random.randn(*Y.shape)/50 + Ki_f_init = np.zeros_like(Y) else: Ki_f_init = self._previous_Ki_fhat - self.f_hat, self._previous_Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata) - - stop + f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata) #Compute hessian and other variables at mode - self._compute_likelihood_variables() + log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, Y_metadata) - likelihood.gradient = self.likelihood_gradients() - dL_dK = self._Kgradients() - kern.update_gradients_full(dL_dK) + #likelihood.gradient = self.likelihood_gradients() + kern.update_gradients_full(dL_dK, X) - return Posterior(mean=self.f_hat, cov=self.Sigma, K=self.K), log_marginal_approx, {'dL_dK':dL_dK} + self._previous_Ki_fhat = Ki_fhat.copy() + return Posterior(woodbury_vector=woodbury_vector, woodbury_inv = K_Wi_i, K=K), log_marginal, {'dL_dK':dL_dK} - def _shared_gradients_components(self): + def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None): """ - A helper function to compute some common quantities - """ - d3lik_d3fhat = likelihood.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data) - dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5? 
-        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
-        return dL_dfhat, I_KW_i
+        Rasmussen's numerically stable mode finding
+        For nomenclature see Rasmussen & Williams 2006
+        Influenced by GPML (BSD) code, all errors are our own
 
-    def _Kgradients(self):
+        :param K: Covariance matrix evaluated at locations X
+        :type K: NxD matrix
+        :param Y: The data
+        :type Y: np.ndarray
+        :param likelihood: the likelihood of the latent function value for the given data
+        :type likelihood: a GPy.likelihood object
+        :param Ki_f_init: the initial guess at the mode
+        :type Ki_f_init: np.ndarray
+        :param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
+        :type Y_metadata: np.ndarray | None
+        :returns: f_hat, mode on which to make laplace approximation
+        :rtype: np.ndarray
         """
-        Gradients with respect to prior kernel parameters dL_dK to be chained
-        with dK_dthetaK to give dL_dthetaK
-        :returns: dL_dK matrix
-        :rtype: Matrix (1 x num_kernel_params)
-        """
-        dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlp = likelihood.dlogpdf_df(self.f_hat, Y, extra_data=None) # TODO: how will extra data work?
-        #Explicit
-        expl_a = np.dot(self.Ki_f, self.Ki_f.T)
-        expl_b = self.Wi_K_i
-        expl = 0.5*expl_a - 0.5*expl_b
-        dL_dthetaK_exp = dK_dthetaK(expl, X)
+        Ki_f = Ki_f_init.copy()
+        f = np.dot(K, Ki_f)
+
+
+        #define the objective function (to be maximised)
+        def obj(Ki_f, f):
+            return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + likelihood.logpdf(f, Y, extra_data=Y_metadata)
+
+        difference = np.inf
+        iteration = 0
+        while difference > self._mode_finding_tolerance and iteration < self._mode_finding_max_iter:
+            W = -likelihood.d2logpdf_df2(f, Y, extra_data=Y_metadata)
+
+            W_f = W*f
+            grad = likelihood.dlogpdf_df(f, Y, extra_data=Y_metadata)
+
+            b = W_f + grad # R+W p46 line 6.
+            #W12BiW12Kb, B_logdet = self._compute_B_statistics(K, W.copy(), np.dot(K, b), likelihood.log_concave)
+            W12BiW12, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave)
+            W12BiW12Kb = np.dot(W12BiW12, np.dot(K, b))
+
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            full_step_Ki_f = b - W12BiW12Kb # full_step_Ki_f = a in R&W p46 line 6.
+            dKi_f = full_step_Ki_f - Ki_f
+
+            #define an objective for the line search (minimize this one)
+            def inner_obj(step_size):
+                Ki_f_trial = Ki_f + step_size*dKi_f
+                f_trial = np.dot(K, Ki_f_trial)
+                return -obj(Ki_f_trial, f_trial)
+
+            #use scipy for the line search, then compute new values of f, Ki_f
+            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
+            Ki_f_new = Ki_f + step*dKi_f
+            f_new = np.dot(K, Ki_f_new)
+
+            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
+            Ki_f = Ki_f_new
+            f = f_new
+            iteration += 1
+
+        #Warn of bad fits
+        if difference > self._mode_finding_tolerance:
+            if not self.bad_fhat:
+                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
+                self.bad_fhat = True
+        elif self.bad_fhat:
+            self.bad_fhat = False
+            warnings.warn("f_hat now fine again")
+
+        return f, Ki_f
+
+
+    def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, Y_metadata):
+        """
+        At the mode, compute the hessian and effective covariance matrix.
+
+        returns: logZ : approximation to the marginal likelihood
+                 Cov : the approximation to the covariance matrix
+        """
+        #At this point get the hessian matrix (or vector as W is diagonal)
+        W = -likelihood.d2logpdf_df2(f_hat, Y, extra_data=Y_metadata)
+
+        K_Wi_i, L, LiW12 = self._compute_B_statistics(K, W, likelihood.log_concave)
+
+        #compute vital matrices
+        C = np.dot(LiW12, K)
+        Ki_W_i = K - C.T.dot(C)
+
+        #compute the log marginal
+        log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + likelihood.logpdf(f_hat, Y, extra_data=Y_metadata) - np.sum(np.log(np.diag(L)))
+
+        #compute dL_dK
+        explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
 
         #Implicit
-        impl = mdot(dlp, dL_dfhat, I_KW_i)
+        d3lik_d3fhat = likelihood.d3logpdf_df3(f_hat, Y, extra_data=Y_metadata)
+        dL_dfhat = 0.5*(np.diag(Ki_W_i)[:, None]*d3lik_d3fhat) #why isn't this -0.5? s2 in R&W p126 line 9.
+        woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, extra_data=Y_metadata)
+        implicit_part = np.dot(woodbury_vector, dL_dfhat.T).dot(np.eye(Y.shape[0]) - np.dot(K, K_Wi_i))
 
-        dL_dK = expl + impl
+        dL_dK = explicit_part + implicit_part
+
+        return log_marginal, Ki_W_i, K_Wi_i, dL_dK, woodbury_vector
 
-        return dL_dK
 
     def likelihood_gradients(self):
         """
@@ -118,24 +193,7 @@ class LaplaceInference(object):
 
         return dL_dthetaL
 
-    def _compute_likelihood_variables(self):
-        """
-        At the mode, compute the hessian and effective covaraince matrix.
-        """
-        #At this point get the hessian matrix (or vector as W is diagonal)
-        self.W = -likelihood.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
-
-        if not likelihood.log_concave:
-            #print "Under 1e-10: {}".format(np.sum(self.W < 1e-6))
-            self.W[self.W < 1e-6] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-
-        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N), likelihood.log_concave)
-
-        self.Ki_f = self.Ki_f
-        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
-        self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
-
-    def _compute_B_statistics(self, K, W, a, log_concave):
+    def _compute_B_statistics(self, K, W, log_concave):
         """
         Rasmussen suggests the use of a numerically stable positive definite matrix B
         Which has a positive diagonal element and can be easyily inverted
@@ -144,9 +202,7 @@ class LaplaceInference(object):
         :type K: NxN matrix
         :param W: Negative hessian at a point (diagonal matrix)
         :type W: Vector of diagonal values of hessian (1xN)
-        :param a: Matrix to calculate W12BiW12a
-        :type a: Matrix NxN
-        :returns: (W12BiW12, ln_B_det)
+        :returns: (W12BiW12, L_B, Li_W12)
         """
         if not log_concave:
             #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
@@ -155,169 +211,13 @@ class LaplaceInference(object):
             # To cause the posterior to become less certain than the prior and likelihood,
             # This is a property only held by non-log-concave likelihoods
 
-        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
         B = np.eye(K.shape[0]) + W_12*K*W_12.T
         L = jitchol(B)
 
-        W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
-        ln_B_det = 2.*np.sum(np.log(np.diag(L)))
-        return W12BiW12a, ln_B_det
-
-    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
-        """
-        Rasmussen's numerically stable mode finding
-        For nomenclature see Rasmussen & Williams 2006
-        Influenced by GPML (BSD) code, all errors are our own
-
-        :param K: Covariance matrix evaluated at locations X
-        :type K: NxD matrix
-        :param Y: The data
-        :type Y: np.ndarray
-        :param likelihood: the likelihood of the latent function value for the given data
-        :type likelihood: a GPy.likelihood object
-        :param Ki_f_init: the initial guess at the mode
-        :type Ki_f_init: np.ndarray
-        :param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
-        :type Y_metadata: np.ndarray | None
-        :returns: f_hat, mode on which to make laplace approxmiation
-        :rtype: np.ndarray
-        """
-
-        ##Start f's at zero originally or if we have gone off track, try restarting
-        #if self.old_Ki_f is None or self.bad_fhat:
-            #old_Ki_f = np.random.rand(self.N, 1)/50.0
-            ##old_Ki_f = self.Y
-            #f = np.dot(K, old_Ki_f)
-        #else:
-            ##Start at the old best point
-            #old_Ki_f = self.old_Ki_f.copy()
-            #f = self.f_hat.copy()
-
-        Ki_f = Ki_f_init.copy()
-        f = np.dot(K, Ki_f)
-
-
-        #define the objective function (to be maximised)
-        def obj(Ki_f, f):
-            return -0.5*np.dot(Ki_f.T, f) + likelihood.logpdf(f, Y, extra_data=Y_metadata)
-
-        difference = np.inf
-        i = 0
-        while difference > self._mode_finding_tolerance and i < self._mode_finding_max_iter:
-            W = -likelihood.d2logpdf_df2(f, Y, extra_data=Y_metadata)
-
-            W_f = W*f
-            grad = likelihood.dlogpdf_df(f, Y, extra_data=Y_metadata)
-
-            b = W_f + grad
-            W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b), likelihood.log_concave)
-
-            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
-            full_step_Ki_f = b - W12BiW12Kb
-            dKi_f = full_step_Ki_f - Ki_f
-
-            #define an objective for the line search
-            def inner_obj(step_size):
-                Ki_f_trial = Ki_f + step_size*dKi_f
-                f_trial = np.dot(K, Ki_f_trial)
-                print -obj(Ki_f_trial, f_trial),
-                return -obj(Ki_f_trial, f_trial)
-
-            #use scipy for the line search, the compute new values of f, Ki_f
-            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
-            Ki_f_new = Ki_f + step*dKi_f
-            f_new = np.dot(K, Ki_f_new)
-
-            print ""
-            print obj(Ki_f, f), obj(Ki_f_new, f_new), step
-            print ""
-
-            #i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
-            #Find the stepsize that minimizes the objective function using a brent line search
-            #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
-            #steps than get this exact then make a step, if B was bigger it might be the other way around though
-            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
-            #new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
-            #f = self.tmp_f.copy()
-            #Ki_f = self.tmp_Ki_f.copy()
-
-            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
-            Ki_f = Ki_f_new
-            f = f_new
-            i += 1
-
-
-        #Warn of bad fits
-        if difference > self._mode_finding_tolerance:
-            if not self.bad_fhat:
-                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
-                self.bad_fhat = True
-        elif self.bad_fhat:
-            self.bad_fhat = False
-            warnings.warn("f_hat now fine again")
-
-        return f, Ki_f
-
-    def _compute_GP_variables(self):
-        """
-        Generate data Y which would give the normal distribution identical
-        to the laplace approximation to the posterior, but normalised
-
-        GPy expects a likelihood to be gaussian, so need to caluclate
-        the data Y^{\tilde} that makes the posterior match that found
-        by a laplace approximation to a non-gaussian likelihood but with
-        a gaussian likelihood
-
-        Firstly,
-        The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
-        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
-        we wish to find the hessian \Sigma^{\tilde}
-        that has the same curvature but using our new simulated data Y^{\tilde}
-        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
-        and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
-        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
-
-        Secondly,
-        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
-        So we can suck up any differences between that and our log marginal likelihood approximation
-        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
-        which we want to optimize instead, by equating them and rearranging, the difference is added onto
-        the log p(y) that GPy optimizes by default
-
-        Thirdly,
-        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
-        aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
-        gp.py code
-        """
-        Wi = 1.0/self.W
-        self.Sigma_tilde = np.diagflat(Wi)
-
-        Y_tilde = Wi*self.Ki_f + self.f_hat
-
-        self.Wi_K_i = self.W12BiW12
-        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        lik = likelihood.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
-        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-
-        Z_tilde = (+ lik
-                   - 0.5*self.ln_B_det
-                   + 0.5*ln_det_Wi_K
-                   - 0.5*self.f_Ki_f
-                   + 0.5*y_Wi_K_i_y
-                   + self.NORMAL_CONST
-                   )
-
-        #Convert to float as its (1, 1) and Z must be a scalar
-        self.Z = np.float64(Z_tilde)
-        self.Y = Y_tilde
-        self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
-
-        #Compute dZ_dK which is how the approximated distributions gradients differ from the dL_dK computed for other likelihoods
-        self.dZ_dK = self._Kgradients()
-        #+ 0.5*self.Wi_K_i - 0.5*np.dot(self.Ki_f, self.Ki_f.T) #since we are not adding the K gradients explicit part theres no need to compute this again
+        LiW12, _ = dtrtrs(L, np.diag(W_12[:,0]), lower=1, trans=0)
+        K_Wi_i = np.dot(LiW12.T, LiW12) # R = W12BiW12, in R&W p 126, eq 5.25
+        return K_Wi_i, L, LiW12
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index c0974dc5..a0f4104c 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -48,6 +48,7 @@ class Posterior(object):
 
         if ((woodbury_chol is not None) and (woodbury_vector is not None))\
         or ((woodbury_inv is not None) and (woodbury_vector is not None))\
+        or ((woodbury_inv is not None) and (mean is not None))\
         or ((mean is not None) and (cov is not None)):
             pass # we have sufficient to compute the posterior
         else:
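
Reviewer note, not part of the patch: a minimal standalone sketch of the linear algebra the new _compute_B_statistics returns, namely K_Wi_i = W^{1/2} B^{-1} W^{1/2} = (K + W^{-1})^{-1} with B = I + W^{1/2} K W^{1/2} (R&W 2006, p. 126). It uses plain NumPy only; the helper name, the toy RBF covariance and np.linalg.cholesky/np.linalg.solve are illustrative stand-ins for the patch's jitchol and dtrtrs, not GPy API.

# Illustrative sketch only; assumes a PSD kernel matrix K and a strictly
# positive (N, 1) vector W of diagonal negative-Hessian entries.
import numpy as np

def compute_B_statistics(K, W):
    W_12 = np.sqrt(W)                                  # elementwise sqrt of the diagonal of W
    B = np.eye(K.shape[0]) + W_12 * K * W_12.T         # B = I + W^1/2 K W^1/2
    L = np.linalg.cholesky(B)                          # the patch uses jitchol(B)
    LiW12 = np.linalg.solve(L, np.diagflat(W_12))      # L^-1 W^1/2; the patch uses dtrtrs
    K_Wi_i = np.dot(LiW12.T, LiW12)                    # W^1/2 B^-1 W^1/2 = (K + W^-1)^-1
    return K_Wi_i, L, LiW12

if __name__ == "__main__":
    rng = np.random.RandomState(0)
    X = rng.randn(6, 1)
    K = np.exp(-0.5 * (X - X.T) ** 2) + 1e-8 * np.eye(6)   # toy RBF covariance
    W = rng.rand(6, 1) + 0.1
    K_Wi_i, L, LiW12 = compute_B_statistics(K, W)
    direct = np.linalg.inv(K + np.diagflat(1.0 / W))        # naive, numerically fragile route
    print(np.allclose(K_Wi_i, direct))                      # expect True

Working through B rather than inverting K + W^{-1} directly is what keeps the computation well conditioned when entries of W are very small or very large, which is why the patch routes both the mode-finding step and mode_computations through this helper.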