Started adding gaussian likelihood, changed round preloading old_a

2026-06-11 15:15:15 +02:00 · 2013-07-29 15:29:46 +01:00 · 2013-07-29 15:29:46 +01:00 · aa98608590
commit aa98608590
parent 57001851c4
4 changed files with 321 additions and 181 deletions
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -244,6 +244,12 @@ class model(parameterised):
        LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
        prior_gradients = self._transform_gradients(self._log_prior_gradients())
        obj_grads = -LL_gradients - prior_gradients
+        print self
+        print self._get_params()
+        print -obj_grads
+        self.plot()
+        if isinstance(self.likelihood, likelihoods.Laplace):
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
        return obj_f, obj_grads

    def optimize(self, optimizer=None, start=None, **kwargs):
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@ -85,10 +85,60 @@ def v_fail_test():
    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
    print(m)

+def student_t_obj_plane():
+    plt.close('all')
+    X = np.linspace(0, 1, 50)[:, None]
+    real_std = 0.002
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
+    deg_free = 1000
+
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp['noise'] = real_std**2
+    print "Gaussian"
+    print mgp
+
+    kernelst = kernelgp.copy()
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2))
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m.ensure_default_constraints()
+    m.constrain_fixed('t_no', real_std**2)
+    vs = 10
+    ls = 10
+    objs_t = np.zeros((vs, ls))
+    objs_g = np.zeros((vs, ls))
+    rbf_vs = np.linspace(1e-6, 8, vs)
+    rbf_ls = np.linspace(1e-2, 8, ls)
+    for v_id, rbf_v in enumerate(rbf_vs):
+        for l_id, rbf_l in enumerate(rbf_ls):
+            m['rbf_v'] = rbf_v
+            m['rbf_l'] = rbf_l
+            mgp['rbf_v'] = rbf_v
+            mgp['rbf_l'] = rbf_l
+            objs_t[v_id, l_id] = m.log_likelihood()
+            objs_g[v_id, l_id] = mgp.log_likelihood()
+    plt.figure()
+    plt.subplot(211)
+    plt.title('Student t')
+    plt.imshow(objs_t, interpolation='none')
+    plt.xlabel('variance')
+    plt.ylabel('lengthscale')
+    plt.subplot(212)
+    plt.title('Gaussian')
+    plt.imshow(objs_g, interpolation='none')
+    plt.xlabel('variance')
+    plt.ylabel('lengthscale')
+    plt.show()
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    return objs_t
+
 def student_t_f_check():
    plt.close('all')
    X = np.linspace(0, 1, 50)[:, None]
-    real_std = 0.001
+    real_std = 0.2
    noise = np.random.randn(*X.shape)*real_std
    Y = np.sin(X*2*np.pi) + noise
    deg_free = 1000
@ -98,17 +148,26 @@ def student_t_f_check():
    mgp.ensure_default_constraints()
    mgp.randomize()
    mgp.optimize()
+    print "Gaussian"
    print mgp
    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

    kernelst = kernelgp.copy()
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=1e-5)
+    #kernelst += GPy.kern.bias(X.shape[1])
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05)
    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
-    m['rbf_v'] = mgp._get_params()[0]
-    m['rbf_l'] = mgp._get_params()[1] + 1
+    #m['rbf_v'] = mgp._get_params()[0]
+    #m['rbf_l'] = mgp._get_params()[1] + 1
    m.ensure_default_constraints()
+    #m.constrain_fixed('rbf_v', mgp._get_params()[0])
+    #m.constrain_fixed('rbf_l', mgp._get_params()[1])
+    #m.constrain_bounded('t_no', 2*real_std**2, 1e3)
+    #m.constrain_positive('bias')
    m.constrain_positive('t_no')
+    m.randomize()
+    m['t_no'] = 0.3
+    m.likelihood.X = X
    print m
    plt.figure()
    plt.subplot(511)
@ -143,7 +202,8 @@ def student_t_fix_optimise_check():
    Y = np.sin(X*2*np.pi) + noise
    X_full = X
    Y_full = np.sin(X_full)
-    #Y = Y/Y.max()
+    Y = Y/Y.max()
+    Y_full = Y_full/Y_full.max()
    deg_free = 1000

    #GP
@ -219,7 +279,7 @@ def student_t_fix_optimise_check():
    plt.subplot(121)
    mrbf.plot()
    plt.title('Student t fixed noise')
-    #mrbf.optimize()
+    mrbf.optimize()
    print "After optimize"
    print mrbf
    plt.subplot(122)
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@ -156,17 +156,23 @@ class Laplace(likelihood):
        Y_tilde = Wi*self.Ki_f + self.f_hat

        self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
+        #self.Wi_K_i[self.Wi_K_i< 1e-6] = 1e-6
+
        self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
        self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)

        self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-        Z_tilde = (#+ self.NORMAL_CONST
+        self.aA = 0.5*self.ln_det_K_Wi__Bi
+        self.bB = - 0.5*self.f_Ki_f
+        self.cC = 0.5*self.y_Wi_Ki_i_y
+        Z_tilde = (+ 100*self.NORMAL_CONST
                   + self.lik
                   + 0.5*self.ln_det_K_Wi__Bi
                   - 0.5*self.f_Ki_f
                   + 0.5*self.y_Wi_Ki_i_y
                  )
-        #print "Ztilde: {}".format(Z_tilde)
+        print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC)
+        print self.likelihood_function._get_params()

        #Convert to float as its (1, 1) and Z must be a scalar
        self.Z = np.float64(Z_tilde)
@ -198,7 +204,7 @@ class Laplace(likelihood):
        self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)

        if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-10  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                       #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                       #To cause the posterior to become less certain than the prior and likelihood,
                                       #This is a property only held by non-log-concave likelihoods
@ -280,7 +286,7 @@ class Laplace(likelihood):
        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
        return f_hat[:, None]

-    def rasm_mode(self, K, MAX_ITER=40, MAX_RESTART=10):
+    def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
        """
        Rasmussens numerically stable mode finding
        For nomenclature see Rasmussen & Williams 2006
@ -290,15 +296,19 @@ class Laplace(likelihood):
        :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
        :returns: f_mode
        """
-        old_a = np.zeros((self.N, 1))
-        #old_a = None
-        #if self.old_a is None:
-            #old_a = np.zeros((self.N, 1))
-        #else:
-            #old_a = self.old_a
+        self.old_before_s = self.likelihood_function._get_params()
+        print "before: ", self.old_before_s
+        #if self.old_before_s < 1e-5:
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+        #old_a = np.zeros((self.N, 1))
+        if self.old_a is None:
+            old_a = np.zeros((self.N, 1))
+            f = np.dot(K, old_a)
+        else:
+            old_a = self.old_a.copy()
+            f = self.f_hat.copy()

-        f = np.dot(self.K, old_a)
-        self.f = f
        new_obj = -np.inf
        old_obj = np.inf

@ -306,18 +316,20 @@ class Laplace(likelihood):
            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)

        difference = np.inf
-        epsilon = 1e-10
+        epsilon = 1e-4
        step_size = 1
        rs = 0
        i = 0
-        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
+
+        while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
            W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
+            #W = np.maximum(W, 0)
            if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-10     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                    # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                    # To cause the posterior to become less certain than the prior and likelihood,
                                    # This is a property only held by non-log-concave likelihoods
-            B, L, W_12 = self._compute_B_statistics(K, W)
+            B, L, W_12 = self._compute_B_statistics(K, W.copy())

            W_f = W*f
            grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
@ -328,54 +340,105 @@ class Laplace(likelihood):
            full_step_a = b - W_12*solve_L
            da = full_step_a - old_a

-            f_old = f.copy()
-
-            f_old = self.f.copy()
-            def inner_obj(step_size, old_a, da, K):
-                a = old_a + step_size*da
-                f = np.dot(K, a)
-                self.a = a # This is nasty, need to set something within an optimization though
-                self.f = f
-                return -obj(a, f)
-
-            from functools import partial
-            i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K)
-            new_obj = sp.optimize.brent(i_o, tol=1e-6, maxiter=10)
-
-            #update_passed = False
-            #while not update_passed:
+            #f_old = f.copy()
+            #def inner_obj(step_size, old_a, da, K):
                #a = old_a + step_size*da
                #f = np.dot(K, a)
+                #self.a = a.copy() # This is nasty, need to set something within an optimization though
+                #self.f = f.copy()
+                #return -obj(a, f)

-                #old_obj = new_obj
-                #new_obj = obj(a, f)
-                #difference = new_obj - old_obj
-                #print "difference: ",difference
-                #if difference < 0:
-                    ##print grad
-                    ##print "Objective function rose", np.float(difference)
-                    ##If the objective function isn't rising, restart optimization
-                    #step_size *= 0.8
-                    ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
-                    ##objective function isn't increasing, try reducing step size
-                    ##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
-                    ##old_obj = tmp_old_obj
-                    #old_obj = new_obj
-                    #rs += 1
-                #else:
-                    #update_passed = True
+            #from functools import partial
+            #i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
+            ##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
+            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
+            #f = self.f.copy()
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

-            f = self.f
-            difference = new_obj - old_obj
-            difference = np.abs(np.sum(f - f_old)) #+ abs(difference)
-            old_a = self.a #a
+            f_old = f.copy()
+            update_passed = False
+            while not update_passed:
+                a = old_a + step_size*da
+                f = np.dot(K, a)
+
+                old_obj = new_obj
+                new_obj = obj(a, f)
+                difference = new_obj - old_obj
+                print "difference: ",difference
+                if difference < 0:
+                    #print "Objective function rose", np.float(difference)
+                    #If the objective function isn't rising, restart optimization
+                    step_size *= 0.8
+                    #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    #objective function isn't increasing, try reducing step size
+                    f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
+                    old_obj = new_obj
+                    rs += 1
+                else:
+                    update_passed = True
+
+            #difference = abs(new_obj - old_obj)
+            #old_obj = new_obj.copy()
+            difference = np.abs(np.sum(f - f_old))
+            #old_a = self.a.copy() #a
+            old_a = a.copy()
            i += 1
+            #print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))

-        self.old_a = old_a
+        self.old_a = old_a.copy()
        #print "Positive difference obj: ", np.float(difference)
        #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
        print "Iterations: {}, Final_difference: {}".format(i, difference)
-        #self.a = a
+        if difference > 1e-4:
+            print "FAIL FAIL FAIL FAIL FAIL FAIL"
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+            if hasattr(self, 'X'):
+                import pylab as pb
+                pb.figure()
+                pb.subplot(311)
+                pb.title('old f_hat')
+                pb.plot(self.X, self.f_hat)
+                pb.subplot(312)
+                pb.title('old ff')
+                pb.plot(self.X, self.old_ff)
+                pb.subplot(313)
+                pb.title('new f_hat')
+                pb.plot(self.X, f)
+
+                pb.figure()
+                pb.subplot(121)
+                pb.title('old K')
+                pb.imshow(np.diagflat(self.old_K), interpolation='none')
+                pb.colorbar()
+                pb.subplot(122)
+                pb.title('new K')
+                pb.imshow(np.diagflat(K), interpolation='none')
+                pb.colorbar()
+
+                pb.figure()
+                pb.subplot(121)
+                pb.title('old W')
+                pb.imshow(np.diagflat(self.old_W), interpolation='none')
+                pb.colorbar()
+                pb.subplot(122)
+                pb.title('new W')
+                pb.imshow(np.diagflat(W), interpolation='none')
+                pb.colorbar()
+
+                import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+                pb.close('all')
+
+        #FIXME: DELETE THESE
+        self.old_W = W.copy()
+        self.old_grad = grad.copy()
+        self.old_B = B.copy()
+        self.old_W_12 = W_12.copy()
+        self.old_ff = f.copy()
+        self.old_K = self.K.copy()
+        self.old_s = self.likelihood_function._get_params()
+        print "after: ", self.old_s
+        #print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
+        self.a = a
        #self.B, self.B_chol, self.W_12 = B, L, W_12
        #self.Bi, _, _, B_det = pdinv(self.B)
        return f
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@ -193,11 +193,16 @@ class student_t(likelihood_function):
        """
        assert y.shape == f.shape
        e = y - f
+        #A = gammaln((self.v + 1) * 0.5)
+        #B = - gammaln(self.v * 0.5)
+        #C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
+        #D = + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
        objective = (+ gammaln((self.v + 1) * 0.5)
                     - gammaln(self.v * 0.5)
                     - 0.5*np.log(self.sigma2 * self.v * np.pi)
                     + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
                    )
+        #print "C: {} D: {} obj: {}".format(C, np.sum(D), objective.sum())
        return np.sum(objective)

    def dlik_df(self, y, f, extra_data=None):
@ -459,147 +464,153 @@ class weibull_survival(likelihood_function):
        hess = (y**self.shape)*np.exp(f)
        return np.squeeze(hess)

-#class gaussian(likelihood_function):
-    #"""
-    #Gaussian likelihood - this is a test class for approximation schemes
-    #"""
-    #def __init__(self, variance):
-        #self._set_params(np.asarray(variance))
+class gaussian(likelihood_function):
+    """
+    Gaussian likelihood - this is a test class for approximation schemes
+    """
+    def __init__(self, variance):
+        self._set_params(np.asarray(variance))

-    #def _get_params(self):
-        #return np.asarray(self.sigma2)
+    def _get_params(self):
+        return np.asarray(self._variance)

-    #def _get_param_names(self):
-        #return ["noise_variance"]
+    def _get_param_names(self):
+        return ["noise_variance"]

-    #def _set_params(self, x):
-        #self.variance = float(x)
+    def _set_params(self, x):
+        self._variance = float(x)
+        self.covariance_matrix = np.eye(self.N) * self._variance
+        self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG

-    #def link_function(self, y, f, extra_data=None):
-        #"""link_function $\ln p(y|f)$
-        #$$\ln p(y_{i}|f_{i}) = \ln $$
+    def link_function(self, y, f, extra_data=None):
+        """link_function $\ln p(y|f)$
+        $$\ln p(y_{i}|f_{i}) = \ln $$

-        #:y: data
-        #:f: latent variables f
-        #:extra_data: extra_data which is not used in student t distribution
-        #:returns: float(likelihood evaluated for this point)
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: float(likelihood evaluated for this point)

-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #objective = -0.5*self.D*
-        #return np.sum(objective)
+        """
+        assert y.shape == f.shape
+        e = y - f
+        eeT = np.dot(e, e.T)
+        objective = (- 0.5*self.D*np.log(2*np.pi)
+                     - 0.5*self.ln_K
+                     - 0.5*np.sum(np.multiply(self.Ki, eeT))
+                     )
+        return np.sum(objective)

-    #def dlik_df(self, y, f, extra_data=None):
-        #"""
-        #Gradient of the link function at y, given f w.r.t f
+    def dlik_df(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f

-        #$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
+        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$

-        #:y: data
-        #:f: latent variables f
-        #:extra_data: extra_data which is not used in student t distribution
-        #:returns: gradient of likelihood evaluated at points
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: gradient of likelihood evaluated at points

-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
-        #return grad
+        """
+        assert y.shape == f.shape
+        e = y - f
+        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
+        return grad

-    #def d2lik_d2f(self, y, f, extra_data=None):
-        #"""
-        #Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
-        #i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+    def d2lik_d2f(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j

-        #Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-        #(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+        Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+        (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}

-        #$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
+        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$

-        #:y: data
-        #:f: latent variables f
-        #:extra_data: extra_data which is not used in student t distribution
-        #:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
-        #return hess
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        assert y.shape == f.shape
+        e = y - f
+        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
+        return hess

-    #def d3lik_d3f(self, y, f, extra_data=None):
-        #"""
-        #Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+    def d3lik_d3f(self, y, f, extra_data=None):
+        """
+        Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j

-        #$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
-                       #((e**2 + self.sigma2*self.v)**3)
-                    #)
-        #return d3lik_d3f
+        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       ((e**2 + self.sigma2*self.v)**3)
+                    )
+        return d3lik_d3f

-    #def lik_dstd(self, y, f, extra_data=None):
-        #"""
-        #Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
+    def lik_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)

-        #Terms relavent to derivatives wrt sigma are:
-        #-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+        Terms relavent to derivatives wrt sigma are:
+        -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))

-        #$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        ##dlik_dsigma = ( - (1/sigma) +
-                        ##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      ##)
-        ##dlik_dsigma = ( - 1 +
-                        ##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      ##)
-        ##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
-        #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
-        #return dlik_dsigma
+        $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        sigma = np.sqrt(self.sigma2)
+        #dlik_dsigma = ( - (1/sigma) +
+                        #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      #)
+        #dlik_dsigma = ( - 1 +
+                        #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      #)
+        #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
+        dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
+        return dlik_dsigma

-    #def dlik_df_dstd(self, y, f, extra_data=None):
-        #"""
-        #Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
+    def dlik_df_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the dlik_df w.r.t sigma parameter (standard deviation)

-        #$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
-                            #/ ((self.v*self.sigma2 + e**2)**2)
-                           #)
-        #return dlik_grad_dsigma
+        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        sigma = np.sqrt(self.sigma2)
+        dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
+                            / ((self.v*self.sigma2 + e**2)**2)
+                           )
+        return dlik_grad_dsigma

-    #def d2lik_d2f_dstd(self, y, f, extra_data=None):
-        #"""
-        #Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
+    def d2lik_d2f_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)

-        #$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        #dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
-                              #((e**2 + self.sigma2*self.v)**3)
-                           #)
-        ##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
-                             ##/ ((e**2 + (self.sigma**2)*self.v)**3) )
-        #return dlik_hess_dsigma
+        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        sigma = np.sqrt(self.sigma2)
+        dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
+                              ((e**2 + self.sigma2*self.v)**3)
+                           )
+        #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
+                             #/ ((e**2 + (self.sigma**2)*self.v)**3) )
+        return dlik_hess_dsigma

-    #def _gradients(self, y, f, extra_data=None):
-        ##must be listed in same order as 'get_param_names'
-        #derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
-                  #[self.dlik_df_dstd(y, f, extra_data=extra_data)],
-                  #[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
-                 #) # lists as we might learn many parameters
-        ## ensure we have gradients for every parameter we want to optimize
-        #assert len(derivs[0]) == len(self._get_param_names())
-        #assert len(derivs[1]) == len(self._get_param_names())
-        #assert len(derivs[2]) == len(self._get_param_names())
-        #return derivs
+    def _gradients(self, y, f, extra_data=None):
+        #must be listed in same order as 'get_param_names'
+        derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
+                  [self.dlik_df_dstd(y, f, extra_data=extra_data)],
+                  [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+                 ) # lists as we might learn many parameters
+        # ensure we have gradients for every parameter we want to optimize
+        assert len(derivs[0]) == len(self._get_param_names())
+        assert len(derivs[1]) == len(self._get_param_names())
+        assert len(derivs[2]) == len(self._get_param_names())
+        return derivs