From 4e5cefb4b5cb14a3c4f94dbd4d18eac8c70a84fd Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 8 Jul 2013 15:48:53 +0100
Subject: [PATCH] Reparameratised in terms of sigma2

---
 GPy/core/model.py                       |   3 -
 GPy/examples/laplace_approximations.py  |  34 ++--
 GPy/likelihoods/Laplace.py              |  12 +-
 GPy/likelihoods/likelihood_functions.py | 207 +++++++++++++++++++++---
 4 files changed, 207 insertions(+), 49 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index f97938a4..94202396 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -244,9 +244,6 @@ class model(parameterised):
         LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
         prior_gradients = self._transform_gradients(self._log_prior_gradients())
         obj_grads = -LL_gradients - prior_gradients
-        print self
-        #self.checkgrad(verbose=1)
-        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         return obj_f, obj_grads
 
     def optimize(self, optimizer=None, start=None, **kwargs):
diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 14400a08..d6b48ebf 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -24,7 +24,7 @@ def timing():
         edited_real_sd = real_sd
         kernel1 = GPy.kern.rbf(X.shape[1])
 
-        t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+        t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
         corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
         m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
         m.ensure_default_constraints()
@@ -53,7 +53,7 @@ def v_fail_test():
     edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel1)
     m.constrain_positive('')
@@ -92,18 +92,18 @@ def debug_student_t_noise_approx():
     X = np.random.rand(100)[:, None]
     #X = np.random.rand(100)[:, None]
     #X = np.array([0.5, 1])[:, None]
-    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var
+    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + 1
     #Y = X + np.random.randn(*X.shape)*real_var
     #ty = np.array([1., 9.97733584, 4.17841363])[:, None]
     #Y = ty
 
     X_full = X
-    Y_full = np.sin(X_full)
+    Y_full = np.sin(X_full) + 1
 
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 1000
+    deg_free = 100
 
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
@@ -115,7 +115,7 @@ def debug_student_t_noise_approx():
 
     plt.close('all')
     # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
     #kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
@@ -140,24 +140,24 @@ def debug_student_t_noise_approx():
     #print m
 
     real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
-    edited_real_sd = real_stu_t_std + 1#initial_var_guess #real_sd
+    edited_real_sd = real_stu_t_std**2 #initial_var_guess #real_sd
     #edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
 
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     #m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
-    #m.constrain_fixed('rbf_l', 1.8651)
+    #m.constrain_fixed('rbf_l', 0.2651)
     #m.constrain_fixed('t_noise_std', edited_real_sd)
     #m.constrain_positive('rbf')
-    #m.constrain_positive('t_noise_std')
+    m.constrain_positive('t_noise_std')
     #m.constrain_positive('')
     #m.constrain_bounded('t_noi', 0.001, 10)
     #m.constrain_fixed('t_noi', real_stu_t_std)
-    m.constrain_fixed('white', 0.01)
+    #m.constrain_fixed('white', 0.01)
     #m.constrain_fixed('t_no', 0.01)
     #m['rbf_var'] = 0.20446332
     #m['rbf_leng'] = 0.85776241
@@ -179,7 +179,7 @@ def debug_student_t_noise_approx():
     return m
 
     #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
     #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
@@ -276,7 +276,7 @@ def student_t_approx():
     edited_real_sd = real_sd #initial_var_guess
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     m.ensure_default_constraints()
@@ -291,7 +291,7 @@ def student_t_approx():
     plt.title('Student-t rasm clean')
 
     print "Corrupt student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
@@ -308,7 +308,7 @@ def student_t_approx():
     return m
 
     #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
     #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
@@ -322,7 +322,7 @@ def student_t_approx():
     #plt.title('Student-t ncg clean')
 
     #print "Corrupt student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
     #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
     #m.ensure_default_constraints()
@@ -337,7 +337,7 @@ def student_t_approx():
 
 
     ###with a student t distribution, since it has heavy tails it should work well
-    ###likelihood_function = student_t(deg_free, sigma=real_var)
+    ###likelihood_function = student_t(deg_free, sigma2=real_var)
     ###lap = Laplace(Y, likelihood_function)
     ###cov = kernel.K(X)
     ###lap.fit_full(cov)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 2ae68613..984112a5 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -220,10 +220,10 @@ class Laplace(likelihood):
         self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
 
         #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
-        self.ln_z_hat = (- 0.5*self.f_Ki_f
-                         - self.ln_I_KW_det
-                         + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
-                         )
+        #self.ln_z_hat = (- 0.5*self.f_Ki_f
+                         #- self.ln_I_KW_det
+                         #+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+                         #)
 
         return self._compute_GP_variables()
 
@@ -308,6 +308,8 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
+        #if self.likelihood_function.sigma < 0.001:
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
@@ -316,8 +318,6 @@ class Laplace(likelihood):
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)
-            #if i > 30:
-                #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
             W_f = W*f
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index fd64dbe6..bfc759d7 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -158,26 +158,26 @@ class student_t(likelihood_function):
     dln p(yi|fi)_dfi
     d2ln p(yi|fi)_d2fifj
     """
-    def __init__(self, deg_free, sigma=2):
+    def __init__(self, deg_free, sigma2=2):
         #super(student_t, self).__init__()
         self.v = deg_free
-        self.sigma = sigma
+        self.sigma2 = sigma2
         self.log_concave = False
 
-        self._set_params(np.asarray(sigma))
+        self._set_params(np.asarray(sigma2))
 
     def _get_params(self):
-        return np.asarray(self.sigma)
+        return np.asarray(self.sigma2)
 
     def _get_param_names(self):
-        return ["t_noise_std"]
+        return ["t_noise_std2"]
 
     def _set_params(self, x):
-        self.sigma = float(x)
+        self.sigma2 = float(x)
 
     @property
     def variance(self, extra_data=None):
-        return (self.v / float(self.v - 2)) * (self.sigma**2)
+        return (self.v / float(self.v - 2)) * self.sigma2
 
     def link_function(self, y, f, extra_data=None):
         """link_function $\ln p(y|f)$
@@ -193,12 +193,16 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
+        A = gammaln((self.v + 1) * 0.5)
+        B = -gammaln(self.v * 0.5)
+        C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
+        D = (-(self.v + 1)*0.5)*np.log(1 + (e**2)/(self.v*self.sigma2))
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
-                     - 0.5*np.log((self.sigma**2) * self.v * np.pi)
-                     #- (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v))
-                     - (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2)))
+                     - 0.5*np.log(self.sigma2 * self.v * np.pi)
+                     + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
                     )
+        #print "A: {} B: {} C: {} D: {} obj: {}".format(A,B,C,D.sum(),objective.sum())
         return np.sum(objective)
 
     def dlik_df(self, y, f, extra_data=None):
@@ -215,7 +219,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
         return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
@@ -235,7 +239,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
+        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -246,8 +250,8 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
-                       ((e**2 + (self.sigma**2)*self.v)**3)
+        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       ((e**2 + self.sigma2*self.v)**3)
                     )
         return d3lik_d3f
 
@@ -262,10 +266,16 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_dsigma = ( - (1/self.sigma) +
-                        ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) )
-                      )
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_dsigma = ( - (1/sigma) +
+                        #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      #)
+        #dlik_dsigma = ( - 1 +
+                        #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      #)
         #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
+        #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
+        dlik_dsigma = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
@@ -276,9 +286,11 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
-                            / ((self.v*(self.sigma**2) + e**2)**2)
-                           )
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
+                            #/ ((self.v*self.sigma2 + e**2)**2)
+                           #)
+        dlik_grad_dsigma = (-self.v*(self.v+1)*e)/((self.sigma2*self.v + e**2)**2)
         return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
@@ -289,11 +301,15 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_hess_dsigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
-                              ((e**2 + (self.sigma**2)*self.v)**3)
-                           )
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
+                              #((e**2 + self.sigma2*self.v)**3)
+                           #)
         #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
                              #/ ((e**2 + (self.sigma**2)*self.v)**3) )
+        dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
+                              / (self.sigma2*self.v + (e**2))**3
+                           )
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):
@@ -466,3 +482,148 @@ class weibull_survival(likelihood_function):
 
         hess = (y**self.shape)*np.exp(f)
         return np.squeeze(hess)
+
+#class gaussian(likelihood_function):
+    #"""
+    #Gaussian likelihood - this is a test class for approximation schemes
+    #"""
+    #def __init__(self, variance):
+        #self._set_params(np.asarray(variance))
+
+    #def _get_params(self):
+        #return np.asarray(self.sigma2)
+
+    #def _get_param_names(self):
+        #return ["noise_variance"]
+
+    #def _set_params(self, x):
+        #self.variance = float(x)
+
+    #def link_function(self, y, f, extra_data=None):
+        #"""link_function $\ln p(y|f)$
+        #$$\ln p(y_{i}|f_{i}) = \ln $$
+
+        #:y: data
+        #:f: latent variables f
+        #:extra_data: extra_data which is not used in student t distribution
+        #:returns: float(likelihood evaluated for this point)
+
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #objective = -0.5*self.D*
+        #return np.sum(objective)
+
+    #def dlik_df(self, y, f, extra_data=None):
+        #"""
+        #Gradient of the link function at y, given f w.r.t f
+
+        #$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
+
+        #:y: data
+        #:f: latent variables f
+        #:extra_data: extra_data which is not used in student t distribution
+        #:returns: gradient of likelihood evaluated at points
+
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
+        #return grad
+
+    #def d2lik_d2f(self, y, f, extra_data=None):
+        #"""
+        #Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        #i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+
+        #Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+        #(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+
+        #$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
+
+        #:y: data
+        #:f: latent variables f
+        #:extra_data: extra_data which is not used in student t distribution
+        #:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
+        #return hess
+
+    #def d3lik_d3f(self, y, f, extra_data=None):
+        #"""
+        #Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+
+        #$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       #((e**2 + self.sigma2*self.v)**3)
+                    #)
+        #return d3lik_d3f
+
+    #def lik_dstd(self, y, f, extra_data=None):
+        #"""
+        #Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
+
+        #Terms relavent to derivatives wrt sigma are:
+        #-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+
+        #$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #sigma = np.sqrt(self.sigma2)
+        ##dlik_dsigma = ( - (1/sigma) +
+                        ##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      ##)
+        ##dlik_dsigma = ( - 1 +
+                        ##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      ##)
+        ##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
+        #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
+        #return dlik_dsigma
+
+    #def dlik_df_dstd(self, y, f, extra_data=None):
+        #"""
+        #Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
+
+        #$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
+                            #/ ((self.v*self.sigma2 + e**2)**2)
+                           #)
+        #return dlik_grad_dsigma
+
+    #def d2lik_d2f_dstd(self, y, f, extra_data=None):
+        #"""
+        #Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
+
+        #$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
+                              #((e**2 + self.sigma2*self.v)**3)
+                           #)
+        ##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
+                             ##/ ((e**2 + (self.sigma**2)*self.v)**3) )
+        #return dlik_hess_dsigma
+
+    #def _gradients(self, y, f, extra_data=None):
+        ##must be listed in same order as 'get_param_names'
+        #derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
+                  #[self.dlik_df_dstd(y, f, extra_data=extra_data)],
+                  #[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+                 #) # lists as we might learn many parameters
+        ## ensure we have gradients for every parameter we want to optimize
+        #assert len(derivs[0]) == len(self._get_param_names())
+        #assert len(derivs[1]) == len(self._get_param_names())
+        #assert len(derivs[2]) == len(self._get_param_names())
+        #return derivs