Reparameterised in terms of sigma2

This commit is contained in:
Alan Saul 2013-07-08 15:48:53 +01:00
parent ab6a3a571e
commit 4e5cefb4b5
4 changed files with 207 additions and 49 deletions

View file

@ -244,9 +244,6 @@ class model(parameterised):
LL_gradients = self._transform_gradients(self._log_likelihood_gradients()) LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
prior_gradients = self._transform_gradients(self._log_prior_gradients()) prior_gradients = self._transform_gradients(self._log_prior_gradients())
obj_grads = -LL_gradients - prior_gradients obj_grads = -LL_gradients - prior_gradients
print self
#self.checkgrad(verbose=1)
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
return obj_f, obj_grads return obj_f, obj_grads
def optimize(self, optimizer=None, start=None, **kwargs): def optimize(self, optimizer=None, start=None, **kwargs):

View file

@ -24,7 +24,7 @@ def timing():
edited_real_sd = real_sd edited_real_sd = real_sd
kernel1 = GPy.kern.rbf(X.shape[1]) kernel1 = GPy.kern.rbf(X.shape[1])
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1) m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
m.ensure_default_constraints() m.ensure_default_constraints()
@ -53,7 +53,7 @@ def v_fail_test():
edited_real_sd = real_sd edited_real_sd = real_sd
print "Clean student t, rasm" print "Clean student t, rasm"
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, stu_t_likelihood, kernel1) m = GPy.models.GP(X, stu_t_likelihood, kernel1)
m.constrain_positive('') m.constrain_positive('')
@ -92,18 +92,18 @@ def debug_student_t_noise_approx():
X = np.random.rand(100)[:, None] X = np.random.rand(100)[:, None]
#X = np.random.rand(100)[:, None] #X = np.random.rand(100)[:, None]
#X = np.array([0.5, 1])[:, None] #X = np.array([0.5, 1])[:, None]
Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + 1
#Y = X + np.random.randn(*X.shape)*real_var #Y = X + np.random.randn(*X.shape)*real_var
#ty = np.array([1., 9.97733584, 4.17841363])[:, None] #ty = np.array([1., 9.97733584, 4.17841363])[:, None]
#Y = ty #Y = ty
X_full = X X_full = X
Y_full = np.sin(X_full) Y_full = np.sin(X_full) + 1
Y = Y/Y.max() Y = Y/Y.max()
#Add student t random noise to datapoints #Add student t random noise to datapoints
deg_free = 1000 deg_free = 100
real_sd = np.sqrt(real_var) real_sd = np.sqrt(real_var)
print "Real noise std: ", real_sd print "Real noise std: ", real_sd
@ -115,7 +115,7 @@ def debug_student_t_noise_approx():
plt.close('all') plt.close('all')
# Kernel object # Kernel object
kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
#kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1]) #kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1])
kernel2 = kernel1.copy() kernel2 = kernel1.copy()
kernel3 = kernel1.copy() kernel3 = kernel1.copy()
@ -140,24 +140,24 @@ def debug_student_t_noise_approx():
#print m #print m
real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free))) real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
edited_real_sd = real_stu_t_std + 1#initial_var_guess #real_sd edited_real_sd = real_stu_t_std**2 #initial_var_guess #real_sd
#edited_real_sd = real_sd #edited_real_sd = real_sd
print "Clean student t, rasm" print "Clean student t, rasm"
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, stu_t_likelihood, kernel6) m = GPy.models.GP(X, stu_t_likelihood, kernel6)
#m['rbf_len'] = 1.5 #m['rbf_len'] = 1.5
#m.constrain_fixed('rbf_v', 1.0898) #m.constrain_fixed('rbf_v', 1.0898)
#m.constrain_fixed('rbf_l', 1.8651) #m.constrain_fixed('rbf_l', 0.2651)
#m.constrain_fixed('t_noise_std', edited_real_sd) #m.constrain_fixed('t_noise_std', edited_real_sd)
#m.constrain_positive('rbf') #m.constrain_positive('rbf')
#m.constrain_positive('t_noise_std') m.constrain_positive('t_noise_std')
#m.constrain_positive('') #m.constrain_positive('')
#m.constrain_bounded('t_noi', 0.001, 10) #m.constrain_bounded('t_noi', 0.001, 10)
#m.constrain_fixed('t_noi', real_stu_t_std) #m.constrain_fixed('t_noi', real_stu_t_std)
m.constrain_fixed('white', 0.01) #m.constrain_fixed('white', 0.01)
#m.constrain_fixed('t_no', 0.01) #m.constrain_fixed('t_no', 0.01)
#m['rbf_var'] = 0.20446332 #m['rbf_var'] = 0.20446332
#m['rbf_leng'] = 0.85776241 #m['rbf_leng'] = 0.85776241
@ -179,7 +179,7 @@ def debug_student_t_noise_approx():
return m return m
#print "Clean student t, ncg" #print "Clean student t, ncg"
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
#stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg') #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
#m = GPy.models.GP(X, stu_t_likelihood, kernel3) #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
#m.ensure_default_constraints() #m.ensure_default_constraints()
@ -276,7 +276,7 @@ def student_t_approx():
edited_real_sd = real_sd #initial_var_guess edited_real_sd = real_sd #initial_var_guess
print "Clean student t, rasm" print "Clean student t, rasm"
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, stu_t_likelihood, kernel6) m = GPy.models.GP(X, stu_t_likelihood, kernel6)
m.ensure_default_constraints() m.ensure_default_constraints()
@ -291,7 +291,7 @@ def student_t_approx():
plt.title('Student-t rasm clean') plt.title('Student-t rasm clean')
print "Corrupt student t, rasm" print "Corrupt student t, rasm"
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
m.ensure_default_constraints() m.ensure_default_constraints()
@ -308,7 +308,7 @@ def student_t_approx():
return m return m
#print "Clean student t, ncg" #print "Clean student t, ncg"
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
#stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg') #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
#m = GPy.models.GP(X, stu_t_likelihood, kernel3) #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
#m.ensure_default_constraints() #m.ensure_default_constraints()
@ -322,7 +322,7 @@ def student_t_approx():
#plt.title('Student-t ncg clean') #plt.title('Student-t ncg clean')
#print "Corrupt student t, ncg" #print "Corrupt student t, ncg"
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
#corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg') #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
#m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
#m.ensure_default_constraints() #m.ensure_default_constraints()
@ -337,7 +337,7 @@ def student_t_approx():
###with a student t distribution, since it has heavy tails it should work well ###with a student t distribution, since it has heavy tails it should work well
###likelihood_function = student_t(deg_free, sigma=real_var) ###likelihood_function = student_t(deg_free, sigma2=real_var)
###lap = Laplace(Y, likelihood_function) ###lap = Laplace(Y, likelihood_function)
###cov = kernel.K(X) ###cov = kernel.K(X)
###lap.fit_full(cov) ###lap.fit_full(cov)

View file

@ -220,10 +220,10 @@ class Laplace(likelihood):
self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T) self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
#self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W)) #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
self.ln_z_hat = (- 0.5*self.f_Ki_f #self.ln_z_hat = (- 0.5*self.f_Ki_f
- self.ln_I_KW_det #- self.ln_I_KW_det
+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) #+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
) #)
return self._compute_GP_variables() return self._compute_GP_variables()
@ -308,6 +308,8 @@ class Laplace(likelihood):
step_size = 1 step_size = 1
rs = 0 rs = 0
i = 0 i = 0
#if self.likelihood_function.sigma < 0.001:
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
if not self.likelihood_function.log_concave: if not self.likelihood_function.log_concave:
@ -316,8 +318,6 @@ class Laplace(likelihood):
# To cause the posterior to become less certain than the prior and likelihood, # To cause the posterior to become less certain than the prior and likelihood,
# This is a property only held by non-log-concave likelihoods # This is a property only held by non-log-concave likelihoods
B, L, W_12 = self._compute_B_statistics(K, W) B, L, W_12 = self._compute_B_statistics(K, W)
#if i > 30:
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
W_f = W*f W_f = W*f
grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data) grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)

View file

@ -158,26 +158,26 @@ class student_t(likelihood_function):
dln p(yi|fi)_dfi dln p(yi|fi)_dfi
d2ln p(yi|fi)_d2fifj d2ln p(yi|fi)_d2fifj
""" """
def __init__(self, deg_free, sigma=2): def __init__(self, deg_free, sigma2=2):
#super(student_t, self).__init__() #super(student_t, self).__init__()
self.v = deg_free self.v = deg_free
self.sigma = sigma self.sigma2 = sigma2
self.log_concave = False self.log_concave = False
self._set_params(np.asarray(sigma)) self._set_params(np.asarray(sigma2))
def _get_params(self): def _get_params(self):
return np.asarray(self.sigma) return np.asarray(self.sigma2)
def _get_param_names(self): def _get_param_names(self):
return ["t_noise_std"] return ["t_noise_std2"]
def _set_params(self, x): def _set_params(self, x):
self.sigma = float(x) self.sigma2 = float(x)
@property @property
def variance(self, extra_data=None): def variance(self, extra_data=None):
return (self.v / float(self.v - 2)) * (self.sigma**2) return (self.v / float(self.v - 2)) * self.sigma2
def link_function(self, y, f, extra_data=None): def link_function(self, y, f, extra_data=None):
"""link_function $\ln p(y|f)$ """link_function $\ln p(y|f)$
@ -193,12 +193,16 @@ class student_t(likelihood_function):
""" """
assert y.shape == f.shape assert y.shape == f.shape
e = y - f e = y - f
A = gammaln((self.v + 1) * 0.5)
B = -gammaln(self.v * 0.5)
C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
D = (-(self.v + 1)*0.5)*np.log(1 + (e**2)/(self.v*self.sigma2))
objective = (+ gammaln((self.v + 1) * 0.5) objective = (+ gammaln((self.v + 1) * 0.5)
- gammaln(self.v * 0.5) - gammaln(self.v * 0.5)
- 0.5*np.log((self.sigma**2) * self.v * np.pi) - 0.5*np.log(self.sigma2 * self.v * np.pi)
#- (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v)) + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
- (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2)))
) )
#print "A: {} B: {} C: {} D: {} obj: {}".format(A,B,C,D.sum(),objective.sum())
return np.sum(objective) return np.sum(objective)
def dlik_df(self, y, f, extra_data=None): def dlik_df(self, y, f, extra_data=None):
@ -215,7 +219,7 @@ class student_t(likelihood_function):
""" """
assert y.shape == f.shape assert y.shape == f.shape
e = y - f e = y - f
grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
return grad return grad
def d2lik_d2f(self, y, f, extra_data=None): def d2lik_d2f(self, y, f, extra_data=None):
@ -235,7 +239,7 @@ class student_t(likelihood_function):
""" """
assert y.shape == f.shape assert y.shape == f.shape
e = y - f e = y - f
hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
return hess return hess
def d3lik_d3f(self, y, f, extra_data=None): def d3lik_d3f(self, y, f, extra_data=None):
@ -246,8 +250,8 @@ class student_t(likelihood_function):
""" """
assert y.shape == f.shape assert y.shape == f.shape
e = y - f e = y - f
d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) / d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
((e**2 + (self.sigma**2)*self.v)**3) ((e**2 + self.sigma2*self.v)**3)
) )
return d3lik_d3f return d3lik_d3f
@ -262,10 +266,16 @@ class student_t(likelihood_function):
""" """
assert y.shape == f.shape assert y.shape == f.shape
e = y - f e = y - f
dlik_dsigma = ( - (1/self.sigma) + #sigma = np.sqrt(self.sigma2)
((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) ) #dlik_dsigma = ( - (1/sigma) +
) #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
#)
#dlik_dsigma = ( - 1 +
#((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
#)
#dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1 #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
#dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
dlik_dsigma = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
return dlik_dsigma return dlik_dsigma
def dlik_df_dstd(self, y, f, extra_data=None): def dlik_df_dstd(self, y, f, extra_data=None):
@ -276,9 +286,11 @@ class student_t(likelihood_function):
""" """
assert y.shape == f.shape assert y.shape == f.shape
e = y - f e = y - f
dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) #2 might not want to be here? #sigma = np.sqrt(self.sigma2)
/ ((self.v*(self.sigma**2) + e**2)**2) #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
) #/ ((self.v*self.sigma2 + e**2)**2)
#)
dlik_grad_dsigma = (-self.v*(self.v+1)*e)/((self.sigma2*self.v + e**2)**2)
return dlik_grad_dsigma return dlik_grad_dsigma
def d2lik_d2f_dstd(self, y, f, extra_data=None): def d2lik_d2f_dstd(self, y, f, extra_data=None):
@ -289,11 +301,15 @@ class student_t(likelihood_function):
""" """
assert y.shape == f.shape assert y.shape == f.shape
e = y - f e = y - f
dlik_hess_dsigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) / #sigma = np.sqrt(self.sigma2)
((e**2 + (self.sigma**2)*self.v)**3) #dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
) #((e**2 + self.sigma2*self.v)**3)
#)
#dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2)) #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
#/ ((e**2 + (self.sigma**2)*self.v)**3) ) #/ ((e**2 + (self.sigma**2)*self.v)**3) )
dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
/ (self.sigma2*self.v + (e**2))**3
)
return dlik_hess_dsigma return dlik_hess_dsigma
def _gradients(self, y, f, extra_data=None): def _gradients(self, y, f, extra_data=None):
@ -466,3 +482,148 @@ class weibull_survival(likelihood_function):
hess = (y**self.shape)*np.exp(f) hess = (y**self.shape)*np.exp(f)
return np.squeeze(hess) return np.squeeze(hess)
#class gaussian(likelihood_function):
#"""
#Gaussian likelihood - this is a test class for approximation schemes
#"""
#def __init__(self, variance):
#self._set_params(np.asarray(variance))
#def _get_params(self):
#return np.asarray(self.sigma2)
#def _get_param_names(self):
#return ["noise_variance"]
#def _set_params(self, x):
#self.variance = float(x)
#def link_function(self, y, f, extra_data=None):
#"""link_function $\ln p(y|f)$
#$$\ln p(y_{i}|f_{i}) = \ln $$
#:y: data
#:f: latent variables f
#:extra_data: extra_data which is not used in student t distribution
#:returns: float(likelihood evaluated for this point)
#"""
#assert y.shape == f.shape
#e = y - f
#objective = -0.5*self.D*
#return np.sum(objective)
#def dlik_df(self, y, f, extra_data=None):
#"""
#Gradient of the link function at y, given f w.r.t f
#$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
#:y: data
#:f: latent variables f
#:extra_data: extra_data which is not used in student t distribution
#:returns: gradient of likelihood evaluated at points
#"""
#assert y.shape == f.shape
#e = y - f
#grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
#return grad
#def d2lik_d2f(self, y, f, extra_data=None):
#"""
#Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
#i.e. second derivative link_function at y given f f_j w.r.t f and f_j
#Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
#(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
#$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
#:y: data
#:f: latent variables f
#:extra_data: extra_data which is not used in student t distribution
#:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
#"""
#assert y.shape == f.shape
#e = y - f
#hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
#return hess
#def d3lik_d3f(self, y, f, extra_data=None):
#"""
#Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
#$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
#"""
#assert y.shape == f.shape
#e = y - f
#d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
#((e**2 + self.sigma2*self.v)**3)
#)
#return d3lik_d3f
#def lik_dstd(self, y, f, extra_data=None):
#"""
#Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
#Terms relavent to derivatives wrt sigma are:
#-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
#$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
#"""
#assert y.shape == f.shape
#e = y - f
#sigma = np.sqrt(self.sigma2)
##dlik_dsigma = ( - (1/sigma) +
##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
##)
##dlik_dsigma = ( - 1 +
##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
##)
##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
#dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
#return dlik_dsigma
#def dlik_df_dstd(self, y, f, extra_data=None):
#"""
#Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
#$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
#"""
#assert y.shape == f.shape
#e = y - f
#sigma = np.sqrt(self.sigma2)
#dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
#/ ((self.v*self.sigma2 + e**2)**2)
#)
#return dlik_grad_dsigma
#def d2lik_d2f_dstd(self, y, f, extra_data=None):
#"""
#Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
#$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
#"""
#assert y.shape == f.shape
#e = y - f
#sigma = np.sqrt(self.sigma2)
#dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
#((e**2 + self.sigma2*self.v)**3)
#)
##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
##/ ((e**2 + (self.sigma**2)*self.v)**3) )
#return dlik_hess_dsigma
#def _gradients(self, y, f, extra_data=None):
##must be listed in same order as 'get_param_names'
#derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
#[self.dlik_df_dstd(y, f, extra_data=extra_data)],
#[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
#) # lists as we might learn many parameters
## ensure we have gradients for every parameter we want to optimize
#assert len(derivs[0]) == len(self._get_param_names())
#assert len(derivs[1]) == len(self._get_param_names())
#assert len(derivs[2]) == len(self._get_param_names())
#return derivs