Started adding gaussian likelihood, changed round preloading old_a

This commit is contained in:
Alan Saul 2013-07-29 15:29:46 +01:00
parent 57001851c4
commit aa98608590
4 changed files with 321 additions and 181 deletions

View file

@ -244,6 +244,12 @@ class model(parameterised):
LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
prior_gradients = self._transform_gradients(self._log_prior_gradients())
obj_grads = -LL_gradients - prior_gradients
print self
print self._get_params()
print -obj_grads
self.plot()
if isinstance(self.likelihood, likelihoods.Laplace):
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
return obj_f, obj_grads
def optimize(self, optimizer=None, start=None, **kwargs):

View file

@ -85,10 +85,60 @@ def v_fail_test():
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
print(m)
def student_t_obj_plane():
plt.close('all')
X = np.linspace(0, 1, 50)[:, None]
real_std = 0.002
noise = np.random.randn(*X.shape)*real_std
Y = np.sin(X*2*np.pi) + noise
deg_free = 1000
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
mgp.ensure_default_constraints()
mgp['noise'] = real_std**2
print "Gaussian"
print mgp
kernelst = kernelgp.copy()
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2))
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
m.ensure_default_constraints()
m.constrain_fixed('t_no', real_std**2)
vs = 10
ls = 10
objs_t = np.zeros((vs, ls))
objs_g = np.zeros((vs, ls))
rbf_vs = np.linspace(1e-6, 8, vs)
rbf_ls = np.linspace(1e-2, 8, ls)
for v_id, rbf_v in enumerate(rbf_vs):
for l_id, rbf_l in enumerate(rbf_ls):
m['rbf_v'] = rbf_v
m['rbf_l'] = rbf_l
mgp['rbf_v'] = rbf_v
mgp['rbf_l'] = rbf_l
objs_t[v_id, l_id] = m.log_likelihood()
objs_g[v_id, l_id] = mgp.log_likelihood()
plt.figure()
plt.subplot(211)
plt.title('Student t')
plt.imshow(objs_t, interpolation='none')
plt.xlabel('variance')
plt.ylabel('lengthscale')
plt.subplot(212)
plt.title('Gaussian')
plt.imshow(objs_g, interpolation='none')
plt.xlabel('variance')
plt.ylabel('lengthscale')
plt.show()
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
return objs_t
def student_t_f_check():
plt.close('all')
X = np.linspace(0, 1, 50)[:, None]
real_std = 0.001
real_std = 0.2
noise = np.random.randn(*X.shape)*real_std
Y = np.sin(X*2*np.pi) + noise
deg_free = 1000
@ -98,17 +148,26 @@ def student_t_f_check():
mgp.ensure_default_constraints()
mgp.randomize()
mgp.optimize()
print "Gaussian"
print mgp
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
kernelst = kernelgp.copy()
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=1e-5)
#kernelst += GPy.kern.bias(X.shape[1])
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
m['rbf_v'] = mgp._get_params()[0]
m['rbf_l'] = mgp._get_params()[1] + 1
#m['rbf_v'] = mgp._get_params()[0]
#m['rbf_l'] = mgp._get_params()[1] + 1
m.ensure_default_constraints()
#m.constrain_fixed('rbf_v', mgp._get_params()[0])
#m.constrain_fixed('rbf_l', mgp._get_params()[1])
#m.constrain_bounded('t_no', 2*real_std**2, 1e3)
#m.constrain_positive('bias')
m.constrain_positive('t_no')
m.randomize()
m['t_no'] = 0.3
m.likelihood.X = X
print m
plt.figure()
plt.subplot(511)
@ -143,7 +202,8 @@ def student_t_fix_optimise_check():
Y = np.sin(X*2*np.pi) + noise
X_full = X
Y_full = np.sin(X_full)
#Y = Y/Y.max()
Y = Y/Y.max()
Y_full = Y_full/Y_full.max()
deg_free = 1000
#GP
@ -219,7 +279,7 @@ def student_t_fix_optimise_check():
plt.subplot(121)
mrbf.plot()
plt.title('Student t fixed noise')
#mrbf.optimize()
mrbf.optimize()
print "After optimize"
print mrbf
plt.subplot(122)

View file

@ -156,17 +156,23 @@ class Laplace(likelihood):
Y_tilde = Wi*self.Ki_f + self.f_hat
self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
#self.Wi_K_i[self.Wi_K_i< 1e-6] = 1e-6
self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
Z_tilde = (#+ self.NORMAL_CONST
self.aA = 0.5*self.ln_det_K_Wi__Bi
self.bB = - 0.5*self.f_Ki_f
self.cC = 0.5*self.y_Wi_Ki_i_y
Z_tilde = (+ 100*self.NORMAL_CONST
+ self.lik
+ 0.5*self.ln_det_K_Wi__Bi
- 0.5*self.f_Ki_f
+ 0.5*self.y_Wi_Ki_i_y
)
#print "Ztilde: {}".format(Z_tilde)
print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC)
print self.likelihood_function._get_params()
#Convert to float as its (1, 1) and Z must be a scalar
self.Z = np.float64(Z_tilde)
@ -198,7 +204,7 @@ class Laplace(likelihood):
self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
if not self.likelihood_function.log_concave:
self.W[self.W < 0] = 1e-10 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
#If the likelihood is non-log-concave. We wan't to say that there is a negative variance
#To cause the posterior to become less certain than the prior and likelihood,
#This is a property only held by non-log-concave likelihoods
@ -280,7 +286,7 @@ class Laplace(likelihood):
f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
return f_hat[:, None]
def rasm_mode(self, K, MAX_ITER=40, MAX_RESTART=10):
def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
"""
Rasmussens numerically stable mode finding
For nomenclature see Rasmussen & Williams 2006
@ -290,15 +296,19 @@ class Laplace(likelihood):
:MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
:returns: f_mode
"""
old_a = np.zeros((self.N, 1))
#old_a = None
#if self.old_a is None:
#old_a = np.zeros((self.N, 1))
#else:
#old_a = self.old_a
self.old_before_s = self.likelihood_function._get_params()
print "before: ", self.old_before_s
#if self.old_before_s < 1e-5:
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
#old_a = np.zeros((self.N, 1))
if self.old_a is None:
old_a = np.zeros((self.N, 1))
f = np.dot(K, old_a)
else:
old_a = self.old_a.copy()
f = self.f_hat.copy()
f = np.dot(self.K, old_a)
self.f = f
new_obj = -np.inf
old_obj = np.inf
@ -306,18 +316,20 @@ class Laplace(likelihood):
return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
difference = np.inf
epsilon = 1e-10
epsilon = 1e-4
step_size = 1
rs = 0
i = 0
while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
#W = np.maximum(W, 0)
if not self.likelihood_function.log_concave:
W[W < 0] = 1e-10 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
# If the likelihood is non-log-concave. We wan't to say that there is a negative variance
# To cause the posterior to become less certain than the prior and likelihood,
# This is a property only held by non-log-concave likelihoods
B, L, W_12 = self._compute_B_statistics(K, W)
B, L, W_12 = self._compute_B_statistics(K, W.copy())
W_f = W*f
grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
@ -328,54 +340,105 @@ class Laplace(likelihood):
full_step_a = b - W_12*solve_L
da = full_step_a - old_a
f_old = f.copy()
f_old = self.f.copy()
def inner_obj(step_size, old_a, da, K):
a = old_a + step_size*da
f = np.dot(K, a)
self.a = a # This is nasty, need to set something within an optimization though
self.f = f
return -obj(a, f)
from functools import partial
i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K)
new_obj = sp.optimize.brent(i_o, tol=1e-6, maxiter=10)
#update_passed = False
#while not update_passed:
#f_old = f.copy()
#def inner_obj(step_size, old_a, da, K):
#a = old_a + step_size*da
#f = np.dot(K, a)
#self.a = a.copy() # This is nasty, need to set something within an optimization though
#self.f = f.copy()
#return -obj(a, f)
#old_obj = new_obj
#new_obj = obj(a, f)
#difference = new_obj - old_obj
#print "difference: ",difference
#if difference < 0:
##print grad
##print "Objective function rose", np.float(difference)
##If the objective function isn't rising, restart optimization
#step_size *= 0.8
##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
##objective function isn't increasing, try reducing step size
##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
##old_obj = tmp_old_obj
#old_obj = new_obj
#rs += 1
#else:
#update_passed = True
#from functools import partial
#i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
#new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
#f = self.f.copy()
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
f = self.f
difference = new_obj - old_obj
difference = np.abs(np.sum(f - f_old)) #+ abs(difference)
old_a = self.a #a
f_old = f.copy()
update_passed = False
while not update_passed:
a = old_a + step_size*da
f = np.dot(K, a)
old_obj = new_obj
new_obj = obj(a, f)
difference = new_obj - old_obj
print "difference: ",difference
if difference < 0:
#print "Objective function rose", np.float(difference)
#If the objective function isn't rising, restart optimization
step_size *= 0.8
#print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
#objective function isn't increasing, try reducing step size
f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
old_obj = new_obj
rs += 1
else:
update_passed = True
#difference = abs(new_obj - old_obj)
#old_obj = new_obj.copy()
difference = np.abs(np.sum(f - f_old))
#old_a = self.a.copy() #a
old_a = a.copy()
i += 1
#print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
self.old_a = old_a
self.old_a = old_a.copy()
#print "Positive difference obj: ", np.float(difference)
#print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
print "Iterations: {}, Final_difference: {}".format(i, difference)
#self.a = a
if difference > 1e-4:
print "FAIL FAIL FAIL FAIL FAIL FAIL"
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
if hasattr(self, 'X'):
import pylab as pb
pb.figure()
pb.subplot(311)
pb.title('old f_hat')
pb.plot(self.X, self.f_hat)
pb.subplot(312)
pb.title('old ff')
pb.plot(self.X, self.old_ff)
pb.subplot(313)
pb.title('new f_hat')
pb.plot(self.X, f)
pb.figure()
pb.subplot(121)
pb.title('old K')
pb.imshow(np.diagflat(self.old_K), interpolation='none')
pb.colorbar()
pb.subplot(122)
pb.title('new K')
pb.imshow(np.diagflat(K), interpolation='none')
pb.colorbar()
pb.figure()
pb.subplot(121)
pb.title('old W')
pb.imshow(np.diagflat(self.old_W), interpolation='none')
pb.colorbar()
pb.subplot(122)
pb.title('new W')
pb.imshow(np.diagflat(W), interpolation='none')
pb.colorbar()
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
pb.close('all')
#FIXME: DELETE THESE
self.old_W = W.copy()
self.old_grad = grad.copy()
self.old_B = B.copy()
self.old_W_12 = W_12.copy()
self.old_ff = f.copy()
self.old_K = self.K.copy()
self.old_s = self.likelihood_function._get_params()
print "after: ", self.old_s
#print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
self.a = a
#self.B, self.B_chol, self.W_12 = B, L, W_12
#self.Bi, _, _, B_det = pdinv(self.B)
return f

View file

@ -193,11 +193,16 @@ class student_t(likelihood_function):
"""
assert y.shape == f.shape
e = y - f
#A = gammaln((self.v + 1) * 0.5)
#B = - gammaln(self.v * 0.5)
#C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
#D = + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
objective = (+ gammaln((self.v + 1) * 0.5)
- gammaln(self.v * 0.5)
- 0.5*np.log(self.sigma2 * self.v * np.pi)
+ (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
)
#print "C: {} D: {} obj: {}".format(C, np.sum(D), objective.sum())
return np.sum(objective)
def dlik_df(self, y, f, extra_data=None):
@ -459,147 +464,153 @@ class weibull_survival(likelihood_function):
hess = (y**self.shape)*np.exp(f)
return np.squeeze(hess)
#class gaussian(likelihood_function):
#"""
#Gaussian likelihood - this is a test class for approximation schemes
#"""
#def __init__(self, variance):
#self._set_params(np.asarray(variance))
class gaussian(likelihood_function):
"""
Gaussian likelihood - this is a test class for approximation schemes
"""
def __init__(self, variance):
self._set_params(np.asarray(variance))
#def _get_params(self):
#return np.asarray(self.sigma2)
def _get_params(self):
return np.asarray(self._variance)
#def _get_param_names(self):
#return ["noise_variance"]
def _get_param_names(self):
return ["noise_variance"]
#def _set_params(self, x):
#self.variance = float(x)
def _set_params(self, x):
self._variance = float(x)
self.covariance_matrix = np.eye(self.N) * self._variance
self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG
#def link_function(self, y, f, extra_data=None):
#"""link_function $\ln p(y|f)$
#$$\ln p(y_{i}|f_{i}) = \ln $$
def link_function(self, y, f, extra_data=None):
"""link_function $\ln p(y|f)$
$$\ln p(y_{i}|f_{i}) = \ln $$
#:y: data
#:f: latent variables f
#:extra_data: extra_data which is not used in student t distribution
#:returns: float(likelihood evaluated for this point)
:y: data
:f: latent variables f
:extra_data: extra_data which is not used in student t distribution
:returns: float(likelihood evaluated for this point)
#"""
#assert y.shape == f.shape
#e = y - f
#objective = -0.5*self.D*
#return np.sum(objective)
"""
assert y.shape == f.shape
e = y - f
eeT = np.dot(e, e.T)
objective = (- 0.5*self.D*np.log(2*np.pi)
- 0.5*self.ln_K
- 0.5*np.sum(np.multiply(self.Ki, eeT))
)
return np.sum(objective)
#def dlik_df(self, y, f, extra_data=None):
#"""
#Gradient of the link function at y, given f w.r.t f
def dlik_df(self, y, f, extra_data=None):
"""
Gradient of the link function at y, given f w.r.t f
#$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
#:y: data
#:f: latent variables f
#:extra_data: extra_data which is not used in student t distribution
#:returns: gradient of likelihood evaluated at points
:y: data
:f: latent variables f
:extra_data: extra_data which is not used in student t distribution
:returns: gradient of likelihood evaluated at points
#"""
#assert y.shape == f.shape
#e = y - f
#grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
#return grad
"""
assert y.shape == f.shape
e = y - f
grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
return grad
#def d2lik_d2f(self, y, f, extra_data=None):
#"""
#Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
#i.e. second derivative link_function at y given f f_j w.r.t f and f_j
def d2lik_d2f(self, y, f, extra_data=None):
"""
Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
i.e. second derivative link_function at y given f f_j w.r.t f and f_j
#Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
#(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
#$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
#:y: data
#:f: latent variables f
#:extra_data: extra_data which is not used in student t distribution
#:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
#"""
#assert y.shape == f.shape
#e = y - f
#hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
#return hess
:y: data
:f: latent variables f
:extra_data: extra_data which is not used in student t distribution
:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
"""
assert y.shape == f.shape
e = y - f
hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
return hess
#def d3lik_d3f(self, y, f, extra_data=None):
#"""
#Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
def d3lik_d3f(self, y, f, extra_data=None):
"""
Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
#$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
#"""
#assert y.shape == f.shape
#e = y - f
#d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
#((e**2 + self.sigma2*self.v)**3)
#)
#return d3lik_d3f
$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
"""
assert y.shape == f.shape
e = y - f
d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
((e**2 + self.sigma2*self.v)**3)
)
return d3lik_d3f
#def lik_dstd(self, y, f, extra_data=None):
#"""
#Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
def lik_dstd(self, y, f, extra_data=None):
"""
Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
#Terms relavent to derivatives wrt sigma are:
#-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
Terms relavent to derivatives wrt sigma are:
-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
#$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
#"""
#assert y.shape == f.shape
#e = y - f
#sigma = np.sqrt(self.sigma2)
##dlik_dsigma = ( - (1/sigma) +
##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
##)
##dlik_dsigma = ( - 1 +
##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
##)
##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
#dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
#return dlik_dsigma
$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
"""
assert y.shape == f.shape
e = y - f
sigma = np.sqrt(self.sigma2)
#dlik_dsigma = ( - (1/sigma) +
#((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
#)
#dlik_dsigma = ( - 1 +
#((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
#)
#dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
return dlik_dsigma
#def dlik_df_dstd(self, y, f, extra_data=None):
#"""
#Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
def dlik_df_dstd(self, y, f, extra_data=None):
"""
Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
#$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
#"""
#assert y.shape == f.shape
#e = y - f
#sigma = np.sqrt(self.sigma2)
#dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
#/ ((self.v*self.sigma2 + e**2)**2)
#)
#return dlik_grad_dsigma
$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
"""
assert y.shape == f.shape
e = y - f
sigma = np.sqrt(self.sigma2)
dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
/ ((self.v*self.sigma2 + e**2)**2)
)
return dlik_grad_dsigma
#def d2lik_d2f_dstd(self, y, f, extra_data=None):
#"""
#Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
def d2lik_d2f_dstd(self, y, f, extra_data=None):
"""
Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
#$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
#"""
#assert y.shape == f.shape
#e = y - f
#sigma = np.sqrt(self.sigma2)
#dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
#((e**2 + self.sigma2*self.v)**3)
#)
##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
##/ ((e**2 + (self.sigma**2)*self.v)**3) )
#return dlik_hess_dsigma
$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
"""
assert y.shape == f.shape
e = y - f
sigma = np.sqrt(self.sigma2)
dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
((e**2 + self.sigma2*self.v)**3)
)
#dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
#/ ((e**2 + (self.sigma**2)*self.v)**3) )
return dlik_hess_dsigma
#def _gradients(self, y, f, extra_data=None):
##must be listed in same order as 'get_param_names'
#derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
#[self.dlik_df_dstd(y, f, extra_data=extra_data)],
#[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
#) # lists as we might learn many parameters
## ensure we have gradients for every parameter we want to optimize
#assert len(derivs[0]) == len(self._get_param_names())
#assert len(derivs[1]) == len(self._get_param_names())
#assert len(derivs[2]) == len(self._get_param_names())
#return derivs
def _gradients(self, y, f, extra_data=None):
#must be listed in same order as 'get_param_names'
derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
[self.dlik_df_dstd(y, f, extra_data=extra_data)],
[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
) # lists as we might learn many parameters
# ensure we have gradients for every parameter we want to optimize
assert len(derivs[0]) == len(self._get_param_names())
assert len(derivs[1]) == len(self._get_param_names())
assert len(derivs[2]) == len(self._get_param_names())
return derivs