mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-06-11 15:15:15 +02:00
Started adding gaussian likelihood, changed round preloading old_a
This commit is contained in:
parent
57001851c4
commit
aa98608590
4 changed files with 321 additions and 181 deletions
|
|
@ -244,6 +244,12 @@ class model(parameterised):
|
|||
LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
|
||||
prior_gradients = self._transform_gradients(self._log_prior_gradients())
|
||||
obj_grads = -LL_gradients - prior_gradients
|
||||
print self
|
||||
print self._get_params()
|
||||
print -obj_grads
|
||||
self.plot()
|
||||
if isinstance(self.likelihood, likelihoods.Laplace):
|
||||
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
return obj_f, obj_grads
|
||||
|
||||
def optimize(self, optimizer=None, start=None, **kwargs):
|
||||
|
|
|
|||
|
|
@ -85,10 +85,60 @@ def v_fail_test():
|
|||
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
print(m)
|
||||
|
||||
def student_t_obj_plane():
|
||||
plt.close('all')
|
||||
X = np.linspace(0, 1, 50)[:, None]
|
||||
real_std = 0.002
|
||||
noise = np.random.randn(*X.shape)*real_std
|
||||
Y = np.sin(X*2*np.pi) + noise
|
||||
deg_free = 1000
|
||||
|
||||
kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
|
||||
mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
|
||||
mgp.ensure_default_constraints()
|
||||
mgp['noise'] = real_std**2
|
||||
print "Gaussian"
|
||||
print mgp
|
||||
|
||||
kernelst = kernelgp.copy()
|
||||
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2))
|
||||
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
|
||||
m.ensure_default_constraints()
|
||||
m.constrain_fixed('t_no', real_std**2)
|
||||
vs = 10
|
||||
ls = 10
|
||||
objs_t = np.zeros((vs, ls))
|
||||
objs_g = np.zeros((vs, ls))
|
||||
rbf_vs = np.linspace(1e-6, 8, vs)
|
||||
rbf_ls = np.linspace(1e-2, 8, ls)
|
||||
for v_id, rbf_v in enumerate(rbf_vs):
|
||||
for l_id, rbf_l in enumerate(rbf_ls):
|
||||
m['rbf_v'] = rbf_v
|
||||
m['rbf_l'] = rbf_l
|
||||
mgp['rbf_v'] = rbf_v
|
||||
mgp['rbf_l'] = rbf_l
|
||||
objs_t[v_id, l_id] = m.log_likelihood()
|
||||
objs_g[v_id, l_id] = mgp.log_likelihood()
|
||||
plt.figure()
|
||||
plt.subplot(211)
|
||||
plt.title('Student t')
|
||||
plt.imshow(objs_t, interpolation='none')
|
||||
plt.xlabel('variance')
|
||||
plt.ylabel('lengthscale')
|
||||
plt.subplot(212)
|
||||
plt.title('Gaussian')
|
||||
plt.imshow(objs_g, interpolation='none')
|
||||
plt.xlabel('variance')
|
||||
plt.ylabel('lengthscale')
|
||||
plt.show()
|
||||
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
return objs_t
|
||||
|
||||
def student_t_f_check():
|
||||
plt.close('all')
|
||||
X = np.linspace(0, 1, 50)[:, None]
|
||||
real_std = 0.001
|
||||
real_std = 0.2
|
||||
noise = np.random.randn(*X.shape)*real_std
|
||||
Y = np.sin(X*2*np.pi) + noise
|
||||
deg_free = 1000
|
||||
|
|
@ -98,17 +148,26 @@ def student_t_f_check():
|
|||
mgp.ensure_default_constraints()
|
||||
mgp.randomize()
|
||||
mgp.optimize()
|
||||
print "Gaussian"
|
||||
print mgp
|
||||
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
|
||||
kernelst = kernelgp.copy()
|
||||
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=1e-5)
|
||||
#kernelst += GPy.kern.bias(X.shape[1])
|
||||
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05)
|
||||
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
|
||||
m = GPy.models.GP(X, stu_t_likelihood, kernelst)
|
||||
m['rbf_v'] = mgp._get_params()[0]
|
||||
m['rbf_l'] = mgp._get_params()[1] + 1
|
||||
#m['rbf_v'] = mgp._get_params()[0]
|
||||
#m['rbf_l'] = mgp._get_params()[1] + 1
|
||||
m.ensure_default_constraints()
|
||||
#m.constrain_fixed('rbf_v', mgp._get_params()[0])
|
||||
#m.constrain_fixed('rbf_l', mgp._get_params()[1])
|
||||
#m.constrain_bounded('t_no', 2*real_std**2, 1e3)
|
||||
#m.constrain_positive('bias')
|
||||
m.constrain_positive('t_no')
|
||||
m.randomize()
|
||||
m['t_no'] = 0.3
|
||||
m.likelihood.X = X
|
||||
print m
|
||||
plt.figure()
|
||||
plt.subplot(511)
|
||||
|
|
@ -143,7 +202,8 @@ def student_t_fix_optimise_check():
|
|||
Y = np.sin(X*2*np.pi) + noise
|
||||
X_full = X
|
||||
Y_full = np.sin(X_full)
|
||||
#Y = Y/Y.max()
|
||||
Y = Y/Y.max()
|
||||
Y_full = Y_full/Y_full.max()
|
||||
deg_free = 1000
|
||||
|
||||
#GP
|
||||
|
|
@ -219,7 +279,7 @@ def student_t_fix_optimise_check():
|
|||
plt.subplot(121)
|
||||
mrbf.plot()
|
||||
plt.title('Student t fixed noise')
|
||||
#mrbf.optimize()
|
||||
mrbf.optimize()
|
||||
print "After optimize"
|
||||
print mrbf
|
||||
plt.subplot(122)
|
||||
|
|
|
|||
|
|
@ -156,17 +156,23 @@ class Laplace(likelihood):
|
|||
Y_tilde = Wi*self.Ki_f + self.f_hat
|
||||
|
||||
self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
|
||||
#self.Wi_K_i[self.Wi_K_i< 1e-6] = 1e-6
|
||||
|
||||
self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
|
||||
self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
|
||||
|
||||
self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
|
||||
Z_tilde = (#+ self.NORMAL_CONST
|
||||
self.aA = 0.5*self.ln_det_K_Wi__Bi
|
||||
self.bB = - 0.5*self.f_Ki_f
|
||||
self.cC = 0.5*self.y_Wi_Ki_i_y
|
||||
Z_tilde = (+ 100*self.NORMAL_CONST
|
||||
+ self.lik
|
||||
+ 0.5*self.ln_det_K_Wi__Bi
|
||||
- 0.5*self.f_Ki_f
|
||||
+ 0.5*self.y_Wi_Ki_i_y
|
||||
)
|
||||
#print "Ztilde: {}".format(Z_tilde)
|
||||
print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC)
|
||||
print self.likelihood_function._get_params()
|
||||
|
||||
#Convert to float as its (1, 1) and Z must be a scalar
|
||||
self.Z = np.float64(Z_tilde)
|
||||
|
|
@ -198,7 +204,7 @@ class Laplace(likelihood):
|
|||
self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
|
||||
|
||||
if not self.likelihood_function.log_concave:
|
||||
self.W[self.W < 0] = 1e-10 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
|
||||
self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
|
||||
#If the likelihood is non-log-concave. We wan't to say that there is a negative variance
|
||||
#To cause the posterior to become less certain than the prior and likelihood,
|
||||
#This is a property only held by non-log-concave likelihoods
|
||||
|
|
@ -280,7 +286,7 @@ class Laplace(likelihood):
|
|||
f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
|
||||
return f_hat[:, None]
|
||||
|
||||
def rasm_mode(self, K, MAX_ITER=40, MAX_RESTART=10):
|
||||
def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
|
||||
"""
|
||||
Rasmussens numerically stable mode finding
|
||||
For nomenclature see Rasmussen & Williams 2006
|
||||
|
|
@ -290,15 +296,19 @@ class Laplace(likelihood):
|
|||
:MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
|
||||
:returns: f_mode
|
||||
"""
|
||||
old_a = np.zeros((self.N, 1))
|
||||
#old_a = None
|
||||
#if self.old_a is None:
|
||||
#old_a = np.zeros((self.N, 1))
|
||||
#else:
|
||||
#old_a = self.old_a
|
||||
self.old_before_s = self.likelihood_function._get_params()
|
||||
print "before: ", self.old_before_s
|
||||
#if self.old_before_s < 1e-5:
|
||||
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
|
||||
#old_a = np.zeros((self.N, 1))
|
||||
if self.old_a is None:
|
||||
old_a = np.zeros((self.N, 1))
|
||||
f = np.dot(K, old_a)
|
||||
else:
|
||||
old_a = self.old_a.copy()
|
||||
f = self.f_hat.copy()
|
||||
|
||||
f = np.dot(self.K, old_a)
|
||||
self.f = f
|
||||
new_obj = -np.inf
|
||||
old_obj = np.inf
|
||||
|
||||
|
|
@ -306,18 +316,20 @@ class Laplace(likelihood):
|
|||
return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
|
||||
|
||||
difference = np.inf
|
||||
epsilon = 1e-10
|
||||
epsilon = 1e-4
|
||||
step_size = 1
|
||||
rs = 0
|
||||
i = 0
|
||||
while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
|
||||
|
||||
while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
|
||||
W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
|
||||
#W = np.maximum(W, 0)
|
||||
if not self.likelihood_function.log_concave:
|
||||
W[W < 0] = 1e-10 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
|
||||
W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
|
||||
# If the likelihood is non-log-concave. We wan't to say that there is a negative variance
|
||||
# To cause the posterior to become less certain than the prior and likelihood,
|
||||
# This is a property only held by non-log-concave likelihoods
|
||||
B, L, W_12 = self._compute_B_statistics(K, W)
|
||||
B, L, W_12 = self._compute_B_statistics(K, W.copy())
|
||||
|
||||
W_f = W*f
|
||||
grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
|
||||
|
|
@ -328,54 +340,105 @@ class Laplace(likelihood):
|
|||
full_step_a = b - W_12*solve_L
|
||||
da = full_step_a - old_a
|
||||
|
||||
f_old = f.copy()
|
||||
|
||||
f_old = self.f.copy()
|
||||
def inner_obj(step_size, old_a, da, K):
|
||||
a = old_a + step_size*da
|
||||
f = np.dot(K, a)
|
||||
self.a = a # This is nasty, need to set something within an optimization though
|
||||
self.f = f
|
||||
return -obj(a, f)
|
||||
|
||||
from functools import partial
|
||||
i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K)
|
||||
new_obj = sp.optimize.brent(i_o, tol=1e-6, maxiter=10)
|
||||
|
||||
#update_passed = False
|
||||
#while not update_passed:
|
||||
#f_old = f.copy()
|
||||
#def inner_obj(step_size, old_a, da, K):
|
||||
#a = old_a + step_size*da
|
||||
#f = np.dot(K, a)
|
||||
#self.a = a.copy() # This is nasty, need to set something within an optimization though
|
||||
#self.f = f.copy()
|
||||
#return -obj(a, f)
|
||||
|
||||
#old_obj = new_obj
|
||||
#new_obj = obj(a, f)
|
||||
#difference = new_obj - old_obj
|
||||
#print "difference: ",difference
|
||||
#if difference < 0:
|
||||
##print grad
|
||||
##print "Objective function rose", np.float(difference)
|
||||
##If the objective function isn't rising, restart optimization
|
||||
#step_size *= 0.8
|
||||
##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
|
||||
##objective function isn't increasing, try reducing step size
|
||||
##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
|
||||
##old_obj = tmp_old_obj
|
||||
#old_obj = new_obj
|
||||
#rs += 1
|
||||
#else:
|
||||
#update_passed = True
|
||||
#from functools import partial
|
||||
#i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
|
||||
##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
|
||||
#new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
|
||||
#f = self.f.copy()
|
||||
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
|
||||
f = self.f
|
||||
difference = new_obj - old_obj
|
||||
difference = np.abs(np.sum(f - f_old)) #+ abs(difference)
|
||||
old_a = self.a #a
|
||||
f_old = f.copy()
|
||||
update_passed = False
|
||||
while not update_passed:
|
||||
a = old_a + step_size*da
|
||||
f = np.dot(K, a)
|
||||
|
||||
old_obj = new_obj
|
||||
new_obj = obj(a, f)
|
||||
difference = new_obj - old_obj
|
||||
print "difference: ",difference
|
||||
if difference < 0:
|
||||
#print "Objective function rose", np.float(difference)
|
||||
#If the objective function isn't rising, restart optimization
|
||||
step_size *= 0.8
|
||||
#print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
|
||||
#objective function isn't increasing, try reducing step size
|
||||
f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
|
||||
old_obj = new_obj
|
||||
rs += 1
|
||||
else:
|
||||
update_passed = True
|
||||
|
||||
#difference = abs(new_obj - old_obj)
|
||||
#old_obj = new_obj.copy()
|
||||
difference = np.abs(np.sum(f - f_old))
|
||||
#old_a = self.a.copy() #a
|
||||
old_a = a.copy()
|
||||
i += 1
|
||||
#print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
|
||||
|
||||
self.old_a = old_a
|
||||
self.old_a = old_a.copy()
|
||||
#print "Positive difference obj: ", np.float(difference)
|
||||
#print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
|
||||
print "Iterations: {}, Final_difference: {}".format(i, difference)
|
||||
#self.a = a
|
||||
if difference > 1e-4:
|
||||
print "FAIL FAIL FAIL FAIL FAIL FAIL"
|
||||
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
if hasattr(self, 'X'):
|
||||
import pylab as pb
|
||||
pb.figure()
|
||||
pb.subplot(311)
|
||||
pb.title('old f_hat')
|
||||
pb.plot(self.X, self.f_hat)
|
||||
pb.subplot(312)
|
||||
pb.title('old ff')
|
||||
pb.plot(self.X, self.old_ff)
|
||||
pb.subplot(313)
|
||||
pb.title('new f_hat')
|
||||
pb.plot(self.X, f)
|
||||
|
||||
pb.figure()
|
||||
pb.subplot(121)
|
||||
pb.title('old K')
|
||||
pb.imshow(np.diagflat(self.old_K), interpolation='none')
|
||||
pb.colorbar()
|
||||
pb.subplot(122)
|
||||
pb.title('new K')
|
||||
pb.imshow(np.diagflat(K), interpolation='none')
|
||||
pb.colorbar()
|
||||
|
||||
pb.figure()
|
||||
pb.subplot(121)
|
||||
pb.title('old W')
|
||||
pb.imshow(np.diagflat(self.old_W), interpolation='none')
|
||||
pb.colorbar()
|
||||
pb.subplot(122)
|
||||
pb.title('new W')
|
||||
pb.imshow(np.diagflat(W), interpolation='none')
|
||||
pb.colorbar()
|
||||
|
||||
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
|
||||
pb.close('all')
|
||||
|
||||
#FIXME: DELETE THESE
|
||||
self.old_W = W.copy()
|
||||
self.old_grad = grad.copy()
|
||||
self.old_B = B.copy()
|
||||
self.old_W_12 = W_12.copy()
|
||||
self.old_ff = f.copy()
|
||||
self.old_K = self.K.copy()
|
||||
self.old_s = self.likelihood_function._get_params()
|
||||
print "after: ", self.old_s
|
||||
#print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
|
||||
self.a = a
|
||||
#self.B, self.B_chol, self.W_12 = B, L, W_12
|
||||
#self.Bi, _, _, B_det = pdinv(self.B)
|
||||
return f
|
||||
|
|
|
|||
|
|
@ -193,11 +193,16 @@ class student_t(likelihood_function):
|
|||
"""
|
||||
assert y.shape == f.shape
|
||||
e = y - f
|
||||
#A = gammaln((self.v + 1) * 0.5)
|
||||
#B = - gammaln(self.v * 0.5)
|
||||
#C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
|
||||
#D = + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
|
||||
objective = (+ gammaln((self.v + 1) * 0.5)
|
||||
- gammaln(self.v * 0.5)
|
||||
- 0.5*np.log(self.sigma2 * self.v * np.pi)
|
||||
+ (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
|
||||
)
|
||||
#print "C: {} D: {} obj: {}".format(C, np.sum(D), objective.sum())
|
||||
return np.sum(objective)
|
||||
|
||||
def dlik_df(self, y, f, extra_data=None):
|
||||
|
|
@ -459,147 +464,153 @@ class weibull_survival(likelihood_function):
|
|||
hess = (y**self.shape)*np.exp(f)
|
||||
return np.squeeze(hess)
|
||||
|
||||
#class gaussian(likelihood_function):
|
||||
#"""
|
||||
#Gaussian likelihood - this is a test class for approximation schemes
|
||||
#"""
|
||||
#def __init__(self, variance):
|
||||
#self._set_params(np.asarray(variance))
|
||||
class gaussian(likelihood_function):
|
||||
"""
|
||||
Gaussian likelihood - this is a test class for approximation schemes
|
||||
"""
|
||||
def __init__(self, variance):
|
||||
self._set_params(np.asarray(variance))
|
||||
|
||||
#def _get_params(self):
|
||||
#return np.asarray(self.sigma2)
|
||||
def _get_params(self):
|
||||
return np.asarray(self._variance)
|
||||
|
||||
#def _get_param_names(self):
|
||||
#return ["noise_variance"]
|
||||
def _get_param_names(self):
|
||||
return ["noise_variance"]
|
||||
|
||||
#def _set_params(self, x):
|
||||
#self.variance = float(x)
|
||||
def _set_params(self, x):
|
||||
self._variance = float(x)
|
||||
self.covariance_matrix = np.eye(self.N) * self._variance
|
||||
self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG
|
||||
|
||||
#def link_function(self, y, f, extra_data=None):
|
||||
#"""link_function $\ln p(y|f)$
|
||||
#$$\ln p(y_{i}|f_{i}) = \ln $$
|
||||
def link_function(self, y, f, extra_data=None):
|
||||
"""link_function $\ln p(y|f)$
|
||||
$$\ln p(y_{i}|f_{i}) = \ln $$
|
||||
|
||||
#:y: data
|
||||
#:f: latent variables f
|
||||
#:extra_data: extra_data which is not used in student t distribution
|
||||
#:returns: float(likelihood evaluated for this point)
|
||||
:y: data
|
||||
:f: latent variables f
|
||||
:extra_data: extra_data which is not used in student t distribution
|
||||
:returns: float(likelihood evaluated for this point)
|
||||
|
||||
#"""
|
||||
#assert y.shape == f.shape
|
||||
#e = y - f
|
||||
#objective = -0.5*self.D*
|
||||
#return np.sum(objective)
|
||||
"""
|
||||
assert y.shape == f.shape
|
||||
e = y - f
|
||||
eeT = np.dot(e, e.T)
|
||||
objective = (- 0.5*self.D*np.log(2*np.pi)
|
||||
- 0.5*self.ln_K
|
||||
- 0.5*np.sum(np.multiply(self.Ki, eeT))
|
||||
)
|
||||
return np.sum(objective)
|
||||
|
||||
#def dlik_df(self, y, f, extra_data=None):
|
||||
#"""
|
||||
#Gradient of the link function at y, given f w.r.t f
|
||||
def dlik_df(self, y, f, extra_data=None):
|
||||
"""
|
||||
Gradient of the link function at y, given f w.r.t f
|
||||
|
||||
#$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
|
||||
$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
|
||||
|
||||
#:y: data
|
||||
#:f: latent variables f
|
||||
#:extra_data: extra_data which is not used in student t distribution
|
||||
#:returns: gradient of likelihood evaluated at points
|
||||
:y: data
|
||||
:f: latent variables f
|
||||
:extra_data: extra_data which is not used in student t distribution
|
||||
:returns: gradient of likelihood evaluated at points
|
||||
|
||||
#"""
|
||||
#assert y.shape == f.shape
|
||||
#e = y - f
|
||||
#grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
|
||||
#return grad
|
||||
"""
|
||||
assert y.shape == f.shape
|
||||
e = y - f
|
||||
grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
|
||||
return grad
|
||||
|
||||
#def d2lik_d2f(self, y, f, extra_data=None):
|
||||
#"""
|
||||
#Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
|
||||
#i.e. second derivative link_function at y given f f_j w.r.t f and f_j
|
||||
def d2lik_d2f(self, y, f, extra_data=None):
|
||||
"""
|
||||
Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
|
||||
i.e. second derivative link_function at y given f f_j w.r.t f and f_j
|
||||
|
||||
#Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
|
||||
#(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
|
||||
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
|
||||
(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
|
||||
|
||||
#$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
|
||||
$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
|
||||
|
||||
#:y: data
|
||||
#:f: latent variables f
|
||||
#:extra_data: extra_data which is not used in student t distribution
|
||||
#:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
|
||||
#"""
|
||||
#assert y.shape == f.shape
|
||||
#e = y - f
|
||||
#hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
|
||||
#return hess
|
||||
:y: data
|
||||
:f: latent variables f
|
||||
:extra_data: extra_data which is not used in student t distribution
|
||||
:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
|
||||
"""
|
||||
assert y.shape == f.shape
|
||||
e = y - f
|
||||
hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
|
||||
return hess
|
||||
|
||||
#def d3lik_d3f(self, y, f, extra_data=None):
|
||||
#"""
|
||||
#Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
|
||||
def d3lik_d3f(self, y, f, extra_data=None):
|
||||
"""
|
||||
Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
|
||||
|
||||
#$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
|
||||
#"""
|
||||
#assert y.shape == f.shape
|
||||
#e = y - f
|
||||
#d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
|
||||
#((e**2 + self.sigma2*self.v)**3)
|
||||
#)
|
||||
#return d3lik_d3f
|
||||
$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
|
||||
"""
|
||||
assert y.shape == f.shape
|
||||
e = y - f
|
||||
d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
|
||||
((e**2 + self.sigma2*self.v)**3)
|
||||
)
|
||||
return d3lik_d3f
|
||||
|
||||
#def lik_dstd(self, y, f, extra_data=None):
|
||||
#"""
|
||||
#Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
|
||||
def lik_dstd(self, y, f, extra_data=None):
|
||||
"""
|
||||
Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
|
||||
|
||||
#Terms relavent to derivatives wrt sigma are:
|
||||
#-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
|
||||
Terms relavent to derivatives wrt sigma are:
|
||||
-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
|
||||
|
||||
#$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
|
||||
#"""
|
||||
#assert y.shape == f.shape
|
||||
#e = y - f
|
||||
#sigma = np.sqrt(self.sigma2)
|
||||
##dlik_dsigma = ( - (1/sigma) +
|
||||
##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
|
||||
##)
|
||||
##dlik_dsigma = ( - 1 +
|
||||
##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
|
||||
##)
|
||||
##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
|
||||
#dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
|
||||
#return dlik_dsigma
|
||||
$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
|
||||
"""
|
||||
assert y.shape == f.shape
|
||||
e = y - f
|
||||
sigma = np.sqrt(self.sigma2)
|
||||
#dlik_dsigma = ( - (1/sigma) +
|
||||
#((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
|
||||
#)
|
||||
#dlik_dsigma = ( - 1 +
|
||||
#((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
|
||||
#)
|
||||
#dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
|
||||
dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
|
||||
return dlik_dsigma
|
||||
|
||||
#def dlik_df_dstd(self, y, f, extra_data=None):
|
||||
#"""
|
||||
#Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
|
||||
def dlik_df_dstd(self, y, f, extra_data=None):
|
||||
"""
|
||||
Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
|
||||
|
||||
#$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
|
||||
#"""
|
||||
#assert y.shape == f.shape
|
||||
#e = y - f
|
||||
#sigma = np.sqrt(self.sigma2)
|
||||
#dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
|
||||
#/ ((self.v*self.sigma2 + e**2)**2)
|
||||
#)
|
||||
#return dlik_grad_dsigma
|
||||
$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
|
||||
"""
|
||||
assert y.shape == f.shape
|
||||
e = y - f
|
||||
sigma = np.sqrt(self.sigma2)
|
||||
dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
|
||||
/ ((self.v*self.sigma2 + e**2)**2)
|
||||
)
|
||||
return dlik_grad_dsigma
|
||||
|
||||
#def d2lik_d2f_dstd(self, y, f, extra_data=None):
|
||||
#"""
|
||||
#Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
|
||||
def d2lik_d2f_dstd(self, y, f, extra_data=None):
|
||||
"""
|
||||
Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
|
||||
|
||||
#$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
|
||||
#"""
|
||||
#assert y.shape == f.shape
|
||||
#e = y - f
|
||||
#sigma = np.sqrt(self.sigma2)
|
||||
#dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
|
||||
#((e**2 + self.sigma2*self.v)**3)
|
||||
#)
|
||||
##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
|
||||
##/ ((e**2 + (self.sigma**2)*self.v)**3) )
|
||||
#return dlik_hess_dsigma
|
||||
$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
|
||||
"""
|
||||
assert y.shape == f.shape
|
||||
e = y - f
|
||||
sigma = np.sqrt(self.sigma2)
|
||||
dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
|
||||
((e**2 + self.sigma2*self.v)**3)
|
||||
)
|
||||
#dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
|
||||
#/ ((e**2 + (self.sigma**2)*self.v)**3) )
|
||||
return dlik_hess_dsigma
|
||||
|
||||
#def _gradients(self, y, f, extra_data=None):
|
||||
##must be listed in same order as 'get_param_names'
|
||||
#derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
|
||||
#[self.dlik_df_dstd(y, f, extra_data=extra_data)],
|
||||
#[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
|
||||
#) # lists as we might learn many parameters
|
||||
## ensure we have gradients for every parameter we want to optimize
|
||||
#assert len(derivs[0]) == len(self._get_param_names())
|
||||
#assert len(derivs[1]) == len(self._get_param_names())
|
||||
#assert len(derivs[2]) == len(self._get_param_names())
|
||||
#return derivs
|
||||
def _gradients(self, y, f, extra_data=None):
|
||||
#must be listed in same order as 'get_param_names'
|
||||
derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
|
||||
[self.dlik_df_dstd(y, f, extra_data=extra_data)],
|
||||
[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
|
||||
) # lists as we might learn many parameters
|
||||
# ensure we have gradients for every parameter we want to optimize
|
||||
assert len(derivs[0]) == len(self._get_param_names())
|
||||
assert len(derivs[1]) == len(self._get_param_names())
|
||||
assert len(derivs[2]) == len(self._get_param_names())
|
||||
return derivs
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue