mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-18 13:55:14 +02:00
Now gradchecks everytime but student_t fit is bad, noise is underestimated by a long way
This commit is contained in:
parent
ac461e1b2a
commit
de689fa8e9
4 changed files with 29 additions and 44 deletions
|
|
@ -39,28 +39,28 @@ def debug_student_t_noise_approx():
|
||||||
plot = False
|
plot = False
|
||||||
real_var = 0.1
|
real_var = 0.1
|
||||||
#Start a function, any function
|
#Start a function, any function
|
||||||
X = np.linspace(0.0, 10.0, 15)[:, None]
|
X = np.linspace(0.0, 10.0, 50)[:, None]
|
||||||
#X = np.array([0.5])[:, None]
|
#X = np.array([0.5])[:, None]
|
||||||
Y = np.sin(X) + np.random.randn(*X.shape)*real_var
|
Y = np.sin(X) + np.random.randn(*X.shape)*real_var
|
||||||
|
|
||||||
X_full = np.linspace(0.0, 10.0, 15)[:, None]
|
X_full = np.linspace(0.0, 10.0, 50)[:, None]
|
||||||
Y_full = np.sin(X_full)
|
Y_full = np.sin(X_full)
|
||||||
|
|
||||||
Y = Y/Y.max()
|
Y = Y/Y.max()
|
||||||
|
|
||||||
#Add student t random noise to datapoints
|
#Add student t random noise to datapoints
|
||||||
deg_free = 10000
|
deg_free = 1000
|
||||||
real_sd = np.sqrt(real_var)
|
real_sd = np.sqrt(real_var)
|
||||||
print "Real noise: ", real_sd
|
print "Real noise std: ", real_sd
|
||||||
|
|
||||||
initial_var_guess = 0.02
|
initial_var_guess = 0.3
|
||||||
#t_rv = t(deg_free, loc=0, scale=real_var)
|
#t_rv = t(deg_free, loc=0, scale=real_var)
|
||||||
#noise = t_rvrvs(size=Y.shape)
|
#noise = t_rvrvs(size=Y.shape)
|
||||||
#Y += noise
|
#Y += noise
|
||||||
|
|
||||||
plt.close('all')
|
plt.close('all')
|
||||||
# Kernel object
|
# Kernel object
|
||||||
kernel1 = GPy.kern.rbf(X.shape[1])
|
kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
|
||||||
kernel2 = kernel1.copy()
|
kernel2 = kernel1.copy()
|
||||||
kernel3 = kernel1.copy()
|
kernel3 = kernel1.copy()
|
||||||
kernel4 = kernel1.copy()
|
kernel4 = kernel1.copy()
|
||||||
|
|
@ -83,22 +83,24 @@ def debug_student_t_noise_approx():
|
||||||
#plt.plot(X_full, Y_full)
|
#plt.plot(X_full, Y_full)
|
||||||
#print m
|
#print m
|
||||||
|
|
||||||
#edited_real_sd = initial_var_guess #real_sd
|
edited_real_sd = initial_var_guess #real_sd
|
||||||
edited_real_sd = real_sd
|
edited_real_sd = real_sd
|
||||||
|
|
||||||
print "Clean student t, rasm"
|
print "Clean student t, rasm"
|
||||||
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
|
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
|
||||||
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
|
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
|
||||||
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
|
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
|
||||||
|
m['white'] = 1e-3
|
||||||
#m.constrain_positive('rbf')
|
#m.constrain_positive('rbf')
|
||||||
#m.constrain_fixed('rbf_v', 1.0898)
|
#m.constrain_fixed('rbf_v', 1.0898)
|
||||||
#m.constrain_fixed('rbf_l', 1.8651)
|
#m.constrain_fixed('rbf_l', 1.8651)
|
||||||
#m.constrain_fixed('t_noise_variance', real_sd)
|
#m.constrain_fixed('t_noise_variance', real_sd)
|
||||||
m.constrain_positive('rbf')
|
m.constrain_positive('rbf')
|
||||||
|
m.constrain_positive('t_noise')
|
||||||
#m.constrain_fixed('t_noi', real_sd)
|
#m.constrain_fixed('t_noi', real_sd)
|
||||||
m.ensure_default_constraints()
|
m.ensure_default_constraints()
|
||||||
m.update_likelihood_approximation()
|
m.update_likelihood_approximation()
|
||||||
m.optimize(messages=True)
|
#m.optimize(messages=True)
|
||||||
print(m)
|
print(m)
|
||||||
#return m
|
#return m
|
||||||
#m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
|
#m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
|
||||||
|
|
|
||||||
|
|
@ -84,12 +84,13 @@ class Laplace(likelihood):
|
||||||
|
|
||||||
#Implicit
|
#Implicit
|
||||||
impl = mdot(dlp, dL_dfhat.T, I_KW_i)
|
impl = mdot(dlp, dL_dfhat.T, I_KW_i)
|
||||||
expl_a = - mdot(self.Ki_f, self.Ki_f.T)
|
expl_a = mdot(self.Ki_f, self.Ki_f.T)
|
||||||
expl_b = Wi_K_i
|
expl_b = Wi_K_i
|
||||||
expl = 0.5*expl_a - 0.5*expl_b
|
expl = 0.5*expl_a + 0.5*expl_b
|
||||||
dL_dthetaK_exp = dK_dthetaK(expl, X)
|
dL_dthetaK_exp = dK_dthetaK(expl, X)
|
||||||
dL_dthetaK_imp = dK_dthetaK(impl, X)
|
dL_dthetaK_imp = dK_dthetaK(impl, X)
|
||||||
dL_dthetaK = -(dL_dthetaK_imp + dL_dthetaK_exp)
|
#print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
|
||||||
|
dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp
|
||||||
|
|
||||||
#dL_dthetaK = np.zeros(dK_dthetaK.shape)
|
#dL_dthetaK = np.zeros(dK_dthetaK.shape)
|
||||||
#for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
|
#for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
|
||||||
|
|
@ -117,10 +118,12 @@ class Laplace(likelihood):
|
||||||
#dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
|
#dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
|
||||||
#dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None])
|
#dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None])
|
||||||
# might be +
|
# might be +
|
||||||
dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
|
dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
|
||||||
#Implicit
|
#Implicit
|
||||||
df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
|
df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
|
||||||
dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL)
|
dL_dthetaL_imp = np.dot(dL_dfhat.T, df_hat_dthetaL)
|
||||||
|
#print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
|
||||||
|
dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp
|
||||||
|
|
||||||
return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
|
return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
|
||||||
|
|
||||||
|
|
@ -180,14 +183,20 @@ class Laplace(likelihood):
|
||||||
W = np.diagflat(self.W)
|
W = np.diagflat(self.W)
|
||||||
Wi = self.Sigma_tilde
|
Wi = self.Sigma_tilde
|
||||||
W12i = np.sqrt(Wi)
|
W12i = np.sqrt(Wi)
|
||||||
D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
|
#D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
|
||||||
fDf = mdot(self.f_hat.T, D, self.f_hat)
|
#fDf = mdot(self.f_hat.T, D, self.f_hat)
|
||||||
l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
|
l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
|
||||||
|
#print "fDf:{} l:{} detKWiBi:{} W:{} Wi:{} Bi:{} Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum())
|
||||||
|
|
||||||
|
y_Wi_Ki_i_y = mdot(Y_tilde.T, pdinv(self.K + Wi)[0], Y_tilde)
|
||||||
Z_tilde = (+ self.NORMAL_CONST
|
Z_tilde = (+ self.NORMAL_CONST
|
||||||
+ l
|
+ l
|
||||||
+ 0.5*ln_det_K_Wi__Bi
|
+ 0.5*ln_det_K_Wi__Bi
|
||||||
- 0.5*fDf
|
#- 0.5*fDf
|
||||||
|
- 0.5*self.f_Ki_f
|
||||||
|
+ 0.5*y_Wi_Ki_i_y
|
||||||
)
|
)
|
||||||
|
#print "Ztilde: {}".format(Z_tilde)
|
||||||
|
|
||||||
#Convert to float as its (1, 1) and Z must be a scalar
|
#Convert to float as its (1, 1) and Z must be a scalar
|
||||||
self.Z = np.float64(Z_tilde)
|
self.Z = np.float64(Z_tilde)
|
||||||
|
|
@ -316,7 +325,7 @@ class Laplace(likelihood):
|
||||||
#f_old = f.copy()
|
#f_old = f.copy()
|
||||||
W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
|
W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
|
||||||
if not self.likelihood_function.log_concave:
|
if not self.likelihood_function.log_concave:
|
||||||
W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
|
W[W < 0] = 1e-5 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
|
||||||
# If the likelihood is non-log-concave. We wan't to say that there is a negative variance
|
# If the likelihood is non-log-concave. We wan't to say that there is a negative variance
|
||||||
# To cause the posterior to become less certain than the prior and likelihood,
|
# To cause the posterior to become less certain than the prior and likelihood,
|
||||||
# This is a property only held by non-log-concave likelihoods
|
# This is a property only held by non-log-concave likelihoods
|
||||||
|
|
|
||||||
|
|
@ -170,7 +170,7 @@ class student_t(likelihood_function):
|
||||||
return np.asarray(self.sigma)
|
return np.asarray(self.sigma)
|
||||||
|
|
||||||
def _get_param_names(self):
|
def _get_param_names(self):
|
||||||
return ["t_noise_variance"]
|
return ["t_noise_std"]
|
||||||
|
|
||||||
def _set_params(self, x):
|
def _set_params(self, x):
|
||||||
self.sigma = float(x)
|
self.sigma = float(x)
|
||||||
|
|
@ -191,8 +191,6 @@ class student_t(likelihood_function):
|
||||||
:returns: float(likelihood evaluated for this point)
|
:returns: float(likelihood evaluated for this point)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
#y = np.squeeze(y)
|
|
||||||
#f = np.squeeze(f)
|
|
||||||
assert y.shape == f.shape
|
assert y.shape == f.shape
|
||||||
|
|
||||||
e = y - f
|
e = y - f
|
||||||
|
|
@ -215,8 +213,6 @@ class student_t(likelihood_function):
|
||||||
:returns: gradient of likelihood evaluated at points
|
:returns: gradient of likelihood evaluated at points
|
||||||
|
|
||||||
"""
|
"""
|
||||||
#y = np.squeeze(y)
|
|
||||||
#f = np.squeeze(f)
|
|
||||||
assert y.shape == f.shape
|
assert y.shape == f.shape
|
||||||
e = y - f
|
e = y - f
|
||||||
grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
|
grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
|
||||||
|
|
@ -237,8 +233,6 @@ class student_t(likelihood_function):
|
||||||
:extra_data: extra_data which is not used in student t distribution
|
:extra_data: extra_data which is not used in student t distribution
|
||||||
:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
|
:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
|
||||||
"""
|
"""
|
||||||
#y = np.squeeze(y)
|
|
||||||
#f = np.squeeze(f)
|
|
||||||
assert y.shape == f.shape
|
assert y.shape == f.shape
|
||||||
|
|
||||||
e = y - f
|
e = y - f
|
||||||
|
|
@ -251,8 +245,6 @@ class student_t(likelihood_function):
|
||||||
|
|
||||||
$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
|
$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
|
||||||
"""
|
"""
|
||||||
#y = np.squeeze(y)
|
|
||||||
#f = np.squeeze(f)
|
|
||||||
assert y.shape == f.shape
|
assert y.shape == f.shape
|
||||||
e = y - f
|
e = y - f
|
||||||
d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
|
d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
|
||||||
|
|
@ -269,8 +261,6 @@ class student_t(likelihood_function):
|
||||||
|
|
||||||
$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
|
$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
|
||||||
"""
|
"""
|
||||||
#y = np.squeeze(y)
|
|
||||||
#f = np.squeeze(f)
|
|
||||||
assert y.shape == f.shape
|
assert y.shape == f.shape
|
||||||
e = y - f
|
e = y - f
|
||||||
dlik_dsigma = ( - (1/self.sigma) +
|
dlik_dsigma = ( - (1/self.sigma) +
|
||||||
|
|
@ -284,8 +274,6 @@ class student_t(likelihood_function):
|
||||||
|
|
||||||
$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
|
$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
|
||||||
"""
|
"""
|
||||||
#y = np.squeeze(y)
|
|
||||||
#f = np.squeeze(f)
|
|
||||||
assert y.shape == f.shape
|
assert y.shape == f.shape
|
||||||
e = y - f
|
e = y - f
|
||||||
dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e)
|
dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e)
|
||||||
|
|
@ -299,8 +287,6 @@ class student_t(likelihood_function):
|
||||||
|
|
||||||
$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
|
$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
|
||||||
"""
|
"""
|
||||||
#y = np.squeeze(y)
|
|
||||||
#f = np.squeeze(f)
|
|
||||||
assert y.shape == f.shape
|
assert y.shape == f.shape
|
||||||
e = y - f
|
e = y - f
|
||||||
dlik_hess_dsigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
|
dlik_hess_dsigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
|
||||||
|
|
|
||||||
|
|
@ -145,18 +145,6 @@ class GP(model):
|
||||||
self.likelihood._set_params(self.likelihood._get_params())
|
self.likelihood._set_params(self.likelihood._get_params())
|
||||||
dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
|
dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
|
||||||
if isinstance(self.likelihood, Laplace):
|
if isinstance(self.likelihood, Laplace):
|
||||||
#Reapproximate incase it hasnt been done...
|
|
||||||
self.likelihood.fit_full(self.kern.K(self.X))
|
|
||||||
self.likelihood._set_params(self.likelihood._get_params())
|
|
||||||
print self.kern._get_params()
|
|
||||||
|
|
||||||
#Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
|
|
||||||
#fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
|
|
||||||
#fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
|
|
||||||
|
|
||||||
#BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
|
|
||||||
#dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
|
|
||||||
|
|
||||||
dK_dthetaK = self.kern.dK_dtheta
|
dK_dthetaK = self.kern.dK_dtheta
|
||||||
dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
|
dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
|
||||||
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
|
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue