Now gradchecks everytime but student_t fit is bad, noise is underestimated by a long way

This commit is contained in:
Alan Saul 2013-06-19 12:00:00 +01:00
parent ac461e1b2a
commit de689fa8e9
4 changed files with 29 additions and 44 deletions

View file

@ -39,28 +39,28 @@ def debug_student_t_noise_approx():
plot = False
real_var = 0.1
#Start a function, any function
X = np.linspace(0.0, 10.0, 15)[:, None]
X = np.linspace(0.0, 10.0, 50)[:, None]
#X = np.array([0.5])[:, None]
Y = np.sin(X) + np.random.randn(*X.shape)*real_var
X_full = np.linspace(0.0, 10.0, 15)[:, None]
X_full = np.linspace(0.0, 10.0, 50)[:, None]
Y_full = np.sin(X_full)
Y = Y/Y.max()
#Add student t random noise to datapoints
deg_free = 10000
deg_free = 1000
real_sd = np.sqrt(real_var)
print "Real noise: ", real_sd
print "Real noise std: ", real_sd
initial_var_guess = 0.02
initial_var_guess = 0.3
#t_rv = t(deg_free, loc=0, scale=real_var)
#noise = t_rvrvs(size=Y.shape)
#Y += noise
plt.close('all')
# Kernel object
kernel1 = GPy.kern.rbf(X.shape[1])
kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
kernel2 = kernel1.copy()
kernel3 = kernel1.copy()
kernel4 = kernel1.copy()
@ -83,22 +83,24 @@ def debug_student_t_noise_approx():
#plt.plot(X_full, Y_full)
#print m
#edited_real_sd = initial_var_guess #real_sd
edited_real_sd = initial_var_guess #real_sd
edited_real_sd = real_sd
print "Clean student t, rasm"
t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
m['white'] = 1e-3
#m.constrain_positive('rbf')
#m.constrain_fixed('rbf_v', 1.0898)
#m.constrain_fixed('rbf_l', 1.8651)
#m.constrain_fixed('t_noise_variance', real_sd)
m.constrain_positive('rbf')
m.constrain_positive('t_noise')
#m.constrain_fixed('t_noi', real_sd)
m.ensure_default_constraints()
m.update_likelihood_approximation()
m.optimize(messages=True)
#m.optimize(messages=True)
print(m)
#return m
#m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)

View file

@ -84,12 +84,13 @@ class Laplace(likelihood):
#Implicit
impl = mdot(dlp, dL_dfhat.T, I_KW_i)
expl_a = - mdot(self.Ki_f, self.Ki_f.T)
expl_a = mdot(self.Ki_f, self.Ki_f.T)
expl_b = Wi_K_i
expl = 0.5*expl_a - 0.5*expl_b
expl = 0.5*expl_a + 0.5*expl_b
dL_dthetaK_exp = dK_dthetaK(expl, X)
dL_dthetaK_imp = dK_dthetaK(impl, X)
dL_dthetaK = -(dL_dthetaK_imp + dL_dthetaK_exp)
#print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp
#dL_dthetaK = np.zeros(dK_dthetaK.shape)
#for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
@ -117,10 +118,12 @@ class Laplace(likelihood):
#dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
#dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None])
# might be +
dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
#Implicit
df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL)
dL_dthetaL_imp = np.dot(dL_dfhat.T, df_hat_dthetaL)
#print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp
return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
@ -180,14 +183,20 @@ class Laplace(likelihood):
W = np.diagflat(self.W)
Wi = self.Sigma_tilde
W12i = np.sqrt(Wi)
D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
fDf = mdot(self.f_hat.T, D, self.f_hat)
#D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
#fDf = mdot(self.f_hat.T, D, self.f_hat)
l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
#print "fDf:{} l:{} detKWiBi:{} W:{} Wi:{} Bi:{} Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum())
y_Wi_Ki_i_y = mdot(Y_tilde.T, pdinv(self.K + Wi)[0], Y_tilde)
Z_tilde = (+ self.NORMAL_CONST
+ l
+ 0.5*ln_det_K_Wi__Bi
- 0.5*fDf
#- 0.5*fDf
- 0.5*self.f_Ki_f
+ 0.5*y_Wi_Ki_i_y
)
#print "Ztilde: {}".format(Z_tilde)
#Convert to float as its (1, 1) and Z must be a scalar
self.Z = np.float64(Z_tilde)
@ -316,7 +325,7 @@ class Laplace(likelihood):
#f_old = f.copy()
W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
if not self.likelihood_function.log_concave:
W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
W[W < 0] = 1e-5 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
# If the likelihood is non-log-concave. We wan't to say that there is a negative variance
# To cause the posterior to become less certain than the prior and likelihood,
# This is a property only held by non-log-concave likelihoods

View file

@ -170,7 +170,7 @@ class student_t(likelihood_function):
return np.asarray(self.sigma)
def _get_param_names(self):
return ["t_noise_variance"]
return ["t_noise_std"]
def _set_params(self, x):
self.sigma = float(x)
@ -191,8 +191,6 @@ class student_t(likelihood_function):
:returns: float(likelihood evaluated for this point)
"""
#y = np.squeeze(y)
#f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
@ -215,8 +213,6 @@ class student_t(likelihood_function):
:returns: gradient of likelihood evaluated at points
"""
#y = np.squeeze(y)
#f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
@ -237,8 +233,6 @@ class student_t(likelihood_function):
:extra_data: extra_data which is not used in student t distribution
:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
"""
#y = np.squeeze(y)
#f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
@ -251,8 +245,6 @@ class student_t(likelihood_function):
$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
"""
#y = np.squeeze(y)
#f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
@ -269,8 +261,6 @@ class student_t(likelihood_function):
$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
"""
#y = np.squeeze(y)
#f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
dlik_dsigma = ( - (1/self.sigma) +
@ -284,8 +274,6 @@ class student_t(likelihood_function):
$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
"""
#y = np.squeeze(y)
#f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e)
@ -299,8 +287,6 @@ class student_t(likelihood_function):
$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
"""
#y = np.squeeze(y)
#f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
dlik_hess_dsigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /

View file

@ -145,18 +145,6 @@ class GP(model):
self.likelihood._set_params(self.likelihood._get_params())
dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
if isinstance(self.likelihood, Laplace):
#Reapproximate incase it hasnt been done...
self.likelihood.fit_full(self.kern.K(self.X))
self.likelihood._set_params(self.likelihood._get_params())
print self.kern._get_params()
#Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
#fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
#fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
#BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
#dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
dK_dthetaK = self.kern.dK_dtheta
dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))