From de689fa8e91928b7fc2d02f56d4eca14d82eaafd Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 19 Jun 2013 12:00:00 +0100 Subject: [PATCH] Now gradchecks everytime but student_t fit is bad, noise is underestimated by a long way --- GPy/examples/laplace_approximations.py | 18 +++++++++-------- GPy/likelihoods/Laplace.py | 27 ++++++++++++++++--------- GPy/likelihoods/likelihood_functions.py | 16 +-------------- GPy/models/GP.py | 12 ----------- 4 files changed, 29 insertions(+), 44 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 5120dfb5..84527d08 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -39,28 +39,28 @@ def debug_student_t_noise_approx(): plot = False real_var = 0.1 #Start a function, any function - X = np.linspace(0.0, 10.0, 15)[:, None] + X = np.linspace(0.0, 10.0, 50)[:, None] #X = np.array([0.5])[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var - X_full = np.linspace(0.0, 10.0, 15)[:, None] + X_full = np.linspace(0.0, 10.0, 50)[:, None] Y_full = np.sin(X_full) Y = Y/Y.max() #Add student t random noise to datapoints - deg_free = 10000 + deg_free = 1000 real_sd = np.sqrt(real_var) - print "Real noise: ", real_sd + print "Real noise std: ", real_sd - initial_var_guess = 0.02 + initial_var_guess = 0.3 #t_rv = t(deg_free, loc=0, scale=real_var) #noise = t_rvrvs(size=Y.shape) #Y += noise plt.close('all') # Kernel object - kernel1 = GPy.kern.rbf(X.shape[1]) + kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) kernel2 = kernel1.copy() kernel3 = kernel1.copy() kernel4 = kernel1.copy() @@ -83,22 +83,24 @@ def debug_student_t_noise_approx(): #plt.plot(X_full, Y_full) #print m - #edited_real_sd = initial_var_guess #real_sd + edited_real_sd = initial_var_guess #real_sd edited_real_sd = real_sd print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, stu_t_likelihood, kernel6) + m['white'] = 1e-3 #m.constrain_positive('rbf') #m.constrain_fixed('rbf_v', 1.0898) #m.constrain_fixed('rbf_l', 1.8651) #m.constrain_fixed('t_noise_variance', real_sd) m.constrain_positive('rbf') + m.constrain_positive('t_noise') #m.constrain_fixed('t_noi', real_sd) m.ensure_default_constraints() m.update_likelihood_approximation() - m.optimize(messages=True) + #m.optimize(messages=True) print(m) #return m #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index a8347345..5b1a814a 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -84,12 +84,13 @@ class Laplace(likelihood): #Implicit impl = mdot(dlp, dL_dfhat.T, I_KW_i) - expl_a = - mdot(self.Ki_f, self.Ki_f.T) + expl_a = mdot(self.Ki_f, self.Ki_f.T) expl_b = Wi_K_i - expl = 0.5*expl_a - 0.5*expl_b + expl = 0.5*expl_a + 0.5*expl_b dL_dthetaK_exp = dK_dthetaK(expl, X) dL_dthetaK_imp = dK_dthetaK(impl, X) - dL_dthetaK = -(dL_dthetaK_imp + dL_dthetaK_exp) + #print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) + dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp #dL_dthetaK = np.zeros(dK_dthetaK.shape) #for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK): @@ -117,10 +118,12 @@ class Laplace(likelihood): #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i]))) #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None]) # might be + - dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) + dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) #Implicit df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) - dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL) + dL_dthetaL_imp = np.dot(dL_dfhat.T, df_hat_dthetaL) + #print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp) + dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) @@ -180,14 +183,20 @@ class Laplace(likelihood): W = np.diagflat(self.W) Wi = self.Sigma_tilde W12i = np.sqrt(Wi) - D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W)) - fDf = mdot(self.f_hat.T, D, self.f_hat) + #D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W)) + #fDf = mdot(self.f_hat.T, D, self.f_hat) l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) + #print "fDf:{} l:{} detKWiBi:{} W:{} Wi:{} Bi:{} Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum()) + + y_Wi_Ki_i_y = mdot(Y_tilde.T, pdinv(self.K + Wi)[0], Y_tilde) Z_tilde = (+ self.NORMAL_CONST + l + 0.5*ln_det_K_Wi__Bi - - 0.5*fDf + #- 0.5*fDf + - 0.5*self.f_Ki_f + + 0.5*y_Wi_Ki_i_y ) + #print "Ztilde: {}".format(Z_tilde) #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) @@ -316,7 +325,7 @@ class Laplace(likelihood): #f_old = f.copy() W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + W[W < 0] = 1e-5 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur # If the likelihood is non-log-concave. We wan't to say that there is a negative variance # To cause the posterior to become less certain than the prior and likelihood, # This is a property only held by non-log-concave likelihoods diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index c3aee835..041b59bd 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -170,7 +170,7 @@ class student_t(likelihood_function): return np.asarray(self.sigma) def _get_param_names(self): - return ["t_noise_variance"] + return ["t_noise_std"] def _set_params(self, x): self.sigma = float(x) @@ -191,8 +191,6 @@ class student_t(likelihood_function): :returns: float(likelihood evaluated for this point) """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f @@ -215,8 +213,6 @@ class student_t(likelihood_function): :returns: gradient of likelihood evaluated at points """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) @@ -237,8 +233,6 @@ class student_t(likelihood_function): :extra_data: extra_data which is not used in student t distribution :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f @@ -251,8 +245,6 @@ class student_t(likelihood_function): $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$ """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) / @@ -269,8 +261,6 @@ class student_t(likelihood_function): $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$ """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f dlik_dsigma = ( - (1/self.sigma) + @@ -284,8 +274,6 @@ class student_t(likelihood_function): $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$ """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) @@ -299,8 +287,6 @@ class student_t(likelihood_function): $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f dlik_hess_dsigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) / diff --git a/GPy/models/GP.py b/GPy/models/GP.py index d56ee86f..636ebba0 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -145,18 +145,6 @@ class GP(model): self.likelihood._set_params(self.likelihood._get_params()) dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) if isinstance(self.likelihood, Laplace): - #Reapproximate incase it hasnt been done... - self.likelihood.fit_full(self.kern.K(self.X)) - self.likelihood._set_params(self.likelihood._get_params()) - print self.kern._get_params() - - #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained - #fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right... - #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right... - - #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params) - #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) - dK_dthetaK = self.kern.dK_dtheta dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))