From 4e5cefb4b5cb14a3c4f94dbd4d18eac8c70a84fd Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 8 Jul 2013 15:48:53 +0100 Subject: [PATCH] Reparameratised in terms of sigma2 --- GPy/core/model.py | 3 - GPy/examples/laplace_approximations.py | 34 ++-- GPy/likelihoods/Laplace.py | 12 +- GPy/likelihoods/likelihood_functions.py | 207 +++++++++++++++++++++--- 4 files changed, 207 insertions(+), 49 deletions(-) diff --git a/GPy/core/model.py b/GPy/core/model.py index f97938a4..94202396 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -244,9 +244,6 @@ class model(parameterised): LL_gradients = self._transform_gradients(self._log_likelihood_gradients()) prior_gradients = self._transform_gradients(self._log_prior_gradients()) obj_grads = -LL_gradients - prior_gradients - print self - #self.checkgrad(verbose=1) - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return obj_f, obj_grads def optimize(self, optimizer=None, start=None, **kwargs): diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 14400a08..d6b48ebf 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -24,7 +24,7 @@ def timing(): edited_real_sd = real_sd kernel1 = GPy.kern.rbf(X.shape[1]) - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1) m.ensure_default_constraints() @@ -53,7 +53,7 @@ def v_fail_test(): edited_real_sd = real_sd print "Clean student t, rasm" - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernel1) m.constrain_positive('') @@ -92,18 +92,18 @@ def debug_student_t_noise_approx(): X = np.random.rand(100)[:, None] #X = np.random.rand(100)[:, None] #X = np.array([0.5, 1])[:, None] - Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + 1 #Y = X + np.random.randn(*X.shape)*real_var #ty = np.array([1., 9.97733584, 4.17841363])[:, None] #Y = ty X_full = X - Y_full = np.sin(X_full) + Y_full = np.sin(X_full) + 1 Y = Y/Y.max() #Add student t random noise to datapoints - deg_free = 1000 + deg_free = 100 real_sd = np.sqrt(real_var) print "Real noise std: ", real_sd @@ -115,7 +115,7 @@ def debug_student_t_noise_approx(): plt.close('all') # Kernel object - kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1]) #kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1]) kernel2 = kernel1.copy() kernel3 = kernel1.copy() @@ -140,24 +140,24 @@ def debug_student_t_noise_approx(): #print m real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free))) - edited_real_sd = real_stu_t_std + 1#initial_var_guess #real_sd + edited_real_sd = real_stu_t_std**2 #initial_var_guess #real_sd #edited_real_sd = real_sd print "Clean student t, rasm" - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernel6) #m['rbf_len'] = 1.5 #m.constrain_fixed('rbf_v', 1.0898) - #m.constrain_fixed('rbf_l', 1.8651) + #m.constrain_fixed('rbf_l', 0.2651) #m.constrain_fixed('t_noise_std', edited_real_sd) #m.constrain_positive('rbf') - #m.constrain_positive('t_noise_std') + m.constrain_positive('t_noise_std') #m.constrain_positive('') #m.constrain_bounded('t_noi', 0.001, 10) #m.constrain_fixed('t_noi', real_stu_t_std) - m.constrain_fixed('white', 0.01) + #m.constrain_fixed('white', 0.01) #m.constrain_fixed('t_no', 0.01) #m['rbf_var'] = 0.20446332 #m['rbf_leng'] = 0.85776241 @@ -179,7 +179,7 @@ def debug_student_t_noise_approx(): return m #print "Clean student t, ncg" - #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg') #m = GPy.models.GP(X, stu_t_likelihood, kernel3) #m.ensure_default_constraints() @@ -276,7 +276,7 @@ def student_t_approx(): edited_real_sd = real_sd #initial_var_guess print "Clean student t, rasm" - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernel6) m.ensure_default_constraints() @@ -291,7 +291,7 @@ def student_t_approx(): plt.title('Student-t rasm clean') print "Corrupt student t, rasm" - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) m.ensure_default_constraints() @@ -308,7 +308,7 @@ def student_t_approx(): return m #print "Clean student t, ncg" - #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg') #m = GPy.models.GP(X, stu_t_likelihood, kernel3) #m.ensure_default_constraints() @@ -322,7 +322,7 @@ def student_t_approx(): #plt.title('Student-t ncg clean') #print "Corrupt student t, ncg" - #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg') #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) #m.ensure_default_constraints() @@ -337,7 +337,7 @@ def student_t_approx(): ###with a student t distribution, since it has heavy tails it should work well - ###likelihood_function = student_t(deg_free, sigma=real_var) + ###likelihood_function = student_t(deg_free, sigma2=real_var) ###lap = Laplace(Y, likelihood_function) ###cov = kernel.K(X) ###lap.fit_full(cov) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 2ae68613..984112a5 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -220,10 +220,10 @@ class Laplace(likelihood): self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T) #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W)) - self.ln_z_hat = (- 0.5*self.f_Ki_f - - self.ln_I_KW_det - + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) - ) + #self.ln_z_hat = (- 0.5*self.f_Ki_f + #- self.ln_I_KW_det + #+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) + #) return self._compute_GP_variables() @@ -308,6 +308,8 @@ class Laplace(likelihood): step_size = 1 rs = 0 i = 0 + #if self.likelihood_function.sigma < 0.001: + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) if not self.likelihood_function.log_concave: @@ -316,8 +318,6 @@ class Laplace(likelihood): # To cause the posterior to become less certain than the prior and likelihood, # This is a property only held by non-log-concave likelihoods B, L, W_12 = self._compute_B_statistics(K, W) - #if i > 30: - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT W_f = W*f grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index fd64dbe6..bfc759d7 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -158,26 +158,26 @@ class student_t(likelihood_function): dln p(yi|fi)_dfi d2ln p(yi|fi)_d2fifj """ - def __init__(self, deg_free, sigma=2): + def __init__(self, deg_free, sigma2=2): #super(student_t, self).__init__() self.v = deg_free - self.sigma = sigma + self.sigma2 = sigma2 self.log_concave = False - self._set_params(np.asarray(sigma)) + self._set_params(np.asarray(sigma2)) def _get_params(self): - return np.asarray(self.sigma) + return np.asarray(self.sigma2) def _get_param_names(self): - return ["t_noise_std"] + return ["t_noise_std2"] def _set_params(self, x): - self.sigma = float(x) + self.sigma2 = float(x) @property def variance(self, extra_data=None): - return (self.v / float(self.v - 2)) * (self.sigma**2) + return (self.v / float(self.v - 2)) * self.sigma2 def link_function(self, y, f, extra_data=None): """link_function $\ln p(y|f)$ @@ -193,12 +193,16 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f + A = gammaln((self.v + 1) * 0.5) + B = -gammaln(self.v * 0.5) + C = - 0.5*np.log(self.sigma2 * self.v * np.pi) + D = (-(self.v + 1)*0.5)*np.log(1 + (e**2)/(self.v*self.sigma2)) objective = (+ gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) - - 0.5*np.log((self.sigma**2) * self.v * np.pi) - #- (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v)) - - (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2))) + - 0.5*np.log(self.sigma2 * self.v * np.pi) + + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v)) ) + #print "A: {} B: {} C: {} D: {} obj: {}".format(A,B,C,D.sum(),objective.sum()) return np.sum(objective) def dlik_df(self, y, f, extra_data=None): @@ -215,7 +219,7 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) + grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2)) return grad def d2lik_d2f(self, y, f, extra_data=None): @@ -235,7 +239,7 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) + hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2) return hess def d3lik_d3f(self, y, f, extra_data=None): @@ -246,8 +250,8 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) / - ((e**2 + (self.sigma**2)*self.v)**3) + d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) / + ((e**2 + self.sigma2*self.v)**3) ) return d3lik_d3f @@ -262,10 +266,16 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - dlik_dsigma = ( - (1/self.sigma) + - ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) ) - ) + #sigma = np.sqrt(self.sigma2) + #dlik_dsigma = ( - (1/sigma) + + #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) + #) + #dlik_dsigma = ( - 1 + + #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) + #) #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1 + #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v)) + dlik_dsigma = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2)) return dlik_dsigma def dlik_df_dstd(self, y, f, extra_data=None): @@ -276,9 +286,11 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) #2 might not want to be here? - / ((self.v*(self.sigma**2) + e**2)**2) - ) + #sigma = np.sqrt(self.sigma2) + #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here? + #/ ((self.v*self.sigma2 + e**2)**2) + #) + dlik_grad_dsigma = (-self.v*(self.v+1)*e)/((self.sigma2*self.v + e**2)**2) return dlik_grad_dsigma def d2lik_d2f_dstd(self, y, f, extra_data=None): @@ -289,11 +301,15 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - dlik_hess_dsigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) / - ((e**2 + (self.sigma**2)*self.v)**3) - ) + #sigma = np.sqrt(self.sigma2) + #dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) / + #((e**2 + self.sigma2*self.v)**3) + #) #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2)) #/ ((e**2 + (self.sigma**2)*self.v)**3) ) + dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2))) + / (self.sigma2*self.v + (e**2))**3 + ) return dlik_hess_dsigma def _gradients(self, y, f, extra_data=None): @@ -466,3 +482,148 @@ class weibull_survival(likelihood_function): hess = (y**self.shape)*np.exp(f) return np.squeeze(hess) + +#class gaussian(likelihood_function): + #""" + #Gaussian likelihood - this is a test class for approximation schemes + #""" + #def __init__(self, variance): + #self._set_params(np.asarray(variance)) + + #def _get_params(self): + #return np.asarray(self.sigma2) + + #def _get_param_names(self): + #return ["noise_variance"] + + #def _set_params(self, x): + #self.variance = float(x) + + #def link_function(self, y, f, extra_data=None): + #"""link_function $\ln p(y|f)$ + #$$\ln p(y_{i}|f_{i}) = \ln $$ + + #:y: data + #:f: latent variables f + #:extra_data: extra_data which is not used in student t distribution + #:returns: float(likelihood evaluated for this point) + + #""" + #assert y.shape == f.shape + #e = y - f + #objective = -0.5*self.D* + #return np.sum(objective) + + #def dlik_df(self, y, f, extra_data=None): + #""" + #Gradient of the link function at y, given f w.r.t f + + #$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$ + + #:y: data + #:f: latent variables f + #:extra_data: extra_data which is not used in student t distribution + #:returns: gradient of likelihood evaluated at points + + #""" + #assert y.shape == f.shape + #e = y - f + #grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2)) + #return grad + + #def d2lik_d2f(self, y, f, extra_data=None): + #""" + #Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j + #i.e. second derivative link_function at y given f f_j w.r.t f and f_j + + #Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases + #(the distribution for y_{i} depends only on f_{i} not on f_{j!=i} + + #$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$ + + #:y: data + #:f: latent variables f + #:extra_data: extra_data which is not used in student t distribution + #:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) + #""" + #assert y.shape == f.shape + #e = y - f + #hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2) + #return hess + + #def d3lik_d3f(self, y, f, extra_data=None): + #""" + #Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j + + #$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$ + #""" + #assert y.shape == f.shape + #e = y - f + #d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) / + #((e**2 + self.sigma2*self.v)**3) + #) + #return d3lik_d3f + + #def lik_dstd(self, y, f, extra_data=None): + #""" + #Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation) + + #Terms relavent to derivatives wrt sigma are: + #-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2)) + + #$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$ + #""" + #assert y.shape == f.shape + #e = y - f + #sigma = np.sqrt(self.sigma2) + ##dlik_dsigma = ( - (1/sigma) + + ##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) + ##) + ##dlik_dsigma = ( - 1 + + ##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) + ##) + ##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1 + #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v)) + #return dlik_dsigma + + #def dlik_df_dstd(self, y, f, extra_data=None): + #""" + #Gradient of the dlik_df w.r.t sigma parameter (standard deviation) + + #$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$ + #""" + #assert y.shape == f.shape + #e = y - f + #sigma = np.sqrt(self.sigma2) + #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here? + #/ ((self.v*self.sigma2 + e**2)**2) + #) + #return dlik_grad_dsigma + + #def d2lik_d2f_dstd(self, y, f, extra_data=None): + #""" + #Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation) + + #$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ + #""" + #assert y.shape == f.shape + #e = y - f + #sigma = np.sqrt(self.sigma2) + #dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) / + #((e**2 + self.sigma2*self.v)**3) + #) + ##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2)) + ##/ ((e**2 + (self.sigma**2)*self.v)**3) ) + #return dlik_hess_dsigma + + #def _gradients(self, y, f, extra_data=None): + ##must be listed in same order as 'get_param_names' + #derivs = ([self.lik_dstd(y, f, extra_data=extra_data)], + #[self.dlik_df_dstd(y, f, extra_data=extra_data)], + #[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)] + #) # lists as we might learn many parameters + ## ensure we have gradients for every parameter we want to optimize + #assert len(derivs[0]) == len(self._get_param_names()) + #assert len(derivs[1]) == len(self._get_param_names()) + #assert len(derivs[2]) == len(self._get_param_names()) + #return derivs