From 67248ab7c2b0becf471fe08638d35cf0786ee1a2 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Tue, 12 Mar 2013 03:16:33 -0700 Subject: [PATCH 01/22] Initial commit --- .gitignore | 35 +++++++++++++++++++++++++++++++++++ README.md | 4 ++++ 2 files changed, 39 insertions(+) create mode 100644 .gitignore create mode 100644 README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..d2d6f360 --- /dev/null +++ b/.gitignore @@ -0,0 +1,35 @@ +*.py[cod] + +# C extensions +*.so + +# Packages +*.egg +*.egg-info +dist +build +eggs +parts +bin +var +sdist +develop-eggs +.installed.cfg +lib +lib64 + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.tox +nosetests.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject diff --git a/README.md b/README.md new file mode 100644 index 00000000..317fa353 --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +coxGP +===== + +Gaussian Process models of Cox proportional hazard models \ No newline at end of file From 68eb83955c585b08cf93cbd659f749cff5b62bb3 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 12 Mar 2013 17:42:00 +0000 Subject: [PATCH 02/22] Initial commit, setting up the laplace approximation for a student t --- python/examples/laplace_approximations.py | 37 ++++++++++++++++ python/likelihoods/Laplace.py | 54 +++++++++++++++++++++++ python/likelihoods/likelihood_function.py | 51 +++++++++++++++++++++ python/models/coxGP.py | 19 ++++++++ python/testing/cox_tests.py | 14 ++++++ 5 files changed, 175 insertions(+) create mode 100644 python/examples/laplace_approximations.py create mode 100644 python/likelihoods/Laplace.py create mode 100644 python/likelihoods/likelihood_function.py create mode 100644 python/models/coxGP.py create mode 100644 python/testing/cox_tests.py diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py new file mode 100644 index 00000000..2f059831 --- /dev/null +++ b/python/examples/laplace_approximations.py @@ -0,0 +1,37 @@ +import GPy +import numpy as np +import scipy as sp +import scipy.stats +import matplotlib.pyplot as plt + + +def student_t_approx(): + """ + Example of regressing with a student t likelihood + """ + #Start a function, any function + X = np.sort(np.random.uniform(0, 15, 70))[:, None] + Y = np.sin(X) + + #Add some extreme value noise to some of the datapoints + percent_corrupted = 0.05 + corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) + indices = np.arange(Y.shape[0]) + np.random.shuffle(indices) + corrupted_indices = indices[:corrupted_datums] + print corrupted_indices + noise = np.random.uniform(-10,10,(len(corrupted_indices), 1)) + Y[corrupted_indices] += noise + + #A GP should completely break down due to the points as they get a lot of weight + # create simple GP model + m = GPy.models.GP_regression(X,Y) + + # optimize + m.ensure_default_constraints() + m.optimize() + # plot + m.plot() + print m + + #with a student t distribution, since it has heavy tails it should work well diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py new file mode 100644 index 00000000..a0dbc65c --- /dev/null +++ b/python/likelihoods/Laplace.py @@ -0,0 +1,54 @@ +import nump as np +import GPy +from GPy.util.linalg import jitchol + +class Laplace(GPy.likelihoods.likelihood): + """Laplace approximation to a posterior""" + + def __init__(self,data,likelihood_function): + """ + Laplace Approximation + + First find the moments \hat{f} and the hessian at this point (using 
Newton-Raphson) + then find the z^{prime} which allows this to be a normalised gaussian instead of a + non-normalized gaussian + + Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle} + which makes a gaussian the same as the laplace approximation + + Arguments + --------- + + :data: @todo + :likelihood_function: @todo + + """ + GPy.likelihoods.likelihood.__init__(self) + + self.data = data + self.likelihood_function = likelihood_function + + #Inital values + self.N, self.D = self.data.shape + + def _compute_GP_variables(self): + """ + Generates data Y which would give the normal distribution identical to the laplace approximation + + GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle} + that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood + """ + raise NotImplementedError + + def fit_full(self, K): + """ + The laplace approximation algorithm + For nomenclature see Rasmussen & Williams 2006 + :K: Covariance matrix + """ + self.f = np.zeros(self.N) + + #Find \hat(f) using a newton raphson optimizer for example + + #At this point get the hessian matrix + diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py new file mode 100644 index 00000000..fd19675b --- /dev/null +++ b/python/likelihoods/likelihood_function.py @@ -0,0 +1,51 @@ +import GPy +from scipy.special import gamma, gammaln + +class student_t(GPy.likelihoods.likelihood_function): + """Student t likelihood distribution + For nomanclature see Bayesian Data Analysis 2003 p576 + + Laplace: + Needs functions to calculate + ln p(yi|fi) + dln p(yi|fi)_dfi + d2ln p(yi|fi)_d2fi + """ + def __init__(self, deg_free, sigma=1): + self.v = deg_free + self.sigma = 1 + + def link_function(self, y_i, f_i): + """link_function $\ln p(y_i|f_i)$ + + :y_i: datum number i + :f_i: latent variable f_i + :returns: float(likelihood evaluated for this point) + + """ + e = y_i - f_i + return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) + + def link_grad(self, y_i, f_i): + """gradient of the link function at y_i, given f_i w.r.t f_i + + :y_i: datum number i + :f_i: latent variable f_i + :returns: float(gradient of likelihood evaluated at this point) + + """ + pass + + def link_hess(self, y_i, f_i, f_j): + """hessian at this point (the hessian will be 0 unless i == j) + i.e. second derivative w.r.t f_i and f_j + + :y_i: @todo + :f_i: @todo + :f_j: @todo + :returns: @todo + + """ + if f_i = + pass + diff --git a/python/models/coxGP.py b/python/models/coxGP.py new file mode 100644 index 00000000..f61a8f46 --- /dev/null +++ b/python/models/coxGP.py @@ -0,0 +1,19 @@ +# Copyright (c) 2013, Alan Saul + +from GPy.models import GP +from .. 
import likelihoods +from GPy import kern + + +class cox_GP_regression(GP): + """ + Cox Gaussian Process model for regression + """ + + def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None): + if kernel is None: + kernel = kern.rbf(X.shape[1]) + + likelihood = likelihoods.cox_piecewise(Y, normalize=normalize_Y) + + GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices) diff --git a/python/testing/cox_tests.py b/python/testing/cox_tests.py new file mode 100644 index 00000000..526f5c92 --- /dev/null +++ b/python/testing/cox_tests.py @@ -0,0 +1,14 @@ +# Copyright (c) 2013, Alan Saul + +import unittest +import numpy as np +import GPy + +class coxGPTests(unittest.TestCase): + def test_laplace_approx(self): + pass + +if __name__ == "__main__": + print "Running unit tests, please be (very) patient..." + unittest.main() + From ad2c266c65120e1fabf0cf1825fc0c661084611b Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 13 Mar 2013 11:54:33 +0000 Subject: [PATCH 03/22] Added some comments --- python/likelihoods/likelihood_function.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index fd19675b..5d4e51ce 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -5,6 +5,9 @@ class student_t(GPy.likelihoods.likelihood_function): """Student t likelihood distribution For nomanclature see Bayesian Data Analysis 2003 p576 + $$\ln(\frac{\Gamma(\frac{(v+1)}{2})}{\Gamma(\sqrt(v \pi \Gamma(\frac{v}{2}))})+ \ln(1+\frac{(y_i-f_i)^2}{\sigma v})^{-\frac{(v+1)}{2}}$$ + TODO:Double check this + Laplace: Needs functions to calculate ln p(yi|fi) @@ -17,6 +20,8 @@ class student_t(GPy.likelihoods.likelihood_function): def link_function(self, y_i, f_i): """link_function $\ln p(y_i|f_i)$ + $$\ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2}) - \ln \frac{v \pi \sigma}{2} - \frac{v+1}{2}\ln (1 + \frac{(y_{i} - f_{i})^{2}}{v\sigma})$$ + TODO: Double check this :y_i: datum number i :f_i: latent variable f_i @@ -24,11 +29,15 @@ class student_t(GPy.likelihoods.likelihood_function): """ e = y_i - f_i - return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) + return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) #Check the /v! def link_grad(self, y_i, f_i): """gradient of the link function at y_i, given f_i w.r.t f_i + derivative of log((gamma((v+1)/2)/gamma(sqrt(v*pi*gamma(v/2))))*(1+(t^2)/(a*v))^((-(v+1))/2)) with respect to t + $$\frac{(y_i - f_i)(v + 1)}{\sigma v (y_{i} - f_{i})^{2}}$$ + TODO: Double check this + :y_i: datum number i :f_i: latent variable f_i :returns: float(gradient of likelihood evaluated at this point) @@ -40,6 +49,8 @@ class student_t(GPy.likelihoods.likelihood_function): """hessian at this point (the hessian will be 0 unless i == j) i.e. 
second derivative w.r.t f_i and f_j + second derivative of + :y_i: @todo :f_i: @todo :f_j: @todo From 3f114aa020fb678b1c52eb441bb079d9a0b8cd00 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 13 Mar 2013 17:55:41 +0000 Subject: [PATCH 04/22] Got most of laplace approximation working --- __init__.py | 0 python/__init__.py | 0 python/examples/__init__.py | 0 python/examples/laplace_approximations.py | 44 +++++++++++-- python/likelihoods/Laplace.py | 45 +++++++++++-- python/likelihoods/__init__.py | 0 python/likelihoods/likelihood_function.py | 80 +++++++++++++---------- python/models/__init__.py | 0 python/testing/__init__.py | 0 9 files changed, 124 insertions(+), 45 deletions(-) create mode 100644 __init__.py create mode 100644 python/__init__.py create mode 100644 python/examples/__init__.py create mode 100644 python/likelihoods/__init__.py create mode 100644 python/models/__init__.py create mode 100644 python/testing/__init__.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/__init__.py b/python/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/examples/__init__.py b/python/examples/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 2f059831..0e1d3305 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -1,8 +1,9 @@ import GPy import numpy as np -import scipy as sp -import scipy.stats import matplotlib.pyplot as plt +from scipy.stats import t +from coxGP.python.likelihoods.Laplace import Laplace +from coxGP.python.likelihoods.likelihood_function import student_t def student_t_approx(): @@ -13,6 +14,41 @@ def student_t_approx(): X = np.sort(np.random.uniform(0, 15, 70))[:, None] Y = np.sin(X) + #Add student t random noise to datapoints + deg_free = 1 + noise = t.rvs(deg_free, loc=1.8, scale=1, size=Y.shape) + Y += noise + + # Kernel object + print X.shape + kernel = GPy.kern.rbf(X.shape[1]) + + #A GP should completely break down due to the points as they get a lot of weight + # create simple GP model + m = GPy.models.GP_regression(X, Y, kernel=kernel) + + # optimize + m.ensure_default_constraints() + m.optimize() + # plot + #m.plot() + print m + + #with a student t distribution, since it has heavy tails it should work well + likelihood_function = student_t(deg_free, sigma=1) + lap = Laplace(Y, likelihood_function) + cov = kernel.K(X) + lap.fit_full(cov) + + +def noisy_laplace_approx(): + """ + Example of regressing with a student t likelihood + """ + #Start a function, any function + X = np.sort(np.random.uniform(0, 15, 70))[:, None] + Y = np.sin(X) + #Add some extreme value noise to some of the datapoints percent_corrupted = 0.05 corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) @@ -20,12 +56,12 @@ def student_t_approx(): np.random.shuffle(indices) corrupted_indices = indices[:corrupted_datums] print corrupted_indices - noise = np.random.uniform(-10,10,(len(corrupted_indices), 1)) + noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1)) Y[corrupted_indices] += noise #A GP should completely break down due to the points as they get a lot of weight # create simple GP model - m = GPy.models.GP_regression(X,Y) + m = GPy.models.GP_regression(X, Y) # optimize m.ensure_default_constraints() diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index a0dbc65c..6efbfa30 100644 --- 
a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,8 +1,14 @@ -import nump as np +import numpy as np +import scipy as sp import GPy from GPy.util.linalg import jitchol +from functools import partial +from GPy.likelihoods.likelihood import likelihood +from GPy.util.linalg import pdinv,mdot -class Laplace(GPy.likelihoods.likelihood): + + +class Laplace(likelihood): """Laplace approximation to a posterior""" def __init__(self,data,likelihood_function): @@ -23,8 +29,6 @@ class Laplace(GPy.likelihoods.likelihood): :likelihood_function: @todo """ - GPy.likelihoods.likelihood.__init__(self) - self.data = data self.likelihood_function = likelihood_function @@ -38,7 +42,7 @@ class Laplace(GPy.likelihoods.likelihood): GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle} that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood """ - raise NotImplementedError + z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised def fit_full(self, K): """ @@ -46,9 +50,38 @@ class Laplace(GPy.likelihoods.likelihood): For nomenclature see Rasmussen & Williams 2006 :K: Covariance matrix """ - self.f = np.zeros(self.N) + f = np.zeros((self.N, 1)) + print K.shape + print f.shape + print self.data.shape + (Ki, _, _, log_Kdet) = pdinv(K) + obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2*np.pi)) #Find \hat(f) using a newton raphson optimizer for example + #TODO: Add newton-raphson as subclass of optimizer class + + #FIXME: Can we get rid of this horrible reshaping? + def obj(f): + f = f[:, None] + res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (Ki, f)) + obj_constant) + return float(res) + + def obj_grad(f): + f = f[:, None] + res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(Ki, f)) + return np.squeeze(res) + + def obj_hess(f): + f = f[:, None] + res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - Ki) + return np.squeeze(res) + + self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) #At this point get the hessian matrix + self.hess_hat = obj_hess(f_hat) + #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) + self.height_unnormalised = obj(f_hat) #FIXME: Is it -1? 
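        # [Editor's gloss, not part of the patch] On the FIXME above: with the
        # prior constant taken as -0.5*ln|K| (patch 07 later flips this sign),
        # obj() is the negative log of p(y|f) * N(f|0, K), so the unnormalised
        # height at the mode is exp(-obj(f_hat)) and height-matching gives
        #     ln z_hat = -obj(f_hat) + (N/2)*ln(2*pi) - 0.5*ln|hess_hat|,
        # i.e. the -1 is needed, which is essentially where patch 06 ends up.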
+ + return _compute_GP_variables() diff --git a/python/likelihoods/__init__.py b/python/likelihoods/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 5d4e51ce..78731199 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -1,62 +1,72 @@ -import GPy -from scipy.special import gamma, gammaln +from scipy.special import gammaln +import numpy as np +from GPy.likelihoods.likelihood_functions import likelihood_function -class student_t(GPy.likelihoods.likelihood_function): + +class student_t(likelihood_function): """Student t likelihood distribution For nomanclature see Bayesian Data Analysis 2003 p576 - $$\ln(\frac{\Gamma(\frac{(v+1)}{2})}{\Gamma(\sqrt(v \pi \Gamma(\frac{v}{2}))})+ \ln(1+\frac{(y_i-f_i)^2}{\sigma v})^{-\frac{(v+1)}{2}}$$ - TODO:Double check this + $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ Laplace: Needs functions to calculate ln p(yi|fi) dln p(yi|fi)_dfi - d2ln p(yi|fi)_d2fi + d2ln p(yi|fi)_d2fifj """ def __init__(self, deg_free, sigma=1): self.v = deg_free self.sigma = 1 - def link_function(self, y_i, f_i): - """link_function $\ln p(y_i|f_i)$ - $$\ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2}) - \ln \frac{v \pi \sigma}{2} - \frac{v+1}{2}\ln (1 + \frac{(y_{i} - f_{i})^{2}}{v\sigma})$$ - TODO: Double check this + def link_function(self, y, f): + """link_function $\ln p(y|f)$ + $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ - :y_i: datum number i - :f_i: latent variable f_i + :y: datum number i + :f: latent variable f :returns: float(likelihood evaluated for this point) """ - e = y_i - f_i - return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) #Check the /v! + e = y - f + #print "Link ", y.shape, f.shape, e.shape + objective = (gammaln((self.v + 1) * 0.5) + - gammaln(self.v * 0.5) + + np.log(self.sigma * np.sqrt(self.v * np.pi)) + - (self.v + 1) * 0.5 + * np.log(1 + ((e**2 / self.sigma**2) / self.v)) + ) + return np.sum(objective) - def link_grad(self, y_i, f_i): - """gradient of the link function at y_i, given f_i w.r.t f_i + def link_grad(self, y, f): + """ + Gradient of the link function at y, given f w.r.t f - derivative of log((gamma((v+1)/2)/gamma(sqrt(v*pi*gamma(v/2))))*(1+(t^2)/(a*v))^((-(v+1))/2)) with respect to t - $$\frac{(y_i - f_i)(v + 1)}{\sigma v (y_{i} - f_{i})^{2}}$$ - TODO: Double check this + $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ - :y_i: datum number i - :f_i: latent variable f_i + :y: datum number i + :f: latent variable f :returns: float(gradient of likelihood evaluated at this point) """ - pass - - def link_hess(self, y_i, f_i, f_j): - """hessian at this point (the hessian will be 0 unless i == j) - i.e. 
second derivative w.r.t f_i and f_j - - second derivative of - - :y_i: @todo - :f_i: @todo - :f_j: @todo - :returns: @todo + e = y - f + #print "Grad ", y.shape, f.shape, e.shape + grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) + return grad + def link_hess(self, y, f): """ - if f_i = - pass + Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j + i.e. second derivative link_function at y given f f_j w.r.t f and f_j + Will return diaganol of hessian, since every where else it is 0 + + $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ + + :y: datum number i + :f: latent variable f + :returns: float(second derivative of likelihood evaluated at this point) + """ + e = y - f + hess = ((self.v + 1) * e) / ((((self.sigma**2)*self.v) + e**2)**2) + return hess diff --git a/python/models/__init__.py b/python/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/testing/__init__.py b/python/testing/__init__.py new file mode 100644 index 00000000..e69de29b From f9535c858a653e08a32a8633fe37577c87812820 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 14 Mar 2013 15:30:22 +0000 Subject: [PATCH 05/22] Trying to 'debug' --- python/examples/laplace_approximations.py | 22 +++++++++++--- python/likelihoods/Laplace.py | 25 +++++++++------ python/likelihoods/likelihood_function.py | 37 ++++++++++++----------- 3 files changed, 52 insertions(+), 32 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 0e1d3305..5642d8a4 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -1,7 +1,7 @@ import GPy import numpy as np import matplotlib.pyplot as plt -from scipy.stats import t +from scipy.stats import t, norm from coxGP.python.likelihoods.Laplace import Laplace from coxGP.python.likelihoods.likelihood_function import student_t @@ -11,12 +11,13 @@ def student_t_approx(): Example of regressing with a student t likelihood """ #Start a function, any function - X = np.sort(np.random.uniform(0, 15, 70))[:, None] + X = np.sort(np.random.uniform(0, 15, 100))[:, None] Y = np.sin(X) #Add student t random noise to datapoints - deg_free = 1 - noise = t.rvs(deg_free, loc=1.8, scale=1, size=Y.shape) + deg_free = 2.5 + t_rv = t(deg_free, loc=5, scale=1) + noise = t_rv.rvs(size=Y.shape) Y += noise # Kernel object @@ -39,6 +40,19 @@ def student_t_approx(): lap = Laplace(Y, likelihood_function) cov = kernel.K(X) lap.fit_full(cov) + #Get one sample (just look at a single Y + mode = float(lap.f_hat[0]) + variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables + #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables + normalised_approx = norm(loc=mode, scale=variance) + print "Normal with mode %f, and variance %f" % (mode, variance) + print lap.height_unnormalised + + test_range = np.arange(0, 10, 0.1) + print np.diagonal(lap.hess_hat) + plt.plot(test_range, t_rv.pdf(test_range)) + plt.plot(test_range, normalised_approx.pdf(test_range)) + plt.show() def noisy_laplace_approx(): diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 6efbfa30..08ae0e6f 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -5,13 +5,13 @@ from GPy.util.linalg import jitchol from functools import partial from 
GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot - +from scipy.stats import norm class Laplace(likelihood): """Laplace approximation to a posterior""" - def __init__(self,data,likelihood_function): + def __init__(self, data, likelihood_function): """ Laplace Approximation @@ -42,7 +42,13 @@ class Laplace(likelihood): GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle} that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood """ - z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised + #z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised + normalised_approx = norm(loc=self.f_hat, scale=self.hess_hat) + self.Z = normalised_approx.pdf(self.f_hat)/self.height_unnormalised + #self.Y = + #self.YYT = + #self.covariance_matrix = + #self.precision = def fit_full(self, K): """ @@ -51,11 +57,9 @@ class Laplace(likelihood): :K: Covariance matrix """ f = np.zeros((self.N, 1)) - print K.shape - print f.shape - print self.data.shape + #K = np.diag(np.ones(self.N)) (Ki, _, _, log_Kdet) = pdinv(K) - obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2*np.pi)) + obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi)) #Find \hat(f) using a newton raphson optimizer for example #TODO: Add newton-raphson as subclass of optimizer class @@ -77,11 +81,12 @@ class Laplace(likelihood): return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) + print self.f_hat #At this point get the hessian matrix - self.hess_hat = obj_hess(f_hat) + self.hess_hat = obj_hess(self.f_hat) #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) - self.height_unnormalised = obj(f_hat) #FIXME: Is it -1? + self.height_unnormalised = obj(self.f_hat) #FIXME: Is it -1? 
- return _compute_GP_variables() + return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 78731199..46128de7 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -15,27 +15,27 @@ class student_t(likelihood_function): dln p(yi|fi)_dfi d2ln p(yi|fi)_d2fifj """ - def __init__(self, deg_free, sigma=1): + def __init__(self, deg_free, sigma=2): self.v = deg_free - self.sigma = 1 + self.sigma = sigma def link_function(self, y, f): """link_function $\ln p(y|f)$ $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ - :y: datum number i - :f: latent variable f + :y: data + :f: latent variables f :returns: float(likelihood evaluated for this point) """ + assert y.shape[0] == f.shape[0] e = y - f - #print "Link ", y.shape, f.shape, e.shape objective = (gammaln((self.v + 1) * 0.5) - - gammaln(self.v * 0.5) - + np.log(self.sigma * np.sqrt(self.v * np.pi)) - - (self.v + 1) * 0.5 - * np.log(1 + ((e**2 / self.sigma**2) / self.v)) - ) + - gammaln(self.v * 0.5) + + np.log(self.sigma * np.sqrt(self.v * np.pi)) + - (self.v + 1) * 0.5 + * np.log(1 + ((e**2 / self.sigma**2) / self.v)) + ) return np.sum(objective) def link_grad(self, y, f): @@ -44,13 +44,13 @@ class student_t(likelihood_function): $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ - :y: datum number i - :f: latent variable f - :returns: float(gradient of likelihood evaluated at this point) + :y: data + :f: latent variables f + :returns: gradient of likelihood evaluated at points """ + assert y.shape[0] == f.shape[0] e = y - f - #print "Grad ", y.shape, f.shape, e.shape grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) return grad @@ -63,10 +63,11 @@ class student_t(likelihood_function): $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ - :y: datum number i - :f: latent variable f - :returns: float(second derivative of likelihood evaluated at this point) + :y: data + :f: latent variables f + :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ + assert y.shape[0] == f.shape[0] e = y - f - hess = ((self.v + 1) * e) / ((((self.sigma**2)*self.v) + e**2)**2) + hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) return hess From 34ae852eea8d5f6cdc48028d4f21457c7f0b5259 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 15 Mar 2013 17:38:13 +0000 Subject: [PATCH 06/22] got an idea of how to implement! written in docs --- python/likelihoods/Laplace.py | 38 ++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 08ae0e6f..568fcef0 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -41,10 +41,26 @@ class Laplace(likelihood): GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle} that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood + + Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal) + then we have a rescaled normal distibution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f) + due to the z rescaling. 
+ + at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1) + + This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1) + giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f) + + $$\tilde{Y} = \tilde{\Sigma} Hf$$ + where + $$\tilde{\Sigma}^{-1} = H - K^{-1}$$ + i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$ + since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$ + and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ + """ - #z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised - normalised_approx = norm(loc=self.f_hat, scale=self.hess_hat) - self.Z = normalised_approx.pdf(self.f_hat)/self.height_unnormalised + self.Sigma_tilde = self.hess_hat - + self.Z = #self.Y = #self.YYT = #self.covariance_matrix = @@ -58,8 +74,8 @@ class Laplace(likelihood): """ f = np.zeros((self.N, 1)) #K = np.diag(np.ones(self.N)) - (Ki, _, _, log_Kdet) = pdinv(K) - obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi)) + (self.Ki, _, _, self.log_Kdet) = pdinv(K) + obj_constant = (0.5 * self.log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi)) #Find \hat(f) using a newton raphson optimizer for example #TODO: Add newton-raphson as subclass of optimizer class @@ -67,17 +83,17 @@ class Laplace(likelihood): #FIXME: Can we get rid of this horrible reshaping? def obj(f): f = f[:, None] - res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (Ki, f)) + obj_constant) + res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (self.Ki, f)) + obj_constant) return float(res) def obj_grad(f): f = f[:, None] - res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(Ki, f)) + res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(self.Ki, f)) return np.squeeze(res) def obj_hess(f): f = f[:, None] - res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - Ki) + res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) @@ -87,6 +103,10 @@ class Laplace(likelihood): self.hess_hat = obj_hess(self.f_hat) #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) - self.height_unnormalised = obj(self.f_hat) #FIXME: Is it -1? + self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? 
+ #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to + #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode + #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) + self.z_hat = np.exp(-0.5*np.log(np.linalg.det(hess_hat)) + self.height_unnormalised) return self._compute_GP_variables() From 2bf1cf0eb6596773c2f75a06f152b3a7cfd66081 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 18 Mar 2013 15:59:12 +0000 Subject: [PATCH 07/22] following naming convention better, lots of inverses which should be able to get rid of one or two, unsure if it works --- python/examples/laplace_approximations.py | 17 +++++---- python/likelihoods/Laplace.py | 43 +++++++++++++---------- python/likelihoods/likelihood_function.py | 9 ++--- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 5642d8a4..aa8cdcb4 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -41,18 +41,21 @@ def student_t_approx(): cov = kernel.K(X) lap.fit_full(cov) #Get one sample (just look at a single Y - mode = float(lap.f_hat[0]) - variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables + #mode = float(lap.f_hat[0]) + #variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables - normalised_approx = norm(loc=mode, scale=variance) - print "Normal with mode %f, and variance %f" % (mode, variance) - print lap.height_unnormalised test_range = np.arange(0, 10, 0.1) - print np.diagonal(lap.hess_hat) plt.plot(test_range, t_rv.pdf(test_range)) - plt.plot(test_range, normalised_approx.pdf(test_range)) + for i in xrange(X.shape[0]): + mode = lap.f_hat[i] + covariance = lap.hess_hat_i[i,i] + scaling = np.exp(lap.ln_z_hat) + normalised_approx = norm(loc=mode, scale=covariance) + print "Normal with mode %f, and variance %f" % (mode, covariance) + plt.plot(test_range, normalised_approx.pdf(test_range)) plt.show() + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def noisy_laplace_approx(): diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 568fcef0..9d622b0d 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,12 +1,10 @@ import numpy as np import scipy as sp import GPy -from GPy.util.linalg import jitchol +#from GPy.util.linalg import jitchol from functools import partial from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot -from scipy.stats import norm - class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -35,6 +33,8 @@ class Laplace(likelihood): #Inital values self.N, self.D = self.data.shape + self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi)) + def _compute_GP_variables(self): """ Generates data Y which would give the normal distribution identical to the laplace approximation @@ -59,12 +59,15 @@ class Laplace(likelihood): and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ """ - self.Sigma_tilde = self.hess_hat - - self.Z = - #self.Y = - #self.YYT = - #self.covariance_matrix = - #self.precision = + self.Sigma_tilde_i = self.hess_hat + self.Ki + #Do we really need to inverse Sigma_tilde_i? 
:( + (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i) + Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess? + self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde, (self.Sigma_tilde_i, Y_tilde)))) + self.Y = Y_tilde + self.covariance_matrix = self.Sigma_tilde + self.precision = np.diag(self.Sigma_tilde)[:, None] + self.YYT = np.dot(self.Y, self.Y) def fit_full(self, K): """ @@ -75,38 +78,40 @@ class Laplace(likelihood): f = np.zeros((self.N, 1)) #K = np.diag(np.ones(self.N)) (self.Ki, _, _, self.log_Kdet) = pdinv(K) - obj_constant = (0.5 * self.log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi)) - + LOG_K_CONST = -(0.5 * self.log_Kdet) + OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST #Find \hat(f) using a newton raphson optimizer for example #TODO: Add newton-raphson as subclass of optimizer class #FIXME: Can we get rid of this horrible reshaping? def obj(f): - f = f[:, None] - res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (self.Ki, f)) + obj_constant) + #f = f[:, None] + res = -1 * (self.likelihood_function.link_function(self.data[:,0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST) return float(res) def obj_grad(f): - f = f[:, None] - res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(self.Ki, f)) + #f = f[:, None] + res = -1 * (self.likelihood_function.link_grad(self.data[:,0], f) - mdot(self.Ki, f)) return np.squeeze(res) def obj_hess(f): - f = f[:, None] - res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - self.Ki) + res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) print self.f_hat #At this point get the hessian matrix - self.hess_hat = obj_hess(self.f_hat) + self.hess_hat = -1*np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) #-1*obj_hess(self.f_hat) + self.Ki + #self.hess_hat = -1*obj_hess(self.f_hat) + self.Ki + (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat + self.Ki) #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? 
#z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) - self.z_hat = np.exp(-0.5*np.log(np.linalg.det(hess_hat)) + self.height_unnormalised) + self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) + self.height_unnormalised - self.NORMAL_CONST #Unsure whether its log_hess or log_hess_i + return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 46128de7..8adbf86c 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -28,7 +28,7 @@ class student_t(likelihood_function): :returns: float(likelihood evaluated for this point) """ - assert y.shape[0] == f.shape[0] + assert y.shape == f.shape e = y - f objective = (gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) @@ -49,7 +49,7 @@ class student_t(likelihood_function): :returns: gradient of likelihood evaluated at points """ - assert y.shape[0] == f.shape[0] + assert y.shape == f.shape e = y - f grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) return grad @@ -67,7 +67,8 @@ class student_t(likelihood_function): :f: latent variables f :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ - assert y.shape[0] == f.shape[0] + assert y.shape == f.shape e = y - f - hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) + #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) + hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2) return hess From 46d59c94b27cabe61056b71aa26d1293779c0697 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 19 Mar 2013 11:47:53 +0000 Subject: [PATCH 08/22] Just breaking some things... 
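Editor's note (not part of the original patch email): the commits so far find the
posterior mode f_hat with scipy's fmin_ncg plus hand-rolled obj/obj_grad/obj_hess
closures, which is where most of the sign confusion in the FIXMEs creeps in. For
reference, here is a minimal self-contained sketch of the same mode search written
as the Newton iteration of Rasmussen & Williams (2006), Algorithm 3.1. The names
laplace_mode, lik_grad and lik_hess_diag are illustrative stand-ins for the
link_grad/link_hess methods in likelihood_function.py, and the clipping of W is an
added assumption, needed because the Student-t likelihood is not log-concave and
W can go negative:

    import numpy as np

    def laplace_mode(K, y, lik_grad, lik_hess_diag, maxiter=100, tol=1e-9):
        """Newton search for f_hat = argmax_f  log p(y|f) - 0.5 f' K^{-1} f."""
        N = y.shape[0]
        f = np.zeros(N)
        for _ in range(maxiter):
            W = -lik_hess_diag(y, f)        # W = -diag(d2 log p(y|f) / df2)
            W = np.clip(W, 1e-10, np.inf)   # assumption: keep B positive definite
            sW = np.sqrt(W)
            B = np.eye(N) + sW[:, None] * K * sW[None, :]  # B = I + W^0.5 K W^0.5
            L = np.linalg.cholesky(B)
            b = W * f + lik_grad(y, f)
            a = b - sW * np.linalg.solve(L.T, np.linalg.solve(L, sW * np.dot(K, b)))
            f_new = np.dot(K, a)            # Newton step with no explicit K^{-1}
            if np.max(np.abs(f_new - f)) < tol:
                return f_new
            f = f_new
        return f

Working through B = I + W^0.5 K W^0.5 never forms K^{-1} explicitly, which
sidesteps the pdinv calls and the repeated "is it -1?" sign questions in obj_hess.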
--- python/examples/laplace_approximations.py | 88 +++++++++++++++-------- python/likelihoods/Laplace.py | 52 ++++++++++---- python/likelihoods/likelihood_function.py | 16 ++++- 3 files changed, 113 insertions(+), 43 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index aa8cdcb4..73c8f67f 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -16,47 +16,75 @@ def student_t_approx(): #Add student t random noise to datapoints deg_free = 2.5 - t_rv = t(deg_free, loc=5, scale=1) + t_rv = t(deg_free, loc=0, scale=1) noise = t_rv.rvs(size=Y.shape) Y += noise + #Add some extreme value noise to some of the datapoints + #percent_corrupted = 0.05 + #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) + #indices = np.arange(Y.shape[0]) + #np.random.shuffle(indices) + #corrupted_indices = indices[:corrupted_datums] + #print corrupted_indices + #noise = t_rv.rvs(size=(len(corrupted_indices), 1)) + #Y[corrupted_indices] += noise + # Kernel object - print X.shape - kernel = GPy.kern.rbf(X.shape[1]) + #print X.shape + #kernel = GPy.kern.rbf(X.shape[1]) - #A GP should completely break down due to the points as they get a lot of weight - # create simple GP model - m = GPy.models.GP_regression(X, Y, kernel=kernel) + ##A GP should completely break down due to the points as they get a lot of weight + ## create simple GP model + #m = GPy.models.GP_regression(X, Y, kernel=kernel) - # optimize - m.ensure_default_constraints() - m.optimize() - # plot - #m.plot() - print m + ## optimize + #m.ensure_default_constraints() + #m.optimize() + ## plot + ##m.plot() + #print m #with a student t distribution, since it has heavy tails it should work well - likelihood_function = student_t(deg_free, sigma=1) - lap = Laplace(Y, likelihood_function) - cov = kernel.K(X) - lap.fit_full(cov) - #Get one sample (just look at a single Y - #mode = float(lap.f_hat[0]) - #variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables - #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables + #likelihood_function = student_t(deg_free, sigma=1) + #lap = Laplace(Y, likelihood_function) + #cov = kernel.K(X) + #lap.fit_full(cov) - test_range = np.arange(0, 10, 0.1) - plt.plot(test_range, t_rv.pdf(test_range)) - for i in xrange(X.shape[0]): - mode = lap.f_hat[i] - covariance = lap.hess_hat_i[i,i] - scaling = np.exp(lap.ln_z_hat) - normalised_approx = norm(loc=mode, scale=covariance) - print "Normal with mode %f, and variance %f" % (mode, covariance) - plt.plot(test_range, normalised_approx.pdf(test_range)) - plt.show() + #test_range = np.arange(0, 10, 0.1) + #plt.plot(test_range, t_rv.pdf(test_range)) + #for i in xrange(X.shape[0]): + #mode = lap.f_hat[i] + #covariance = lap.hess_hat_i[i,i] + #scaling = np.exp(lap.ln_z_hat) + #normalised_approx = norm(loc=mode, scale=covariance) + #print "Normal with mode %f, and variance %f" % (mode, covariance) + #plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) + #plt.show() + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + # Likelihood object + t_distribution = student_t(deg_free, sigma=1) + stu_t_likelihood = Laplace(Y, t_distribution) + kernel = GPy.kern.rbf(X.shape[1]) + + m = GPy.models.GP(X, stu_t_likelihood, kernel) + m.ensure_default_constraints() + + m.update_likelihood_approximation() + print "NEW MODEL" + print(m) + + # optimize + 
#m.optimize() + print(m) + + # plot + m.plot() import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return m + def noisy_laplace_approx(): """ diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 9d622b0d..23db6abd 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -5,6 +5,7 @@ import GPy from functools import partial from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot +import numpy.testing.assert_array_equal class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -35,6 +36,29 @@ class Laplace(likelihood): self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi)) + #Initial values for the GP variables + self.Y = np.zeros((self.N,1)) + self.covariance_matrix = np.eye(self.N) + self.precision = np.ones(self.N)[:,None] + self.Z = 0 + self.YYT = None + + def predictive_values(self,mu,var): + return self.likelihood_function.predictive_values(mu,var) + + def _get_params(self): + return np.zeros(0) + + def _get_param_names(self): + return [] + + def _set_params(self,p): + pass # TODO: Laplace likelihood might want to take some parameters... + + def _gradients(self,partial): + raise NotImplementedError + #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... + def _compute_GP_variables(self): """ Generates data Y which would give the normal distribution identical to the laplace approximation @@ -63,11 +87,14 @@ class Laplace(likelihood): #Do we really need to inverse Sigma_tilde_i? :( (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i) Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess? - self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde, (self.Sigma_tilde_i, Y_tilde)))) + self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)))) + + self.Z = self.Z_tilde self.Y = Y_tilde self.covariance_matrix = self.Sigma_tilde - self.precision = np.diag(self.Sigma_tilde)[:, None] - self.YYT = np.dot(self.Y, self.Y) + self.precision = 1/np.diag(self.Sigma_tilde)[:, None] + self.YYT = np.dot(self.Y, self.Y.T) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def fit_full(self, K): """ @@ -76,7 +103,6 @@ class Laplace(likelihood): :K: Covariance matrix """ f = np.zeros((self.N, 1)) - #K = np.diag(np.ones(self.N)) (self.Ki, _, _, self.log_Kdet) = pdinv(K) LOG_K_CONST = -(0.5 * self.log_Kdet) OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST @@ -95,23 +121,25 @@ class Laplace(likelihood): return np.squeeze(res) def obj_hess(f): - res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki) + res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) - print self.f_hat #At this point get the hessian matrix - self.hess_hat = -1*np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) #-1*obj_hess(self.f_hat) + self.Ki - #self.hess_hat = -1*obj_hess(self.f_hat) + self.Ki - (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat + self.Ki) + self.hess_hat = np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) + self.Ki + (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat) + (self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat_i) + + np.testing.assert_array_equal(self.hess_hat, 
hess_hat_new) #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) - self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? + #self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) - self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) + self.height_unnormalised - self.NORMAL_CONST #Unsure whether its log_hess or log_hess_i - + #Unsure whether its log_hess or log_hess_i + self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(f.T, (self.Ki, f)) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 8adbf86c..e70cdc8d 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -1,7 +1,7 @@ from scipy.special import gammaln import numpy as np from GPy.likelihoods.likelihood_functions import likelihood_function - +from scipy import stats class student_t(likelihood_function): """Student t likelihood distribution @@ -72,3 +72,17 @@ class student_t(likelihood_function): #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2) return hess + + def predictive_values(self, mu, var): + """ + Compute mean, and conficence interval (percentiles 5 and 95) of the prediction + """ + mean = np.exp(mu) + p_025 = stats.t.ppf(025,mean) + p_975 = stats.t.ppf(975,mean) + + #p_025 = tmp[:,0] + #p_975 = tmp[:,1] + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return mean,p_025,p_975 + From a9d555597653c24bc67812776514e29066216d66 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 19 Mar 2013 18:21:57 +0000 Subject: [PATCH 09/22] Worked out in terms of W, needs gradients implementing --- python/examples/laplace_approximations.py | 44 ++++++++++----------- python/likelihoods/Laplace.py | 48 +++++++++++++++-------- python/likelihoods/likelihood_function.py | 5 ++- 3 files changed, 57 insertions(+), 40 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 73c8f67f..c8d06ab2 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -15,13 +15,13 @@ def student_t_approx(): Y = np.sin(X) #Add student t random noise to datapoints - deg_free = 2.5 + deg_free = 3.5 t_rv = t(deg_free, loc=0, scale=1) noise = t_rv.rvs(size=Y.shape) Y += noise #Add some extreme value noise to some of the datapoints - #percent_corrupted = 0.05 + #percent_corrupted = 0.15 #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) #indices = np.arange(Y.shape[0]) #np.random.shuffle(indices) @@ -31,11 +31,11 @@ def student_t_approx(): #Y[corrupted_indices] += noise # Kernel object - #print X.shape - #kernel = GPy.kern.rbf(X.shape[1]) + print X.shape + kernel = GPy.kern.rbf(X.shape[1]) - ##A GP should completely break down due to the points as they get a lot of weight - ## create simple GP model + #A GP should completely break down due to the points as 
they get a lot of weight + # create simple GP model #m = GPy.models.GP_regression(X, Y, kernel=kernel) ## optimize @@ -46,27 +46,27 @@ def student_t_approx(): #print m #with a student t distribution, since it has heavy tails it should work well - #likelihood_function = student_t(deg_free, sigma=1) - #lap = Laplace(Y, likelihood_function) - #cov = kernel.K(X) - #lap.fit_full(cov) + likelihood_function = student_t(deg_free, sigma=1) + lap = Laplace(Y, likelihood_function) + cov = kernel.K(X) + lap.fit_full(cov) - #test_range = np.arange(0, 10, 0.1) - #plt.plot(test_range, t_rv.pdf(test_range)) - #for i in xrange(X.shape[0]): - #mode = lap.f_hat[i] - #covariance = lap.hess_hat_i[i,i] - #scaling = np.exp(lap.ln_z_hat) - #normalised_approx = norm(loc=mode, scale=covariance) - #print "Normal with mode %f, and variance %f" % (mode, covariance) - #plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) - #plt.show() - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + test_range = np.arange(0, 10, 0.1) + plt.plot(test_range, t_rv.pdf(test_range)) + for i in xrange(X.shape[0]): + mode = lap.f_hat[i] + covariance = lap.hess_hat_i[i,i] + scaling = np.exp(lap.ln_z_hat) + normalised_approx = norm(loc=mode, scale=covariance) + print "Normal with mode %f, and variance %f" % (mode, covariance) + plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) + plt.show() + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT # Likelihood object t_distribution = student_t(deg_free, sigma=1) stu_t_likelihood = Laplace(Y, t_distribution) - kernel = GPy.kern.rbf(X.shape[1]) + kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1]) m = GPy.models.GP(X, stu_t_likelihood, kernel) m.ensure_default_constraints() diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 23db6abd..84128e3a 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,11 +1,11 @@ import numpy as np import scipy as sp import GPy -#from GPy.util.linalg import jitchol +from scipy.linalg import cholesky, eig, inv from functools import partial from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot -import numpy.testing.assert_array_equal +#import numpy.testing.assert_array_equal class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -56,8 +56,8 @@ class Laplace(likelihood): pass # TODO: Laplace likelihood might want to take some parameters... def _gradients(self,partial): + return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... raise NotImplementedError - #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... def _compute_GP_variables(self): """ @@ -83,16 +83,23 @@ class Laplace(likelihood): and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ """ - self.Sigma_tilde_i = self.hess_hat + self.Ki + self.Sigma_tilde_i = self.hess_hat_i #self.W #self.hess_hat_i - self.Ki #Do we really need to inverse Sigma_tilde_i? :( - (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i) - Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess? - self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)))) + if self.likelihood_function.log_concave: + (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i) + else: + self.Sigma_tilde = inv(self.Sigma_tilde_i) + #f_hat? 
should be f but we must have optimized for them I guess? + Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) + self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + - 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat) + + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) + ) self.Z = self.Z_tilde self.Y = Y_tilde self.covariance_matrix = self.Sigma_tilde - self.precision = 1/np.diag(self.Sigma_tilde)[:, None] + self.precision = 1 / np.diag(self.Sigma_tilde)[:, None] self.YYT = np.dot(self.Y, self.Y.T) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT @@ -112,34 +119,41 @@ class Laplace(likelihood): #FIXME: Can we get rid of this horrible reshaping? def obj(f): #f = f[:, None] - res = -1 * (self.likelihood_function.link_function(self.data[:,0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST) + res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST) return float(res) def obj_grad(f): #f = f[:, None] - res = -1 * (self.likelihood_function.link_grad(self.data[:,0], f) - mdot(self.Ki, f)) + res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f)) return np.squeeze(res) def obj_hess(f): - res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki) + res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) #At this point get the hessian matrix - self.hess_hat = np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) + self.Ki + self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat)) + self.hess_hat = self.Ki + self.W (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat) - (self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat_i) - np.testing.assert_array_equal(self.hess_hat, hess_hat_new) + #Check hess_hat is positive definite + try: + cholesky(self.hess_hat) + except: + raise ValueError("Must be positive definite") + + #Check its eigenvalues are positive + eigenvalues = eig(self.hess_hat) + if not np.all(eigenvalues > 0): + raise ValueError("Eigen values not positive") - #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) - #self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? 
#z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) #Unsure whether its log_hess or log_hess_i - self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(f.T, (self.Ki, f)) + self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(self.f_hat.T, (self.Ki, self.f_hat)) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index e70cdc8d..c4823703 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -19,6 +19,9 @@ class student_t(likelihood_function): self.v = deg_free self.sigma = sigma + #FIXME: This should be in the superclass + self.log_concave = False + def link_function(self, y, f): """link_function $\ln p(y|f)$ $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ @@ -70,7 +73,7 @@ class student_t(likelihood_function): assert y.shape == f.shape e = y - f #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) - hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2) + hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) return hess def predictive_values(self, mu, var): From 474d5484b06bdbceefa08fa573d28326bb3f8a92 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 21 Mar 2013 14:00:22 +0000 Subject: [PATCH 10/22] Changing definitions again... 
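Editor's note (not part of the original patch email): the _compute_GP_variables
rework below reduces the pseudo-data algebra to W alone. Under the definitions in
the patch-06 docstring, H = K^{-1} + W (hess_hat in the code) and
Sigma_tilde^{-1} = H - K^{-1} = W, a compact sketch of that algebra is given
here; the function name and the explicit inverse are illustrative, not the
patches' API:

    import numpy as np

    def gaussian_pseudo_data(f_hat, W, K):
        """Pseudo-data (Y_tilde, Sigma_tilde) whose Gaussian likelihood
        N(Y_tilde | f, Sigma_tilde), combined with the prior N(f | 0, K),
        has posterior mean f_hat and posterior precision H = K^{-1} + W."""
        H = np.linalg.inv(K) + np.diag(W)   # explicit inverse for clarity only
        Sigma_tilde = np.diag(1.0 / W)      # since Sigma_tilde^{-1} = W
        Y_tilde = np.dot(Sigma_tilde, np.dot(H, f_hat))  # Y_tilde = Sigma_tilde H f_hat
        return Y_tilde, Sigma_tilde

The posterior mean check is one line: (K^{-1} + W)^{-1} W Y_tilde = H^{-1} W
W^{-1} H f_hat = f_hat. Note Sigma_tilde is only a valid covariance when every
W_i > 0; for the Student-t likelihood some W_i can be negative at the mode, which
is exactly the non-log-concave trouble the log_concave flag and the
inv()-instead-of-pdinv() branch above are working around.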
--- python/examples/laplace_approximations.py | 15 +++++--- python/likelihoods/Laplace.py | 44 +++++++++++++++-------- python/likelihoods/likelihood_function.py | 10 ++---- 3 files changed, 43 insertions(+), 26 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index c8d06ab2..6f2b19aa 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -15,8 +15,9 @@ def student_t_approx(): Y = np.sin(X) #Add student t random noise to datapoints - deg_free = 3.5 - t_rv = t(deg_free, loc=0, scale=1) + deg_free = 100000.5 + real_var = 4 + t_rv = t(deg_free, loc=0, scale=real_var) noise = t_rv.rvs(size=Y.shape) Y += noise @@ -46,7 +47,7 @@ def student_t_approx(): #print m #with a student t distribution, since it has heavy tails it should work well - likelihood_function = student_t(deg_free, sigma=1) + likelihood_function = student_t(deg_free, sigma=real_var) lap = Laplace(Y, likelihood_function) cov = kernel.K(X) lap.fit_full(cov) @@ -64,7 +65,7 @@ def student_t_approx(): import ipdb; ipdb.set_trace() ### XXX BREAKPOINT # Likelihood object - t_distribution = student_t(deg_free, sigma=1) + t_distribution = student_t(deg_free, sigma=real_var) stu_t_likelihood = Laplace(Y, t_distribution) kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1]) @@ -77,12 +78,16 @@ def student_t_approx(): # optimize #m.optimize() - print(m) + #print(m) # plot m.plot() import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + m.optimize() + print(m) + + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return m diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 84128e3a..b002034d 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,7 +1,7 @@ import numpy as np import scipy as sp import GPy -from scipy.linalg import cholesky, eig, inv +from scipy.linalg import cholesky, eig, inv, det from functools import partial from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot @@ -43,8 +43,10 @@ class Laplace(likelihood): self.Z = 0 self.YYT = None - def predictive_values(self,mu,var): - return self.likelihood_function.predictive_values(mu,var) + def predictive_values(self, mu, var, full_cov): + if full_cov: + raise NotImplementedError("Cannot make correlated predictions with an EP likelihood") + return self.likelihood_function.predictive_values(mu, var) def _get_params(self): return np.zeros(0) @@ -52,10 +54,10 @@ class Laplace(likelihood): def _get_param_names(self): return [] - def _set_params(self,p): + def _set_params(self, p): pass # TODO: Laplace likelihood might want to take some parameters... - def _gradients(self,partial): + def _gradients(self, partial): return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... raise NotImplementedError @@ -83,7 +85,13 @@ class Laplace(likelihood): and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ """ - self.Sigma_tilde_i = self.hess_hat_i #self.W #self.hess_hat_i - self.Ki + self.Sigma_tilde_i = self.W #self.hess_hat_i + #Check it isn't singular! + epsilon = 1e-2 + """ + if np.abs(det(self.Sigma_tilde_i)) < epsilon: + raise ValueError("inverse covariance must be non-singular to inverse!") + """ #Do we really need to inverse Sigma_tilde_i? 
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 84128e3a..b002034d 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,7 +1,7 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv
+from scipy.linalg import cholesky, eig, inv, det
 from functools import partial
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv,mdot
@@ -43,8 +43,10 @@ class Laplace(likelihood):
         self.Z = 0
         self.YYT = None

-    def predictive_values(self,mu,var):
-        return self.likelihood_function.predictive_values(mu,var)
+    def predictive_values(self, mu, var, full_cov):
+        if full_cov:
+            raise NotImplementedError("Cannot make correlated predictions with a Laplace likelihood")
+        return self.likelihood_function.predictive_values(mu, var)

     def _get_params(self):
         return np.zeros(0)
@@ -52,10 +54,10 @@ class Laplace(likelihood):
     def _get_param_names(self):
         return []

-    def _set_params(self,p):
+    def _set_params(self, p):
         pass # TODO: Laplace likelihood might want to take some parameters...

-    def _gradients(self,partial):
+    def _gradients(self, partial):
         return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
@@ -83,7 +85,13 @@ class Laplace(likelihood):
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
         """
-        self.Sigma_tilde_i = self.hess_hat_i #self.W #self.hess_hat_i - self.Ki
+        self.Sigma_tilde_i = self.W #self.hess_hat_i
+        #Check it isn't singular!
+        epsilon = 1e-2
+        """
+        if np.abs(det(self.Sigma_tilde_i)) < epsilon:
+            raise ValueError("inverse covariance must be non-singular to invert!")
+        """
         #Do we really need to invert Sigma_tilde_i? :(
         if self.likelihood_function.log_concave:
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
@@ -91,12 +99,17 @@ class Laplace(likelihood):
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
         #f_hat? should be f but we must have optimized for them I guess?
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
-        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST
-                              - 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
-                              + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                              )
+        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                   #- 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                   #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                   #)
+        Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                   + 0.5*self.log_hess_hat_det
+                   + 0.5*mdot(self.f_hat, self.Ki , self.f_hat)
+                   + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                   )

-        self.Z = self.Z_tilde
+        self.Z = Z_tilde
         self.Y = Y_tilde
         self.covariance_matrix = self.Sigma_tilde
         self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
@@ -128,7 +141,7 @@ class Laplace(likelihood):
             return np.squeeze(res)

         def obj_hess(f):
-            res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
+            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
             return np.squeeze(res)

         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
@@ -153,7 +166,10 @@ class Laplace(likelihood):
         #the area of p(f)p(y|f). We do this by matching the height of the distributions at the mode
         #ln z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat + \sum_{n} ln p(y_n|f_n)
         #Unsure whether it's log_hess or log_hess_i
-        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(self.f_hat.T, (self.Ki, self.f_hat))
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        self.ln_z_hat = (-0.5*self.log_hess_hat_det
+                         - 0.5*self.log_Kdet
+                         + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+                         - mdot(self.f_hat.T, (self.Ki, self.f_hat))
+                         )

         return self._compute_GP_variables()

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index c4823703..a299fe3a 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -81,11 +81,7 @@ class student_t(likelihood_function):
         Compute mean, and confidence interval (percentiles 5 and 95) of the prediction
         """
         mean = np.exp(mu)
-        p_025 = stats.t.ppf(025,mean)
-        p_975 = stats.t.ppf(975,mean)
-
-        #p_025 = tmp[:,0]
-        #p_975 = tmp[:,1]
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-        return mean,p_025,p_975
+        p_025 = stats.t.ppf(.025, mean)
+        p_975 = stats.t.ppf(.975, mean)
+        return mean, np.nan*mean, p_025, p_975
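The Student-t derivatives used throughout these patches are easy to get wrong, which is why the
hess line keeps being touched. A standalone finite-difference check of the two closed forms
(assumes only numpy; v, sigma, y, f are arbitrary test values, not from the patches):

import numpy as np

v, sigma = 3.0, 0.7
y, f, h = 1.3, 0.2, 1e-5

def logp(f):
    # constant terms of ln p(y|f) dropped; they do not affect derivatives
    e = y - f
    return -(v + 1) * 0.5 * np.log(1 + (e / sigma)**2 / v)

e = y - f
grad = (v + 1) * e / (v * sigma**2 + e**2)
hess = (v + 1) * (e**2 - v * sigma**2) / ((v * sigma**2 + e**2)**2)

assert abs(grad - (logp(f + h) - logp(f - h)) / (2 * h)) < 1e-6
assert abs(hess - (logp(f + h) - 2 * logp(f) + logp(f - h)) / h**2) < 1e-4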
From 7b0d0550cb01f0c4eca567e80f950e7f54ecb7b2 Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Fri, 22 Mar 2013 12:50:47 +0000
Subject: [PATCH 11/22] Seemed to be working, now it's not

---
 python/examples/laplace_approximations.py | 118 +++++++++++++---------
 python/likelihoods/Laplace.py             |  37 +++----
 2 files changed, 92 insertions(+), 63 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 6f2b19aa..5fb39e08 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -11,15 +11,22 @@ def student_t_approx():
     Example of regressing with a student t likelihood
     """
     #Start a function, any function
-    X = np.sort(np.random.uniform(0, 15, 100))[:, None]
-    Y = np.sin(X)
+    X = np.linspace(0.0, 10.0, 100)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*0.1
+    Yc = Y.copy()
+
+    Y = Y/Y.max()
+
+    Yc[10] += 5
+    Yc[15] += 20
+    Yc = Yc/Yc.max()

     #Add student t random noise to datapoints
-    deg_free = 100000.5
-    real_var = 4
-    t_rv = t(deg_free, loc=0, scale=real_var)
-    noise = t_rv.rvs(size=Y.shape)
-    Y += noise
+    deg_free = 1000000 #100000.5
+    real_var = 0.1
+    #t_rv = t(deg_free, loc=0, scale=real_var)
+    #noise = t_rv.rvs(size=Y.shape)
+    #Y += noise

     #Add some extreme value noise to some of the datapoints
     #percent_corrupted = 0.15
@@ -30,64 +37,83 @@ def student_t_approx():
     #print corrupted_indices
     #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
     #Y[corrupted_indices] += noise
-
+    plt.figure(1)
     # Kernel object
-    print X.shape
-    kernel = GPy.kern.rbf(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()

-    #A GP should completely break down due to the points as they get a lot of weight
-    # create simple GP model
-    #m = GPy.models.GP_regression(X, Y, kernel=kernel)
-
-    ## optimize
+    #print "Clean Gaussian"
+    ##A GP should completely break down due to the points as they get a lot of weight
+    ## create simple GP model
+    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    ### optimize
     #m.ensure_default_constraints()
+    ##m.unconstrain('noise')
+    ##m.constrain_fixed('noise', 0.1)
     #m.optimize()
     ## plot
-    ##m.plot()
+    #plt.subplot(221)
+    #m.plot()
     #print m

+    ##Corrupt
+    #print "Corrupt Gaussian"
+    #m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    #m.ensure_default_constraints()
+    ##m.unconstrain('noise')
+    ##m.constrain_fixed('noise', 0.1)
+    #m.optimize()
+    #plt.subplot(222)
+    #m.plot()
+    #print m

-    #with a student t distribution, since it has heavy tails it should work well
-    likelihood_function = student_t(deg_free, sigma=real_var)
-    lap = Laplace(Y, likelihood_function)
-    cov = kernel.K(X)
-    lap.fit_full(cov)
+    ##with a student t distribution, since it has heavy tails it should work well
+    ##likelihood_function = student_t(deg_free, sigma=real_var)
+    ##lap = Laplace(Y, likelihood_function)
+    ##cov = kernel.K(X)
+    ##lap.fit_full(cov)

-    test_range = np.arange(0, 10, 0.1)
-    plt.plot(test_range, t_rv.pdf(test_range))
-    for i in xrange(X.shape[0]):
-        mode = lap.f_hat[i]
-        covariance = lap.hess_hat_i[i,i]
-        scaling = np.exp(lap.ln_z_hat)
-        normalised_approx = norm(loc=mode, scale=covariance)
-        print "Normal with mode %f, and variance %f" % (mode, covariance)
-        plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
-    plt.show()
+    ##test_range = np.arange(0, 10, 0.1)
+    ##plt.plot(test_range, t_rv.pdf(test_range))
+    ##for i in xrange(X.shape[0]):
+        ##mode = lap.f_hat[i]
+        ##covariance = lap.hess_hat_i[i,i]
+        ##scaling = np.exp(lap.ln_z_hat)
+        ##normalised_approx = norm(loc=mode, scale=covariance)
+        ##print "Normal with mode %f, and variance %f" % (mode, covariance)
+        ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    ##plt.show()

     # Likelihood object
-    t_distribution = student_t(deg_free, sigma=real_var)
+    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
     stu_t_likelihood = Laplace(Y, t_distribution)

-    kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1])
-    m = GPy.models.GP(X, stu_t_likelihood, kernel)
+    print "Clean student t"
+    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    print "NEW MODEL"
-    print(m)
-    # optimize
-    #m.optimize()
-    #print(m)
-
-    # plot
-    m.plot()
-
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-
-    m.optimize()
+    m.update_likelihood_approximation()
+    m.optimize()
     print(m)
+    # plot
+    plt.subplot(211)
+    m.plot_f()
+
+    print "Corrupt student t"
+    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
+    corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(212)
+    m.plot_f()

     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
     return m

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index b002034d..d86523d8 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -33,13 +33,15 @@ class Laplace(likelihood):
         #Initial values
         self.N, self.D = self.data.shape
+        self.is_heteroscedastic = True
+        self.Nparams = 0

         self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))

         #Initial values for the GP variables
-        self.Y = np.zeros((self.N,1))
+        self.Y = np.zeros((self.N, 1))
         self.covariance_matrix = np.eye(self.N)
-        self.precision = np.ones(self.N)[:,None]
+        self.precision = np.ones(self.N)[:, None]
         self.Z = 0
         self.YYT = None
@@ -58,6 +60,7 @@ class Laplace(likelihood):
         pass # TODO: Laplace likelihood might want to take some parameters...

     def _gradients(self, partial):
+        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
@@ -88,10 +91,8 @@ class Laplace(likelihood):
         self.Sigma_tilde_i = self.W #self.hess_hat_i
         #Check it isn't singular!
         epsilon = 1e-2
-        """
         if np.abs(det(self.Sigma_tilde_i)) < epsilon:
             raise ValueError("inverse covariance must be non-singular to invert!")
-        """
         #Do we really need to invert Sigma_tilde_i? :(
         if self.likelihood_function.log_concave:
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
@@ -99,21 +100,17 @@ class Laplace(likelihood):
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
         #f_hat? should be f but we must have optimized for them I guess?
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
-        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-        #- 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
-        #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-        #)
-        Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                   + 0.5*self.log_hess_hat_det
-                   + 0.5*mdot(self.f_hat, self.Ki , self.f_hat)
-                   + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                   )
+        Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                   + 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                   + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                   - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
+                   )

         self.Z = Z_tilde
-        self.Y = Y_tilde
+        self.Y = Y_tilde[:, None]
+        self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
         self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
-        self.YYT = np.dot(self.Y, self.Y.T)
         import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

@@ -122,6 +119,7 @@ class Laplace(likelihood):
         For nomenclature see Rasmussen & Williams 2006
         :K: Covariance matrix
         """
+        self.K = K.copy()
         f = np.zeros((self.N, 1))
         (self.Ki, _, _, self.log_Kdet) = pdinv(K)
         LOG_K_CONST = -(0.5 * self.log_Kdet)
@@ -148,6 +146,11 @@ class Laplace(likelihood):
         #At this point get the hessian matrix
         self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat))
+        if not self.likelihood_function.log_concave:
+            self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                      #if the likelihood is non-log-concave. We want to say that there is a negative variance
+                                      #to cause the posterior to become less certain than the prior and likelihood;
+                                      #this is a property only held by non-log-concave likelihoods
         self.hess_hat = self.Ki + self.W
         (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat)
@@ -166,10 +169,10 @@ class Laplace(likelihood):
         #the area of p(f)p(y|f). We do this by matching the height of the distributions at the mode
         #ln z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat + \sum_{n} ln p(y_n|f_n)
         #Unsure whether it's log_hess or log_hess_i
-        self.ln_z_hat = (-0.5*self.log_hess_hat_det
-                         - 0.5*self.log_Kdet
-                         + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat)
-                         - mdot(self.f_hat.T, (self.Ki, self.f_hat))
-                         )
+        self.ln_z_hat = (- 0.5*self.log_hess_hat_det
+                         + 0.5*self.log_Kdet
+                         + self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+                         - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
+                         )

         return self._compute_GP_variables()
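The Y_tilde manipulated above is the "pseudo-data" device: choose Gaussian targets Y_tilde with
noise covariance Sigma_tilde = W^{-1} so that a purely Gaussian GP conditioned on (Y_tilde,
Sigma_tilde) reproduces the Laplace posterior mean. Requiring the Gaussian posterior mean to equal
the mode, K(K + \tilde{\Sigma})^{-1}\tilde{Y} = \hat{f}, gives (a sketch of the algebra under the
\tilde{\Sigma} = W^{-1} assumption, not a quote from the patch):

$$\tilde{Y} = (K + W^{-1})K^{-1}\hat{f} = (I + W^{-1}K^{-1})\hat{f} = W^{-1}(K^{-1} + W)\hat{f}$$

which is exactly mdot(Sigma_tilde, hess_hat, f_hat) when hess_hat = K^{-1} + W and
Sigma_tilde = W^{-1}.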
From 15d5c2f22dff65a518a4f6a155e457a6516fca17 Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Thu, 28 Mar 2013 17:42:42 +0000
Subject: [PATCH 12/22] Working laplace, just needs predictive values

---
 python/examples/laplace_approximations.py | 80 +++++++++++++----------
 python/likelihoods/Laplace.py             | 15 +++--
 python/likelihoods/likelihood_function.py | 72 ++++++++++++++++++--
 3 files changed, 121 insertions(+), 46 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 5fb39e08..37681849 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -10,20 +10,23 @@ def student_t_approx():
     """
     Example of regressing with a student t likelihood
     """
+    real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 100)[:, None]
-    Y = np.sin(X) + np.random.randn(*X.shape)*0.1
+    X = np.linspace(0.0, 10.0, 30)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
     Yc = Y.copy()

-    Y = Y/Y.max()
+    #Y = Y/Y.max()

-    Yc[10] += 5
-    Yc[15] += 20
-    Yc = Yc/Yc.max()
+    #Yc[10] += 100
+    Yc[25] += 10
+    Yc[23] += 10
+    Yc[24] += 10
+    #Yc = Yc/Yc.max()

     #Add student t random noise to datapoints
-    deg_free = 1000000 #100000.5
-    real_var = 0.1
+    deg_free = 20 #100000.5
+    real_sd = np.sqrt(real_var)
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rv.rvs(size=Y.shape)
     #Y += noise
@@ -38,36 +41,37 @@ def student_t_approx():
     #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
     #Y[corrupted_indices] += noise
     plt.figure(1)
+    plt.suptitle('Gaussian likelihood')
     # Kernel object
     kernel1 = GPy.kern.rbf(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()

-    #print "Clean Gaussian"
-    ##A GP should completely break down due to the points as they get a lot of weight
-    ## create simple GP model
-    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
-    ### optimize
-    #m.ensure_default_constraints()
-    ##m.unconstrain('noise')
-    ##m.constrain_fixed('noise', 0.1)
-    #m.optimize()
-    ## plot
-    #plt.subplot(221)
-    #m.plot()
-    #print m
+    print "Clean Gaussian"
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    ## optimize
+    m.ensure_default_constraints()
+    #m.unconstrain('noise')
+    #m.constrain_fixed('noise', 0.1)
+    m.optimize()
+    # plot
+    plt.subplot(211)
+    m.plot()
+    print m

     ##Corrupt
-    #print "Corrupt Gaussian"
-    #m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
-    #m.ensure_default_constraints()
-    ##m.unconstrain('noise')
-    ##m.constrain_fixed('noise', 0.1)
-    #m.optimize()
-    #plt.subplot(222)
-    #m.plot()
-    #print m
+    print "Corrupt Gaussian"
+    m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    m.ensure_default_constraints()
+    #m.unconstrain('noise')
+    #m.constrain_fixed('noise', 0.1)
+    m.optimize()
+    plt.subplot(212)
+    m.plot()
+    print m

     ##with a student t distribution, since it has heavy tails it should work well
     ##likelihood_function = student_t(deg_free, sigma=real_var)
     ##lap = Laplace(Y, likelihood_function)
     ##cov = kernel.K(X)
     ##lap.fit_full(cov)

     ##test_range = np.arange(0, 10, 0.1)
     ##plt.plot(test_range, t_rv.pdf(test_range))
     ##for i in xrange(X.shape[0]):
         ##mode = lap.f_hat[i]
         ##covariance = lap.hess_hat_i[i,i]
         ##scaling = np.exp(lap.ln_z_hat)
         ##normalised_approx = norm(loc=mode, scale=covariance)
         ##print "Normal with mode %f, and variance %f" % (mode, covariance)
         ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
     ##plt.show()

+    plt.figure(2)
+    plt.suptitle('Student-t likelihood')
+    edited_real_sd = real_sd
+
     # Likelihood object
-    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
-    stu_t_likelihood = Laplace(Y, t_distribution)
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Yc, t_distribution)

     print "Clean student t"
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
     # plot
     plt.subplot(211)
     m.plot_f()
+    plt.ylim(-2.5,2.5)
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

     print "Corrupt student t"
-    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
     corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
     plt.subplot(212)
-    m.plot_f()
-
+    m.plot()
+    plt.ylim(-2.5,2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     return m

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index d86523d8..1411c22b 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -88,11 +88,12 @@ class Laplace(likelihood):
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
         """
-        self.Sigma_tilde_i = self.W #self.hess_hat_i
+        self.Sigma_tilde_i = self.W
         #Check it isn't singular!
-        epsilon = 1e-2
+        epsilon = 1e-6
         if np.abs(det(self.Sigma_tilde_i)) < epsilon:
-            raise ValueError("inverse covariance must be non-singular to invert!")
+            print "WARNING: Transformed covariance matrix is singular!"
+            #raise ValueError("inverse covariance must be non-singular to invert!")
         #Do we really need to invert Sigma_tilde_i? :(
         if self.likelihood_function.log_concave:
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
@@ -110,8 +111,12 @@ class Laplace(likelihood):
         self.Y = Y_tilde[:, None]
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        #if not self.likelihood_function.log_concave:
+            #self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            ##if the likelihood is non-log-concave. We want to say that there is a negative variance
+            ##to cause the posterior to become less certain than the prior and likelihood;
+            ##this is a property only held by non-log-concave likelihoods
+        self.precision = 1 / np.diag(self.covariance_matrix)[:, None]

     def fit_full(self, K):

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index a299fe3a..7ac9c661 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -1,4 +1,5 @@
-from scipy.special import gammaln
+from scipy.special import gammaln, gamma
+from scipy import integrate
 import numpy as np
 from GPy.likelihoods.likelihood_functions import likelihood_function
 from scipy import stats
@@ -79,9 +80,68 @@ class student_t(likelihood_function):
     def predictive_values(self, mu, var):
         """
         Compute mean, and confidence interval (percentiles 2.5 and 97.5) of the prediction

         Need to find what the variance is at the latent points for a student t*normal
         (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
         *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
         """
-        mean = np.exp(mu)
-        p_025 = stats.t.ppf(.025, mean)
-        p_975 = stats.t.ppf(.975, mean)
-        return mean, np.nan*mean, p_025, p_975
+        #p_025 = stats.t.ppf(.025, mu)
+        #p_975 = stats.t.ppf(.975, mu)
+
+        num_test_points = mu.shape[0]
+        #Each mu is the latent point f* at the test point x*,
+        #and the var is the gaussian variance at this point
+        #Take lots of samples from this, so we have lots of possible values
+        #for latent point f* for each test point x*, weighted by how likely we were to pick it
+        print "Taking samples of f* at %d test points" % num_test_points
+        num_f_samples = 10
+        num_y_samples = 10
+        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
+        print "Student t means shape: ", student_t_means.shape
+
+        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
+        #from a student t centred on this point, sample many points from this distribution
+        #centred on f*
+        #for test_point, f in enumerate(student_t_means):
+            #print test_point
+            #print f.shape
+            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
+                                            #scale=self.sigma,
+                                            #size=(num_f_samples, num_y_samples))
+            #print student_t_samples.shape
+
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:,None],
+                                        scale=self.sigma,
+                                        size=(num_test_points, num_y_samples, num_f_samples))
+        student_t_samples = np.reshape(student_t_samples,
+                                       (num_test_points, num_y_samples*num_f_samples))
+
+        #Now take the 97.5 and 2.5 percentiles of these points
+        p_025 = stats.scoreatpercentile(student_t_samples, 2.5, axis=1)[:, None]
+        p_975 = stats.scoreatpercentile(student_t_samples, 97.5, axis=1)[:, None]
+
+        p_025 = 1+p_025
+        p_975 = 1+p_975
+
+        ##Alternatively we could sample from int p(y|f*)p(f*|x*) df*
+        def t_gaussian(f, mu, var):
+            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
+                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
+                    )
+
+        def t_gauss_int(mu, var):
+            print "Mu: ", mu
+            print "var: ", var
+            result = integrate.quad(t_gaussian, -np.inf, 0.975, args=(mu, var))
+            print "Result: ", result
+            return result[0]
+
+        vec_t_gauss_int = np.vectorize(t_gauss_int)
+
+        p_025 = vec_t_gauss_int(mu, var)
+        p_975 = vec_t_gauss_int(mu, var)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+        return mu, np.nan*mu, p_025, p_975
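One detail worth flagging about the sampling scheme above: scipy's scoreatpercentile expects
percentages in [0, 100], not fractions, hence 2.5 and 97.5. The whole two-stage sample collapses
to a few lines with np.percentile (a standalone sketch, assuming only numpy/scipy; the function
name and sample counts are illustrative, not from the patch):

import numpy as np
from scipy import stats

def sampled_interval(mu, var, v, sigma, n_f=500, n_y=100):
    """Monte-Carlo 95% interval for y* ~ int p(y*|f*) N(f*; mu, var) df* at one test point."""
    f_star = np.random.normal(mu, np.sqrt(var), size=n_f)               # samples of f*
    y_star = stats.t.rvs(v, loc=f_star, scale=sigma, size=(n_y, n_f))   # y*|f* samples
    return np.percentile(y_star.ravel(), [2.5, 97.5])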
From ffc168c1d20f36b1e72501176c4a7bb88ff41614 Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Tue, 2 Apr 2013 12:33:01 +0100
Subject: [PATCH 13/22] Added predicted values for student t, works well

---
 python/examples/laplace_approximations.py | 48 +++++++++++------------
 python/likelihoods/likelihood_function.py | 41 ++++++++++++++-----
 2 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 37681849..6374a5fd 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -18,7 +18,7 @@ def student_t_approx():

     #Y = Y/Y.max()

-    #Yc[10] += 100
+    Yc[10] += 100
     Yc[25] += 10
     Yc[23] += 10
     Yc[24] += 10
@@ -52,51 +52,30 @@ def student_t_approx():
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
     m = GPy.models.GP_regression(X, Y, kernel=kernel1)
-    ## optimize
+    # optimize
     m.ensure_default_constraints()
-    #m.unconstrain('noise')
-    #m.constrain_fixed('noise', 0.1)
     m.optimize()
     # plot
     plt.subplot(211)
     m.plot()
     print m

-    ##Corrupt
+    #Corrupt
     print "Corrupt Gaussian"
     m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
     m.ensure_default_constraints()
-    #m.unconstrain('noise')
-    #m.constrain_fixed('noise', 0.1)
     m.optimize()
     plt.subplot(212)
     m.plot()
     print m

-    ##with a student t distribution, since it has heavy tails it should work well
-    ##likelihood_function = student_t(deg_free, sigma=real_var)
-    ##lap = Laplace(Y, likelihood_function)
-    ##cov = kernel.K(X)
-    ##lap.fit_full(cov)
-
-    ##test_range = np.arange(0, 10, 0.1)
-    ##plt.plot(test_range, t_rv.pdf(test_range))
-    ##for i in xrange(X.shape[0]):
-        ##mode = lap.f_hat[i]
-        ##covariance = lap.hess_hat_i[i,i]
-        ##scaling = np.exp(lap.ln_z_hat)
-        ##normalised_approx = norm(loc=mode, scale=covariance)
-        ##print "Normal with mode %f, and variance %f" % (mode, covariance)
-        ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
-    ##plt.show()
-
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd

     # Likelihood object
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Yc, t_distribution)
+    stu_t_likelihood = Laplace(Y, t_distribution)

     print "Clean student t"
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
@@ -86,7 +86,7 @@ def student_t_approx():
     print(m)
     # plot
     plt.subplot(211)
-    m.plot_f()
+    m.plot()
     plt.ylim(-2.5,2.5)
     #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
@@ -124,6 +103,23 @@ def student_t_approx():
     plt.ylim(-2.5,2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

+    ###with a student t distribution, since it has heavy tails it should work well
+    ###likelihood_function = student_t(deg_free, sigma=real_var)
+    ###lap = Laplace(Y, likelihood_function)
+    ###cov = kernel.K(X)
+    ###lap.fit_full(cov)
+
+    ###test_range = np.arange(0, 10, 0.1)
+    ###plt.plot(test_range, t_rv.pdf(test_range))
+    ###for i in xrange(X.shape[0]):
+        ###mode = lap.f_hat[i]
+        ###covariance = lap.hess_hat_i[i,i]
+        ###scaling = np.exp(lap.ln_z_hat)
+        ###normalised_approx = norm(loc=mode, scale=covariance)
+        ###print "Normal with mode %f, and variance %f" % (mode, covariance)
+        ###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    ###plt.show()
+
     return m

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 7ac9c661..61b5c427 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -23,6 +23,10 @@ class student_t(likelihood_function):
         #FIXME: This should be in the superclass
         self.log_concave = False

+    @property
+    def variance(self):
+        return (self.v / float(self.v - 2)) * (self.sigma**2)
+
     def link_function(self, y, f):
         """link_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln\left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$
@@ -79,14 +83,32 @@ class student_t(likelihood_function):

     def predictive_values(self, mu, var):
         """
-        Compute mean, and confidence interval (percentiles 2.5 and 97.5) of the prediction
-
-        Need to find what the variance is at the latent points for a student t*normal
-        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        Compute mean, and confidence interval (percentiles 2.5 and 97.5) of the prediction
+
+        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
         """

+        #We want the variance around test points y, which comes from int p(y*|f*)p(f*) df*
+        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
+        #We are given f* (mu) as the expected value of y*|f*, and its variance around this
+        #was also given to us as (var)
+        #We also need the expected variance of y* around samples f*; this is the variance of the student t distribution,
+        #and the variance of the student t distribution does not depend on f, only on sigma and the degrees of freedom
+        true_var = var + self.variance
+
+        #Now we have an analytical solution for the variance of the distribution p(y*|f*)p(f*) around our test points, but we still
+        #need the 97.5 and 2.5 percentiles.
+        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
+        p_025 = mu - 2.*np.sqrt(true_var)
+        p_975 = mu + 2.*np.sqrt(true_var)
+
+        return mu, np.nan*mu, p_025, p_975
+
+    def sample_predicted_values(self, mu, var):
+        """ Experimental sample approaches and numerical integration """
         #p_025 = stats.t.ppf(.025, mu)
         #p_975 = stats.t.ppf(.975, mu)
@@ -134,14 +156,13 @@ class student_t(likelihood_function):
         def t_gauss_int(mu, var):
             print "Mu: ", mu
             print "var: ", var
-            result = integrate.quad(t_gaussian, -np.inf, 0.975, args=(mu, var))
+            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
             print "Result: ", result
             return result[0]

         vec_t_gauss_int = np.vectorize(t_gauss_int)

-        p_025 = vec_t_gauss_int(mu, var)
-        p_975 = vec_t_gauss_int(mu, var)
+        p = vec_t_gauss_int(mu, var)
+        p_025 = mu - p
+        p_975 = mu + p
         import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
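The decomposition used above is the law of total variance: for y* = f* + noise with Student-t
noise of v degrees of freedom (a finite variance requires v > 2),

$$\operatorname{Var}(y_*) = \operatorname{E}[\operatorname{Var}(y_*|f_*)] + \operatorname{Var}(\operatorname{E}[y_*|f_*]) = \frac{v\sigma^2}{v-2} + \operatorname{Var}(f_*)$$

so true_var = var + v*sigma**2/(v - 2.0), matching the variance property; a Gaussian-style 95%
band is then roughly mu +/- 1.96*np.sqrt(true_var) (a sketch of the identity, not a quote from
the patch, and note the interval needs the standard deviation, not the variance).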
Gaussian" #A GP should completely break down due to the points as they get a lot of weight @@ -58,6 +63,7 @@ def student_t_approx(): # plot plt.subplot(211) m.plot() + plt.plot(X_full, Y_full) print m #Corrupt @@ -67,40 +73,64 @@ def student_t_approx(): m.optimize() plt.subplot(212) m.plot() + plt.plot(X_full, Y_full) print m plt.figure(2) plt.suptitle('Student-t likelihood') edited_real_sd = real_sd - # Likelihood object + print "Clean student t, ncg" t_distribution = student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = Laplace(Y, t_distribution) - - print "Clean student t" + stu_t_likelihood = Laplace(Y, t_distribution, rasm=False) m = GPy.models.GP(X, stu_t_likelihood, kernel3) m.ensure_default_constraints() m.update_likelihood_approximation() - # optimize m.optimize() print(m) - # plot - plt.subplot(211) + plt.subplot(221) m.plot() - plt.ylim(-2.5,2.5) - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) - print "Corrupt student t" + print "Corrupt student t, ncg" t_distribution = student_t(deg_free, sigma=edited_real_sd) - corrupt_stu_t_likelihood = Laplace(Yc, t_distribution) + corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False) + m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(223) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + + print "Clean student t, rasm" + t_distribution = student_t(deg_free, sigma=edited_real_sd) + stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True) + m = GPy.models.GP(X, stu_t_likelihood, kernel6) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(222) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + + print "Corrupt student t, rasm" + t_distribution = student_t(deg_free, sigma=edited_real_sd) + corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) m.ensure_default_constraints() m.update_likelihood_approximation() m.optimize() print(m) - plt.subplot(212) + plt.subplot(224) m.plot() - plt.ylim(-2.5,2.5) + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT ###with a student t distribution, since it has heavy tails it should work well diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 1411c22b..8eb69869 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,16 +1,15 @@ import numpy as np import scipy as sp import GPy -from scipy.linalg import cholesky, eig, inv, det -from functools import partial +from scipy.linalg import cholesky, eig, inv, det, cho_solve from GPy.likelihoods.likelihood import likelihood -from GPy.util.linalg import pdinv,mdot +from GPy.util.linalg import pdinv, mdot, jitchol #import numpy.testing.assert_array_equal class Laplace(likelihood): """Laplace approximation to a posterior""" - def __init__(self, data, likelihood_function): + def __init__(self, data, likelihood_function, rasm=True): """ Laplace Approximation @@ -30,6 +29,7 @@ class Laplace(likelihood): """ self.data = data self.likelihood_function = likelihood_function + self.rasm = rasm #Inital values self.N, self.D = self.data.shape @@ -102,20 +102,16 @@ class Laplace(likelihood): #f_hat? should be f but we must have optimized for them I guess? 
Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) Z_tilde = (self.ln_z_hat - self.NORMAL_CONST - + 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat) + + 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat)) + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat)) ) - self.Z = Z_tilde - self.Y = Y_tilde[:, None] + #Convert to float as its (1, 1) and Z must be a scalar + self.Z = np.float64(Z_tilde) + self.Y = Y_tilde self.YYT = np.dot(self.Y, self.Y.T) self.covariance_matrix = self.Sigma_tilde - #if not self.likelihood_function.log_concave: - #self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur - ##If the likelihood is non-log-concave. We wan't to say that there is a negative variance - ##To cause the posterior to become less certain than the prior and likelihood, - ##This is a property only held by non-log-concave likelihoods self.precision = 1 / np.diag(self.covariance_matrix)[:, None] def fit_full(self, K): @@ -125,32 +121,15 @@ class Laplace(likelihood): :K: Covariance matrix """ self.K = K.copy() - f = np.zeros((self.N, 1)) - (self.Ki, _, _, self.log_Kdet) = pdinv(K) - LOG_K_CONST = -(0.5 * self.log_Kdet) - OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST - #Find \hat(f) using a newton raphson optimizer for example - #TODO: Add newton-raphson as subclass of optimizer class - - #FIXME: Can we get rid of this horrible reshaping? - def obj(f): - #f = f[:, None] - res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST) - return float(res) - - def obj_grad(f): - #f = f[:, None] - res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f)) - return np.squeeze(res) - - def obj_hess(f): - res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki) - return np.squeeze(res) - - self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) + self.Ki, _, _, self.log_Kdet = pdinv(K) + if self.rasm: + self.f_hat = self.rasm_mode(K) + else: + self.f_hat = self.ncg_mode(K) #At this point get the hessian matrix - self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat)) + self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat)) + if not self.likelihood_function.log_concave: self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur #If the likelihood is non-log-concave. We wan't to say that there is a negative variance @@ -176,8 +155,92 @@ class Laplace(likelihood): #Unsure whether its log_hess or log_hess_i self.ln_z_hat = (- 0.5*self.log_hess_hat_det + 0.5*self.log_Kdet - + self.likelihood_function.link_function(self.data[:,0], self.f_hat) + + self.likelihood_function.link_function(self.data, self.f_hat) + #+ self.likelihood_function.link_function(self.data, self.f_hat) - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat)) ) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return self._compute_GP_variables() + + def ncg_mode(self, K): + """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative) + :K: Covariance matrix + :returns: f_mode + """ + self.K = K.copy() + f = np.zeros((self.N, 1)) + (self.Ki, _, _, self.log_Kdet) = pdinv(K) + LOG_K_CONST = -(0.5 * self.log_Kdet) + + #FIXME: Can we get rid of this horrible reshaping? 
+ def obj(f): + res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + + self.NORMAL_CONST + LOG_K_CONST) + return float(res) + + def obj_grad(f): + res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f)) + return np.squeeze(res) + + def obj_hess(f): + res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki) + return np.squeeze(res) + + f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) + return f_hat[:, None] + + def rasm_mode(self, K): + """ + Rasmussens numerically stable mode finding + For nomenclature see Rasmussen & Williams 2006 + + :K: Covariance matrix + :returns: f_mode + """ + f = np.zeros((self.N, 1)) + new_obj = -np.inf + old_obj = np.inf + + def obj(a, f): + #Careful of shape of data! + return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f) + + difference = np.inf + epsilon = 1e-16 + step_size = 1 + while difference > epsilon: + W = -np.diag(self.likelihood_function.link_hess(self.data, f)) + if not self.likelihood_function.log_concave: + #if np.any(W < 0): + #print "NEGATIVE VALUES :(" + #pass + W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + #If the likelihood is non-log-concave. We wan't to say that there is a negative variance + #To cause the posterior to become less certain than the prior and likelihood, + #This is a property only held by non-log-concave likelihoods + #W is diagnoal so its sqrt is just the sqrt of the diagonal elements + W_12 = np.sqrt(W) + B = np.eye(self.N) + mdot(W_12, K, W_12) + L = jitchol(B) + b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)) + #TODO: Check L is lower + solve_L = cho_solve((L, True), mdot(W_12, (K, b))) + a = b - mdot(W_12, solve_L) + f = np.dot(K, a) + old_obj = new_obj + new_obj = obj(a, f) + difference = new_obj - old_obj + #print "Difference: ", new_obj - old_obj + if difference < 0: + #If the objective function isn't rising, restart optimization + print "Reducing step-size, restarting" + #objective function isn't increasing, try reducing step size + step_size *= 0.9 + f = np.zeros((self.N, 1)) + new_obj = -np.inf + old_obj = np.inf + + difference = abs(difference) + + return f diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 50f9b620..15859a81 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -36,7 +36,10 @@ class student_t(likelihood_function): :returns: float(likelihood evaluated for this point) """ + y = np.squeeze(y) + f = np.squeeze(f) assert y.shape == f.shape + e = y - f objective = (gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) @@ -44,6 +47,7 @@ class student_t(likelihood_function): - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v)) ) + print (e**2).shape return np.sum(objective) def link_grad(self, y, f): @@ -57,10 +61,12 @@ class student_t(likelihood_function): :returns: gradient of likelihood evaluated at points """ + y = np.squeeze(y) + f = np.squeeze(f) assert y.shape == f.shape e = y - f grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) - return grad + return np.squeeze(grad) def link_hess(self, y, f): """ @@ -75,11 +81,12 @@ class student_t(likelihood_function): :f: latent variables f :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ + y = np.squeeze(y) + f = 
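The loop above performs one Newton step per iteration in the stable parameterisation of Rasmussen
& Williams (2006), Algorithm 3.1. Stripped of the restart logic it reduces to the following
(a condensed sketch, assuming numpy/scipy and 1-D vectors f, w; link_grad/link_hess are the
per-point derivatives of ln p(y|f) as above; not a drop-in replacement for the patch code):

import numpy as np
from scipy.linalg import cholesky, cho_solve

def newton_step(K, f, y, link_grad, link_hess):
    # W = -d2 ln p(y|f) / df2 as a vector, clipped so B stays positive definite
    w = np.clip(-link_hess(y, f), 1e-6, np.inf)
    w12 = np.sqrt(w)
    # B = I + W^0.5 K W^0.5 has eigenvalues >= 1, so its Cholesky needs no jitter
    B = np.eye(K.shape[0]) + w12[:, None] * K * w12[None, :]
    L = cholesky(B, lower=True)
    b = w * f + link_grad(y, f)
    # a = b - W^0.5 B^-1 W^0.5 K b; then f_new = K a maximises the local quadratic model
    a = b - w12 * cho_solve((L, True), w12 * K.dot(b))
    return K.dot(a)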
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 50f9b620..15859a81 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -36,7 +36,10 @@ class student_t(likelihood_function):
         :returns: float(likelihood evaluated for this point)

         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
+
         e = y - f
         objective = (gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - 0.5*np.log(self.sigma**2 * self.v * np.pi)
                      - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v))
                      )
+        print (e**2).shape
         return np.sum(objective)

     def link_grad(self, y, f):
@@ -57,10 +61,12 @@ class student_t(likelihood_function):
         :returns: gradient of likelihood evaluated at points

         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
-        return grad
+        return np.squeeze(grad)

     def link_hess(self, y, f):
@@ -75,11 +81,12 @@ class student_t(likelihood_function):
         :f: latent variables f
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
-        return hess
+        return np.squeeze(hess)

     def predictive_values(self, mu, var):
         """

From 2006a94caa859d195a7c2af1236eb84656b68cfc Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Wed, 3 Apr 2013 10:55:58 +0100
Subject: [PATCH 16/22] Fixed broadcasting bug, rasm now appears to work

---
 python/likelihoods/Laplace.py             | 16 ++++++++++------
 python/likelihoods/likelihood_function.py |  1 -
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 8eb69869..e967a743 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -159,7 +159,6 @@ class Laplace(likelihood):
                          #+ self.likelihood_function.link_function(self.data, self.f_hat)
                          - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
                          )
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

         return self._compute_GP_variables()
@@ -190,7 +189,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
         return f_hat[:, None]

-    def rasm_mode(self, K):
+    def rasm_mode(self, K, MAX_ITER=5000, MAX_RESTART=30):
         """
         Rasmussen's numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -209,7 +208,9 @@ class Laplace(likelihood):
         difference = np.inf
         epsilon = 1e-16
         step_size = 1
-        while difference > epsilon:
+        rs = 0
+        i = 0
+        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
                 #if np.any(W < 0):
@@ -223,7 +224,7 @@ class Laplace(likelihood):
             W_12 = np.sqrt(W)
             B = np.eye(self.N) + mdot(W_12, K, W_12)
             L = jitchol(B)
-            b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f))
+            b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)[:, None])
             #TODO: Check L is lower
             solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
             a = b - mdot(W_12, solve_L)
@@ -234,13 +235,16 @@ class Laplace(likelihood):
             #print "Difference: ", new_obj - old_obj
             if difference < 0:
                 #If the objective function isn't rising, restart optimization
                 step_size *= 0.9
-                print "Reducing step-size, restarting"
-                #objective function isn't increasing, try reducing step size
+                print "Objective function fell"
+                print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                #objective function isn't increasing, try reducing step size
                 f = np.zeros((self.N, 1))
                 new_obj = -np.inf
                 old_obj = np.inf
+                rs += 1

             difference = abs(difference)
+            i += 1

         return f

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 15859a81..49174ce7 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -47,7 +47,6 @@ class student_t(likelihood_function):
                      - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v))
                      )
-        print (e**2).shape
         return np.sum(objective)

     def link_grad(self, y, f):
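A note on the restart strategy just introduced: restarting from f = 0 whenever a step overshoots
discards all progress made so far. A more conventional damped-Newton loop backtracks only the
offending step (a sketch of the alternative, not what the patch implements; newton_direction and
objective are hypothetical callables standing in for the step and objective defined above):

def damped_newton_update(f, newton_direction, objective, min_step=1e-8):
    """Backtracking variant: halve the step until the objective stops decreasing."""
    f_new = newton_direction(f)
    step = 1.0
    while objective(f + step * (f_new - f)) < objective(f) and step > min_step:
        step *= 0.5
    return f + step * (f_new - f)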
From 4a14a82dfba4bd3c48d4175bb8a861bab24a0d10 Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Fri, 5 Apr 2013 17:34:11 +0100
Subject: [PATCH 17/22] Got the mode finding without computing Ki

---
 python/examples/laplace_approximations.py |  85 +++++++++-----
 python/likelihoods/Laplace.py             | 130 ++++++++++++++++------
 2 files changed, 152 insertions(+), 63 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index a1c71c71..7ab26406 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -6,6 +6,38 @@ from coxGP.python.likelihoods.Laplace import Laplace
 from coxGP.python.likelihoods.likelihood_function import student_t

+def timing():
+    real_var = 0.1
+    times = 1000
+    deg_free = 10
+    real_sd = np.sqrt(real_var)
+    the_is = np.zeros(times)
+    X = np.linspace(0.0, 10.0, 30)[:, None]
+    for a in xrange(times):
+        Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+        Yc = Y.copy()
+
+        Yc[10] += 100
+        Yc[25] += 10
+        Yc[23] += 10
+        Yc[24] += 10
+
+        edited_real_sd = real_sd
+        kernel1 = GPy.kern.rbf(X.shape[1])
+
+        t_distribution = student_t(deg_free, sigma=edited_real_sd)
+        corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
+        m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
+        m.ensure_default_constraints()
+        m.update_likelihood_approximation()
+        m.optimize()
+        the_is[a] = m.likelihood.i
+
+    print the_is
+    print np.mean(the_is)
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+
 def student_t_approx():
     """
     Example of regressing with a student t likelihood
@@ -80,32 +112,6 @@ def student_t_approx():
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd

-    print "Clean student t, ncg"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
-    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    plt.subplot(221)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
-
-    print "Corrupt student t, ncg"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
-    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    plt.subplot(223)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
-
     print "Clean student t, rasm"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
@@ -133,6 +139,33 @@ def student_t_approx():
     plt.ylim(-2.5, 2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

+    print "Clean student t, ncg"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(221)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Corrupt student t, ncg"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(223)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
     ###with a student t distribution, since it has heavy tails it should work well
     ###likelihood_function = student_t(deg_free, sigma=real_var)
     ###lap = Laplace(Y, likelihood_function)
     ###cov = kernel.K(X)
     ###lap.fit_full(cov)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index e967a743..396a0bc7 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -100,12 +100,19 @@ class Laplace(likelihood):
         else:
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
         #f_hat? should be f but we must have optimized for them I guess?
-        Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
+        #Y_tilde = mdot(self.Sigma_tilde, self.hess_hat_i, self.f_hat)
+        Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
+        #KW = np.dot(self.K, self.W)
+        #KW_i, _, _, _ = pdinv(KW)
+        #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat)
-        Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                   + 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
-                   + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                   - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
-                   )
+        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                   #+ 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                   #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                   #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
+                   #)
+        _, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
+        f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
+        Z_tilde = -self.NORMAL_CONST + self.ln_z_hat - 0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f

         #Convert to float as it's (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -125,7 +132,7 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
-        self.Ki, _, _, self.log_Kdet = pdinv(K)
+        self.Ki, _, _, log_Kdet = pdinv(K)
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
             self.f_hat = self.ncg_mode(K)

         #At this point get the hessian matrix
         self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                       #if the likelihood is non-log-concave. We want to say that there is a negative variance
                                       #to cause the posterior to become less certain than the prior and likelihood;
                                       #this is a property only held by non-log-concave likelihoods

+        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
+        self.B, L, self.W_12 = self._compute_B_statistics(K, self.W)
+        self.Bi, _, _, B_det = pdinv(self.B)
+        #ln_W_det = np.linalg.det(self.W)
+        #ln_B_det = np.linalg.det(self.B)
+        ln_det = np.linalg.det(np.eye(self.N) - mdot(self.W_12, self.Bi, self.W_12, K))
+        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
+        #TODO: Check L is lower
+        solve_L = cho_solve((L, True), mdot(self.W_12, (K, b)))
+        a = b - mdot(self.W_12, solve_L)
+        self.f_Ki_f = np.dot(self.f_hat.T, a)

-        self.hess_hat = self.Ki + self.W
-        (self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat)
-
-        ##Check hess_hat is positive definite
-        #try:
-            #cholesky(self.hess_hat)
-        #except:
-            #raise ValueError("Must be positive definite")
-
-        ##Check its eigenvalues are positive
-        #eigenvalues = eig(self.hess_hat)
-        #if not np.all(eigenvalues > 0):
-            #raise ValueError("Eigen values not positive")

         #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
         #the area of p(f)p(y|f). We do this by matching the height of the distributions at the mode
         #ln z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat + \sum_{n} ln p(y_n|f_n)
         #Unsure whether it's log_hess or log_hess_i
-        #self.ln_z_hat = (- 0.5*self.log_hess_hat_i_det
-                         #+ 0.5*self.log_Kdet
-                         #+ self.likelihood_function.link_function(self.data, self.f_hat)
-                         #- 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
-                         #)
+        self.ln_z_hat = (- 0.5*log_Kdet
+                         - 0.5*self.f_Ki_f
+                         + self.likelihood_function.link_function(self.data, self.f_hat)
+                         + 0.5*ln_det
+                         )

         return self._compute_GP_variables()

+    def _compute_B_statistics(self, K, W):
+        """Rasmussen suggests the use of a numerically stable positive definite matrix B,
+        which has a positive diagonal and can be easily inverted
+
+        :K: Covariance matrix
+        :W: Negative hessian at a point (diagonal matrix)
+        :returns: (B, L)
+        """
+        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
+        W_12 = np.sqrt(W)
+        B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
+        L = jitchol(B)
+        return (B, L, W_12)
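# A useful check on the ln_det term used above (and note np.linalg.det returns the
# determinant itself, not its log): with B = I + W^{1/2} K W^{1/2}, the matrix
# W^{1/2} B^{-1} W^{1/2} K has the same spectrum as
# B^{-1} W^{1/2} K W^{1/2} = B^{-1}(B - I) = I - B^{-1}, so
#     det(I - W^{1/2} B^{-1} W^{1/2} K) = 1/det(B)
# and the stable way to get its log is -2*sum(log(diag(L))) for L = chol(B).
# (A sketch of the identity under the clipped diagonal W used here; these comment
# lines are editorial, not lines from the patch.)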
    def ncg_mode(self, K):
        """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuitive)
        :K: Covariance matrix
        :returns: f_mode
        """
        f = np.zeros((self.N, 1))
        LOG_K_CONST = -(0.5 * self.log_Kdet)

        #FIXME: Can we get rid of this horrible reshaping?
        def obj(f):
            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
                        + self.NORMAL_CONST + LOG_K_CONST)
            return float(res)

        def obj_grad(f):
            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
            return np.squeeze(res)

        def obj_hess(f):
            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
            return np.squeeze(res)

        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
        return f_hat[:, None]

    def rasm_mode(self, K, MAX_ITER=5000000000000000, MAX_RESTART=30):
        """
        Rasmussen's numerically stable mode finding
        For nomenclature see Rasmussen & Williams 2006

        :K: Covariance matrix
        :returns: f_mode
        """
        f = np.zeros((self.N, 1))
        new_obj = -np.inf
        old_obj = np.inf

        def obj(a, f):
            #Careful of shape of data!
            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)

        difference = np.inf
        epsilon = 1e-6
        step_size = 1
        rs = 0
        i = 0
        while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART:
            f_old = f.copy()
            W = -np.diag(self.likelihood_function.link_hess(self.data, f))
            if not self.likelihood_function.log_concave:
                #if np.any(W < 0):
                    #print "NEGATIVE VALUES :("
                    #pass
                W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                #if the likelihood is non-log-concave. We want to say that there is a negative variance
                                #to cause the posterior to become less certain than the prior and likelihood;
                                #this is a property only held by non-log-concave likelihoods
            B, L, W_12 = self._compute_B_statistics(K, W)

            W_f = np.dot(W, f)
            grad = self.likelihood_function.link_grad(self.data, f)[:, None]
            #Find K_i_f
            b = W_f + grad
            #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
            #TODO: Check L is lower
            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
            a = b - mdot(W_12, solve_L)
            #f = np.dot(K, a)

            #a should be equal to Ki*f now so should be able to use it
            c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
            solve_L = cho_solve((L, True), mdot(W_12, c))
            f = c - mdot(K, W_12, solve_L)

            #K_w_f = mdot(K, (W, f))
            #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
            #d = f + K_w_f + c
            #solve_L = cho_solve((L, True), mdot(W_12, d))
            #f = c - mdot(K, (W_12, solve_L))
            #a = mdot(self.Ki, f)

            tmp_old_obj = old_obj
            old_obj = new_obj
            new_obj = obj(a, f)
            difference = new_obj - old_obj
            #print "Difference: ", difference
            if difference < 0:
                #print "Objective function fell", difference
                #If the objective function isn't rising, restart optimization
                step_size *= 0.9
                #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                #objective function isn't increasing, try reducing step size
                #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
                old_obj = tmp_old_obj
                rs += 1

            difference = abs(difference)
            i += 1

        self.i = i
        print "{i} steps".format(i=i)
        return f

From 31d8faecf866307c69dcade761ddb77d628b773e Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Fri, 5 Apr 2013 17:56:02 +0100
Subject: [PATCH 18/22] Added timing and realised mdot can be faster as it's
 almost always a diagonal matrix it's multiplying with
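Concretely: with W stored as a vector w rather than a dense diagonal matrix, every product in the
mode-finding loop collapses to elementwise work. A sketch of the replacements this message refers
to (assuming numpy and the same shapes as above; not lines from the patch):

np.dot(W, f)          ->  w * f                             # O(N) instead of O(N^2)
mdot(W_12, K, W_12)   ->  w12[:, None] * K * w12[None, :]   # scale rows and columns
mdot(W_12, solve_L)   ->  w12 * solve_L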
np.mean(the_is) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def student_t_approx(): diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 396a0bc7..734bf6c8 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -128,7 +128,9 @@ class Laplace(likelihood): :K: Covariance matrix """ self.K = K.copy() - self.Ki, _, _, log_Kdet = pdinv(K) + print "Inverting K" + #self.Ki, _, _, log_Kdet = pdinv(K) + print "K inverted, optimising" if self.rasm: self.f_hat = self.rasm_mode(K) else: @@ -196,6 +198,7 @@ class Laplace(likelihood): """ #W is diagnoal so its sqrt is just the sqrt of the diagonal elements W_12 = np.sqrt(W) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT B = np.eye(K.shape[0]) + mdot(W_12, K, W_12) L = jitchol(B) return (B, L, W_12) @@ -205,9 +208,7 @@ class Laplace(likelihood): :K: Covariance matrix :returns: f_mode """ - self.K = K.copy() f = np.zeros((self.N, 1)) - (self.Ki, _, _, self.log_Kdet) = pdinv(K) LOG_K_CONST = -(0.5 * self.log_Kdet) #FIXME: Can we get rid of this horrible reshaping? @@ -227,7 +228,7 @@ class Laplace(likelihood): f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) return f_hat[:, None] - def rasm_mode(self, K, MAX_ITER=5000000000000000, MAX_RESTART=30): + def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50): """ Rasmussens numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -249,6 +250,7 @@ class Laplace(likelihood): rs = 0 i = 0 while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART: + print "optimising" f_old = f.copy() W = -np.diag(self.likelihood_function.link_hess(self.data, f)) if not self.likelihood_function.log_concave: @@ -259,22 +261,25 @@ class Laplace(likelihood): #If the likelihood is non-log-concave. We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods + print "Decomposing" B, L, W_12 = self._compute_B_statistics(K, W) + print "Finding f" - W_f = np.dot(W, f) + W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal! grad = self.likelihood_function.link_grad(self.data, f)[:, None] #Find K_i_f b = W_f + grad #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None] #TODO: Check L is lower - solve_L = cho_solve((L, True), mdot(W_12, (K, b))) - a = b - mdot(W_12, solve_L) + + solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal! + a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal! #f = np.dot(K, a) #a should be equal to Ki*f now so should be able to use it c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad) - solve_L = cho_solve((L, True), mdot(W_12, c)) - f = c - mdot(K, W_12, solve_L) + solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal! + f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal! 

             #K_w_f = mdot(K, (W, f))
             #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
             #d = f + K_w_f + c
             #solve_L = cho_solve((L, True), mdot(W_12, d))
             #f = c - mdot(K, (W_12, solve_L))
             #a = mdot(self.Ki, f)
@@ -302,5 +307,5 @@ class Laplace(likelihood):
             i += 1

         self.i = i
-        print "{i} steps".format(i=i)
+        #print "{i} steps".format(i=i)
         return f
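The commit message's point about mdot can be made concrete. When one factor is diagonal, building it as a dense matrix turns an O(N^2) scaling into an O(N^3) matrix product; broadcasting against the stored diagonal avoids that. A hedged illustration (plain numpy, sizes invented), which is presumably what the FIXME comments in this patch are after:

import numpy as np

N = 200
K = np.random.randn(N, N)
W_12 = np.random.rand(N)            # diagonal of W^{1/2}, stored as a vector

dense = np.dot(np.diag(W_12), np.dot(K, np.diag(W_12)))   # two O(N^3) products
fast = W_12[:, None] * K * W_12[None, :]                  # one O(N^2) scaling

assert np.allclose(dense, fast)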
From 431f93ef231875aeb6adbe6be2c70ea807aafdce Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Mon, 8 Apr 2013 18:09:07 +0100
Subject: [PATCH 19/22] Stabilised most of the algorithm (apart from the end
 inversion which is impossible)

---
 python/likelihoods/Laplace.py | 132 ++++++++++++++++++----------------
 1 file changed, 72 insertions(+), 60 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 734bf6c8..77359769 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -3,9 +3,15 @@ import numpy as np
 import scipy as sp
 import GPy
 from scipy.linalg import cholesky, eig, inv, det, cho_solve
 from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv, mdot, jitchol
+from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
+from scipy.linalg.lapack import dtrtrs
 #import numpy.testing.assert_array_equal

+#TODO: Move this to utils
+def det_ln_diag(A):
+    return np.log(np.diagonal(A)).sum()
+
+
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""

@@ -60,7 +66,6 @@ class Laplace(likelihood):
         pass # TODO: Laplace likelihood might want to take some parameters...

     def _gradients(self, partial):
-        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError

@@ -99,9 +104,26 @@ class Laplace(likelihood):
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
         else:
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
-        #f_hat? should be f but we must have optimized for them I guess?
-        #Y_tilde = mdot(self.Sigma_tilde, self.hess_hat_i, self.f_hat)
         Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
+
+        #dtrtri -> L -> L_i
+        #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
+        #((L.T*W)_i + I)f_hat = y_tilde
+        L = jitchol(self.K)
+        Li = chol_inv(L)
+        Lt_W = np.dot(L.T, self.W)
+        if np.abs(det(Lt_W)) < epsilon:
+            print "WARNING: Transformed covariance matrix is singular!"
+        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
+        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
+        import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
+
+        #if np.abs(det(KW)) < epsilon:
+            #print "WARNING: Transformed covariance matrix is singular!"
+        #KW_i = inv(KW)
+        #Y_tilde = mdot(KW_i + np.eye(self.N), self.f_hat)
+
+        #Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
         #KW = np.dot(self.K, self.W)
         #KW_i, _, _, _ = pdinv(KW)
         #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat)
@@ -110,16 +132,38 @@ class Laplace(likelihood):
         #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
         #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
         #)
-        _, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
-        f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
-        Z_tilde = -self.NORMAL_CONST + self.ln_z_hat - 0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f
+        #_, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
+        #f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
+        #Z_tilde = -self.NORMAL_CONST + self.ln_z_hat - 0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f
+
+        #f_W_f = mdot(self.f_hat.T, self.W, self.f_hat)
+        #f_Y_f = mdot(Y_tilde, self.W, Y_tilde)
+        #Z_tilde = (np.dot(self.W, self.f_hat) - 0.5*y_W_y + self.ln_z_hat
+        #- 0.5*mdot(self.f_hat, (
+
+        f_Ki_W_f = mdot(self.f_hat.T, (self.Ki + self.W), self.f_hat)
+        y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
+        y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
+        self.ln_W_det = det_ln_diag(self.W)
+        Z_tilde = (self.NORMAL_CONST
+                   - 0.5*self.ln_K_det
+                   - 0.5*self.ln_W_det
+                   - 0.5*self.ln_Ki_W_i_det
+                   - 0.5*f_Ki_W_f
+                   - 0.5*y_W_y
+                   + y_W_f
+                   + self.ln_z_hat
+                   )
+
+        Sigma_tilde = inv(self.W)  # Damn

         #Convert to float as it's (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = self.Sigma_tilde
+        self.covariance_matrix = Sigma_tilde
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
+        import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT

     def fit_full(self, K):
         """
@@ -128,9 +172,7 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
-        print "Inverting K"
-        #self.Ki, _, _, log_Kdet = pdinv(K)
-        print "K inverted, optimising"
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
@@ -144,46 +186,24 @@ class Laplace(likelihood):
                 #If the likelihood is non-log-concave. We want to say that there is a negative variance
                #To cause the posterior to become less certain than the prior and likelihood,
                #This is a property only held by non-log-concave likelihoods

+        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
-        self.B, L, self.W_12 = self._compute_B_statistics(K, self.W)
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
-        #ln_W_det = np.linalg.det(self.W)
-        #ln_B_det = np.linalg.det(self.B)
-        ln_det = np.linalg.det(np.eye(self.N) - mdot(self.W_12, self.Bi, self.W_12, K))
+
+        Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
+        self.ln_Ki_W_i_det = np.log(np.linalg.det(Ki_W_i))  # the log determinant, not the raw determinant
+
         b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
-        #TODO: Check L is lower
-        solve_L = cho_solve((L, True), mdot(self.W_12, (K, b)))
-        a = b - mdot(self.W_12, solve_L)
+        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
+        a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)

-        #self.hess_hat = self.Ki + self.W
-        #(self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat)
-
-        ##Check hess_hat is positive definite
-        #try:
-            #cholesky(self.hess_hat)
-        #except:
-            #raise ValueError("Must be positive definite")
-
-        ##Check its eigenvalues are positive
-        #eigenvalues = eig(self.hess_hat)
-        #if not np.all(eigenvalues > 0):
-            #raise ValueError("Eigen values not positive")
-
-        #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
-        #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
-        #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
-        #Unsure whether it's log_hess or log_hess_i
-        #self.ln_z_hat = (- 0.5*self.log_hess_hat_i_det
                          #+ 0.5*self.log_Kdet
                          #+ self.likelihood_function.link_function(self.data, self.f_hat)
                          ##+ self.likelihood_function.link_function(self.data, self.f_hat)
                          #- 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
                          #)
-        self.ln_z_hat = (- 0.5*log_Kdet
+        self.ln_z_hat = (self.NORMAL_CONST
                         - 0.5*self.f_Ki_f
+                        - 0.5*self.ln_K_det
+                        + 0.5*self.ln_Ki_W_i_det
                         + self.likelihood_function.link_function(self.data, self.f_hat)
-                        + 0.5*ln_det
                         )

         return self._compute_GP_variables()
@@ -198,7 +218,7 @@ class Laplace(likelihood):
         """
         #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
+        #import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
         B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
         L = jitchol(B)
         return (B, L, W_12)
@@ -209,12 +229,12 @@ class Laplace(likelihood):
         :K: Covariance matrix
         :returns: f_mode
         """
         f = np.zeros((self.N, 1))
-        LOG_K_CONST = -(0.5 * self.log_Kdet)

         #FIXME: Can we get rid of this horrible reshaping?
         #ONLY WORKS FOR 1D DATA
         def obj(f):
             res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
-                        + self.NORMAL_CONST + LOG_K_CONST)
+                        + self.NORMAL_CONST)
             return float(res)

         def obj_grad(f):
@@ -249,21 +269,15 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
-        while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART:
-            print "optimising"
+        while difference > epsilon:  # and i < MAX_ITER and rs < MAX_RESTART:
             f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
-                #if np.any(W < 0):
                     #print "NEGATIVE VALUES :("
                     #pass
                 W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                 #If the likelihood is non-log-concave. We want to say that there is a negative variance
                 #To cause the posterior to become less certain than the prior and likelihood,
                 #This is a property only held by non-log-concave likelihoods
-            print "Decomposing"
             B, L, W_12 = self._compute_B_statistics(K, W)
-            print "Finding f"

             W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal!
             grad = self.likelihood_function.link_grad(self.data, f)[:, None]
             #Find K_i_f
             b = W_f + grad
             #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
             #TODO: Check L is lower
-            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
-            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
-            #f = np.dot(K, a)
-
             #a should be equal to Ki*f now so should be able to use it
             c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
             solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal!
             f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!

+            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
+            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
+            #f = np.dot(K, a)
+
             #K_w_f = mdot(K, (W, f))
             #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
             #d = f + K_w_f + c
             #solve_L = cho_solve((L, True), mdot(W_12, d))
             #f = c - mdot(K, (W_12, solve_L))
             #a = mdot(self.Ki, f)

             tmp_old_obj = old_obj
             old_obj = new_obj
             new_obj = obj(a, f)
             difference = new_obj - old_obj
-            #print "Difference: ", difference
             if difference < 0:
                 #print "Objective function rose", difference
                 #If the objective function isn't rising, restart optimization
@@ -307,5 +320,4 @@ class Laplace(likelihood):
             i += 1

         self.i = i
-        #print "{i} steps".format(i=i)
         return f
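Before the next patch builds on it, the pseudo-target trick introduced here can be checked in isolation: with K = L L^T, the quantity ((L^T W)^{-1} L^{-1} + I) f equals W^{-1}(K^{-1} + W) f, so the "Gaussianised" targets come out of two triangular solves rather than an inversion of K. A sketch under stated assumptions (K symmetric positive definite, W diagonal and positive; names are invented, only numpy/scipy assumed):

import numpy as np
from scipy.linalg import cholesky, solve_triangular

np.random.seed(1)
N = 6
A = np.random.randn(N, N)
K = np.dot(A, A.T) + N * np.eye(N)
W = np.diag(np.random.rand(N) + 0.5)
f_hat = np.random.randn(N, 1)

L = cholesky(K, lower=True)
Li_f = solve_triangular(L, f_hat, lower=True)              # L^{-1} f_hat
tmp = solve_triangular(np.dot(L.T, W), Li_f, lower=False)  # (L^T W)^{-1} L^{-1} f_hat
Y_tilde = tmp + f_hat                                      # = W^{-1}(K^{-1} + W) f_hat

naive = np.dot(np.linalg.inv(W), np.dot(np.linalg.inv(K) + W, f_hat))
assert np.allclose(Y_tilde, naive)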
From e0c1e4a4df600d24f075cc13a359a4bc77dfcff3 Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Mon, 8 Apr 2013 19:58:54 +0100
Subject: [PATCH 20/22] Fixed the Laplace approximation and made it more
 numerically stable with Cholesky decompositions, and commented

---
 python/examples/laplace_approximations.py |   1 -
 python/likelihoods/Laplace.py             | 142 ++++++++++------------
 2 files changed, 65 insertions(+), 78 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 28a92c61..0500ba02 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -140,7 +140,6 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
-    import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT

     print "Clean student t, ncg"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 77359769..27ab7613 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,17 +1,32 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, det, cho_solve
+from scipy.linalg import cholesky, eig, inv, cho_solve
+from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
 from scipy.linalg.lapack import dtrtrs
-#import numpy.testing.assert_array_equal

 #TODO: Move this to utils
+
+
 def det_ln_diag(A):
+    """
+    log determinant of a diagonal matrix
+    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
+    """
     return np.log(np.diagonal(A)).sum()


+def pddet(A):
+    """
+    Log determinant of a positive definite matrix, computed via Cholesky
+    """
+    L = cholesky(A)
+    logdetA = 2*sum(np.log(np.diag(L)))
+    return logdetA
+
+
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""

@@ -30,7 +45,8 @@ class Laplace(likelihood):
         ---------

         :data: @todo
-        :likelihood_function: @todo
+        :likelihood_function: likelihood function - subclass of likelihood_function
+        :rasm: Flag of whether to use Rasmussen's numerically stable mode finding or simple ncg optimisation

         """
         self.data = data
@@ -63,10 +79,10 @@ class Laplace(likelihood):
         return []

     def _set_params(self, p):
-        pass # TODO: Laplace likelihood might want to take some parameters...
+        pass  # TODO: Laplace likelihood might want to take some parameters...

     def _gradients(self, partial):
-        return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
+        return np.zeros(0)  # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError

@@ -91,20 +107,10 @@ class Laplace(likelihood):
         i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log p(y|f))$$
         since $diag(\nabla\nabla \log p(y|f)) = H - K^{-1}$
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
+        $$\tilde{\Sigma} = W^{-1}$$
         """
-        self.Sigma_tilde_i = self.W
-        #Check it isn't singular!
         epsilon = 1e-6
-        if np.abs(det(self.Sigma_tilde_i)) < epsilon:
-            print "WARNING: Transformed covariance matrix is singular!"
-            #raise ValueError("inverse covariance must be non-singular to invert!")
-        #Do we really need to inverse Sigma_tilde_i? :(
-        if self.likelihood_function.log_concave:
-            (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
-        else:
-            self.Sigma_tilde = inv(self.Sigma_tilde_i)
-        Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)

         #dtrtri -> L -> L_i
         #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
         #((L.T*W)_i + I)f_hat = y_tilde
         L = jitchol(self.K)
         Li = chol_inv(L)
         Lt_W = np.dot(L.T, self.W)
-        if np.abs(det(Lt_W)) < epsilon:
-            print "WARNING: Transformed covariance matrix is singular!"
+
+        ##Check it isn't singular!
+        if cond(Lt_W) > 1e14:
+            print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
+
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
         Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
-        import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT

-        #if np.abs(det(KW)) < epsilon:
-            #print "WARNING: Transformed covariance matrix is singular!"
-        #KW_i = inv(KW)
-        #Y_tilde = mdot(KW_i + np.eye(self.N), self.f_hat)
+        #f.T(Ki + W)f
+        f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
+                    + mdot(self.f_hat.T, self.W, self.f_hat)
+                    )

-        #Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
-        f_Ki_W_f = mdot(self.f_hat.T, (self.Ki + self.W), self.f_hat)
         y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
         y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
-        self.ln_W_det = det_ln_diag(self.W)
+        ln_W_det = det_ln_diag(self.W)
         Z_tilde = (self.NORMAL_CONST
                    - 0.5*self.ln_K_det
-                   - 0.5*self.ln_W_det
+                   - 0.5*ln_W_det
                    - 0.5*self.ln_Ki_W_i_det
                    - 0.5*f_Ki_W_f
                    - 0.5*y_W_y
                    + y_W_f
                    + self.ln_z_hat
                    )

         ##Check it isn't singular!
+        if cond(self.W) > 1e14:
+            print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
+
+        Sigma_tilde = inv(self.W)  # Damn

         #Convert to float as it's (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = Sigma_tilde
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
-        import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT

     def fit_full(self, K):
         """
         The laplace approximation algorithm
-        For nomenclature see Rasmussen & Williams 2006
+        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
         :K: Covariance matrix
         """
         self.K = K.copy()
-        self.Ki, _, _, self.ln_K_det = pdinv(K)
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
@@ -182,10 +173,10 @@ class Laplace(likelihood):
         self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))

         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-            #If the likelihood is non-log-concave. We want to say that there is a negative variance
-            #To cause the posterior to become less certain than the prior and likelihood,
-            #This is a property only held by non-log-concave likelihoods
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            #If the likelihood is non-log-concave. We want to say that there is a negative variance
+            #To cause the posterior to become less certain than the prior and likelihood,
+            #This is a property only held by non-log-concave likelihoods

         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)

         Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
         self.ln_Ki_W_i_det = np.log(np.linalg.det(Ki_W_i))  # the log determinant, not the raw determinant

         b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
         a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)
+        self.ln_K_det = pddet(self.K)

-        self.ln_z_hat = ( self.NORMAL_CONST
+        self.ln_z_hat = (self.NORMAL_CONST
                         - 0.5*self.f_Ki_f
                         - 0.5*self.ln_K_det
                         + 0.5*self.ln_Ki_W_i_det
                         + self.likelihood_function.link_function(self.data, self.f_hat)
                         )

         return self._compute_GP_variables()
@@ -219,26 +211,29 @@ class Laplace(likelihood):
         #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
         #import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
-        B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
+        B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
         L = jitchol(B)
         return (B, L, W_12)

     def ncg_mode(self, K):
-        """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuitive)
+        """
+        Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuitive)
         :K: Covariance matrix
         :returns: f_mode
         """
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
+
         f = np.zeros((self.N, 1))

         #FIXME: Can we get rid of this horrible reshaping?
         #ONLY WORKS FOR 1D DATA
         def obj(f):
-            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
                         + self.NORMAL_CONST)
             return float(res)

         def obj_grad(f):
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - np.dot(self.Ki, f))
             return np.squeeze(res)

         def obj_hess(f):
@@ -254,6 +249,8 @@ class Laplace(likelihood):
         For nomenclature see Rasmussen & Williams 2006

         :K: Covariance matrix
+        :MAX_ITER: Maximum number of iterations of Newton-Raphson before forcing finish of optimisation
+        :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
         :returns: f_mode
         """
         f = np.zeros((self.N, 1))
@@ -269,39 +266,30 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
-        while difference > epsilon:  # and i < MAX_ITER and rs < MAX_RESTART:
+        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-                #If the likelihood is non-log-concave. We want to say that there is a negative variance
-                #To cause the posterior to become less certain than the prior and likelihood,
-                #This is a property only held by non-log-concave likelihoods
+                W[W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                # If the likelihood is non-log-concave. We want to say that there is a negative variance
+                # To cause the posterior to become less certain than the prior and likelihood,
+                # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)

-            W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal!
+            W_f = np.dot(W, f)
             grad = self.likelihood_function.link_grad(self.data, f)[:, None]
             #Find K_i_f
             b = W_f + grad
-            #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
-            #TODO: Check L is lower

             #a should be equal to Ki*f now so should be able to use it
-            c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-            solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal!
-            f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
+            c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
+            solve_L = cho_solve((L, True), np.dot(W_12, c))
+            f = c - np.dot(K, np.dot(W_12, solve_L))

-            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
-            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
+            solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b)))
+            a = b - np.dot(W_12, solve_L)
             #f = np.dot(K, a)

-            #K_w_f = mdot(K, (W, f))
-            #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
-            #d = f + K_w_f + c
-            #solve_L = cho_solve((L, True), mdot(W_12, d))
-            #f = c - mdot(K, (W_12, solve_L))
-            #a = mdot(self.Ki, f)
-
             tmp_old_obj = old_obj
             old_obj = new_obj
             new_obj = obj(a, f)

From 65481d7a73b8fe965a99b82126431ae2668958db Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Wed, 10 Apr 2013 13:43:13 +0100
Subject: [PATCH 21/22] Fixed the z scalings

---
 python/examples/laplace_approximations.py |  8 +++----
 python/likelihoods/Laplace.py             | 28 +++++++++++++++--------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 0500ba02..5b1331b6 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -12,7 +12,7 @@ def timing():
     deg_free = 10
     real_sd = np.sqrt(real_var)
     the_is = np.zeros(times)
-    X = np.linspace(0.0, 10.0, 500)[:, None]
+    X = np.linspace(0.0, 10.0, 300)[:, None]

     for a in xrange(times):
         Y = np.sin(X) + np.random.randn(*X.shape)*real_var
@@ -22,8 +22,8 @@ def timing():
         Yc[25] += 10
         Yc[23] += 10
         Yc[24] += 10
-        Yc[300] += 10
-        Yc[400] += 10000
+        Yc[250] += 10
+        #Yc[4] += 10000
         edited_real_sd = real_sd

         kernel1 = GPy.kern.rbf(X.shape[1])
@@ -36,7 +36,7 @@ def timing():
         m.optimize()
         the_is[a] = m.likelihood.i

-    import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
+    #import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
     print the_is
     print np.mean(the_is)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 27ab7613..8ef8fb62 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,7 +1,7 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, cho_solve
+from scipy.linalg import cholesky, eig, inv, cho_solve, det
 from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
@@ -134,15 +134,24 @@ class Laplace(likelihood):
         y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
         y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
         ln_W_det = det_ln_diag(self.W)
-        Z_tilde = (self.NORMAL_CONST
-                   - 0.5*self.ln_K_det
-                   - 0.5*ln_W_det
-                   - 0.5*self.ln_Ki_W_i_det
-                   - 0.5*f_Ki_W_f
-                   - 0.5*y_W_y
-                   + y_W_f
+        Z_tilde = (- self.NORMAL_CONST
+                   + 0.5*self.ln_K_det
+                   + 0.5*ln_W_det
+                   + 0.5*self.ln_Ki_W_i_det
+                   + 0.5*f_Ki_W_f
+                   + 0.5*y_W_y
+                   - y_W_f
                    + self.ln_z_hat
                    )
+        #Z_tilde = (self.NORMAL_CONST
                    #- 0.5*self.ln_K_det
                    #- 0.5*ln_W_det
                    #- 0.5*self.ln_Ki_W_i_det
                    #- 0.5*f_Ki_W_f
                    #- 0.5*y_W_y
                    #+ y_W_f
                    #+ self.ln_z_hat
                    #)
@@ -191,8 +200,7 @@ class Laplace(likelihood):
         self.f_Ki_f = np.dot(self.f_hat.T, a)
         self.ln_K_det = pddet(self.K)

-        self.ln_z_hat = (self.NORMAL_CONST
-                        - 0.5*self.f_Ki_f
+        self.ln_z_hat = (- 0.5*self.f_Ki_f
                         - 0.5*self.ln_K_det
                         + 0.5*self.ln_Ki_W_i_det
                         + self.likelihood_function.link_function(self.data, self.f_hat)
                         )
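The scaling this patch settles on can be written in one place. The following is a transcription of the code above rather than an independent derivation, with $C$ standing for the class's NORMAL_CONST (which the earlier docstring suggests is $\frac{N}{2}\ln 2\pi$):

$$\ln \tilde{Z} = \ln \hat{z} - C + \tfrac{1}{2}\ln|K| + \tfrac{1}{2}\ln|W| + \tfrac{1}{2}\ln\left|(K^{-1} + W)^{-1}\right| + \tfrac{1}{2}\hat{f}^{\top}(K^{-1} + W)\hat{f} + \tfrac{1}{2}\tilde{Y}^{\top}W\tilde{Y} - \tilde{Y}^{\top}W\hat{f}$$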
From 9bbb11b825f7c395a040e2385d6a2c88aa1c143e Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Wed, 10 Apr 2013 15:43:31 +0100
Subject: [PATCH 22/22] Adding Weibull likelihood, requires 'extra_data' to be
 passed to the likelihood, i.e. the censoring information

---
 python/likelihoods/Laplace.py             | 24 +++---
 python/likelihoods/likelihood_function.py | 99 +++++++++++++++++++++--
 2 files changed, 104 insertions(+), 19 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 8ef8fb62..4d94ba0f 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -30,7 +30,7 @@ class Laplace(likelihood):
     """Laplace approximation to a posterior"""

-    def __init__(self, data, likelihood_function, rasm=True):
+    def __init__(self, data, likelihood_function, extra_data=None, rasm=True):
         """
         Laplace Approximation

@@ -44,13 +44,15 @@ class Laplace(likelihood):
         Arguments
         ---------

-        :data: @todo
+        :data: array of data the likelihood function is approximating
         :likelihood_function: likelihood function - subclass of likelihood_function
+        :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
         :rasm: Flag of whether to use Rasmussen's numerically stable mode finding or simple ncg optimisation

         """
         self.data = data
         self.likelihood_function = likelihood_function
+        self.extra_data = extra_data
         self.rasm = rasm

         #Initial values
@@ -179,7 +181,7 @@ class Laplace(likelihood):
             self.f_hat = self.ncg_mode(K)

         #At this point get the hessian matrix
-        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
+        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))

         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
@@ -194,7 +196,7 @@ class Laplace(likelihood):
         Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
         self.ln_Ki_W_i_det = np.log(np.linalg.det(Ki_W_i))  # the log determinant, not the raw determinant

-        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
+        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
         a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)
@@ -203,7 +205,7 @@ class Laplace(likelihood):
         self.ln_z_hat = (- 0.5*self.f_Ki_f
                         - 0.5*self.ln_K_det
                         + 0.5*self.ln_Ki_W_i_det
-                        + self.likelihood_function.link_function(self.data, self.f_hat)
+                        + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
                         )

         return self._compute_GP_variables()
@@ -236,16 +238,16 @@ class Laplace(likelihood):
         #FIXME: Can we get rid of this horrible reshaping?
         #ONLY WORKS FOR 1D DATA
         def obj(f):
-            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
                         + self.NORMAL_CONST)
             return float(res)

         def obj_grad(f):
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - np.dot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
             return np.squeeze(res)

         def obj_hess(f):
-            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
+            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
             return np.squeeze(res)

         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
@@ -267,7 +269,7 @@ class Laplace(likelihood):

         def obj(a, f):
             #Careful of shape of data!
-            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)
+            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)

         difference = np.inf
         epsilon = 1e-6
@@ -276,7 +278,7 @@ class Laplace(likelihood):
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             f_old = f.copy()
-            W = -np.diag(self.likelihood_function.link_hess(self.data, f))
+            W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
             if not self.likelihood_function.log_concave:
                 W[W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                 # If the likelihood is non-log-concave. We want to say that there is a negative variance
                 # To cause the posterior to become less certain than the prior and likelihood,
                 # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)

             W_f = np.dot(W, f)
-            grad = self.likelihood_function.link_grad(self.data, f)[:, None]
+            grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None]
             #Find K_i_f
             b = W_f + grad
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 49174ce7..0d421882 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -4,6 +4,7 @@ import numpy as np
 from GPy.likelihoods.likelihood_functions import likelihood_function
 from scipy import stats

+
 class student_t(likelihood_function):
     """Student t likelihood distribution
     For nomenclature see Bayesian Data Analysis 2003 p576

     Laplace:
     Needs functions to calculate
     ln p(yi|fi)
     dln p(yi|fi)_dfi
     d2ln p(yi|fi)_d2fi
@@ -24,15 +25,16 @@ class student_t(likelihood_function):
         self.log_concave = False

     @property
-    def variance(self):
+    def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * (self.sigma**2)

-    def link_function(self, y, f):
+    def link_function(self, y, f, extra_data=None):
         """link_function $\ln p(y|f)$

         $$\ln p(y_{i}|f_{i}) = \ln \Gamma\left(\frac{v+1}{2}\right) - \ln\left(\Gamma\left(\frac{v}{2}\right)\sqrt{v \pi}\sigma\right) - \frac{v+1}{2}\ln\left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$

         :y: data
         :f: latent variables f
+        :extra_data: extra_data which is not used in the student t distribution
         :returns: float(likelihood evaluated for this point)

         """
@@ -49,7 +51,7 @@ class student_t(likelihood_function):
                     )
         return np.sum(objective)

-    def link_grad(self, y, f):
+    def link_grad(self, y, f, extra_data=None):
         """
         Gradient of the link function at y, given f w.r.t f

         $$\frac{d}{df_{i}} \ln p(y_{i}|f_{i}) = \frac{(v + 1)(y_{i} - f_{i})}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$

         :y: data
         :f: latent variables f
+        :extra_data: extra_data which is not used in the student t distribution
         :returns: gradient of likelihood evaluated at points

         """
@@ -67,17 +70,18 @@ class student_t(likelihood_function):
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
         return np.squeeze(grad)

-    def link_hess(self, y, f):
+    def link_hess(self, y, f, extra_data=None):
         """
         Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
         i.e. second derivative link_function at y given f f_j w.r.t f and f_j

-        Will return diaganol of hessian, since every where else it is 0
+        Will return diagonal of hessian, since everywhere else it is 0

         $$\frac{d^{2}\ln p(y_{i}|f_{i})}{df_{i}^{2}} = \frac{(v + 1)\left((y_{i} - f_{i})^{2} - v\sigma^{2}\right)}{\left(v \sigma^{2} + (y_{i} - f_{i})^{2}\right)^{2}}$$

         :y: data
         :f: latent variables f
+        :extra_data: extra_data which is not used in the student t distribution
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
         y = np.squeeze(y)
@@ -139,7 +143,7 @@ class student_t(likelihood_function):
                                           #size=(num_f_samples, num_y_samples))
         #print student_t_samples.shape

-        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:,None],
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
                                         scale=self.sigma,
                                         size=(num_test_points, num_y_samples, num_f_samples))
         student_t_samples = np.reshape(student_t_samples,
@@ -152,7 +156,7 @@ class student_t(likelihood_function):
         ##Alternatively we could sample from int p(y|f*)p(f*|x*) df*
         def t_gaussian(f, mu, var):
             return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
-                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
+                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) * ((mu-f)**2)))
                     )

         def t_gauss_int(mu, var):
@@ -167,4 +171,83 @@ class student_t(likelihood_function):
         p = vec_t_gauss_int(mu, var)
         p_025 = mu - p
         p_975 = mu + p
-        import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
+        return mu, np.nan*mu, p_025, p_975
+
+
+class weibull_survival(likelihood_function):
+    """Weibull likelihood distribution for survival analysis with censoring
+    For nomenclature see Bayesian Survival Analysis
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, shape, scale):
+        self.shape = shape
+        self.scale = scale
+
+        #FIXME: This should be in the superclass
+        self.log_concave = True
+
+    def link_function(self, y, f, extra_data=None):
+        """
+        link_function $\ln p(y|f)$, i.e. log likelihood
+
+        $$\ln p(y_{i}|f_{i}) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: time of event data
+        :f: latent variables f
+        :extra_data: the event indicator $v_{i}$, 1 if the event was observed, 0 if censored
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f)  # FIXME: CHECK THIS WITH BOOK, where's scale?
+        return np.sum(objective)
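Since the FIXME above questions the formula, a quick finite-difference check of this log likelihood against the gradient implemented just below is cheap to run standalone (pure numpy; the shape parameter and data are made up for the check):

import numpy as np

alpha = 1.5
y = np.array([0.7, 2.1, 3.3])
v = np.array([1.0, 0.0, 1.0])        # event indicator, middle point censored
f = np.array([0.2, -0.4, 0.1])

def loglik(f):
    return np.sum(v*(np.log(alpha) + (alpha - 1)*np.log(y) + f) - (y**alpha)*np.exp(f))

grad = v - (y**alpha)*np.exp(f)      # analytic gradient, as in link_grad below
eps = 1e-6
for i in range(len(f)):
    df = np.zeros_like(f); df[i] = eps
    num = (loglik(f + df) - loglik(f - df)) / (2*eps)
    assert abs(num - grad[i]) < 1e-5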
+
+    def link_grad(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{d}{df_{i}} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: the event indicator, 1 if the event was observed, 0 if censored
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        grad = v - (y**self.shape)*np.exp(f)
+        return np.squeeze(grad)
+
+    def link_hess(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j w.r.t f and f_j
+
+        Will return diagonal of hessian, since everywhere else it is 0
+
+        $$\frac{d^{2}\ln p(y_{i}|f_{i})}{df_{i}^{2}} = -y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: the event indicator, which is not used in the hessian
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        hess = -(y**self.shape)*np.exp(f)  # always negative, so this likelihood is log-concave in f
+        return np.squeeze(hess)
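To close the series, this is the shape of data the new likelihood consumes: event times y paired with an event indicator v passed through extra_data. A self-contained simulation sketch (pure numpy; parameter values are illustrative only, and nothing here touches the repository's classes):

import numpy as np

np.random.seed(2)
alpha, n = 1.5, 200
t = np.random.weibull(alpha, size=n)          # true event times
c = np.random.uniform(0.5, 2.0, size=n)       # independent censoring times
y = np.minimum(t, c)                          # observed time is the earlier of the two
v = (t <= c).astype(float)                    # 1 if the event was observed before censoring

f = np.zeros(n)                               # latent log hazard scale, here just zeros
loglik = np.sum(v*(np.log(alpha) + (alpha - 1)*np.log(y) + f)
                - (y**alpha)*np.exp(f))
print("censoring rate: %.2f, loglik: %.1f" % (1 - v.mean(), loglik))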