From 67248ab7c2b0becf471fe08638d35cf0786ee1a2 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Tue, 12 Mar 2013 03:16:33 -0700 Subject: [PATCH 01/71] Initial commit --- .gitignore | 35 +++++++++++++++++++++++++++++++++++ README.md | 4 ++++ 2 files changed, 39 insertions(+) create mode 100644 .gitignore create mode 100644 README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..d2d6f360 --- /dev/null +++ b/.gitignore @@ -0,0 +1,35 @@ +*.py[cod] + +# C extensions +*.so + +# Packages +*.egg +*.egg-info +dist +build +eggs +parts +bin +var +sdist +develop-eggs +.installed.cfg +lib +lib64 + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.tox +nosetests.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject diff --git a/README.md b/README.md new file mode 100644 index 00000000..317fa353 --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +coxGP +===== + +Gaussian Process models of Cox proportional hazard models \ No newline at end of file From 68eb83955c585b08cf93cbd659f749cff5b62bb3 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 12 Mar 2013 17:42:00 +0000 Subject: [PATCH 02/71] Initial commit, setting up the laplace approximation for a student t --- python/examples/laplace_approximations.py | 37 ++++++++++++++++ python/likelihoods/Laplace.py | 54 +++++++++++++++++++++++ python/likelihoods/likelihood_function.py | 51 +++++++++++++++++++++ python/models/coxGP.py | 19 ++++++++ python/testing/cox_tests.py | 14 ++++++ 5 files changed, 175 insertions(+) create mode 100644 python/examples/laplace_approximations.py create mode 100644 python/likelihoods/Laplace.py create mode 100644 python/likelihoods/likelihood_function.py create mode 100644 python/models/coxGP.py create mode 100644 python/testing/cox_tests.py diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py new file mode 100644 index 00000000..2f059831 --- /dev/null +++ 
b/python/examples/laplace_approximations.py @@ -0,0 +1,37 @@ +import GPy +import numpy as np +import scipy as sp +import scipy.stats +import matplotlib.pyplot as plt + + +def student_t_approx(): + """ + Example of regressing with a student t likelihood + """ + #Start a function, any function + X = np.sort(np.random.uniform(0, 15, 70))[:, None] + Y = np.sin(X) + + #Add some extreme value noise to some of the datapoints + percent_corrupted = 0.05 + corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) + indices = np.arange(Y.shape[0]) + np.random.shuffle(indices) + corrupted_indices = indices[:corrupted_datums] + print corrupted_indices + noise = np.random.uniform(-10,10,(len(corrupted_indices), 1)) + Y[corrupted_indices] += noise + + #A GP should completely break down due to the points as they get a lot of weight + # create simple GP model + m = GPy.models.GP_regression(X,Y) + + # optimize + m.ensure_default_constraints() + m.optimize() + # plot + m.plot() + print m + + #with a student t distribution, since it has heavy tails it should work well diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py new file mode 100644 index 00000000..a0dbc65c --- /dev/null +++ b/python/likelihoods/Laplace.py @@ -0,0 +1,54 @@ +import nump as np +import GPy +from GPy.util.linalg import jitchol + +class Laplace(GPy.likelihoods.likelihood): + """Laplace approximation to a posterior""" + + def __init__(self,data,likelihood_function): + """ + Laplace Approximation + + First find the moments \hat{f} and the hessian at this point (using Newton-Raphson) + then find the z^{prime} which allows this to be a normalised gaussian instead of a + non-normalized gaussian + + Finally we must compute the GP variables (i.e. 
generate some Y^{squiggle} and z^{squiggle} + which makes a gaussian the same as the laplace approximation + + Arguments + --------- + + :data: @todo + :likelihood_function: @todo + + """ + GPy.likelihoods.likelihood.__init__(self) + + self.data = data + self.likelihood_function = likelihood_function + + #Inital values + self.N, self.D = self.data.shape + + def _compute_GP_variables(self): + """ + Generates data Y which would give the normal distribution identical to the laplace approximation + + GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle} + that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood + """ + raise NotImplementedError + + def fit_full(self, K): + """ + The laplace approximation algorithm + For nomenclature see Rasmussen & Williams 2006 + :K: Covariance matrix + """ + self.f = np.zeros(self.N) + + #Find \hat(f) using a newton raphson optimizer for example + + #At this point get the hessian matrix + diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py new file mode 100644 index 00000000..fd19675b --- /dev/null +++ b/python/likelihoods/likelihood_function.py @@ -0,0 +1,51 @@ +import GPy +from scipy.special import gamma, gammaln + +class student_t(GPy.likelihoods.likelihood_function): + """Student t likelihood distribution + For nomanclature see Bayesian Data Analysis 2003 p576 + + Laplace: + Needs functions to calculate + ln p(yi|fi) + dln p(yi|fi)_dfi + d2ln p(yi|fi)_d2fi + """ + def __init__(self, deg_free, sigma=1): + self.v = deg_free + self.sigma = 1 + + def link_function(self, y_i, f_i): + """link_function $\ln p(y_i|f_i)$ + + :y_i: datum number i + :f_i: latent variable f_i + :returns: float(likelihood evaluated for this point) + + """ + e = y_i - f_i + return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) + + def link_grad(self, y_i, f_i): + 
"""gradient of the link function at y_i, given f_i w.r.t f_i + + :y_i: datum number i + :f_i: latent variable f_i + :returns: float(gradient of likelihood evaluated at this point) + + """ + pass + + def link_hess(self, y_i, f_i, f_j): + """hessian at this point (the hessian will be 0 unless i == j) + i.e. second derivative w.r.t f_i and f_j + + :y_i: @todo + :f_i: @todo + :f_j: @todo + :returns: @todo + + """ + if f_i = + pass + diff --git a/python/models/coxGP.py b/python/models/coxGP.py new file mode 100644 index 00000000..f61a8f46 --- /dev/null +++ b/python/models/coxGP.py @@ -0,0 +1,19 @@ +# Copyright (c) 2013, Alan Saul + +from GPy.models import GP +from .. import likelihoods +from GPy import kern + + +class cox_GP_regression(GP): + """ + Cox Gaussian Process model for regression + """ + + def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None): + if kernel is None: + kernel = kern.rbf(X.shape[1]) + + likelihood = likelihoods.cox_piecewise(Y, normalize=normalize_Y) + + GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices) diff --git a/python/testing/cox_tests.py b/python/testing/cox_tests.py new file mode 100644 index 00000000..526f5c92 --- /dev/null +++ b/python/testing/cox_tests.py @@ -0,0 +1,14 @@ +# Copyright (c) 2013, Alan Saul + +import unittest +import numpy as np +import GPy + +class coxGPTests(unittest.TestCase): + def test_laplace_approx(self): + pass + +if __name__ == "__main__": + print "Running unit tests, please be (very) patient..." 
+ unittest.main() + From ad2c266c65120e1fabf0cf1825fc0c661084611b Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 13 Mar 2013 11:54:33 +0000 Subject: [PATCH 03/71] Added some comments --- python/likelihoods/likelihood_function.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index fd19675b..5d4e51ce 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -5,6 +5,9 @@ class student_t(GPy.likelihoods.likelihood_function): """Student t likelihood distribution For nomanclature see Bayesian Data Analysis 2003 p576 + $$\ln(\frac{\Gamma(\frac{(v+1)}{2})}{\Gamma(\sqrt(v \pi \Gamma(\frac{v}{2}))})+ \ln(1+\frac{(y_i-f_i)^2}{\sigma v})^{-\frac{(v+1)}{2}}$$ + TODO:Double check this + Laplace: Needs functions to calculate ln p(yi|fi) @@ -17,6 +20,8 @@ class student_t(GPy.likelihoods.likelihood_function): def link_function(self, y_i, f_i): """link_function $\ln p(y_i|f_i)$ + $$\ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2}) - \ln \frac{v \pi \sigma}{2} - \frac{v+1}{2}\ln (1 + \frac{(y_{i} - f_{i})^{2}}{v\sigma})$$ + TODO: Double check this :y_i: datum number i :f_i: latent variable f_i @@ -24,11 +29,15 @@ class student_t(GPy.likelihoods.likelihood_function): """ e = y_i - f_i - return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) + return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) #Check the /v! 
def link_grad(self, y_i, f_i): """gradient of the link function at y_i, given f_i w.r.t f_i + derivative of log((gamma((v+1)/2)/gamma(sqrt(v*pi*gamma(v/2))))*(1+(t^2)/(a*v))^((-(v+1))/2)) with respect to t + $$\frac{(y_i - f_i)(v + 1)}{\sigma v (y_{i} - f_{i})^{2}}$$ + TODO: Double check this + :y_i: datum number i :f_i: latent variable f_i :returns: float(gradient of likelihood evaluated at this point) @@ -40,6 +49,8 @@ class student_t(GPy.likelihoods.likelihood_function): """hessian at this point (the hessian will be 0 unless i == j) i.e. second derivative w.r.t f_i and f_j + second derivative of + :y_i: @todo :f_i: @todo :f_j: @todo From 3f114aa020fb678b1c52eb441bb079d9a0b8cd00 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 13 Mar 2013 17:55:41 +0000 Subject: [PATCH 04/71] Got most of laplace approximation working --- __init__.py | 0 python/__init__.py | 0 python/examples/__init__.py | 0 python/examples/laplace_approximations.py | 44 +++++++++++-- python/likelihoods/Laplace.py | 45 +++++++++++-- python/likelihoods/__init__.py | 0 python/likelihoods/likelihood_function.py | 80 +++++++++++++---------- python/models/__init__.py | 0 python/testing/__init__.py | 0 9 files changed, 124 insertions(+), 45 deletions(-) create mode 100644 __init__.py create mode 100644 python/__init__.py create mode 100644 python/examples/__init__.py create mode 100644 python/likelihoods/__init__.py create mode 100644 python/models/__init__.py create mode 100644 python/testing/__init__.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/__init__.py b/python/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/examples/__init__.py b/python/examples/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 2f059831..0e1d3305 100644 --- a/python/examples/laplace_approximations.py +++ 
b/python/examples/laplace_approximations.py @@ -1,8 +1,9 @@ import GPy import numpy as np -import scipy as sp -import scipy.stats import matplotlib.pyplot as plt +from scipy.stats import t +from coxGP.python.likelihoods.Laplace import Laplace +from coxGP.python.likelihoods.likelihood_function import student_t def student_t_approx(): @@ -13,6 +14,41 @@ def student_t_approx(): X = np.sort(np.random.uniform(0, 15, 70))[:, None] Y = np.sin(X) + #Add student t random noise to datapoints + deg_free = 1 + noise = t.rvs(deg_free, loc=1.8, scale=1, size=Y.shape) + Y += noise + + # Kernel object + print X.shape + kernel = GPy.kern.rbf(X.shape[1]) + + #A GP should completely break down due to the points as they get a lot of weight + # create simple GP model + m = GPy.models.GP_regression(X, Y, kernel=kernel) + + # optimize + m.ensure_default_constraints() + m.optimize() + # plot + #m.plot() + print m + + #with a student t distribution, since it has heavy tails it should work well + likelihood_function = student_t(deg_free, sigma=1) + lap = Laplace(Y, likelihood_function) + cov = kernel.K(X) + lap.fit_full(cov) + + +def noisy_laplace_approx(): + """ + Example of regressing with a student t likelihood + """ + #Start a function, any function + X = np.sort(np.random.uniform(0, 15, 70))[:, None] + Y = np.sin(X) + #Add some extreme value noise to some of the datapoints percent_corrupted = 0.05 corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) @@ -20,12 +56,12 @@ def student_t_approx(): np.random.shuffle(indices) corrupted_indices = indices[:corrupted_datums] print corrupted_indices - noise = np.random.uniform(-10,10,(len(corrupted_indices), 1)) + noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1)) Y[corrupted_indices] += noise #A GP should completely break down due to the points as they get a lot of weight # create simple GP model - m = GPy.models.GP_regression(X,Y) + m = GPy.models.GP_regression(X, Y) # optimize m.ensure_default_constraints() diff --git 
a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index a0dbc65c..6efbfa30 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,8 +1,14 @@ -import nump as np +import numpy as np +import scipy as sp import GPy from GPy.util.linalg import jitchol +from functools import partial +from GPy.likelihoods.likelihood import likelihood +from GPy.util.linalg import pdinv,mdot -class Laplace(GPy.likelihoods.likelihood): + + +class Laplace(likelihood): """Laplace approximation to a posterior""" def __init__(self,data,likelihood_function): @@ -23,8 +29,6 @@ class Laplace(GPy.likelihoods.likelihood): :likelihood_function: @todo """ - GPy.likelihoods.likelihood.__init__(self) - self.data = data self.likelihood_function = likelihood_function @@ -38,7 +42,7 @@ class Laplace(GPy.likelihoods.likelihood): GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle} that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood """ - raise NotImplementedError + z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised def fit_full(self, K): """ @@ -46,9 +50,38 @@ class Laplace(GPy.likelihoods.likelihood): For nomenclature see Rasmussen & Williams 2006 :K: Covariance matrix """ - self.f = np.zeros(self.N) + f = np.zeros((self.N, 1)) + print K.shape + print f.shape + print self.data.shape + (Ki, _, _, log_Kdet) = pdinv(K) + obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2*np.pi)) #Find \hat(f) using a newton raphson optimizer for example + #TODO: Add newton-raphson as subclass of optimizer class + + #FIXME: Can we get rid of this horrible reshaping? 
+ def obj(f): + f = f[:, None] + res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (Ki, f)) + obj_constant) + return float(res) + + def obj_grad(f): + f = f[:, None] + res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(Ki, f)) + return np.squeeze(res) + + def obj_hess(f): + f = f[:, None] + res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - Ki) + return np.squeeze(res) + + self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) #At this point get the hessian matrix + self.hess_hat = obj_hess(f_hat) + #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) + self.height_unnormalised = obj(f_hat) #FIXME: Is it -1? + + return _compute_GP_variables() diff --git a/python/likelihoods/__init__.py b/python/likelihoods/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 5d4e51ce..78731199 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -1,62 +1,72 @@ -import GPy -from scipy.special import gamma, gammaln +from scipy.special import gammaln +import numpy as np +from GPy.likelihoods.likelihood_functions import likelihood_function -class student_t(GPy.likelihoods.likelihood_function): + +class student_t(likelihood_function): """Student t likelihood distribution For nomanclature see Bayesian Data Analysis 2003 p576 - $$\ln(\frac{\Gamma(\frac{(v+1)}{2})}{\Gamma(\sqrt(v \pi \Gamma(\frac{v}{2}))})+ \ln(1+\frac{(y_i-f_i)^2}{\sigma v})^{-\frac{(v+1)}{2}}$$ - TODO:Double check this + $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ Laplace: Needs functions to calculate ln p(yi|fi) dln p(yi|fi)_dfi - d2ln p(yi|fi)_d2fi + d2ln p(yi|fi)_d2fifj """ def 
__init__(self, deg_free, sigma=1): self.v = deg_free self.sigma = 1 - def link_function(self, y_i, f_i): - """link_function $\ln p(y_i|f_i)$ - $$\ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2}) - \ln \frac{v \pi \sigma}{2} - \frac{v+1}{2}\ln (1 + \frac{(y_{i} - f_{i})^{2}}{v\sigma})$$ - TODO: Double check this + def link_function(self, y, f): + """link_function $\ln p(y|f)$ + $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ - :y_i: datum number i - :f_i: latent variable f_i + :y: datum number i + :f: latent variable f :returns: float(likelihood evaluated for this point) """ - e = y_i - f_i - return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) #Check the /v! + e = y - f + #print "Link ", y.shape, f.shape, e.shape + objective = (gammaln((self.v + 1) * 0.5) + - gammaln(self.v * 0.5) + + np.log(self.sigma * np.sqrt(self.v * np.pi)) + - (self.v + 1) * 0.5 + * np.log(1 + ((e**2 / self.sigma**2) / self.v)) + ) + return np.sum(objective) - def link_grad(self, y_i, f_i): - """gradient of the link function at y_i, given f_i w.r.t f_i + def link_grad(self, y, f): + """ + Gradient of the link function at y, given f w.r.t f - derivative of log((gamma((v+1)/2)/gamma(sqrt(v*pi*gamma(v/2))))*(1+(t^2)/(a*v))^((-(v+1))/2)) with respect to t - $$\frac{(y_i - f_i)(v + 1)}{\sigma v (y_{i} - f_{i})^{2}}$$ - TODO: Double check this + $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ - :y_i: datum number i - :f_i: latent variable f_i + :y: datum number i + :f: latent variable f :returns: float(gradient of likelihood evaluated at this point) """ - pass - - def link_hess(self, y_i, f_i, f_j): - """hessian at this point (the hessian will be 0 unless i == j) - i.e. 
second derivative w.r.t f_i and f_j - - second derivative of - - :y_i: @todo - :f_i: @todo - :f_j: @todo - :returns: @todo + e = y - f + #print "Grad ", y.shape, f.shape, e.shape + grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) + return grad + def link_hess(self, y, f): """ - if f_i = - pass + Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j + i.e. second derivative link_function at y given f f_j w.r.t f and f_j + Will return diaganol of hessian, since every where else it is 0 + + $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ + + :y: datum number i + :f: latent variable f + :returns: float(second derivative of likelihood evaluated at this point) + """ + e = y - f + hess = ((self.v + 1) * e) / ((((self.sigma**2)*self.v) + e**2)**2) + return hess diff --git a/python/models/__init__.py b/python/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/testing/__init__.py b/python/testing/__init__.py new file mode 100644 index 00000000..e69de29b From f9535c858a653e08a32a8633fe37577c87812820 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 14 Mar 2013 15:30:22 +0000 Subject: [PATCH 05/71] Trying to 'debug' --- python/examples/laplace_approximations.py | 22 +++++++++++--- python/likelihoods/Laplace.py | 25 +++++++++------ python/likelihoods/likelihood_function.py | 37 ++++++++++++----------- 3 files changed, 52 insertions(+), 32 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 0e1d3305..5642d8a4 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -1,7 +1,7 @@ import GPy import numpy as np import matplotlib.pyplot as plt -from scipy.stats import t +from scipy.stats import t, norm from coxGP.python.likelihoods.Laplace import Laplace from coxGP.python.likelihoods.likelihood_function 
import student_t @@ -11,12 +11,13 @@ def student_t_approx(): Example of regressing with a student t likelihood """ #Start a function, any function - X = np.sort(np.random.uniform(0, 15, 70))[:, None] + X = np.sort(np.random.uniform(0, 15, 100))[:, None] Y = np.sin(X) #Add student t random noise to datapoints - deg_free = 1 - noise = t.rvs(deg_free, loc=1.8, scale=1, size=Y.shape) + deg_free = 2.5 + t_rv = t(deg_free, loc=5, scale=1) + noise = t_rv.rvs(size=Y.shape) Y += noise # Kernel object @@ -39,6 +40,19 @@ def student_t_approx(): lap = Laplace(Y, likelihood_function) cov = kernel.K(X) lap.fit_full(cov) + #Get one sample (just look at a single Y + mode = float(lap.f_hat[0]) + variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables + #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables + normalised_approx = norm(loc=mode, scale=variance) + print "Normal with mode %f, and variance %f" % (mode, variance) + print lap.height_unnormalised + + test_range = np.arange(0, 10, 0.1) + print np.diagonal(lap.hess_hat) + plt.plot(test_range, t_rv.pdf(test_range)) + plt.plot(test_range, normalised_approx.pdf(test_range)) + plt.show() def noisy_laplace_approx(): diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 6efbfa30..08ae0e6f 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -5,13 +5,13 @@ from GPy.util.linalg import jitchol from functools import partial from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot - +from scipy.stats import norm class Laplace(likelihood): """Laplace approximation to a posterior""" - def __init__(self,data,likelihood_function): + def __init__(self, data, likelihood_function): """ Laplace Approximation @@ -42,7 +42,13 @@ class Laplace(likelihood): GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and 
Z^{squiggle} that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood """ - z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised + #z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised + normalised_approx = norm(loc=self.f_hat, scale=self.hess_hat) + self.Z = normalised_approx.pdf(self.f_hat)/self.height_unnormalised + #self.Y = + #self.YYT = + #self.covariance_matrix = + #self.precision = def fit_full(self, K): """ @@ -51,11 +57,9 @@ class Laplace(likelihood): :K: Covariance matrix """ f = np.zeros((self.N, 1)) - print K.shape - print f.shape - print self.data.shape + #K = np.diag(np.ones(self.N)) (Ki, _, _, log_Kdet) = pdinv(K) - obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2*np.pi)) + obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi)) #Find \hat(f) using a newton raphson optimizer for example #TODO: Add newton-raphson as subclass of optimizer class @@ -77,11 +81,12 @@ class Laplace(likelihood): return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) + print self.f_hat #At this point get the hessian matrix - self.hess_hat = obj_hess(f_hat) + self.hess_hat = obj_hess(self.f_hat) #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) - self.height_unnormalised = obj(f_hat) #FIXME: Is it -1? + self.height_unnormalised = obj(self.f_hat) #FIXME: Is it -1? 
- return _compute_GP_variables() + return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 78731199..46128de7 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -15,27 +15,27 @@ class student_t(likelihood_function): dln p(yi|fi)_dfi d2ln p(yi|fi)_d2fifj """ - def __init__(self, deg_free, sigma=1): + def __init__(self, deg_free, sigma=2): self.v = deg_free - self.sigma = 1 + self.sigma = sigma def link_function(self, y, f): """link_function $\ln p(y|f)$ $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ - :y: datum number i - :f: latent variable f + :y: data + :f: latent variables f :returns: float(likelihood evaluated for this point) """ + assert y.shape[0] == f.shape[0] e = y - f - #print "Link ", y.shape, f.shape, e.shape objective = (gammaln((self.v + 1) * 0.5) - - gammaln(self.v * 0.5) - + np.log(self.sigma * np.sqrt(self.v * np.pi)) - - (self.v + 1) * 0.5 - * np.log(1 + ((e**2 / self.sigma**2) / self.v)) - ) + - gammaln(self.v * 0.5) + + np.log(self.sigma * np.sqrt(self.v * np.pi)) + - (self.v + 1) * 0.5 + * np.log(1 + ((e**2 / self.sigma**2) / self.v)) + ) return np.sum(objective) def link_grad(self, y, f): @@ -44,13 +44,13 @@ class student_t(likelihood_function): $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ - :y: datum number i - :f: latent variable f - :returns: float(gradient of likelihood evaluated at this point) + :y: data + :f: latent variables f + :returns: gradient of likelihood evaluated at points """ + assert y.shape[0] == f.shape[0] e = y - f - #print "Grad ", y.shape, f.shape, e.shape grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) return grad @@ -63,10 +63,11 @@ class student_t(likelihood_function): 
$$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ - :y: datum number i - :f: latent variable f - :returns: float(second derivative of likelihood evaluated at this point) + :y: data + :f: latent variables f + :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ + assert y.shape[0] == f.shape[0] e = y - f - hess = ((self.v + 1) * e) / ((((self.sigma**2)*self.v) + e**2)**2) + hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) return hess From 34ae852eea8d5f6cdc48028d4f21457c7f0b5259 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 15 Mar 2013 17:38:13 +0000 Subject: [PATCH 06/71] got an idea of how to implement! written in docs --- python/likelihoods/Laplace.py | 38 ++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 08ae0e6f..568fcef0 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -41,10 +41,26 @@ class Laplace(likelihood): GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle} that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood + + Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal) + then we have a rescaled normal distibution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f) + due to the z rescaling. + + at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1) + + This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1) + giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f) + + $$\tilde{Y} = \tilde{\Sigma} Hf$$ + where + $$\tilde{\Sigma}^{-1} = H - K^{-1}$$ + i.e. 
$$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$ + since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$ + and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ + """ - #z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised - normalised_approx = norm(loc=self.f_hat, scale=self.hess_hat) - self.Z = normalised_approx.pdf(self.f_hat)/self.height_unnormalised + self.Sigma_tilde = self.hess_hat - + self.Z = #self.Y = #self.YYT = #self.covariance_matrix = @@ -58,8 +74,8 @@ class Laplace(likelihood): """ f = np.zeros((self.N, 1)) #K = np.diag(np.ones(self.N)) - (Ki, _, _, log_Kdet) = pdinv(K) - obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi)) + (self.Ki, _, _, self.log_Kdet) = pdinv(K) + obj_constant = (0.5 * self.log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi)) #Find \hat(f) using a newton raphson optimizer for example #TODO: Add newton-raphson as subclass of optimizer class @@ -67,17 +83,17 @@ class Laplace(likelihood): #FIXME: Can we get rid of this horrible reshaping? 
def obj(f): f = f[:, None] - res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (Ki, f)) + obj_constant) + res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (self.Ki, f)) + obj_constant) return float(res) def obj_grad(f): f = f[:, None] - res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(Ki, f)) + res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(self.Ki, f)) return np.squeeze(res) def obj_hess(f): f = f[:, None] - res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - Ki) + res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) @@ -87,6 +103,10 @@ class Laplace(likelihood): self.hess_hat = obj_hess(self.f_hat) #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) - self.height_unnormalised = obj(self.f_hat) #FIXME: Is it -1? + self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? 
+ #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to + #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode + #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) + self.z_hat = np.exp(-0.5*np.log(np.linalg.det(hess_hat)) + self.height_unnormalised) return self._compute_GP_variables() From 2bf1cf0eb6596773c2f75a06f152b3a7cfd66081 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 18 Mar 2013 15:59:12 +0000 Subject: [PATCH 07/71] following naming convention better, lots of inverses which should be able to get rid of one or two, unsure if it works --- python/examples/laplace_approximations.py | 17 +++++---- python/likelihoods/Laplace.py | 43 +++++++++++++---------- python/likelihoods/likelihood_function.py | 9 ++--- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 5642d8a4..aa8cdcb4 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -41,18 +41,21 @@ def student_t_approx(): cov = kernel.K(X) lap.fit_full(cov) #Get one sample (just look at a single Y - mode = float(lap.f_hat[0]) - variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables + #mode = float(lap.f_hat[0]) + #variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables - normalised_approx = norm(loc=mode, scale=variance) - print "Normal with mode %f, and variance %f" % (mode, variance) - print lap.height_unnormalised test_range = np.arange(0, 10, 0.1) - print np.diagonal(lap.hess_hat) plt.plot(test_range, t_rv.pdf(test_range)) - plt.plot(test_range, normalised_approx.pdf(test_range)) + for i in xrange(X.shape[0]): + mode = 
lap.f_hat[i] + covariance = lap.hess_hat_i[i,i] + scaling = np.exp(lap.ln_z_hat) + normalised_approx = norm(loc=mode, scale=covariance) + print "Normal with mode %f, and variance %f" % (mode, covariance) + plt.plot(test_range, normalised_approx.pdf(test_range)) plt.show() + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def noisy_laplace_approx(): diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 568fcef0..9d622b0d 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,12 +1,10 @@ import numpy as np import scipy as sp import GPy -from GPy.util.linalg import jitchol +#from GPy.util.linalg import jitchol from functools import partial from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot -from scipy.stats import norm - class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -35,6 +33,8 @@ class Laplace(likelihood): #Inital values self.N, self.D = self.data.shape + self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi)) + def _compute_GP_variables(self): """ Generates data Y which would give the normal distribution identical to the laplace approximation @@ -59,12 +59,15 @@ class Laplace(likelihood): and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ """ - self.Sigma_tilde = self.hess_hat - - self.Z = - #self.Y = - #self.YYT = - #self.covariance_matrix = - #self.precision = + self.Sigma_tilde_i = self.hess_hat + self.Ki + #Do we really need to inverse Sigma_tilde_i? :( + (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i) + Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess? 
+ self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde, (self.Sigma_tilde_i, Y_tilde)))) + self.Y = Y_tilde + self.covariance_matrix = self.Sigma_tilde + self.precision = np.diag(self.Sigma_tilde)[:, None] + self.YYT = np.dot(self.Y, self.Y) def fit_full(self, K): """ @@ -75,38 +78,40 @@ class Laplace(likelihood): f = np.zeros((self.N, 1)) #K = np.diag(np.ones(self.N)) (self.Ki, _, _, self.log_Kdet) = pdinv(K) - obj_constant = (0.5 * self.log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi)) - + LOG_K_CONST = -(0.5 * self.log_Kdet) + OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST #Find \hat(f) using a newton raphson optimizer for example #TODO: Add newton-raphson as subclass of optimizer class #FIXME: Can we get rid of this horrible reshaping? def obj(f): - f = f[:, None] - res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (self.Ki, f)) + obj_constant) + #f = f[:, None] + res = -1 * (self.likelihood_function.link_function(self.data[:,0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST) return float(res) def obj_grad(f): - f = f[:, None] - res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(self.Ki, f)) + #f = f[:, None] + res = -1 * (self.likelihood_function.link_grad(self.data[:,0], f) - mdot(self.Ki, f)) return np.squeeze(res) def obj_hess(f): - f = f[:, None] - res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - self.Ki) + res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) print self.f_hat #At this point get the hessian matrix - self.hess_hat = obj_hess(self.f_hat) + self.hess_hat = -1*np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) #-1*obj_hess(self.f_hat) + self.Ki + #self.hess_hat = -1*obj_hess(self.f_hat) + self.Ki + (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat + self.Ki) #Need to add the constant 
as we previously were trying to avoid computing it (seems like a small overhead though...) self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) - self.z_hat = np.exp(-0.5*np.log(np.linalg.det(hess_hat)) + self.height_unnormalised) + self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) + self.height_unnormalised - self.NORMAL_CONST #Unsure whether its log_hess or log_hess_i + return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 46128de7..8adbf86c 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -28,7 +28,7 @@ class student_t(likelihood_function): :returns: float(likelihood evaluated for this point) """ - assert y.shape[0] == f.shape[0] + assert y.shape == f.shape e = y - f objective = (gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) @@ -49,7 +49,7 @@ class student_t(likelihood_function): :returns: gradient of likelihood evaluated at points """ - assert y.shape[0] == f.shape[0] + assert y.shape == f.shape e = y - f grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) return grad @@ -67,7 +67,8 @@ class student_t(likelihood_function): :f: latent variables f :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ - assert y.shape[0] == f.shape[0] + assert y.shape == f.shape e = y - f - hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) + #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) + hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2) return hess From 
46d59c94b27cabe61056b71aa26d1293779c0697 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 19 Mar 2013 11:47:53 +0000 Subject: [PATCH 08/71] Just breaking some things... --- python/examples/laplace_approximations.py | 88 +++++++++++++++-------- python/likelihoods/Laplace.py | 52 ++++++++++---- python/likelihoods/likelihood_function.py | 16 ++++- 3 files changed, 113 insertions(+), 43 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index aa8cdcb4..73c8f67f 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -16,47 +16,75 @@ def student_t_approx(): #Add student t random noise to datapoints deg_free = 2.5 - t_rv = t(deg_free, loc=5, scale=1) + t_rv = t(deg_free, loc=0, scale=1) noise = t_rv.rvs(size=Y.shape) Y += noise + #Add some extreme value noise to some of the datapoints + #percent_corrupted = 0.05 + #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) + #indices = np.arange(Y.shape[0]) + #np.random.shuffle(indices) + #corrupted_indices = indices[:corrupted_datums] + #print corrupted_indices + #noise = t_rv.rvs(size=(len(corrupted_indices), 1)) + #Y[corrupted_indices] += noise + # Kernel object - print X.shape - kernel = GPy.kern.rbf(X.shape[1]) + #print X.shape + #kernel = GPy.kern.rbf(X.shape[1]) - #A GP should completely break down due to the points as they get a lot of weight - # create simple GP model - m = GPy.models.GP_regression(X, Y, kernel=kernel) + ##A GP should completely break down due to the points as they get a lot of weight + ## create simple GP model + #m = GPy.models.GP_regression(X, Y, kernel=kernel) - # optimize - m.ensure_default_constraints() - m.optimize() - # plot - #m.plot() - print m + ## optimize + #m.ensure_default_constraints() + #m.optimize() + ## plot + ##m.plot() + #print m #with a student t distribution, since it has heavy tails it should work well - likelihood_function = student_t(deg_free, sigma=1) 
- lap = Laplace(Y, likelihood_function) - cov = kernel.K(X) - lap.fit_full(cov) - #Get one sample (just look at a single Y - #mode = float(lap.f_hat[0]) - #variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables - #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables + #likelihood_function = student_t(deg_free, sigma=1) + #lap = Laplace(Y, likelihood_function) + #cov = kernel.K(X) + #lap.fit_full(cov) - test_range = np.arange(0, 10, 0.1) - plt.plot(test_range, t_rv.pdf(test_range)) - for i in xrange(X.shape[0]): - mode = lap.f_hat[i] - covariance = lap.hess_hat_i[i,i] - scaling = np.exp(lap.ln_z_hat) - normalised_approx = norm(loc=mode, scale=covariance) - print "Normal with mode %f, and variance %f" % (mode, covariance) - plt.plot(test_range, normalised_approx.pdf(test_range)) - plt.show() + #test_range = np.arange(0, 10, 0.1) + #plt.plot(test_range, t_rv.pdf(test_range)) + #for i in xrange(X.shape[0]): + #mode = lap.f_hat[i] + #covariance = lap.hess_hat_i[i,i] + #scaling = np.exp(lap.ln_z_hat) + #normalised_approx = norm(loc=mode, scale=covariance) + #print "Normal with mode %f, and variance %f" % (mode, covariance) + #plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) + #plt.show() + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + # Likelihood object + t_distribution = student_t(deg_free, sigma=1) + stu_t_likelihood = Laplace(Y, t_distribution) + kernel = GPy.kern.rbf(X.shape[1]) + + m = GPy.models.GP(X, stu_t_likelihood, kernel) + m.ensure_default_constraints() + + m.update_likelihood_approximation() + print "NEW MODEL" + print(m) + + # optimize + #m.optimize() + print(m) + + # plot + m.plot() import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return m + def noisy_laplace_approx(): """ diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 9d622b0d..23db6abd 100644 --- a/python/likelihoods/Laplace.py 
+++ b/python/likelihoods/Laplace.py @@ -5,6 +5,7 @@ import GPy from functools import partial from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot +import numpy.testing.assert_array_equal class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -35,6 +36,29 @@ class Laplace(likelihood): self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi)) + #Initial values for the GP variables + self.Y = np.zeros((self.N,1)) + self.covariance_matrix = np.eye(self.N) + self.precision = np.ones(self.N)[:,None] + self.Z = 0 + self.YYT = None + + def predictive_values(self,mu,var): + return self.likelihood_function.predictive_values(mu,var) + + def _get_params(self): + return np.zeros(0) + + def _get_param_names(self): + return [] + + def _set_params(self,p): + pass # TODO: Laplace likelihood might want to take some parameters... + + def _gradients(self,partial): + raise NotImplementedError + #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... + def _compute_GP_variables(self): """ Generates data Y which would give the normal distribution identical to the laplace approximation @@ -63,11 +87,14 @@ class Laplace(likelihood): #Do we really need to inverse Sigma_tilde_i? :( (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i) Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess? 
- self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde, (self.Sigma_tilde_i, Y_tilde)))) + self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)))) + + self.Z = self.Z_tilde self.Y = Y_tilde self.covariance_matrix = self.Sigma_tilde - self.precision = np.diag(self.Sigma_tilde)[:, None] - self.YYT = np.dot(self.Y, self.Y) + self.precision = 1/np.diag(self.Sigma_tilde)[:, None] + self.YYT = np.dot(self.Y, self.Y.T) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def fit_full(self, K): """ @@ -76,7 +103,6 @@ class Laplace(likelihood): :K: Covariance matrix """ f = np.zeros((self.N, 1)) - #K = np.diag(np.ones(self.N)) (self.Ki, _, _, self.log_Kdet) = pdinv(K) LOG_K_CONST = -(0.5 * self.log_Kdet) OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST @@ -95,23 +121,25 @@ class Laplace(likelihood): return np.squeeze(res) def obj_hess(f): - res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki) + res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) - print self.f_hat #At this point get the hessian matrix - self.hess_hat = -1*np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) #-1*obj_hess(self.f_hat) + self.Ki - #self.hess_hat = -1*obj_hess(self.f_hat) + self.Ki - (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat + self.Ki) + self.hess_hat = np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) + self.Ki + (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat) + (self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat_i) + + np.testing.assert_array_equal(self.hess_hat, hess_hat_new) #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) 
- self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? + #self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) - self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) + self.height_unnormalised - self.NORMAL_CONST #Unsure whether its log_hess or log_hess_i - + #Unsure whether its log_hess or log_hess_i + self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(f.T, (self.Ki, f)) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 8adbf86c..e70cdc8d 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -1,7 +1,7 @@ from scipy.special import gammaln import numpy as np from GPy.likelihoods.likelihood_functions import likelihood_function - +from scipy import stats class student_t(likelihood_function): """Student t likelihood distribution @@ -72,3 +72,17 @@ class student_t(likelihood_function): #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2) return hess + + def predictive_values(self, mu, var): + """ + Compute mean, and conficence interval (percentiles 5 and 95) of the prediction + """ + mean = np.exp(mu) + p_025 = stats.t.ppf(025,mean) + p_975 = stats.t.ppf(975,mean) + + #p_025 = tmp[:,0] + #p_975 = tmp[:,1] + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return mean,p_025,p_975 + From a9d555597653c24bc67812776514e29066216d66 Mon Sep 17 
00:00:00 2001 From: Alan Saul Date: Tue, 19 Mar 2013 18:21:57 +0000 Subject: [PATCH 09/71] Worked out in terms of W, needs gradients implementing --- python/examples/laplace_approximations.py | 44 ++++++++++----------- python/likelihoods/Laplace.py | 48 +++++++++++++++-------- python/likelihoods/likelihood_function.py | 5 ++- 3 files changed, 57 insertions(+), 40 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 73c8f67f..c8d06ab2 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -15,13 +15,13 @@ def student_t_approx(): Y = np.sin(X) #Add student t random noise to datapoints - deg_free = 2.5 + deg_free = 3.5 t_rv = t(deg_free, loc=0, scale=1) noise = t_rv.rvs(size=Y.shape) Y += noise #Add some extreme value noise to some of the datapoints - #percent_corrupted = 0.05 + #percent_corrupted = 0.15 #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) #indices = np.arange(Y.shape[0]) #np.random.shuffle(indices) @@ -31,11 +31,11 @@ def student_t_approx(): #Y[corrupted_indices] += noise # Kernel object - #print X.shape - #kernel = GPy.kern.rbf(X.shape[1]) + print X.shape + kernel = GPy.kern.rbf(X.shape[1]) - ##A GP should completely break down due to the points as they get a lot of weight - ## create simple GP model + #A GP should completely break down due to the points as they get a lot of weight + # create simple GP model #m = GPy.models.GP_regression(X, Y, kernel=kernel) ## optimize @@ -46,27 +46,27 @@ def student_t_approx(): #print m #with a student t distribution, since it has heavy tails it should work well - #likelihood_function = student_t(deg_free, sigma=1) - #lap = Laplace(Y, likelihood_function) - #cov = kernel.K(X) - #lap.fit_full(cov) + likelihood_function = student_t(deg_free, sigma=1) + lap = Laplace(Y, likelihood_function) + cov = kernel.K(X) + lap.fit_full(cov) - #test_range = np.arange(0, 10, 0.1) - #plt.plot(test_range, 
t_rv.pdf(test_range)) - #for i in xrange(X.shape[0]): - #mode = lap.f_hat[i] - #covariance = lap.hess_hat_i[i,i] - #scaling = np.exp(lap.ln_z_hat) - #normalised_approx = norm(loc=mode, scale=covariance) - #print "Normal with mode %f, and variance %f" % (mode, covariance) - #plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) - #plt.show() - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + test_range = np.arange(0, 10, 0.1) + plt.plot(test_range, t_rv.pdf(test_range)) + for i in xrange(X.shape[0]): + mode = lap.f_hat[i] + covariance = lap.hess_hat_i[i,i] + scaling = np.exp(lap.ln_z_hat) + normalised_approx = norm(loc=mode, scale=covariance) + print "Normal with mode %f, and variance %f" % (mode, covariance) + plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) + plt.show() + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT # Likelihood object t_distribution = student_t(deg_free, sigma=1) stu_t_likelihood = Laplace(Y, t_distribution) - kernel = GPy.kern.rbf(X.shape[1]) + kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1]) m = GPy.models.GP(X, stu_t_likelihood, kernel) m.ensure_default_constraints() diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 23db6abd..84128e3a 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,11 +1,11 @@ import numpy as np import scipy as sp import GPy -#from GPy.util.linalg import jitchol +from scipy.linalg import cholesky, eig, inv from functools import partial from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot -import numpy.testing.assert_array_equal +#import numpy.testing.assert_array_equal class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -56,8 +56,8 @@ class Laplace(likelihood): pass # TODO: Laplace likelihood might want to take some parameters... def _gradients(self,partial): + return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... 
raise NotImplementedError - #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... def _compute_GP_variables(self): """ @@ -83,16 +83,23 @@ class Laplace(likelihood): and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ """ - self.Sigma_tilde_i = self.hess_hat + self.Ki + self.Sigma_tilde_i = self.hess_hat_i #self.W #self.hess_hat_i - self.Ki #Do we really need to inverse Sigma_tilde_i? :( - (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i) - Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess? - self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)))) + if self.likelihood_function.log_concave: + (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i) + else: + self.Sigma_tilde = inv(self.Sigma_tilde_i) + #f_hat? should be f but we must have optimized for them I guess? + Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) + self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + - 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat) + + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) + ) self.Z = self.Z_tilde self.Y = Y_tilde self.covariance_matrix = self.Sigma_tilde - self.precision = 1/np.diag(self.Sigma_tilde)[:, None] + self.precision = 1 / np.diag(self.Sigma_tilde)[:, None] self.YYT = np.dot(self.Y, self.Y.T) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT @@ -112,34 +119,41 @@ class Laplace(likelihood): #FIXME: Can we get rid of this horrible reshaping? 
def obj(f): #f = f[:, None] - res = -1 * (self.likelihood_function.link_function(self.data[:,0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST) + res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST) return float(res) def obj_grad(f): #f = f[:, None] - res = -1 * (self.likelihood_function.link_grad(self.data[:,0], f) - mdot(self.Ki, f)) + res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f)) return np.squeeze(res) def obj_hess(f): - res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki) + res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) #At this point get the hessian matrix - self.hess_hat = np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) + self.Ki + self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat)) + self.hess_hat = self.Ki + self.W (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat) - (self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat_i) - np.testing.assert_array_equal(self.hess_hat, hess_hat_new) + #Check hess_hat is positive definite + try: + cholesky(self.hess_hat) + except: + raise ValueError("Must be positive definite") + + #Check its eigenvalues are positive + eigenvalues = eig(self.hess_hat) + if not np.all(eigenvalues > 0): + raise ValueError("Eigen values not positive") - #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) - #self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? 
#z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) #Unsure whether its log_hess or log_hess_i - self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(f.T, (self.Ki, f)) + self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(self.f_hat.T, (self.Ki, self.f_hat)) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index e70cdc8d..c4823703 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -19,6 +19,9 @@ class student_t(likelihood_function): self.v = deg_free self.sigma = sigma + #FIXME: This should be in the superclass + self.log_concave = False + def link_function(self, y, f): """link_function $\ln p(y|f)$ $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ @@ -70,7 +73,7 @@ class student_t(likelihood_function): assert y.shape == f.shape e = y - f #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) - hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2) + hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) return hess def predictive_values(self, mu, var): From 474d5484b06bdbceefa08fa573d28326bb3f8a92 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 21 Mar 2013 14:00:22 +0000 Subject: [PATCH 10/71] Changing definitions again... 
--- python/examples/laplace_approximations.py | 15 +++++--- python/likelihoods/Laplace.py | 44 +++++++++++++++-------- python/likelihoods/likelihood_function.py | 10 ++---- 3 files changed, 43 insertions(+), 26 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index c8d06ab2..6f2b19aa 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -15,8 +15,9 @@ def student_t_approx(): Y = np.sin(X) #Add student t random noise to datapoints - deg_free = 3.5 - t_rv = t(deg_free, loc=0, scale=1) + deg_free = 100000.5 + real_var = 4 + t_rv = t(deg_free, loc=0, scale=real_var) noise = t_rv.rvs(size=Y.shape) Y += noise @@ -46,7 +47,7 @@ def student_t_approx(): #print m #with a student t distribution, since it has heavy tails it should work well - likelihood_function = student_t(deg_free, sigma=1) + likelihood_function = student_t(deg_free, sigma=real_var) lap = Laplace(Y, likelihood_function) cov = kernel.K(X) lap.fit_full(cov) @@ -64,7 +65,7 @@ def student_t_approx(): import ipdb; ipdb.set_trace() ### XXX BREAKPOINT # Likelihood object - t_distribution = student_t(deg_free, sigma=1) + t_distribution = student_t(deg_free, sigma=real_var) stu_t_likelihood = Laplace(Y, t_distribution) kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1]) @@ -77,12 +78,16 @@ def student_t_approx(): # optimize #m.optimize() - print(m) + #print(m) # plot m.plot() import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + m.optimize() + print(m) + + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return m diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 84128e3a..b002034d 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,7 +1,7 @@ import numpy as np import scipy as sp import GPy -from scipy.linalg import cholesky, eig, inv +from scipy.linalg import cholesky, eig, inv, det from functools import partial from 
GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot @@ -43,8 +43,10 @@ class Laplace(likelihood): self.Z = 0 self.YYT = None - def predictive_values(self,mu,var): - return self.likelihood_function.predictive_values(mu,var) + def predictive_values(self, mu, var, full_cov): + if full_cov: + raise NotImplementedError("Cannot make correlated predictions with an EP likelihood") + return self.likelihood_function.predictive_values(mu, var) def _get_params(self): return np.zeros(0) @@ -52,10 +54,10 @@ class Laplace(likelihood): def _get_param_names(self): return [] - def _set_params(self,p): + def _set_params(self, p): pass # TODO: Laplace likelihood might want to take some parameters... - def _gradients(self,partial): + def _gradients(self, partial): return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... raise NotImplementedError @@ -83,7 +85,13 @@ class Laplace(likelihood): and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ """ - self.Sigma_tilde_i = self.hess_hat_i #self.W #self.hess_hat_i - self.Ki + self.Sigma_tilde_i = self.W #self.hess_hat_i + #Check it isn't singular! + epsilon = 1e-2 + """ + if np.abs(det(self.Sigma_tilde_i)) < epsilon: + raise ValueError("inverse covariance must be non-singular to inverse!") + """ #Do we really need to inverse Sigma_tilde_i? :( if self.likelihood_function.log_concave: (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i) @@ -91,12 +99,17 @@ class Laplace(likelihood): self.Sigma_tilde = inv(self.Sigma_tilde_i) #f_hat? should be f but we must have optimized for them I guess? 
Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) - self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST - - 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat) - + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) - ) + #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST + #- 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat) + #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) + #) + Z_tilde = (self.ln_z_hat - self.NORMAL_CONST + + 0.5*self.log_hess_hat_det + + 0.5*mdot(self.f_hat, self.Ki , self.f_hat) + + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) + ) - self.Z = self.Z_tilde + self.Z = Z_tilde self.Y = Y_tilde self.covariance_matrix = self.Sigma_tilde self.precision = 1 / np.diag(self.Sigma_tilde)[:, None] @@ -128,7 +141,7 @@ class Laplace(likelihood): return np.squeeze(res) def obj_hess(f): - res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki) + res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) @@ -153,7 +166,10 @@ class Laplace(likelihood): #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) #Unsure whether its log_hess or log_hess_i - self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(self.f_hat.T, (self.Ki, self.f_hat)) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + self.ln_z_hat = (-0.5*self.log_hess_hat_det + - 0.5*self.log_Kdet + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat) + - mdot(self.f_hat.T, (self.Ki, self.f_hat)) + ) return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index c4823703..a299fe3a 100644 --- 
a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -81,11 +81,7 @@ class student_t(likelihood_function): Compute mean, and conficence interval (percentiles 5 and 95) of the prediction """ mean = np.exp(mu) - p_025 = stats.t.ppf(025,mean) - p_975 = stats.t.ppf(975,mean) - - #p_025 = tmp[:,0] - #p_975 = tmp[:,1] - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT - return mean,p_025,p_975 + p_025 = stats.t.ppf(.025, mean) + p_975 = stats.t.ppf(.975, mean) + return mean, np.nan*mean, p_025, p_975 From 7b0d0550cb01f0c4eca567e80f950e7f54ecb7b2 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 22 Mar 2013 12:50:47 +0000 Subject: [PATCH 11/71] Seemed to be working, now its not --- python/examples/laplace_approximations.py | 118 +++++++++++++--------- python/likelihoods/Laplace.py | 37 +++---- 2 files changed, 92 insertions(+), 63 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 6f2b19aa..5fb39e08 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -11,15 +11,22 @@ def student_t_approx(): Example of regressing with a student t likelihood """ #Start a function, any function - X = np.sort(np.random.uniform(0, 15, 100))[:, None] - Y = np.sin(X) + X = np.linspace(0.0, 10.0, 100)[:, None] + Y = np.sin(X) + np.random.randn(*X.shape)*0.1 + Yc = Y.copy() + + Y = Y/Y.max() + + Yc[10] += 5 + Yc[15] += 20 + Yc = Yc/Yc.max() #Add student t random noise to datapoints - deg_free = 100000.5 - real_var = 4 - t_rv = t(deg_free, loc=0, scale=real_var) - noise = t_rv.rvs(size=Y.shape) - Y += noise + deg_free = 1000000 #100000.5 + real_var = 0.1 + #t_rv = t(deg_free, loc=0, scale=real_var) + #noise = t_rvrvs(size=Y.shape) + #Y += noise #Add some extreme value noise to some of the datapoints #percent_corrupted = 0.15 @@ -30,64 +37,83 @@ def student_t_approx(): #print corrupted_indices #noise = 
t_rv.rvs(size=(len(corrupted_indices), 1)) #Y[corrupted_indices] += noise - + plt.figure(1) # Kernel object - print X.shape - kernel = GPy.kern.rbf(X.shape[1]) + kernel1 = GPy.kern.rbf(X.shape[1]) + kernel2 = kernel1.copy() + kernel3 = kernel1.copy() + kernel4 = kernel1.copy() - #A GP should completely break down due to the points as they get a lot of weight - # create simple GP model - #m = GPy.models.GP_regression(X, Y, kernel=kernel) - - ## optimize + #print "Clean Gaussian" + ##A GP should completely break down due to the points as they get a lot of weight + ## create simple GP model + #m = GPy.models.GP_regression(X, Y, kernel=kernel1) + ### optimize #m.ensure_default_constraints() + ##m.unconstrain('noise') + ##m.constrain_fixed('noise', 0.1) #m.optimize() ## plot - ##m.plot() + #plt.subplot(221) + #m.plot() #print m - #with a student t distribution, since it has heavy tails it should work well - likelihood_function = student_t(deg_free, sigma=real_var) - lap = Laplace(Y, likelihood_function) - cov = kernel.K(X) - lap.fit_full(cov) + ##Corrupt + #print "Corrupt Gaussian" + #m = GPy.models.GP_regression(X, Yc, kernel=kernel2) + #m.ensure_default_constraints() + ##m.unconstrain('noise') + ##m.constrain_fixed('noise', 0.1) + #m.optimize() + #plt.subplot(222) + #m.plot() + #print m - test_range = np.arange(0, 10, 0.1) - plt.plot(test_range, t_rv.pdf(test_range)) - for i in xrange(X.shape[0]): - mode = lap.f_hat[i] - covariance = lap.hess_hat_i[i,i] - scaling = np.exp(lap.ln_z_hat) - normalised_approx = norm(loc=mode, scale=covariance) - print "Normal with mode %f, and variance %f" % (mode, covariance) - plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) - plt.show() - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + ##with a student t distribution, since it has heavy tails it should work well + ##likelihood_function = student_t(deg_free, sigma=real_var) + ##lap = Laplace(Y, likelihood_function) + ##cov = kernel.K(X) + ##lap.fit_full(cov) + + 
##test_range = np.arange(0, 10, 0.1) + ##plt.plot(test_range, t_rv.pdf(test_range)) + ##for i in xrange(X.shape[0]): + ##mode = lap.f_hat[i] + ##covariance = lap.hess_hat_i[i,i] + ##scaling = np.exp(lap.ln_z_hat) + ##normalised_approx = norm(loc=mode, scale=covariance) + ##print "Normal with mode %f, and variance %f" % (mode, covariance) + ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) + ##plt.show() # Likelihood object - t_distribution = student_t(deg_free, sigma=real_var) + t_distribution = student_t(deg_free, sigma=np.sqrt(real_var)) stu_t_likelihood = Laplace(Y, t_distribution) - kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1]) - m = GPy.models.GP(X, stu_t_likelihood, kernel) + print "Clean student t" + m = GPy.models.GP(X, stu_t_likelihood, kernel3) m.ensure_default_constraints() - m.update_likelihood_approximation() - print "NEW MODEL" - print(m) - # optimize - #m.optimize() - #print(m) - - # plot - m.plot() - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT - m.optimize() print(m) + # plot + plt.subplot(211) + m.plot_f() + + print "Corrupt student t" + t_distribution = student_t(deg_free, sigma=np.sqrt(real_var)) + corrupt_stu_t_likelihood = Laplace(Yc, t_distribution) + m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(212) + m.plot_f() import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return m diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index b002034d..d86523d8 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -33,13 +33,15 @@ class Laplace(likelihood): #Inital values self.N, self.D = self.data.shape + self.is_heteroscedastic = True + self.Nparams = 0 self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi)) #Initial values for the GP variables - self.Y = np.zeros((self.N,1)) + self.Y = np.zeros((self.N, 1)) self.covariance_matrix = np.eye(self.N) 
- self.precision = np.ones(self.N)[:,None] + self.precision = np.ones(self.N)[:, None] self.Z = 0 self.YYT = None @@ -58,6 +60,7 @@ class Laplace(likelihood): pass # TODO: Laplace likelihood might want to take some parameters... def _gradients(self, partial): + #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... raise NotImplementedError @@ -88,10 +91,8 @@ class Laplace(likelihood): self.Sigma_tilde_i = self.W #self.hess_hat_i #Check it isn't singular! epsilon = 1e-2 - """ if np.abs(det(self.Sigma_tilde_i)) < epsilon: raise ValueError("inverse covariance must be non-singular to inverse!") - """ #Do we really need to inverse Sigma_tilde_i? :( if self.likelihood_function.log_concave: (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i) @@ -99,21 +100,17 @@ class Laplace(likelihood): self.Sigma_tilde = inv(self.Sigma_tilde_i) #f_hat? should be f but we must have optimized for them I guess? 
Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) - #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST - #- 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat) - #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) - #) Z_tilde = (self.ln_z_hat - self.NORMAL_CONST - + 0.5*self.log_hess_hat_det - + 0.5*mdot(self.f_hat, self.Ki , self.f_hat) - + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) + + 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat) + + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) + - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat)) ) self.Z = Z_tilde - self.Y = Y_tilde + self.Y = Y_tilde[:, None] + self.YYT = np.dot(self.Y, self.Y.T) self.covariance_matrix = self.Sigma_tilde self.precision = 1 / np.diag(self.Sigma_tilde)[:, None] - self.YYT = np.dot(self.Y, self.Y.T) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def fit_full(self, K): @@ -122,6 +119,7 @@ class Laplace(likelihood): For nomenclature see Rasmussen & Williams 2006 :K: Covariance matrix """ + self.K = K.copy() f = np.zeros((self.N, 1)) (self.Ki, _, _, self.log_Kdet) = pdinv(K) LOG_K_CONST = -(0.5 * self.log_Kdet) @@ -148,6 +146,11 @@ class Laplace(likelihood): #At this point get the hessian matrix self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat)) + if not self.likelihood_function.log_concave: + self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + #If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance + #To cause the posterior to become less certain than the prior and likelihood, + #This is a property only held by non-log-concave likelihoods self.hess_hat = self.Ki + self.W (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat) @@ -166,10 +169,10 @@ class Laplace(likelihood): #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) #Unsure whether its log_hess or log_hess_i - self.ln_z_hat = (-0.5*self.log_hess_hat_det - - 0.5*self.log_Kdet - -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat) - - mdot(self.f_hat.T, (self.Ki, self.f_hat)) + self.ln_z_hat = (- 0.5*self.log_hess_hat_det + + 0.5*self.log_Kdet + + self.likelihood_function.link_function(self.data[:,0], self.f_hat) + - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat)) ) return self._compute_GP_variables() From 15d5c2f22dff65a518a4f6a155e457a6516fca17 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 28 Mar 2013 17:42:42 +0000 Subject: [PATCH 12/71] Working laplace, just needs predictive values --- python/examples/laplace_approximations.py | 80 +++++++++++++---------- python/likelihoods/Laplace.py | 15 +++-- python/likelihoods/likelihood_function.py | 72 ++++++++++++++++++-- 3 files changed, 121 insertions(+), 46 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 5fb39e08..37681849 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -10,20 +10,23 @@ def student_t_approx(): """ Example of regressing with a student t likelihood """ + real_var = 0.1 #Start a function, any function - X = np.linspace(0.0, 10.0, 100)[:, None] - Y = np.sin(X) + np.random.randn(*X.shape)*0.1 + X = np.linspace(0.0, 10.0, 30)[:, None] + Y = np.sin(X) + np.random.randn(*X.shape)*real_var Yc = Y.copy() - Y = Y/Y.max() + 
#Y = Y/Y.max() - Yc[10] += 5 - Yc[15] += 20 - Yc = Yc/Yc.max() + #Yc[10] += 100 + Yc[25] += 10 + Yc[23] += 10 + Yc[24] += 10 + #Yc = Yc/Yc.max() #Add student t random noise to datapoints - deg_free = 1000000 #100000.5 - real_var = 0.1 + deg_free = 20 #100000.5 + real_sd = np.sqrt(real_var) #t_rv = t(deg_free, loc=0, scale=real_var) #noise = t_rvrvs(size=Y.shape) #Y += noise @@ -38,36 +41,37 @@ def student_t_approx(): #noise = t_rv.rvs(size=(len(corrupted_indices), 1)) #Y[corrupted_indices] += noise plt.figure(1) + plt.suptitle('Gaussian likelihood') # Kernel object kernel1 = GPy.kern.rbf(X.shape[1]) kernel2 = kernel1.copy() kernel3 = kernel1.copy() kernel4 = kernel1.copy() - #print "Clean Gaussian" - ##A GP should completely break down due to the points as they get a lot of weight - ## create simple GP model - #m = GPy.models.GP_regression(X, Y, kernel=kernel1) - ### optimize - #m.ensure_default_constraints() - ##m.unconstrain('noise') - ##m.constrain_fixed('noise', 0.1) - #m.optimize() - ## plot - #plt.subplot(221) - #m.plot() - #print m + print "Clean Gaussian" + #A GP should completely break down due to the points as they get a lot of weight + # create simple GP model + m = GPy.models.GP_regression(X, Y, kernel=kernel1) + ## optimize + m.ensure_default_constraints() + #m.unconstrain('noise') + #m.constrain_fixed('noise', 0.1) + m.optimize() + # plot + plt.subplot(211) + m.plot() + print m ##Corrupt - #print "Corrupt Gaussian" - #m = GPy.models.GP_regression(X, Yc, kernel=kernel2) - #m.ensure_default_constraints() - ##m.unconstrain('noise') - ##m.constrain_fixed('noise', 0.1) - #m.optimize() - #plt.subplot(222) - #m.plot() - #print m + print "Corrupt Gaussian" + m = GPy.models.GP_regression(X, Yc, kernel=kernel2) + m.ensure_default_constraints() + #m.unconstrain('noise') + #m.constrain_fixed('noise', 0.1) + m.optimize() + plt.subplot(212) + m.plot() + print m ##with a student t distribution, since it has heavy tails it should work well ##likelihood_function = 
student_t(deg_free, sigma=real_var) @@ -86,9 +90,13 @@ def student_t_approx(): ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) ##plt.show() + plt.figure(2) + plt.suptitle('Student-t likelihood') + edited_real_sd = real_sd + # Likelihood object - t_distribution = student_t(deg_free, sigma=np.sqrt(real_var)) - stu_t_likelihood = Laplace(Y, t_distribution) + t_distribution = student_t(deg_free, sigma=edited_real_sd) + stu_t_likelihood = Laplace(Yc, t_distribution) print "Clean student t" m = GPy.models.GP(X, stu_t_likelihood, kernel3) @@ -100,9 +108,11 @@ def student_t_approx(): # plot plt.subplot(211) m.plot_f() + plt.ylim(-2.5,2.5) + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print "Corrupt student t" - t_distribution = student_t(deg_free, sigma=np.sqrt(real_var)) + t_distribution = student_t(deg_free, sigma=edited_real_sd) corrupt_stu_t_likelihood = Laplace(Yc, t_distribution) m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) m.ensure_default_constraints() @@ -110,8 +120,8 @@ def student_t_approx(): m.optimize() print(m) plt.subplot(212) - m.plot_f() - + m.plot() + plt.ylim(-2.5,2.5) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return m diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index d86523d8..1411c22b 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -88,11 +88,12 @@ class Laplace(likelihood): and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ """ - self.Sigma_tilde_i = self.W #self.hess_hat_i + self.Sigma_tilde_i = self.W #Check it isn't singular! - epsilon = 1e-2 + epsilon = 1e-6 if np.abs(det(self.Sigma_tilde_i)) < epsilon: - raise ValueError("inverse covariance must be non-singular to inverse!") + print "WARNING: Transformed covariance matrix is signular!" + #raise ValueError("inverse covariance must be non-singular to invert!") #Do we really need to inverse Sigma_tilde_i? 
:( if self.likelihood_function.log_concave: (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i) @@ -110,8 +111,12 @@ class Laplace(likelihood): self.Y = Y_tilde[:, None] self.YYT = np.dot(self.Y, self.Y.T) self.covariance_matrix = self.Sigma_tilde - self.precision = 1 / np.diag(self.Sigma_tilde)[:, None] - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + #if not self.likelihood_function.log_concave: + #self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + ##If the likelihood is non-log-concave. We wan't to say that there is a negative variance + ##To cause the posterior to become less certain than the prior and likelihood, + ##This is a property only held by non-log-concave likelihoods + self.precision = 1 / np.diag(self.covariance_matrix)[:, None] def fit_full(self, K): """ diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index a299fe3a..7ac9c661 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -1,4 +1,5 @@ -from scipy.special import gammaln +from scipy.special import gammaln, gamma +from scipy import integrate import numpy as np from GPy.likelihoods.likelihood_functions import likelihood_function from scipy import stats @@ -79,9 +80,68 @@ class student_t(likelihood_function): def predictive_values(self, mu, var): """ Compute mean, and conficence interval (percentiles 5 and 95) of the prediction - """ - mean = np.exp(mu) - p_025 = stats.t.ppf(.025, mean) - p_975 = stats.t.ppf(.975, mean) - return mean, np.nan*mean, p_025, p_975 + Need to find what the variance is at the latent points for a student t*normal + (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2))) + +(((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2)) +*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2))) + """ + 
#p_025 = stats.t.ppf(.025, mu) + #p_975 = stats.t.ppf(.975, mu) + + num_test_points = mu.shape[0] + #Each mu is the latent point f* at the test point x*, + #and the var is the gaussian variance at this point + #Take lots of samples from this, so we have lots of possible values + #for latent point f* for each test point x* weighted by how likely we were to pick it + print "Taking %d samples of f*".format(num_test_points) + num_f_samples = 10 + num_y_samples = 10 + student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples)) + print "Student t means shape: ", student_t_means.shape + + #Now we have lots of f*, lets work out the likelihood of getting this by sampling + #from a student t centred on this point, sample many points from this distribution + #centred on f* + #for test_point, f in enumerate(student_t_means): + #print test_point + #print f.shape + #student_t_samples = stats.t.rvs(self.v, loc=f[:,None], + #scale=self.sigma, + #size=(num_f_samples, num_y_samples)) + #print student_t_samples.shape + + student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:,None], + scale=self.sigma, + size=(num_test_points, num_y_samples, num_f_samples)) + student_t_samples = np.reshape(student_t_samples, + (num_test_points, num_y_samples*num_f_samples)) + + #Now take the 97.5 and 0.25 percentile of these points + p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None] + p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None] + + p_025 = 1+p_025 + p_975 = 1+p_975 + + ##Alernenately we could sample from int p(y|f*)p(f*|x*) df* + def t_gaussian(f, mu, var): + return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5)) + * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2))) + ) + + def t_gauss_int(mu, var): + print "Mu: ", mu + print "var: ", var + result = integrate.quad(t_gaussian, -np.inf, 0.975, 
args=(mu, var)) + print "Result: ", result + return result[0] + + vec_t_gauss_int = np.vectorize(t_gauss_int) + + p_025 = vec_t_gauss_int(mu, var) + p_975 = vec_t_gauss_int(mu, var) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + return mu, np.nan*mu, p_025, p_975 From ffc168c1d20f36b1e72501176c4a7bb88ff41614 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 2 Apr 2013 12:33:01 +0100 Subject: [PATCH 13/71] Added predicted values for student t, works well --- python/examples/laplace_approximations.py | 48 +++++++++++------------ python/likelihoods/likelihood_function.py | 41 ++++++++++++++----- 2 files changed, 53 insertions(+), 36 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 37681849..6374a5fd 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -18,7 +18,7 @@ def student_t_approx(): #Y = Y/Y.max() - #Yc[10] += 100 + Yc[10] += 100 Yc[25] += 10 Yc[23] += 10 Yc[24] += 10 @@ -52,51 +52,30 @@ def student_t_approx(): #A GP should completely break down due to the points as they get a lot of weight # create simple GP model m = GPy.models.GP_regression(X, Y, kernel=kernel1) - ## optimize + # optimize m.ensure_default_constraints() - #m.unconstrain('noise') - #m.constrain_fixed('noise', 0.1) m.optimize() # plot plt.subplot(211) m.plot() print m - ##Corrupt + #Corrupt print "Corrupt Gaussian" m = GPy.models.GP_regression(X, Yc, kernel=kernel2) m.ensure_default_constraints() - #m.unconstrain('noise') - #m.constrain_fixed('noise', 0.1) m.optimize() plt.subplot(212) m.plot() print m - ##with a student t distribution, since it has heavy tails it should work well - ##likelihood_function = student_t(deg_free, sigma=real_var) - ##lap = Laplace(Y, likelihood_function) - ##cov = kernel.K(X) - ##lap.fit_full(cov) - - ##test_range = np.arange(0, 10, 0.1) - ##plt.plot(test_range, t_rv.pdf(test_range)) - ##for i in xrange(X.shape[0]): - ##mode = 
lap.f_hat[i] - ##covariance = lap.hess_hat_i[i,i] - ##scaling = np.exp(lap.ln_z_hat) - ##normalised_approx = norm(loc=mode, scale=covariance) - ##print "Normal with mode %f, and variance %f" % (mode, covariance) - ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) - ##plt.show() - plt.figure(2) plt.suptitle('Student-t likelihood') edited_real_sd = real_sd # Likelihood object t_distribution = student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = Laplace(Yc, t_distribution) + stu_t_likelihood = Laplace(Y, t_distribution) print "Clean student t" m = GPy.models.GP(X, stu_t_likelihood, kernel3) @@ -107,7 +86,7 @@ def student_t_approx(): print(m) # plot plt.subplot(211) - m.plot_f() + m.plot() plt.ylim(-2.5,2.5) #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT @@ -124,6 +103,23 @@ def student_t_approx(): plt.ylim(-2.5,2.5) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + ###with a student t distribution, since it has heavy tails it should work well + ###likelihood_function = student_t(deg_free, sigma=real_var) + ###lap = Laplace(Y, likelihood_function) + ###cov = kernel.K(X) + ###lap.fit_full(cov) + + ###test_range = np.arange(0, 10, 0.1) + ###plt.plot(test_range, t_rv.pdf(test_range)) + ###for i in xrange(X.shape[0]): + ###mode = lap.f_hat[i] + ###covariance = lap.hess_hat_i[i,i] + ###scaling = np.exp(lap.ln_z_hat) + ###normalised_approx = norm(loc=mode, scale=covariance) + ###print "Normal with mode %f, and variance %f" % (mode, covariance) + ###plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) + ###plt.show() + return m diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 7ac9c661..61b5c427 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -23,6 +23,10 @@ class student_t(likelihood_function): #FIXME: This should be in the superclass self.log_concave = False + @property + def variance(self): + return (self.v / float(self.v 
- 2)) * (self.sigma**2) + def link_function(self, y, f): """link_function $\ln p(y|f)$ $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ @@ -79,14 +83,32 @@ class student_t(likelihood_function): def predictive_values(self, mu, var): """ - Compute mean, and conficence interval (percentiles 5 and 95) of the prediction + Compute mean, and conficence interval (percentiles 5 and 95) of the prediction - Need to find what the variance is at the latent points for a student t*normal - (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2))) + Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*) + (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2)) + *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2))) -(((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2)) -*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2))) """ + + #We want the variance around test points y which comes from int p(y*|f*)p(f*) df* + #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)] + #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this + #Which was also given to us as (var) + #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution + #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom + true_var = var + self.variance + + #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now + #need the 95 and 5 percentiles. 
+ #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles + p_025 = mu - 2.*true_var + p_975 = mu + 2.*true_var + + return mu, np.nan*mu, p_025, p_975 + + def sample_predicted_values(self, mu, var): + """ Experimental sample approches and numerical integration """ #p_025 = stats.t.ppf(.025, mu) #p_975 = stats.t.ppf(.975, mu) @@ -134,14 +156,13 @@ class student_t(likelihood_function): def t_gauss_int(mu, var): print "Mu: ", mu print "var: ", var - result = integrate.quad(t_gaussian, -np.inf, 0.975, args=(mu, var)) + result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var)) print "Result: ", result return result[0] vec_t_gauss_int = np.vectorize(t_gauss_int) - p_025 = vec_t_gauss_int(mu, var) - p_975 = vec_t_gauss_int(mu, var) + p = vec_t_gauss_int(mu, var) + p_025 = mu - p + p_975 = mu + p import ipdb; ipdb.set_trace() ### XXX BREAKPOINT - - return mu, np.nan*mu, p_025, p_975 From afa5b1f9561189b3774a895b765d708186c10f5c Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 2 Apr 2013 12:39:57 +0100 Subject: [PATCH 14/71] Tidying up --- python/likelihoods/likelihood_function.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 61b5c427..50f9b620 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -88,7 +88,6 @@ class student_t(likelihood_function): Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*) (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2)) *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2))) - """ #We want the variance around test points y which comes from int p(y*|f*)p(f*) df* @@ -144,9 +143,6 @@ class student_t(likelihood_function): p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None] p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None] - p_025 = 1+p_025 - p_975 = 
1+p_975 - ##Alernenately we could sample from int p(y|f*)p(f*|x*) df* def t_gaussian(f, mu, var): return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5)) From 0312f319ad4eef37f0c173120d80cc373d149519 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 2 Apr 2013 20:00:31 +0100 Subject: [PATCH 15/71] Still working on rasmussen, link function needs vectorizing I think --- python/examples/laplace_approximations.py | 58 ++++++--- python/likelihoods/Laplace.py | 137 ++++++++++++++++------ python/likelihoods/likelihood_function.py | 13 +- 3 files changed, 154 insertions(+), 54 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 6374a5fd..a1c71c71 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -16,6 +16,9 @@ def student_t_approx(): Y = np.sin(X) + np.random.randn(*X.shape)*real_var Yc = Y.copy() + X_full = np.linspace(0.0, 10.0, 500)[:, None] + Y_full = np.sin(X_full) + #Y = Y/Y.max() Yc[10] += 100 @@ -25,7 +28,7 @@ def student_t_approx(): #Yc = Yc/Yc.max() #Add student t random noise to datapoints - deg_free = 20 #100000.5 + deg_free = 10 real_sd = np.sqrt(real_var) #t_rv = t(deg_free, loc=0, scale=real_var) #noise = t_rvrvs(size=Y.shape) @@ -47,6 +50,8 @@ def student_t_approx(): kernel2 = kernel1.copy() kernel3 = kernel1.copy() kernel4 = kernel1.copy() + kernel5 = kernel1.copy() + kernel6 = kernel1.copy() print "Clean Gaussian" #A GP should completely break down due to the points as they get a lot of weight @@ -58,6 +63,7 @@ def student_t_approx(): # plot plt.subplot(211) m.plot() + plt.plot(X_full, Y_full) print m #Corrupt @@ -67,40 +73,64 @@ def student_t_approx(): m.optimize() plt.subplot(212) m.plot() + plt.plot(X_full, Y_full) print m plt.figure(2) plt.suptitle('Student-t likelihood') edited_real_sd = real_sd - # Likelihood object + print "Clean 
student t, ncg" t_distribution = student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = Laplace(Y, t_distribution) - - print "Clean student t" + stu_t_likelihood = Laplace(Y, t_distribution, rasm=False) m = GPy.models.GP(X, stu_t_likelihood, kernel3) m.ensure_default_constraints() m.update_likelihood_approximation() - # optimize m.optimize() print(m) - # plot - plt.subplot(211) + plt.subplot(221) m.plot() - plt.ylim(-2.5,2.5) - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) - print "Corrupt student t" + print "Corrupt student t, ncg" t_distribution = student_t(deg_free, sigma=edited_real_sd) - corrupt_stu_t_likelihood = Laplace(Yc, t_distribution) + corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False) + m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(223) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + + print "Clean student t, rasm" + t_distribution = student_t(deg_free, sigma=edited_real_sd) + stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True) + m = GPy.models.GP(X, stu_t_likelihood, kernel6) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(222) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + + print "Corrupt student t, rasm" + t_distribution = student_t(deg_free, sigma=edited_real_sd) + corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) m.ensure_default_constraints() m.update_likelihood_approximation() m.optimize() print(m) - plt.subplot(212) + plt.subplot(224) m.plot() - plt.ylim(-2.5,2.5) + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT ###with a student t distribution, since it has heavy tails it should work well diff --git 
a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 1411c22b..8eb69869 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,16 +1,15 @@ import numpy as np import scipy as sp import GPy -from scipy.linalg import cholesky, eig, inv, det -from functools import partial +from scipy.linalg import cholesky, eig, inv, det, cho_solve from GPy.likelihoods.likelihood import likelihood -from GPy.util.linalg import pdinv,mdot +from GPy.util.linalg import pdinv, mdot, jitchol #import numpy.testing.assert_array_equal class Laplace(likelihood): """Laplace approximation to a posterior""" - def __init__(self, data, likelihood_function): + def __init__(self, data, likelihood_function, rasm=True): """ Laplace Approximation @@ -30,6 +29,7 @@ class Laplace(likelihood): """ self.data = data self.likelihood_function = likelihood_function + self.rasm = rasm #Inital values self.N, self.D = self.data.shape @@ -102,20 +102,16 @@ class Laplace(likelihood): #f_hat? should be f but we must have optimized for them I guess? Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) Z_tilde = (self.ln_z_hat - self.NORMAL_CONST - + 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat) + + 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat)) + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat)) ) - self.Z = Z_tilde - self.Y = Y_tilde[:, None] + #Convert to float as its (1, 1) and Z must be a scalar + self.Z = np.float64(Z_tilde) + self.Y = Y_tilde self.YYT = np.dot(self.Y, self.Y.T) self.covariance_matrix = self.Sigma_tilde - #if not self.likelihood_function.log_concave: - #self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur - ##If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance - ##To cause the posterior to become less certain than the prior and likelihood, - ##This is a property only held by non-log-concave likelihoods self.precision = 1 / np.diag(self.covariance_matrix)[:, None] def fit_full(self, K): @@ -125,32 +121,15 @@ class Laplace(likelihood): :K: Covariance matrix """ self.K = K.copy() - f = np.zeros((self.N, 1)) - (self.Ki, _, _, self.log_Kdet) = pdinv(K) - LOG_K_CONST = -(0.5 * self.log_Kdet) - OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST - #Find \hat(f) using a newton raphson optimizer for example - #TODO: Add newton-raphson as subclass of optimizer class - - #FIXME: Can we get rid of this horrible reshaping? - def obj(f): - #f = f[:, None] - res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST) - return float(res) - - def obj_grad(f): - #f = f[:, None] - res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f)) - return np.squeeze(res) - - def obj_hess(f): - res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki) - return np.squeeze(res) - - self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) + self.Ki, _, _, self.log_Kdet = pdinv(K) + if self.rasm: + self.f_hat = self.rasm_mode(K) + else: + self.f_hat = self.ncg_mode(K) #At this point get the hessian matrix - self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat)) + self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat)) + if not self.likelihood_function.log_concave: self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur #If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance @@ -176,8 +155,92 @@ class Laplace(likelihood): #Unsure whether its log_hess or log_hess_i self.ln_z_hat = (- 0.5*self.log_hess_hat_det + 0.5*self.log_Kdet - + self.likelihood_function.link_function(self.data[:,0], self.f_hat) + + self.likelihood_function.link_function(self.data, self.f_hat) + #+ self.likelihood_function.link_function(self.data, self.f_hat) - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat)) ) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return self._compute_GP_variables() + + def ncg_mode(self, K): + """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative) + :K: Covariance matrix + :returns: f_mode + """ + self.K = K.copy() + f = np.zeros((self.N, 1)) + (self.Ki, _, _, self.log_Kdet) = pdinv(K) + LOG_K_CONST = -(0.5 * self.log_Kdet) + + #FIXME: Can we get rid of this horrible reshaping? + def obj(f): + res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + + self.NORMAL_CONST + LOG_K_CONST) + return float(res) + + def obj_grad(f): + res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f)) + return np.squeeze(res) + + def obj_hess(f): + res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki) + return np.squeeze(res) + + f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) + return f_hat[:, None] + + def rasm_mode(self, K): + """ + Rasmussens numerically stable mode finding + For nomenclature see Rasmussen & Williams 2006 + + :K: Covariance matrix + :returns: f_mode + """ + f = np.zeros((self.N, 1)) + new_obj = -np.inf + old_obj = np.inf + + def obj(a, f): + #Careful of shape of data! 
+ return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f) + + difference = np.inf + epsilon = 1e-16 + step_size = 1 + while difference > epsilon: + W = -np.diag(self.likelihood_function.link_hess(self.data, f)) + if not self.likelihood_function.log_concave: + #if np.any(W < 0): + #print "NEGATIVE VALUES :(" + #pass + W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + #If the likelihood is non-log-concave. We wan't to say that there is a negative variance + #To cause the posterior to become less certain than the prior and likelihood, + #This is a property only held by non-log-concave likelihoods + #W is diagnoal so its sqrt is just the sqrt of the diagonal elements + W_12 = np.sqrt(W) + B = np.eye(self.N) + mdot(W_12, K, W_12) + L = jitchol(B) + b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)) + #TODO: Check L is lower + solve_L = cho_solve((L, True), mdot(W_12, (K, b))) + a = b - mdot(W_12, solve_L) + f = np.dot(K, a) + old_obj = new_obj + new_obj = obj(a, f) + difference = new_obj - old_obj + #print "Difference: ", new_obj - old_obj + if difference < 0: + #If the objective function isn't rising, restart optimization + print "Reducing step-size, restarting" + #objective function isn't increasing, try reducing step size + step_size *= 0.9 + f = np.zeros((self.N, 1)) + new_obj = -np.inf + old_obj = np.inf + + difference = abs(difference) + + return f diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 50f9b620..15859a81 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -36,7 +36,10 @@ class student_t(likelihood_function): :returns: float(likelihood evaluated for this point) """ + y = np.squeeze(y) + f = np.squeeze(f) assert y.shape == f.shape + e = y - f objective = (gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) @@ -44,6 +47,7 @@ class 
student_t(likelihood_function): - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v)) ) + print (e**2).shape return np.sum(objective) def link_grad(self, y, f): @@ -57,10 +61,12 @@ class student_t(likelihood_function): :returns: gradient of likelihood evaluated at points """ + y = np.squeeze(y) + f = np.squeeze(f) assert y.shape == f.shape e = y - f grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) - return grad + return np.squeeze(grad) def link_hess(self, y, f): """ @@ -75,11 +81,12 @@ class student_t(likelihood_function): :f: latent variables f :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ + y = np.squeeze(y) + f = np.squeeze(f) assert y.shape == f.shape e = y - f - #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) - return hess + return np.squeeze(hess) def predictive_values(self, mu, var): """ From 2006a94caa859d195a7c2af1236eb84656b68cfc Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 3 Apr 2013 10:55:58 +0100 Subject: [PATCH 16/71] Fixed broadcasting bug, rasm now appears to work --- python/likelihoods/Laplace.py | 16 ++++++++++------ python/likelihoods/likelihood_function.py | 1 - 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 8eb69869..e967a743 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -159,7 +159,6 @@ class Laplace(likelihood): #+ self.likelihood_function.link_function(self.data, self.f_hat) - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat)) ) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return self._compute_GP_variables() @@ -190,7 +189,7 @@ class Laplace(likelihood): f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) return f_hat[:, None] - def rasm_mode(self, K): + def rasm_mode(self, K, 
MAX_ITER=5000, MAX_RESTART=30): """ Rasmussens numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -209,7 +208,9 @@ class Laplace(likelihood): difference = np.inf epsilon = 1e-16 step_size = 1 - while difference > epsilon: + rs = 0 + i = 0 + while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: W = -np.diag(self.likelihood_function.link_hess(self.data, f)) if not self.likelihood_function.log_concave: #if np.any(W < 0): @@ -223,7 +224,7 @@ class Laplace(likelihood): W_12 = np.sqrt(W) B = np.eye(self.N) + mdot(W_12, K, W_12) L = jitchol(B) - b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)) + b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)[:, None]) #TODO: Check L is lower solve_L = cho_solve((L, True), mdot(W_12, (K, b))) a = b - mdot(W_12, solve_L) @@ -234,13 +235,16 @@ class Laplace(likelihood): #print "Difference: ", new_obj - old_obj if difference < 0: #If the objective function isn't rising, restart optimization - print "Reducing step-size, restarting" - #objective function isn't increasing, try reducing step size step_size *= 0.9 + print "Objective function rose" + print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) + #objective function isn't increasing, try reducing step size f = np.zeros((self.N, 1)) new_obj = -np.inf old_obj = np.inf + rs += 1 difference = abs(difference) + i += 1 return f diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 15859a81..49174ce7 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -47,7 +47,6 @@ class student_t(likelihood_function): - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v)) ) - print (e**2).shape return np.sum(objective) def link_grad(self, y, f): From 4a14a82dfba4bd3c48d4175bb8a861bab24a0d10 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 5 Apr 2013 
17:34:11 +0100 Subject: [PATCH 17/71] Got the mode finding without computing Ki --- python/examples/laplace_approximations.py | 85 +++++++++----- python/likelihoods/Laplace.py | 130 ++++++++++++++++------ 2 files changed, 152 insertions(+), 63 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index a1c71c71..7ab26406 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -6,6 +6,38 @@ from coxGP.python.likelihoods.Laplace import Laplace from coxGP.python.likelihoods.likelihood_function import student_t +def timing(): + real_var = 0.1 + times = 1000 + deg_free = 10 + real_sd = np.sqrt(real_var) + the_is = np.zeros(times) + X = np.linspace(0.0, 10.0, 30)[:, None] + for a in xrange(times): + Y = np.sin(X) + np.random.randn(*X.shape)*real_var + Yc = Y.copy() + + Yc[10] += 100 + Yc[25] += 10 + Yc[23] += 10 + Yc[24] += 10 + + edited_real_sd = real_sd + kernel1 = GPy.kern.rbf(X.shape[1]) + + t_distribution = student_t(deg_free, sigma=edited_real_sd) + corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True) + m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + the_is[a] = m.likelihood.i + + print the_is + print np.mean(the_is) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + def student_t_approx(): """ Example of regressing with a student t likelihood @@ -80,32 +112,6 @@ def student_t_approx(): plt.suptitle('Student-t likelihood') edited_real_sd = real_sd - print "Clean student t, ncg" - t_distribution = student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = Laplace(Y, t_distribution, rasm=False) - m = GPy.models.GP(X, stu_t_likelihood, kernel3) - m.ensure_default_constraints() - m.update_likelihood_approximation() - m.optimize() - print(m) - plt.subplot(221) - m.plot() - plt.plot(X_full, Y_full) - plt.ylim(-2.5, 2.5) - - print "Corrupt 
student t, ncg" - t_distribution = student_t(deg_free, sigma=edited_real_sd) - corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False) - m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) - m.ensure_default_constraints() - m.update_likelihood_approximation() - m.optimize() - print(m) - plt.subplot(223) - m.plot() - plt.plot(X_full, Y_full) - plt.ylim(-2.5, 2.5) - print "Clean student t, rasm" t_distribution = student_t(deg_free, sigma=edited_real_sd) stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True) @@ -133,6 +139,33 @@ def student_t_approx(): plt.ylim(-2.5, 2.5) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + print "Clean student t, ncg" + t_distribution = student_t(deg_free, sigma=edited_real_sd) + stu_t_likelihood = Laplace(Y, t_distribution, rasm=False) + m = GPy.models.GP(X, stu_t_likelihood, kernel3) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(221) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + + print "Corrupt student t, ncg" + t_distribution = student_t(deg_free, sigma=edited_real_sd) + corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False) + m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(223) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + + ###with a student t distribution, since it has heavy tails it should work well ###likelihood_function = student_t(deg_free, sigma=real_var) ###lap = Laplace(Y, likelihood_function) diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index e967a743..396a0bc7 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -100,12 +100,19 @@ class Laplace(likelihood): else: self.Sigma_tilde = inv(self.Sigma_tilde_i) #f_hat? should be f but we must have optimized for them I guess? 
- Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) - Z_tilde = (self.ln_z_hat - self.NORMAL_CONST - + 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat)) - + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) - - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat)) - ) + #Y_tilde = mdot(self.Sigma_tilde, self.hess_hat_i, self.f_hat) + Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat) + #KW = np.dot(self.K, self.W) + #KW_i, _, _, _ = pdinv(KW) + #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat) + #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST + #+ 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat)) + #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) + #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat)) + #) + _, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12)) + f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat) + Z_tilde = -self.NORMAL_CONST + self.ln_z_hat -0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) @@ -121,7 +128,7 @@ class Laplace(likelihood): :K: Covariance matrix """ self.K = K.copy() - self.Ki, _, _, self.log_Kdet = pdinv(K) + self.Ki, _, _, log_Kdet = pdinv(K) if self.rasm: self.f_hat = self.rasm_mode(K) else: @@ -135,33 +142,64 @@ class Laplace(likelihood): #If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods - self.hess_hat = self.Ki + self.W - (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat) + #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though + self.B, L, self.W_12 = self._compute_B_statistics(K, self.W) + self.Bi, _, _, B_det = pdinv(self.B) + #ln_W_det = np.linalg.det(self.W) + #ln_B_det = np.linalg.det(self.B) + ln_det = np.linalg.det(np.eye(self.N) - mdot(self.W_12, self.Bi, self.W_12, K)) + b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None] + #TODO: Check L is lower + solve_L = cho_solve((L, True), mdot(self.W_12, (K, b))) + a = b - mdot(self.W_12, solve_L) + self.f_Ki_f = np.dot(self.f_hat.T, a) - #Check hess_hat is positive definite - try: - cholesky(self.hess_hat) - except: - raise ValueError("Must be positive definite") + #self.hess_hat = self.Ki + self.W + #(self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat) - #Check its eigenvalues are positive - eigenvalues = eig(self.hess_hat) - if not np.all(eigenvalues > 0): - raise ValueError("Eigen values not positive") + ##Check hess_hat is positive definite + #try: + #cholesky(self.hess_hat) + #except: + #raise ValueError("Must be positive definite") + + ##Check its eigenvalues are positive + #eigenvalues = eig(self.hess_hat) + #if not np.all(eigenvalues > 0): + #raise ValueError("Eigen values not positive") #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) #Unsure whether its log_hess or log_hess_i - self.ln_z_hat = (- 0.5*self.log_hess_hat_det - + 
0.5*self.log_Kdet - + self.likelihood_function.link_function(self.data, self.f_hat) + #self.ln_z_hat = (- 0.5*self.log_hess_hat_i_det + #+ 0.5*self.log_Kdet #+ self.likelihood_function.link_function(self.data, self.f_hat) - - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat)) + ##+ self.likelihood_function.link_function(self.data, self.f_hat) + #- 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat)) + #) + self.ln_z_hat = (- 0.5*log_Kdet + - 0.5*self.f_Ki_f + + self.likelihood_function.link_function(self.data, self.f_hat) + + 0.5*ln_det ) return self._compute_GP_variables() + def _compute_B_statistics(self, K, W): + """Rasmussen suggests the use of a numerically stable positive definite matrix B + Which has a positive diagonal element and can be easyily inverted + + :K: Covariance matrix + :W: Negative hessian at a point (diagonal matrix) + :returns: (B, L) + """ + #W is diagnoal so its sqrt is just the sqrt of the diagonal elements + W_12 = np.sqrt(W) + B = np.eye(K.shape[0]) + mdot(W_12, K, W_12) + L = jitchol(B) + return (B, L, W_12) + def ncg_mode(self, K): """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative) :K: Covariance matrix @@ -189,7 +227,7 @@ class Laplace(likelihood): f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) return f_hat[:, None] - def rasm_mode(self, K, MAX_ITER=5000, MAX_RESTART=30): + def rasm_mode(self, K, MAX_ITER=5000000000000000, MAX_RESTART=30): """ Rasmussens numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -206,11 +244,12 @@ class Laplace(likelihood): return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f) difference = np.inf - epsilon = 1e-16 + epsilon = 1e-6 step_size = 1 rs = 0 i = 0 - while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: + while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART: + f_old = f.copy() W = -np.diag(self.likelihood_function.link_hess(self.data, f)) if not 
self.likelihood_function.log_concave: #if np.any(W < 0): @@ -220,31 +259,48 @@ class Laplace(likelihood): #If the likelihood is non-log-concave. We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods - #W is diagnoal so its sqrt is just the sqrt of the diagonal elements - W_12 = np.sqrt(W) - B = np.eye(self.N) + mdot(W_12, K, W_12) - L = jitchol(B) - b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)[:, None]) + B, L, W_12 = self._compute_B_statistics(K, W) + + W_f = np.dot(W, f) + grad = self.likelihood_function.link_grad(self.data, f)[:, None] + #Find K_i_f + b = W_f + grad + #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None] #TODO: Check L is lower solve_L = cho_solve((L, True), mdot(W_12, (K, b))) a = b - mdot(W_12, solve_L) - f = np.dot(K, a) + #f = np.dot(K, a) + + #a should be equal to Ki*f now so should be able to use it + c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad) + solve_L = cho_solve((L, True), mdot(W_12, c)) + f = c - mdot(K, W_12, solve_L) + + #K_w_f = mdot(K, (W, f)) + #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f + #d = f + K_w_f + c + #solve_L = cho_solve((L, True), mdot(W_12, d)) + #f = c - mdot(K, (W_12, solve_L)) + #a = mdot(self.Ki, f) + + tmp_old_obj = old_obj old_obj = new_obj new_obj = obj(a, f) difference = new_obj - old_obj - #print "Difference: ", new_obj - old_obj + #print "Difference: ", difference if difference < 0: + #print "Objective function rose", difference #If the objective function isn't rising, restart optimization step_size *= 0.9 - print "Objective function rose" - print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) + #print "Reducing step-size to {ss:.3} and restarting 
optimization".format(ss=step_size) #objective function isn't increasing, try reducing step size - f = np.zeros((self.N, 1)) - new_obj = -np.inf - old_obj = np.inf + #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode + old_obj = tmp_old_obj rs += 1 difference = abs(difference) i += 1 + self.i = i + print "{i} steps".format(i=i) return f From 31d8faecf866307c69dcade761ddb77d628b773e Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 5 Apr 2013 17:56:02 +0100 Subject: [PATCH 18/71] Added timing and realised mdot can be faster as its almost always a diagonal matrix its multiplying with --- python/examples/laplace_approximations.py | 9 +++++--- python/likelihoods/Laplace.py | 25 ++++++++++++++--------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 7ab26406..28a92c61 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -8,11 +8,12 @@ from coxGP.python.likelihoods.likelihood_function import student_t def timing(): real_var = 0.1 - times = 1000 + times = 1 deg_free = 10 real_sd = np.sqrt(real_var) the_is = np.zeros(times) - X = np.linspace(0.0, 10.0, 30)[:, None] + X = np.linspace(0.0, 10.0, 500)[:, None] + for a in xrange(times): Y = np.sin(X) + np.random.randn(*X.shape)*real_var Yc = Y.copy() @@ -21,6 +22,8 @@ def timing(): Yc[25] += 10 Yc[23] += 10 Yc[24] += 10 + Yc[300] += 10 + Yc[400] += 10000 edited_real_sd = real_sd kernel1 = GPy.kern.rbf(X.shape[1]) @@ -33,9 +36,9 @@ def timing(): m.optimize() the_is[a] = m.likelihood.i + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print the_is print np.mean(the_is) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def student_t_approx(): diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 396a0bc7..734bf6c8 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ 
-128,7 +128,9 @@ class Laplace(likelihood): :K: Covariance matrix """ self.K = K.copy() - self.Ki, _, _, log_Kdet = pdinv(K) + print "Inverting K" + #self.Ki, _, _, log_Kdet = pdinv(K) + print "K inverted, optimising" if self.rasm: self.f_hat = self.rasm_mode(K) else: @@ -196,6 +198,7 @@ class Laplace(likelihood): """ #W is diagnoal so its sqrt is just the sqrt of the diagonal elements W_12 = np.sqrt(W) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT B = np.eye(K.shape[0]) + mdot(W_12, K, W_12) L = jitchol(B) return (B, L, W_12) @@ -205,9 +208,7 @@ class Laplace(likelihood): :K: Covariance matrix :returns: f_mode """ - self.K = K.copy() f = np.zeros((self.N, 1)) - (self.Ki, _, _, self.log_Kdet) = pdinv(K) LOG_K_CONST = -(0.5 * self.log_Kdet) #FIXME: Can we get rid of this horrible reshaping? @@ -227,7 +228,7 @@ class Laplace(likelihood): f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) return f_hat[:, None] - def rasm_mode(self, K, MAX_ITER=5000000000000000, MAX_RESTART=30): + def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50): """ Rasmussens numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -249,6 +250,7 @@ class Laplace(likelihood): rs = 0 i = 0 while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART: + print "optimising" f_old = f.copy() W = -np.diag(self.likelihood_function.link_hess(self.data, f)) if not self.likelihood_function.log_concave: @@ -259,22 +261,25 @@ class Laplace(likelihood): #If the likelihood is non-log-concave. We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods + print "Decomposing" B, L, W_12 = self._compute_B_statistics(K, W) + print "Finding f" - W_f = np.dot(W, f) + W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal! 
grad = self.likelihood_function.link_grad(self.data, f)[:, None] #Find K_i_f b = W_f + grad #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None] #TODO: Check L is lower - solve_L = cho_solve((L, True), mdot(W_12, (K, b))) - a = b - mdot(W_12, solve_L) + + solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal! + a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal! #f = np.dot(K, a) #a should be equal to Ki*f now so should be able to use it c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad) - solve_L = cho_solve((L, True), mdot(W_12, c)) - f = c - mdot(K, W_12, solve_L) + solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal! + f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal! #K_w_f = mdot(K, (W, f)) #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f @@ -302,5 +307,5 @@ class Laplace(likelihood): i += 1 self.i = i - print "{i} steps".format(i=i) + #print "{i} steps".format(i=i) return f From 431f93ef231875aeb6adbe6be2c70ea807aafdce Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 8 Apr 2013 18:09:07 +0100 Subject: [PATCH 19/71] Stabalised most of the algorithm (apart from the end inversion which is impossible) --- python/likelihoods/Laplace.py | 132 ++++++++++++++++++---------------- 1 file changed, 72 insertions(+), 60 deletions(-) diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 734bf6c8..77359769 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -3,9 +3,15 @@ import scipy as sp import GPy from scipy.linalg import cholesky, eig, inv, det, cho_solve from GPy.likelihoods.likelihood import likelihood -from GPy.util.linalg import pdinv, mdot, jitchol +from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv +from scipy.linalg.lapack import dtrtrs #import 
numpy.testing.assert_array_equal +#TODO: Move this to utils +def det_ln_diag(A): + return np.log(np.diagonal(A)).sum() + + class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -60,7 +66,6 @@ class Laplace(likelihood): pass # TODO: Laplace likelihood might want to take some parameters... def _gradients(self, partial): - #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... raise NotImplementedError @@ -99,9 +104,26 @@ class Laplace(likelihood): (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i) else: self.Sigma_tilde = inv(self.Sigma_tilde_i) - #f_hat? should be f but we must have optimized for them I guess? - #Y_tilde = mdot(self.Sigma_tilde, self.hess_hat_i, self.f_hat) Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat) + + #dtritri -> L -> L_i + #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i + #((L.T*w)_i + I)f_hat = y_tilde + L = jitchol(self.K) + Li = chol_inv(L) + Lt_W = np.dot(L.T, self.W) + if np.abs(det(Lt_W)) < epsilon: + print "WARNING: Transformed covariance matrix is signular!" + Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0] + Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + #if np.abs(det(KW)) < epsilon: + #print "WARNING: Transformed covariance matrix is signular!" 
+ #KW_i = inv(KW) + #Y_tilde = mdot(KW_i + np.eye(self.N), self.f_hat) + + #Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat) #KW = np.dot(self.K, self.W) #KW_i, _, _, _ = pdinv(KW) #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat) @@ -110,16 +132,38 @@ class Laplace(likelihood): #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat)) #) - _, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12)) - f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat) - Z_tilde = -self.NORMAL_CONST + self.ln_z_hat -0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f + #_, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12)) + #f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat) + #Z_tilde = -self.NORMAL_CONST + self.ln_z_hat -0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f + + #f_W_f = mdot(self.f_hat.T, self.W, self.f_hat) + #f_Y_f = mdot(Y_tilde, self.W, Y_tilde) + #Z_tilde = (np.dot(self.W, self.f_hat) - 0.5*y_W_y + self.ln_z_hat + #- 0.5*mdot(self.f_hat, ( + + f_Ki_W_f = mdot(self.f_hat.T, (self.Ki + self.W), self.f_hat) + y_W_f = mdot(Y_tilde.T, self.W, self.f_hat) + y_W_y = mdot(Y_tilde.T, self.W, Y_tilde) + self.ln_W_det = det_ln_diag(self.W) + Z_tilde = (self.NORMAL_CONST + - 0.5*self.ln_K_det + - 0.5*self.ln_W_det + - 0.5*self.ln_Ki_W_i_det + - 0.5*f_Ki_W_f + - 0.5*y_W_y + + y_W_f + + self.ln_z_hat + ) + + Sigma_tilde = inv(self.W) # Damn #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) self.Y = Y_tilde self.YYT = np.dot(self.Y, self.Y.T) - self.covariance_matrix = self.Sigma_tilde + self.covariance_matrix = Sigma_tilde self.precision = 1 / np.diag(self.covariance_matrix)[:, None] + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def fit_full(self, K): """ @@ -128,9 +172,7 @@ class Laplace(likelihood): :K: Covariance matrix """ self.K = K.copy() - print "Inverting K" - #self.Ki, _, _, log_Kdet = pdinv(K) - print "K inverted, 
optimising" + self.Ki, _, _, self.ln_K_det = pdinv(K) if self.rasm: self.f_hat = self.rasm_mode(K) else: @@ -144,46 +186,24 @@ class Laplace(likelihood): #If the likelihood is non-log-concave. We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods + #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though - self.B, L, self.W_12 = self._compute_B_statistics(K, self.W) + self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W) self.Bi, _, _, B_det = pdinv(self.B) - #ln_W_det = np.linalg.det(self.W) - #ln_B_det = np.linalg.det(self.B) - ln_det = np.linalg.det(np.eye(self.N) - mdot(self.W_12, self.Bi, self.W_12, K)) + + Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K) + self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i) + b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None] - #TODO: Check L is lower - solve_L = cho_solve((L, True), mdot(self.W_12, (K, b))) - a = b - mdot(self.W_12, solve_L) + solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b))) + a = b - mdot(self.W_12, solve_chol) self.f_Ki_f = np.dot(self.f_hat.T, a) - #self.hess_hat = self.Ki + self.W - #(self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat) - - ##Check hess_hat is positive definite - #try: - #cholesky(self.hess_hat) - #except: - #raise ValueError("Must be positive definite") - - ##Check its eigenvalues are positive - #eigenvalues = eig(self.hess_hat) - #if not np.all(eigenvalues > 0): - #raise ValueError("Eigen values not positive") - - #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to - #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode - #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln 
p(y_n|f_n) - #Unsure whether its log_hess or log_hess_i - #self.ln_z_hat = (- 0.5*self.log_hess_hat_i_det - #+ 0.5*self.log_Kdet - #+ self.likelihood_function.link_function(self.data, self.f_hat) - ##+ self.likelihood_function.link_function(self.data, self.f_hat) - #- 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat)) - #) - self.ln_z_hat = (- 0.5*log_Kdet + self.ln_z_hat = ( self.NORMAL_CONST - 0.5*self.f_Ki_f + - 0.5*self.ln_K_det + + 0.5*self.ln_Ki_W_i_det + self.likelihood_function.link_function(self.data, self.f_hat) - + 0.5*ln_det ) return self._compute_GP_variables() @@ -198,7 +218,7 @@ class Laplace(likelihood): """ #W is diagnoal so its sqrt is just the sqrt of the diagonal elements W_12 = np.sqrt(W) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT B = np.eye(K.shape[0]) + mdot(W_12, K, W_12) L = jitchol(B) return (B, L, W_12) @@ -209,12 +229,12 @@ class Laplace(likelihood): :returns: f_mode """ f = np.zeros((self.N, 1)) - LOG_K_CONST = -(0.5 * self.log_Kdet) #FIXME: Can we get rid of this horrible reshaping? + #ONLY WORKS FOR 1D DATA def obj(f): res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) - + self.NORMAL_CONST + LOG_K_CONST) + + self.NORMAL_CONST) return float(res) def obj_grad(f): @@ -249,21 +269,15 @@ class Laplace(likelihood): step_size = 1 rs = 0 i = 0 - while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART: - print "optimising" + while difference > epsilon: # and i < MAX_ITER and rs < MAX_RESTART: f_old = f.copy() W = -np.diag(self.likelihood_function.link_hess(self.data, f)) if not self.likelihood_function.log_concave: - #if np.any(W < 0): - #print "NEGATIVE VALUES :(" - #pass W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur #If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods - print "Decomposing" B, L, W_12 = self._compute_B_statistics(K, W) - print "Finding f" W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal! grad = self.likelihood_function.link_grad(self.data, f)[:, None] @@ -272,15 +286,15 @@ class Laplace(likelihood): #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None] #TODO: Check L is lower - solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal! - a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal! - #f = np.dot(K, a) - #a should be equal to Ki*f now so should be able to use it c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad) solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal! f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal! + solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal! + a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal! 
+ #f = np.dot(K, a) + #K_w_f = mdot(K, (W, f)) #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f #d = f + K_w_f + c @@ -292,7 +306,6 @@ class Laplace(likelihood): old_obj = new_obj new_obj = obj(a, f) difference = new_obj - old_obj - #print "Difference: ", difference if difference < 0: #print "Objective function rose", difference #If the objective function isn't rising, restart optimization @@ -307,5 +320,4 @@ class Laplace(likelihood): i += 1 self.i = i - #print "{i} steps".format(i=i) return f From e0c1e4a4df600d24f075cc13a359a4bc77dfcff3 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 8 Apr 2013 19:58:54 +0100 Subject: [PATCH 20/71] Fixed laplace approximation and made more numerically stable with cholesky decompositions, and commented --- python/examples/laplace_approximations.py | 1 - python/likelihoods/Laplace.py | 142 ++++++++++------------ 2 files changed, 65 insertions(+), 78 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 28a92c61..0500ba02 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -140,7 +140,6 @@ def student_t_approx(): m.plot() plt.plot(X_full, Y_full) plt.ylim(-2.5, 2.5) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print "Clean student t, ncg" t_distribution = student_t(deg_free, sigma=edited_real_sd) diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 77359769..27ab7613 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,17 +1,32 @@ import numpy as np import scipy as sp import GPy -from scipy.linalg import cholesky, eig, inv, det, cho_solve +from scipy.linalg import cholesky, eig, inv, cho_solve +from numpy.linalg import cond from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv from scipy.linalg.lapack import dtrtrs -#import 
numpy.testing.assert_array_equal #TODO: Move this to utils + + def det_ln_diag(A): + """ + log determinant of a diagonal matrix + $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$ + """ return np.log(np.diagonal(A)).sum() +def pddet(A): + """ + Determinant of a positive definite matrix + """ + L = cholesky(A) + logdetA = 2*sum(np.log(np.diag(L))) + return logdetA + + class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -30,7 +45,8 @@ class Laplace(likelihood): --------- :data: @todo - :likelihood_function: @todo + :likelihood_function: likelihood function - subclass of likelihood_function + :rasm: Flag of whether to use rasmussens numerically stable mode finding or simple ncg optimisation """ self.data = data @@ -63,10 +79,10 @@ class Laplace(likelihood): return [] def _set_params(self, p): - pass # TODO: Laplace likelihood might want to take some parameters... + pass # TODO: Laplace likelihood might want to take some parameters... def _gradients(self, partial): - return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... + return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... raise NotImplementedError def _compute_GP_variables(self): @@ -91,20 +107,10 @@ class Laplace(likelihood): i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$ since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$ and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ + $$\tilde{\Sigma} = W^{-1}$$ """ - self.Sigma_tilde_i = self.W - #Check it isn't singular! epsilon = 1e-6 - if np.abs(det(self.Sigma_tilde_i)) < epsilon: - print "WARNING: Transformed covariance matrix is signular!" - #raise ValueError("inverse covariance must be non-singular to invert!") - #Do we really need to inverse Sigma_tilde_i? 
:( - if self.likelihood_function.log_concave: - (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i) - else: - self.Sigma_tilde = inv(self.Sigma_tilde_i) - Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat) #dtritri -> L -> L_i #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i @@ -112,42 +118,25 @@ class Laplace(likelihood): L = jitchol(self.K) Li = chol_inv(L) Lt_W = np.dot(L.T, self.W) - if np.abs(det(Lt_W)) < epsilon: - print "WARNING: Transformed covariance matrix is signular!" + + ##Check it isn't singular! + if cond(Lt_W) > 1e14: + print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem" + Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0] Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT - #if np.abs(det(KW)) < epsilon: - #print "WARNING: Transformed covariance matrix is signular!" - #KW_i = inv(KW) - #Y_tilde = mdot(KW_i + np.eye(self.N), self.f_hat) + #f.T(Ki + W)f + f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat)) + + mdot(self.f_hat.T, self.W, self.f_hat) + ) - #Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat) - #KW = np.dot(self.K, self.W) - #KW_i, _, _, _ = pdinv(KW) - #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat) - #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST - #+ 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat)) - #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) - #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat)) - #) - #_, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12)) - #f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat) - #Z_tilde = -self.NORMAL_CONST + self.ln_z_hat -0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f - - #f_W_f = mdot(self.f_hat.T, self.W, self.f_hat) - #f_Y_f = mdot(Y_tilde, self.W, Y_tilde) - #Z_tilde = (np.dot(self.W, self.f_hat) - 0.5*y_W_y + self.ln_z_hat - #- 0.5*mdot(self.f_hat, ( - - f_Ki_W_f = mdot(self.f_hat.T, (self.Ki + self.W), self.f_hat) y_W_f = 
mdot(Y_tilde.T, self.W, self.f_hat) y_W_y = mdot(Y_tilde.T, self.W, Y_tilde) - self.ln_W_det = det_ln_diag(self.W) + ln_W_det = det_ln_diag(self.W) Z_tilde = (self.NORMAL_CONST - 0.5*self.ln_K_det - - 0.5*self.ln_W_det + - 0.5*ln_W_det - 0.5*self.ln_Ki_W_i_det - 0.5*f_Ki_W_f - 0.5*y_W_y @@ -155,7 +144,11 @@ class Laplace(likelihood): + self.ln_z_hat ) - Sigma_tilde = inv(self.W) # Damn + ##Check it isn't singular! + if cond(self.W) > 1e14: + print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem" + + Sigma_tilde = inv(self.W) # Damn #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) @@ -163,16 +156,14 @@ class Laplace(likelihood): self.YYT = np.dot(self.Y, self.Y.T) self.covariance_matrix = Sigma_tilde self.precision = 1 / np.diag(self.covariance_matrix)[:, None] - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def fit_full(self, K): """ The laplace approximation algorithm - For nomenclature see Rasmussen & Williams 2006 + For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability :K: Covariance matrix """ self.K = K.copy() - self.Ki, _, _, self.ln_K_det = pdinv(K) if self.rasm: self.f_hat = self.rasm_mode(K) else: @@ -182,10 +173,10 @@ class Laplace(likelihood): self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat)) if not self.likelihood_function.log_concave: - self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur - #If the likelihood is non-log-concave. We wan't to say that there is a negative variance - #To cause the posterior to become less certain than the prior and likelihood, - #This is a property only held by non-log-concave likelihoods + self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + #If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance + #To cause the posterior to become less certain than the prior and likelihood, + #This is a property only held by non-log-concave likelihoods #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W) @@ -198,8 +189,9 @@ class Laplace(likelihood): solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b))) a = b - mdot(self.W_12, solve_chol) self.f_Ki_f = np.dot(self.f_hat.T, a) + self.ln_K_det = pddet(self.K) - self.ln_z_hat = ( self.NORMAL_CONST + self.ln_z_hat = (self.NORMAL_CONST - 0.5*self.f_Ki_f - 0.5*self.ln_K_det + 0.5*self.ln_Ki_W_i_det @@ -219,26 +211,29 @@ class Laplace(likelihood): #W is diagnoal so its sqrt is just the sqrt of the diagonal elements W_12 = np.sqrt(W) #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT - B = np.eye(K.shape[0]) + mdot(W_12, K, W_12) + B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12)) L = jitchol(B) return (B, L, W_12) def ncg_mode(self, K): - """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative) + """ + Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative) :K: Covariance matrix :returns: f_mode """ + self.Ki, _, _, self.ln_K_det = pdinv(K) + f = np.zeros((self.N, 1)) #FIXME: Can we get rid of this horrible reshaping? 
#ONLY WORKS FOR 1D DATA def obj(f): - res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * np.dot(f.T, np.dot(self.Ki, f)) + self.NORMAL_CONST) return float(res) def obj_grad(f): - res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f)) + res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - np.dot(self.Ki, f)) return np.squeeze(res) def obj_hess(f): @@ -254,6 +249,8 @@ class Laplace(likelihood): For nomenclature see Rasmussen & Williams 2006 :K: Covariance matrix + :MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation + :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation :returns: f_mode """ f = np.zeros((self.N, 1)) @@ -269,39 +266,30 @@ class Laplace(likelihood): step_size = 1 rs = 0 i = 0 - while difference > epsilon: # and i < MAX_ITER and rs < MAX_RESTART: + while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: f_old = f.copy() W = -np.diag(self.likelihood_function.link_hess(self.data, f)) if not self.likelihood_function.log_concave: - W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur - #If the likelihood is non-log-concave. We wan't to say that there is a negative variance - #To cause the posterior to become less certain than the prior and likelihood, - #This is a property only held by non-log-concave likelihoods + W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + # If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance + # To cause the posterior to become less certain than the prior and likelihood, + # This is a property only held by non-log-concave likelihoods B, L, W_12 = self._compute_B_statistics(K, W) - W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal! + W_f = np.dot(W, f) grad = self.likelihood_function.link_grad(self.data, f)[:, None] #Find K_i_f b = W_f + grad - #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None] - #TODO: Check L is lower #a should be equal to Ki*f now so should be able to use it - c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad) - solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal! - f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal! + c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad) + solve_L = cho_solve((L, True), np.dot(W_12, c)) + f = c - np.dot(K, np.dot(W_12, solve_L)) - solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal! - a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal! 
+ solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b))) + a = b - np.dot(W_12, solve_L) #f = np.dot(K, a) - #K_w_f = mdot(K, (W, f)) - #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f - #d = f + K_w_f + c - #solve_L = cho_solve((L, True), mdot(W_12, d)) - #f = c - mdot(K, (W_12, solve_L)) - #a = mdot(self.Ki, f) - tmp_old_obj = old_obj old_obj = new_obj new_obj = obj(a, f) From 65481d7a73b8fe965a99b82126431ae2668958db Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 10 Apr 2013 13:43:13 +0100 Subject: [PATCH 21/71] Fixed the z scalings --- python/examples/laplace_approximations.py | 8 +++---- python/likelihoods/Laplace.py | 28 +++++++++++++++-------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 0500ba02..5b1331b6 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -12,7 +12,7 @@ def timing(): deg_free = 10 real_sd = np.sqrt(real_var) the_is = np.zeros(times) - X = np.linspace(0.0, 10.0, 500)[:, None] + X = np.linspace(0.0, 10.0, 300)[:, None] for a in xrange(times): Y = np.sin(X) + np.random.randn(*X.shape)*real_var @@ -22,8 +22,8 @@ def timing(): Yc[25] += 10 Yc[23] += 10 Yc[24] += 10 - Yc[300] += 10 - Yc[400] += 10000 + Yc[250] += 10 + #Yc[4] += 10000 edited_real_sd = real_sd kernel1 = GPy.kern.rbf(X.shape[1]) @@ -36,7 +36,7 @@ def timing(): m.optimize() the_is[a] = m.likelihood.i - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print the_is print np.mean(the_is) diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 27ab7613..8ef8fb62 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,7 +1,7 @@ import numpy as np import scipy as sp import GPy -from scipy.linalg import cholesky, eig, inv, cho_solve +from scipy.linalg import cholesky, 
eig, inv, cho_solve, det from numpy.linalg import cond from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv @@ -134,15 +134,24 @@ class Laplace(likelihood): y_W_f = mdot(Y_tilde.T, self.W, self.f_hat) y_W_y = mdot(Y_tilde.T, self.W, Y_tilde) ln_W_det = det_ln_diag(self.W) - Z_tilde = (self.NORMAL_CONST - - 0.5*self.ln_K_det - - 0.5*ln_W_det - - 0.5*self.ln_Ki_W_i_det - - 0.5*f_Ki_W_f - - 0.5*y_W_y - + y_W_f + Z_tilde = (- self.NORMAL_CONST + + 0.5*self.ln_K_det + + 0.5*ln_W_det + + 0.5*self.ln_Ki_W_i_det + + 0.5*f_Ki_W_f + + 0.5*y_W_y + - y_W_f + self.ln_z_hat ) + #Z_tilde = (self.NORMAL_CONST + #- 0.5*self.ln_K_det + #- 0.5*ln_W_det + #- 0.5*self.ln_Ki_W_i_det + #- 0.5*f_Ki_W_f + #- 0.5*y_W_y + #+ y_W_f + #+ self.ln_z_hat + #) ##Check it isn't singular! if cond(self.W) > 1e14: @@ -191,8 +200,7 @@ class Laplace(likelihood): self.f_Ki_f = np.dot(self.f_hat.T, a) self.ln_K_det = pddet(self.K) - self.ln_z_hat = (self.NORMAL_CONST - - 0.5*self.f_Ki_f + self.ln_z_hat = (- 0.5*self.f_Ki_f - 0.5*self.ln_K_det + 0.5*self.ln_Ki_W_i_det + self.likelihood_function.link_function(self.data, self.f_hat) From 9bbb11b825f7c395a040e2385d6a2c88aa1c143e Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 10 Apr 2013 15:43:31 +0100 Subject: [PATCH 22/71] Adding weibull likelihood, requires 'extra_data' to be passed to likelihood, i.e. 
the censoring information --- python/likelihoods/Laplace.py | 24 +++--- python/likelihoods/likelihood_function.py | 99 +++++++++++++++++++++-- 2 files changed, 104 insertions(+), 19 deletions(-) diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 8ef8fb62..4d94ba0f 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -30,7 +30,7 @@ def pddet(A): class Laplace(likelihood): """Laplace approximation to a posterior""" - def __init__(self, data, likelihood_function, rasm=True): + def __init__(self, data, likelihood_function, extra_data=None, rasm=True): """ Laplace Approximation @@ -44,13 +44,15 @@ class Laplace(likelihood): Arguments --------- - :data: @todo + :data: array of data the likelihood function is approximating :likelihood_function: likelihood function - subclass of likelihood_function + :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data :rasm: Flag of whether to use rasmussens numerically stable mode finding or simple ncg optimisation """ self.data = data self.likelihood_function = likelihood_function + self.extra_data = extra_data self.rasm = rasm #Inital values @@ -179,7 +181,7 @@ class Laplace(likelihood): self.f_hat = self.ncg_mode(K) #At this point get the hessian matrix - self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat)) + self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data)) if not self.likelihood_function.log_concave: self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur @@ -194,7 +196,7 @@ class Laplace(likelihood): Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K) self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i) - b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None] + b = np.dot(self.W, self.f_hat) + 
self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None] solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b))) a = b - mdot(self.W_12, solve_chol) self.f_Ki_f = np.dot(self.f_hat.T, a) @@ -203,7 +205,7 @@ class Laplace(likelihood): self.ln_z_hat = (- 0.5*self.f_Ki_f - 0.5*self.ln_K_det + 0.5*self.ln_Ki_W_i_det - + self.likelihood_function.link_function(self.data, self.f_hat) + + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) ) return self._compute_GP_variables() @@ -236,16 +238,16 @@ class Laplace(likelihood): #FIXME: Can we get rid of this horrible reshaping? #ONLY WORKS FOR 1D DATA def obj(f): - res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * np.dot(f.T, np.dot(self.Ki, f)) + res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f)) + self.NORMAL_CONST) return float(res) def obj_grad(f): - res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - np.dot(self.Ki, f)) + res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f)) return np.squeeze(res) def obj_hess(f): - res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki) + res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki) return np.squeeze(res) f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) @@ -267,7 +269,7 @@ class Laplace(likelihood): def obj(a, f): #Careful of shape of data! 
- return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f) + return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data) difference = np.inf epsilon = 1e-6 @@ -276,7 +278,7 @@ class Laplace(likelihood): i = 0 while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: f_old = f.copy() - W = -np.diag(self.likelihood_function.link_hess(self.data, f)) + W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data)) if not self.likelihood_function.log_concave: W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur # If the likelihood is non-log-concave. We wan't to say that there is a negative variance @@ -285,7 +287,7 @@ class Laplace(likelihood): B, L, W_12 = self._compute_B_statistics(K, W) W_f = np.dot(W, f) - grad = self.likelihood_function.link_grad(self.data, f)[:, None] + grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None] #Find K_i_f b = W_f + grad diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 49174ce7..0d421882 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -4,6 +4,7 @@ import numpy as np from GPy.likelihoods.likelihood_functions import likelihood_function from scipy import stats + class student_t(likelihood_function): """Student t likelihood distribution For nomanclature see Bayesian Data Analysis 2003 p576 @@ -24,15 +25,16 @@ class student_t(likelihood_function): self.log_concave = False @property - def variance(self): + def variance(self, extra_data=None): return (self.v / float(self.v - 2)) * (self.sigma**2) - def link_function(self, y, f): + def link_function(self, y, f, extra_data=None): """link_function $\ln p(y|f)$ $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + 
\frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ :y: data :f: latent variables f + :extra_data: extra_data which is not used in student t distribution :returns: float(likelihood evaluated for this point) """ @@ -49,7 +51,7 @@ class student_t(likelihood_function): ) return np.sum(objective) - def link_grad(self, y, f): + def link_grad(self, y, f, extra_data=None): """ Gradient of the link function at y, given f w.r.t f @@ -57,6 +59,7 @@ class student_t(likelihood_function): :y: data :f: latent variables f + :extra_data: extra_data which is not used in student t distribution :returns: gradient of likelihood evaluated at points """ @@ -67,17 +70,18 @@ class student_t(likelihood_function): grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) return np.squeeze(grad) - def link_hess(self, y, f): + def link_hess(self, y, f, extra_data=None): """ Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j i.e. second derivative link_function at y given f f_j w.r.t f and f_j - Will return diaganol of hessian, since every where else it is 0 + Will return diagonal of hessian, since every where else it is 0 $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ :y: data :f: latent variables f + :extra_data: extra_data which is not used in student t distribution :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ y = np.squeeze(y) @@ -139,7 +143,7 @@ class student_t(likelihood_function): #size=(num_f_samples, num_y_samples)) #print student_t_samples.shape - student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:,None], + student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None], scale=self.sigma, size=(num_test_points, num_y_samples, num_f_samples)) student_t_samples = np.reshape(student_t_samples, @@ -152,7 +156,7 @@ class student_t(likelihood_function): ##Alernenately we could 
sample from int p(y|f*)p(f*|x*) df* def t_gaussian(f, mu, var): return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5)) - * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2))) + * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2))) ) def t_gauss_int(mu, var): @@ -167,4 +171,83 @@ class student_t(likelihood_function): p = vec_t_gauss_int(mu, var) p_025 = mu - p p_975 = mu + p - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return mu, np.nan*mu, p_025, p_975 + + +class weibull_survival(likelihood_function): + """Weibull t likelihood distribution for survival analysis with censoring + For nomanclature see Bayesian Survival Analysis + + Laplace: + Needs functions to calculate + ln p(yi|fi) + dln p(yi|fi)_dfi + d2ln p(yi|fi)_d2fifj + """ + def __init__(self, shape, scale): + self.shape = shape + self.scale = scale + + #FIXME: This should be in the superclass + self.log_concave = True + + def link_function(self, y, f, extra_data=None): + """ + link_function $\ln p(y|f)$, i.e. log likelihood + + $$\ln p(y|f) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$ + + :y: time of event data + :f: latent variables f + :extra_data: the censoring indicator, 1 for censored, 0 for not + :returns: float(likelihood evaluated for this point) + + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + + v = extra_data + objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f) # FIXME: CHECK THIS WITH BOOK, wheres scale? 
+ return np.sum(objective) + + def link_grad(self, y, f, extra_data=None): + """ + Gradient of the link function at y, given f w.r.t f + + $$\frac{d}{df} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}\exp(f_{i}) + + :y: data + :f: latent variables f + :extra_data: the censoring indicator, 1 for censored, 0 for not + :returns: gradient of likelihood evaluated at points + + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + + v = extra_data + grad = v - (y**self.shape)*np.exp(f) + return np.squeeze(grad) + + def link_hess(self, y, f, extra_data=None): + """ + Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j + i.e. second derivative link_function at y given f f_j w.r.t f and f_j + + Will return diagonal of hessian, since every where else it is 0 + + $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ + + :y: data + :f: latent variables f + :extra_data: extra_data which is not used hessian + :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + + hess = (y**self.shape)*np.exp(f) + return np.squeeze(hess) From 296c093611f46c8632a7235f7d414581f5969294 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 15 Apr 2013 12:08:22 +0100 Subject: [PATCH 23/71] Tidy up comments --- python/likelihoods/likelihood_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 0d421882..f14faf33 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -9,7 +9,7 @@ class student_t(likelihood_function): """Student t likelihood distribution For nomanclature see Bayesian Data Analysis 2003 p576 - $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v 
\pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ + $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2)$$ Laplace: Needs functions to calculate From 1e707f125c7e9313b4444b23811425ddc555dba3 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 15 Apr 2013 12:10:42 +0100 Subject: [PATCH 24/71] Make directory structure match that of GPy --- {python => GPy}/__init__.py | 0 {python => GPy}/examples/__init__.py | 0 {python => GPy}/examples/laplace_approximations.py | 0 {python => GPy}/likelihoods/Laplace.py | 0 {python => GPy}/likelihoods/__init__.py | 0 {python => GPy}/likelihoods/likelihood_function.py | 0 {python => GPy}/models/__init__.py | 0 {python => GPy}/models/coxGP.py | 0 {python => GPy}/testing/__init__.py | 0 {python => GPy}/testing/cox_tests.py | 0 10 files changed, 0 insertions(+), 0 deletions(-) rename {python => GPy}/__init__.py (100%) rename {python => GPy}/examples/__init__.py (100%) rename {python => GPy}/examples/laplace_approximations.py (100%) rename {python => GPy}/likelihoods/Laplace.py (100%) rename {python => GPy}/likelihoods/__init__.py (100%) rename {python => GPy}/likelihoods/likelihood_function.py (100%) rename {python => GPy}/models/__init__.py (100%) rename {python => GPy}/models/coxGP.py (100%) rename {python => GPy}/testing/__init__.py (100%) rename {python => GPy}/testing/cox_tests.py (100%) diff --git a/python/__init__.py b/GPy/__init__.py similarity index 100% rename from python/__init__.py rename to GPy/__init__.py diff --git a/python/examples/__init__.py b/GPy/examples/__init__.py similarity index 100% rename from python/examples/__init__.py rename to GPy/examples/__init__.py diff --git a/python/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py similarity index 100% rename from python/examples/laplace_approximations.py rename to 
GPy/examples/laplace_approximations.py diff --git a/python/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py similarity index 100% rename from python/likelihoods/Laplace.py rename to GPy/likelihoods/Laplace.py diff --git a/python/likelihoods/__init__.py b/GPy/likelihoods/__init__.py similarity index 100% rename from python/likelihoods/__init__.py rename to GPy/likelihoods/__init__.py diff --git a/python/likelihoods/likelihood_function.py b/GPy/likelihoods/likelihood_function.py similarity index 100% rename from python/likelihoods/likelihood_function.py rename to GPy/likelihoods/likelihood_function.py diff --git a/python/models/__init__.py b/GPy/models/__init__.py similarity index 100% rename from python/models/__init__.py rename to GPy/models/__init__.py diff --git a/python/models/coxGP.py b/GPy/models/coxGP.py similarity index 100% rename from python/models/coxGP.py rename to GPy/models/coxGP.py diff --git a/python/testing/__init__.py b/GPy/testing/__init__.py similarity index 100% rename from python/testing/__init__.py rename to GPy/testing/__init__.py diff --git a/python/testing/cox_tests.py b/GPy/testing/cox_tests.py similarity index 100% rename from python/testing/cox_tests.py rename to GPy/testing/cox_tests.py From 589aeda88cc938a537ecb5a5df34dd276bae5a37 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 15 Apr 2013 15:44:29 +0100 Subject: [PATCH 25/71] Should be working now, needed to change relative path names --- GPy/examples/classification.py | 3 +-- GPy/examples/laplace_approximations.py | 29 +++++++++++--------------- GPy/likelihoods/__init__.py | 2 +- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py index 5df019e4..4899e75e 100644 --- a/GPy/examples/classification.py +++ b/GPy/examples/classification.py @@ -17,8 +17,7 @@ def crescent_data(seed=default_seed): #FIXME :param seed : seed value for data generation. 
:type seed: int :param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). - :type inducing: int - """ + :type inducing: int """ data = GPy.util.datasets.crescent_data(seed=seed) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 5b1331b6..07801150 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -1,10 +1,6 @@ import GPy import numpy as np import matplotlib.pyplot as plt -from scipy.stats import t, norm -from coxGP.python.likelihoods.Laplace import Laplace -from coxGP.python.likelihoods.likelihood_function import student_t - def timing(): real_var = 0.1 @@ -28,15 +24,14 @@ def timing(): edited_real_sd = real_sd kernel1 = GPy.kern.rbf(X.shape[1]) - t_distribution = student_t(deg_free, sigma=edited_real_sd) - corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1) m.ensure_default_constraints() m.update_likelihood_approximation() m.optimize() the_is[a] = m.likelihood.i - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print the_is print np.mean(the_is) @@ -116,8 +111,8 @@ def student_t_approx(): edited_real_sd = real_sd print "Clean student t, rasm" - t_distribution = student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, stu_t_likelihood, kernel6) m.ensure_default_constraints() m.update_likelihood_approximation() @@ -129,8 +124,8 @@ def student_t_approx(): plt.ylim(-2.5, 2.5) print "Corrupt student t, rasm" - t_distribution = 
student_t(deg_free, sigma=edited_real_sd) - corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) m.ensure_default_constraints() m.update_likelihood_approximation() @@ -142,8 +137,8 @@ def student_t_approx(): plt.ylim(-2.5, 2.5) print "Clean student t, ncg" - t_distribution = student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = Laplace(Y, t_distribution, rasm=False) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False) m = GPy.models.GP(X, stu_t_likelihood, kernel3) m.ensure_default_constraints() m.update_likelihood_approximation() @@ -155,8 +150,8 @@ def student_t_approx(): plt.ylim(-2.5, 2.5) print "Corrupt student t, ncg" - t_distribution = student_t(deg_free, sigma=edited_real_sd) - corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=False) m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) m.ensure_default_constraints() m.update_likelihood_approximation() @@ -169,8 +164,8 @@ def student_t_approx(): ###with a student t distribution, since it has heavy tails it should work well - ###likelihood_function = student_t(deg_free, sigma=real_var) - ###lap = Laplace(Y, likelihood_function) + ###likelihood_functions = student_t(deg_free, sigma=real_var) + ###lap = Laplace(Y, likelihood_functions) ###cov = kernel.K(X) ###lap.fit_full(cov) diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py index 83413255..9becb1b1 100644 --- a/GPy/likelihoods/__init__.py +++ 
b/GPy/likelihoods/__init__.py @@ -1,4 +1,4 @@ from EP import EP from Gaussian import Gaussian -# TODO: from Laplace import Laplace +from Laplace import Laplace import likelihood_functions as functions From 01671b6c570b7c40a2b1a326ab2c68606834c674 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 16 Apr 2013 16:34:26 +0100 Subject: [PATCH 26/71] Merged likelihood functions --- GPy/examples/laplace_approximations.py | 4 +- GPy/likelihoods/likelihood_function.py | 253 ----------------------- GPy/likelihoods/likelihood_functions.py | 254 +++++++++++++++++++++++- 3 files changed, 254 insertions(+), 257 deletions(-) delete mode 100644 GPy/likelihoods/likelihood_function.py diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 07801150..5d1c1224 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -164,8 +164,8 @@ def student_t_approx(): ###with a student t distribution, since it has heavy tails it should work well - ###likelihood_functions = student_t(deg_free, sigma=real_var) - ###lap = Laplace(Y, likelihood_functions) + ###likelihood_function = student_t(deg_free, sigma=real_var) + ###lap = Laplace(Y, likelihood_function) ###cov = kernel.K(X) ###lap.fit_full(cov) diff --git a/GPy/likelihoods/likelihood_function.py b/GPy/likelihoods/likelihood_function.py deleted file mode 100644 index f14faf33..00000000 --- a/GPy/likelihoods/likelihood_function.py +++ /dev/null @@ -1,253 +0,0 @@ -from scipy.special import gammaln, gamma -from scipy import integrate -import numpy as np -from GPy.likelihoods.likelihood_functions import likelihood_function -from scipy import stats - - -class student_t(likelihood_function): - """Student t likelihood distribution - For nomanclature see Bayesian Data Analysis 2003 p576 - - $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2)$$ - - 
Laplace: - Needs functions to calculate - ln p(yi|fi) - dln p(yi|fi)_dfi - d2ln p(yi|fi)_d2fifj - """ - def __init__(self, deg_free, sigma=2): - self.v = deg_free - self.sigma = sigma - - #FIXME: This should be in the superclass - self.log_concave = False - - @property - def variance(self, extra_data=None): - return (self.v / float(self.v - 2)) * (self.sigma**2) - - def link_function(self, y, f, extra_data=None): - """link_function $\ln p(y|f)$ - $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ - - :y: data - :f: latent variables f - :extra_data: extra_data which is not used in student t distribution - :returns: float(likelihood evaluated for this point) - - """ - y = np.squeeze(y) - f = np.squeeze(f) - assert y.shape == f.shape - - e = y - f - objective = (gammaln((self.v + 1) * 0.5) - - gammaln(self.v * 0.5) - + np.log(self.sigma * np.sqrt(self.v * np.pi)) - - (self.v + 1) * 0.5 - * np.log(1 + ((e**2 / self.sigma**2) / self.v)) - ) - return np.sum(objective) - - def link_grad(self, y, f, extra_data=None): - """ - Gradient of the link function at y, given f w.r.t f - - $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ - - :y: data - :f: latent variables f - :extra_data: extra_data which is not used in student t distribution - :returns: gradient of likelihood evaluated at points - - """ - y = np.squeeze(y) - f = np.squeeze(f) - assert y.shape == f.shape - e = y - f - grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) - return np.squeeze(grad) - - def link_hess(self, y, f, extra_data=None): - """ - Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j - i.e. 
second derivative link_function at y given f f_j w.r.t f and f_j - - Will return diagonal of hessian, since every where else it is 0 - - $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ - - :y: data - :f: latent variables f - :extra_data: extra_data which is not used in student t distribution - :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) - """ - y = np.squeeze(y) - f = np.squeeze(f) - assert y.shape == f.shape - e = y - f - hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) - return np.squeeze(hess) - - def predictive_values(self, mu, var): - """ - Compute mean, and conficence interval (percentiles 5 and 95) of the prediction - - Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*) - (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2)) - *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2))) - """ - - #We want the variance around test points y which comes from int p(y*|f*)p(f*) df* - #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)] - #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this - #Which was also given to us as (var) - #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution - #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom - true_var = var + self.variance - - #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now - #need the 95 and 5 percentiles. 
- #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles - p_025 = mu - 2.*true_var - p_975 = mu + 2.*true_var - - return mu, np.nan*mu, p_025, p_975 - - def sample_predicted_values(self, mu, var): - """ Experimental sample approches and numerical integration """ - #p_025 = stats.t.ppf(.025, mu) - #p_975 = stats.t.ppf(.975, mu) - - num_test_points = mu.shape[0] - #Each mu is the latent point f* at the test point x*, - #and the var is the gaussian variance at this point - #Take lots of samples from this, so we have lots of possible values - #for latent point f* for each test point x* weighted by how likely we were to pick it - print "Taking %d samples of f*".format(num_test_points) - num_f_samples = 10 - num_y_samples = 10 - student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples)) - print "Student t means shape: ", student_t_means.shape - - #Now we have lots of f*, lets work out the likelihood of getting this by sampling - #from a student t centred on this point, sample many points from this distribution - #centred on f* - #for test_point, f in enumerate(student_t_means): - #print test_point - #print f.shape - #student_t_samples = stats.t.rvs(self.v, loc=f[:,None], - #scale=self.sigma, - #size=(num_f_samples, num_y_samples)) - #print student_t_samples.shape - - student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None], - scale=self.sigma, - size=(num_test_points, num_y_samples, num_f_samples)) - student_t_samples = np.reshape(student_t_samples, - (num_test_points, num_y_samples*num_f_samples)) - - #Now take the 97.5 and 0.25 percentile of these points - p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None] - p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None] - - ##Alernenately we could sample from int p(y|f*)p(f*|x*) df* - def t_gaussian(f, mu, var): - return (((gamma((self.v+1)*0.5)) / 
(gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5)) - * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2))) - ) - - def t_gauss_int(mu, var): - print "Mu: ", mu - print "var: ", var - result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var)) - print "Result: ", result - return result[0] - - vec_t_gauss_int = np.vectorize(t_gauss_int) - - p = vec_t_gauss_int(mu, var) - p_025 = mu - p - p_975 = mu + p - return mu, np.nan*mu, p_025, p_975 - - -class weibull_survival(likelihood_function): - """Weibull t likelihood distribution for survival analysis with censoring - For nomanclature see Bayesian Survival Analysis - - Laplace: - Needs functions to calculate - ln p(yi|fi) - dln p(yi|fi)_dfi - d2ln p(yi|fi)_d2fifj - """ - def __init__(self, shape, scale): - self.shape = shape - self.scale = scale - - #FIXME: This should be in the superclass - self.log_concave = True - - def link_function(self, y, f, extra_data=None): - """ - link_function $\ln p(y|f)$, i.e. log likelihood - - $$\ln p(y|f) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$ - - :y: time of event data - :f: latent variables f - :extra_data: the censoring indicator, 1 for censored, 0 for not - :returns: float(likelihood evaluated for this point) - - """ - y = np.squeeze(y) - f = np.squeeze(f) - assert y.shape == f.shape - - v = extra_data - objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f) # FIXME: CHECK THIS WITH BOOK, wheres scale? 
- return np.sum(objective) - - def link_grad(self, y, f, extra_data=None): - """ - Gradient of the link function at y, given f w.r.t f - - $$\frac{d}{df} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}\exp(f_{i}) - - :y: data - :f: latent variables f - :extra_data: the censoring indicator, 1 for censored, 0 for not - :returns: gradient of likelihood evaluated at points - - """ - y = np.squeeze(y) - f = np.squeeze(f) - assert y.shape == f.shape - - v = extra_data - grad = v - (y**self.shape)*np.exp(f) - return np.squeeze(grad) - - def link_hess(self, y, f, extra_data=None): - """ - Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j - i.e. second derivative link_function at y given f f_j w.r.t f and f_j - - Will return diagonal of hessian, since every where else it is 0 - - $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ - - :y: data - :f: latent variables f - :extra_data: extra_data which is not used hessian - :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) - """ - y = np.squeeze(y) - f = np.squeeze(f) - assert y.shape == f.shape - - hess = (y**self.shape)*np.exp(f) - return np.squeeze(hess) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 4b8e7013..c759e15f 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -1,12 +1,14 @@ # Copyright (c) 2012, 2013 Ricardo Andrade # Licensed under the BSD 3-clause license (see LICENSE.txt) - import numpy as np -from scipy import stats +from scipy import stats, integrate import scipy as sp import pylab as pb from ..util.plot import gpplot +from scipy.special import gammaln, gamma +#from GPy.likelihoods.likelihood_functions import likelihood_function + class likelihood_function: """ @@ -132,3 +134,251 @@ class Poisson(likelihood_function): p_025 = tmp[:,0] p_975 = 
tmp[:,1] return mean,np.nan*mean,p_025,p_975 # better variance here TODO + + +class student_t(likelihood_function): + """Student t likelihood distribution + For nomanclature see Bayesian Data Analysis 2003 p576 + + $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2)$$ + + Laplace: + Needs functions to calculate + ln p(yi|fi) + dln p(yi|fi)_dfi + d2ln p(yi|fi)_d2fifj + """ + def __init__(self, deg_free, sigma=2): + self.v = deg_free + self.sigma = sigma + + #FIXME: This should be in the superclass + self.log_concave = False + + @property + def variance(self, extra_data=None): + return (self.v / float(self.v - 2)) * (self.sigma**2) + + def link_function(self, y, f, extra_data=None): + """link_function $\ln p(y|f)$ + $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ + + :y: data + :f: latent variables f + :extra_data: extra_data which is not used in student t distribution + :returns: float(likelihood evaluated for this point) + + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + + e = y - f + objective = (gammaln((self.v + 1) * 0.5) + - gammaln(self.v * 0.5) + + np.log(self.sigma * np.sqrt(self.v * np.pi)) + - (self.v + 1) * 0.5 + * np.log(1 + ((e**2 / self.sigma**2) / self.v)) + ) + return np.sum(objective) + + def link_grad(self, y, f, extra_data=None): + """ + Gradient of the link function at y, given f w.r.t f + + $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ + + :y: data + :f: latent variables f + :extra_data: extra_data which is not used in student t distribution + :returns: gradient of likelihood evaluated at points + + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + e = y - f + grad = ((self.v + 1) * e) / (self.v * 
(self.sigma**2) + (e**2)) + return np.squeeze(grad) + + def link_hess(self, y, f, extra_data=None): + """ + Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j + i.e. second derivative link_function at y given f f_j w.r.t f and f_j + + Will return diagonal of hessian, since every where else it is 0 + + $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ + + :y: data + :f: latent variables f + :extra_data: extra_data which is not used in student t distribution + :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + e = y - f + hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) + return np.squeeze(hess) + + def predictive_values(self, mu, var): + """ + Compute mean, and conficence interval (percentiles 5 and 95) of the prediction + + Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*) + (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2)) + *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2))) + """ + + #We want the variance around test points y which comes from int p(y*|f*)p(f*) df* + #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)] + #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this + #Which was also given to us as (var) + #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution + #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom + true_var = var + self.variance + + #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now + #need the 95 and 5 percentiles. 
+ #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles + p_025 = mu - 2.*true_var + p_975 = mu + 2.*true_var + + return mu, np.nan*mu, p_025, p_975 + + def sample_predicted_values(self, mu, var): + """ Experimental sample approches and numerical integration """ + #p_025 = stats.t.ppf(.025, mu) + #p_975 = stats.t.ppf(.975, mu) + + num_test_points = mu.shape[0] + #Each mu is the latent point f* at the test point x*, + #and the var is the gaussian variance at this point + #Take lots of samples from this, so we have lots of possible values + #for latent point f* for each test point x* weighted by how likely we were to pick it + print "Taking %d samples of f*".format(num_test_points) + num_f_samples = 10 + num_y_samples = 10 + student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples)) + print "Student t means shape: ", student_t_means.shape + + #Now we have lots of f*, lets work out the likelihood of getting this by sampling + #from a student t centred on this point, sample many points from this distribution + #centred on f* + #for test_point, f in enumerate(student_t_means): + #print test_point + #print f.shape + #student_t_samples = stats.t.rvs(self.v, loc=f[:,None], + #scale=self.sigma, + #size=(num_f_samples, num_y_samples)) + #print student_t_samples.shape + + student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None], + scale=self.sigma, + size=(num_test_points, num_y_samples, num_f_samples)) + student_t_samples = np.reshape(student_t_samples, + (num_test_points, num_y_samples*num_f_samples)) + + #Now take the 97.5 and 0.25 percentile of these points + p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None] + p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None] + + ##Alernenately we could sample from int p(y|f*)p(f*|x*) df* + def t_gaussian(f, mu, var): + return (((gamma((self.v+1)*0.5)) / 
(gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5)) + * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2))) + ) + + def t_gauss_int(mu, var): + print "Mu: ", mu + print "var: ", var + result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var)) + print "Result: ", result + return result[0] + + vec_t_gauss_int = np.vectorize(t_gauss_int) + + p = vec_t_gauss_int(mu, var) + p_025 = mu - p + p_975 = mu + p + return mu, np.nan*mu, p_025, p_975 + + +class weibull_survival(likelihood_function): + """Weibull t likelihood distribution for survival analysis with censoring + For nomanclature see Bayesian Survival Analysis + + Laplace: + Needs functions to calculate + ln p(yi|fi) + dln p(yi|fi)_dfi + d2ln p(yi|fi)_d2fifj + """ + def __init__(self, shape, scale): + self.shape = shape + self.scale = scale + + #FIXME: This should be in the superclass + self.log_concave = True + + def link_function(self, y, f, extra_data=None): + """ + link_function $\ln p(y|f)$, i.e. log likelihood + + $$\ln p(y|f) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$ + + :y: time of event data + :f: latent variables f + :extra_data: the censoring indicator, 1 for censored, 0 for not + :returns: float(likelihood evaluated for this point) + + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + + v = extra_data + objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f) # FIXME: CHECK THIS WITH BOOK, wheres scale? 
+ return np.sum(objective) + + def link_grad(self, y, f, extra_data=None): + """ + Gradient of the link function at y, given f w.r.t f + + $$\frac{d}{df} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}\exp(f_{i}) + + :y: data + :f: latent variables f + :extra_data: the censoring indicator, 1 for censored, 0 for not + :returns: gradient of likelihood evaluated at points + + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + + v = extra_data + grad = v - (y**self.shape)*np.exp(f) + return np.squeeze(grad) + + def link_hess(self, y, f, extra_data=None): + """ + Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j + i.e. second derivative link_function at y given f f_j w.r.t f and f_j + + Will return diagonal of hessian, since every where else it is 0 + + $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ + + :y: data + :f: latent variables f + :extra_data: extra_data which is not used hessian + :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + + hess = (y**self.shape)*np.exp(f) + return np.squeeze(hess) From 1420aa532c5df8eaf4e6db5b89e77f4b375ebf1c Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 19 Apr 2013 12:23:00 +0100 Subject: [PATCH 27/71] Attempted to introduce gradient methods, won't work yet I doubt --- GPy/examples/__init__.py | 1 + GPy/likelihoods/Laplace.py | 120 ++++++++++++++++++------ GPy/likelihoods/likelihood_functions.py | 58 +++++++++++- GPy/models/GP.py | 16 +++- GPy/util/linalg.py | 19 +++- 5 files changed, 177 insertions(+), 37 deletions(-) diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py index 551bff54..68832e77 100644 --- a/GPy/examples/__init__.py +++ b/GPy/examples/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
# Licensed under the BSD 3-clause license (see LICENSE.txt) +import laplace_approximations import classification import regression import dimensionality_reduction diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 4d94ba0f..b1b41957 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -4,28 +4,9 @@ import GPy from scipy.linalg import cholesky, eig, inv, cho_solve, det from numpy.linalg import cond from GPy.likelihoods.likelihood import likelihood -from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv +from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet from scipy.linalg.lapack import dtrtrs -#TODO: Move this to utils - - -def det_ln_diag(A): - """ - log determinant of a diagonal matrix - $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$ - """ - return np.log(np.diagonal(A)).sum() - - -def pddet(A): - """ - Determinant of a positive definite matrix - """ - L = cholesky(A) - logdetA = 2*sum(np.log(np.diag(L))) - return logdetA - class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -75,17 +56,92 @@ class Laplace(likelihood): return self.likelihood_function.predictive_values(mu, var) def _get_params(self): - return np.zeros(0) + return np.asarray(self.likelihood_function._get_params()) def _get_param_names(self): - return [] + return self.likelihood_function._get_param_names() def _set_params(self, p): - pass # TODO: Laplace likelihood might want to take some parameters... 
+ return self.likelihood_function._set_params() + + def both_gradients(self, dL_d_K_Sigma, dK_dthetaK): + """ + Find the gradients of the marginal likelihood w.r.t both thetaK and thetaL + + dL_dthetaK differs from that of normal likelihoods as it has additional terms coming from + changes to y_tilde and changes to Sigma_tilde when the kernel parameters are adjusted + + Similar terms arise when finding the gradients with respect to changes in the liklihood + parameters + """ + return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) + + def _shared_gradients_components(self): + dL_dytil = -np.dot((self.K+self.Sigma_tilde), self.Y) + dytil_dfhat = np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W? + return dL_dytil, dytil_dfhat + + def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): + """ + #explicit #implicit #implicit + dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK) + :param dL_d_K_Sigma: Derivative of marginal with respect to K_prior+Sigma_tilde (posterior covariance) + :param dK_dthetaK: explcit derivative of kernel with respect to its hyper paramers + :returns: dL_dthetaK - gradients of marginal likelihood w.r.t changes in K hyperparameters + """ + dL_dytil, dytil_dfhat = self._shared_gradients_components() + + I_KW_i, _, _, _ = pdinv(np.eye(self.N) + np.dot(self.K, self.W)) + #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K! + dfhat_dthetaK = I_KW_i*dK_dthetaK*self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) + + dytil_dthetaK = dytil_dfhat*dfhat_dthetaK + + #FIXME: Careful dL_dK = dL_d_K_Sigma + #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? + dL_dSigma = dL_d_K_Sigma + d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) + #explicit #implicit + dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! 
SHOULD SUM OVER FHAT NOT THETAS + dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde) + + dL_dthetaK_implicit = dL_dytil*dytil_dthetaK + dL_dSigma*dSigma_dthetaK + return dL_dthetaK_implicit def _gradients(self, partial): - return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... - raise NotImplementedError + """ + Gradients with respect to likelihood parameters + + Complicated, it differs for parameters of the kernel \theta_{K}, and + parameters of the likelihood, \theta_{L} + + dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK) + dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL) + dL_dK*dK_dthetaL = 0 + + dytil_dthetaX = dytil_dfhat * dfhat_dthetaX + dytil_dfhat = Sigma*Ki + I + + fhat = K*log_p(y|fhat) from rasm p125 + dfhat_dthetaK = (I + KW)i * dK_dthetaK * log_p(y|fhat) from rasm p125 + + dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi + dW_dthetaX = d_dthetaX[d2phi_d2fhat] + d2phi_d2fhat = Hessian function of likelihood + + partial = dL_dK + """ + dL_dytil, dytil_dfhat = self._shared_gradients_components() + dfhat_dthetaL = self.likelihood_function.df_dtheta() + + dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? + dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde) + dL_dSigma = partial # partial is dL_dK but K here is K+Sigma_tilde.... which is fine in this case + + dytil_dthetaL = dytil_dfhat*dfhat_dthetaL + dL_dthetaL = 0 + dL_dytil*dytil_dthetaL + dL_dSigma*dSigma_dthetaL + return dL_dthetaL + #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... 
def _compute_GP_variables(self): """ @@ -112,8 +168,9 @@ class Laplace(likelihood): $$\tilde{\Sigma} = W^{-1}$$ """ - epsilon = 1e-6 + epsilon = 1e14 + #Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I #dtritri -> L -> L_i #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i #((L.T*w)_i + I)f_hat = y_tilde @@ -122,11 +179,12 @@ class Laplace(likelihood): Lt_W = np.dot(L.T, self.W) ##Check it isn't singular! - if cond(Lt_W) > 1e14: + if cond(Lt_W) > epsilon: print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem" Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0] - Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat) + self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N) + Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat) #f.T(Ki + W)f f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat)) @@ -156,16 +214,16 @@ class Laplace(likelihood): #) ##Check it isn't singular! - if cond(self.W) > 1e14: + if cond(self.W) > epsilon: print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem" - Sigma_tilde = inv(self.W) # Damn + self.Sigma_tilde = inv(self.W) # Damn #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) self.Y = Y_tilde self.YYT = np.dot(self.Y, self.Y.T) - self.covariance_matrix = Sigma_tilde + self.covariance_matrix = self.Sigma_tilde self.precision = 1 / np.diag(self.covariance_matrix)[:, None] def fit_full(self, K): diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index c759e15f..6e72b029 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -20,6 +20,16 @@ class likelihood_function: def __init__(self,location=0,scale=1): self.location = location self.scale = scale + self.log_concave = True + + def _get_params(self): + return np.zeros(0) + + def _get_param_names(self): + return [] + + def _set_params(self, p): + pass class probit(likelihood_function): """ @@ 
-149,12 +159,22 @@ class student_t(likelihood_function): d2ln p(yi|fi)_d2fifj """ def __init__(self, deg_free, sigma=2): + super(student_t, self).__init__() self.v = deg_free self.sigma = sigma - - #FIXME: This should be in the superclass self.log_concave = False + def _get_params(self): + return np.asarray(self.sigma) + + def _get_param_names(self): + return ["t_noise_variance"] + + def _set_params(self, x): + self.sigma = float(x) + #self.covariance_matrix = np.eye(self.N)*self._variance + #self.precision = 1./self._variance + @property def variance(self, extra_data=None): return (self.v / float(self.v - 2)) * (self.sigma**2) @@ -222,6 +242,40 @@ class student_t(likelihood_function): hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) return np.squeeze(hess) + def d3link(self, y, f, extra_data=None): + """ + Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j + + $$\frac{-2(v+1)((f-y)^{3} - 3\sigma^{2}v(f-y))}{((f-y)^{2} + \sigma^{2}v)^{3}}$$ + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + #NB f-y not y-f + e = f - y + d3link_d3f = ( (-2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e)) + / ((e**2 + (self.sigma**2)*self.v)**3) + ) + return d3link_d3f + + def link_hess_grad_sigma(self, y, f, extra_data=None): + """ + Gradient of the hessian w.r.t sigma parameter + + $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}} + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + e = y - f + hess_grad_sigma = ( (2*self.sigma*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) + / ((e**2 + (self.sigma**2)*self.v)**3) + ) + return hess_grad_sigma + + def _gradients(self, y, f, extra_data=None): + return [self.link_hess_grad_sigma] # list as we might learn many parameters + def predictive_values(self, mu, var): """ Compute mean, and conficence interval (percentiles 5 and 95) of the prediction diff --git a/GPy/models/GP.py 
b/GPy/models/GP.py index cfda0cfe..1024b5ef 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -8,7 +8,7 @@ from .. import kern from ..core import model from ..util.linalg import pdinv,mdot from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango -from ..likelihoods import EP +from ..likelihoods import EP, Laplace class GP(model): """ @@ -128,7 +128,19 @@ class GP(model): For the likelihood parameters, pass in alpha = K^-1 y """ - return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK,X=self.X,slices1=self.Xslices,slices2=self.Xslices), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) + if isinstance(self.likelihood, Laplace): + dL_dthetaK_explicit = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices) + #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained + fake_dL_dKs = np.ones(self.dL_dK.shape) + dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices) + + dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK) + dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit + dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) + else: + dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices) + dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) + return np.hstack((dL_dthetaK, dL_dthetaL)) def _raw_predict(self,_Xnew,slices=None, full_cov=False): """ diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py index f88099a4..cb899397 100644 --- a/GPy/util/linalg.py +++ b/GPy/util/linalg.py @@ -14,6 +14,21 @@ import types #import scipy.lib.lapack.flapack import scipy as sp +def det_ln_diag(A): + """ + log determinant of a diagonal matrix + $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$ + """ + return np.log(np.diagonal(A)).sum() + +def pddet(A): + """ + Determinant of a positive definite matrix + """ + L = cholesky(A) + logdetA = 
2*sum(np.log(np.diag(L))) + return logdetA + def trace_dot(a,b): """ efficiently compute the trace of the matrix product of a and b @@ -166,8 +181,8 @@ def PCA(Y, Q): """ if not np.allclose(Y.mean(axis=0), 0.0): print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)" - - #Y -= Y.mean(axis=0) + + #Y -= Y.mean(axis=0) Z = linalg.svd(Y-Y.mean(axis=0), full_matrices = False) [X, W] = [Z[0][:,0:Q], np.dot(np.diag(Z[1]), Z[2]).T[:,0:Q]] From 267a8e427c147aa5ac98e3f42c58d90492e53b4c Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 19 Apr 2013 17:41:01 +0100 Subject: [PATCH 28/71] Adding gradients, shapes starting to make sense --- GPy/likelihoods/Laplace.py | 53 ++++++++++++++++--------- GPy/likelihoods/likelihood_functions.py | 28 +++++++++---- GPy/models/GP.py | 6 +-- GPy/util/linalg.py | 2 +- 4 files changed, 60 insertions(+), 29 deletions(-) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index b1b41957..b5c0bdfe 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -1,11 +1,12 @@ import numpy as np import scipy as sp import GPy -from scipy.linalg import cholesky, eig, inv, cho_solve, det +from scipy.linalg import inv, cho_solve, det from numpy.linalg import cond from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet from scipy.linalg.lapack import dtrtrs +import pylab as plt class Laplace(likelihood): @@ -62,7 +63,7 @@ class Laplace(likelihood): return self.likelihood_function._get_param_names() def _set_params(self, p): - return self.likelihood_function._set_params() + return self.likelihood_function._set_params(p) def both_gradients(self, dL_d_K_Sigma, dK_dthetaK): """ @@ -77,8 +78,8 @@ class Laplace(likelihood): return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) def _shared_gradients_components(self): - dL_dytil = -np.dot((self.K+self.Sigma_tilde), self.Y) - dytil_dfhat = np.dot(self.Sigma_tilde, 
self.Ki) + np.eye(self.N) # or self.Wi__Ki_W? + dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) + dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W? return dL_dytil, dytil_dfhat def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): @@ -91,12 +92,18 @@ class Laplace(likelihood): """ dL_dytil, dytil_dfhat = self._shared_gradients_components() - I_KW_i, _, _, _ = pdinv(np.eye(self.N) + np.dot(self.K, self.W)) + A = np.eye(self.N) + np.dot(self.K, self.W) + plt.imshow(A) + plt.show() + I_KW_i, _, _, _ = pdinv(A) + #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K! - dfhat_dthetaK = I_KW_i*dK_dthetaK*self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) - - dytil_dthetaK = dytil_dfhat*dfhat_dthetaK + #Derivative for each f dimension, for each of K's hyper parameters + dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0])) + for ind_j, thetaj in enumerate(dK_dthetaK): + dfhat_dthetaK[:, ind_j] = mdot(I_KW_i, thetaj, self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)) + dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK) #FIXME: Careful dL_dK = dL_d_K_Sigma #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? dL_dSigma = dL_d_K_Sigma @@ -105,8 +112,9 @@ class Laplace(likelihood): dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! 
SHOULD SUM OVER FHAT NOT THETAS dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde) - dL_dthetaK_implicit = dL_dytil*dytil_dthetaK + dL_dSigma*dSigma_dthetaK - return dL_dthetaK_implicit + dL_dthetaK_implicit = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)# + np.dot(dL_dSigma, dSigma_dthetaK) + #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T) + return np.squeeze(dL_dthetaK_implicit) def _gradients(self, partial): """ @@ -132,16 +140,25 @@ class Laplace(likelihood): partial = dL_dK """ dL_dytil, dytil_dfhat = self._shared_gradients_components() - dfhat_dthetaL = self.likelihood_function.df_dtheta() + dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde) + #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? + #Derivative for each f dimension, for each of K's hyper parameters + dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names()))) + for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T): + dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde, + dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)? + self.Sigma_tilde + ) + + #TODO: This is Wi*A*Wi, can be more numerically stable with a trick + #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde) dL_dSigma = partial # partial is dL_dK but K here is K+Sigma_tilde.... 
which is fine in this case - dytil_dthetaL = dytil_dfhat*dfhat_dthetaL - dL_dthetaL = 0 + dL_dytil*dytil_dthetaL + dL_dSigma*dSigma_dthetaL - return dL_dthetaL - #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... + #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL + dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL) + dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL) + return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) def _compute_GP_variables(self): """ @@ -335,7 +352,7 @@ class Laplace(likelihood): rs = 0 i = 0 while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: - f_old = f.copy() + #f_old = f.copy() W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data)) if not self.likelihood_function.log_concave: W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 6e72b029..64791047 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -159,10 +159,10 @@ class student_t(likelihood_function): d2ln p(yi|fi)_d2fifj """ def __init__(self, deg_free, sigma=2): - super(student_t, self).__init__() self.v = deg_free self.sigma = sigma self.log_concave = False + #super(student_t, self).__init__() def _get_params(self): return np.asarray(self.sigma) @@ -258,9 +258,9 @@ class student_t(likelihood_function): ) return d3link_d3f - def link_hess_grad_sigma(self, y, f, extra_data=None): + def link_hess_grad_std(self, y, f, extra_data=None): """ - Gradient of the hessian w.r.t sigma parameter + Gradient of the hessian w.r.t sigma parameter (standard deviation) $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}} """ @@ -273,8 +273,24 @@ class student_t(likelihood_function): 
) return hess_grad_sigma + def link_grad_std(self, y, f, extra_data=None): + """ + Gradient of the likelihood w.r.t sigma parameter (standard deviation) + + $$\frac{-2\sigma(v+1)(y-f)}{(v\sigma^{2} + (y-f)^{2})^{2}}$$ + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + e = y - f + grad_sigma = ( (-2*self.sigma*self.v*(self.v + 1)*e) + / ((self.v*(self.sigma**2) + e**2)**2) + ) + return grad_sigma + def _gradients(self, y, f, extra_data=None): - return [self.link_hess_grad_sigma] # list as we might learn many parameters + return [self.link_grad_std(y, f, extra_data=extra_data)[:, None], + self.link_hess_grad_std(y, f, extra_data=extra_data)[:, None]] # list as we might learn many parameters def predictive_values(self, mu, var): """ @@ -372,9 +388,7 @@ class weibull_survival(likelihood_function): def __init__(self, shape, scale): self.shape = shape self.scale = scale - - #FIXME: This should be in the superclass - self.log_concave = True + self.log_concave = True # Or false? 
def link_function(self, y, f, extra_data=None): """ diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 1024b5ef..24037afe 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -128,17 +128,17 @@ class GP(model): For the likelihood parameters, pass in alpha = K^-1 y """ + dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices) if isinstance(self.likelihood, Laplace): - dL_dthetaK_explicit = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices) + dL_dthetaK_explicit = dL_dthetaK #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained fake_dL_dKs = np.ones(self.dL_dK.shape) dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices) dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK) dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit - dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) + dL_dthetaL = self.likelihood._gradients(partial=self.dL_dK) else: - dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) return np.hstack((dL_dthetaK, dL_dthetaL)) diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py index cb899397..20293ed8 100644 --- a/GPy/util/linalg.py +++ b/GPy/util/linalg.py @@ -25,7 +25,7 @@ def pddet(A): """ Determinant of a positive definite matrix """ - L = cholesky(A) + L = jitchol(A) logdetA = 2*sum(np.log(np.diag(L))) return logdetA From 9de0b23f65470dfa3ec2fad756f2ab901f29ef0c Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 29 Apr 2013 18:08:46 +0100 Subject: [PATCH 29/71] Plotting problematic kernel --- GPy/likelihoods/Laplace.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index b5c0bdfe..9cacb0e1 100644 --- 
a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -92,9 +92,12 @@ class Laplace(likelihood): """ dL_dytil, dytil_dfhat = self._shared_gradients_components() - A = np.eye(self.N) + np.dot(self.K, self.W) - plt.imshow(A) - plt.show() + print "Computing K gradients" + I = np.eye(self.N) + C = np.dot(self.K, self.W) + A = I + C + #plt.imshow(A) + #plt.show() I_KW_i, _, _, _ = pdinv(A) #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K! @@ -250,6 +253,8 @@ class Laplace(likelihood): :K: Covariance matrix """ self.K = K.copy() + #assert np.all(self.K.T == self.K) + #self.K_safe = K.copy() if self.rasm: self.f_hat = self.rasm_mode(K) else: From f95666a8f9cb07209d80226ed1c5b0352b9eed75 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 6 May 2013 10:15:39 +0100 Subject: [PATCH 30/71] Merging --- GPy/likelihoods/Laplace.py | 1 + GPy/models/GP.py | 15 +++++---------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 9cacb0e1..5e28212e 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -98,6 +98,7 @@ class Laplace(likelihood): A = I + C #plt.imshow(A) #plt.show() + ki, _, _, _ = pdinv(self.K) I_KW_i, _, _, _ = pdinv(A) #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K! diff --git a/GPy/models/GP.py b/GPy/models/GP.py index d353e5dd..96ec6582 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -6,15 +6,9 @@ import numpy as np import pylab as pb from .. 
import kern from ..core import model -<<<<<<< HEAD -from ..util.linalg import pdinv,mdot -from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango -from ..likelihoods import EP, Laplace -======= from ..util.linalg import pdinv, mdot from ..util.plot import gpplot, x_frame1D, x_frame2D, Tango -from ..likelihoods import EP ->>>>>>> upstream/devel +from ..likelihoods import EP, Laplace class GP(model): """ @@ -34,6 +28,7 @@ class GP(model): """ def __init__(self, X, likelihood, kernel, normalize_X=False): + self.has_uncertain_inputs=False # parse arguments self.X = X @@ -128,12 +123,12 @@ class GP(model): Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta """ - dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices) + dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) if isinstance(self.likelihood, Laplace): dL_dthetaK_explicit = dL_dthetaK #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained fake_dL_dKs = np.ones(self.dL_dK.shape) - dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices) + dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK) dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit @@ -251,7 +246,7 @@ class GP(model): else: raise NotImplementedError, "Cannot define a frame with more than two input dimensions" - def plot(self, samples=0, plot_limits=None, which_data='all', which_functions='all', resolution=None, levels=20): + def plot(self, samples=0, plot_limits=None, which_data='all', which_functions='all', which_parts='all', resolution=None, levels=20): """ TODO: Docstrings! 
:param levels: for 2D plotting, the number of contour levels to use From a52c20f47008233495e20d96b4ab50be8eb7d4a3 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 7 May 2013 13:35:47 +0100 Subject: [PATCH 31/71] Added a debug examples --- GPy/examples/laplace_approximations.py | 84 +++++++++++++++++++++++++- GPy/likelihoods/Laplace.py | 23 +++++-- GPy/models/GP.py | 6 +- 3 files changed, 104 insertions(+), 9 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 5d1c1224..7e5c55bf 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -35,12 +35,86 @@ def timing(): print the_is print np.mean(the_is) +def debug_student_t_noise_approx(): + real_var = 0.2 + #Start a function, any function + X = np.linspace(0.0, 10.0, 30)[:, None] + Y = np.sin(X) + np.random.randn(*X.shape)*real_var + + X_full = np.linspace(0.0, 10.0, 500)[:, None] + Y_full = np.sin(X_full) + + #Y = Y/Y.max() + + #Add student t random noise to datapoints + deg_free = 10000 + real_sd = np.sqrt(real_var) + print "Real noise: ", real_sd + + initial_var_guess = 0.01 + #t_rv = t(deg_free, loc=0, scale=real_var) + #noise = t_rvrvs(size=Y.shape) + #Y += noise + + plt.figure(1) + plt.suptitle('Gaussian likelihood') + # Kernel object + kernel1 = GPy.kern.rbf(X.shape[1]) + kernel2 = kernel1.copy() + kernel3 = kernel1.copy() + kernel4 = kernel1.copy() + kernel5 = kernel1.copy() + kernel6 = kernel1.copy() + + print "Clean Gaussian" + #A GP should completely break down due to the points as they get a lot of weight + # create simple GP model + m = GPy.models.GP_regression(X, Y, kernel=kernel1) + # optimize + m.ensure_default_constraints() + m.optimize() + # plot + plt.subplot(131) + m.plot() + plt.plot(X_full, Y_full) + print m + + plt.suptitle('Student-t likelihood') + edited_real_sd = initial_var_guess #real_sd + + print "Clean student t, rasm" + t_distribution = 
GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) + m = GPy.models.GP(X, stu_t_likelihood, kernel6) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(132) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + + print "Clean student t, ncg" + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False) + m = GPy.models.GP(X, stu_t_likelihood, kernel3) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(133) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + + plt.show() def student_t_approx(): """ Example of regressing with a student t likelihood """ - real_var = 0.1 + real_var = 0.2 #Start a function, any function X = np.linspace(0.0, 10.0, 30)[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var @@ -58,8 +132,11 @@ def student_t_approx(): #Yc = Yc/Yc.max() #Add student t random noise to datapoints - deg_free = 10 + deg_free = 1000000000000 real_sd = np.sqrt(real_var) + print "Real noise: ", real_sd + + initial_var_guess = 0.01 #t_rv = t(deg_free, loc=0, scale=real_var) #noise = t_rvrvs(size=Y.shape) #Y += noise @@ -73,6 +150,7 @@ def student_t_approx(): #print corrupted_indices #noise = t_rv.rvs(size=(len(corrupted_indices), 1)) #Y[corrupted_indices] += noise + plt.figure(1) plt.suptitle('Gaussian likelihood') # Kernel object @@ -108,7 +186,7 @@ def student_t_approx(): plt.figure(2) plt.suptitle('Student-t likelihood') - edited_real_sd = real_sd + edited_real_sd = initial_var_guess #real_sd print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 
5e28212e..02f2c93f 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -5,7 +5,7 @@ from scipy.linalg import inv, cho_solve, det from numpy.linalg import cond from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet -from scipy.linalg.lapack import dtrtrs +from scipy.linalg.flapack import dtrtrs import pylab as plt @@ -63,6 +63,7 @@ class Laplace(likelihood): return self.likelihood_function._get_param_names() def _set_params(self, p): + print "Setting noise sd: ", p return self.likelihood_function._set_params(p) def both_gradients(self, dL_d_K_Sigma, dK_dthetaK): @@ -79,7 +80,9 @@ class Laplace(likelihood): def _shared_gradients_components(self): dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) - dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W? + #dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W? + Ki = inv(self.K) + dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? return dL_dytil, dytil_dfhat def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): @@ -93,19 +96,26 @@ class Laplace(likelihood): dL_dytil, dytil_dfhat = self._shared_gradients_components() print "Computing K gradients" + print "dytil_dfhat: ", np.mean(dytil_dfhat) I = np.eye(self.N) C = np.dot(self.K, self.W) A = I + C #plt.imshow(A) #plt.show() - ki, _, _, _ = pdinv(self.K) - I_KW_i, _, _, _ = pdinv(A) + + #FIXME: K ISNT SYMMETRIC SO NEITHER IS A AND IT MAKES IT NON-PD! + #ki, _, _, _ = pdinv(self.K) + #I_KW_i, _, _, _ = pdinv(A) + + I_KW_i = inv(A) + #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K! 
#Derivative for each f dimension, for each of K's hyper parameters dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0])) + grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) for ind_j, thetaj in enumerate(dK_dthetaK): - dfhat_dthetaK[:, ind_j] = mdot(I_KW_i, thetaj, self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)) + dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad)) dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK) #FIXME: Careful dL_dK = dL_d_K_Sigma @@ -116,8 +126,11 @@ class Laplace(likelihood): dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! SHOULD SUM OVER FHAT NOT THETAS dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde) + print "dL_dytil: ", np.mean(dL_dytil) + print "dytil_dthetaK: ", np.mean(dytil_dthetaK) dL_dthetaK_implicit = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)# + np.dot(dL_dSigma, dSigma_dthetaK) #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T) + import ipdb; ipdb.set_trace() # XXX BREAKPOINT return np.squeeze(dL_dthetaK_implicit) def _gradients(self, partial): diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 96ec6582..07c7a708 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -116,7 +116,6 @@ class GP(model): """ return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z - def _log_likelihood_gradients(self): """ The gradient of all parameters. 
@@ -132,9 +131,14 @@ class GP(model): dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK) dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit + + print "dL_dthetaK_explicit: {dldkx} dL_dthetaK_implicit: {dldki} dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK) + dL_dthetaL = self.likelihood._gradients(partial=self.dL_dK) else: + print "dL_dthetaK: ", dL_dthetaK dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) + print "dL_dthetaL: ", dL_dthetaL return np.hstack((dL_dthetaK, dL_dthetaL)) #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) From 84f12c1079a10db7dfe0737c5de1ca5b74d3b2d0 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 8 May 2013 12:36:31 +0100 Subject: [PATCH 32/71] Scale and switch KW+I --- GPy/examples/laplace_approximations.py | 5 ++-- GPy/likelihoods/Laplace.py | 37 +++++++++++++++----------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 7e5c55bf..704297ef 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -36,7 +36,7 @@ def timing(): print np.mean(the_is) def debug_student_t_noise_approx(): - real_var = 0.2 + real_var = 0.1 #Start a function, any function X = np.linspace(0.0, 10.0, 30)[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var @@ -44,7 +44,7 @@ def debug_student_t_noise_approx(): X_full = np.linspace(0.0, 10.0, 500)[:, None] Y_full = np.sin(X_full) - #Y = Y/Y.max() + Y = Y/Y.max() #Add student t random noise to datapoints deg_free = 10000 @@ -56,6 +56,7 @@ def debug_student_t_noise_approx(): #noise = t_rvrvs(size=Y.shape) #Y += noise + plt.close('all') plt.figure(1) plt.suptitle('Gaussian likelihood') # Kernel object diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 02f2c93f..934b2a90 
100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -3,8 +3,8 @@ import scipy as sp import GPy from scipy.linalg import inv, cho_solve, det from numpy.linalg import cond -from GPy.likelihoods.likelihood import likelihood -from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet +from likelihood import likelihood +from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet from scipy.linalg.flapack import dtrtrs import pylab as plt @@ -79,10 +79,10 @@ class Laplace(likelihood): return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) def _shared_gradients_components(self): - dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) - #dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W? - Ki = inv(self.K) - dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? + dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? + dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W? + #Ki = inv(self.K) + #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? return dL_dytil, dytil_dfhat def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): @@ -95,6 +95,10 @@ class Laplace(likelihood): """ dL_dytil, dytil_dfhat = self._shared_gradients_components() + d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) + + dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde)) + print "Computing K gradients" print "dytil_dfhat: ", np.mean(dytil_dfhat) I = np.eye(self.N) @@ -103,12 +107,7 @@ class Laplace(likelihood): #plt.imshow(A) #plt.show() - #FIXME: K ISNT SYMMETRIC SO NEITHER IS A AND IT MAKES IT NON-PD! - #ki, _, _, _ = pdinv(self.K) - #I_KW_i, _, _, _ = pdinv(A) - - I_KW_i = inv(A) - + I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?! 
#FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K! #Derivative for each f dimension, for each of K's hyper parameters @@ -121,14 +120,20 @@ class Laplace(likelihood): #FIXME: Careful dL_dK = dL_d_K_Sigma #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? dL_dSigma = dL_d_K_Sigma - d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) + #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) #explicit #implicit - dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! SHOULD SUM OVER FHAT NOT THETAS - dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde) + dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK) + dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0])) + for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK): + dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde) print "dL_dytil: ", np.mean(dL_dytil) print "dytil_dthetaK: ", np.mean(dytil_dthetaK) - dL_dthetaK_implicit = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)# + np.dot(dL_dSigma, dSigma_dthetaK) + + #FIXME: Won't handle multi dimensional data + dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0) + dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=(0,1)) + dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T) import ipdb; ipdb.set_trace() # XXX BREAKPOINT return np.squeeze(dL_dthetaK_implicit) From 6c4866662c9f20dbc3a9a5d08aab85bf95e1e84d Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 8 May 2013 16:05:01 +0100 Subject: [PATCH 33/71] Seem to have gradients much closer now --- GPy/examples/laplace_approximations.py | 34 +++++---- GPy/likelihoods/Laplace.py | 99 ++++++++++++++++++------- GPy/likelihoods/likelihood_functions.py | 19 +++-- 
GPy/models/GP.py | 18 +++-- 4 files changed, 110 insertions(+), 60 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 704297ef..57ae9be7 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -36,6 +36,7 @@ def timing(): print np.mean(the_is) def debug_student_t_noise_approx(): + plot = False real_var = 0.1 #Start a function, any function X = np.linspace(0.0, 10.0, 30)[:, None] @@ -57,8 +58,6 @@ def debug_student_t_noise_approx(): #Y += noise plt.close('all') - plt.figure(1) - plt.suptitle('Gaussian likelihood') # Kernel object kernel1 = GPy.kern.rbf(X.shape[1]) kernel2 = kernel1.copy() @@ -75,12 +74,14 @@ def debug_student_t_noise_approx(): m.ensure_default_constraints() m.optimize() # plot - plt.subplot(131) - m.plot() - plt.plot(X_full, Y_full) + if plot: + plt.figure(1) + plt.suptitle('Gaussian likelihood') + plt.subplot(131) + m.plot() + plt.plot(X_full, Y_full) print m - plt.suptitle('Student-t likelihood') edited_real_sd = initial_var_guess #real_sd print "Clean student t, rasm" @@ -91,10 +92,12 @@ def debug_student_t_noise_approx(): m.update_likelihood_approximation() m.optimize() print(m) - plt.subplot(132) - m.plot() - plt.plot(X_full, Y_full) - plt.ylim(-2.5, 2.5) + if plot: + plt.suptitle('Student-t likelihood') + plt.subplot(132) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) print "Clean student t, ncg" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) @@ -104,12 +107,13 @@ def debug_student_t_noise_approx(): m.update_likelihood_approximation() m.optimize() print(m) - plt.subplot(133) - m.plot() - plt.plot(X_full, Y_full) - plt.ylim(-2.5, 2.5) + if plot: + plt.subplot(133) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) - plt.show() + #plt.show() def student_t_approx(): """ diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 934b2a90..566e4e25 100644 --- 
a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -5,8 +5,8 @@ from scipy.linalg import inv, cho_solve, det from numpy.linalg import cond from likelihood import likelihood from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet -from scipy.linalg.flapack import dtrtrs -import pylab as plt +from scipy.linalg.lapack import dtrtrs +#import pylab as plt class Laplace(likelihood): @@ -79,9 +79,9 @@ class Laplace(likelihood): return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) def _shared_gradients_components(self): - dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? + dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W? - #Ki = inv(self.K) + #Ki, _, _, _ = pdinv(self.K) #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? return dL_dytil, dytil_dfhat @@ -95,9 +95,8 @@ class Laplace(likelihood): """ dL_dytil, dytil_dfhat = self._shared_gradients_components() - d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) - dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde)) + #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde)) print "Computing K gradients" print "dytil_dfhat: ", np.mean(dytil_dfhat) @@ -107,7 +106,8 @@ class Laplace(likelihood): #plt.imshow(A) #plt.show() - I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?! + #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?! + I_KW_i = self.Bi # could use self.B_chol?? #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K! 
#Derivative for each f dimension, for each of K's hyper parameters @@ -117,25 +117,44 @@ class Laplace(likelihood): dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad)) dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK) - #FIXME: Careful dL_dK = dL_d_K_Sigma #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? dL_dSigma = dL_d_K_Sigma #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) #explicit #implicit - dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK) - dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0])) - for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK): - dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde) - - print "dL_dytil: ", np.mean(dL_dytil) - print "dytil_dthetaK: ", np.mean(dytil_dthetaK) + #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK) + #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0])) + d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) + Wi = np.diagonal(self.Sigma_tilde) #Convenience + dSigma_dthetaK_explicit = 0 + #Can just hadamard product as diagonal matricies multiplied are just multiplying elements + dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) + #dSigma_dthetaK_implicit = -np.sum(np.dot(dWi_dfhat, dfhat_dthetaK), axis=0) + dSigma_dthetaK_implicit = np.dot(dWi_dfhat, dfhat_dthetaK) + dSigma_dthetaK = dSigma_dthetaK_explicit + dSigma_dthetaK_implicit + #dSigma_dthetaK = 0 + np.dot(, dfhat_dthetaK) + #for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK): + #dSigma_dthetaK_explicit = 0 + #dSigma_dthetaK_implicit = -np.dot(Wi, dW_dfhat + #dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde) #FIXME: Won't handle multi dimensional data dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0) - 
dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=(0,1)) + dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0) dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T) - import ipdb; ipdb.set_trace() # XXX BREAKPOINT + + #print "\n" + #print "dL_dytil: ", np.mean(dL_dytil) + #print "dytil_dthetaK: ", np.mean(dytil_dthetaK) + #print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil + #print "\n" + #print "dL_dSigma: ", np.mean(dL_dSigma) + #print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK) + #print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma + #print "\n" + #print "dL_dthetaK_implicit: ", dL_dthetaK_implicit + #import ipdb; ipdb.set_trace() # XXX BREAKPOINT + return np.squeeze(dL_dthetaK_implicit) def _gradients(self, partial): @@ -159,27 +178,51 @@ class Laplace(likelihood): dW_dthetaX = d_dthetaX[d2phi_d2fhat] d2phi_d2fhat = Hessian function of likelihood - partial = dL_dK + partial = dL_d_K_Sigma """ dL_dytil, dytil_dfhat = self._shared_gradients_components() - dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? + #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? + + dlikelihood_dthetaL_explicit, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? + dlikelihood_dfhat = self.likelihood_function.link_hess(self.data, self.f_hat, self.extra_data) + dfhat_dthetaL_cyclic = 0 #what is this? how can dfhat_dthetaL be used in the value of itself? 
+ dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f + dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None]) + dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL) + + #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? + dL_dSigma = partial #Is actually but can't rename it because of naming convention... dL_d_K_Sigma + + Wi = np.diagonal(self.Sigma_tilde) #Convenience + #-1 as we are looking at W which is -1*d2log p(y|f) + #Can just hadamard product as diagonal matricies multiplied are just multiplying elements + dSigma_dthetaL_explicit = np.diagflat(-(Wi*(-1*d2likelihood_dthetaL)*Wi)) + + d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) + dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) + dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL_cyclic) + dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? #Derivative for each f dimension, for each of K's hyper parameters - dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names()))) - for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T): - dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde, - dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)? - self.Sigma_tilde - ) + #dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names()))) + #for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T): + #dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde, + #dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)? 
+ #self.Sigma_tilde + #) #TODO: This is Wi*A*Wi, can be more numerically stable with a trick #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde) - dL_dSigma = partial # partial is dL_dK but K here is K+Sigma_tilde.... which is fine in this case #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL - dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL) - dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL) + #dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL) + #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL) + + dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0) + dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0) + dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma + return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) def _compute_GP_variables(self): diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index cd6467d7..2176aac0 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -248,17 +248,16 @@ class student_t(likelihood_function): """ Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j - $$\frac{-2(v+1)((f-y)^{3} - 3\sigma^{2}v(f-y))}{((f-y)^{2} + \sigma^{2}v)^{3}}$$ + $$\frac{2(v+1)((y-f)^{3} - 3\sigma^{2}v(y-f))}{((y-f)^{2} + \sigma^{2}v)^{3}}$$ """ y = np.squeeze(y) f = np.squeeze(f) assert y.shape == f.shape - #NB f-y not y-f - e = f - y - d3link_d3f = ( (-2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e)) + e = y - f + d3link_d3f = ( (2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e)) / ((e**2 + (self.sigma**2)*self.v)**3) ) - return d3link_d3f + return np.squeeze(d3link_d3f) def link_hess_grad_std(self, y, f, extra_data=None): """ @@ -270,10 +269,10 @@ class student_t(likelihood_function): f = np.squeeze(f) assert 
y.shape == f.shape e = y - f - hess_grad_sigma = ( (2*self.sigma*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) + hess_grad_sigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) / ((e**2 + (self.sigma**2)*self.v)**3) ) - return hess_grad_sigma + return np.squeeze(hess_grad_sigma) def link_grad_std(self, y, f, extra_data=None): """ @@ -288,11 +287,11 @@ class student_t(likelihood_function): grad_sigma = ( (-2*self.sigma*self.v*(self.v + 1)*e) / ((self.v*(self.sigma**2) + e**2)**2) ) - return grad_sigma + return np.squeeze(grad_sigma) def _gradients(self, y, f, extra_data=None): - return [self.link_grad_std(y, f, extra_data=extra_data)[:, None], - self.link_hess_grad_std(y, f, extra_data=extra_data)[:, None]] # list as we might learn many parameters + return [self.link_grad_std(y, f, extra_data=extra_data), + self.link_hess_grad_std(y, f, extra_data=extra_data)] # list as we might learn many parameters def predictive_values(self, mu, var): """ diff --git a/GPy/models/GP.py b/GPy/models/GP.py index a346b47b..1682ee6c 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -125,19 +125,23 @@ class GP(model): if isinstance(self.likelihood, Laplace): dL_dthetaK_explicit = dL_dthetaK #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained - fake_dL_dKs = np.ones(self.dL_dK.shape) + fake_dL_dKs = np.eye(self.dL_dK.shape[0]) dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) - dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK) + #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now + dL_dthetaK_implicit = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK) dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit - print "dL_dthetaK_explicit: {dldkx} dL_dthetaK_implicit: {dldki} dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK) + #print "dL_dthetaK_explicit: {dldkx} 
dL_dthetaK_implicit: {dldki} dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK) - dL_dthetaL = self.likelihood._gradients(partial=self.dL_dK) - else: - print "dL_dthetaK: ", dL_dthetaK dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) - print "dL_dthetaL: ", dL_dthetaL + print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + else: + #print "dL_dthetaK: ", dL_dthetaK + dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) + print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) + #print "dL_dthetaL: ", dL_dthetaL return np.hstack((dL_dthetaK, dL_dthetaL)) #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) From 9500b12b532e2f9abd68621a0ce8662e4553cb2c Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 8 May 2013 20:53:23 +0100 Subject: [PATCH 34/71] Working on putting callback to update laplace in callback --- GPy/inference/optimization.py | 13 ++++++++++++- GPy/likelihoods/Laplace.py | 1 - GPy/likelihoods/likelihood_functions.py | 4 ++++ GPy/models/GP.py | 10 ++++++++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/GPy/inference/optimization.py b/GPy/inference/optimization.py index 75cd94ba..1445eed0 100644 --- a/GPy/inference/optimization.py +++ b/GPy/inference/optimization.py @@ -29,7 +29,7 @@ class Optimizer(): :rtype: optimizer object. 
""" - def __init__(self, x_init, messages=False, model = None, max_f_eval=1e4, max_iters = 1e3, ftol=None, gtol=None, xtol=None): + def __init__(self, x_init, messages=False, model = None, max_f_eval=1e4, max_iters = 1e3, ftol=None, gtol=None, xtol=None, callback=None): self.opt_name = None self.x_init = x_init self.messages = messages @@ -45,6 +45,7 @@ class Optimizer(): self.gtol = gtol self.ftol = ftol self.model = model + self.callback = callback def run(self, **kwargs): start = dt.datetime.now() @@ -94,6 +95,8 @@ class opt_tnc(Optimizer): opt_dict['ftol'] = self.ftol if self.gtol is not None: opt_dict['pgtol'] = self.gtol + if self.callback is not None: + opt_dict['callback'] = self.callback opt_result = optimize.fmin_tnc(f_fp, self.x_init, messages = self.messages, maxfun = self.max_f_eval, **opt_dict) @@ -128,6 +131,8 @@ class opt_lbfgsb(Optimizer): print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it" if self.gtol is not None: opt_dict['pgtol'] = self.gtol + if self.callback is not None: + opt_dict['callback'] = self.callback opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint = iprint, maxfun = self.max_f_eval, **opt_dict) @@ -155,6 +160,8 @@ class opt_simplex(Optimizer): opt_dict['ftol'] = self.ftol if self.gtol is not None: print "WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it" + if self.callback is not None: + opt_dict['callback'] = self.callback opt_result = optimize.fmin(f, self.x_init, (), disp = self.messages, maxfun = self.max_f_eval, full_output=True, **opt_dict) @@ -187,6 +194,8 @@ class opt_rasm(Optimizer): print "WARNING: minimize doesn't have an ftol arg, so I'm going to ignore it" if self.gtol is not None: print "WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it" + if self.callback is not None: + print "WARNING: minimize doesn't have a callback arg, so I'm going to ignore it" opt_result = rasm.minimize(self.x_init, f_fp, (), messages = self.messages, maxnumfuneval = 
self.max_f_eval) @@ -205,6 +214,8 @@ class opt_SCG(Optimizer): def opt(self, f_fp = None, f = None, fp = None): assert not f is None assert not fp is None + if self.callback is not None: + print "WARNING: SCG doesn't have a callback arg, so I'm going to ignore it" opt_result = SCG(f,fp,self.x_init, display=self.messages, maxiters=self.max_iters, max_f_eval=self.max_f_eval, xtol=self.xtol, ftol=self.ftol) self.x_opt = opt_result[0] self.trace = opt_result[1] diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 566e4e25..208b1102 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -63,7 +63,6 @@ class Laplace(likelihood): return self.likelihood_function._get_param_names() def _set_params(self, p): - print "Setting noise sd: ", p return self.likelihood_function._set_params(p) def both_gradients(self, dL_d_K_Sigma, dK_dthetaK): diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 2176aac0..61c79385 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -166,6 +166,8 @@ class student_t(likelihood_function): self.log_concave = False #super(student_t, self).__init__() + self._set_params(np.asarray(sigma)) + def _get_params(self): return np.asarray(self.sigma) @@ -174,6 +176,8 @@ class student_t(likelihood_function): def _set_params(self, x): self.sigma = float(x) + print "Setting student t sigma: ", x + print x #self.covariance_matrix = np.eye(self.N)*self._variance #self.precision = 1./self._variance diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 1682ee6c..79284b59 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -86,6 +86,16 @@ class GP(model): def _get_param_names(self): return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() + def _update_params_callback(self, p): + #FIXME:Check the transforming + #Set the new parameters of the kernel and likelihood within the optimization + import 
ipdb; ipdb.set_trace() ### XXX BREAKPOINT + self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()]) + self.likelihood._set_params(p[self.kern.Nparam_transformed():]) + #update the likelihood approximation within the optimisation with the current parameters + self.update_likelihood_approximation() + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + def update_likelihood_approximation(self): """ Approximates a non-gaussian likelihood using Expectation Propagation From 5472c5c6ba445c49fcdb98ccef4635f17a801b28 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 13 May 2013 18:36:02 +0100 Subject: [PATCH 35/71] Almost have likelihood gradients working but kernels still way off --- GPy/examples/laplace_approximations.py | 39 ++++++----- GPy/likelihoods/Laplace.py | 88 ++++++++++++++++--------- GPy/likelihoods/likelihood_functions.py | 4 +- GPy/models/GP.py | 20 +++--- 4 files changed, 91 insertions(+), 60 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 57ae9be7..2054881c 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -52,7 +52,7 @@ def debug_student_t_noise_approx(): real_sd = np.sqrt(real_var) print "Real noise: ", real_sd - initial_var_guess = 0.01 + initial_var_guess = 1 #t_rv = t(deg_free, loc=0, scale=real_var) #noise = t_rvrvs(size=Y.shape) #Y += noise @@ -84,14 +84,21 @@ def debug_student_t_noise_approx(): edited_real_sd = initial_var_guess #real_sd + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, stu_t_likelihood, kernel6) - m.ensure_default_constraints() + #m.constrain_positive('rbf') + m.constrain_fixed('rbf_v', 1.0898) + m.constrain_fixed('rbf_l', 1.8651) + m.constrain_positive('t_noi') + 
#m.constrain_fixed('t_noise_variance', real_sd) m.update_likelihood_approximation() - m.optimize() + #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback) + m.optimize('scg', messages=True) print(m) + return m if plot: plt.suptitle('Student-t likelihood') plt.subplot(132) @@ -99,19 +106,19 @@ def debug_student_t_noise_approx(): plt.plot(X_full, Y_full) plt.ylim(-2.5, 2.5) - print "Clean student t, ncg" - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False) - m = GPy.models.GP(X, stu_t_likelihood, kernel3) - m.ensure_default_constraints() - m.update_likelihood_approximation() - m.optimize() - print(m) - if plot: - plt.subplot(133) - m.plot() - plt.plot(X_full, Y_full) - plt.ylim(-2.5, 2.5) + #print "Clean student t, ncg" + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False) + #m = GPy.models.GP(X, stu_t_likelihood, kernel3) + #m.ensure_default_constraints() + #m.update_likelihood_approximation() + #m.optimize() + #print(m) + #if plot: + #plt.subplot(133) + #m.plot() + #plt.plot(X_full, Y_full) + #plt.ylim(-2.5, 2.5) #plt.show() diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 208b1102..5b3e8f43 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -63,6 +63,7 @@ class Laplace(likelihood): return self.likelihood_function._get_param_names() def _set_params(self, p): + #print "Setting laplace param with: ", p return self.likelihood_function._set_params(p) def both_gradients(self, dL_d_K_Sigma, dK_dthetaK): @@ -78,10 +79,24 @@ class Laplace(likelihood): return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) def _shared_gradients_components(self): - dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? 
Shouldn't this be -y*R - dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W? - #Ki, _, _, _ = pdinv(self.K) - #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? + dL_dytil = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R + + d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) + Wi = np.diagonal(self.Sigma_tilde) #Convenience + #Can just hadamard product as diagonal matricies multiplied are just multiplying elements + dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) + + Ki, _, _, _ = pdinv(self.K) + #dytil_dfhat_implicit = np.dot(dWi_dfhat, Ki) + np.eye(self.N) + #dytil_dfhat = np.dot(dWi_dfhat, Ki) + np.eye(self.N) + + #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full + #dytil_dfhat_explicit = self.Wi__Ki_W + #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit + #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically + + a = mdot(dWi_dfhat, Ki, self.f_hat) + dytil_dfhat = mdot(dWi_dfhat, Ki, self.f_hat) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) return dL_dytil, dytil_dfhat def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): @@ -94,18 +109,18 @@ class Laplace(likelihood): """ dL_dytil, dytil_dfhat = self._shared_gradients_components() - #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde)) - print "Computing K gradients" - print "dytil_dfhat: ", np.mean(dytil_dfhat) - I = np.eye(self.N) - C = np.dot(self.K, self.W) - A = I + C + #print "Computing K gradients" + #print "dytil_dfhat: ", np.mean(dytil_dfhat) + #I = np.eye(self.N) + #C = np.dot(self.K, self.W) + #A = I + C #plt.imshow(A) #plt.show() #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?! + #B = I + w12*K*w12 I_KW_i = self.Bi # could use self.B_chol?? 
#FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K! @@ -113,15 +128,22 @@ class Laplace(likelihood): dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0])) grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) for ind_j, thetaj in enumerate(dK_dthetaK): - dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad)) + #dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad))) + dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad) + print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat) + print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK) dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK) + print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK) + print "\n" + #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? dL_dSigma = dL_d_K_Sigma #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) #explicit #implicit #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK) #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0])) + d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) Wi = np.diagonal(self.Sigma_tilde) #Convenience dSigma_dthetaK_explicit = 0 @@ -140,19 +162,16 @@ class Laplace(likelihood): dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0) dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0) dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma - #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T) - #print "\n" - #print "dL_dytil: ", np.mean(dL_dytil) - #print "dytil_dthetaK: ", np.mean(dytil_dthetaK) - #print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil - #print "\n" - #print "dL_dSigma: ", np.mean(dL_dSigma) - #print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK) - #print 
"dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma - #print "\n" - #print "dL_dthetaK_implicit: ", dL_dthetaK_implicit - #import ipdb; ipdb.set_trace() # XXX BREAKPOINT + print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil) + print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK) + print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil + print "\n" + print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma) + print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK) + print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma + print "\n" + print "dL_dthetaK_implicit: ", dL_dthetaK_implicit return np.squeeze(dL_dthetaK_implicit) @@ -182,11 +201,15 @@ class Laplace(likelihood): dL_dytil, dytil_dfhat = self._shared_gradients_components() #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - dlikelihood_dthetaL_explicit, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - dlikelihood_dfhat = self.likelihood_function.link_hess(self.data, self.f_hat, self.extra_data) - dfhat_dthetaL_cyclic = 0 #what is this? how can dfhat_dthetaL be used in the value of itself? - dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f - dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None]) + dlikelihood_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? + dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) + #dfhat_dthetaL_cyclic = 0 #FIXME: what is this? how can dfhat_dthetaL be used in the value of itself? 
+ #dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f + #dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None]) + #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N)) + KW_I_i = self.Bi # could use self.B_chol?? + dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihood_dfhat)) + dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL) #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? @@ -199,7 +222,7 @@ class Laplace(likelihood): d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) - dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL_cyclic) + dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL) dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? @@ -219,8 +242,10 @@ class Laplace(likelihood): #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL) dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0) - dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0) + dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma[:, None].T, dSigma_dthetaL), axis=0) dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma + dL_dthetaL_via_Sigma_old = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) @@ -257,7 +282,7 @@ class Laplace(likelihood): #((L.T*w)_i + I)f_hat = y_tilde L = jitchol(self.K) Li = chol_inv(L) - Lt_W = np.dot(L.T, self.W) + Lt_W = np.dot(L.T, self.W) #FIXME: Can make Faster ##Check it isn't singular! 
if cond(Lt_W) > epsilon: @@ -361,7 +386,6 @@ class Laplace(likelihood): """ #W is diagnoal so its sqrt is just the sqrt of the diagonal elements W_12 = np.sqrt(W) - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12)) L = jitchol(B) return (B, L, W_12) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 61c79385..6eef9f33 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -176,8 +176,6 @@ class student_t(likelihood_function): def _set_params(self, x): self.sigma = float(x) - print "Setting student t sigma: ", x - print x #self.covariance_matrix = np.eye(self.N)*self._variance #self.precision = 1./self._variance @@ -288,7 +286,7 @@ class student_t(likelihood_function): f = np.squeeze(f) assert y.shape == f.shape e = y - f - grad_sigma = ( (-2*self.sigma*self.v*(self.v + 1)*e) + grad_sigma = ( (2*self.sigma*self.v*(self.v + 1)*e) / ((self.v*(self.sigma**2) + e**2)**2) ) return np.squeeze(grad_sigma) diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 79284b59..ff852766 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -66,6 +66,10 @@ class GP(model): # self.likelihood._set_params(p[self.kern.Nparam:]) # test by Nicolas self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas + if isinstance(self.likelihood, Laplace): + print "Updating approx: ", p + self.likelihood.fit_full(self.kern.K(self.X)) + self.likelihood._set_params(self.likelihood._get_params()) self.K = self.kern.K(self.X) self.K += self.likelihood.covariance_matrix @@ -87,14 +91,12 @@ class GP(model): return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() def _update_params_callback(self, p): - #FIXME:Check the transforming - #Set the new parameters of the kernel and likelihood within the optimization - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + #parameters will be in transformed space 
self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()]) + #set_params_transformed for likelihood doesn't exist? self.likelihood._set_params(p[self.kern.Nparam_transformed():]) #update the likelihood approximation within the optimisation with the current parameters self.update_likelihood_approximation() - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def update_likelihood_approximation(self): """ @@ -123,7 +125,9 @@ class GP(model): model for a new variable Y* = v_tilde/tau_tilde, with a covariance matrix K* = K + diag(1./tau_tilde) plus a normalization term. """ - return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z + l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z + print "Log likelihood: ", l + return l def _log_likelihood_gradients(self): """ @@ -135,7 +139,7 @@ class GP(model): if isinstance(self.likelihood, Laplace): dL_dthetaK_explicit = dL_dthetaK #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained - fake_dL_dKs = np.eye(self.dL_dK.shape[0]) + fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right... 
dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now @@ -145,13 +149,11 @@ class GP(model): #print "dL_dthetaK_explicit: {dldkx} dL_dthetaK_implicit: {dldki} dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) + #print "dL_dthetaL: ", dL_dthetaL print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT else: - #print "dL_dthetaK: ", dL_dthetaK dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) - #print "dL_dthetaL: ", dL_dthetaL return np.hstack((dL_dthetaK, dL_dthetaL)) #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) From 787a038401ee959fbbd8bfe354c84c1d4cbd56fa Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 14 May 2013 16:23:18 +0100 Subject: [PATCH 36/71] Still getting closer to grads for likelihood --- GPy/examples/laplace_approximations.py | 4 ++-- GPy/likelihoods/Laplace.py | 16 ++++++---------- GPy/likelihoods/likelihood_functions.py | 4 ++-- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 2054881c..eb725b53 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -95,10 +95,10 @@ def debug_student_t_noise_approx(): m.constrain_positive('t_noi') #m.constrain_fixed('t_noise_variance', real_sd) m.update_likelihood_approximation() - #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback) - m.optimize('scg', messages=True) print(m) return m + #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback) + m.optimize('scg', messages=True) if 
plot: plt.suptitle('Student-t likelihood') plt.subplot(132) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 5b3e8f43..2af51f2b 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -201,24 +201,22 @@ class Laplace(likelihood): dL_dytil, dytil_dfhat = self._shared_gradients_components() #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - dlikelihood_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? + dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) - #dfhat_dthetaL_cyclic = 0 #FIXME: what is this? how can dfhat_dthetaL be used in the value of itself? - #dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f - #dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None]) #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N)) KW_I_i = self.Bi # could use self.B_chol?? - dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihood_dfhat)) + dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL)) + #dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None] dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL) #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? - dL_dSigma = partial #Is actually but can't rename it because of naming convention... dL_d_K_Sigma + dL_dSigma = np.diagflat(partial) #Is actually but can't rename it because of naming convention... 
dL_d_K_Sigma Wi = np.diagonal(self.Sigma_tilde) #Convenience #-1 as we are looking at W which is -1*d2log p(y|f) #Can just hadamard product as diagonal matricies multiplied are just multiplying elements - dSigma_dthetaL_explicit = np.diagflat(-(Wi*(-1*d2likelihood_dthetaL)*Wi)) + dSigma_dthetaL_explicit = np.diagflat(-1*(Wi*(-1*d2likelihood_dthetaL)*Wi)) d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) @@ -242,10 +240,8 @@ class Laplace(likelihood): #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL) dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0) - dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma[:, None].T, dSigma_dthetaL), axis=0) + dL_dthetaL_via_Sigma = np.sum(np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)) dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma - dL_dthetaL_via_Sigma_old = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 6eef9f33..1a9dac75 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -256,7 +256,7 @@ class student_t(likelihood_function): f = np.squeeze(f) assert y.shape == f.shape e = y - f - d3link_d3f = ( (2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e)) + d3link_d3f = ( (2*(self.v + 1)*(-1*e)*(e**2 - 3*(self.sigma**2)*self.v)) / ((e**2 + (self.sigma**2)*self.v)**3) ) return np.squeeze(d3link_d3f) @@ -286,7 +286,7 @@ class student_t(likelihood_function): f = np.squeeze(f) assert y.shape == f.shape e = y - f - grad_sigma = ( (2*self.sigma*self.v*(self.v + 1)*e) + grad_sigma = ( (-2*self.sigma*self.v*(self.v + 1)*e) / 
((self.v*(self.sigma**2) + e**2)**2) ) return np.squeeze(grad_sigma) From 569311b5107c6ec6cb2cc41587701f5526fb70dd Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 15 May 2013 19:25:55 +0100 Subject: [PATCH 37/71] Gradients almost there for dytil_dfhat, diagonal terms are right --- GPy/likelihoods/Laplace.py | 21 ++-- GPy/likelihoods/likelihood_functions.py | 4 +- GPy/testing/laplace_approx.tests.py | 123 ++++++++++++++++++++++++ 3 files changed, 140 insertions(+), 8 deletions(-) create mode 100644 GPy/testing/laplace_approx.tests.py diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 2af51f2b..ce3f870f 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -96,7 +96,10 @@ class Laplace(likelihood): #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically a = mdot(dWi_dfhat, Ki, self.f_hat) - dytil_dfhat = mdot(dWi_dfhat, Ki, self.f_hat) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) + b = np.dot(self.Sigma_tilde, Ki) + dytil_dfhat = - np.dot(dWi_dfhat, np.dot(Ki, self.f_hat)) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) + #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N) + self.dytil_dfhat = dytil_dfhat return dL_dytil, dytil_dfhat def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): @@ -330,19 +333,25 @@ class Laplace(likelihood): def fit_full(self, K): """ - The laplace approximation algorithm + The laplace approximation algorithm, find K and expand hessian For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability :K: Covariance matrix """ self.K = K.copy() - #assert np.all(self.K.T == self.K) - #self.K_safe = K.copy() + + #Find mode if self.rasm: self.f_hat = self.rasm_mode(K) else: self.f_hat = self.ncg_mode(K) + #Compute hessian and other variables at mode + self._compute_likelihood_variables() + + def _compute_likelihood_variables(self): #At this point get the hessian 
matrix + #print "Data: ", self.data + #print "fhat: ", self.f_hat self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data)) if not self.likelihood_function.log_concave: @@ -352,14 +361,14 @@ class Laplace(likelihood): #This is a property only held by non-log-concave likelihoods #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though - self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W) + self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W) self.Bi, _, _, B_det = pdinv(self.B) Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K) self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i) b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None] - solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b))) + solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b))) a = b - mdot(self.W_12, solve_chol) self.f_Ki_f = np.dot(self.f_hat.T, a) self.ln_K_det = pddet(self.K) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 0d194c01..646293d2 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -10,8 +10,7 @@ from scipy.special import gammaln, gamma from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf class likelihood_function: - """ - Likelihood class for doing Expectation propagation + """ Likelihood class for doing Expectation propagation :param Y: observed output (Nx1 numpy.darray) ..Note:: Y values allowed depend on the likelihood_function used @@ -241,6 +240,7 @@ class student_t(likelihood_function): y = np.squeeze(y) f = np.squeeze(f) assert y.shape == f.shape + e = y - f hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) return np.squeeze(hess) diff --git 
a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx.tests.py new file mode 100644 index 00000000..394950d5 --- /dev/null +++ b/GPy/testing/laplace_approx.tests.py @@ -0,0 +1,123 @@ +import unittest +import numpy as np + +import GPy +from GPy.models import GP +from GPy.util.linalg import pdinv, tdot +from scipy import linalg + +class LikelihoodGradParam(GP): + def __init__(self, X, likelihood_function, kernel, param_name=None, function=None, **kwargs): + super(LikelihoodGradParam, self).__init__(X, likelihood_function, kernel) + self.param_name = param_name + self.func = function + #self.func_params = kwargs + #self.parameter = self.likelihood.__getattribute__(self.param_name) + + def _get_param_names(self): + f_hats = ["f_{}".format(i) for i in range(len(self.likelihood.f_hat))] + return f_hats + + def _get_params(self): + return np.hstack([np.squeeze(self.likelihood.f_hat)]) + #return np.hstack([self.likelihood.__getattribute__(self.param_name)]) + + def hack_dL_dK(self): + self.K = self.kern.K(self.X) + self.K += self.likelihood.covariance_matrix + + self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) + + # the gradient of the likelihood wrt the covariance matrix + if self.likelihood.YYT is None: + alpha, _ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y, lower=1) + self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki) + else: + tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1) + tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1) + self.dL_dK = 0.5 * (tmp - self.D * self.Ki) + + def _set_params(self, x): + self.likelihood.f_hat = x.reshape(self.N, 1) + self.likelihood._compute_likelihood_variables() + self.hack_dL_dK() + + def log_likelihood(self): + return self.func(self.likelihood)[0, 0] + + def _log_likelihood_gradients(self): + #gradient = self.likelihood.__getattribute__(self.param_name) + self.likelihood._compute_likelihood_variables() + 
self.likelihood._gradients(partial=np.diag(self.dL_dK)) + gradient = getattr(self.likelihood, self.param_name) + #Need to sum over fhats? For dytil_dfhat... + #gradient = np.flatten(gradient, axis=0) + #return gradient[:, 0] + return gradient[0, :] + + +class LaplaceTests(unittest.TestCase): + def setUp(self): + real_var = 0.1 + #Start a function, any function + #self.X = np.linspace(0.0, 10.0, 30)[:, None] + self.X = np.random.randn(2,1) + #self.X = np.ones((10,1)) + Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var + self.Y = Y/Y.max() + self.kernel = GPy.kern.rbf(self.X.shape[1]) + + deg_free = 10000 + real_sd = np.sqrt(real_var) + initial_sd_guess = 1 + + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=initial_sd_guess) + self.stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) + self.stu_t_likelihood.fit_full(self.kernel.K(self.X)) + self.m = LikelihoodGradParam(self.X, self.stu_t_likelihood, self.kernel, None, None) + self.m.constrain_fixed('rbf_v', 1.0898) + self.m.constrain_fixed('rbf_l', 1.8651) + + def tearDown(self): + self.m = None + + def test_dy_dfhat(self): + def ytil(likelihood): + Sigma_tilde = likelihood.Sigma_tilde + K = likelihood.K + Ki, _, _, _ = pdinv(K) + f_hat = likelihood.f_hat + Sigma, _, _, _ = pdinv(Sigma_tilde) + return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat) + + self.m.func = ytil + self.m.param_name = 'dytil_dfhat' + self.m.randomize() + #try: + self.m.checkgrad(verbose=1) + assert self.m.checkgrad() + #except: + #import ipdb;ipdb.set_trace() + + + #def test_dL_dytil(self): + #def L(likelihood): + ##-0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z + #Sigma_tilde = likelihood.Sigma_tilde + #Ki = likelihood.K + #f_hat = likelihood.f_hat + #Sigma, _, _, _ = pdinv(Sigma_tilde) + #return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat) + + #self.m.func = L + #self.m.param_name = 'dL_dytil' + #m.randomize() + ##try: + 
#m.checkgrad(verbose=1) + #assert m.checkgrad() + #except: + #import ipdb;ipdb.set_trace() + +if __name__ == "__main__": + unittest.main() + From 21ae81de29c36ad94d8d7fc412db869c7926719a Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 16 May 2013 12:00:15 +0100 Subject: [PATCH 38/71] Workong on doing explicit gradients --- GPy/likelihoods/Laplace.py | 13 +++++++++++++ GPy/testing/laplace_approx.tests.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index ce3f870f..f2197e55 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -97,6 +97,19 @@ class Laplace(likelihood): a = mdot(dWi_dfhat, Ki, self.f_hat) b = np.dot(self.Sigma_tilde, Ki) + #dytil_dfhat = np.zeros(self.K.shape) + #for col in range(self.N): + #for row in range(self.N): + #t1 = 0 + #for l in range(self.N): + #t1 += dWi_dfhat[col, col]*Ki[col,l]*self.f_hat[l, 0] + ##t2 = np.zeros((1, self.N)) + #t2 = np.dot(self.Sigma_tilde, Ki[:, col]) + ##for k in range(self.N): + ##t2[:] += self.Sigma_tilde[k, k]*Ki[k, col] + #dytil_dfhat[row, col] = (t1 + t2)[row] + #dytil_dfhat += np.eye(self.N) + dytil_dfhat = - np.dot(dWi_dfhat, np.dot(Ki, self.f_hat)) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N) self.dytil_dfhat = dytil_dfhat diff --git a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx.tests.py index 394950d5..73dfbfd6 100644 --- a/GPy/testing/laplace_approx.tests.py +++ b/GPy/testing/laplace_approx.tests.py @@ -61,7 +61,7 @@ class LaplaceTests(unittest.TestCase): real_var = 0.1 #Start a function, any function #self.X = np.linspace(0.0, 10.0, 30)[:, None] - self.X = np.random.randn(2,1) + self.X = np.random.randn(9,1) #self.X = np.ones((10,1)) Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var self.Y = Y/Y.max() From e5d7ee972848e5eb5ec1186c3150d9720328076f Mon Sep 
17 00:00:00 2001 From: Alan Saul Date: Thu, 16 May 2013 12:06:09 +0100 Subject: [PATCH 39/71] FIXED DYTIL_DFHAT --- GPy/likelihoods/Laplace.py | 6 +++--- GPy/testing/laplace_approx.tests.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index f2197e55..42897f80 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -105,12 +105,12 @@ class Laplace(likelihood): #t1 += dWi_dfhat[col, col]*Ki[col,l]*self.f_hat[l, 0] ##t2 = np.zeros((1, self.N)) #t2 = np.dot(self.Sigma_tilde, Ki[:, col]) - ##for k in range(self.N): - ##t2[:] += self.Sigma_tilde[k, k]*Ki[k, col] + ###for k in range(self.N): + ###t2[:] += self.Sigma_tilde[k, k]*Ki[k, col] #dytil_dfhat[row, col] = (t1 + t2)[row] #dytil_dfhat += np.eye(self.N) - dytil_dfhat = - np.dot(dWi_dfhat, np.dot(Ki, self.f_hat)) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) + dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N) self.dytil_dfhat = dytil_dfhat return dL_dytil, dytil_dfhat diff --git a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx.tests.py index 73dfbfd6..2b3af2ad 100644 --- a/GPy/testing/laplace_approx.tests.py +++ b/GPy/testing/laplace_approx.tests.py @@ -60,8 +60,8 @@ class LaplaceTests(unittest.TestCase): def setUp(self): real_var = 0.1 #Start a function, any function - #self.X = np.linspace(0.0, 10.0, 30)[:, None] - self.X = np.random.randn(9,1) + self.X = np.linspace(0.0, 10.0, 30)[:, None] + #self.X = np.random.randn(,1) #self.X = np.ones((10,1)) Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var self.Y = Y/Y.max() From 48d693791eabf51e64b28706910a9a9444457825 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 16 May 2013 12:22:37 +0100 Subject: [PATCH 40/71] changed name --- 
GPy/examples/laplace_approximations.py | 2 +- GPy/likelihoods/Laplace.py | 25 ++++--------------- ...pprox.tests.py => laplace_approx_tests.py} | 0 3 files changed, 6 insertions(+), 21 deletions(-) rename GPy/testing/{laplace_approx.tests.py => laplace_approx_tests.py} (100%) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index eb725b53..4d8e96b8 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -39,7 +39,7 @@ def debug_student_t_noise_approx(): plot = False real_var = 0.1 #Start a function, any function - X = np.linspace(0.0, 10.0, 30)[:, None] + X = np.linspace(0.0, 10.0, 2)[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var X_full = np.linspace(0.0, 10.0, 500)[:, None] diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 42897f80..b0dde03f 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -95,23 +95,7 @@ class Laplace(likelihood): #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? 
Theyre the same basically - a = mdot(dWi_dfhat, Ki, self.f_hat) - b = np.dot(self.Sigma_tilde, Ki) - #dytil_dfhat = np.zeros(self.K.shape) - #for col in range(self.N): - #for row in range(self.N): - #t1 = 0 - #for l in range(self.N): - #t1 += dWi_dfhat[col, col]*Ki[col,l]*self.f_hat[l, 0] - ##t2 = np.zeros((1, self.N)) - #t2 = np.dot(self.Sigma_tilde, Ki[:, col]) - ###for k in range(self.N): - ###t2[:] += self.Sigma_tilde[k, k]*Ki[k, col] - #dytil_dfhat[row, col] = (t1 + t2)[row] - #dytil_dfhat += np.eye(self.N) - dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) - #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N) self.dytil_dfhat = dytil_dfhat return dL_dytil, dytil_dfhat @@ -219,10 +203,10 @@ class Laplace(likelihood): dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) - #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N)) - KW_I_i = self.Bi # could use self.B_chol?? + KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N)) + #KW_I_i = self.Bi # could use self.B_chol?? 
dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL)) - #dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None] + dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None] dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL) @@ -383,7 +367,8 @@ class Laplace(likelihood): b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None] solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b))) a = b - mdot(self.W_12, solve_chol) - self.f_Ki_f = np.dot(self.f_hat.T, a) + self.Ki_f = a + self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f) self.ln_K_det = pddet(self.K) self.ln_z_hat = (- 0.5*self.f_Ki_f diff --git a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx_tests.py similarity index 100% rename from GPy/testing/laplace_approx.tests.py rename to GPy/testing/laplace_approx_tests.py From 146d7e2458cbfc69f8303b0b413e50cebf7fd7f7 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 17 May 2013 17:42:00 +0100 Subject: [PATCH 41/71] Trying to fix dL_dytil gradient --- GPy/likelihoods/Laplace.py | 23 +++++- GPy/testing/laplace_approx_tests.py | 109 +++++++++++++++++----------- 2 files changed, 84 insertions(+), 48 deletions(-) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index b0dde03f..af20d36a 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -79,16 +79,29 @@ class Laplace(likelihood): return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) def _shared_gradients_components(self): - dL_dytil = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R + Ki, _, _, _ = pdinv(self.K) + + #Y__KS_i = np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) + #dL_dytil = -0.5*Y__KS_i #or *0.5? 
Shouldn't this be -y*R + #dL_dytil = -0.5*np.trace(np.dot(inv(self.K+self.Sigma_tilde), (np.dot(self.Y, self.Y.T) + self.Y.T))) + #dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde), + #dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y) + c = inv(self.K+self.Sigma_tilde) + dL_dytil_simple_term = -0.5*np.diag(np.dot(c, self.Y) + np.dot(self.Y.T, c)) + + P = np.diagflat(1/np.dot(Ki, self.f_hat)) + K_Wi_i = inv(self.K+self.Sigma_tilde) + + dL_dytil_difficult_term = np.diag(( -0.5*(np.dot(self.K + self.Sigma_tilde, P)) + +0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P) + ) * np.eye(self.N)) + dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) Wi = np.diagonal(self.Sigma_tilde) #Convenience #Can just hadamard product as diagonal matricies multiplied are just multiplying elements dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) - Ki, _, _, _ = pdinv(self.K) - #dytil_dfhat_implicit = np.dot(dWi_dfhat, Ki) + np.eye(self.N) - #dytil_dfhat = np.dot(dWi_dfhat, Ki) + np.eye(self.N) #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full #dytil_dfhat_explicit = self.Wi__Ki_W @@ -97,6 +110,8 @@ class Laplace(likelihood): dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) self.dytil_dfhat = dytil_dfhat + #dytil_dfhat = np.eye(dytil_dfhat.shape[0]) + self.dL_dfhat = np.dot(dL_dytil, dytil_dfhat) #FIXME: Purely for checkgradding.... 
return dL_dytil, dytil_dfhat def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): diff --git a/GPy/testing/laplace_approx_tests.py b/GPy/testing/laplace_approx_tests.py index 2b3af2ad..acb1c822 100644 --- a/GPy/testing/laplace_approx_tests.py +++ b/GPy/testing/laplace_approx_tests.py @@ -1,26 +1,29 @@ import unittest import numpy as np +np.random.seed(82) import GPy from GPy.models import GP from GPy.util.linalg import pdinv, tdot from scipy import linalg -class LikelihoodGradParam(GP): - def __init__(self, X, likelihood_function, kernel, param_name=None, function=None, **kwargs): - super(LikelihoodGradParam, self).__init__(X, likelihood_function, kernel) +class LikelihoodParamGrad(GP): + def __init__(self, X=None, likelihood_function=None, kernel=None, param_name=None, function=None, dparam_name=None, **kwargs): self.param_name = param_name + self.dparam_name = dparam_name self.func = function + super(LikelihoodParamGrad, self).__init__(X, likelihood_function, kernel) #self.func_params = kwargs #self.parameter = self.likelihood.__getattribute__(self.param_name) def _get_param_names(self): - f_hats = ["f_{}".format(i) for i in range(len(self.likelihood.f_hat))] - return f_hats + params = getattr(self.likelihood, self.dparam_name) + params_names = ["{}_{}".format(self.dparam_name, i) for i in range(len(params))] + return params_names def _get_params(self): - return np.hstack([np.squeeze(self.likelihood.f_hat)]) - #return np.hstack([self.likelihood.__getattribute__(self.param_name)]) + params = getattr(self.likelihood, self.dparam_name) + return np.hstack([params]) def hack_dL_dK(self): self.K = self.kern.K(self.X) @@ -38,29 +41,56 @@ class LikelihoodGradParam(GP): self.dL_dK = 0.5 * (tmp - self.D * self.Ki) def _set_params(self, x): - self.likelihood.f_hat = x.reshape(self.N, 1) + raise NotImplementedError + + def log_likelihood(self): + raise NotImplementedError + + def _log_likelihood_gradients(self): + raise NotImplementedError + + +class 
Likelihood_F_Grad(LikelihoodParamGrad): + def __init__(self, **kwargs): + super(Likelihood_F_Grad, self).__init__(**kwargs) + + def _set_params(self, x): + params = getattr(self.likelihood, self.dparam_name) + setattr(self.likelihood, self.dparam_name, x.reshape(*params.shape)) self.likelihood._compute_likelihood_variables() self.hack_dL_dK() def log_likelihood(self): - return self.func(self.likelihood)[0, 0] + ll = self.func(self) + if self.param_name == "dL_dfhat_": + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + if len(ll.shape) == 0 or len(ll.shape) == 1: + return ll.sum() + elif len(ll.shape) == 2: + #print "Only checking first likelihood" + return ll[0, 0] + else: + raise ValueError('Not implemented for larger matricies yet') + return ll def _log_likelihood_gradients(self): - #gradient = self.likelihood.__getattribute__(self.param_name) self.likelihood._compute_likelihood_variables() self.likelihood._gradients(partial=np.diag(self.dL_dK)) gradient = getattr(self.likelihood, self.param_name) - #Need to sum over fhats? For dytil_dfhat... 
- #gradient = np.flatten(gradient, axis=0) - #return gradient[:, 0] - return gradient[0, :] + if len(gradient.shape) == 1: + return gradient + elif len(gradient.shape) == 2: + #print "Only checking first gradients" + return gradient[0,: ] + else: + raise ValueError('Not implemented for larger matricies yet') class LaplaceTests(unittest.TestCase): def setUp(self): real_var = 0.1 #Start a function, any function - self.X = np.linspace(0.0, 10.0, 30)[:, None] + self.X = np.linspace(0.0, 10.0, 4)[:, None] #self.X = np.random.randn(,1) #self.X = np.ones((10,1)) Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var @@ -74,49 +104,40 @@ class LaplaceTests(unittest.TestCase): t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=initial_sd_guess) self.stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) self.stu_t_likelihood.fit_full(self.kernel.K(self.X)) - self.m = LikelihoodGradParam(self.X, self.stu_t_likelihood, self.kernel, None, None) - self.m.constrain_fixed('rbf_v', 1.0898) - self.m.constrain_fixed('rbf_l', 1.8651) def tearDown(self): self.m = None def test_dy_dfhat(self): - def ytil(likelihood): - Sigma_tilde = likelihood.Sigma_tilde - K = likelihood.K + def ytil(self): + Sigma_tilde = self.likelihood.Sigma_tilde + K = self.likelihood.K Ki, _, _, _ = pdinv(K) - f_hat = likelihood.f_hat + f_hat = self.likelihood.f_hat Sigma, _, _, _ = pdinv(Sigma_tilde) return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat) - self.m.func = ytil - self.m.param_name = 'dytil_dfhat' + self.m = Likelihood_F_Grad(X=self.X, likelihood_function=self.stu_t_likelihood, + kernel=self.kernel, param_name='dytil_dfhat', + function=ytil, dparam_name='f_hat') + #self.m.constrain_fixed('rbf_v', 1.0898) + #self.m.constrain_fixed('rbf_l', 1.8651) self.m.randomize() - #try: self.m.checkgrad(verbose=1) assert self.m.checkgrad() - #except: - #import ipdb;ipdb.set_trace() + def test_dL_dfhat(self): + def L(self): + return np.array(-0.5 * 
self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z) - #def test_dL_dytil(self): - #def L(likelihood): - ##-0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z - #Sigma_tilde = likelihood.Sigma_tilde - #Ki = likelihood.K - #f_hat = likelihood.f_hat - #Sigma, _, _, _ = pdinv(Sigma_tilde) - #return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat) - - #self.m.func = L - #self.m.param_name = 'dL_dytil' - #m.randomize() - ##try: - #m.checkgrad(verbose=1) - #assert m.checkgrad() - #except: - #import ipdb;ipdb.set_trace() + self.m = Likelihood_F_Grad(X=self.X, likelihood_function=self.stu_t_likelihood, + kernel=self.kernel, param_name='dL_dfhat', + function=L, dparam_name='f_hat') + self.m.constrain_fixed('rbf_v', 1.0898) + self.m.constrain_fixed('rbf_l', 1.8651) + self.m.randomize() + self.m.checkgrad(verbose=1) + assert self.m.checkgrad() if __name__ == "__main__": unittest.main() From d63d370641846642bdc02f0295177f7f37b5f5fb Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 29 May 2013 13:46:55 +0100 Subject: [PATCH 42/71] About to rip out old chain rule method of learning gradients --- GPy/likelihoods/Laplace.py | 4 +++- GPy/testing/laplace_approx_tests.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index af20d36a..666fa227 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -87,7 +87,7 @@ class Laplace(likelihood): #dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde), #dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y) c = inv(self.K+self.Sigma_tilde) - dL_dytil_simple_term = -0.5*np.diag(np.dot(c, self.Y) + np.dot(self.Y.T, c)) + dL_dytil_simple_term = -0.5*np.diag(2*np.dot(c, self.Y)) P = np.diagflat(1/np.dot(Ki, self.f_hat)) K_Wi_i = inv(self.K+self.Sigma_tilde) @@ -96,6 +96,7 @@ class Laplace(likelihood): +0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P) ) * np.eye(self.N)) dL_dytil 
= dL_dytil_simple_term + dL_dytil_difficult_term + dL_dytil = dL_dytil.reshape(1, self.N) d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) Wi = np.diagonal(self.Sigma_tilde) #Convenience @@ -329,6 +330,7 @@ class Laplace(likelihood): #+ y_W_f #+ self.ln_z_hat #) + self.Z_tilde = 0 ##Check it isn't singular! if cond(self.W) > epsilon: diff --git a/GPy/testing/laplace_approx_tests.py b/GPy/testing/laplace_approx_tests.py index acb1c822..15d84c9c 100644 --- a/GPy/testing/laplace_approx_tests.py +++ b/GPy/testing/laplace_approx_tests.py @@ -62,8 +62,6 @@ class Likelihood_F_Grad(LikelihoodParamGrad): def log_likelihood(self): ll = self.func(self) - if self.param_name == "dL_dfhat_": - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT if len(ll.shape) == 0 or len(ll.shape) == 1: return ll.sum() elif len(ll.shape) == 2: @@ -128,6 +126,7 @@ class LaplaceTests(unittest.TestCase): def test_dL_dfhat(self): def L(self): + #return np.array(-0.5 * self.D * self.K_logdet + self._model_fit_term()) #Ignore Z for now return np.array(-0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z) self.m = Likelihood_F_Grad(X=self.X, likelihood_function=self.stu_t_likelihood, From 117c377d13efe81b2df567936ff48e85f918efcd Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 29 May 2013 14:02:03 +0100 Subject: [PATCH 43/71] Ripped out all things Laplace parameter estimation, starting again with new tactic --- GPy/likelihoods/Laplace.py | 175 +------------------------------------ GPy/models/GP.py | 8 +- 2 files changed, 4 insertions(+), 179 deletions(-) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 666fa227..69c0876b 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -79,187 +79,18 @@ class Laplace(likelihood): return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) def _shared_gradients_components(self): - Ki, _, _, _ = pdinv(self.K) - - #Y__KS_i = 
np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) - #dL_dytil = -0.5*Y__KS_i #or *0.5? Shouldn't this be -y*R - #dL_dytil = -0.5*np.trace(np.dot(inv(self.K+self.Sigma_tilde), (np.dot(self.Y, self.Y.T) + self.Y.T))) - #dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde), - #dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y) - c = inv(self.K+self.Sigma_tilde) - dL_dytil_simple_term = -0.5*np.diag(2*np.dot(c, self.Y)) - - P = np.diagflat(1/np.dot(Ki, self.f_hat)) - K_Wi_i = inv(self.K+self.Sigma_tilde) - - dL_dytil_difficult_term = np.diag(( -0.5*(np.dot(self.K + self.Sigma_tilde, P)) - +0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P) - ) * np.eye(self.N)) - dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term - dL_dytil = dL_dytil.reshape(1, self.N) - - d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) - Wi = np.diagonal(self.Sigma_tilde) #Convenience - #Can just hadamard product as diagonal matricies multiplied are just multiplying elements - dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) - - - #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full - #dytil_dfhat_explicit = self.Wi__Ki_W - #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit - #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically - - dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) - self.dytil_dfhat = dytil_dfhat - #dytil_dfhat = np.eye(dytil_dfhat.shape[0]) - self.dL_dfhat = np.dot(dL_dytil, dytil_dfhat) #FIXME: Purely for checkgradding.... 
- return dL_dytil, dytil_dfhat def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): """ - #explicit #implicit #implicit - dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK) - :param dL_d_K_Sigma: Derivative of marginal with respect to K_prior+Sigma_tilde (posterior covariance) - :param dK_dthetaK: explcit derivative of kernel with respect to its hyper paramers - :returns: dL_dthetaK - gradients of marginal likelihood w.r.t changes in K hyperparameters + Gradients with respect to prior kernel parameters """ - dL_dytil, dytil_dfhat = self._shared_gradients_components() - - #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde)) - - #print "Computing K gradients" - #print "dytil_dfhat: ", np.mean(dytil_dfhat) - #I = np.eye(self.N) - #C = np.dot(self.K, self.W) - #A = I + C - #plt.imshow(A) - #plt.show() - - #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?! - #B = I + w12*K*w12 - I_KW_i = self.Bi # could use self.B_chol?? - - #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K! - #Derivative for each f dimension, for each of K's hyper parameters - dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0])) - grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) - for ind_j, thetaj in enumerate(dK_dthetaK): - #dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad))) - dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad) - - print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat) - print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK) - dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK) - print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK) - print "\n" - - #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? 
- dL_dSigma = dL_d_K_Sigma - #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) - #explicit #implicit - #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK) - #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0])) - - d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) - Wi = np.diagonal(self.Sigma_tilde) #Convenience - dSigma_dthetaK_explicit = 0 - #Can just hadamard product as diagonal matricies multiplied are just multiplying elements - dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) - #dSigma_dthetaK_implicit = -np.sum(np.dot(dWi_dfhat, dfhat_dthetaK), axis=0) - dSigma_dthetaK_implicit = np.dot(dWi_dfhat, dfhat_dthetaK) - dSigma_dthetaK = dSigma_dthetaK_explicit + dSigma_dthetaK_implicit - #dSigma_dthetaK = 0 + np.dot(, dfhat_dthetaK) - #for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK): - #dSigma_dthetaK_explicit = 0 - #dSigma_dthetaK_implicit = -np.dot(Wi, dW_dfhat - #dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde) - - #FIXME: Won't handle multi dimensional data - dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0) - dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0) - dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma - - print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil) - print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK) - print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil - print "\n" - print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma) - print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK) - print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma - print "\n" - print "dL_dthetaK_implicit: ", dL_dthetaK_implicit - - return np.squeeze(dL_dthetaK_implicit) + return dL_dthetaK def _gradients(self, partial): """ Gradients with respect to likelihood parameters - - Complicated, 
it differs for parameters of the kernel \theta_{K}, and - parameters of the likelihood, \theta_{L} - - dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK) - dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL) - dL_dK*dK_dthetaL = 0 - - dytil_dthetaX = dytil_dfhat * dfhat_dthetaX - dytil_dfhat = Sigma*Ki + I - - fhat = K*log_p(y|fhat) from rasm p125 - dfhat_dthetaK = (I + KW)i * dK_dthetaK * log_p(y|fhat) from rasm p125 - - dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi - dW_dthetaX = d_dthetaX[d2phi_d2fhat] - d2phi_d2fhat = Hessian function of likelihood - - partial = dL_d_K_Sigma """ - dL_dytil, dytil_dfhat = self._shared_gradients_components() - #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - - dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data) - KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N)) - #KW_I_i = self.Bi # could use self.B_chol?? - dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL)) - dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None] - - dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL) - - #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5? - dL_dSigma = np.diagflat(partial) #Is actually but can't rename it because of naming convention... 
dL_d_K_Sigma - - Wi = np.diagonal(self.Sigma_tilde) #Convenience - #-1 as we are looking at W which is -1*d2log p(y|f) - #Can just hadamard product as diagonal matricies multiplied are just multiplying elements - dSigma_dthetaL_explicit = np.diagflat(-1*(Wi*(-1*d2likelihood_dthetaL)*Wi)) - - d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data) - dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi) - dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL) - dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit - - #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell? - #Derivative for each f dimension, for each of K's hyper parameters - #dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names()))) - #for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T): - #dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde, - #dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)? 
- #self.Sigma_tilde - #) - - #TODO: This is Wi*A*Wi, can be more numerically stable with a trick - #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde) - - #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL - #dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL) - #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL) - - dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0) - dL_dthetaL_via_Sigma = np.sum(np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)) - dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma - - return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) + return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) def _compute_GP_variables(self): """ diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 17e2a1b1..da379eb1 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -150,14 +150,8 @@ class GP(model): fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right... 
dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) - #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now - dL_dthetaK_implicit = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK) - dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit - - #print "dL_dthetaK_explicit: {dldkx} dL_dthetaK_implicit: {dldki} dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK) - + dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) - #print "dL_dthetaL: ", dL_dthetaL print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) else: dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) From 23ed2a2d15c28fe5d868639ad1358024808a328f Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 29 May 2013 17:33:06 +0100 Subject: [PATCH 44/71] Lots of name changing and went through all likelihood gradients again --- GPy/examples/laplace_approximations.py | 27 ++++--- GPy/likelihoods/Laplace.py | 35 +++++++-- GPy/likelihoods/likelihood_functions.py | 96 +++++++++++++++---------- GPy/models/GP.py | 2 +- 4 files changed, 103 insertions(+), 57 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 4d8e96b8..27f063dc 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -69,22 +69,21 @@ def debug_student_t_noise_approx(): print "Clean Gaussian" #A GP should completely break down due to the points as they get a lot of weight # create simple GP model - m = GPy.models.GP_regression(X, Y, kernel=kernel1) - # optimize - m.ensure_default_constraints() - m.optimize() - # plot - if plot: - plt.figure(1) - plt.suptitle('Gaussian likelihood') - plt.subplot(131) - m.plot() - plt.plot(X_full, Y_full) - print m + #m = GPy.models.GP_regression(X, Y, kernel=kernel1) + ## 
optimize + #m.ensure_default_constraints() + #m.optimize() + ## plot + #if plot: + #plt.figure(1) + #plt.suptitle('Gaussian likelihood') + #plt.subplot(131) + #m.plot() + #plt.plot(X_full, Y_full) + #print m edited_real_sd = initial_var_guess #real_sd - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) @@ -95,10 +94,10 @@ def debug_student_t_noise_approx(): m.constrain_positive('t_noi') #m.constrain_fixed('t_noise_variance', real_sd) m.update_likelihood_approximation() + m.optimize('scg', messages=True) print(m) return m #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback) - m.optimize('scg', messages=True) if plot: plt.suptitle('Student-t likelihood') plt.subplot(132) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 69c0876b..f8ba25f1 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -79,17 +79,40 @@ class Laplace(likelihood): return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) def _shared_gradients_components(self): + Ki, _, _, _ = pdinv(self.K) + Ki_W_i = inv(Ki + self.W) #Do it non numerically stable for now + d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat) + dL_dfhat = -0.5*np.dot(np.diag(Ki_W_i), d3lik_d3fhat) + KW = np.dot(self.K, self.W) + I_KW_i = inv(np.eye(KW.shape[0]) + KW) + return dL_dfhat, Ki, I_KW_i def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): """ Gradients with respect to prior kernel parameters """ + dL_dfhat, Ki, I_KW_i = self._shared_gradients_components() + K_Wi_i = inv(self.K + inv(self.W)) + dlp = self.likelihood_function.dlik_df(self.data, self.f_hat) + + dL_dthetaK = np.zeros(dK_dthetaK.shape) + for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK): + #Explicit + dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, 
dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(np.dot(K_Wi_i, dK_dthetaK_i)) + #Implicit + df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK, dlp) + dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK) + return dL_dthetaK def _gradients(self, partial): """ Gradients with respect to likelihood parameters """ + dL_dfhat, Ki, I_KW_i = self._shared_gradients_components() + dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat) + dL_dthetaL = np.zeros(dlik_dthetaL.shape) + return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) def _compute_GP_variables(self): @@ -197,7 +220,7 @@ class Laplace(likelihood): #At this point get the hessian matrix #print "Data: ", self.data #print "fhat: ", self.f_hat - self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data)) + self.W = -np.diag(self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)) if not self.likelihood_function.log_concave: self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur @@ -212,7 +235,7 @@ class Laplace(likelihood): Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K) self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i) - b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None] + b = np.dot(self.W, self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)[:, None] solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b))) a = b - mdot(self.W_12, solve_chol) self.Ki_f = a @@ -259,11 +282,11 @@ class Laplace(likelihood): return float(res) def obj_grad(f): - res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f)) + res = -1 * 
(self.likelihood_function.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f)) return np.squeeze(res) def obj_hess(f): - res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki) + res = -1 * (--np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki) return np.squeeze(res) f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) @@ -294,7 +317,7 @@ class Laplace(likelihood): i = 0 while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: #f_old = f.copy() - W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data)) + W = -np.diag(self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)) if not self.likelihood_function.log_concave: W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur # If the likelihood is non-log-concave. We wan't to say that there is a negative variance @@ -303,7 +326,7 @@ class Laplace(likelihood): B, L, W_12 = self._compute_B_statistics(K, W) W_f = np.dot(W, f) - grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None] + grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)[:, None] #Find K_i_f b = W_f + grad diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 646293d2..d75e7218 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -159,10 +159,10 @@ class student_t(likelihood_function): d2ln p(yi|fi)_d2fifj """ def __init__(self, deg_free, sigma=2): + #super(student_t, self).__init__() self.v = deg_free self.sigma = sigma self.log_concave = False - #super(student_t, self).__init__() self._set_params(np.asarray(sigma)) @@ -174,8 +174,6 @@ class student_t(likelihood_function): def _set_params(self, x): self.sigma = float(x) - 
#self.covariance_matrix = np.eye(self.N)*self._variance - #self.precision = 1./self._variance @property def variance(self, extra_data=None): @@ -185,6 +183,8 @@ class student_t(likelihood_function): """link_function $\ln p(y|f)$ $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ + For wolfram alpha import parts for derivative of sigma are -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2)) + :y: data :f: latent variables f :extra_data: extra_data which is not used in student t distribution @@ -198,17 +198,16 @@ class student_t(likelihood_function): e = y - f objective = (gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) - + np.log(self.sigma * np.sqrt(self.v * np.pi)) - - (self.v + 1) * 0.5 - * np.log(1 + ((e**2 / self.sigma**2) / self.v)) - ) + - np.log(self.sigma * np.sqrt(self.v * np.pi)) + - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v)) + ) return np.sum(objective) - def link_grad(self, y, f, extra_data=None): + def dlik_df(self, y, f, extra_data=None): """ Gradient of the link function at y, given f w.r.t f - $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ + $$\frac{dp(y_{i}|f_{i})}{df} = \frac{-(v+1)(f_{i}-y_{i})}{(f_{i}-y_{i})^{2} + \sigma^{2}v}$$ :y: data :f: latent variables f @@ -220,17 +219,17 @@ class student_t(likelihood_function): f = np.squeeze(f) assert y.shape == f.shape e = y - f - grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) + grad = -((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) return np.squeeze(grad) - def link_hess(self, y, f, extra_data=None): + def d2lik_d2f(self, y, f, extra_data=None): """ Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j i.e. 
second derivative link_function at y given f f_j w.r.t f and f_j Will return diagonal of hessian, since every where else it is 0 - $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ + $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((f_{i}-y_{i})^{2} - \sigma^{2}v)}{((f_{i}-y_{i})^{2} + \sigma^{2}v)^{2}}$$ :y: data :f: latent variables f @@ -245,54 +244,79 @@ class student_t(likelihood_function): hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) return np.squeeze(hess) - def d3link(self, y, f, extra_data=None): + def d3lik_d3f(self, y, f, extra_data=None): """ Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j - $$\frac{2(v+1)((y-f)^{3} - 3\sigma^{2}v(y-f))}{((y-f)^{2} + \sigma^{2}v)^{3}}$$ + $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((f_{i} - y_{i})^3 - 3(f_{i} - y_{i}) \sigma^{2} v))}{((f_{i} - y_{i}) + \sigma^{2} v)^3}$$ """ y = np.squeeze(y) f = np.squeeze(f) assert y.shape == f.shape e = y - f - d3link_d3f = ( (2*(self.v + 1)*(-1*e)*(e**2 - 3*(self.sigma**2)*self.v)) - / ((e**2 + (self.sigma**2)*self.v)**3) - ) - return np.squeeze(d3link_d3f) + d3lik_d3f = ( -(2*(self.v + 1)*(e**3 - e*3*self.v*(self.sigma**2))) / + ((e**2 + (self.sigma**2)*self.v)**3) + ) + return np.squeeze(d3lik_d3f) - def link_hess_grad_std(self, y, f, extra_data=None): + def link_dstd(self, y, f, extra_data=None): """ - Gradient of the hessian w.r.t sigma parameter (standard deviation) + Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation) - $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}} + Terms relavent to derivatives wrt sigma are: + -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2)) + + $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$ """ y = np.squeeze(y) f = np.squeeze(f) assert 
y.shape == f.shape e = y - f - hess_grad_sigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) - / ((e**2 + (self.sigma**2)*self.v)**3) - ) - return np.squeeze(hess_grad_sigma) + dlik_dsigma = ( (1/self.sigma) - + ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + (e**2) / ((self.sigma**2)*self.v) ) ) + ) + return np.squeeze(dlik_dsigma) - def link_grad_std(self, y, f, extra_data=None): + def dlik_df_dstd(self, y, f, extra_data=None): """ - Gradient of the likelihood w.r.t sigma parameter (standard deviation) + Gradient of the dlik_df w.r.t sigma parameter (standard deviation) - $$\frac{-2\sigma(v+1)(y-f)}{(v\sigma^{2} + (y-f)^{2})^{2}}$$ + $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{2\sigma v(v + 1)(f-y)}{(f-y)^2 + \sigma^2 v)^2}$$ """ y = np.squeeze(y) f = np.squeeze(f) assert y.shape == f.shape e = y - f - grad_sigma = ( (-2*self.sigma*self.v*(self.v + 1)*e) - / ((self.v*(self.sigma**2) + e**2)**2) - ) - return np.squeeze(grad_sigma) + dlik_grad_dsigma = ((2*self.sigma*self.v*(self.v + 1)*e) + / ((self.v*(self.sigma**2) + e**2)**2) + ) + return np.squeeze(dlik_grad_dsigma) + + def d2lik_d2f_dstd(self, y, f, extra_data=None): + """ + Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation) + + $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{(v + 1)((f-y)^2 - \sigma^2 v)}{((f-y)^2 + \sigma^2 v)}$$ + """ + y = np.squeeze(y) + f = np.squeeze(f) + assert y.shape == f.shape + e = y - f + dlik_hess_dsigma = ( ((v + 1)*(e**2 - (self.sigma**2)*self.v)) / + ((e**2 + (self.sigma**2)*self.v)**2) + ) + return np.squeeze(dlik_hess_dsigma) def _gradients(self, y, f, extra_data=None): - return [self.link_grad_std(y, f, extra_data=extra_data), - self.link_hess_grad_std(y, f, extra_data=extra_data)] # list as we might learn many parameters + derivs = ([self.link_dstd(y, f, extra_data=extra_data)], + [self.dlik_df_dstd(y, f, extra_data=extra_data)], + [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)] + ) # 
lists as we might learn many parameters + # ensure we have gradients for every parameter we want to optimize + assert len(derivs[0]) == len(self._get_param_names()) + assert len(derivs[1]) == len(self._get_param_names()) + assert len(derivs[2]) == len(self._get_param_names()) + return derivs def predictive_values(self, mu, var): """ @@ -412,7 +436,7 @@ class weibull_survival(likelihood_function): objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f) # FIXME: CHECK THIS WITH BOOK, wheres scale? return np.sum(objective) - def link_grad(self, y, f, extra_data=None): + def dlik_df(self, y, f, extra_data=None): """ Gradient of the link function at y, given f w.r.t f @@ -432,7 +456,7 @@ class weibull_survival(likelihood_function): grad = v - (y**self.shape)*np.exp(f) return np.squeeze(grad) - def link_hess(self, y, f, extra_data=None): + def d2lik_d2f(self, y, f, extra_data=None): """ Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j i.e. second derivative link_function at y given f f_j w.r.t f and f_j diff --git a/GPy/models/GP.py b/GPy/models/GP.py index da379eb1..0b5a8db6 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -147,7 +147,7 @@ class GP(model): if isinstance(self.likelihood, Laplace): dL_dthetaK_explicit = dL_dthetaK #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained - fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right... + fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right... dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK) From 20227fb2ac2c0d173eed515c7870864147a5d5d5 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 30 May 2013 16:17:37 +0100 Subject: [PATCH 45/71] Made more numerically stable in a hope that it will work and I will find a bug... 
--- GPy/examples/laplace_approximations.py | 10 +++--- GPy/likelihoods/Laplace.py | 45 ++++++++++++++++--------- GPy/likelihoods/likelihood_functions.py | 5 +-- GPy/models/GP.py | 7 ++-- 4 files changed, 39 insertions(+), 28 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 27f063dc..203d308d 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -37,9 +37,9 @@ def timing(): def debug_student_t_noise_approx(): plot = False - real_var = 0.1 + real_var = 0.4 #Start a function, any function - X = np.linspace(0.0, 10.0, 2)[:, None] + X = np.linspace(0.0, 10.0, 100)[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var X_full = np.linspace(0.0, 10.0, 500)[:, None] @@ -89,12 +89,12 @@ def debug_student_t_noise_approx(): stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, stu_t_likelihood, kernel6) #m.constrain_positive('rbf') - m.constrain_fixed('rbf_v', 1.0898) - m.constrain_fixed('rbf_l', 1.8651) + #m.constrain_fixed('rbf_v', 1.0898) + #m.constrain_fixed('rbf_l', 1.8651) m.constrain_positive('t_noi') #m.constrain_fixed('t_noise_variance', real_sd) m.update_likelihood_approximation() - m.optimize('scg', messages=True) + m.optimize(messages=True) print(m) return m #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index f8ba25f1..85af82f9 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -79,41 +79,54 @@ class Laplace(likelihood): return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) def _shared_gradients_components(self): + #FIXME: Careful of side effects! And make sure W and K are up to date! 
Ki, _, _, _ = pdinv(self.K) - Ki_W_i = inv(Ki + self.W) #Do it non numerically stable for now d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat) - dL_dfhat = -0.5*np.dot(np.diag(Ki_W_i), d3lik_d3fhat) - KW = np.dot(self.K, self.W) - I_KW_i = inv(np.eye(KW.shape[0]) + KW) - return dL_dfhat, Ki, I_KW_i + #dL_dfhat = -0.5*np.diag(self.Ki_W_i)*d3lik_d3fhat + dL_dfhat = -0.5*(np.diag(self.Ki_W_i)*d3lik_d3fhat)[:, None] + Wi_K_i = mdot(self.W_12, self.Bi, self.W_12) #same as rasms R + I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i) + return dL_dfhat, Ki, I_KW_i, Wi_K_i def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): """ Gradients with respect to prior kernel parameters """ - dL_dfhat, Ki, I_KW_i = self._shared_gradients_components() - K_Wi_i = inv(self.K + inv(self.W)) - dlp = self.likelihood_function.dlik_df(self.data, self.f_hat) + dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components() + dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)[:, None] dL_dthetaK = np.zeros(dK_dthetaK.shape) for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK): #Explicit - dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(np.dot(K_Wi_i, dK_dthetaK_i)) + dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(Wi_K_i*dK_dthetaK_i) #Implicit - df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK, dlp) + df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp) dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK) - return dL_dthetaK + return np.squeeze(dL_dthetaK) def _gradients(self, partial): """ Gradients with respect to likelihood parameters """ - dL_dfhat, Ki, I_KW_i = self._shared_gradients_components() + dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components() dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat) - dL_dthetaL = np.zeros(dlik_dthetaL.shape) - return dL_dthetaL #should be array of length 
*params-being optimized*, for student t just optimising 1 parameter, this is (1,) + num_params = len(dlik_dthetaL) + #Ki_W_i = np.diag(inv(Ki + self.W))[:, None] + dL_dthetaL = np.zeros((1, num_params)) # make space for one derivative for each likelihood parameter + for thetaL_i in range(num_params): + #Explicit + #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i]))) + #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None]) + # might be + + dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) + #Implicit + df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL) + + return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) def _compute_GP_variables(self): """ @@ -232,8 +245,8 @@ class Laplace(likelihood): self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W) self.Bi, _, _, B_det = pdinv(self.B) - Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K) - self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i) + self.Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K) + self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i) b = np.dot(self.W, self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)[:, None] solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b))) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index d75e7218..c6186137 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -302,12 +302,13 @@ class student_t(likelihood_function): f = np.squeeze(f) assert y.shape == f.shape 
e = y - f - dlik_hess_dsigma = ( ((v + 1)*(e**2 - (self.sigma**2)*self.v)) / + dlik_hess_dsigma = ( ((self.v + 1)*(e**2 - (self.sigma**2)*self.v)) / ((e**2 + (self.sigma**2)*self.v)**2) ) - return np.squeeze(dlik_hess_dsigma) + return dlik_hess_dsigma def _gradients(self, y, f, extra_data=None): + #must be listed in same order as 'get_param_names' derivs = ([self.link_dstd(y, f, extra_data=extra_data)], [self.dlik_df_dstd(y, f, extra_data=extra_data)], [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)] diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 0b5a8db6..9ce83a5a 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -69,7 +69,6 @@ class GP(model): self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas if isinstance(self.likelihood, Laplace): - print "Updating approx: ", p self.likelihood.fit_full(self.kern.K(self.X)) self.likelihood._set_params(self.likelihood._get_params()) @@ -134,7 +133,6 @@ class GP(model): matrix K* = K + diag(1./tau_tilde) plus a normalization term. """ l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z - print "Log likelihood: ", l return l def _log_likelihood_gradients(self): @@ -145,17 +143,16 @@ class GP(model): """ dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) if isinstance(self.likelihood, Laplace): - dL_dthetaK_explicit = dL_dthetaK #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right... 
dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) - print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) + #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) else: dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) - print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) + #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) return np.hstack((dL_dthetaK, dL_dthetaL)) #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) From f9857e08c0b4f130f2ae8ace5264e9ba65d9687c Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 31 May 2013 11:55:32 +0100 Subject: [PATCH 46/71] Broken it by getting rid of squeeze, but now working on making it faster using proper vector multiplication for diagonals --- GPy/examples/laplace_approximations.py | 12 +++-- GPy/likelihoods/Laplace.py | 45 ++++++---------- GPy/likelihoods/likelihood_functions.py | 69 +++++++++++++------------ GPy/models/GP.py | 13 ++++- 4 files changed, 69 insertions(+), 70 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 203d308d..5103eefb 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -37,9 +37,10 @@ def timing(): def debug_student_t_noise_approx(): plot = False - real_var = 0.4 + real_var = 0.1 #Start a function, any function X = np.linspace(0.0, 10.0, 100)[:, None] + #X = np.array([0.5])[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var X_full = np.linspace(0.0, 10.0, 500)[:, None] @@ -52,7 +53,7 @@ def debug_student_t_noise_approx(): real_sd = np.sqrt(real_var) print "Real noise: ", real_sd - initial_var_guess = 1 + initial_var_guess = 
0.02 #t_rv = t(deg_free, loc=0, scale=real_var) #noise = t_rvrvs(size=Y.shape) #Y += noise @@ -91,12 +92,14 @@ def debug_student_t_noise_approx(): #m.constrain_positive('rbf') #m.constrain_fixed('rbf_v', 1.0898) #m.constrain_fixed('rbf_l', 1.8651) - m.constrain_positive('t_noi') #m.constrain_fixed('t_noise_variance', real_sd) + m.constrain_positive('rbf') + m.constrain_fixed('t_noi', real_sd) + m.ensure_default_constraints() m.update_likelihood_approximation() m.optimize(messages=True) print(m) - return m + #return m #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback) if plot: plt.suptitle('Student-t likelihood') @@ -104,6 +107,7 @@ def debug_student_t_noise_approx(): m.plot() plt.plot(X_full, Y_full) plt.ylim(-2.5, 2.5) + return m #print "Clean student t, ncg" #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 85af82f9..027f014e 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -53,7 +53,7 @@ class Laplace(likelihood): def predictive_values(self, mu, var, full_cov): if full_cov: - raise NotImplementedError("Cannot make correlated predictions with an EP likelihood") + raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood") return self.likelihood_function.predictive_values(mu, var) def _get_params(self): @@ -63,42 +63,28 @@ class Laplace(likelihood): return self.likelihood_function._get_param_names() def _set_params(self, p): - #print "Setting laplace param with: ", p return self.likelihood_function._set_params(p) - def both_gradients(self, dL_d_K_Sigma, dK_dthetaK): - """ - Find the gradients of the marginal likelihood w.r.t both thetaK and thetaL - - dL_dthetaK differs from that of normal likelihoods as it has additional terms coming from - changes to y_tilde and changes to Sigma_tilde when the kernel parameters are adjusted - - Similar terms arise when finding the 
gradients with respect to changes in the liklihood - parameters - """ - return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma)) - def _shared_gradients_components(self): #FIXME: Careful of side effects! And make sure W and K are up to date! - Ki, _, _, _ = pdinv(self.K) d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat) - #dL_dfhat = -0.5*np.diag(self.Ki_W_i)*d3lik_d3fhat dL_dfhat = -0.5*(np.diag(self.Ki_W_i)*d3lik_d3fhat)[:, None] Wi_K_i = mdot(self.W_12, self.Bi, self.W_12) #same as rasms R I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i) - return dL_dfhat, Ki, I_KW_i, Wi_K_i + return dL_dfhat, I_KW_i, Wi_K_i - def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK): + def _Kgradients(self, dK_dthetaK): """ Gradients with respect to prior kernel parameters """ - dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components() + dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components() dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)[:, None] dL_dthetaK = np.zeros(dK_dthetaK.shape) for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK): #Explicit - dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(Wi_K_i*dK_dthetaK_i) + f_Ki_dK_dtheta_Ki_f = mdot(self.Ki_f.T, dK_dthetaK_i, self.Ki_f) + dL_dthetaK[thetaK_i] = 0.5*f_Ki_dK_dtheta_Ki_f - 0.5*np.trace(Wi_K_i*dK_dthetaK_i) #Implicit df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp) dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK) @@ -109,11 +95,12 @@ class Laplace(likelihood): """ Gradients with respect to likelihood parameters """ - dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components() + return np.zeros(1) + #return np.zeros(0) + dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components() dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat) num_params = len(dlik_dthetaL) - #Ki_W_i = np.diag(inv(Ki + self.W))[:, None] dL_dthetaL = 
np.zeros((1, num_params)) # make space for one derivative for each likelihood parameter for thetaL_i in range(num_params): #Explicit @@ -123,7 +110,6 @@ class Laplace(likelihood): dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) #Implicit df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL) return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) @@ -230,10 +216,8 @@ class Laplace(likelihood): self._compute_likelihood_variables() def _compute_likelihood_variables(self): - #At this point get the hessian matrix - #print "Data: ", self.data - #print "fhat: ", self.f_hat - self.W = -np.diag(self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)) + #At this point get the hessian matrix (or vector as W is diagonal) + self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data) if not self.likelihood_function.log_concave: self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur @@ -273,7 +257,8 @@ class Laplace(likelihood): """ #W is diagnoal so its sqrt is just the sqrt of the diagonal elements W_12 = np.sqrt(W) - B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12)) + assert np.all(W_12.T*K*W_12 == np.dot(np.diagflat(W_12), np.dot(K, np.diagflat(W_12)))) # FIXME Take this out when you've done multiinput + B = np.eye(K.shape[0]) + W_12.T*K*W_12 L = jitchol(B) return (B, L, W_12) @@ -330,7 +315,7 @@ class Laplace(likelihood): i = 0 while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: #f_old = f.copy() - W = -np.diag(self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)) + W = -self.likelihood_function.d2lik_d2f(self.data, f, 
extra_data=self.extra_data) if not self.likelihood_function.log_concave: W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur # If the likelihood is non-log-concave. We wan't to say that there is a negative variance @@ -339,7 +324,7 @@ class Laplace(likelihood): B, L, W_12 = self._compute_B_statistics(K, W) W_f = np.dot(W, f) - grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)[:, None] + grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data) #Find K_i_f b = W_f + grad diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index c6186137..c3aee835 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -191,8 +191,8 @@ class student_t(likelihood_function): :returns: float(likelihood evaluated for this point) """ - y = np.squeeze(y) - f = np.squeeze(f) + #y = np.squeeze(y) + #f = np.squeeze(f) assert y.shape == f.shape e = y - f @@ -207,7 +207,7 @@ class student_t(likelihood_function): """ Gradient of the link function at y, given f w.r.t f - $$\frac{dp(y_{i}|f_{i})}{df} = \frac{-(v+1)(f_{i}-y_{i})}{(f_{i}-y_{i})^{2} + \sigma^{2}v}$$ + $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$ :y: data :f: latent variables f @@ -215,51 +215,52 @@ class student_t(likelihood_function): :returns: gradient of likelihood evaluated at points """ - y = np.squeeze(y) - f = np.squeeze(f) + #y = np.squeeze(y) + #f = np.squeeze(f) assert y.shape == f.shape e = y - f - grad = -((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) - return np.squeeze(grad) + grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) + return grad def d2lik_d2f(self, y, f, extra_data=None): """ Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j i.e. 
second derivative link_function at y given f f_j w.r.t f and f_j - Will return diagonal of hessian, since every where else it is 0 + Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases + (the distribution for y_{i} depends only on f_{i} not on f_{j!=i} - $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((f_{i}-y_{i})^{2} - \sigma^{2}v)}{((f_{i}-y_{i})^{2} + \sigma^{2}v)^{2}}$$ + $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$ :y: data :f: latent variables f :extra_data: extra_data which is not used in student t distribution :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ - y = np.squeeze(y) - f = np.squeeze(f) + #y = np.squeeze(y) + #f = np.squeeze(f) assert y.shape == f.shape e = y - f hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) - return np.squeeze(hess) + return hess def d3lik_d3f(self, y, f, extra_data=None): """ Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j - $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((f_{i} - y_{i})^3 - 3(f_{i} - y_{i}) \sigma^{2} v))}{((f_{i} - y_{i}) + \sigma^{2} v)^3}$$ + $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$ """ - y = np.squeeze(y) - f = np.squeeze(f) + #y = np.squeeze(y) + #f = np.squeeze(f) assert y.shape == f.shape e = y - f - d3lik_d3f = ( -(2*(self.v + 1)*(e**3 - e*3*self.v*(self.sigma**2))) / + d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) / ((e**2 + (self.sigma**2)*self.v)**3) ) - return np.squeeze(d3lik_d3f) + return d3lik_d3f - def link_dstd(self, y, f, extra_data=None): + def lik_dstd(self, y, f, extra_data=None): """ Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation) @@ -268,48 
+269,48 @@ class student_t(likelihood_function): $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$ """ - y = np.squeeze(y) - f = np.squeeze(f) + #y = np.squeeze(y) + #f = np.squeeze(f) assert y.shape == f.shape e = y - f - dlik_dsigma = ( (1/self.sigma) - - ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + (e**2) / ((self.sigma**2)*self.v) ) ) + dlik_dsigma = ( - (1/self.sigma) + + ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) ) ) - return np.squeeze(dlik_dsigma) + return dlik_dsigma def dlik_df_dstd(self, y, f, extra_data=None): """ Gradient of the dlik_df w.r.t sigma parameter (standard deviation) - $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{2\sigma v(v + 1)(f-y)}{(f-y)^2 + \sigma^2 v)^2}$$ + $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$ """ - y = np.squeeze(y) - f = np.squeeze(f) + #y = np.squeeze(y) + #f = np.squeeze(f) assert y.shape == f.shape e = y - f - dlik_grad_dsigma = ((2*self.sigma*self.v*(self.v + 1)*e) + dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) / ((self.v*(self.sigma**2) + e**2)**2) ) - return np.squeeze(dlik_grad_dsigma) + return dlik_grad_dsigma def d2lik_d2f_dstd(self, y, f, extra_data=None): """ Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation) - $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{(v + 1)((f-y)^2 - \sigma^2 v)}{((f-y)^2 + \sigma^2 v)}$$ + $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ """ - y = np.squeeze(y) - f = np.squeeze(f) + #y = np.squeeze(y) + #f = np.squeeze(f) assert y.shape == f.shape e = y - f - dlik_hess_dsigma = ( ((self.v + 1)*(e**2 - (self.sigma**2)*self.v)) / - ((e**2 + (self.sigma**2)*self.v)**2) + dlik_hess_dsigma = ( 
(2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) / + ((e**2 + (self.sigma**2)*self.v)**3) ) return dlik_hess_dsigma def _gradients(self, y, f, extra_data=None): #must be listed in same order as 'get_param_names' - derivs = ([self.link_dstd(y, f, extra_data=extra_data)], + derivs = ([self.lik_dstd(y, f, extra_data=extra_data)], [self.dlik_df_dstd(y, f, extra_data=extra_data)], [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)] ) # lists as we might learn many parameters diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 9ce83a5a..0f3dcb58 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -142,13 +142,22 @@ class GP(model): Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta """ dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) + print "dL_dthetaK before: ",dL_dthetaK if isinstance(self.likelihood, Laplace): + #Reapproximate incase it hasnt been done... + if isinstance(self.likelihood, Laplace): + self.likelihood.fit_full(self.kern.K(self.X)) + self.likelihood._set_params(self.likelihood._get_params()) + #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right... + #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right... 
dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) + #THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params) - dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK) - dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) + dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK) + dL_dthetaL = 0 # self.likelihood._gradients(partial=np.diag(self.dL_dK)) + print "dL_dthetaK after: ",dL_dthetaK #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) else: dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) From e842f6e68735adaf95b31d0bc3c074dc39d553ea Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 31 May 2013 16:45:22 +0100 Subject: [PATCH 47/71] Made it use the fact that W is diagonal and put assertions in to ensure that the results are the same --- GPy/likelihoods/Laplace.py | 99 ++++++++++++++++++++++++++++---------- GPy/models/GP.py | 2 +- 2 files changed, 75 insertions(+), 26 deletions(-) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 027f014e..af74755f 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -68,8 +68,11 @@ class Laplace(likelihood): def _shared_gradients_components(self): #FIXME: Careful of side effects! And make sure W and K are up to date! 
d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat) - dL_dfhat = -0.5*(np.diag(self.Ki_W_i)*d3lik_d3fhat)[:, None] - Wi_K_i = mdot(self.W_12, self.Bi, self.W_12) #same as rasms R + dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat) + Wi_K_i = mdot(np.diagflat(self.W_12), self.Bi, np.diagflat(self.W_12)) #same as rasms R + Wi_K_inew = self.W_12*self.Bi*self.W_12.T #same as rasms R + assert np.all(Wi_K_i == Wi_K_inew) + I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i) return dL_dfhat, I_KW_i, Wi_K_i @@ -78,7 +81,7 @@ class Laplace(likelihood): Gradients with respect to prior kernel parameters """ dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components() - dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)[:, None] + dlp = self.likelihood_function.dlik_df(self.data, self.f_hat) dL_dthetaK = np.zeros(dK_dthetaK.shape) for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK): @@ -89,7 +92,7 @@ class Laplace(likelihood): df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp) dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK) - return np.squeeze(dL_dthetaK) + return dL_dthetaK def _gradients(self, partial): """ @@ -112,7 +115,7 @@ class Laplace(likelihood): df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL) - return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) + return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) def _compute_GP_variables(self): """ @@ -147,7 +150,9 @@ class Laplace(likelihood): #((L.T*w)_i + I)f_hat = y_tilde L = jitchol(self.K) Li = chol_inv(L) - Lt_W = np.dot(L.T, self.W) #FIXME: Can make Faster + Lt_W = np.dot(L.T, np.diagflat(self.W)) #FIXME: Can make Faster + Lt_Wnew = L.T*self.W.T + assert np.all(Lt_Wnew == Lt_W) ##Check it isn't singular! 
if cond(Lt_W) > epsilon: @@ -159,12 +164,27 @@ class Laplace(likelihood): #f.T(Ki + W)f f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat)) - + mdot(self.f_hat.T, self.W, self.f_hat) + + mdot(self.f_hat.T, np.diagflat(self.W), self.f_hat) ) + f_Ki_W_fnew = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat)) + + mdot(self.f_hat.T, self.W*self.f_hat) + ) + assert np.all(f_Ki_W_f == f_Ki_W_fnew) - y_W_f = mdot(Y_tilde.T, self.W, self.f_hat) - y_W_y = mdot(Y_tilde.T, self.W, Y_tilde) - ln_W_det = det_ln_diag(self.W) + y_W_f = mdot((Y_tilde.T, np.diagflat(self.W)), self.f_hat) + y_W_fnew = mdot(Y_tilde.T*self.W.T, self.f_hat) + assert np.all(y_W_f == y_W_fnew) + + + y_W_y = mdot((Y_tilde.T, np.diagflat(self.W)), Y_tilde) + y_W_ynew = mdot(Y_tilde.T, self.W*Y_tilde) + assert np.all(y_W_y == y_W_ynew) + + ln_W_det = det_ln_diag(np.diagflat(self.W)) + ln_W_detnew = np.log(self.W).sum() + assert np.all(ln_W_det == ln_W_detnew) + + #FIXME: Revisit this Z_tilde = (- self.NORMAL_CONST + 0.5*self.ln_K_det + 0.5*ln_W_det @@ -189,14 +209,16 @@ class Laplace(likelihood): if cond(self.W) > epsilon: print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem" - self.Sigma_tilde = inv(self.W) # Damn + self.Sigma_tilde = inv(np.diagflat(self.W)) # Damn + Sigma_tildenew = np.diagflat(1.0/self.W) + assert np.all(self.Sigma_tilde == Sigma_tildenew) #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) self.Y = Y_tilde self.YYT = np.dot(self.Y, self.Y.T) self.covariance_matrix = self.Sigma_tilde - self.precision = 1 / np.diag(self.covariance_matrix)[:, None] + self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None] def fit_full(self, K): """ @@ -229,12 +251,24 @@ class Laplace(likelihood): self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W) self.Bi, _, _, B_det = pdinv(self.B) - self.Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K) + self.Ki_W_i = 
self.K - mdot(self.K, (np.diagflat(self.W_12), self.Bi, np.diagflat(self.W_12)), self.K) # Funky, order matters on stability! + Ki_W_inew = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K) + assert np.all(self.Ki_W_i == Ki_W_inew) + self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i) - b = np.dot(self.W, self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)[:, None] - solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b))) - a = b - mdot(self.W_12, solve_chol) + b = np.dot(np.diagflat(self.W), self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data) + bnew = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data) + assert np.all(b == bnew) + + solve_chol = cho_solve((self.B_chol, True), mdot((np.diagflat(self.W_12), self.K), b)) + solve_cholnew = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b)) + assert np.all(solve_chol == solve_cholnew) + + a = b - mdot(np.diagflat(self.W_12), solve_chol) + anew = b - self.W_12*solve_chol + assert np.all(a == anew) + self.Ki_f = a self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f) self.ln_K_det = pddet(self.K) @@ -255,10 +289,13 @@ class Laplace(likelihood): :W: Negative hessian at a point (diagonal matrix) :returns: (B, L) """ - #W is diagnoal so its sqrt is just the sqrt of the diagonal elements + #W is diagonal so its sqrt is just the sqrt of the diagonal elements W_12 = np.sqrt(W) - assert np.all(W_12.T*K*W_12 == np.dot(np.diagflat(W_12), np.dot(K, np.diagflat(W_12)))) # FIXME Take this out when you've done multiinput - B = np.eye(K.shape[0]) + W_12.T*K*W_12 + # FIXME Take this out when you've done multiinput, Weirdly this is + # better when its W_12.T*K*W_12 which shouldnt make a difference + # because K is symmetrical + assert np.allclose(W_12*K*W_12.T, np.dot(np.diagflat(W_12), np.dot(K, np.diagflat(W_12)))) + B = np.eye(self.N) + W_12*K*W_12.T L = jitchol(B) 
return (B, L, W_12) @@ -323,19 +360,31 @@ class Laplace(likelihood): # This is a property only held by non-log-concave likelihoods B, L, W_12 = self._compute_B_statistics(K, W) - W_f = np.dot(W, f) + W_f = np.dot(np.diagflat(W), f) + W_fnew = W*f + assert np.all(W_f == W_fnew) grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data) #Find K_i_f b = W_f + grad #a should be equal to Ki*f now so should be able to use it c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad) - solve_L = cho_solve((L, True), np.dot(W_12, c)) - f = c - np.dot(K, np.dot(W_12, solve_L)) - solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b))) - a = b - np.dot(W_12, solve_L) - #f = np.dot(K, a) + solve_L = cho_solve((L, True), np.dot(np.diagflat(W_12), c)) + solve_Lnew = cho_solve((L, True), W_12*c) + assert np.all(solve_L == solve_Lnew) + + f = c - np.dot(K, np.dot(np.diagflat(W_12), solve_L)) + fnew = c - np.dot(K, W_12*solve_L) + assert np.all(f == fnew) + + solve_L = cho_solve((L, True), np.dot(np.diagflat(W_12), np.dot(K, b))) + solve_Lnew = cho_solve((L, True), W_12*np.dot(K, b)) + assert np.all(solve_L == solve_Lnew) + + a = b - np.dot(np.diagflat(W_12), solve_L) + anew = b - W_12*solve_L + assert np.all(a == anew) tmp_old_obj = old_obj old_obj = new_obj diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 0f3dcb58..787429de 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -156,7 +156,7 @@ class GP(model): #THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params) dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK) - dL_dthetaL = 0 # self.likelihood._gradients(partial=np.diag(self.dL_dK)) + dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) print "dL_dthetaK after: ",dL_dthetaK #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) else: From 6c2975079517364f00b2345f0ef9b3d2f5a14103 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 31 May 2013 16:59:54 +0100 
Subject: [PATCH 48/71] Took out all the asserts and using pure broadcasting method of diagonal now --- GPy/examples/laplace_approximations.py | 4 +- GPy/likelihoods/Laplace.py | 70 ++++++-------------------- GPy/models/GP.py | 3 +- 3 files changed, 20 insertions(+), 57 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 5103eefb..14ff44a0 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -39,8 +39,8 @@ def debug_student_t_noise_approx(): plot = False real_var = 0.1 #Start a function, any function - X = np.linspace(0.0, 10.0, 100)[:, None] - #X = np.array([0.5])[:, None] + #X = np.linspace(0.0, 10.0, 100)[:, None] + X = np.array([0.5])[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var X_full = np.linspace(0.0, 10.0, 500)[:, None] diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index af74755f..74d37d48 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -69,9 +69,7 @@ class Laplace(likelihood): #FIXME: Careful of side effects! And make sure W and K are up to date! d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat) dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat) - Wi_K_i = mdot(np.diagflat(self.W_12), self.Bi, np.diagflat(self.W_12)) #same as rasms R - Wi_K_inew = self.W_12*self.Bi*self.W_12.T #same as rasms R - assert np.all(Wi_K_i == Wi_K_inew) + Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i) return dL_dfhat, I_KW_i, Wi_K_i @@ -150,9 +148,7 @@ class Laplace(likelihood): #((L.T*w)_i + I)f_hat = y_tilde L = jitchol(self.K) Li = chol_inv(L) - Lt_W = np.dot(L.T, np.diagflat(self.W)) #FIXME: Can make Faster - Lt_Wnew = L.T*self.W.T - assert np.all(Lt_Wnew == Lt_W) + Lt_W = L.T*self.W.T ##Check it isn't singular! 
if cond(Lt_W) > epsilon: @@ -164,25 +160,15 @@ class Laplace(likelihood): #f.T(Ki + W)f f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat)) - + mdot(self.f_hat.T, np.diagflat(self.W), self.f_hat) - ) - f_Ki_W_fnew = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat)) + mdot(self.f_hat.T, self.W*self.f_hat) ) - assert np.all(f_Ki_W_f == f_Ki_W_fnew) - y_W_f = mdot((Y_tilde.T, np.diagflat(self.W)), self.f_hat) - y_W_fnew = mdot(Y_tilde.T*self.W.T, self.f_hat) - assert np.all(y_W_f == y_W_fnew) + y_W_f = mdot(Y_tilde.T*self.W.T, self.f_hat) - y_W_y = mdot((Y_tilde.T, np.diagflat(self.W)), Y_tilde) - y_W_ynew = mdot(Y_tilde.T, self.W*Y_tilde) - assert np.all(y_W_y == y_W_ynew) + y_W_y = mdot(Y_tilde.T, self.W*Y_tilde) - ln_W_det = det_ln_diag(np.diagflat(self.W)) - ln_W_detnew = np.log(self.W).sum() - assert np.all(ln_W_det == ln_W_detnew) + ln_W_det = np.log(self.W).sum() #FIXME: Revisit this Z_tilde = (- self.NORMAL_CONST @@ -203,15 +189,13 @@ class Laplace(likelihood): #+ y_W_f #+ self.ln_z_hat #) - self.Z_tilde = 0 + #self.Z_tilde = 0 ##Check it isn't singular! if cond(self.W) > epsilon: print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem" - self.Sigma_tilde = inv(np.diagflat(self.W)) # Damn - Sigma_tildenew = np.diagflat(1.0/self.W) - assert np.all(self.Sigma_tilde == Sigma_tildenew) + self.Sigma_tilde = np.diagflat(1.0/self.W) #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) @@ -251,23 +235,15 @@ class Laplace(likelihood): self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W) self.Bi, _, _, B_det = pdinv(self.B) - self.Ki_W_i = self.K - mdot(self.K, (np.diagflat(self.W_12), self.Bi, np.diagflat(self.W_12)), self.K) # Funky, order matters on stability! 
- Ki_W_inew = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K) - assert np.all(self.Ki_W_i == Ki_W_inew) + self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K) self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i) - b = np.dot(np.diagflat(self.W), self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data) - bnew = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data) - assert np.all(b == bnew) + b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data) - solve_chol = cho_solve((self.B_chol, True), mdot((np.diagflat(self.W_12), self.K), b)) - solve_cholnew = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b)) - assert np.all(solve_chol == solve_cholnew) + solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b)) - a = b - mdot(np.diagflat(self.W_12), solve_chol) - anew = b - self.W_12*solve_chol - assert np.all(a == anew) + a = b - self.W_12*solve_chol self.Ki_f = a self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f) @@ -291,10 +267,6 @@ class Laplace(likelihood): """ #W is diagonal so its sqrt is just the sqrt of the diagonal elements W_12 = np.sqrt(W) - # FIXME Take this out when you've done multiinput, Weirdly this is - # better when its W_12.T*K*W_12 which shouldnt make a difference - # because K is symmetrical - assert np.allclose(W_12*K*W_12.T, np.dot(np.diagflat(W_12), np.dot(K, np.diagflat(W_12)))) B = np.eye(self.N) + W_12*K*W_12.T L = jitchol(B) return (B, L, W_12) @@ -360,9 +332,7 @@ class Laplace(likelihood): # This is a property only held by non-log-concave likelihoods B, L, W_12 = self._compute_B_statistics(K, W) - W_f = np.dot(np.diagflat(W), f) - W_fnew = W*f - assert np.all(W_f == W_fnew) + W_f = W*f grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data) #Find K_i_f b = W_f + grad @@ -370,21 +340,13 @@ class Laplace(likelihood): #a should be 
equal to Ki*f now so should be able to use it c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad) - solve_L = cho_solve((L, True), np.dot(np.diagflat(W_12), c)) - solve_Lnew = cho_solve((L, True), W_12*c) - assert np.all(solve_L == solve_Lnew) + solve_L = cho_solve((L, True), W_12*c) - f = c - np.dot(K, np.dot(np.diagflat(W_12), solve_L)) - fnew = c - np.dot(K, W_12*solve_L) - assert np.all(f == fnew) + f = c - np.dot(K, W_12*solve_L) - solve_L = cho_solve((L, True), np.dot(np.diagflat(W_12), np.dot(K, b))) - solve_Lnew = cho_solve((L, True), W_12*np.dot(K, b)) - assert np.all(solve_L == solve_Lnew) + solve_L = cho_solve((L, True), W_12*np.dot(K, b)) - a = b - np.dot(np.diagflat(W_12), solve_L) - anew = b - W_12*solve_L - assert np.all(a == anew) + a = b - W_12*solve_L tmp_old_obj = old_obj old_obj = new_obj diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 787429de..0ba20d7b 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -152,8 +152,9 @@ class GP(model): #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right... #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right... + + #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params) dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) - #THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params) dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) From f3b8dfb2225c8a25a0b753ec0e2f63b28cdec827 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 3 Jun 2013 14:51:09 +0100 Subject: [PATCH 49/71] about to input new derivations for Z's... 
--- GPy/examples/laplace_approximations.py | 15 +++++++++++--- GPy/likelihoods/Laplace.py | 28 ++++++++++++++++---------- GPy/models/GP.py | 17 ++++++++-------- 3 files changed, 37 insertions(+), 23 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 14ff44a0..ee71a950 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -143,11 +143,12 @@ def student_t_approx(): Yc[10] += 100 Yc[25] += 10 Yc[23] += 10 + Yc[26] += 1000 Yc[24] += 10 #Yc = Yc/Yc.max() #Add student t random noise to datapoints - deg_free = 1000000000000 + deg_free = 10 real_sd = np.sqrt(real_var) print "Real noise: ", real_sd @@ -187,21 +188,25 @@ def student_t_approx(): plt.subplot(211) m.plot() plt.plot(X_full, Y_full) + plt.title('Gaussian clean') print m #Corrupt print "Corrupt Gaussian" m = GPy.models.GP_regression(X, Yc, kernel=kernel2) m.ensure_default_constraints() - m.optimize() + #m.optimize() plt.subplot(212) m.plot() plt.plot(X_full, Y_full) + plt.title('Gaussian corrupt') print m + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + plt.figure(2) plt.suptitle('Student-t likelihood') - edited_real_sd = initial_var_guess #real_sd + edited_real_sd = real_sd #initial_var_guess print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) @@ -215,6 +220,7 @@ def student_t_approx(): m.plot() plt.plot(X_full, Y_full) plt.ylim(-2.5, 2.5) + plt.title('Student-t rasm clean') print "Corrupt student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) @@ -228,6 +234,7 @@ def student_t_approx(): m.plot() plt.plot(X_full, Y_full) plt.ylim(-2.5, 2.5) + plt.title('Student-t rasm corrupt') print "Clean student t, ncg" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) @@ -241,6 +248,7 @@ def student_t_approx(): m.plot() plt.plot(X_full, 
Y_full) plt.ylim(-2.5, 2.5) + plt.title('Student-t ncg clean') print "Corrupt student t, ncg" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) @@ -254,6 +262,7 @@ def student_t_approx(): m.plot() plt.plot(X_full, Y_full) plt.ylim(-2.5, 2.5) + plt.title('Student-t ncg corrupt') ###with a student t distribution, since it has heavy tails it should work well diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 74d37d48..45fddeaa 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -6,7 +6,10 @@ from numpy.linalg import cond from likelihood import likelihood from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet from scipy.linalg.lapack import dtrtrs +import random #import pylab as plt +np.random.seed(50) +random.seed(50) class Laplace(likelihood): @@ -156,6 +159,7 @@ class Laplace(likelihood): Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0] self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N) + Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat) #f.T(Ki + W)f @@ -239,15 +243,15 @@ class Laplace(likelihood): self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i) + #Do the computation again at f to get Ki_f which is useful b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data) - solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b)) - a = b - self.W_12*solve_chol - self.Ki_f = a + self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f) self.ln_K_det = pddet(self.K) + #_, _, _, self.ln_K_det = pdinv(self.K) self.ln_z_hat = (- 0.5*self.f_Ki_f - 0.5*self.ln_K_det @@ -296,7 +300,7 @@ class Laplace(likelihood): res = -1 * (--np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki) return np.squeeze(res) - f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) + f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False) return f_hat[:, None] def 
rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50): @@ -336,17 +340,19 @@ class Laplace(likelihood): grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data) #Find K_i_f b = W_f + grad + b = step_size*b - #a should be equal to Ki*f now so should be able to use it - c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad) - - solve_L = cho_solve((L, True), W_12*c) - - f = c - np.dot(K, W_12*solve_L) + #Need this to find the f we have a stepsize which we need to move in, rather than a full unit movement + #c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad) + #solve_L = cho_solve((L, True), W_12*c) + #f = c - np.dot(K, W_12*solve_L) + #FIXME: Can't we get rid of this? Don't we want to evaluate obj(c,f) and this is our new_obj? + #Why did I choose to evaluate the objective function at the new f with the old hessian? I'm sure there was a good reason, + #Document it! solve_L = cho_solve((L, True), W_12*np.dot(K, b)) - a = b - W_12*solve_L + f = np.dot(K, a) tmp_old_obj = old_obj old_obj = new_obj diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 0ba20d7b..e4ed52ef 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -142,23 +142,22 @@ class GP(model): Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta """ dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) - print "dL_dthetaK before: ",dL_dthetaK if isinstance(self.likelihood, Laplace): #Reapproximate incase it hasnt been done... - if isinstance(self.likelihood, Laplace): - self.likelihood.fit_full(self.kern.K(self.X)) - self.likelihood._set_params(self.likelihood._get_params()) + self.likelihood.fit_full(self.kern.K(self.X)) + self.likelihood._set_params(self.likelihood._get_params()) + print self.kern._get_params() #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained - fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right... 
+ #fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right... #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right... #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params) - dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) + #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) - dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK) - dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) - print "dL_dthetaK after: ",dL_dthetaK + #dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK) + dL_dthetaL = 0 #self.likelihood._gradients(partial=np.diag(self.dL_dK)) + #print "dL_dthetaK after: ",dL_dthetaK #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) else: dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) From ac461e1b2aa65afa08359e1ac6d6cb8956e962b4 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 18 Jun 2013 17:55:58 +0100 Subject: [PATCH 50/71] Checkgrads with explicit and implicit components half the time --- GPy/examples/laplace_approximations.py | 69 +++++++-------- GPy/likelihoods/Laplace.py | 114 +++++++++++-------------- GPy/models/GP.py | 7 +- GPy/util/linalg.py | 2 +- 4 files changed, 91 insertions(+), 101 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index ee71a950..5120dfb5 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -39,11 +39,11 @@ def debug_student_t_noise_approx(): plot = False real_var = 0.1 #Start a function, any function - #X = np.linspace(0.0, 10.0, 100)[:, None] - X = np.array([0.5])[:, None] + X = np.linspace(0.0, 10.0, 15)[:, None] + #X = np.array([0.5])[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var - X_full = np.linspace(0.0, 10.0, 500)[:, None] + X_full = np.linspace(0.0, 10.0, 15)[:, None] Y_full = np.sin(X_full) Y = Y/Y.max() @@ -83,7 +83,8 @@ def 
debug_student_t_noise_approx(): #plt.plot(X_full, Y_full) #print m - edited_real_sd = initial_var_guess #real_sd + #edited_real_sd = initial_var_guess #real_sd + edited_real_sd = real_sd print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) @@ -94,7 +95,7 @@ def debug_student_t_noise_approx(): #m.constrain_fixed('rbf_l', 1.8651) #m.constrain_fixed('t_noise_variance', real_sd) m.constrain_positive('rbf') - m.constrain_fixed('t_noi', real_sd) + #m.constrain_fixed('t_noi', real_sd) m.ensure_default_constraints() m.update_likelihood_approximation() m.optimize(messages=True) @@ -148,7 +149,7 @@ def student_t_approx(): #Yc = Yc/Yc.max() #Add student t random noise to datapoints - deg_free = 10 + deg_free = 8 real_sd = np.sqrt(real_var) print "Real noise: ", real_sd @@ -202,8 +203,6 @@ def student_t_approx(): plt.title('Gaussian corrupt') print m - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT - plt.figure(2) plt.suptitle('Student-t likelihood') edited_real_sd = real_sd #initial_var_guess @@ -236,33 +235,35 @@ def student_t_approx(): plt.ylim(-2.5, 2.5) plt.title('Student-t rasm corrupt') - print "Clean student t, ncg" - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False) - m = GPy.models.GP(X, stu_t_likelihood, kernel3) - m.ensure_default_constraints() - m.update_likelihood_approximation() - m.optimize() - print(m) - plt.subplot(221) - m.plot() - plt.plot(X_full, Y_full) - plt.ylim(-2.5, 2.5) - plt.title('Student-t ncg clean') + return m - print "Corrupt student t, ncg" - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) - corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=False) - m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) - m.ensure_default_constraints() - 
m.update_likelihood_approximation() - m.optimize() - print(m) - plt.subplot(223) - m.plot() - plt.plot(X_full, Y_full) - plt.ylim(-2.5, 2.5) - plt.title('Student-t ncg corrupt') + #print "Clean student t, ncg" + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False) + #m = GPy.models.GP(X, stu_t_likelihood, kernel3) + #m.ensure_default_constraints() + #m.update_likelihood_approximation() + #m.optimize() + #print(m) + #plt.subplot(221) + #m.plot() + #plt.plot(X_full, Y_full) + #plt.ylim(-2.5, 2.5) + #plt.title('Student-t ncg clean') + + #print "Corrupt student t, ncg" + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=False) + #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) + #m.ensure_default_constraints() + #m.update_likelihood_approximation() + #m.optimize() + #print(m) + #plt.subplot(223) + #m.plot() + #plt.plot(X_full, Y_full) + #plt.ylim(-2.5, 2.5) + #plt.title('Student-t ncg corrupt') ###with a student t distribution, since it has heavy tails it should work well diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 45fddeaa..a8347345 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -8,9 +8,6 @@ from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet from scipy.linalg.lapack import dtrtrs import random #import pylab as plt -np.random.seed(50) -random.seed(50) - class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -45,7 +42,7 @@ class Laplace(likelihood): self.is_heteroscedastic = True self.Nparams = 0 - self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi)) + self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi)) #Initial values for the GP variables self.Y = np.zeros((self.N, 1)) @@ -72,26 +69,36 @@ class 
Laplace(likelihood): #FIXME: Careful of side effects! And make sure W and K are up to date! d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat) dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat) + Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i) return dL_dfhat, I_KW_i, Wi_K_i - def _Kgradients(self, dK_dthetaK): + def _Kgradients(self, dK_dthetaK, X): """ Gradients with respect to prior kernel parameters """ dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components() dlp = self.likelihood_function.dlik_df(self.data, self.f_hat) - dL_dthetaK = np.zeros(dK_dthetaK.shape) - for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK): - #Explicit - f_Ki_dK_dtheta_Ki_f = mdot(self.Ki_f.T, dK_dthetaK_i, self.Ki_f) - dL_dthetaK[thetaK_i] = 0.5*f_Ki_dK_dtheta_Ki_f - 0.5*np.trace(Wi_K_i*dK_dthetaK_i) - #Implicit - df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp) - dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK) + #Implicit + impl = mdot(dlp, dL_dfhat.T, I_KW_i) + expl_a = - mdot(self.Ki_f, self.Ki_f.T) + expl_b = Wi_K_i + expl = 0.5*expl_a - 0.5*expl_b + dL_dthetaK_exp = dK_dthetaK(expl, X) + dL_dthetaK_imp = dK_dthetaK(impl, X) + dL_dthetaK = -(dL_dthetaK_imp + dL_dthetaK_exp) + + #dL_dthetaK = np.zeros(dK_dthetaK.shape) + #for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK): + ##Explicit + #f_Ki_dK_dtheta_Ki_f = mdot(self.Ki_f.T, dK_dthetaK_i, self.Ki_f) + #dL_dthetaK[thetaK_i] = 0.5*f_Ki_dK_dtheta_Ki_f - 0.5*np.trace(Wi_K_i*dK_dthetaK_i) + ##Implicit + #df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp) + #dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK) return dL_dthetaK @@ -99,13 +106,12 @@ class Laplace(likelihood): """ Gradients with respect to likelihood parameters """ - return np.zeros(1) - #return np.zeros(0) + #return np.zeros(1) dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components() dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = 
self.likelihood_function._gradients(self.data, self.f_hat) num_params = len(dlik_dthetaL) - dL_dthetaL = np.zeros((1, num_params)) # make space for one derivative for each likelihood parameter + dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter for thetaL_i in range(num_params): #Explicit #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i]))) @@ -143,8 +149,6 @@ class Laplace(likelihood): $$\tilde{\Sigma} = W^{-1}$$ """ - epsilon = 1e14 - #Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I #dtritri -> L -> L_i #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i @@ -153,54 +157,38 @@ class Laplace(likelihood): Li = chol_inv(L) Lt_W = L.T*self.W.T - ##Check it isn't singular! - if cond(Lt_W) > epsilon: - print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem" - Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0] self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N) Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat) - #f.T(Ki + W)f - f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat)) - + mdot(self.f_hat.T, self.W*self.f_hat) - ) + ln_W_det = det_ln_diag(self.W) + yf_W_yf = mdot((Y_tilde - self.f_hat).T, np.diagflat(self.W), (Y_tilde - self.f_hat)) - y_W_f = mdot(Y_tilde.T*self.W.T, self.f_hat) - - - y_W_y = mdot(Y_tilde.T, self.W*Y_tilde) - - ln_W_det = np.log(self.W).sum() - - #FIXME: Revisit this - Z_tilde = (- self.NORMAL_CONST - + 0.5*self.ln_K_det - + 0.5*ln_W_det - + 0.5*self.ln_Ki_W_i_det - + 0.5*f_Ki_W_f - + 0.5*y_W_y - - y_W_f - + self.ln_z_hat - ) - #Z_tilde = (self.NORMAL_CONST - #- 0.5*self.ln_K_det - #- 0.5*ln_W_det - #- 0.5*self.ln_Ki_W_i_det - #- 0.5*f_Ki_W_f - #- 0.5*y_W_y - #+ y_W_f + #Z_tilde = (+ self.NORMAL_CONST #+ self.ln_z_hat + #+ 0.5*self.ln_I_KW_det + #- 0.5*ln_W_det + #+ 0.5*self.f_Ki_f + #+ 0.5*yf_W_yf #) - #self.Z_tilde = 0 - - ##Check it isn't singular! 
- if cond(self.W) > epsilon: - print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem" self.Sigma_tilde = np.diagflat(1.0/self.W) + Ki, _, _, K_det = pdinv(self.K) + ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K) + W = np.diagflat(self.W) + Wi = self.Sigma_tilde + W12i = np.sqrt(Wi) + D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W)) + fDf = mdot(self.f_hat.T, D, self.f_hat) + l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) + Z_tilde = (+ self.NORMAL_CONST + + l + + 0.5*ln_det_K_Wi__Bi + - 0.5*fDf + ) + #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) self.Y = Y_tilde @@ -239,10 +227,6 @@ class Laplace(likelihood): self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W) self.Bi, _, _, B_det = pdinv(self.B) - self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K) - - self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i) - #Do the computation again at f to get Ki_f which is useful b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data) solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b)) @@ -250,12 +234,14 @@ class Laplace(likelihood): self.Ki_f = a self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f) - self.ln_K_det = pddet(self.K) - #_, _, _, self.ln_K_det = pdinv(self.K) + self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K) + #For det, |I + KW| == |I + W_12*K*W_12| + self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T) + + #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W)) self.ln_z_hat = (- 0.5*self.f_Ki_f - - 0.5*self.ln_K_det - + 0.5*self.ln_Ki_W_i_det + - self.ln_I_KW_det + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) ) @@ -289,7 +275,7 @@ class Laplace(likelihood): #ONLY WORKS FOR 1D DATA def obj(f): res = -1 * 
(self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f)) - + self.NORMAL_CONST) + - self.NORMAL_CONST) return float(res) def obj_grad(f): diff --git a/GPy/models/GP.py b/GPy/models/GP.py index e4ed52ef..d56ee86f 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -141,6 +141,8 @@ class GP(model): Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta """ + self.likelihood.fit_full(self.kern.K(self.X)) + self.likelihood._set_params(self.likelihood._get_params()) dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) if isinstance(self.likelihood, Laplace): #Reapproximate incase it hasnt been done... @@ -155,8 +157,9 @@ class GP(model): #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params) #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) - #dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK) - dL_dthetaL = 0 #self.likelihood._gradients(partial=np.diag(self.dL_dK)) + dK_dthetaK = self.kern.dK_dtheta + dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X) + dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) #print "dL_dthetaK after: ",dL_dthetaK #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) else: diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py index 08e6fd99..f19acf1a 100644 --- a/GPy/util/linalg.py +++ b/GPy/util/linalg.py @@ -34,7 +34,7 @@ def det_ln_diag(A): def pddet(A): """ - Determinant of a positive definite matrix + Determinant of a positive definite matrix, only symmetric matricies though """ L = jitchol(A) logdetA = 2*sum(np.log(np.diag(L))) From de689fa8e91928b7fc2d02f56d4eca14d82eaafd Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 19 Jun 2013 12:00:00 +0100 Subject: [PATCH 51/71] Now gradchecks everytime but student_t fit is bad, noise is underestimated by a long way --- GPy/examples/laplace_approximations.py | 18 +++++++++-------- 
GPy/likelihoods/Laplace.py | 27 ++++++++++++++++--------- GPy/likelihoods/likelihood_functions.py | 16 +-------------- GPy/models/GP.py | 12 ----------- 4 files changed, 29 insertions(+), 44 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 5120dfb5..84527d08 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -39,28 +39,28 @@ def debug_student_t_noise_approx(): plot = False real_var = 0.1 #Start a function, any function - X = np.linspace(0.0, 10.0, 15)[:, None] + X = np.linspace(0.0, 10.0, 50)[:, None] #X = np.array([0.5])[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var - X_full = np.linspace(0.0, 10.0, 15)[:, None] + X_full = np.linspace(0.0, 10.0, 50)[:, None] Y_full = np.sin(X_full) Y = Y/Y.max() #Add student t random noise to datapoints - deg_free = 10000 + deg_free = 1000 real_sd = np.sqrt(real_var) - print "Real noise: ", real_sd + print "Real noise std: ", real_sd - initial_var_guess = 0.02 + initial_var_guess = 0.3 #t_rv = t(deg_free, loc=0, scale=real_var) #noise = t_rvrvs(size=Y.shape) #Y += noise plt.close('all') # Kernel object - kernel1 = GPy.kern.rbf(X.shape[1]) + kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) kernel2 = kernel1.copy() kernel3 = kernel1.copy() kernel4 = kernel1.copy() @@ -83,22 +83,24 @@ def debug_student_t_noise_approx(): #plt.plot(X_full, Y_full) #print m - #edited_real_sd = initial_var_guess #real_sd + edited_real_sd = initial_var_guess #real_sd edited_real_sd = real_sd print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, stu_t_likelihood, kernel6) + m['white'] = 1e-3 #m.constrain_positive('rbf') #m.constrain_fixed('rbf_v', 1.0898) #m.constrain_fixed('rbf_l', 1.8651) #m.constrain_fixed('t_noise_variance', real_sd) 
m.constrain_positive('rbf') + m.constrain_positive('t_noise') #m.constrain_fixed('t_noi', real_sd) m.ensure_default_constraints() m.update_likelihood_approximation() - m.optimize(messages=True) + #m.optimize(messages=True) print(m) #return m #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index a8347345..5b1a814a 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -84,12 +84,13 @@ class Laplace(likelihood): #Implicit impl = mdot(dlp, dL_dfhat.T, I_KW_i) - expl_a = - mdot(self.Ki_f, self.Ki_f.T) + expl_a = mdot(self.Ki_f, self.Ki_f.T) expl_b = Wi_K_i - expl = 0.5*expl_a - 0.5*expl_b + expl = 0.5*expl_a + 0.5*expl_b dL_dthetaK_exp = dK_dthetaK(expl, X) dL_dthetaK_imp = dK_dthetaK(impl, X) - dL_dthetaK = -(dL_dthetaK_imp + dL_dthetaK_exp) + #print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) + dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp #dL_dthetaK = np.zeros(dK_dthetaK.shape) #for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK): @@ -117,10 +118,12 @@ class Laplace(likelihood): #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i]))) #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None]) # might be + - dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) + dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) #Implicit df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) - dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL) + dL_dthetaL_imp = np.dot(dL_dfhat.T, df_hat_dthetaL) + #print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp) + dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp return 
dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) @@ -180,14 +183,20 @@ class Laplace(likelihood): W = np.diagflat(self.W) Wi = self.Sigma_tilde W12i = np.sqrt(Wi) - D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W)) - fDf = mdot(self.f_hat.T, D, self.f_hat) + #D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W)) + #fDf = mdot(self.f_hat.T, D, self.f_hat) l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) + #print "fDf:{} l:{} detKWiBi:{} W:{} Wi:{} Bi:{} Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum()) + + y_Wi_Ki_i_y = mdot(Y_tilde.T, pdinv(self.K + Wi)[0], Y_tilde) Z_tilde = (+ self.NORMAL_CONST + l + 0.5*ln_det_K_Wi__Bi - - 0.5*fDf + #- 0.5*fDf + - 0.5*self.f_Ki_f + + 0.5*y_Wi_Ki_i_y ) + #print "Ztilde: {}".format(Z_tilde) #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) @@ -316,7 +325,7 @@ class Laplace(likelihood): #f_old = f.copy() W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + W[W < 0] = 1e-5 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur # If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance # To cause the posterior to become less certain than the prior and likelihood, # This is a property only held by non-log-concave likelihoods diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index c3aee835..041b59bd 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -170,7 +170,7 @@ class student_t(likelihood_function): return np.asarray(self.sigma) def _get_param_names(self): - return ["t_noise_variance"] + return ["t_noise_std"] def _set_params(self, x): self.sigma = float(x) @@ -191,8 +191,6 @@ class student_t(likelihood_function): :returns: float(likelihood evaluated for this point) """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f @@ -215,8 +213,6 @@ class student_t(likelihood_function): :returns: gradient of likelihood evaluated at points """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) @@ -237,8 +233,6 @@ class student_t(likelihood_function): :extra_data: extra_data which is not used in student t distribution :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f @@ -251,8 +245,6 @@ class student_t(likelihood_function): $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$ """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) / @@ -269,8 +261,6 @@ class student_t(likelihood_function): $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$ """ - #y = np.squeeze(y) - #f = 
np.squeeze(f) assert y.shape == f.shape e = y - f dlik_dsigma = ( - (1/self.sigma) + @@ -284,8 +274,6 @@ class student_t(likelihood_function): $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$ """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) @@ -299,8 +287,6 @@ class student_t(likelihood_function): $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ """ - #y = np.squeeze(y) - #f = np.squeeze(f) assert y.shape == f.shape e = y - f dlik_hess_dsigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) / diff --git a/GPy/models/GP.py b/GPy/models/GP.py index d56ee86f..636ebba0 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -145,18 +145,6 @@ class GP(model): self.likelihood._set_params(self.likelihood._get_params()) dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) if isinstance(self.likelihood, Laplace): - #Reapproximate incase it hasnt been done... - self.likelihood.fit_full(self.kern.K(self.X)) - self.likelihood._set_params(self.likelihood._get_params()) - print self.kern._get_params() - - #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained - #fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right... - #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right... 
- - #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params) - #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X) - dK_dthetaK = self.kern.dK_dtheta dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) From e900509a7c146a80a866d29a4efaedfb10f1291a Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 19 Jun 2013 16:13:11 +0100 Subject: [PATCH 52/71] Fixed a sign wrong, now gradchecks weirdly only above certain points --- GPy/examples/laplace_approximations.py | 61 ++++++++++++++++++++++--- GPy/likelihoods/Laplace.py | 47 +++---------------- GPy/likelihoods/likelihood_functions.py | 7 ++- 3 files changed, 64 insertions(+), 51 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 84527d08..887e35ae 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -35,6 +35,54 @@ def timing(): print the_is print np.mean(the_is) +def v_fail_test(): + plt.close('all') + real_var = 0.1 + X = np.linspace(0.0, 10.0, 50)[:, None] + Y = np.sin(X) + np.random.randn(*X.shape)*real_var + Y = Y/Y.max() + + #Add student t random noise to datapoints + deg_free = 10 + real_sd = np.sqrt(real_var) + print "Real noise std: ", real_sd + + kernel1 = GPy.kern.white(X.shape[1]) #+ GPy.kern.white(X.shape[1]) + + edited_real_sd = 0.3#real_sd + edited_real_sd = real_sd + + print "Clean student t, rasm" + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) + m = GPy.models.GP(X, stu_t_likelihood, kernel1) + m.constrain_fixed('white', 1) + vs = 15 + noises = 40 + checkgrads = np.zeros((vs, noises)) + vs_noises = np.zeros((vs, noises)) + for v_ind, v in enumerate(np.linspace(1, 20, vs)): + m.likelihood.likelihood_function.v = v + print v + for noise_ind, noise in 
enumerate(np.linspace(0.0000001, 1, noises)): + m['t_noise'] = noise + m.update_likelihood_approximation() + checkgrads[v_ind, noise_ind] = m.checkgrad() + vs_noises[v_ind, noise_ind] = (float(v)/(float(v) - 2))*(noise**2) + + plt.figure(1) + plt.title('Checkgrads') + plt.imshow(checkgrads, interpolation='nearest') + plt.xlabel('noise') + plt.ylabel('v') + + plt.figure(2) + plt.title('variance change') + plt.imshow(vs_noises, interpolation='nearest') + plt.xlabel('noise') + plt.ylabel('v') + print(m) + def debug_student_t_noise_approx(): plot = False real_var = 0.1 @@ -49,7 +97,7 @@ def debug_student_t_noise_approx(): Y = Y/Y.max() #Add student t random noise to datapoints - deg_free = 1000 + deg_free = 10 real_sd = np.sqrt(real_var) print "Real noise std: ", real_sd @@ -60,7 +108,7 @@ def debug_student_t_noise_approx(): plt.close('all') # Kernel object - kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1]) kernel2 = kernel1.copy() kernel3 = kernel1.copy() kernel4 = kernel1.copy() @@ -90,12 +138,11 @@ def debug_student_t_noise_approx(): t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, stu_t_likelihood, kernel6) - m['white'] = 1e-3 - #m.constrain_positive('rbf') - #m.constrain_fixed('rbf_v', 1.0898) - #m.constrain_fixed('rbf_l', 1.8651) + #m['white'] = 1e-3 + m.constrain_fixed('rbf_v', 1.0898) + m.constrain_fixed('rbf_l', 1.8651) #m.constrain_fixed('t_noise_variance', real_sd) - m.constrain_positive('rbf') + #m.constrain_positive('rbf') m.constrain_positive('t_noise') #m.constrain_fixed('t_noi', real_sd) m.ensure_default_constraints() diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 5b1a814a..70ec568a 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -70,54 +70,38 @@ class Laplace(likelihood): 
d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat) dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat) - Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R - - I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i) - return dL_dfhat, I_KW_i, Wi_K_i + I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i) + return dL_dfhat, I_KW_i def _Kgradients(self, dK_dthetaK, X): """ Gradients with respect to prior kernel parameters """ - dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components() + dL_dfhat, I_KW_i = self._shared_gradients_components() dlp = self.likelihood_function.dlik_df(self.data, self.f_hat) #Implicit impl = mdot(dlp, dL_dfhat.T, I_KW_i) expl_a = mdot(self.Ki_f, self.Ki_f.T) - expl_b = Wi_K_i + expl_b = self.Wi_K_i expl = 0.5*expl_a + 0.5*expl_b dL_dthetaK_exp = dK_dthetaK(expl, X) dL_dthetaK_imp = dK_dthetaK(impl, X) #print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp - - #dL_dthetaK = np.zeros(dK_dthetaK.shape) - #for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK): - ##Explicit - #f_Ki_dK_dtheta_Ki_f = mdot(self.Ki_f.T, dK_dthetaK_i, self.Ki_f) - #dL_dthetaK[thetaK_i] = 0.5*f_Ki_dK_dtheta_Ki_f - 0.5*np.trace(Wi_K_i*dK_dthetaK_i) - ##Implicit - #df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp) - #dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK) - return dL_dthetaK def _gradients(self, partial): """ Gradients with respect to likelihood parameters """ - #return np.zeros(1) - dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components() + dL_dfhat, I_KW_i = self._shared_gradients_components() dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat) num_params = len(dlik_dthetaL) dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter for thetaL_i in range(num_params): #Explicit - #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 
0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i]))) - #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None]) - # might be + dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) #Implicit df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) @@ -165,34 +149,17 @@ class Laplace(likelihood): Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat) - ln_W_det = det_ln_diag(self.W) - yf_W_yf = mdot((Y_tilde - self.f_hat).T, np.diagflat(self.W), (Y_tilde - self.f_hat)) - - #Z_tilde = (+ self.NORMAL_CONST - #+ self.ln_z_hat - #+ 0.5*self.ln_I_KW_det - #- 0.5*ln_W_det - #+ 0.5*self.f_Ki_f - #+ 0.5*yf_W_yf - #) - self.Sigma_tilde = np.diagflat(1.0/self.W) - Ki, _, _, K_det = pdinv(self.K) + self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K) - W = np.diagflat(self.W) - Wi = self.Sigma_tilde - W12i = np.sqrt(Wi) - #D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W)) - #fDf = mdot(self.f_hat.T, D, self.f_hat) l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) #print "fDf:{} l:{} detKWiBi:{} W:{} Wi:{} Bi:{} Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum()) - y_Wi_Ki_i_y = mdot(Y_tilde.T, pdinv(self.K + Wi)[0], Y_tilde) + y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde) Z_tilde = (+ self.NORMAL_CONST + l + 0.5*ln_det_K_Wi__Bi - #- 0.5*fDf - 0.5*self.f_Ki_f + 0.5*y_Wi_Ki_i_y ) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 041b59bd..d6dbf55f 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -194,10 +194,10 @@ class student_t(likelihood_function): assert y.shape == f.shape e = y - f - objective = (gammaln((self.v + 1) * 0.5) + objective = (+ gammaln((self.v + 1) * 0.5) - 
gammaln(self.v * 0.5) - np.log(self.sigma * np.sqrt(self.v * np.pi)) - - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v)) + - (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v)) ) return np.sum(objective) @@ -234,7 +234,6 @@ class student_t(likelihood_function): :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ assert y.shape == f.shape - e = y - f hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) return hess @@ -247,7 +246,7 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) / + d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) / ((e**2 + (self.sigma**2)*self.v)**3) ) return d3lik_d3f From d4bfd99c21c835e5cf7873e20295561c031d5221 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 20 Jun 2013 14:30:25 +0100 Subject: [PATCH 53/71] Starting to fiddle with mode finding code --- GPy/examples/laplace_approximations.py | 18 ++++++++++-------- GPy/likelihoods/Laplace.py | 12 ++++++------ GPy/likelihoods/likelihood_functions.py | 1 - 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 887e35ae..d300806f 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -36,7 +36,7 @@ def timing(): print np.mean(the_is) def v_fail_test(): - plt.close('all') + #plt.close('all') real_var = 0.1 X = np.linspace(0.0, 10.0, 50)[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var @@ -57,6 +57,7 @@ def v_fail_test(): stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, stu_t_likelihood, kernel1) m.constrain_fixed('white', 1) + m.constrain_positive('t_noise') vs = 15 noises = 40 checkgrads = np.zeros((vs, noises)) @@ -64,23 +65,24 @@ def 
v_fail_test(): for v_ind, v in enumerate(np.linspace(1, 20, vs)): m.likelihood.likelihood_function.v = v print v - for noise_ind, noise in enumerate(np.linspace(0.0000001, 1, noises)): + for noise_ind, noise in enumerate(np.linspace(0.0001, 1, noises)): m['t_noise'] = noise m.update_likelihood_approximation() checkgrads[v_ind, noise_ind] = m.checkgrad() vs_noises[v_ind, noise_ind] = (float(v)/(float(v) - 2))*(noise**2) - plt.figure(1) + plt.figure() plt.title('Checkgrads') plt.imshow(checkgrads, interpolation='nearest') plt.xlabel('noise') plt.ylabel('v') - plt.figure(2) + plt.figure() plt.title('variance change') plt.imshow(vs_noises, interpolation='nearest') plt.xlabel('noise') plt.ylabel('v') + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print(m) def debug_student_t_noise_approx(): @@ -139,13 +141,13 @@ def debug_student_t_noise_approx(): stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, stu_t_likelihood, kernel6) #m['white'] = 1e-3 - m.constrain_fixed('rbf_v', 1.0898) - m.constrain_fixed('rbf_l', 1.8651) + #m.constrain_fixed('rbf_v', 1.0898) + #m.constrain_fixed('rbf_l', 1.8651) #m.constrain_fixed('t_noise_variance', real_sd) #m.constrain_positive('rbf') - m.constrain_positive('t_noise') + #m.constrain_positive('t_noise') + m.constrain_positive('') #m.constrain_fixed('t_noi', real_sd) - m.ensure_default_constraints() m.update_likelihood_approximation() #m.optimize(messages=True) print(m) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 70ec568a..ed3229a9 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -68,8 +68,7 @@ class Laplace(likelihood): def _shared_gradients_components(self): #FIXME: Careful of side effects! And make sure W and K are up to date! 
d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat) - dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat) - + dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i) return dL_dfhat, I_KW_i @@ -81,10 +80,10 @@ class Laplace(likelihood): dlp = self.likelihood_function.dlik_df(self.data, self.f_hat) #Implicit - impl = mdot(dlp, dL_dfhat.T, I_KW_i) + impl = mdot(dlp, dL_dfhat, I_KW_i) expl_a = mdot(self.Ki_f, self.Ki_f.T) expl_b = self.Wi_K_i - expl = 0.5*expl_a + 0.5*expl_b + expl = 0.5*expl_a - 0.5*expl_b # Might need to be -? dL_dthetaK_exp = dK_dthetaK(expl, X) dL_dthetaK_imp = dK_dthetaK(impl, X) #print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) @@ -103,10 +102,11 @@ class Laplace(likelihood): for thetaL_i in range(num_params): #Explicit dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) + #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(mdot(self.Bi, self.K, dlik_hess_dthetaL[thetaL_i])) #Implicit df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) - dL_dthetaL_imp = np.dot(dL_dfhat.T, df_hat_dthetaL) - #print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp) + dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL) + print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp) dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index d6dbf55f..4d298122 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -192,7 +192,6 @@ class student_t(likelihood_function): """ assert y.shape == f.shape - e = y - f objective = (+ 
gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) From e80fad197ca3250bca4e9d7830a23dadf8ae62e9 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 24 Jun 2013 15:39:38 +0100 Subject: [PATCH 54/71] trying to fix optimisation problem, fixed a few bugs but still fails at very low noise --- GPy/examples/laplace_approximations.py | 4 +- GPy/likelihoods/Laplace.py | 79 +++++++++++++++----------- 2 files changed, 49 insertions(+), 34 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index d300806f..7b9f10b1 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -90,7 +90,7 @@ def debug_student_t_noise_approx(): real_var = 0.1 #Start a function, any function X = np.linspace(0.0, 10.0, 50)[:, None] - #X = np.array([0.5])[:, None] + #X = np.array([0.5, 1])[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var X_full = np.linspace(0.0, 10.0, 50)[:, None] @@ -99,7 +99,7 @@ def debug_student_t_noise_approx(): Y = Y/Y.max() #Add student t random noise to datapoints - deg_free = 10 + deg_free = 100000 real_sd = np.sqrt(real_var) print "Real noise std: ", real_sd diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index ed3229a9..b5362839 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -51,6 +51,8 @@ class Laplace(likelihood): self.Z = 0 self.YYT = None + self.old_a = None + def predictive_values(self, mu, var, full_cov): if full_cov: raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood") @@ -83,7 +85,7 @@ class Laplace(likelihood): impl = mdot(dlp, dL_dfhat, I_KW_i) expl_a = mdot(self.Ki_f, self.Ki_f.T) expl_b = self.Wi_K_i - expl = 0.5*expl_a - 0.5*expl_b # Might need to be -? + expl = 0.5*expl_a + 0.5*expl_b # Might need to be -? 
dL_dthetaK_exp = dK_dthetaK(expl, X) dL_dthetaK_imp = dK_dthetaK(impl, X) #print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) @@ -265,7 +267,7 @@ class Laplace(likelihood): f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False) return f_hat[:, None] - def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50): + def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=40): """ Rasmussens numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -275,7 +277,12 @@ class Laplace(likelihood): :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation :returns: f_mode """ - f = np.zeros((self.N, 1)) + if self.old_a is None: + old_a = np.zeros((self.N, 1)) + else: + old_a = self.old_a + + f = np.dot(self.K, old_a) new_obj = -np.inf old_obj = np.inf @@ -292,7 +299,7 @@ class Laplace(likelihood): #f_old = f.copy() W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - W[W < 0] = 1e-5 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + W[W < 0] = 1e-8 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur # If the likelihood is non-log-concave. We wan't to say that there is a negative variance # To cause the posterior to become less certain than the prior and likelihood, # This is a property only held by non-log-concave likelihoods @@ -300,38 +307,46 @@ class Laplace(likelihood): W_f = W*f grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data) - #Find K_i_f + b = W_f + grad - b = step_size*b - - #Need this to find the f we have a stepsize which we need to move in, rather than a full unit movement - #c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad) - #solve_L = cho_solve((L, True), W_12*c) - #f = c - np.dot(K, W_12*solve_L) - - #FIXME: Can't we get rid of this? 
Don't we want to evaluate obj(c,f) and this is our new_obj? - #Why did I choose to evaluate the objective function at the new f with the old hessian? I'm sure there was a good reason, - #Document it! solve_L = cho_solve((L, True), W_12*np.dot(K, b)) - a = b - W_12*solve_L - f = np.dot(K, a) + #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet + full_step_a = b - W_12*solve_L + da = full_step_a - old_a - tmp_old_obj = old_obj - old_obj = new_obj - new_obj = obj(a, f) - difference = new_obj - old_obj - if difference < 0: - #print "Objective function rose", difference - #If the objective function isn't rising, restart optimization - step_size *= 0.9 - #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) - #objective function isn't increasing, try reducing step size - #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode - old_obj = tmp_old_obj - rs += 1 + update_passed = False + while not update_passed: + a = old_a + step_size*da + f = np.dot(K, a) - difference = abs(difference) + old_obj = new_obj + new_obj = np.float(obj(a, f)) + difference = new_obj - old_obj + #print "difference: ",difference + if difference < 0: + #print grad + print "Objective function rose", np.float(difference) + #If the objective function isn't rising, restart optimization + step_size *= 0.8 + print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) + #objective function isn't increasing, try reducing step size + #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode + #old_obj = tmp_old_obj + old_obj = new_obj + rs += 1 + else: + update_passed = True + + #print "Iter difference: ", difference + #print "F: ", f + #print "A: ", a + old_a = a + #print "Positive difference obj: ", np.float(difference) + difference = np.float(abs(difference)) i += 1 - self.i = i + #print "Positive difference obj: ", np.float(difference) 
+ print "Iterations: ",i + print "Step size reductions", rs + print "Final difference: ", difference return f From 064efd5535818b3ca6ec93baa83fc72ade12eb42 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 25 Jun 2013 18:20:00 +0100 Subject: [PATCH 55/71] Added another optimisation which doesn't use gradients. Seems like F is almost always found, but Y can be off, suggesting that Wi__Ki_W is wrong, maybe W? --- GPy/examples/laplace_approximations.py | 47 +++++++++--------- GPy/likelihoods/Laplace.py | 69 ++++++++++++++++---------- 2 files changed, 67 insertions(+), 49 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 7b9f10b1..61291e71 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -25,7 +25,7 @@ def timing(): kernel1 = GPy.kern.rbf(X.shape[1]) t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) - corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=True) + corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1) m.ensure_default_constraints() m.update_likelihood_approximation() @@ -54,18 +54,17 @@ def v_fail_test(): print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernel1) - m.constrain_fixed('white', 1) - m.constrain_positive('t_noise') - vs = 15 + m.constrain_positive('') + vs = 25 noises = 40 checkgrads = np.zeros((vs, noises)) vs_noises = np.zeros((vs, noises)) - for v_ind, v in enumerate(np.linspace(1, 20, vs)): + for v_ind, v in enumerate(np.linspace(1, 100, vs)): m.likelihood.likelihood_function.v = v print 
v - for noise_ind, noise in enumerate(np.linspace(0.0001, 1, noises)): + for noise_ind, noise in enumerate(np.linspace(0.0001, 10, noises)): m['t_noise'] = noise m.update_likelihood_approximation() checkgrads[v_ind, noise_ind] = m.checkgrad() @@ -77,11 +76,11 @@ def v_fail_test(): plt.xlabel('noise') plt.ylabel('v') - plt.figure() - plt.title('variance change') - plt.imshow(vs_noises, interpolation='nearest') - plt.xlabel('noise') - plt.ylabel('v') + #plt.figure() + #plt.title('variance change') + #plt.imshow(vs_noises, interpolation='nearest') + #plt.xlabel('noise') + #plt.ylabel('v') import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print(m) @@ -93,13 +92,14 @@ def debug_student_t_noise_approx(): #X = np.array([0.5, 1])[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var - X_full = np.linspace(0.0, 10.0, 50)[:, None] + X_full = X Y_full = np.sin(X_full) Y = Y/Y.max() #Add student t random noise to datapoints - deg_free = 100000 + deg_free = 10 + real_sd = np.sqrt(real_var) print "Real noise std: ", real_sd @@ -110,7 +110,7 @@ def debug_student_t_noise_approx(): plt.close('all') # Kernel object - kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1]) + kernel1 = GPy.kern.rbf(X.shape[1])# + GPy.kern.white(X.shape[1]) kernel2 = kernel1.copy() kernel3 = kernel1.copy() kernel4 = kernel1.copy() @@ -134,13 +134,13 @@ def debug_student_t_noise_approx(): #print m edited_real_sd = initial_var_guess #real_sd - edited_real_sd = real_sd + #edited_real_sd = real_sd print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernel6) - #m['white'] = 1e-3 + m['rbf_len'] = 1.5 #m.constrain_fixed('rbf_v', 1.0898) #m.constrain_fixed('rbf_l', 1.8651) #m.constrain_fixed('t_noise_variance', real_sd) @@ 
-159,11 +159,12 @@ def debug_student_t_noise_approx(): m.plot() plt.plot(X_full, Y_full) plt.ylim(-2.5, 2.5) + print "Real noise std: ", real_sd return m #print "Clean student t, ncg" #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) - #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False) + #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg') #m = GPy.models.GP(X, stu_t_likelihood, kernel3) #m.ensure_default_constraints() #m.update_likelihood_approximation() @@ -260,7 +261,7 @@ def student_t_approx(): print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernel6) m.ensure_default_constraints() m.update_likelihood_approximation() @@ -274,7 +275,7 @@ def student_t_approx(): print "Corrupt student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) - corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=True) + corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) m.ensure_default_constraints() m.update_likelihood_approximation() @@ -290,7 +291,7 @@ def student_t_approx(): #print "Clean student t, ncg" #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) - #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False) + #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg') #m = GPy.models.GP(X, stu_t_likelihood, kernel3) #m.ensure_default_constraints() #m.update_likelihood_approximation() @@ -304,7 +305,7 @@ def student_t_approx(): #print "Corrupt student t, ncg" 
#t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) - #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=False) + #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg') #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) #m.ensure_default_constraints() #m.update_likelihood_approximation() diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index b5362839..b9d74846 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -12,7 +12,7 @@ import random class Laplace(likelihood): """Laplace approximation to a posterior""" - def __init__(self, data, likelihood_function, extra_data=None, rasm=True): + def __init__(self, data, likelihood_function, extra_data=None, opt='rasm'): """ Laplace Approximation @@ -29,13 +29,13 @@ class Laplace(likelihood): :data: array of data the likelihood function is approximating :likelihood_function: likelihood function - subclass of likelihood_function :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data - :rasm: Flag of whether to use rasmussens numerically stable mode finding or simple ncg optimisation + :opt: Optimiser to use, rasm numerically stable, ncg or nelder-mead (latter only work with 1d data) """ self.data = data self.likelihood_function = likelihood_function self.extra_data = extra_data - self.rasm = rasm + self.opt = opt #Inital values self.N, self.D = self.data.shape @@ -85,11 +85,12 @@ class Laplace(likelihood): impl = mdot(dlp, dL_dfhat, I_KW_i) expl_a = mdot(self.Ki_f, self.Ki_f.T) expl_b = self.Wi_K_i + #print "expl_a: {}, expl_b: {}".format(expl_a, expl_b) expl = 0.5*expl_a + 0.5*expl_b # Might need to be -? 
dL_dthetaK_exp = dK_dthetaK(expl, X) dL_dthetaK_imp = dK_dthetaK(impl, X) - #print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) - dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp + print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) + dL_dthetaK = dL_dthetaK_exp +dL_dthetaK_imp return dL_dthetaK def _gradients(self, partial): @@ -109,7 +110,7 @@ class Laplace(likelihood): df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL) print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp) - dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp + dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) @@ -165,7 +166,7 @@ class Laplace(likelihood): - 0.5*self.f_Ki_f + 0.5*y_Wi_Ki_i_y ) - #print "Ztilde: {}".format(Z_tilde) + print "Ztilde: {}".format(Z_tilde) #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) @@ -183,10 +184,11 @@ class Laplace(likelihood): self.K = K.copy() #Find mode - if self.rasm: - self.f_hat = self.rasm_mode(K) - else: - self.f_hat = self.ncg_mode(K) + self.f_hat = { + 'rasm': self.rasm_mode, + 'ncg': self.ncg_mode, + 'nelder': self.nelder_mode + }[self.opt](self.K) #Compute hessian and other variables at mode self._compute_likelihood_variables() @@ -196,20 +198,20 @@ class Laplace(likelihood): self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + self.W[self.W < 0] = 1e-5 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur #If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though - self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W) - self.Bi, _, _, B_det = pdinv(self.B) + #self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W) + #self.Bi, _, _, B_det = pdinv(self.B) #Do the computation again at f to get Ki_f which is useful - b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data) - solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b)) - a = b - self.W_12*solve_chol - self.Ki_f = a + #b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data) + #solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b)) + #a = b - self.W_12*solve_chol + self.Ki_f = self.a self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f) self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K) @@ -239,6 +241,17 @@ class Laplace(likelihood): L = jitchol(B) return (B, L, W_12) + def nelder_mode(self, K): + f = np.zeros((self.N, 1)) + self.Ki, _, _, self.ln_K_det = pdinv(K) + def obj(f): + res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f))) + return float(res) + + res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True}) + f_new = res.x + return f_new[:, None] + def ncg_mode(self, K): """ Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative) @@ -261,13 +274,13 @@ class Laplace(likelihood): return np.squeeze(res) def obj_hess(f): - res = -1 * (--np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, 
extra_data=self.extra_data)) - self.Ki) + res = -1 * (np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki) return np.squeeze(res) f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False) return f_hat[:, None] - def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=40): + def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=10): """ Rasmussens numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -287,11 +300,10 @@ class Laplace(likelihood): old_obj = np.inf def obj(a, f): - #Careful of shape of data! return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data) difference = np.inf - epsilon = 1e-6 + epsilon = 1e-9 step_size = 1 rs = 0 i = 0 @@ -299,7 +311,7 @@ class Laplace(likelihood): #f_old = f.copy() W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - W[W < 0] = 1e-8 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur # If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance # To cause the posterior to become less certain than the prior and likelihood, # This is a property only held by non-log-concave likelihoods @@ -314,6 +326,7 @@ class Laplace(likelihood): full_step_a = b - W_12*solve_L da = full_step_a - old_a + f_old = f update_passed = False while not update_passed: a = old_a + step_size*da @@ -323,11 +336,11 @@ class Laplace(likelihood): new_obj = np.float(obj(a, f)) difference = new_obj - old_obj #print "difference: ",difference - if difference < 0: + if difference < -epsilon: #print grad print "Objective function rose", np.float(difference) #If the objective function isn't rising, restart optimization - step_size *= 0.8 + step_size *= 0.4 print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) #objective function isn't increasing, try reducing step size #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode @@ -337,16 +350,20 @@ class Laplace(likelihood): else: update_passed = True + difference = np.abs(np.sum(f - f_old)) + abs(difference) #print "Iter difference: ", difference #print "F: ", f #print "A: ", a old_a = a #print "Positive difference obj: ", np.float(difference) - difference = np.float(abs(difference)) + #difference = np.float(abs(difference)) i += 1 #print "Positive difference obj: ", np.float(difference) print "Iterations: ",i print "Step size reductions", rs print "Final difference: ", difference + self.a = a + self.B, self.B_chol, self.W_12 = B, L, W_12 + self.Bi, _, _, B_det = pdinv(self.B) return f From 617d73ca3271f080ed2e58efd9cbd9a49e301ac0 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 26 Jun 2013 15:44:26 +0100 Subject: [PATCH 56/71] Now checkgrads a lot more of the time, but still fails in optimisation, seems also odd that when parameter is fixed kernel parameters go to infinity --- GPy/examples/laplace_approximations.py | 17 +++++++++++------ GPy/likelihoods/Laplace.py | 23 
++++++++--------------- GPy/models/GP.py | 7 +++++-- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 61291e71..0fd3efeb 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -98,7 +98,7 @@ def debug_student_t_noise_approx(): Y = Y/Y.max() #Add student t random noise to datapoints - deg_free = 10 + deg_free = 100 real_sd = np.sqrt(real_var) print "Real noise std: ", real_sd @@ -133,20 +133,23 @@ def debug_student_t_noise_approx(): #plt.plot(X_full, Y_full) #print m - edited_real_sd = initial_var_guess #real_sd + real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free))) + edited_real_sd = real_stu_t_std#initial_var_guess #real_sd #edited_real_sd = real_sd print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') + m = GPy.models.GP(X, stu_t_likelihood, kernel6) - m['rbf_len'] = 1.5 + #m['rbf_len'] = 1.5 #m.constrain_fixed('rbf_v', 1.0898) #m.constrain_fixed('rbf_l', 1.8651) - #m.constrain_fixed('t_noise_variance', real_sd) + m.constrain_fixed('t_noise_std', edited_real_sd) #m.constrain_positive('rbf') - #m.constrain_positive('t_noise') - m.constrain_positive('') + #m.constrain_positive('t_noise_std') + #m.constrain_positive('') + m.ensure_default_constraints() #m.constrain_fixed('t_noi', real_sd) m.update_likelihood_approximation() #m.optimize(messages=True) @@ -264,6 +267,7 @@ def student_t_approx(): stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernel6) m.ensure_default_constraints() + m.constrain_positive('t_noise') m.update_likelihood_approximation() m.optimize() print(m) @@ -278,6 +282,7 @@ def student_t_approx(): corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), 
t_distribution, opt='rasm') m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) m.ensure_default_constraints() + m.constrain_positive('t_noise') m.update_likelihood_approximation() m.optimize() print(m) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index b9d74846..1431a7c6 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -109,7 +109,7 @@ class Laplace(likelihood): #Implicit df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL) - print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp) + #print "dL_dthetaL_exp: {} dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp) dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,) @@ -147,10 +147,11 @@ class Laplace(likelihood): Li = chol_inv(L) Lt_W = L.T*self.W.T - Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0] + Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0] self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N) Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat) + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT self.Sigma_tilde = np.diagflat(1.0/self.W) @@ -166,7 +167,7 @@ class Laplace(likelihood): - 0.5*self.f_Ki_f + 0.5*y_Wi_Ki_i_y ) - print "Ztilde: {}".format(Z_tilde) + #print "Ztilde: {}".format(Z_tilde) #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) @@ -280,7 +281,7 @@ class Laplace(likelihood): f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False) return f_hat[:, None] - def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=10): + def rasm_mode(self, K, MAX_ITER=250, MAX_RESTART=10): """ Rasmussens numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -308,7 +309,6 @@ class Laplace(likelihood): rs = 0 i = 0 while difference > epsilon and i < MAX_ITER and rs < 
MAX_RESTART: - #f_old = f.copy() W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) if not self.likelihood_function.log_concave: W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur @@ -338,10 +338,10 @@ class Laplace(likelihood): #print "difference: ",difference if difference < -epsilon: #print grad - print "Objective function rose", np.float(difference) + #print "Objective function rose", np.float(difference) #If the objective function isn't rising, restart optimization step_size *= 0.4 - print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) + #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) #objective function isn't increasing, try reducing step size #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode #old_obj = tmp_old_obj @@ -351,18 +351,11 @@ class Laplace(likelihood): update_passed = True difference = np.abs(np.sum(f - f_old)) + abs(difference) - #print "Iter difference: ", difference - #print "F: ", f - #print "A: ", a old_a = a - #print "Positive difference obj: ", np.float(difference) - #difference = np.float(abs(difference)) i += 1 #print "Positive difference obj: ", np.float(difference) - print "Iterations: ",i - print "Step size reductions", rs - print "Final difference: ", difference + print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size) self.a = a self.B, self.B_chol, self.W_12 = B, L, W_12 self.Bi, _, _, B_det = pdinv(self.B) diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 636ebba0..7b6fab27 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -141,10 +141,11 @@ class GP(model): Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta """ - self.likelihood.fit_full(self.kern.K(self.X)) - self.likelihood._set_params(self.likelihood._get_params()) dL_dthetaK = 
self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) + print "dL_dthetaK should be: ", dL_dthetaK if isinstance(self.likelihood, Laplace): + self.likelihood.fit_full(self.kern.K(self.X)) + self.likelihood._set_params(self.likelihood._get_params()) dK_dthetaK = self.kern.dK_dtheta dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) @@ -153,6 +154,8 @@ class GP(model): else: dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) + print "dL_dthetaK is: ", dL_dthetaK + return np.hstack((dL_dthetaK, dL_dthetaL)) #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) From c90b1f0c99b84bf7e981113e5bfd83396b825ed1 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 27 Jun 2013 15:04:57 +0100 Subject: [PATCH 57/71] Added minimizer for finding f, doesn't help --- GPy/examples/laplace_approximations.py | 8 +-- GPy/likelihoods/Laplace.py | 80 ++++++++++++++++---------- GPy/models/GP.py | 11 ++-- 3 files changed, 58 insertions(+), 41 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 0fd3efeb..abb5f4ce 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -58,13 +58,13 @@ def v_fail_test(): m = GPy.models.GP(X, stu_t_likelihood, kernel1) m.constrain_positive('') vs = 25 - noises = 40 + noises = 30 checkgrads = np.zeros((vs, noises)) vs_noises = np.zeros((vs, noises)) for v_ind, v in enumerate(np.linspace(1, 100, vs)): m.likelihood.likelihood_function.v = v print v - for noise_ind, noise in enumerate(np.linspace(0.0001, 10, noises)): + for noise_ind, noise in enumerate(np.linspace(0.0001, 100, noises)): m['t_noise'] = noise m.update_likelihood_approximation() checkgrads[v_ind, noise_ind] = m.checkgrad() @@ -145,9 +145,9 @@ def 
debug_student_t_noise_approx(): #m['rbf_len'] = 1.5 #m.constrain_fixed('rbf_v', 1.0898) #m.constrain_fixed('rbf_l', 1.8651) - m.constrain_fixed('t_noise_std', edited_real_sd) + #m.constrain_fixed('t_noise_std', edited_real_sd) #m.constrain_positive('rbf') - #m.constrain_positive('t_noise_std') + m.constrain_positive('t_noise_std') #m.constrain_positive('') m.ensure_default_constraints() #m.constrain_fixed('t_noi', real_sd) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 1431a7c6..e096c5f4 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -90,7 +90,7 @@ class Laplace(likelihood): dL_dthetaK_exp = dK_dthetaK(expl, X) dL_dthetaK_imp = dK_dthetaK(impl, X) print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) - dL_dthetaK = dL_dthetaK_exp +dL_dthetaK_imp + dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp return dL_dthetaK def _gradients(self, partial): @@ -126,7 +126,6 @@ class Laplace(likelihood): due to the z rescaling. 
at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1) - This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1) giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f) @@ -143,17 +142,18 @@ class Laplace(likelihood): #dtritri -> L -> L_i #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i #((L.T*w)_i + I)f_hat = y_tilde - L = jitchol(self.K) - Li = chol_inv(L) - Lt_W = L.T*self.W.T + #L = jitchol(self.K) + #Li = chol_inv(L) + #Lt_W = L.T*self.W.T - Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0] - self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N) + #Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0] + #self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N) + #Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat) - Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat) - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + Wi = 1.0/self.W + self.Sigma_tilde = np.diagflat(Wi) - self.Sigma_tilde = np.diagflat(1.0/self.W) + Y_tilde = Wi*(self.Ki_f + self.W*self.f_hat) self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K) @@ -281,7 +281,7 @@ class Laplace(likelihood): f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False) return f_hat[:, None] - def rasm_mode(self, K, MAX_ITER=250, MAX_RESTART=10): + def rasm_mode(self, K, MAX_ITER=40, MAX_RESTART=10): """ Rasmussens numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -297,6 +297,7 @@ class Laplace(likelihood): old_a = self.old_a f = np.dot(self.K, old_a) + self.f = f new_obj = -np.inf old_obj = np.inf @@ -304,7 +305,7 @@ class Laplace(likelihood): return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data) difference = np.inf - epsilon = 1e-9 + epsilon = 1e-6 step_size = 1 rs = 0 i = 0 @@ -316,6 +317,8 @@ class Laplace(likelihood): # To cause the posterior to become less certain than the prior and likelihood, # This is a property only 
held by non-log-concave likelihoods B, L, W_12 = self._compute_B_statistics(K, W) + #if i > 30: + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT W_f = W*f grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data) @@ -326,37 +329,52 @@ class Laplace(likelihood): full_step_a = b - W_12*solve_L da = full_step_a - old_a - f_old = f - update_passed = False - while not update_passed: + f_old = self.f.copy() + + def inner_obj(step_size, old_a, da, K): a = old_a + step_size*da f = np.dot(K, a) + self.a = a + self.f = f + return -obj(a, f) - old_obj = new_obj - new_obj = np.float(obj(a, f)) - difference = new_obj - old_obj + from functools import partial + i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K) + old_obj = new_obj + new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10) + + #update_passed = False + #while not update_passed: + #a = old_a + step_size*da + #f = np.dot(K, a) + + #old_obj = new_obj + #new_obj = obj(a, f) + #difference = new_obj - old_obj #print "difference: ",difference - if difference < -epsilon: - #print grad + #if difference < 0: + ##print grad #print "Objective function rose", np.float(difference) - #If the objective function isn't rising, restart optimization - step_size *= 0.4 + ##If the objective function isn't rising, restart optimization + #step_size *= 0.8 #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) - #objective function isn't increasing, try reducing step size - #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode - #old_obj = tmp_old_obj - old_obj = new_obj - rs += 1 - else: - update_passed = True + ##objective function isn't increasing, try reducing step size + ##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode + ##old_obj = tmp_old_obj + #old_obj = new_obj + #rs += 1 + #else: + #update_passed = True + f = self.f + difference = new_obj - old_obj difference = np.abs(np.sum(f - 
f_old)) + abs(difference) - old_a = a + old_a = self.a #a i += 1 #print "Positive difference obj: ", np.float(difference) print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size) - self.a = a + #self.a = a self.B, self.B_chol, self.W_12 = B, L, W_12 self.Bi, _, _, B_det = pdinv(self.B) return f diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 7b6fab27..1d57ed38 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -142,19 +142,18 @@ class GP(model): Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta """ dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) - print "dL_dthetaK should be: ", dL_dthetaK + #print "dL_dthetaK should be: ", dL_dthetaK if isinstance(self.likelihood, Laplace): - self.likelihood.fit_full(self.kern.K(self.X)) - self.likelihood._set_params(self.likelihood._get_params()) + #self.likelihood.fit_full(self.kern.K(self.X)) + #self.likelihood._set_params(self.likelihood._get_params()) dK_dthetaK = self.kern.dK_dtheta dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) - #print "dL_dthetaK after: ",dL_dthetaK - #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) else: dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) - #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) + #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) print "dL_dthetaK is: ", dL_dthetaK + print "dL_dthetaL is: ", dL_dthetaL return np.hstack((dL_dthetaK, dL_dthetaL)) #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) From 26b3855af56ee220cfa00928f6f936bd1161acdf Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 1 Jul 2013 10:06:20 +0100 Subject: [PATCH 58/71] Everything seems to be gradchecking again --- 
GPy/examples/laplace_approximations.py | 7 ++++++- GPy/likelihoods/Laplace.py | 18 +++++++++--------- GPy/likelihoods/likelihood_functions.py | 2 +- GPy/models/GP.py | 3 +-- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index abb5f4ce..24f2d88c 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -91,6 +91,8 @@ def debug_student_t_noise_approx(): X = np.linspace(0.0, 10.0, 50)[:, None] #X = np.array([0.5, 1])[:, None] Y = np.sin(X) + np.random.randn(*X.shape)*real_var + #ty = np.array([1., 9.97733584, 4.17841363])[:, None] + #Y = ty X_full = X Y_full = np.sin(X_full) @@ -98,7 +100,7 @@ def debug_student_t_noise_approx(): Y = Y/Y.max() #Add student t random noise to datapoints - deg_free = 100 + deg_free = 10000 real_sd = np.sqrt(real_var) print "Real noise std: ", real_sd @@ -151,6 +153,9 @@ def debug_student_t_noise_approx(): #m.constrain_positive('') m.ensure_default_constraints() #m.constrain_fixed('t_noi', real_sd) + #m['rbf_var'] = 0.20446332 + #m['rbf_leng'] = 0.85776241 + #m['t_noise'] = 0.667083294421005 m.update_likelihood_approximation() #m.optimize(messages=True) print(m) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index e096c5f4..e4652f27 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -153,7 +153,7 @@ class Laplace(likelihood): Wi = 1.0/self.W self.Sigma_tilde = np.diagflat(Wi) - Y_tilde = Wi*(self.Ki_f + self.W*self.f_hat) + Y_tilde = Wi*self.Ki_f + self.f_hat self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K) @@ -199,7 +199,7 @@ class Laplace(likelihood): self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - self.W[self.W < 0] = 1e-5 # FIXME-HACK: This is a hack since GPy can't handle 
negative variances which can occur + self.W[self.W < 0] = 1e-8 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur #If the likelihood is non-log-concave. We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods @@ -312,7 +312,7 @@ class Laplace(likelihood): while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + W[W < 0] = 0#1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur # If the likelihood is non-log-concave. We wan't to say that there is a negative variance # To cause the posterior to become less certain than the prior and likelihood, # This is a property only held by non-log-concave likelihoods @@ -329,8 +329,9 @@ class Laplace(likelihood): full_step_a = b - W_12*solve_L da = full_step_a - old_a - f_old = self.f.copy() + f_old = f.copy() + f_old = self.f.copy() def inner_obj(step_size, old_a, da, K): a = old_a + step_size*da f = np.dot(K, a) @@ -340,7 +341,6 @@ class Laplace(likelihood): from functools import partial i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K) - old_obj = new_obj new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10) #update_passed = False @@ -354,10 +354,10 @@ class Laplace(likelihood): #print "difference: ",difference #if difference < 0: ##print grad - #print "Objective function rose", np.float(difference) + ##print "Objective function rose", np.float(difference) ##If the objective function isn't rising, restart optimization #step_size *= 0.8 - #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) + ##print "Reducing step-size to {ss:.3} and restarting 
optimization".format(ss=step_size) ##objective function isn't increasing, try reducing step size ##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode ##old_obj = tmp_old_obj @@ -368,12 +368,12 @@ class Laplace(likelihood): f = self.f difference = new_obj - old_obj - difference = np.abs(np.sum(f - f_old)) + abs(difference) + difference = np.abs(np.sum(f - f_old)) #+ abs(difference) old_a = self.a #a i += 1 #print "Positive difference obj: ", np.float(difference) - print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size) + #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size) #self.a = a self.B, self.B_chol, self.W_12 = B, L, W_12 self.Bi, _, _, B_det = pdinv(self.B) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 4d298122..ebc87f56 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -274,7 +274,7 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) + dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) #2 might not want to be here? 
/ ((self.v*(self.sigma**2) + e**2)**2) ) return dlik_grad_dsigma diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 1d57ed38..20337ef5 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -152,8 +152,7 @@ class GP(model): else: dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) - print "dL_dthetaK is: ", dL_dthetaK - print "dL_dthetaL is: ", dL_dthetaL + print "dL_dthetaK: {} dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL) return np.hstack((dL_dthetaK, dL_dthetaL)) #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) From a7169ab1ab771e567e45d6a11ae9e13b13f3c754 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 1 Jul 2013 15:21:47 +0100 Subject: [PATCH 59/71] Fixed bug where B wasn't refering to current f location --- GPy/core/model.py | 3 +++ GPy/examples/laplace_approximations.py | 5 +++-- GPy/likelihoods/Laplace.py | 21 ++++++++++----------- GPy/likelihoods/likelihood_functions.py | 6 +++++- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/GPy/core/model.py b/GPy/core/model.py index 94202396..83a4a428 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -244,6 +244,9 @@ class model(parameterised): LL_gradients = self._transform_gradients(self._log_likelihood_gradients()) prior_gradients = self._transform_gradients(self._log_prior_gradients()) obj_grads = -LL_gradients - prior_gradients + print self + #self.checkgrad(verbose=1) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return obj_f, obj_grads def optimize(self, optimizer=None, start=None, **kwargs): diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 24f2d88c..bb621424 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -100,7 +100,7 @@ def debug_student_t_noise_approx(): Y = Y/Y.max() #Add student t random noise to 
datapoints - deg_free = 10000 + deg_free = 1000 real_sd = np.sqrt(real_var) print "Real noise std: ", real_sd @@ -152,7 +152,7 @@ def debug_student_t_noise_approx(): m.constrain_positive('t_noise_std') #m.constrain_positive('') m.ensure_default_constraints() - #m.constrain_fixed('t_noi', real_sd) + m.constrain_bounded('t_noi', 0.001, 10) #m['rbf_var'] = 0.20446332 #m['rbf_leng'] = 0.85776241 #m['t_noise'] = 0.667083294421005 @@ -168,6 +168,7 @@ def debug_student_t_noise_approx(): plt.plot(X_full, Y_full) plt.ylim(-2.5, 2.5) print "Real noise std: ", real_sd + print "or Real noise std: ", real_stu_t_std return m #print "Clean student t, ncg" diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index e4652f27..4c9c67df 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -158,7 +158,6 @@ class Laplace(likelihood): self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K) l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) - #print "fDf:{} l:{} detKWiBi:{} W:{} Wi:{} Bi:{} Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum()) y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde) Z_tilde = (+ self.NORMAL_CONST @@ -199,14 +198,14 @@ class Laplace(likelihood): self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - self.W[self.W < 0] = 1e-8 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur #If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though - #self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W) - #self.Bi, _, _, B_det = pdinv(self.B) + self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W) + self.Bi, _, _, B_det = pdinv(self.B) #Do the computation again at f to get Ki_f which is useful #b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data) @@ -305,14 +304,14 @@ class Laplace(likelihood): return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data) difference = np.inf - epsilon = 1e-6 + epsilon = 1e-10 step_size = 1 rs = 0 i = 0 while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - W[W < 0] = 0#1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur # If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance # To cause the posterior to become less certain than the prior and likelihood, # This is a property only held by non-log-concave likelihoods @@ -335,13 +334,13 @@ class Laplace(likelihood): def inner_obj(step_size, old_a, da, K): a = old_a + step_size*da f = np.dot(K, a) - self.a = a + self.a = a # This is nasty, need to set something within an optimization though self.f = f return -obj(a, f) from functools import partial i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K) - new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10) + new_obj = sp.optimize.brent(i_o, tol=1e-6, maxiter=10) #update_passed = False #while not update_passed: @@ -373,8 +372,8 @@ class Laplace(likelihood): i += 1 #print "Positive difference obj: ", np.float(difference) - #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size) + print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size) #self.a = a - self.B, self.B_chol, self.W_12 = B, L, W_12 - self.Bi, _, _, B_det = pdinv(self.B) + #self.B, self.B_chol, self.W_12 = B, L, W_12 + #self.Bi, _, _, B_det = pdinv(self.B) return f diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index ebc87f56..57627198 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -195,8 +195,9 @@ class student_t(likelihood_function): e = y - f objective = (+ gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) - - np.log(self.sigma * np.sqrt(self.v * np.pi)) + - 0.5*np.log((self.sigma**2) * self.v * np.pi) - (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v)) + #- (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2))) ) return np.sum(objective) @@ -264,6 +265,7 @@ class student_t(likelihood_function): dlik_dsigma = ( - (1/self.sigma) + 
((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) ) ) + #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1 return dlik_dsigma def dlik_df_dstd(self, y, f, extra_data=None): @@ -290,6 +292,8 @@ class student_t(likelihood_function): dlik_hess_dsigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) / ((e**2 + (self.sigma**2)*self.v)**3) ) + #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2)) + #/ ((e**2 + (self.sigma**2)*self.v)**3) ) return dlik_hess_dsigma def _gradients(self, y, f, extra_data=None): From ab6a3a571e4ef0aec66776f56921326166f09d40 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 2 Jul 2013 11:14:48 +0100 Subject: [PATCH 60/71] Playing trying to find what makes it want to go so low --- GPy/core/model.py | 2 +- GPy/examples/laplace_approximations.py | 21 ++++++++++++++------- GPy/likelihoods/Laplace.py | 18 +++++++++--------- GPy/likelihoods/likelihood_functions.py | 4 ++-- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/GPy/core/model.py b/GPy/core/model.py index 83a4a428..f97938a4 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -246,7 +246,7 @@ class model(parameterised): obj_grads = -LL_gradients - prior_gradients print self #self.checkgrad(verbose=1) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return obj_f, obj_grads def optimize(self, optimizer=None, start=None, **kwargs): diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index bb621424..14400a08 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -88,9 +88,12 @@ def debug_student_t_noise_approx(): plot = False real_var = 0.1 #Start a function, any function - X = np.linspace(0.0, 10.0, 50)[:, None] + #X = np.linspace(0.0, 10.0, 50)[:, None] + X = np.random.rand(100)[:, None] + #X = 
np.random.rand(100)[:, None] #X = np.array([0.5, 1])[:, None] - Y = np.sin(X) + np.random.randn(*X.shape)*real_var + Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + #Y = X + np.random.randn(*X.shape)*real_var #ty = np.array([1., 9.97733584, 4.17841363])[:, None] #Y = ty @@ -112,7 +115,8 @@ def debug_student_t_noise_approx(): plt.close('all') # Kernel object - kernel1 = GPy.kern.rbf(X.shape[1])# + GPy.kern.white(X.shape[1]) + kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + #kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1]) kernel2 = kernel1.copy() kernel3 = kernel1.copy() kernel4 = kernel1.copy() @@ -136,7 +140,7 @@ def debug_student_t_noise_approx(): #print m real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free))) - edited_real_sd = real_stu_t_std#initial_var_guess #real_sd + edited_real_sd = real_stu_t_std + 1#initial_var_guess #real_sd #edited_real_sd = real_sd print "Clean student t, rasm" @@ -149,13 +153,16 @@ def debug_student_t_noise_approx(): #m.constrain_fixed('rbf_l', 1.8651) #m.constrain_fixed('t_noise_std', edited_real_sd) #m.constrain_positive('rbf') - m.constrain_positive('t_noise_std') + #m.constrain_positive('t_noise_std') #m.constrain_positive('') - m.ensure_default_constraints() - m.constrain_bounded('t_noi', 0.001, 10) + #m.constrain_bounded('t_noi', 0.001, 10) + #m.constrain_fixed('t_noi', real_stu_t_std) + m.constrain_fixed('white', 0.01) + #m.constrain_fixed('t_no', 0.01) #m['rbf_var'] = 0.20446332 #m['rbf_leng'] = 0.85776241 #m['t_noise'] = 0.667083294421005 + m.ensure_default_constraints() m.update_likelihood_approximation() #m.optimize(messages=True) print(m) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 4c9c67df..2ae68613 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -156,15 +156,15 @@ class Laplace(likelihood): Y_tilde = Wi*self.Ki_f + self.f_hat self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R - 
ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K) - l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) + self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K) + self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) - y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde) - Z_tilde = (+ self.NORMAL_CONST - + l - + 0.5*ln_det_K_Wi__Bi + self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde) + Z_tilde = (#+ self.NORMAL_CONST + + self.lik + + 0.5*self.ln_det_K_Wi__Bi - 0.5*self.f_Ki_f - + 0.5*y_Wi_Ki_i_y + + 0.5*self.y_Wi_Ki_i_y ) #print "Ztilde: {}".format(Z_tilde) @@ -198,7 +198,7 @@ class Laplace(likelihood): self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + self.W[self.W < 0] = 1e-10 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur #If the likelihood is non-log-concave. We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods @@ -311,7 +311,7 @@ class Laplace(likelihood): while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + W[W < 0] = 1e-10 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur # If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance # To cause the posterior to become less certain than the prior and likelihood, # This is a property only held by non-log-concave likelihoods diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 57627198..fd64dbe6 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -196,8 +196,8 @@ class student_t(likelihood_function): objective = (+ gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) - 0.5*np.log((self.sigma**2) * self.v * np.pi) - - (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v)) - #- (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2))) + #- (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v)) + - (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2))) ) return np.sum(objective) From 4e5cefb4b5cb14a3c4f94dbd4d18eac8c70a84fd Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 8 Jul 2013 15:48:53 +0100 Subject: [PATCH 61/71] Reparameratised in terms of sigma2 --- GPy/core/model.py | 3 - GPy/examples/laplace_approximations.py | 34 ++-- GPy/likelihoods/Laplace.py | 12 +- GPy/likelihoods/likelihood_functions.py | 207 +++++++++++++++++++++--- 4 files changed, 207 insertions(+), 49 deletions(-) diff --git a/GPy/core/model.py b/GPy/core/model.py index f97938a4..94202396 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -244,9 +244,6 @@ class model(parameterised): LL_gradients = self._transform_gradients(self._log_likelihood_gradients()) prior_gradients = self._transform_gradients(self._log_prior_gradients()) obj_grads = -LL_gradients - prior_gradients - print self - #self.checkgrad(verbose=1) - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return obj_f, obj_grads def optimize(self, optimizer=None, start=None, **kwargs): diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 14400a08..d6b48ebf 100644 --- 
a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -24,7 +24,7 @@ def timing(): edited_real_sd = real_sd kernel1 = GPy.kern.rbf(X.shape[1]) - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1) m.ensure_default_constraints() @@ -53,7 +53,7 @@ def v_fail_test(): edited_real_sd = real_sd print "Clean student t, rasm" - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernel1) m.constrain_positive('') @@ -92,18 +92,18 @@ def debug_student_t_noise_approx(): X = np.random.rand(100)[:, None] #X = np.random.rand(100)[:, None] #X = np.array([0.5, 1])[:, None] - Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + 1 #Y = X + np.random.randn(*X.shape)*real_var #ty = np.array([1., 9.97733584, 4.17841363])[:, None] #Y = ty X_full = X - Y_full = np.sin(X_full) + Y_full = np.sin(X_full) + 1 Y = Y/Y.max() #Add student t random noise to datapoints - deg_free = 1000 + deg_free = 100 real_sd = np.sqrt(real_var) print "Real noise std: ", real_sd @@ -115,7 +115,7 @@ def debug_student_t_noise_approx(): plt.close('all') # Kernel object - kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1]) #kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1]) kernel2 = kernel1.copy() kernel3 = kernel1.copy() @@ -140,24 +140,24 @@ def 
debug_student_t_noise_approx(): #print m real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free))) - edited_real_sd = real_stu_t_std + 1#initial_var_guess #real_sd + edited_real_sd = real_stu_t_std**2 #initial_var_guess #real_sd #edited_real_sd = real_sd print "Clean student t, rasm" - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernel6) #m['rbf_len'] = 1.5 #m.constrain_fixed('rbf_v', 1.0898) - #m.constrain_fixed('rbf_l', 1.8651) + #m.constrain_fixed('rbf_l', 0.2651) #m.constrain_fixed('t_noise_std', edited_real_sd) #m.constrain_positive('rbf') - #m.constrain_positive('t_noise_std') + m.constrain_positive('t_noise_std') #m.constrain_positive('') #m.constrain_bounded('t_noi', 0.001, 10) #m.constrain_fixed('t_noi', real_stu_t_std) - m.constrain_fixed('white', 0.01) + #m.constrain_fixed('white', 0.01) #m.constrain_fixed('t_no', 0.01) #m['rbf_var'] = 0.20446332 #m['rbf_leng'] = 0.85776241 @@ -179,7 +179,7 @@ def debug_student_t_noise_approx(): return m #print "Clean student t, ncg" - #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg') #m = GPy.models.GP(X, stu_t_likelihood, kernel3) #m.ensure_default_constraints() @@ -276,7 +276,7 @@ def student_t_approx(): edited_real_sd = real_sd #initial_var_guess print "Clean student t, rasm" - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), 
t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernel6) m.ensure_default_constraints() @@ -291,7 +291,7 @@ def student_t_approx(): plt.title('Student-t rasm clean') print "Corrupt student t, rasm" - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) m.ensure_default_constraints() @@ -308,7 +308,7 @@ def student_t_approx(): return m #print "Clean student t, ncg" - #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg') #m = GPy.models.GP(X, stu_t_likelihood, kernel3) #m.ensure_default_constraints() @@ -322,7 +322,7 @@ def student_t_approx(): #plt.title('Student-t ncg clean') #print "Corrupt student t, ncg" - #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd) + #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg') #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) #m.ensure_default_constraints() @@ -337,7 +337,7 @@ def student_t_approx(): ###with a student t distribution, since it has heavy tails it should work well - ###likelihood_function = student_t(deg_free, sigma=real_var) + ###likelihood_function = student_t(deg_free, sigma2=real_var) ###lap = Laplace(Y, likelihood_function) ###cov = kernel.K(X) ###lap.fit_full(cov) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 2ae68613..984112a5 100644 --- a/GPy/likelihoods/Laplace.py +++ 
b/GPy/likelihoods/Laplace.py @@ -220,10 +220,10 @@ class Laplace(likelihood): self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T) #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W)) - self.ln_z_hat = (- 0.5*self.f_Ki_f - - self.ln_I_KW_det - + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) - ) + #self.ln_z_hat = (- 0.5*self.f_Ki_f + #- self.ln_I_KW_det + #+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) + #) return self._compute_GP_variables() @@ -308,6 +308,8 @@ class Laplace(likelihood): step_size = 1 rs = 0 i = 0 + #if self.likelihood_function.sigma < 0.001: + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) if not self.likelihood_function.log_concave: @@ -316,8 +318,6 @@ class Laplace(likelihood): # To cause the posterior to become less certain than the prior and likelihood, # This is a property only held by non-log-concave likelihoods B, L, W_12 = self._compute_B_statistics(K, W) - #if i > 30: - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT W_f = W*f grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index fd64dbe6..bfc759d7 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -158,26 +158,26 @@ class student_t(likelihood_function): dln p(yi|fi)_dfi d2ln p(yi|fi)_d2fifj """ - def __init__(self, deg_free, sigma=2): + def __init__(self, deg_free, sigma2=2): #super(student_t, self).__init__() self.v = deg_free - self.sigma = sigma + self.sigma2 = sigma2 self.log_concave = False - self._set_params(np.asarray(sigma)) + self._set_params(np.asarray(sigma2)) def _get_params(self): - return np.asarray(self.sigma) + return 
np.asarray(self.sigma2) def _get_param_names(self): - return ["t_noise_std"] + return ["t_noise_std2"] def _set_params(self, x): - self.sigma = float(x) + self.sigma2 = float(x) @property def variance(self, extra_data=None): - return (self.v / float(self.v - 2)) * (self.sigma**2) + return (self.v / float(self.v - 2)) * self.sigma2 def link_function(self, y, f, extra_data=None): """link_function $\ln p(y|f)$ @@ -193,12 +193,16 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f + A = gammaln((self.v + 1) * 0.5) + B = -gammaln(self.v * 0.5) + C = - 0.5*np.log(self.sigma2 * self.v * np.pi) + D = (-(self.v + 1)*0.5)*np.log(1 + (e**2)/(self.v*self.sigma2)) objective = (+ gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) - - 0.5*np.log((self.sigma**2) * self.v * np.pi) - #- (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v)) - - (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2))) + - 0.5*np.log(self.sigma2 * self.v * np.pi) + + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v)) ) + #print "A: {} B: {} C: {} D: {} obj: {}".format(A,B,C,D.sum(),objective.sum()) return np.sum(objective) def dlik_df(self, y, f, extra_data=None): @@ -215,7 +219,7 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) + grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2)) return grad def d2lik_d2f(self, y, f, extra_data=None): @@ -235,7 +239,7 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) + hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2) return hess def d3lik_d3f(self, y, f, extra_data=None): @@ -246,8 +250,8 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 
3*self.v*(self.sigma**2))) / - ((e**2 + (self.sigma**2)*self.v)**3) + d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) / + ((e**2 + self.sigma2*self.v)**3) ) return d3lik_d3f @@ -262,10 +266,16 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - dlik_dsigma = ( - (1/self.sigma) + - ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) ) - ) + #sigma = np.sqrt(self.sigma2) + #dlik_dsigma = ( - (1/sigma) + + #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) + #) + #dlik_dsigma = ( - 1 + + #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) + #) #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1 + #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v)) + dlik_dsigma = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2)) return dlik_dsigma def dlik_df_dstd(self, y, f, extra_data=None): @@ -276,9 +286,11 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) #2 might not want to be here? - / ((self.v*(self.sigma**2) + e**2)**2) - ) + #sigma = np.sqrt(self.sigma2) + #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here? 
+ #/ ((self.v*self.sigma2 + e**2)**2) + #) + dlik_grad_dsigma = (-self.v*(self.v+1)*e)/((self.sigma2*self.v + e**2)**2) return dlik_grad_dsigma def d2lik_d2f_dstd(self, y, f, extra_data=None): @@ -289,11 +301,15 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - dlik_hess_dsigma = ( (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) / - ((e**2 + (self.sigma**2)*self.v)**3) - ) + #sigma = np.sqrt(self.sigma2) + #dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) / + #((e**2 + self.sigma2*self.v)**3) + #) #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2)) #/ ((e**2 + (self.sigma**2)*self.v)**3) ) + dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2))) + / (self.sigma2*self.v + (e**2))**3 + ) return dlik_hess_dsigma def _gradients(self, y, f, extra_data=None): @@ -466,3 +482,148 @@ class weibull_survival(likelihood_function): hess = (y**self.shape)*np.exp(f) return np.squeeze(hess) + +#class gaussian(likelihood_function): + #""" + #Gaussian likelihood - this is a test class for approximation schemes + #""" + #def __init__(self, variance): + #self._set_params(np.asarray(variance)) + + #def _get_params(self): + #return np.asarray(self.sigma2) + + #def _get_param_names(self): + #return ["noise_variance"] + + #def _set_params(self, x): + #self.variance = float(x) + + #def link_function(self, y, f, extra_data=None): + #"""link_function $\ln p(y|f)$ + #$$\ln p(y_{i}|f_{i}) = \ln $$ + + #:y: data + #:f: latent variables f + #:extra_data: extra_data which is not used in student t distribution + #:returns: float(likelihood evaluated for this point) + + #""" + #assert y.shape == f.shape + #e = y - f + #objective = -0.5*self.D* + #return np.sum(objective) + + #def dlik_df(self, y, f, extra_data=None): + #""" + #Gradient of the link function at y, given f w.r.t f + + #$$\frac{dp(y_{i}|f_{i})}{df} = 
\frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$ + + #:y: data + #:f: latent variables f + #:extra_data: extra_data which is not used in student t distribution + #:returns: gradient of likelihood evaluated at points + + #""" + #assert y.shape == f.shape + #e = y - f + #grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2)) + #return grad + + #def d2lik_d2f(self, y, f, extra_data=None): + #""" + #Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j + #i.e. second derivative link_function at y given f f_j w.r.t f and f_j + + #Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases + #(the distribution for y_{i} depends only on f_{i} not on f_{j!=i} + + #$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$ + + #:y: data + #:f: latent variables f + #:extra_data: extra_data which is not used in student t distribution + #:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) + #""" + #assert y.shape == f.shape + #e = y - f + #hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2) + #return hess + + #def d3lik_d3f(self, y, f, extra_data=None): + #""" + #Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j + + #$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$ + #""" + #assert y.shape == f.shape + #e = y - f + #d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) / + #((e**2 + self.sigma2*self.v)**3) + #) + #return d3lik_d3f + + #def lik_dstd(self, y, f, extra_data=None): + #""" + #Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation) + + #Terms relavent to derivatives wrt sigma are: + #-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 
+ (1/v)*((y-f)/(s))^2)) + + #$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$ + #""" + #assert y.shape == f.shape + #e = y - f + #sigma = np.sqrt(self.sigma2) + ##dlik_dsigma = ( - (1/sigma) + + ##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) + ##) + ##dlik_dsigma = ( - 1 + + ##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) + ##) + ##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1 + #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v)) + #return dlik_dsigma + + #def dlik_df_dstd(self, y, f, extra_data=None): + #""" + #Gradient of the dlik_df w.r.t sigma parameter (standard deviation) + + #$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$ + #""" + #assert y.shape == f.shape + #e = y - f + #sigma = np.sqrt(self.sigma2) + #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here? 
+ #/ ((self.v*self.sigma2 + e**2)**2) + #) + #return dlik_grad_dsigma + + #def d2lik_d2f_dstd(self, y, f, extra_data=None): + #""" + #Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation) + + #$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ + #""" + #assert y.shape == f.shape + #e = y - f + #sigma = np.sqrt(self.sigma2) + #dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) / + #((e**2 + self.sigma2*self.v)**3) + #) + ##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2)) + ##/ ((e**2 + (self.sigma**2)*self.v)**3) ) + #return dlik_hess_dsigma + + #def _gradients(self, y, f, extra_data=None): + ##must be listed in same order as 'get_param_names' + #derivs = ([self.lik_dstd(y, f, extra_data=extra_data)], + #[self.dlik_df_dstd(y, f, extra_data=extra_data)], + #[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)] + #) # lists as we might learn many parameters + ## ensure we have gradients for every parameter we want to optimize + #assert len(derivs[0]) == len(self._get_param_names()) + #assert len(derivs[1]) == len(self._get_param_names()) + #assert len(derivs[2]) == len(self._get_param_names()) + #return derivs From 2a366619b340d25d5dd53836e2e66ffcfb2257d7 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 8 Jul 2013 16:09:20 +0100 Subject: [PATCH 62/71] Changed incorrect naming --- GPy/examples/laplace_approximations.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index d6b48ebf..78b4e986 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -84,6 +84,26 @@ def v_fail_test(): import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print(m) +def student_t_f_check(): + real_var = 0.1 + X = np.random.rand(100)[:, 
None] + Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + X_full = X + Y_full = np.sin(X_full) + Y = Y/Y.max() + deg_free = 1000 + real_sd = np.sqrt(real_var) + + kernel = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1]) + real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free))) + + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std**2) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') + m = GPy.models.GP(X, stu_t_likelihood, kernel) + m.constrain_positive('t_noise_std2') + m.ensure_default_constraints() + m.update_likelihood_approximation() + def debug_student_t_noise_approx(): plot = False real_var = 0.1 @@ -151,9 +171,9 @@ def debug_student_t_noise_approx(): #m['rbf_len'] = 1.5 #m.constrain_fixed('rbf_v', 1.0898) #m.constrain_fixed('rbf_l', 0.2651) - #m.constrain_fixed('t_noise_std', edited_real_sd) + #m.constrain_fixed('t_noise_std2', edited_real_sd) #m.constrain_positive('rbf') - m.constrain_positive('t_noise_std') + m.constrain_positive('t_noise_std2') #m.constrain_positive('') #m.constrain_bounded('t_noi', 0.001, 10) #m.constrain_fixed('t_noi', real_stu_t_std) From ee980227ac34262b192565cafb5e195cefee46d0 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 9 Jul 2013 11:35:42 +0100 Subject: [PATCH 63/71] Fixed 2*variance plotting instead of 2*std plotting, tidied up --- GPy/examples/laplace_approximations.py | 93 ++++++++++++++++++++----- GPy/likelihoods/Laplace.py | 2 +- GPy/likelihoods/likelihood_functions.py | 28 +------- GPy/models/GP.py | 2 +- 4 files changed, 78 insertions(+), 47 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 78b4e986..b3048f5a 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -85,24 +85,78 @@ def v_fail_test(): print(m) def student_t_f_check(): - real_var = 0.1 + plt.close('all') + real_std = 0.1 X = 
np.random.rand(100)[:, None] - Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + noise = np.random.randn(*X.shape)*real_std + Y = np.sin(X*2*np.pi) + noise X_full = X Y_full = np.sin(X_full) - Y = Y/Y.max() - deg_free = 1000 - real_sd = np.sqrt(real_var) + #Y = Y/Y.max() + deg_free = 10000 - kernel = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1]) - real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free))) + #GP + kernelgp = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1]) + mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp) + mgp.ensure_default_constraints() + mgp.randomize() + mgp.optimize() - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std**2) + kernelst = kernelgp.copy() + real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free)) + + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std2) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') - m = GPy.models.GP(X, stu_t_likelihood, kernel) - m.constrain_positive('t_noise_std2') - m.ensure_default_constraints() + + plt.figure(1) + plt.suptitle('Student likelihood') + m = GPy.models.GP(X, stu_t_likelihood, kernelst) + m.constrain_fixed('rbf_var', mgp._get_params()[0]) + m.constrain_fixed('rbf_len', mgp._get_params()[1]) + m.update_likelihood_approximation() + print "T std2 {} converted from original data, LL: {}".format(real_stu_t_std2, m.log_likelihood()) + plt.subplot(221) + m.plot() + plt.title('Student t original data noise') + + #Fix student t noise variance to same a GP + gp_noise = mgp._get_params()[2] + m['t_noise_std2'] = gp_noise + m.update_likelihood_approximation() + print "T std2 {} same as GP noise, LL: {}".format(gp_noise, m.log_likelihood()) + plt.subplot(222) + m.plot() + plt.title('Student t GP noise') + + #Fix student t noise to variance converted from the GP + real_stu_t_std2gp = (gp_noise)*((deg_free - 2)/float(deg_free)) + 
m['t_noise_std2'] = real_stu_t_std2gp + m.update_likelihood_approximation() + print "T std2 {} converted to student t noise from GP noise, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.log_likelihood()) + plt.subplot(223) + m.plot() + plt.title('Student t GP noise converted') + + m.constrain_positive('t_noise_std2') + m.randomize() + m.update_likelihood_approximation() + m.optimize() + print "T std2 {} var {} after optimising, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.likelihood.likelihood_function.variance, m.log_likelihood()) + plt.subplot(224) + m.plot() + plt.title('Student t optimised') + + plt.figure(2) + print "GP noise {} after optimising, LL: {}".format(gp_noise, mgp.log_likelihood()) + plt.suptitle('Gaussian likelihood optimised') + mgp.plot() + print "Real std: {}".format(real_std) + print "Real variance {}".format(real_std**2) + + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + return m def debug_student_t_noise_approx(): plot = False @@ -218,16 +272,16 @@ def student_t_approx(): """ Example of regressing with a student t likelihood """ - real_var = 0.2 + real_std = 0.1 #Start a function, any function - X = np.linspace(0.0, 10.0, 30)[:, None] - Y = np.sin(X) + np.random.randn(*X.shape)*real_var + X = np.linspace(0.0, 10.0, 50)[:, None] + Y = np.sin(X) + np.random.randn(*X.shape)*real_std Yc = Y.copy() X_full = np.linspace(0.0, 10.0, 500)[:, None] Y_full = np.sin(X_full) - #Y = Y/Y.max() + Y = Y/Y.max() Yc[10] += 100 Yc[25] += 10 @@ -238,10 +292,9 @@ def student_t_approx(): #Add student t random noise to datapoints deg_free = 8 - real_sd = np.sqrt(real_var) - print "Real noise: ", real_sd + print "Real noise: ", real_std - initial_var_guess = 0.01 + initial_var_guess = 0.1 #t_rv = t(deg_free, loc=0, scale=real_var) #noise = t_rvrvs(size=Y.shape) #Y += noise @@ -293,7 +346,7 @@ def student_t_approx(): plt.figure(2) plt.suptitle('Student-t likelihood') - edited_real_sd = real_sd #initial_var_guess + edited_real_sd = 
real_std #initial_var_guess print "Clean student t, rasm" t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd) @@ -301,6 +354,7 @@ def student_t_approx(): m = GPy.models.GP(X, stu_t_likelihood, kernel6) m.ensure_default_constraints() m.constrain_positive('t_noise') + m.randomize() m.update_likelihood_approximation() m.optimize() print(m) @@ -316,6 +370,7 @@ def student_t_approx(): m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) m.ensure_default_constraints() m.constrain_positive('t_noise') + m.randomize() m.update_likelihood_approximation() m.optimize() print(m) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 984112a5..c5894ed6 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -89,7 +89,7 @@ class Laplace(likelihood): expl = 0.5*expl_a + 0.5*expl_b # Might need to be -? dL_dthetaK_exp = dK_dthetaK(expl, X) dL_dthetaK_imp = dK_dthetaK(impl, X) - print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) + #print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp return dL_dthetaK diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index bfc759d7..595fa63c 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -193,16 +193,11 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - A = gammaln((self.v + 1) * 0.5) - B = -gammaln(self.v * 0.5) - C = - 0.5*np.log(self.sigma2 * self.v * np.pi) - D = (-(self.v + 1)*0.5)*np.log(1 + (e**2)/(self.v*self.sigma2)) objective = (+ gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) - 0.5*np.log(self.sigma2 * self.v * np.pi) + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v)) ) - #print "A: {} B: {} C: {} D: {} obj: {}".format(A,B,C,D.sum(),objective.sum()) return np.sum(objective) def 
dlik_df(self, y, f, extra_data=None): @@ -266,15 +261,6 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - #sigma = np.sqrt(self.sigma2) - #dlik_dsigma = ( - (1/sigma) + - #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) - #) - #dlik_dsigma = ( - 1 + - #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) - #) - #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1 - #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v)) dlik_dsigma = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2)) return dlik_dsigma @@ -286,10 +272,6 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - #sigma = np.sqrt(self.sigma2) - #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here? - #/ ((self.v*self.sigma2 + e**2)**2) - #) dlik_grad_dsigma = (-self.v*(self.v+1)*e)/((self.sigma2*self.v + e**2)**2) return dlik_grad_dsigma @@ -301,12 +283,6 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - #sigma = np.sqrt(self.sigma2) - #dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) / - #((e**2 + self.sigma2*self.v)**3) - #) - #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2)) - #/ ((e**2 + (self.sigma**2)*self.v)**3) ) dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2))) / (self.sigma2*self.v + (e**2))**3 ) @@ -344,8 +320,8 @@ class student_t(likelihood_function): #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now #need the 95 and 5 percentiles. 
#FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles - p_025 = mu - 2.*true_var - p_975 = mu + 2.*true_var + p_025 = mu - 2.*np.sqrt(true_var) + p_975 = mu + 2.*np.sqrt(true_var) return mu, np.nan*mu, p_025, p_975 diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 20337ef5..cd4b7dac 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -152,7 +152,7 @@ class GP(model): else: dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL)) - print "dL_dthetaK: {} dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL) + #print "dL_dthetaK: {} dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL) return np.hstack((dL_dthetaK, dL_dthetaL)) #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) From 57001851c46f34d075aa605ac1aa0ac0eb302c57 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 9 Jul 2013 20:05:03 +0100 Subject: [PATCH 64/71] Trying to debug kernel parameters learning (fails even when noise fixed) may be some instablility, seems like it can get it if it starts close --- GPy/examples/laplace_approximations.py | 103 ++++++++++++++++++++++--- GPy/likelihoods/Laplace.py | 18 +++-- GPy/models/GP.py | 12 ++- 3 files changed, 110 insertions(+), 23 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index b3048f5a..279bc597 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -1,6 +1,7 @@ import GPy import numpy as np import matplotlib.pyplot as plt +np.random.seed(1) def timing(): real_var = 0.1 @@ -86,17 +87,67 @@ def v_fail_test(): def student_t_f_check(): plt.close('all') - real_std = 0.1 - X = np.random.rand(100)[:, None] + X = np.linspace(0, 1, 50)[:, None] + real_std = 0.001 + noise = np.random.randn(*X.shape)*real_std + Y = np.sin(X*2*np.pi) + noise + deg_free = 1000 + + kernelgp 
= GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1]) + mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp) + mgp.ensure_default_constraints() + mgp.randomize() + mgp.optimize() + print mgp + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + kernelst = kernelgp.copy() + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=1e-5) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') + m = GPy.models.GP(X, stu_t_likelihood, kernelst) + m['rbf_v'] = mgp._get_params()[0] + m['rbf_l'] = mgp._get_params()[1] + 1 + m.ensure_default_constraints() + m.constrain_positive('t_no') + print m + plt.figure() + plt.subplot(511) + m.plot() + print m + plt.subplot(512) + m.optimize(max_f_eval=15) + m.plot() + print m + plt.subplot(513) + m.optimize(max_f_eval=15) + m.plot() + print m + plt.subplot(514) + m.optimize(max_f_eval=15) + m.plot() + print m + plt.subplot(515) + m.optimize() + m.plot() + print "final optimised student t" + print m + print "real GP" + print mgp + +def student_t_fix_optimise_check(): + plt.close('all') + real_var = 0.1 + real_std = np.sqrt(real_var) + X = np.random.rand(200)[:, None] noise = np.random.randn(*X.shape)*real_std Y = np.sin(X*2*np.pi) + noise X_full = X Y_full = np.sin(X_full) #Y = Y/Y.max() - deg_free = 10000 + deg_free = 1000 #GP - kernelgp = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1]) + kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1]) mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp) mgp.ensure_default_constraints() mgp.randomize() @@ -113,10 +164,12 @@ def student_t_f_check(): m = GPy.models.GP(X, stu_t_likelihood, kernelst) m.constrain_fixed('rbf_var', mgp._get_params()[0]) m.constrain_fixed('rbf_len', mgp._get_params()[1]) + m.constrain_positive('t_noise') + #m.ensure_default_constraints() m.update_likelihood_approximation() print "T std2 {} converted from original data, LL: {}".format(real_stu_t_std2, m.log_likelihood()) - plt.subplot(221) + 
plt.subplot(231) m.plot() plt.title('Student t original data noise') @@ -125,7 +178,7 @@ def student_t_f_check(): m['t_noise_std2'] = gp_noise m.update_likelihood_approximation() print "T std2 {} same as GP noise, LL: {}".format(gp_noise, m.log_likelihood()) - plt.subplot(222) + plt.subplot(232) m.plot() plt.title('Student t GP noise') @@ -134,29 +187,57 @@ def student_t_f_check(): m['t_noise_std2'] = real_stu_t_std2gp m.update_likelihood_approximation() print "T std2 {} converted to student t noise from GP noise, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.log_likelihood()) - plt.subplot(223) + plt.subplot(233) m.plot() plt.title('Student t GP noise converted') m.constrain_positive('t_noise_std2') m.randomize() m.update_likelihood_approximation() + plt.subplot(234) + m.plot() + plt.title('Student t fixed rbf') m.optimize() print "T std2 {} var {} after optimising, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.likelihood.likelihood_function.variance, m.log_likelihood()) - plt.subplot(224) + plt.subplot(235) m.plot() - plt.title('Student t optimised') + plt.title('Student t fixed rbf optimised') plt.figure(2) + mrbf = m.copy() + mrbf.unconstrain('') + mrbf.constrain_fixed('t_noise', m.likelihood.likelihood_function.sigma2) + gp_var = mgp._get_params()[0] + gp_len = mgp._get_params()[1] + mrbf.constrain_fixed('rbf_var', gp_var) + mrbf.constrain_positive('rbf_len') + mrbf.randomize() + print "Before optimize" + print mrbf + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + mrbf.checkgrad(verbose=1) + plt.subplot(121) + mrbf.plot() + plt.title('Student t fixed noise') + #mrbf.optimize() + print "After optimize" + print mrbf + plt.subplot(122) + mrbf.plot() + plt.title('Student t fixed noise optimized') + print mrbf + + plt.figure(3) print "GP noise {} after optimising, LL: {}".format(gp_noise, mgp.log_likelihood()) plt.suptitle('Gaussian likelihood optimised') mgp.plot() print "Real std: {}".format(real_std) print "Real variance 
{}".format(real_std**2) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT - return m + print "Len should be: {}".format(gp_len) + return mrbf def debug_student_t_noise_approx(): plot = False diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index c5894ed6..5343f5dc 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -89,7 +89,7 @@ class Laplace(likelihood): expl = 0.5*expl_a + 0.5*expl_b # Might need to be -? dL_dthetaK_exp = dK_dthetaK(expl, X) dL_dthetaK_imp = dK_dthetaK(impl, X) - #print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) + print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp return dL_dthetaK @@ -290,10 +290,12 @@ class Laplace(likelihood): :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation :returns: f_mode """ - if self.old_a is None: - old_a = np.zeros((self.N, 1)) - else: - old_a = self.old_a + old_a = np.zeros((self.N, 1)) + #old_a = None + #if self.old_a is None: + #old_a = np.zeros((self.N, 1)) + #else: + #old_a = self.old_a f = np.dot(self.K, old_a) self.f = f @@ -308,8 +310,6 @@ class Laplace(likelihood): step_size = 1 rs = 0 i = 0 - #if self.likelihood_function.sigma < 0.001: - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) if not self.likelihood_function.log_concave: @@ -371,8 +371,10 @@ class Laplace(likelihood): old_a = self.a #a i += 1 + self.old_a = old_a #print "Positive difference obj: ", np.float(difference) - print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size) + #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: 
{}".format(i, rs, difference, step_size) + print "Iterations: {}, Final_difference: {}".format(i, difference) #self.a = a #self.B, self.B_chol, self.W_12 = B, L, W_12 #self.Bi, _, _, B_det = pdinv(self.B) diff --git a/GPy/models/GP.py b/GPy/models/GP.py index cd4b7dac..0f56e21c 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -132,7 +132,11 @@ class GP(model): model for a new variable Y* = v_tilde/tau_tilde, with a covariance matrix K* = K + diag(1./tau_tilde) plus a normalization term. """ + if isinstance(self.likelihood, Laplace): + self.likelihood.fit_full(self.kern.K(self.X)) + self.likelihood._set_params(self.likelihood._get_params()) l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z + print "K_ldet: {} mft: {} Z: {}".format(self.K_logdet, self._model_fit_term(), self.likelihood.Z) return l def _log_likelihood_gradients(self): @@ -142,12 +146,12 @@ class GP(model): Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta """ dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) - #print "dL_dthetaK should be: ", dL_dthetaK + print "dL_dthetaK should be: ", dL_dthetaK if isinstance(self.likelihood, Laplace): - #self.likelihood.fit_full(self.kern.K(self.X)) - #self.likelihood._set_params(self.likelihood._get_params()) + self.likelihood.fit_full(self.kern.K(self.X)) + self.likelihood._set_params(self.likelihood._get_params()) dK_dthetaK = self.kern.dK_dtheta - dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X) + dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy()) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) else: dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) From aa9860859000530ba3297e72236c359f2a36a42b Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 29 Jul 2013 15:29:46 +0100 Subject: [PATCH 65/71] Started adding gaussian likelihood, changed round preloading old_a --- GPy/core/model.py | 6 + GPy/examples/laplace_approximations.py | 72 
++++++- GPy/likelihoods/Laplace.py | 173 ++++++++++------ GPy/likelihoods/likelihood_functions.py | 251 +++++++++++++----------- 4 files changed, 321 insertions(+), 181 deletions(-) diff --git a/GPy/core/model.py b/GPy/core/model.py index 94202396..e3a9bb68 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -244,6 +244,12 @@ class model(parameterised): LL_gradients = self._transform_gradients(self._log_likelihood_gradients()) prior_gradients = self._transform_gradients(self._log_prior_gradients()) obj_grads = -LL_gradients - prior_gradients + print self + print self._get_params() + print -obj_grads + self.plot() + if isinstance(self.likelihood, likelihoods.Laplace): + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return obj_f, obj_grads def optimize(self, optimizer=None, start=None, **kwargs): diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 279bc597..2b93122c 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -85,10 +85,60 @@ def v_fail_test(): import ipdb; ipdb.set_trace() ### XXX BREAKPOINT print(m) +def student_t_obj_plane(): + plt.close('all') + X = np.linspace(0, 1, 50)[:, None] + real_std = 0.002 + noise = np.random.randn(*X.shape)*real_std + Y = np.sin(X*2*np.pi) + noise + deg_free = 1000 + + kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1]) + mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp) + mgp.ensure_default_constraints() + mgp['noise'] = real_std**2 + print "Gaussian" + print mgp + + kernelst = kernelgp.copy() + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2)) + stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') + m = GPy.models.GP(X, stu_t_likelihood, kernelst) + m.ensure_default_constraints() + m.constrain_fixed('t_no', real_std**2) + vs = 10 + ls = 10 + objs_t = np.zeros((vs, ls)) + objs_g = np.zeros((vs, ls)) + rbf_vs = np.linspace(1e-6, 8, vs) 
+ rbf_ls = np.linspace(1e-2, 8, ls) + for v_id, rbf_v in enumerate(rbf_vs): + for l_id, rbf_l in enumerate(rbf_ls): + m['rbf_v'] = rbf_v + m['rbf_l'] = rbf_l + mgp['rbf_v'] = rbf_v + mgp['rbf_l'] = rbf_l + objs_t[v_id, l_id] = m.log_likelihood() + objs_g[v_id, l_id] = mgp.log_likelihood() + plt.figure() + plt.subplot(211) + plt.title('Student t') + plt.imshow(objs_t, interpolation='none') + plt.xlabel('variance') + plt.ylabel('lengthscale') + plt.subplot(212) + plt.title('Gaussian') + plt.imshow(objs_g, interpolation='none') + plt.xlabel('variance') + plt.ylabel('lengthscale') + plt.show() + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return objs_t + def student_t_f_check(): plt.close('all') X = np.linspace(0, 1, 50)[:, None] - real_std = 0.001 + real_std = 0.2 noise = np.random.randn(*X.shape)*real_std Y = np.sin(X*2*np.pi) + noise deg_free = 1000 @@ -98,17 +148,26 @@ def student_t_f_check(): mgp.ensure_default_constraints() mgp.randomize() mgp.optimize() + print "Gaussian" print mgp import ipdb; ipdb.set_trace() ### XXX BREAKPOINT kernelst = kernelgp.copy() - t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=1e-5) + #kernelst += GPy.kern.bias(X.shape[1]) + t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05) stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm') m = GPy.models.GP(X, stu_t_likelihood, kernelst) - m['rbf_v'] = mgp._get_params()[0] - m['rbf_l'] = mgp._get_params()[1] + 1 + #m['rbf_v'] = mgp._get_params()[0] + #m['rbf_l'] = mgp._get_params()[1] + 1 m.ensure_default_constraints() + #m.constrain_fixed('rbf_v', mgp._get_params()[0]) + #m.constrain_fixed('rbf_l', mgp._get_params()[1]) + #m.constrain_bounded('t_no', 2*real_std**2, 1e3) + #m.constrain_positive('bias') m.constrain_positive('t_no') + m.randomize() + m['t_no'] = 0.3 + m.likelihood.X = X print m plt.figure() plt.subplot(511) @@ -143,7 +202,8 @@ def student_t_fix_optimise_check(): Y = 
np.sin(X*2*np.pi) + noise X_full = X Y_full = np.sin(X_full) - #Y = Y/Y.max() + Y = Y/Y.max() + Y_full = Y_full/Y_full.max() deg_free = 1000 #GP @@ -219,7 +279,7 @@ def student_t_fix_optimise_check(): plt.subplot(121) mrbf.plot() plt.title('Student t fixed noise') - #mrbf.optimize() + mrbf.optimize() print "After optimize" print mrbf plt.subplot(122) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 5343f5dc..8b39f222 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -156,17 +156,23 @@ class Laplace(likelihood): Y_tilde = Wi*self.Ki_f + self.f_hat self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R + #self.Wi_K_i[self.Wi_K_i< 1e-6] = 1e-6 + self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K) self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data) self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde) - Z_tilde = (#+ self.NORMAL_CONST + self.aA = 0.5*self.ln_det_K_Wi__Bi + self.bB = - 0.5*self.f_Ki_f + self.cC = 0.5*self.y_Wi_Ki_i_y + Z_tilde = (+ 100*self.NORMAL_CONST + self.lik + 0.5*self.ln_det_K_Wi__Bi - 0.5*self.f_Ki_f + 0.5*self.y_Wi_Ki_i_y ) - #print "Ztilde: {}".format(Z_tilde) + print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC) + print self.likelihood_function._get_params() #Convert to float as its (1, 1) and Z must be a scalar self.Z = np.float64(Z_tilde) @@ -198,7 +204,7 @@ class Laplace(likelihood): self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data) if not self.likelihood_function.log_concave: - self.W[self.W < 0] = 1e-10 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + self.W[self.W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur #If the likelihood is non-log-concave. 
We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods @@ -280,7 +286,7 @@ class Laplace(likelihood): f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False) return f_hat[:, None] - def rasm_mode(self, K, MAX_ITER=40, MAX_RESTART=10): + def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10): """ Rasmussens numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -290,15 +296,19 @@ class Laplace(likelihood): :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation :returns: f_mode """ - old_a = np.zeros((self.N, 1)) - #old_a = None - #if self.old_a is None: - #old_a = np.zeros((self.N, 1)) - #else: - #old_a = self.old_a + self.old_before_s = self.likelihood_function._get_params() + print "before: ", self.old_before_s + #if self.old_before_s < 1e-5: + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + #old_a = np.zeros((self.N, 1)) + if self.old_a is None: + old_a = np.zeros((self.N, 1)) + f = np.dot(K, old_a) + else: + old_a = self.old_a.copy() + f = self.f_hat.copy() - f = np.dot(self.K, old_a) - self.f = f new_obj = -np.inf old_obj = np.inf @@ -306,18 +316,20 @@ class Laplace(likelihood): return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data) difference = np.inf - epsilon = 1e-10 + epsilon = 1e-4 step_size = 1 rs = 0 i = 0 - while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART: + + while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART: W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data) + #W = np.maximum(W, 0) if not self.likelihood_function.log_concave: - W[W < 0] = 1e-10 # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + W[W < 0] = 1e-6 # FIXME-HACK: This is a hack since GPy can't handle 
negative variances which can occur # If the likelihood is non-log-concave. We wan't to say that there is a negative variance # To cause the posterior to become less certain than the prior and likelihood, # This is a property only held by non-log-concave likelihoods - B, L, W_12 = self._compute_B_statistics(K, W) + B, L, W_12 = self._compute_B_statistics(K, W.copy()) W_f = W*f grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data) @@ -328,54 +340,105 @@ class Laplace(likelihood): full_step_a = b - W_12*solve_L da = full_step_a - old_a - f_old = f.copy() - - f_old = self.f.copy() - def inner_obj(step_size, old_a, da, K): - a = old_a + step_size*da - f = np.dot(K, a) - self.a = a # This is nasty, need to set something within an optimization though - self.f = f - return -obj(a, f) - - from functools import partial - i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K) - new_obj = sp.optimize.brent(i_o, tol=1e-6, maxiter=10) - - #update_passed = False - #while not update_passed: + #f_old = f.copy() + #def inner_obj(step_size, old_a, da, K): #a = old_a + step_size*da #f = np.dot(K, a) + #self.a = a.copy() # This is nasty, need to set something within an optimization though + #self.f = f.copy() + #return -obj(a, f) - #old_obj = new_obj - #new_obj = obj(a, f) - #difference = new_obj - old_obj - #print "difference: ",difference - #if difference < 0: - ##print grad - ##print "Objective function rose", np.float(difference) - ##If the objective function isn't rising, restart optimization - #step_size *= 0.8 - ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) - ##objective function isn't increasing, try reducing step size - ##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode - ##old_obj = tmp_old_obj - #old_obj = new_obj - #rs += 1 - #else: - #update_passed = True + #from functools import partial + #i_o = partial(inner_obj, old_a=old_a, da=da, K=K) + ##new_obj = 
sp.optimize.brent(i_o, tol=1e-4, maxiter=20) + #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun + #f = self.f.copy() + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT - f = self.f - difference = new_obj - old_obj - difference = np.abs(np.sum(f - f_old)) #+ abs(difference) - old_a = self.a #a + f_old = f.copy() + update_passed = False + while not update_passed: + a = old_a + step_size*da + f = np.dot(K, a) + + old_obj = new_obj + new_obj = obj(a, f) + difference = new_obj - old_obj + print "difference: ",difference + if difference < 0: + #print "Objective function rose", np.float(difference) + #If the objective function isn't rising, restart optimization + step_size *= 0.8 + #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size) + #objective function isn't increasing, try reducing step size + f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode + old_obj = new_obj + rs += 1 + else: + update_passed = True + + #difference = abs(new_obj - old_obj) + #old_obj = new_obj.copy() + difference = np.abs(np.sum(f - f_old)) + #old_a = self.a.copy() #a + old_a = a.copy() i += 1 + #print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a)) - self.old_a = old_a + self.old_a = old_a.copy() #print "Positive difference obj: ", np.float(difference) #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size) print "Iterations: {}, Final_difference: {}".format(i, difference) - #self.a = a + if difference > 1e-4: + print "FAIL FAIL FAIL FAIL FAIL FAIL" + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + if hasattr(self, 'X'): + import pylab as pb + pb.figure() + pb.subplot(311) + pb.title('old f_hat') + pb.plot(self.X, self.f_hat) + pb.subplot(312) + pb.title('old ff') + pb.plot(self.X, self.old_ff) + pb.subplot(313) + pb.title('new f_hat') + 
pb.plot(self.X, f) + + pb.figure() + pb.subplot(121) + pb.title('old K') + pb.imshow(np.diagflat(self.old_K), interpolation='none') + pb.colorbar() + pb.subplot(122) + pb.title('new K') + pb.imshow(np.diagflat(K), interpolation='none') + pb.colorbar() + + pb.figure() + pb.subplot(121) + pb.title('old W') + pb.imshow(np.diagflat(self.old_W), interpolation='none') + pb.colorbar() + pb.subplot(122) + pb.title('new W') + pb.imshow(np.diagflat(W), interpolation='none') + pb.colorbar() + + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + pb.close('all') + + #FIXME: DELETE THESE + self.old_W = W.copy() + self.old_grad = grad.copy() + self.old_B = B.copy() + self.old_W_12 = W_12.copy() + self.old_ff = f.copy() + self.old_K = self.K.copy() + self.old_s = self.likelihood_function._get_params() + print "after: ", self.old_s + #print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a)) + self.a = a #self.B, self.B_chol, self.W_12 = B, L, W_12 #self.Bi, _, _, B_det = pdinv(self.B) return f diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 595fa63c..62e09a1a 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -193,11 +193,16 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f + #A = gammaln((self.v + 1) * 0.5) + #B = - gammaln(self.v * 0.5) + #C = - 0.5*np.log(self.sigma2 * self.v * np.pi) + #D = + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v)) objective = (+ gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) - 0.5*np.log(self.sigma2 * self.v * np.pi) + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v)) ) + #print "C: {} D: {} obj: {}".format(C, np.sum(D), objective.sum()) return np.sum(objective) def dlik_df(self, y, f, extra_data=None): @@ -459,147 +464,153 @@ class weibull_survival(likelihood_function): hess = (y**self.shape)*np.exp(f) return np.squeeze(hess) 
-#class gaussian(likelihood_function): - #""" - #Gaussian likelihood - this is a test class for approximation schemes - #""" - #def __init__(self, variance): - #self._set_params(np.asarray(variance)) +class gaussian(likelihood_function): + """ + Gaussian likelihood - this is a test class for approximation schemes + """ + def __init__(self, variance): + self._set_params(np.asarray(variance)) - #def _get_params(self): - #return np.asarray(self.sigma2) + def _get_params(self): + return np.asarray(self._variance) - #def _get_param_names(self): - #return ["noise_variance"] + def _get_param_names(self): + return ["noise_variance"] - #def _set_params(self, x): - #self.variance = float(x) + def _set_params(self, x): + self._variance = float(x) + self.covariance_matrix = np.eye(self.N) * self._variance + self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG - #def link_function(self, y, f, extra_data=None): - #"""link_function $\ln p(y|f)$ - #$$\ln p(y_{i}|f_{i}) = \ln $$ + def link_function(self, y, f, extra_data=None): + """link_function $\ln p(y|f)$ + $$\ln p(y_{i}|f_{i}) = \ln $$ - #:y: data - #:f: latent variables f - #:extra_data: extra_data which is not used in student t distribution - #:returns: float(likelihood evaluated for this point) + :y: data + :f: latent variables f + :extra_data: extra_data which is not used in student t distribution + :returns: float(likelihood evaluated for this point) - #""" - #assert y.shape == f.shape - #e = y - f - #objective = -0.5*self.D* - #return np.sum(objective) + """ + assert y.shape == f.shape + e = y - f + eeT = np.dot(e, e.T) + objective = (- 0.5*self.D*np.log(2*np.pi) + - 0.5*self.ln_K + - 0.5*np.sum(np.multiply(self.Ki, eeT)) + ) + return np.sum(objective) - #def dlik_df(self, y, f, extra_data=None): - #""" - #Gradient of the link function at y, given f w.r.t f + def dlik_df(self, y, f, extra_data=None): + """ + Gradient of the link function at y, given f w.r.t f - #$$\frac{dp(y_{i}|f_{i})}{df} = 
\frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$ + $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$ - #:y: data - #:f: latent variables f - #:extra_data: extra_data which is not used in student t distribution - #:returns: gradient of likelihood evaluated at points + :y: data + :f: latent variables f + :extra_data: extra_data which is not used in student t distribution + :returns: gradient of likelihood evaluated at points - #""" - #assert y.shape == f.shape - #e = y - f - #grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2)) - #return grad + """ + assert y.shape == f.shape + e = y - f + grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2)) + return grad - #def d2lik_d2f(self, y, f, extra_data=None): - #""" - #Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j - #i.e. second derivative link_function at y given f f_j w.r.t f and f_j + def d2lik_d2f(self, y, f, extra_data=None): + """ + Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j + i.e. 
second derivative link_function at y given f f_j w.r.t f and f_j - #Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases - #(the distribution for y_{i} depends only on f_{i} not on f_{j!=i} + Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases + (the distribution for y_{i} depends only on f_{i} not on f_{j!=i} - #$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$ + $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$ - #:y: data - #:f: latent variables f - #:extra_data: extra_data which is not used in student t distribution - #:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) - #""" - #assert y.shape == f.shape - #e = y - f - #hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2) - #return hess + :y: data + :f: latent variables f + :extra_data: extra_data which is not used in student t distribution + :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) + """ + assert y.shape == f.shape + e = y - f + hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2) + return hess - #def d3lik_d3f(self, y, f, extra_data=None): - #""" - #Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j + def d3lik_d3f(self, y, f, extra_data=None): + """ + Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j - #$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$ - #""" - #assert y.shape == f.shape - #e = y - f - #d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) / - #((e**2 + 
self.sigma2*self.v)**3) - #) - #return d3lik_d3f + $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$ + """ + assert y.shape == f.shape + e = y - f + d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) / + ((e**2 + self.sigma2*self.v)**3) + ) + return d3lik_d3f - #def lik_dstd(self, y, f, extra_data=None): - #""" - #Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation) + def lik_dstd(self, y, f, extra_data=None): + """ + Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation) - #Terms relavent to derivatives wrt sigma are: - #-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2)) + Terms relavent to derivatives wrt sigma are: + -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2)) - #$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$ - #""" - #assert y.shape == f.shape - #e = y - f - #sigma = np.sqrt(self.sigma2) - ##dlik_dsigma = ( - (1/sigma) + - ##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) - ##) - ##dlik_dsigma = ( - 1 + - ##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) - ##) - ##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1 - #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v)) - #return dlik_dsigma + $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$ + """ + assert y.shape == f.shape + e = y - f + sigma = np.sqrt(self.sigma2) + #dlik_dsigma = ( - (1/sigma) + + #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) + #) + #dlik_dsigma = ( - 1 + + #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / 
(self.sigma2*self.v)) ) ) + #) + #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1 + dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v)) + return dlik_dsigma - #def dlik_df_dstd(self, y, f, extra_data=None): - #""" - #Gradient of the dlik_df w.r.t sigma parameter (standard deviation) + def dlik_df_dstd(self, y, f, extra_data=None): + """ + Gradient of the dlik_df w.r.t sigma parameter (standard deviation) - #$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$ - #""" - #assert y.shape == f.shape - #e = y - f - #sigma = np.sqrt(self.sigma2) - #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here? - #/ ((self.v*self.sigma2 + e**2)**2) - #) - #return dlik_grad_dsigma + $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$ + """ + assert y.shape == f.shape + e = y - f + sigma = np.sqrt(self.sigma2) + dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here? 
+ / ((self.v*self.sigma2 + e**2)**2) + ) + return dlik_grad_dsigma - #def d2lik_d2f_dstd(self, y, f, extra_data=None): - #""" - #Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation) + def d2lik_d2f_dstd(self, y, f, extra_data=None): + """ + Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation) - #$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ - #""" - #assert y.shape == f.shape - #e = y - f - #sigma = np.sqrt(self.sigma2) - #dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) / - #((e**2 + self.sigma2*self.v)**3) - #) - ##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2)) - ##/ ((e**2 + (self.sigma**2)*self.v)**3) ) - #return dlik_hess_dsigma + $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ + """ + assert y.shape == f.shape + e = y - f + sigma = np.sqrt(self.sigma2) + dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) / + ((e**2 + self.sigma2*self.v)**3) + ) + #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2)) + #/ ((e**2 + (self.sigma**2)*self.v)**3) ) + return dlik_hess_dsigma - #def _gradients(self, y, f, extra_data=None): - ##must be listed in same order as 'get_param_names' - #derivs = ([self.lik_dstd(y, f, extra_data=extra_data)], - #[self.dlik_df_dstd(y, f, extra_data=extra_data)], - #[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)] - #) # lists as we might learn many parameters - ## ensure we have gradients for every parameter we want to optimize - #assert len(derivs[0]) == len(self._get_param_names()) - #assert len(derivs[1]) == len(self._get_param_names()) - #assert len(derivs[2]) == len(self._get_param_names()) - #return derivs + def _gradients(self, y, f, 
extra_data=None): + #must be listed in same order as 'get_param_names' + derivs = ([self.lik_dstd(y, f, extra_data=extra_data)], + [self.dlik_df_dstd(y, f, extra_data=extra_data)], + [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)] + ) # lists as we might learn many parameters + # ensure we have gradients for every parameter we want to optimize + assert len(derivs[0]) == len(self._get_param_names()) + assert len(derivs[1]) == len(self._get_param_names()) + assert len(derivs[2]) == len(self._get_param_names()) + return derivs From fdb7b99e0bd8a740dd898317aab5cd506b97e34e Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 29 Jul 2013 17:21:52 +0100 Subject: [PATCH 66/71] Got rid of some overdoing the approximation --- GPy/likelihoods/Laplace.py | 2 +- GPy/models/GP.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index 8b39f222..f86c47b6 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -165,7 +165,7 @@ class Laplace(likelihood): self.aA = 0.5*self.ln_det_K_Wi__Bi self.bB = - 0.5*self.f_Ki_f self.cC = 0.5*self.y_Wi_Ki_i_y - Z_tilde = (+ 100*self.NORMAL_CONST + Z_tilde = (#+ 100*self.NORMAL_CONST + self.lik + 0.5*self.ln_det_K_Wi__Bi - 0.5*self.f_Ki_f diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 0f56e21c..77620488 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -132,9 +132,9 @@ class GP(model): model for a new variable Y* = v_tilde/tau_tilde, with a covariance matrix K* = K + diag(1./tau_tilde) plus a normalization term. 
""" - if isinstance(self.likelihood, Laplace): - self.likelihood.fit_full(self.kern.K(self.X)) - self.likelihood._set_params(self.likelihood._get_params()) + #if isinstance(self.likelihood, Laplace): + #self.likelihood.fit_full(self.kern.K(self.X)) + #self.likelihood._set_params(self.likelihood._get_params()) l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z print "K_ldet: {} mft: {} Z: {}".format(self.K_logdet, self._model_fit_term(), self.likelihood.Z) return l @@ -148,8 +148,8 @@ class GP(model): dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X) print "dL_dthetaK should be: ", dL_dthetaK if isinstance(self.likelihood, Laplace): - self.likelihood.fit_full(self.kern.K(self.X)) - self.likelihood._set_params(self.likelihood._get_params()) + #self.likelihood.fit_full(self.kern.K(self.X)) + #self.likelihood._set_params(self.likelihood._get_params()) dK_dthetaK = self.kern.dK_dtheta dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy()) dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK)) From 9364efc755405fdb3b424f4e3ffc01e68694b31e Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 30 Jul 2013 16:11:03 +0100 Subject: [PATCH 67/71] Started adding gaussian sanity checker --- GPy/examples/laplace_approximations.py | 10 ++-- GPy/likelihoods/Laplace.py | 80 +++++++++++++------------ GPy/likelihoods/likelihood_functions.py | 58 +++++------------- 3 files changed, 60 insertions(+), 88 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 2b93122c..e8b6419f 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -168,23 +168,23 @@ def student_t_f_check(): m.randomize() m['t_no'] = 0.3 m.likelihood.X = X - print m + #print m plt.figure() plt.subplot(511) m.plot() - print m + #print m plt.subplot(512) m.optimize(max_f_eval=15) m.plot() - print m + #print m plt.subplot(513) m.optimize(max_f_eval=15) m.plot() 
- print m + #print m plt.subplot(514) m.optimize(max_f_eval=15) m.plot() - print m + #print m plt.subplot(515) m.optimize() m.plot() diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index f86c47b6..aeda17da 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -89,7 +89,8 @@ class Laplace(likelihood): expl = 0.5*expl_a + 0.5*expl_b # Might need to be -? dL_dthetaK_exp = dK_dthetaK(expl, X) dL_dthetaK_imp = dK_dthetaK(impl, X) - print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) + #print "dL_dthetaK_exp: {} dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp) + #print "expl_a: {}, {} expl_b: {}, {}".format(np.mean(expl_a), np.std(expl_a), np.mean(expl_b), np.std(expl_b)) dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp return dL_dthetaK @@ -165,8 +166,7 @@ class Laplace(likelihood): self.aA = 0.5*self.ln_det_K_Wi__Bi self.bB = - 0.5*self.f_Ki_f self.cC = 0.5*self.y_Wi_Ki_i_y - Z_tilde = (#+ 100*self.NORMAL_CONST - + self.lik + Z_tilde = (+ self.lik + 0.5*self.ln_det_K_Wi__Bi - 0.5*self.f_Ki_f + 0.5*self.y_Wi_Ki_i_y @@ -379,7 +379,8 @@ class Laplace(likelihood): #difference = abs(new_obj - old_obj) #old_obj = new_obj.copy() - difference = np.abs(np.sum(f - f_old)) + #difference = np.abs(np.sum(f - f_old)) + difference = np.abs(np.sum(a - old_a)) #old_a = self.a.copy() #a old_a = a.copy() i += 1 @@ -391,42 +392,43 @@ class Laplace(likelihood): print "Iterations: {}, Final_difference: {}".format(i, difference) if difference > 1e-4: print "FAIL FAIL FAIL FAIL FAIL FAIL" - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT - if hasattr(self, 'X'): - import pylab as pb - pb.figure() - pb.subplot(311) - pb.title('old f_hat') - pb.plot(self.X, self.f_hat) - pb.subplot(312) - pb.title('old ff') - pb.plot(self.X, self.old_ff) - pb.subplot(313) - pb.title('new f_hat') - pb.plot(self.X, f) - - pb.figure() - pb.subplot(121) - pb.title('old K') - pb.imshow(np.diagflat(self.old_K), 
interpolation='none') - pb.colorbar() - pb.subplot(122) - pb.title('new K') - pb.imshow(np.diagflat(K), interpolation='none') - pb.colorbar() - - pb.figure() - pb.subplot(121) - pb.title('old W') - pb.imshow(np.diagflat(self.old_W), interpolation='none') - pb.colorbar() - pb.subplot(122) - pb.title('new W') - pb.imshow(np.diagflat(W), interpolation='none') - pb.colorbar() - + if False: import ipdb; ipdb.set_trace() ### XXX BREAKPOINT - pb.close('all') + if hasattr(self, 'X'): + import pylab as pb + pb.figure() + pb.subplot(311) + pb.title('old f_hat') + pb.plot(self.X, self.f_hat) + pb.subplot(312) + pb.title('old ff') + pb.plot(self.X, self.old_ff) + pb.subplot(313) + pb.title('new f_hat') + pb.plot(self.X, f) + + pb.figure() + pb.subplot(121) + pb.title('old K') + pb.imshow(np.diagflat(self.old_K), interpolation='none') + pb.colorbar() + pb.subplot(122) + pb.title('new K') + pb.imshow(np.diagflat(K), interpolation='none') + pb.colorbar() + + pb.figure() + pb.subplot(121) + pb.title('old W') + pb.imshow(np.diagflat(self.old_W), interpolation='none') + pb.colorbar() + pb.subplot(122) + pb.title('new W') + pb.imshow(np.diagflat(W), interpolation='none') + pb.colorbar() + + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + pb.close('all') #FIXME: DELETE THESE self.old_W = W.copy() diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 62e09a1a..42af9c8d 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -239,7 +239,7 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2) + hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2) return hess def d3lik_d3f(self, y, f, extra_data=None): @@ -277,7 +277,7 @@ class student_t(likelihood_function): """ assert y.shape == f.shape e = y - f - dlik_grad_dsigma = 
(-self.v*(self.v+1)*e)/((self.sigma2*self.v + e**2)**2) + dlik_grad_dsigma = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2) return dlik_grad_dsigma def d2lik_d2f_dstd(self, y, f, extra_data=None): @@ -289,7 +289,7 @@ class student_t(likelihood_function): assert y.shape == f.shape e = y - f dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2))) - / (self.sigma2*self.v + (e**2))**3 + / ((self.sigma2*self.v + (e**2))**3) ) return dlik_hess_dsigma @@ -479,7 +479,8 @@ class gaussian(likelihood_function): def _set_params(self, x): self._variance = float(x) - self.covariance_matrix = np.eye(self.N) * self._variance + self.I = np.eye(self.N) + self.covariance_matrix = self.I * self._variance self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG def link_function(self, y, f, extra_data=None): @@ -505,8 +506,6 @@ class gaussian(likelihood_function): """ Gradient of the link function at y, given f w.r.t f - $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$ - :y: data :f: latent variables f :extra_data: extra_data which is not used in student t distribution @@ -514,8 +513,8 @@ class gaussian(likelihood_function): """ assert y.shape == f.shape - e = y - f - grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2)) + s2_i = (1.0/self._variance)*self.I + grad = np.dot(s2_i, y) - 0.5*np.dot(s2_i, f) return grad def d2lik_d2f(self, y, f, extra_data=None): @@ -526,16 +525,14 @@ class gaussian(likelihood_function): Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases (the distribution for y_{i} depends only on f_{i} not on f_{j!=i} - $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$ - :y: data :f: latent variables f :extra_data: extra_data which is not used in student t distribution :returns: array which is diagonal of covariance matrix (second derivative of 
likelihood evaluated at points) """ assert y.shape == f.shape - e = y - f - hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2) + s2_i = (1.0/self._variance)*self.I + hess = np.diagonal(-0.5*s2_i) return hess def d3lik_d3f(self, y, f, extra_data=None): @@ -545,46 +542,25 @@ class gaussian(likelihood_function): $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$ """ assert y.shape == f.shape - e = y - f - d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) / - ((e**2 + self.sigma2*self.v)**3) - ) + d3lik_d3f = np.diagonal(0*self.I) return d3lik_d3f def lik_dstd(self, y, f, extra_data=None): """ Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation) - - Terms relavent to derivatives wrt sigma are: - -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2)) - - $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$ """ assert y.shape == f.shape e = y - f - sigma = np.sqrt(self.sigma2) - #dlik_dsigma = ( - (1/sigma) + - #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) - #) - #dlik_dsigma = ( - 1 + - #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) ) - #) - #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1 - dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v)) + dlik_dsigma = -0.5*self.N*self._variance - 0.5*np.dot(e.T, e) return dlik_dsigma def dlik_df_dstd(self, y, f, extra_data=None): """ Gradient of the dlik_df w.r.t sigma parameter (standard deviation) - - $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$ """ assert y.shape == f.shape - e = y - f - sigma = np.sqrt(self.sigma2) - dlik_grad_dsigma 
= ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here? - / ((self.v*self.sigma2 + e**2)**2) - ) + s_4 = 1.0/(self._variance**2) + dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f)) return dlik_grad_dsigma def d2lik_d2f_dstd(self, y, f, extra_data=None): @@ -594,13 +570,7 @@ class gaussian(likelihood_function): $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ """ assert y.shape == f.shape - e = y - f - sigma = np.sqrt(self.sigma2) - dlik_hess_dsigma = ( (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) / - ((e**2 + self.sigma2*self.v)**3) - ) - #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2)) - #/ ((e**2 + (self.sigma**2)*self.v)**3) ) + dlik_hess_dsigma = 1.0/(2*(self._variance**2)) return dlik_hess_dsigma def _gradients(self, y, f, extra_data=None): From 1314868ea8cf4c81d0c76f90dd4a8b11a123c427 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 16 Aug 2013 11:16:47 +0100 Subject: [PATCH 68/71] Added gaussian checker and gaussian likelihood, not checkgrading yet --- GPy/examples/laplace_approximations.py | 65 +++++++++++++++++++------ GPy/likelihoods/likelihood_functions.py | 38 ++++++++++----- 2 files changed, 77 insertions(+), 26 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index e8b6419f..02b38a79 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ -170,28 +170,18 @@ def student_t_f_check(): m.likelihood.X = X #print m plt.figure() - plt.subplot(511) + plt.subplot(211) m.plot() - #print m - plt.subplot(512) - m.optimize(max_f_eval=15) - m.plot() - #print m - plt.subplot(513) - m.optimize(max_f_eval=15) - m.plot() - #print m - plt.subplot(514) - m.optimize(max_f_eval=15) - m.plot() - #print m - plt.subplot(515) + print "OPTIMIZED ONCE" + plt.subplot(212) 
m.optimize() m.plot() print "final optimised student t" print m print "real GP" print mgp + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return m def student_t_fix_optimise_check(): plt.close('all') @@ -602,3 +592,48 @@ def noisy_laplace_approx(): print m #with a student t distribution, since it has heavy tails it should work well + +def gaussian_f_check(): + plt.close('all') + X = np.linspace(0, 1, 50)[:, None] + real_std = 0.2 + noise = np.random.randn(*X.shape)*real_std + Y = np.sin(X*2*np.pi) + noise + + kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1]) + mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp) + mgp.ensure_default_constraints() + mgp.randomize() + mgp.optimize() + print "Gaussian" + print mgp + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + kernelg = kernelgp.copy() + #kernelst += GPy.kern.bias(X.shape[1]) + N, D = X.shape + g_distribution = GPy.likelihoods.likelihood_functions.gaussian(variance=0.1, N=N, D=D) + g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm') + m = GPy.models.GP(X, g_likelihood, kernelg) + #m['rbf_v'] = mgp._get_params()[0] + #m['rbf_l'] = mgp._get_params()[1] + 1 + m.ensure_default_constraints() + #m.constrain_fixed('rbf_v', mgp._get_params()[0]) + #m.constrain_fixed('rbf_l', mgp._get_params()[1]) + #m.constrain_bounded('t_no', 2*real_std**2, 1e3) + #m.constrain_positive('bias') + m.constrain_positive('noise_var') + m.randomize() + m['noise_variance'] = 0.1 + m.likelihood.X = X + plt.figure() + plt.subplot(211) + m.plot() + plt.subplot(212) + m.optimize() + m.plot() + print "final optimised student t" + print m + print "real GP" + print mgp + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 42af9c8d..81d93f6b 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -9,7 +9,7 @@ from ..util.plot import gpplot from scipy.special 
import gammaln, gamma from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf -class likelihood_function: +class likelihood_function(object): """ Likelihood class for doing Expectation propagation :param Y: observed output (Nx1 numpy.darray) @@ -159,7 +159,7 @@ class student_t(likelihood_function): d2ln p(yi|fi)_d2fifj """ def __init__(self, deg_free, sigma2=2): - #super(student_t, self).__init__() + super(student_t, self).__init__() self.v = deg_free self.sigma2 = sigma2 self.log_concave = False @@ -468,9 +468,16 @@ class gaussian(likelihood_function): """ Gaussian likelihood - this is a test class for approximation schemes """ - def __init__(self, variance): + def __init__(self, variance, D, N): + super(gaussian, self).__init__() + self.D = D + self.N = N self._set_params(np.asarray(variance)) + #Don't support normalizing yet + self._bias = np.zeros((1, self.D)) + self._scale = np.ones((1, self.D)) + def _get_params(self): return np.asarray(self._variance) @@ -481,7 +488,8 @@ class gaussian(likelihood_function): self._variance = float(x) self.I = np.eye(self.N) self.covariance_matrix = self.I * self._variance - self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG + self.Ki = self.I*(1.0 / self._variance) + self.ln_K = np.trace(self.covariance_matrix) def link_function(self, y, f, extra_data=None): """link_function $\ln p(y|f)$ @@ -498,7 +506,8 @@ class gaussian(likelihood_function): eeT = np.dot(e, e.T) objective = (- 0.5*self.D*np.log(2*np.pi) - 0.5*self.ln_K - - 0.5*np.sum(np.multiply(self.Ki, eeT)) + #- 0.5*np.sum(np.multiply(self.Ki, eeT)) + - 0.5*np.dot(np.dot(e.T, self.Ki), e) ) return np.sum(objective) @@ -514,7 +523,7 @@ class gaussian(likelihood_function): """ assert y.shape == f.shape s2_i = (1.0/self._variance)*self.I - grad = np.dot(s2_i, y) - 0.5*np.dot(s2_i, f) + grad = np.dot(s2_i, y) - np.dot(s2_i, f) return grad def d2lik_d2f(self, y, f, extra_data=None): @@ -532,7 +541,7 @@ class gaussian(likelihood_function): 
""" assert y.shape == f.shape s2_i = (1.0/self._variance)*self.I - hess = np.diagonal(-0.5*s2_i) + hess = np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS? return hess def d3lik_d3f(self, y, f, extra_data=None): @@ -542,7 +551,7 @@ class gaussian(likelihood_function): $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$ """ assert y.shape == f.shape - d3lik_d3f = np.diagonal(0*self.I) + d3lik_d3f = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS? return d3lik_d3f def lik_dstd(self, y, f, extra_data=None): @@ -551,7 +560,7 @@ class gaussian(likelihood_function): """ assert y.shape == f.shape e = y - f - dlik_dsigma = -0.5*self.N*self._variance - 0.5*np.dot(e.T, e) + dlik_dsigma = -0.5*self.D/self._variance - 0.5*np.trace(np.dot(e.T, np.dot(self.I, e))) return dlik_dsigma def dlik_df_dstd(self, y, f, extra_data=None): @@ -560,7 +569,7 @@ class gaussian(likelihood_function): """ assert y.shape == f.shape s_4 = 1.0/(self._variance**2) - dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f)) + dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + np.dot(s_4, np.dot(self.I, f)) return dlik_grad_dsigma def d2lik_d2f_dstd(self, y, f, extra_data=None): @@ -570,7 +579,7 @@ class gaussian(likelihood_function): $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ """ assert y.shape == f.shape - dlik_hess_dsigma = 1.0/(2*(self._variance**2)) + dlik_hess_dsigma = np.diag(1.0/(self._variance**2)*self.I)[:, None] return dlik_hess_dsigma def _gradients(self, y, f, extra_data=None): @@ -584,3 +593,10 @@ class gaussian(likelihood_function): assert len(derivs[1]) == len(self._get_param_names()) assert len(derivs[2]) == len(self._get_param_names()) return derivs + + def predictive_values(self, mu, var): + mean = mu * 
self._scale + self._bias + true_var = (var + self._variance) * self._scale ** 2 + _5pc = mean - 2.*np.sqrt(true_var) + _95pc = mean + 2.*np.sqrt(true_var) + return mean, true_var, _5pc, _95pc From 000491b25da515a595c25fbc57e3dcbc3ee4e3f4 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 28 Aug 2013 13:26:15 +0100 Subject: [PATCH 69/71] Gaussian likelihood errors, still not working --- GPy/likelihoods/likelihood_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 81d93f6b..25f770b5 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -560,7 +560,7 @@ class gaussian(likelihood_function): """ assert y.shape == f.shape e = y - f - dlik_dsigma = -0.5*self.D/self._variance - 0.5*np.trace(np.dot(e.T, np.dot(self.I, e))) + dlik_dsigma = -0.5*self.N/self._variance - 0.5*np.trace(np.dot(e.T, np.dot(self.I, e))) return dlik_dsigma def dlik_df_dstd(self, y, f, extra_data=None): @@ -579,7 +579,7 @@ class gaussian(likelihood_function): $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ """ assert y.shape == f.shape - dlik_hess_dsigma = np.diag(1.0/(self._variance**2)*self.I)[:, None] + dlik_hess_dsigma = np.diag((1.0/(self._variance**2))*self.I)[:, None] return dlik_hess_dsigma def _gradients(self, y, f, extra_data=None): From 54954c63f83d566a383bd0d2b14dadaa66ce363e Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 29 Aug 2013 13:47:56 +0100 Subject: [PATCH 70/71] A few typos --- GPy/examples/laplace_approximations.py | 2 +- GPy/likelihoods/Laplace.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py index 02b38a79..8be08a8f 100644 --- a/GPy/examples/laplace_approximations.py +++ b/GPy/examples/laplace_approximations.py @@ 
-632,7 +632,7 @@ def gaussian_f_check(): plt.subplot(212) m.optimize() m.plot() - print "final optimised student t" + print "final optimised gaussian" print m print "real GP" print mgp diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py index aeda17da..58304c23 100644 --- a/GPy/likelihoods/Laplace.py +++ b/GPy/likelihoods/Laplace.py @@ -105,8 +105,15 @@ class Laplace(likelihood): dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter for thetaL_i in range(num_params): #Explicit + #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) + #a = 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) + #d = dlik_hess_dthetaL[thetaL_i] + #e = pdinv(pdinv(self.K)[0] + np.diagflat(self.W))[0] + #b = 0.5*np.dot(np.diag(e).T, d) + #g = 0.5*(np.diag(self.K) - np.sum(cho_solve((self.B_chol, True), np.dot(np.diagflat(self.W_12),self.K))**2, 1)) + #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - np.dot(g.T, dlik_hess_dthetaL[thetaL_i]) dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i]) - #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(mdot(self.Bi, self.K, dlik_hess_dthetaL[thetaL_i])) + #Implicit df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i]) dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL) From f943cf9ddb9db80556ff7873108d22ac48113c2d Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 9 Sep 2013 11:54:32 +0100 Subject: [PATCH 71/71] Changed the gradients (perhaps for the worse) --- GPy/likelihoods/likelihood_functions.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 25f770b5..72d2ff82 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -523,7 +523,7 @@ class gaussian(likelihood_function): 
""" assert y.shape == f.shape s2_i = (1.0/self._variance)*self.I - grad = np.dot(s2_i, y) - np.dot(s2_i, f) + grad = np.dot(s2_i, y) - 0.5*np.dot(s2_i, f) return grad def d2lik_d2f(self, y, f, extra_data=None): @@ -541,7 +541,7 @@ class gaussian(likelihood_function): """ assert y.shape == f.shape s2_i = (1.0/self._variance)*self.I - hess = np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS? + hess = 0.5*np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS? return hess def d3lik_d3f(self, y, f, extra_data=None): @@ -560,7 +560,8 @@ class gaussian(likelihood_function): """ assert y.shape == f.shape e = y - f - dlik_dsigma = -0.5*self.N/self._variance - 0.5*np.trace(np.dot(e.T, np.dot(self.I, e))) + s_4 = 1.0/(self._variance**2) + dlik_dsigma = -0.5*self.N*1/self._variance + 0.5*s_4*np.trace(np.dot(e.T, np.dot(self.I, e))) return dlik_dsigma def dlik_df_dstd(self, y, f, extra_data=None): @@ -569,7 +570,7 @@ class gaussian(likelihood_function): """ assert y.shape == f.shape s_4 = 1.0/(self._variance**2) - dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + np.dot(s_4, np.dot(self.I, f)) + dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f)) return dlik_grad_dsigma def d2lik_d2f_dstd(self, y, f, extra_data=None): @@ -579,7 +580,7 @@ class gaussian(likelihood_function): $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$ """ assert y.shape == f.shape - dlik_hess_dsigma = np.diag((1.0/(self._variance**2))*self.I)[:, None] + dlik_hess_dsigma = 0.5*np.diag((1.0/(self._variance**2))*self.I)[:, None] return dlik_hess_dsigma def _gradients(self, y, f, extra_data=None):