Still working on rasmussen, link function needs vectorizing I think

This commit is contained in:
Alan Saul 2013-04-02 20:00:31 +01:00
parent afa5b1f956
commit 0312f319ad
3 changed files with 154 additions and 54 deletions

View file

@ -16,6 +16,9 @@ def student_t_approx():
Y = np.sin(X) + np.random.randn(*X.shape)*real_var
Yc = Y.copy()
X_full = np.linspace(0.0, 10.0, 500)[:, None]
Y_full = np.sin(X_full)
#Y = Y/Y.max()
Yc[10] += 100
@ -25,7 +28,7 @@ def student_t_approx():
#Yc = Yc/Yc.max()
#Add student t random noise to datapoints
deg_free = 20 #100000.5
deg_free = 10
real_sd = np.sqrt(real_var)
#t_rv = t(deg_free, loc=0, scale=real_var)
#noise = t_rvrvs(size=Y.shape)
@ -47,6 +50,8 @@ def student_t_approx():
kernel2 = kernel1.copy()
kernel3 = kernel1.copy()
kernel4 = kernel1.copy()
kernel5 = kernel1.copy()
kernel6 = kernel1.copy()
print "Clean Gaussian"
#A GP should completely break down due to the points as they get a lot of weight
@ -58,6 +63,7 @@ def student_t_approx():
# plot
plt.subplot(211)
m.plot()
plt.plot(X_full, Y_full)
print m
#Corrupt
@ -67,40 +73,64 @@ def student_t_approx():
m.optimize()
plt.subplot(212)
m.plot()
plt.plot(X_full, Y_full)
print m
plt.figure(2)
plt.suptitle('Student-t likelihood')
edited_real_sd = real_sd
# Likelihood object
print "Clean student t, ncg"
t_distribution = student_t(deg_free, sigma=edited_real_sd)
stu_t_likelihood = Laplace(Y, t_distribution)
print "Clean student t"
stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
m = GPy.models.GP(X, stu_t_likelihood, kernel3)
m.ensure_default_constraints()
m.update_likelihood_approximation()
# optimize
m.optimize()
print(m)
# plot
plt.subplot(211)
plt.subplot(221)
m.plot()
plt.ylim(-2.5,2.5)
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
print "Corrupt student t"
print "Corrupt student t, ncg"
t_distribution = student_t(deg_free, sigma=edited_real_sd)
corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
m.ensure_default_constraints()
m.update_likelihood_approximation()
m.optimize()
print(m)
plt.subplot(223)
m.plot()
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
print "Clean student t, rasm"
t_distribution = student_t(deg_free, sigma=edited_real_sd)
stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
m.ensure_default_constraints()
m.update_likelihood_approximation()
m.optimize()
print(m)
plt.subplot(222)
m.plot()
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
print "Corrupt student t, rasm"
t_distribution = student_t(deg_free, sigma=edited_real_sd)
corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
m.ensure_default_constraints()
m.update_likelihood_approximation()
m.optimize()
print(m)
plt.subplot(212)
plt.subplot(224)
m.plot()
plt.ylim(-2.5,2.5)
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
###with a student t distribution, since it has heavy tails it should work well

View file

@ -1,16 +1,15 @@
import numpy as np
import scipy as sp
import GPy
from scipy.linalg import cholesky, eig, inv, det
from functools import partial
from scipy.linalg import cholesky, eig, inv, det, cho_solve
from GPy.likelihoods.likelihood import likelihood
from GPy.util.linalg import pdinv,mdot
from GPy.util.linalg import pdinv, mdot, jitchol
#import numpy.testing.assert_array_equal
class Laplace(likelihood):
"""Laplace approximation to a posterior"""
def __init__(self, data, likelihood_function):
def __init__(self, data, likelihood_function, rasm=True):
"""
Laplace Approximation
@ -30,6 +29,7 @@ class Laplace(likelihood):
"""
self.data = data
self.likelihood_function = likelihood_function
self.rasm = rasm
#Inital values
self.N, self.D = self.data.shape
@ -102,20 +102,16 @@ class Laplace(likelihood):
#f_hat? should be f but we must have optimized for them I guess?
Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+ 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+ 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
)
self.Z = Z_tilde
self.Y = Y_tilde[:, None]
#Convert to float as its (1, 1) and Z must be a scalar
self.Z = np.float64(Z_tilde)
self.Y = Y_tilde
self.YYT = np.dot(self.Y, self.Y.T)
self.covariance_matrix = self.Sigma_tilde
#if not self.likelihood_function.log_concave:
#self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
##If the likelihood is non-log-concave. We wan't to say that there is a negative variance
##To cause the posterior to become less certain than the prior and likelihood,
##This is a property only held by non-log-concave likelihoods
self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
def fit_full(self, K):
@ -125,32 +121,15 @@ class Laplace(likelihood):
:K: Covariance matrix
"""
self.K = K.copy()
f = np.zeros((self.N, 1))
(self.Ki, _, _, self.log_Kdet) = pdinv(K)
LOG_K_CONST = -(0.5 * self.log_Kdet)
OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST
#Find \hat(f) using a newton raphson optimizer for example
#TODO: Add newton-raphson as subclass of optimizer class
#FIXME: Can we get rid of this horrible reshaping?
def obj(f):
#f = f[:, None]
res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST)
return float(res)
def obj_grad(f):
#f = f[:, None]
res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
return np.squeeze(res)
def obj_hess(f):
res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
return np.squeeze(res)
self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
self.Ki, _, _, self.log_Kdet = pdinv(K)
if self.rasm:
self.f_hat = self.rasm_mode(K)
else:
self.f_hat = self.ncg_mode(K)
#At this point get the hessian matrix
self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat))
self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
if not self.likelihood_function.log_concave:
self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
#If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@ -176,8 +155,92 @@ class Laplace(likelihood):
#Unsure whether its log_hess or log_hess_i
self.ln_z_hat = (- 0.5*self.log_hess_hat_det
+ 0.5*self.log_Kdet
+ self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+ self.likelihood_function.link_function(self.data, self.f_hat)
#+ self.likelihood_function.link_function(self.data, self.f_hat)
- 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
)
import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
return self._compute_GP_variables()
def ncg_mode(self, K):
"""Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
:K: Covariance matrix
:returns: f_mode
"""
self.K = K.copy()
f = np.zeros((self.N, 1))
(self.Ki, _, _, self.log_Kdet) = pdinv(K)
LOG_K_CONST = -(0.5 * self.log_Kdet)
#FIXME: Can we get rid of this horrible reshaping?
def obj(f):
res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
+ self.NORMAL_CONST + LOG_K_CONST)
return float(res)
def obj_grad(f):
res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
return np.squeeze(res)
def obj_hess(f):
res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
return np.squeeze(res)
f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
return f_hat[:, None]
def rasm_mode(self, K):
"""
Rasmussens numerically stable mode finding
For nomenclature see Rasmussen & Williams 2006
:K: Covariance matrix
:returns: f_mode
"""
f = np.zeros((self.N, 1))
new_obj = -np.inf
old_obj = np.inf
def obj(a, f):
#Careful of shape of data!
return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)
difference = np.inf
epsilon = 1e-16
step_size = 1
while difference > epsilon:
W = -np.diag(self.likelihood_function.link_hess(self.data, f))
if not self.likelihood_function.log_concave:
#if np.any(W < 0):
#print "NEGATIVE VALUES :("
#pass
W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
#If the likelihood is non-log-concave. We wan't to say that there is a negative variance
#To cause the posterior to become less certain than the prior and likelihood,
#This is a property only held by non-log-concave likelihoods
#W is diagnoal so its sqrt is just the sqrt of the diagonal elements
W_12 = np.sqrt(W)
B = np.eye(self.N) + mdot(W_12, K, W_12)
L = jitchol(B)
b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f))
#TODO: Check L is lower
solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
a = b - mdot(W_12, solve_L)
f = np.dot(K, a)
old_obj = new_obj
new_obj = obj(a, f)
difference = new_obj - old_obj
#print "Difference: ", new_obj - old_obj
if difference < 0:
#If the objective function isn't rising, restart optimization
print "Reducing step-size, restarting"
#objective function isn't increasing, try reducing step size
step_size *= 0.9
f = np.zeros((self.N, 1))
new_obj = -np.inf
old_obj = np.inf
difference = abs(difference)
return f

View file

@ -36,7 +36,10 @@ class student_t(likelihood_function):
:returns: float(likelihood evaluated for this point)
"""
y = np.squeeze(y)
f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
objective = (gammaln((self.v + 1) * 0.5)
- gammaln(self.v * 0.5)
@ -44,6 +47,7 @@ class student_t(likelihood_function):
- (self.v + 1) * 0.5
* np.log(1 + ((e**2 / self.sigma**2) / self.v))
)
print (e**2).shape
return np.sum(objective)
def link_grad(self, y, f):
@ -57,10 +61,12 @@ class student_t(likelihood_function):
:returns: gradient of likelihood evaluated at points
"""
y = np.squeeze(y)
f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
return grad
return np.squeeze(grad)
def link_hess(self, y, f):
"""
@ -75,11 +81,12 @@ class student_t(likelihood_function):
:f: latent variables f
:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
"""
y = np.squeeze(y)
f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
#hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
return hess
return np.squeeze(hess)
def predictive_values(self, mu, var):
"""