merged in the GPy upstream
This commit is contained in: 203159eff5

12 changed files with 830 additions and 9 deletions
README.md (12 changed lines)

@@ -1,10 +1,4 @@
-GPy
-===
+coxGP
+=====
 
-A Gaussian processes framework in python
+Gaussian Process models of Cox proportional hazard models
 
-* [Online documentation](https://gpy.readthedocs.org/en/latest/)
-* [Unit tests (Travis-CI)](https://travis-ci.org/SheffieldML/GPy)
-
-
-Continuous integration status: [Travis-CI build status badge]
__init__.py (new file, 0 lines)

python/__init__.py (new file, 0 lines)

python/examples/__init__.py (new file, 0 lines)
python/examples/laplace_approximations.py (new file, 220 lines)

@@ -0,0 +1,220 @@
import GPy
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import t, norm
from coxGP.python.likelihoods.Laplace import Laplace
from coxGP.python.likelihoods.likelihood_function import student_t


def timing():
    real_var = 0.1
    times = 1
    deg_free = 10
    real_sd = np.sqrt(real_var)
    the_is = np.zeros(times)
    X = np.linspace(0.0, 10.0, 300)[:, None]

    for a in xrange(times):
        Y = np.sin(X) + np.random.randn(*X.shape)*real_var
        Yc = Y.copy()

        Yc[10] += 100
        Yc[25] += 10
        Yc[23] += 10
        Yc[24] += 10
        Yc[250] += 10
        #Yc[4] += 10000

        edited_real_sd = real_sd
        kernel1 = GPy.kern.rbf(X.shape[1])

        t_distribution = student_t(deg_free, sigma=edited_real_sd)
        corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
        m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
        m.ensure_default_constraints()
        m.update_likelihood_approximation()
        m.optimize()
        the_is[a] = m.likelihood.i

    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
    print the_is
    print np.mean(the_is)

def student_t_approx():
    """
    Example of regressing with a Student-t likelihood
    """
    real_var = 0.1
    #Start with a function, any function
    X = np.linspace(0.0, 10.0, 30)[:, None]
    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
    Yc = Y.copy()

    X_full = np.linspace(0.0, 10.0, 500)[:, None]
    Y_full = np.sin(X_full)

    #Y = Y/Y.max()

    Yc[10] += 100
    Yc[25] += 10
    Yc[23] += 10
    Yc[24] += 10
    #Yc = Yc/Yc.max()

    #Add Student-t random noise to the datapoints
    deg_free = 10
    real_sd = np.sqrt(real_var)
    #t_rv = t(deg_free, loc=0, scale=real_var)
    #noise = t_rv.rvs(size=Y.shape)
    #Y += noise

    #Add some extreme-value noise to some of the datapoints
    #percent_corrupted = 0.15
    #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
    #indices = np.arange(Y.shape[0])
    #np.random.shuffle(indices)
    #corrupted_indices = indices[:corrupted_datums]
    #print corrupted_indices
    #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
    #Y[corrupted_indices] += noise
    plt.figure(1)
    plt.suptitle('Gaussian likelihood')
    # Kernel object
    kernel1 = GPy.kern.rbf(X.shape[1])
    kernel2 = kernel1.copy()
    kernel3 = kernel1.copy()
    kernel4 = kernel1.copy()
    kernel5 = kernel1.copy()
    kernel6 = kernel1.copy()

    print "Clean Gaussian"
    #A plain GP should break down on the corrupted data, since the outlying points get a lot of weight
    # create simple GP model
    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
    # optimize
    m.ensure_default_constraints()
    m.optimize()
    # plot
    plt.subplot(211)
    m.plot()
    plt.plot(X_full, Y_full)
    print m

    #Corrupt
    print "Corrupt Gaussian"
    m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
    m.ensure_default_constraints()
    m.optimize()
    plt.subplot(212)
    m.plot()
    plt.plot(X_full, Y_full)
    print m

    plt.figure(2)
    plt.suptitle('Student-t likelihood')
    edited_real_sd = real_sd

    print "Clean student t, rasm"
    t_distribution = student_t(deg_free, sigma=edited_real_sd)
    stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
    m.ensure_default_constraints()
    m.update_likelihood_approximation()
    m.optimize()
    print(m)
    plt.subplot(222)
    m.plot()
    plt.plot(X_full, Y_full)
    plt.ylim(-2.5, 2.5)

    print "Corrupt student t, rasm"
    t_distribution = student_t(deg_free, sigma=edited_real_sd)
    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
    m.ensure_default_constraints()
    m.update_likelihood_approximation()
    m.optimize()
    print(m)
    plt.subplot(224)
    m.plot()
    plt.plot(X_full, Y_full)
    plt.ylim(-2.5, 2.5)

    print "Clean student t, ncg"
    t_distribution = student_t(deg_free, sigma=edited_real_sd)
    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
    m.ensure_default_constraints()
    m.update_likelihood_approximation()
    m.optimize()
    print(m)
    plt.subplot(221)
    m.plot()
    plt.plot(X_full, Y_full)
    plt.ylim(-2.5, 2.5)

    print "Corrupt student t, ncg"
    t_distribution = student_t(deg_free, sigma=edited_real_sd)
    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
    m.ensure_default_constraints()
    m.update_likelihood_approximation()
    m.optimize()
    print(m)
    plt.subplot(223)
    m.plot()
    plt.plot(X_full, Y_full)
    plt.ylim(-2.5, 2.5)

    ###With a Student-t distribution, since it has heavy tails, it should work well
    ###likelihood_function = student_t(deg_free, sigma=real_var)
    ###lap = Laplace(Y, likelihood_function)
    ###cov = kernel.K(X)
    ###lap.fit_full(cov)

    ###test_range = np.arange(0, 10, 0.1)
    ###plt.plot(test_range, t_rv.pdf(test_range))
    ###for i in xrange(X.shape[0]):
        ###mode = lap.f_hat[i]
        ###covariance = lap.hess_hat_i[i,i]
        ###scaling = np.exp(lap.ln_z_hat)
        ###normalised_approx = norm(loc=mode, scale=covariance)
        ###print "Normal with mode %f, and variance %f" % (mode, covariance)
        ###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
    ###plt.show()

    return m


def noisy_laplace_approx():
    """
    Example of regressing with a Student-t likelihood
    """
    #Start with a function, any function
    X = np.sort(np.random.uniform(0, 15, 70))[:, None]
    Y = np.sin(X)

    #Add some extreme-value noise to some of the datapoints
    percent_corrupted = 0.05
    corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
    indices = np.arange(Y.shape[0])
    np.random.shuffle(indices)
    corrupted_indices = indices[:corrupted_datums]
    print corrupted_indices
    noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1))
    Y[corrupted_indices] += noise

    #A plain GP should break down here, since the outlying points get a lot of weight
    # create simple GP model
    m = GPy.models.GP_regression(X, Y)

    # optimize
    m.ensure_default_constraints()
    m.optimize()
    # plot
    m.plot()
    print m

    #With a Student-t distribution, since it has heavy tails, it should work well
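Why the Student-t fits above survive the injected outliers while the Gaussian fits do not: the pull a single observation exerts on f is the gradient of its log-likelihood, which grows without bound for a Gaussian but is bounded for a Student-t. A minimal sketch, separate from the commit's code (the values of v, sigma and the residuals are arbitrary):

import numpy as np

v, sigma = 10, 0.3
residual = np.array([0.1, 1.0, 10.0, 100.0])   # y - f

# Gaussian: pull grows linearly with the residual, so one outlier dominates
gauss_pull = residual / sigma**2

# Student-t: pull peaks near sigma*sqrt(v) and then decays back toward zero,
# so a wildly corrupted point is effectively ignored (cf. link_grad below)
t_pull = (v + 1)*residual / (v*sigma**2 + residual**2)

print(gauss_pull)   # roughly [1.1,  11,  111, 1111]
print(t_pull)       # roughly [1.2, 5.8,  1.1, 0.11]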
python/likelihoods/Laplace.py (new file, 321 lines)

@@ -0,0 +1,321 @@
import numpy as np
import scipy as sp
import GPy
from scipy.linalg import cholesky, eig, inv, cho_solve, det
from numpy.linalg import cond
from GPy.likelihoods.likelihood import likelihood
from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
from scipy.linalg.lapack import dtrtrs

#TODO: Move this to utils


def det_ln_diag(A):
    """
    Log determinant of a diagonal matrix

    $$\ln |A| = \ln \prod_i{A_{ii}} = \sum_i{\ln A_{ii}}$$
    """
    return np.log(np.diagonal(A)).sum()


def pddet(A):
    """
    Log determinant of a positive definite matrix
    """
    L = cholesky(A)
    logdetA = 2*sum(np.log(np.diag(L)))
    return logdetA

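A quick numerical check of the two helpers (a standalone sketch, assuming only numpy and scipy): pddet should agree with numpy's slogdet on a random positive definite matrix, and det_ln_diag with the log determinant of a diagonal one.

import numpy as np
from scipy.linalg import cholesky

def det_ln_diag(A):
    return np.log(np.diagonal(A)).sum()

def pddet(A):
    L = cholesky(A)
    return 2*np.sum(np.log(np.diag(L)))

A = np.random.randn(5, 5)
A = np.dot(A, A.T) + 5*np.eye(5)        # make it positive definite
D = np.diag(np.random.rand(5) + 0.1)    # a positive diagonal matrix

print(np.allclose(pddet(A), np.linalg.slogdet(A)[1]))        # True
print(np.allclose(det_ln_diag(D), np.linalg.slogdet(D)[1]))  # True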
class Laplace(likelihood):
    """Laplace approximation to a posterior"""

    def __init__(self, data, likelihood_function, extra_data=None, rasm=True):
        """
        Laplace Approximation

        First find the mode \hat{f} and the Hessian at this point (using Newton-Raphson),
        then find the z' which makes this a normalised Gaussian instead of an
        unnormalised one.

        Finally we must compute the GP variables, i.e. generate some Y_tilde and z_tilde
        which make a Gaussian likelihood match the Laplace approximation.

        Arguments
        ---------

        :data: array of data the likelihood function is approximating
        :likelihood_function: likelihood function - subclass of likelihood_function
        :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
        :rasm: whether to use Rasmussen's numerically stable mode finding or simple ncg optimisation
        """
        self.data = data
        self.likelihood_function = likelihood_function
        self.extra_data = extra_data
        self.rasm = rasm

        #Initial values
        self.N, self.D = self.data.shape
        self.is_heteroscedastic = True
        self.Nparams = 0

        self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))

        #Initial values for the GP variables
        self.Y = np.zeros((self.N, 1))
        self.covariance_matrix = np.eye(self.N)
        self.precision = np.ones(self.N)[:, None]
        self.Z = 0
        self.YYT = None

    def predictive_values(self, mu, var, full_cov):
        if full_cov:
            raise NotImplementedError("Cannot make correlated predictions with a Laplace likelihood")
        return self.likelihood_function.predictive_values(mu, var)

    def _get_params(self):
        return np.zeros(0)

    def _get_param_names(self):
        return []

    def _set_params(self, p):
        pass # TODO: the Laplace likelihood might want to take some parameters...

    def _gradients(self, partial):
        return np.zeros(0) # TODO: the Laplace likelihood might want to take some parameters...

    def _compute_GP_variables(self):
        """
        Generates data Y which would give a normal distribution identical to the Laplace approximation.

        GPy expects a likelihood to be Gaussian, so we need to calculate the points Y_tilde and Z_tilde
        that make the posterior match the one found by a Laplace approximation to a non-Gaussian likelihood.

        Given that we are approximating $p(y|f)p(f)$ with a normal distribution (since $p(y|f)$ is not normal),
        we have a rescaled normal distribution $z N(f|\hat{f}, \hat{H}^{-1})$ with the same area as $p(y|f)p(f)$,
        due to the $z$ rescaling.

        This function finds the data $D = (\tilde{Y}, X)$ that would produce $z N(f|\hat{f}, \hat{H}^{-1})$,
        giving a normal approximation $\tilde{z}\, p(\tilde{Y}|f, X)p(f)$.

        $$\tilde{Y} = \tilde{\Sigma} \hat{H} \hat{f}$$
        where
        $$\tilde{\Sigma}^{-1} = \hat{H} - K^{-1} = diag(-\nabla\nabla \ln p(y|f)) = W$$
        so that
        $$\tilde{\Sigma} = W^{-1}$$
        and
        $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}^{\top}\tilde{\Sigma}^{-1}\tilde{Y}$$
        """
        #dtrtri -> L -> L_i
        #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
        #((L.T*W)_i*L_i + I)f_hat = Y_tilde
        L = jitchol(self.K)
        Li = chol_inv(L)
        Lt_W = np.dot(L.T, self.W)

        ##Check it isn't singular!
        if cond(Lt_W) > 1e14:
            print "WARNING: L.T*W matrix is singular,\nnumerical stability may be a problem"

        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)

        #f.T(Ki + W)f
        f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
                    + mdot(self.f_hat.T, self.W, self.f_hat)
                    )

        y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
        y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
        ln_W_det = det_ln_diag(self.W)
        Z_tilde = (- self.NORMAL_CONST
                   + 0.5*self.ln_K_det
                   + 0.5*ln_W_det
                   + 0.5*self.ln_Ki_W_i_det
                   + 0.5*f_Ki_W_f
                   + 0.5*y_W_y
                   - y_W_f
                   + self.ln_z_hat
                   )
        #Z_tilde = (self.NORMAL_CONST
                   #- 0.5*self.ln_K_det
                   #- 0.5*ln_W_det
                   #- 0.5*self.ln_Ki_W_i_det
                   #- 0.5*f_Ki_W_f
                   #- 0.5*y_W_y
                   #+ y_W_f
                   #+ self.ln_z_hat
                   #)

        ##Check it isn't singular!
        if cond(self.W) > 1e14:
            print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"

        Sigma_tilde = inv(self.W) # Damn

        #Convert to float as it is (1, 1) and Z must be a scalar
        self.Z = np.float64(Z_tilde)
        self.Y = Y_tilde
        self.YYT = np.dot(self.Y, self.Y.T)
        self.covariance_matrix = Sigma_tilde
        self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
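A minimal numeric check of the pseudo-data construction above (a standalone sketch with random stand-ins for K, W and f_hat, not the commit's code): regressing Y_tilde = (I + W^{-1}K^{-1}) f_hat under Gaussian noise W^{-1} with prior K must return f_hat as the posterior mean and (K^{-1} + W)^{-1} as the posterior covariance.

import numpy as np

N = 6
A = np.random.randn(N, N)
K = np.dot(A, A.T) + N*np.eye(N)       # stand-in positive definite prior covariance
W = np.diag(np.random.rand(N) + 0.5)   # stand-in positive diagonal negative hessian
f_hat = np.random.randn(N, 1)

Ki = np.linalg.inv(K)
Wi = np.linalg.inv(W)
Y_tilde = np.dot(np.eye(N) + np.dot(Wi, Ki), f_hat)

# standard GP regression posterior with noise covariance W^{-1}
post_mean = np.dot(K, np.linalg.solve(K + Wi, Y_tilde))
post_cov = K - np.dot(K, np.linalg.solve(K + Wi, K))

print(np.allclose(post_mean, f_hat))                  # True
print(np.allclose(post_cov, np.linalg.inv(Ki + W)))   # True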
    def fit_full(self, K):
        """
        The Laplace approximation algorithm
        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
        :K: Covariance matrix
        """
        self.K = K.copy()
        if self.rasm:
            self.f_hat = self.rasm_mode(K)
        else:
            self.f_hat = self.ncg_mode(K)

        #At this point get the hessian matrix
        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))

        if not self.likelihood_function.log_concave:
            self.W[self.W < 0] = 1e-6 # FIXME-HACK: a non-log-concave likelihood can give a negative variance here,
                                      # which would make the posterior less certain than the prior and likelihood
                                      # (a property only non-log-concave likelihoods have), but GPy can't handle
                                      # negative variances, so clip to a small positive value

        #TODO: Could save on computation when using rasm by returning these from the mode finder, though then it isn't just a "mode finder"
        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
        self.Bi, _, _, B_det = pdinv(self.B)

        Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
        self.ln_Ki_W_i_det = pddet(Ki_W_i)

        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
        a = b - mdot(self.W_12, solve_chol)
        self.f_Ki_f = np.dot(self.f_hat.T, a)
        self.ln_K_det = pddet(self.K)

        self.ln_z_hat = (- 0.5*self.f_Ki_f
                         - 0.5*self.ln_K_det
                         + 0.5*self.ln_Ki_W_i_det
                         + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
                         )

        return self._compute_GP_variables()
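In Rasmussen & Williams notation, what fit_full assembles in ln_z_hat is the Laplace estimate of the log marginal likelihood; the two determinant terms above are just $-\tfrac{1}{2}\ln|B|$ split apart, since $\ln|B| = \ln|K| + \ln|K^{-1}+W| = \ln|K| - \ln|(K^{-1}+W)^{-1}|$:

$$\ln \hat{z} \approx \ln q(\mathbf{y}|X) = -\tfrac{1}{2}\,\hat{f}^{\top}K^{-1}\hat{f} + \ln p(\mathbf{y}|\hat{f}) - \tfrac{1}{2}\ln|B|$$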
    def _compute_B_statistics(self, K, W):
        """Rasmussen suggests the use of a numerically stable positive definite matrix B
        which has a positive diagonal and can be easily inverted

        :K: Covariance matrix
        :W: Negative hessian at a point (diagonal matrix)
        :returns: (B, B_chol, W_12)
        """
        #W is diagonal so its square root is just the square root of the diagonal elements
        W_12 = np.sqrt(W)
        B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
        L = jitchol(B)
        return (B, L, W_12)
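A small check of why B = I + W^{1/2} K W^{1/2} is the numerically safe quantity to factorise (a standalone sketch): W^{1/2} K W^{1/2} is positive semi-definite, so every eigenvalue of B is at least 1 and the Cholesky factorisation cannot encounter a non-positive pivot.

import numpy as np

N = 8
A = np.random.randn(N, N)
K = np.dot(A, A.T)                          # positive semi-definite covariance
W_12 = np.diag(np.sqrt(np.random.rand(N)))  # square root of a positive diagonal W
B = np.eye(N) + np.dot(W_12, np.dot(K, W_12))

print(np.linalg.eigvalsh(B).min() >= 1 - 1e-10)   # True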
    def ncg_mode(self, K):
        """
        Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuitive)
        :K: Covariance matrix
        :returns: f_mode
        """
        self.Ki, _, _, self.ln_K_det = pdinv(K)

        f = np.zeros((self.N, 1))

        #FIXME: Can we get rid of this horrible reshaping?
        #ONLY WORKS FOR 1D DATA
        def obj(f):
            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data)
                        - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
                        + self.NORMAL_CONST)
            return float(res)

        def obj_grad(f):
            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
            return np.squeeze(res)

        def obj_hess(f):
            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
            return np.squeeze(res)

        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
        return f_hat[:, None]
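For reference, the quantity ncg_mode minimises is the negative unnormalised log posterior (hence the -1 factors in obj, obj_grad and obj_hess):

$$\Psi(f) = -\ln p(y|f) + \tfrac{1}{2} f^{\top} K^{-1} f + \tfrac{N}{2}\ln 2\pi, \qquad \nabla\Psi = K^{-1}f - \nabla \ln p(y|f), \qquad \nabla\nabla\Psi = K^{-1} + W$$

with $W = diag(-\nabla\nabla \ln p(y|f))$.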
    def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50):
        """
        Rasmussen's numerically stable mode finding
        For nomenclature see Rasmussen & Williams 2006

        :K: Covariance matrix
        :MAX_ITER: Maximum number of Newton-Raphson iterations before forcing the optimisation to finish
        :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing the optimisation to finish
        :returns: f_mode
        """
        f = np.zeros((self.N, 1))
        new_obj = -np.inf
        old_obj = np.inf

        def obj(a, f):
            #Careful of the shape of the data!
            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)

        difference = np.inf
        epsilon = 1e-6
        step_size = 1
        rs = 0
        i = 0
        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
            f_old = f.copy()
            W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
            if not self.likelihood_function.log_concave:
                W[W < 0] = 1e-6 # FIXME-HACK: see fit_full - GPy can't handle the negative variances
                                # that non-log-concave likelihoods produce, so clip them
            B, L, W_12 = self._compute_B_statistics(K, W)

            W_f = np.dot(W, f)
            grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None]
            b = W_f + grad

            #Take a damped Newton step in the numerically stable form
            c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
            solve_L = cho_solve((L, True), np.dot(W_12, c))
            f = c - np.dot(K, np.dot(W_12, solve_L))

            #a should now equal Ki*f, so it can be used in the objective
            solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b)))
            a = b - np.dot(W_12, solve_L)
            #f = np.dot(K, a)

            tmp_old_obj = old_obj
            old_obj = new_obj
            new_obj = obj(a, f)
            difference = new_obj - old_obj
            if difference < 0:
                #print "Objective function fell by", difference
                #The objective function isn't increasing: reduce the step size and restart the optimisation
                step_size *= 0.9
                #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                #f = f_old #it's actually faster not to go back to the old location and just zigzag across the mode
                old_obj = tmp_old_obj
                rs += 1

            difference = abs(difference)
            i += 1

        self.i = i
        return f
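The damped update in the loop above is, in exact arithmetic, the stable form of the Newton step from Rasmussen & Williams 2006 (Algorithm 3.1); a sketch of the identity with step_size = 1, using $B = I + W^{1/2} K W^{1/2}$ and the Woodbury identity on $(K^{-1}+W)^{-1}$:

$$f^{\mathrm{new}} = \left(K^{-1} + W\right)^{-1} b = K\left(b - W^{1/2} B^{-1} W^{1/2} K b\right) = Ka, \qquad b = Wf + \nabla \ln p(y|f)$$

so $a = K^{-1} f^{\mathrm{new}}$, which is why obj can use $-\tfrac{1}{2} a^{\top} f$ for the quadratic term without ever forming $K^{-1}$ explicitly.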
python/likelihoods/__init__.py (new file, 0 lines)

python/likelihoods/likelihood_function.py (new file, 253 lines)

@@ -0,0 +1,253 @@
from scipy.special import gammaln, gamma
from scipy import integrate
import numpy as np
from GPy.likelihoods.likelihood_functions import likelihood_function
from scipy import stats


class student_t(likelihood_function):
    """Student-t likelihood distribution
    For nomenclature see Bayesian Data Analysis 2003 p576

    $$\ln p(y_{i}|f_{i}) = \ln \Gamma\left(\frac{v+1}{2}\right) - \ln \left(\Gamma\left(\frac{v}{2}\right)\sigma\sqrt{v \pi}\right) - \frac{v+1}{2}\ln \left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$

    Laplace:
    Needs functions to calculate
    ln p(yi|fi)
    dln p(yi|fi)_dfi
    d2ln p(yi|fi)_d2fifj
    """
    def __init__(self, deg_free, sigma=2):
        self.v = deg_free
        self.sigma = sigma

        #FIXME: This should be in the superclass
        self.log_concave = False

    @property
    def variance(self, extra_data=None):
        return (self.v / float(self.v - 2)) * (self.sigma**2)
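A quick empirical check of the variance property (a standalone sketch; the sample size is arbitrary, chosen only to make the estimate stable): the sample variance of Student-t draws should approach v/(v-2) * sigma^2.

import numpy as np
from scipy import stats

v, sigma = 10, 0.3
samples = stats.t.rvs(v, loc=0, scale=sigma, size=500000)
print(samples.var())              # ~0.1125, up to Monte Carlo error
print((v / (v - 2.0))*sigma**2)   # 0.1125 exactly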
    def link_function(self, y, f, extra_data=None):
        """link_function $\ln p(y|f)$
        $$\ln p(y_{i}|f_{i}) = \ln \Gamma\left(\frac{v+1}{2}\right) - \ln \left(\Gamma\left(\frac{v}{2}\right)\sigma\sqrt{v \pi}\right) - \frac{v+1}{2}\ln \left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$

        :y: data
        :f: latent variables f
        :extra_data: extra_data which is not used by the student t distribution
        :returns: float(likelihood evaluated for this point)
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape

        e = y - f
        objective = (gammaln((self.v + 1) * 0.5)
                     - gammaln(self.v * 0.5)
                     - np.log(self.sigma * np.sqrt(self.v * np.pi))
                     - (self.v + 1) * 0.5
                     * np.log(1 + ((e**2 / self.sigma**2) / self.v))
                     )
        return np.sum(objective)

    def link_grad(self, y, f, extra_data=None):
        """
        Gradient of the link function at y, given f, w.r.t f

        $$\frac{d \ln p(y_{i}|f_{i})}{df_{i}} = \frac{(v + 1)(y_{i} - f_{i})}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$

        :y: data
        :f: latent variables f
        :extra_data: extra_data which is not used by the student t distribution
        :returns: gradient of likelihood evaluated at points
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape
        e = y - f
        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
        return np.squeeze(grad)

    def link_hess(self, y, f, extra_data=None):
        """
        Hessian at this point (since we are only looking at the link function, not the prior, the hessian is 0 unless i == j),
        i.e. the second derivative of the link_function at y given f, w.r.t f_i and f_j

        Returns only the diagonal of the hessian, since everywhere else it is 0

        $$\frac{d^{2} \ln p(y_{i}|f_{i})}{df_{i}^{2}} = \frac{(v + 1)\left((y_{i} - f_{i})^{2} - v \sigma^{2}\right)}{\left(v \sigma^{2} + (y_{i} - f_{i})^{2}\right)^{2}}$$

        :y: data
        :f: latent variables f
        :extra_data: extra_data which is not used by the student t distribution
        :returns: array which is the diagonal of the hessian (second derivative of likelihood evaluated at points)
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape
        e = y - f
        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
        return np.squeeze(hess)
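A finite-difference sanity check of link_grad and link_hess against link_function (a standalone sketch; y, f and eps are arbitrary test values):

import numpy as np
from scipy.special import gammaln

v, sigma = 10, 0.3
y, f, eps = 1.3, 0.9, 1e-4

def lnp(f):
    e = y - f
    return (gammaln((v + 1)*0.5) - gammaln(v*0.5)
            - np.log(sigma*np.sqrt(v*np.pi))
            - (v + 1)*0.5*np.log(1 + (e**2/sigma**2)/v))

grad = (v + 1)*(y - f) / (v*sigma**2 + (y - f)**2)
hess = (v + 1)*((y - f)**2 - v*sigma**2) / ((v*sigma**2 + (y - f)**2)**2)

print(np.allclose(grad, (lnp(f + eps) - lnp(f - eps)) / (2*eps)))            # True
print(np.allclose(hess, (lnp(f + eps) - 2*lnp(f) + lnp(f - eps)) / eps**2,
                  atol=1e-5))                                                # True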
    def predictive_values(self, mu, var):
        """
        Compute the mean and confidence interval (percentiles 2.5 and 97.5) of the prediction

        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*):
        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
        """
        #We want the variance around test points y, which comes from int p(y*|f*)p(f*) df*
        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
        #Since we are given f* (mu), which is our mean (expected) value of y*|f*, the first term is the variance
        #around it, which was also given to us as (var).
        #We also need the expected variance of y* around samples f*; this is the variance of the student t distribution.
        #However the variance of the student t distribution does not depend on f, only on sigma and the degrees of freedom.
        true_var = var + self.variance

        #Now we have an analytical solution for the variance of the distribution p(y*|f*)p(f*) around our test points,
        #but we still need the 2.5 and 97.5 percentiles.
        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
        p_025 = mu - 2.*np.sqrt(true_var)
        p_975 = mu + 2.*np.sqrt(true_var)

        return mu, np.nan*mu, p_025, p_975
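The comment block above is the law of total variance; for the Student-t observation model it reduces to (assuming v > 2, so the t variance exists):

$$\operatorname{Var}(y_{*}) = \operatorname{Var}\left(\mathbb{E}[y_{*}|f_{*}]\right) + \mathbb{E}\left[\operatorname{Var}(y_{*}|f_{*})\right] = \operatorname{Var}(f_{*}) + \frac{v}{v-2}\,\sigma^{2}$$

which is exactly true_var = var + self.variance in the code.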
    def sample_predicted_values(self, mu, var):
        """ Experimental sampling approaches and numerical integration """
        #p_025 = stats.t.ppf(.025, mu)
        #p_975 = stats.t.ppf(.975, mu)

        num_test_points = mu.shape[0]
        #Each mu is the latent point f* at the test point x*,
        #and var is the gaussian variance at this point.
        #Take lots of samples from this, so we have lots of possible values
        #for the latent point f* for each test point x*, weighted by how likely we were to pick them
        print "Taking %d samples of f*" % num_test_points
        num_f_samples = 10
        num_y_samples = 10
        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
        print "Student t means shape: ", student_t_means.shape

        #Now we have lots of f*; sample many points from a student t
        #centred on each f*
        #for test_point, f in enumerate(student_t_means):
            #print test_point
            #print f.shape
            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
                                            #scale=self.sigma,
                                            #size=(num_f_samples, num_y_samples))
            #print student_t_samples.shape

        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
                                        scale=self.sigma,
                                        size=(num_test_points, num_y_samples, num_f_samples))
        student_t_samples = np.reshape(student_t_samples,
                                       (num_test_points, num_y_samples*num_f_samples))

        #Now take the 2.5 and 97.5 percentiles of these points
        p_025 = stats.scoreatpercentile(student_t_samples, 2.5, axis=1)[:, None]
        p_975 = stats.scoreatpercentile(student_t_samples, 97.5, axis=1)[:, None]

        ##Alternatively we could sample from int p(y|f*)p(f*|x*) df*
        def t_gaussian(f, mu, var):
            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var))*((mu-f)**2)))
                    )

        def t_gauss_int(mu, var):
            print "Mu: ", mu
            print "var: ", var
            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
            print "Result: ", result
            return result[0]

        vec_t_gauss_int = np.vectorize(t_gauss_int)

        p = vec_t_gauss_int(mu, var)
        p_025 = mu - p
        p_975 = mu + p
        return mu, np.nan*mu, p_025, p_975

class weibull_survival(likelihood_function):
    """Weibull likelihood distribution for survival analysis with censoring
    For nomenclature see Bayesian Survival Analysis

    Laplace:
    Needs functions to calculate
    ln p(yi|fi)
    dln p(yi|fi)_dfi
    d2ln p(yi|fi)_d2fifj
    """
    def __init__(self, shape, scale):
        self.shape = shape
        self.scale = scale

        #FIXME: This should be in the superclass
        self.log_concave = True

    def link_function(self, y, f, extra_data=None):
        """
        link_function $\ln p(y|f)$, i.e. log likelihood

        $$\ln p(y_{i}|f_{i}) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$

        :y: time of event data
        :f: latent variables f
        :extra_data: the event indicator, 1 for an observed event, 0 for censored
        :returns: float(likelihood evaluated for this point)
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape

        v = extra_data
        objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f) # FIXME: CHECK THIS WITH BOOK, where's scale?
        return np.sum(objective)

    def link_grad(self, y, f, extra_data=None):
        """
        Gradient of the link function at y, given f, w.r.t f

        $$\frac{d \ln p(y_{i}|f_{i})}{df_{i}} = v_{i} - y_{i}^{\alpha}\exp(f_{i})$$

        :y: data
        :f: latent variables f
        :extra_data: the event indicator, 1 for an observed event, 0 for censored
        :returns: gradient of likelihood evaluated at points
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape

        v = extra_data
        grad = v - (y**self.shape)*np.exp(f)
        return np.squeeze(grad)

    def link_hess(self, y, f, extra_data=None):
        """
        Hessian at this point (since we are only looking at the link function, not the prior, the hessian is 0 unless i == j),
        i.e. the second derivative of the link_function at y given f, w.r.t f_i and f_j

        Returns only the diagonal of the hessian, since everywhere else it is 0

        $$\frac{d^{2} \ln p(y_{i}|f_{i})}{df_{i}^{2}} = -y_{i}^{\alpha}\exp(f_{i})$$

        :y: data
        :f: latent variables f
        :extra_data: extra_data which is not used by the hessian
        :returns: array which is the diagonal of the hessian (second derivative of likelihood evaluated at points)
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape

        hess = -(y**self.shape)*np.exp(f)
        return np.squeeze(hess)
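For context, the survival model these three functions encode (hedged: the Weibull proportional-hazards form below is inferred from link_function, not stated in the commit): with hazard $h(y_i|f_i) = \alpha y_i^{\alpha-1} e^{f_i}$ and survival function $S(y_i|f_i) = \exp(-y_i^{\alpha} e^{f_i})$, the censored log likelihood is

$$\ln p(y_i|f_i) = v_i \ln h(y_i|f_i) + \ln S(y_i|f_i) = v_i\left(\ln\alpha + (\alpha-1)\ln y_i + f_i\right) - y_i^{\alpha} e^{f_i}$$

which matches the objective in the code, with $v_i = 1$ for an observed event and $v_i = 0$ for a censored one.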
python/models/__init__.py (new file, 0 lines)

python/models/coxGP.py (new file, 19 lines)

@@ -0,0 +1,19 @@
# Copyright (c) 2013, Alan Saul

from GPy.models import GP
from .. import likelihoods
from GPy import kern


class cox_GP_regression(GP):
    """
    Cox Gaussian Process model for regression
    """

    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Xslices=None):
        if kernel is None:
            kernel = kern.rbf(X.shape[1])

        likelihood = likelihoods.cox_piecewise(Y, normalize=normalize_Y)

        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices)
python/testing/__init__.py (new file, 0 lines)

python/testing/cox_tests.py (new file, 14 lines)

@@ -0,0 +1,14 @@
# Copyright (c) 2013, Alan Saul

import unittest
import numpy as np
import GPy

class coxGPTests(unittest.TestCase):
    def test_laplace_approx(self):
        pass

if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
    unittest.main()