merged in the GPy upstream

Alan Saul 2013-04-15 12:15:14 +01:00
commit 203159eff5
12 changed files with 830 additions and 9 deletions


@@ -1,10 +1,4 @@
-GPy
-===
-A Gaussian processes framework in python
-* [Online documentation](https://gpy.readthedocs.org/en/latest/)
-* [Unit tests (Travis-CI)](https://travis-ci.org/SheffieldML/GPy)
-Continuous integration status: ![CI status](https://travis-ci.org/SheffieldML/GPy.png)
+coxGP
+=====
+Gaussian Process models of Cox proportional hazard models
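As background (a brief gloss, not part of the diff): the "Cox proportional hazard models" referred to above follow the standard textbook form, with a GP prior replacing the usual linear predictor,
$$h(t|x) = h_{0}(t)\exp(f(x)), \qquad f \sim \mathcal{GP}(0, k(x, x'))$$
where $h_{0}$ is the baseline hazard and $f$ is the latent log-risk function that the likelihoods added in this commit place a GP over.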

__init__.py Normal file
python/__init__.py Normal file

@@ -0,0 +1,220 @@
import GPy
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import t, norm
from coxGP.python.likelihoods.Laplace import Laplace
from coxGP.python.likelihoods.likelihood_function import student_t
def timing():
real_var = 0.1
times = 1
deg_free = 10
real_sd = np.sqrt(real_var)
the_is = np.zeros(times)
X = np.linspace(0.0, 10.0, 300)[:, None]
for a in xrange(times):
Y = np.sin(X) + np.random.randn(*X.shape)*real_var
Yc = Y.copy()
Yc[10] += 100
Yc[25] += 10
Yc[23] += 10
Yc[24] += 10
Yc[250] += 10
#Yc[4] += 10000
edited_real_sd = real_sd
kernel1 = GPy.kern.rbf(X.shape[1])
t_distribution = student_t(deg_free, sigma=edited_real_sd)
corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
m.ensure_default_constraints()
m.update_likelihood_approximation()
m.optimize()
the_is[a] = m.likelihood.i
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
print the_is
print np.mean(the_is)
def student_t_approx():
"""
Example of regressing with a student t likelihood
"""
real_var = 0.1
#Start a function, any function
X = np.linspace(0.0, 10.0, 30)[:, None]
Y = np.sin(X) + np.random.randn(*X.shape)*real_var
Yc = Y.copy()
X_full = np.linspace(0.0, 10.0, 500)[:, None]
Y_full = np.sin(X_full)
#Y = Y/Y.max()
Yc[10] += 100
Yc[25] += 10
Yc[23] += 10
Yc[24] += 10
#Yc = Yc/Yc.max()
#Add student t random noise to datapoints
deg_free = 10
real_sd = np.sqrt(real_var)
#t_rv = t(deg_free, loc=0, scale=real_var)
#noise = t_rv.rvs(size=Y.shape)
#Y += noise
#Add some extreme value noise to some of the datapoints
#percent_corrupted = 0.15
#corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
#indices = np.arange(Y.shape[0])
#np.random.shuffle(indices)
#corrupted_indices = indices[:corrupted_datums]
#print corrupted_indices
#noise = t_rv.rvs(size=(len(corrupted_indices), 1))
#Y[corrupted_indices] += noise
plt.figure(1)
plt.suptitle('Gaussian likelihood')
# Kernel object
kernel1 = GPy.kern.rbf(X.shape[1])
kernel2 = kernel1.copy()
kernel3 = kernel1.copy()
kernel4 = kernel1.copy()
kernel5 = kernel1.copy()
kernel6 = kernel1.copy()
print "Clean Gaussian"
#A GP with a Gaussian likelihood should break down here, since the outlying points get a lot of weight
# create simple GP model
m = GPy.models.GP_regression(X, Y, kernel=kernel1)
# optimize
m.ensure_default_constraints()
m.optimize()
# plot
plt.subplot(211)
m.plot()
plt.plot(X_full, Y_full)
print m
#Corrupt
print "Corrupt Gaussian"
m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
m.ensure_default_constraints()
m.optimize()
plt.subplot(212)
m.plot()
plt.plot(X_full, Y_full)
print m
plt.figure(2)
plt.suptitle('Student-t likelihood')
edited_real_sd = real_sd
print "Clean student t, rasm"
t_distribution = student_t(deg_free, sigma=edited_real_sd)
stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
m = GPy.models.GP(X, stu_t_likelihood, kernel6)
m.ensure_default_constraints()
m.update_likelihood_approximation()
m.optimize()
print(m)
plt.subplot(222)
m.plot()
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
print "Corrupt student t, rasm"
t_distribution = student_t(deg_free, sigma=edited_real_sd)
corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
m.ensure_default_constraints()
m.update_likelihood_approximation()
m.optimize()
print(m)
plt.subplot(224)
m.plot()
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
print "Clean student t, ncg"
t_distribution = student_t(deg_free, sigma=edited_real_sd)
stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
m = GPy.models.GP(X, stu_t_likelihood, kernel3)
m.ensure_default_constraints()
m.update_likelihood_approximation()
m.optimize()
print(m)
plt.subplot(221)
m.plot()
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
print "Corrupt student t, ncg"
t_distribution = student_t(deg_free, sigma=edited_real_sd)
corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
m.ensure_default_constraints()
m.update_likelihood_approximation()
m.optimize()
print(m)
plt.subplot(223)
m.plot()
plt.plot(X_full, Y_full)
plt.ylim(-2.5, 2.5)
###with a student t distribution, since it has heavy tails it should work well
###likelihood_function = student_t(deg_free, sigma=real_var)
###lap = Laplace(Y, likelihood_function)
###cov = kernel.K(X)
###lap.fit_full(cov)
###test_range = np.arange(0, 10, 0.1)
###plt.plot(test_range, t_rv.pdf(test_range))
###for i in xrange(X.shape[0]):
###mode = lap.f_hat[i]
###covariance = lap.hess_hat_i[i,i]
###scaling = np.exp(lap.ln_z_hat)
###normalised_approx = norm(loc=mode, scale=covariance)
###print "Normal with mode %f, and variance %f" % (mode, covariance)
###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
###plt.show()
return m
def noisy_laplace_approx():
"""
Example of regressing with a student t likelihood
"""
#Start a function, any function
X = np.sort(np.random.uniform(0, 15, 70))[:, None]
Y = np.sin(X)
#Add some extreme value noise to some of the datapoints
percent_corrupted = 0.05
corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
indices = np.arange(Y.shape[0])
np.random.shuffle(indices)
corrupted_indices = indices[:corrupted_datums]
print corrupted_indices
noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1))
Y[corrupted_indices] += noise
#A GP with a Gaussian likelihood should break down here, since the outlying points get a lot of weight
# create simple GP model
m = GPy.models.GP_regression(X, Y)
# optimize
m.ensure_default_constraints()
m.optimize()
# plot
m.plot()
print m
#with a student t distribution, since it has heavy tails it should work well
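A minimal driver for the examples above (a sketch, not part of the diff; it assumes this module and the 2013-era GPy API used above are importable):
if __name__ == "__main__":
    # Run the Student-t example and show the figures it builds.
    # student_t_approx() returns the last fitted model, as in the code above.
    model = student_t_approx()
    print model
    plt.show()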


@@ -0,0 +1,321 @@
import numpy as np
import scipy as sp
import GPy
from scipy.linalg import cholesky, eig, inv, cho_solve, det
from numpy.linalg import cond
from GPy.likelihoods.likelihood import likelihood
from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
from scipy.linalg.lapack import dtrtrs
#TODO: Move this to utils
def det_ln_diag(A):
"""
log determinant of a diagonal matrix
$$\ln |A| = \ln \prod_{i} A_{ii} = \sum_{i} \ln A_{ii}$$
"""
return np.log(np.diagonal(A)).sum()
def pddet(A):
"""
Log-determinant of a positive definite matrix, computed via a Cholesky factorisation
"""
L = cholesky(A)
logdetA = 2*sum(np.log(np.diag(L)))
return logdetA
class Laplace(likelihood):
"""Laplace approximation to a posterior"""
def __init__(self, data, likelihood_function, extra_data=None, rasm=True):
"""
Laplace Approximation
First find the mode \hat{f} and the Hessian at that point (using Newton-Raphson),
then find the scaling \hat{z} which turns the unnormalised Gaussian at the mode into a normalised one.
Finally compute the GP variables, i.e. generate pseudo-data \tilde{Y} and \tilde{Z}
which make a Gaussian likelihood give the same posterior as the Laplace approximation.
Arguments
---------
:data: array of data the likelihood function is approximating
:likelihood_function: likelihood function - subclass of likelihood_function
:extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
:rasm: whether to use Rasmussen's numerically stable mode finding or plain NCG optimisation
self.data = data
self.likelihood_function = likelihood_function
self.extra_data = extra_data
self.rasm = rasm
#Initial values
self.N, self.D = self.data.shape
self.is_heteroscedastic = True
self.Nparams = 0
self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))
#Initial values for the GP variables
self.Y = np.zeros((self.N, 1))
self.covariance_matrix = np.eye(self.N)
self.precision = np.ones(self.N)[:, None]
self.Z = 0
self.YYT = None
def predictive_values(self, mu, var, full_cov):
if full_cov:
raise NotImplementedError("Cannot make correlated predictions with a Laplace likelihood")
return self.likelihood_function.predictive_values(mu, var)
def _get_params(self):
return np.zeros(0)
def _get_param_names(self):
return []
def _set_params(self, p):
pass # TODO: Laplace likelihood might want to take some parameters...
def _gradients(self, partial):
return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
raise NotImplementedError
def _compute_GP_variables(self):
"""
Generates data Y which would give a Gaussian likelihood identical to the Laplace approximation.
GPy expects a likelihood to be Gaussian, so we need to calculate the pseudo-data \tilde{Y} and \tilde{Z}
that make the posterior match the one found by a Laplace approximation to a non-Gaussian likelihood.
Given we are approximating $p(y|f)p(f)$ with a normal distribution (since $p(y|f)$ is not normal),
we have a rescaled normal distribution $z N(f|\hat{f}, \hat{H}^{-1})$ with the same area as $p(y|f)p(f)$,
due to the rescaling by $z$.
At the moment the data Y correspond to the normal approximation $z N(f|\hat{f}, \hat{H}^{-1})$.
This function finds the data $D = (\tilde{Y}, X)$ that would produce $z N(f|\hat{f}, \hat{H}^{-1})$,
giving a normal approximation of $\tilde{z} p(\tilde{Y}|f, X)p(f)$:
$$\tilde{Y} = \tilde{\Sigma} H \hat{f}$$
where
$$\tilde{\Sigma}^{-1} = H - K^{-1} = diag(-\nabla\nabla \ln p(y|\hat{f})) = W$$
so
$$\tilde{\Sigma} = W^{-1}$$
and
$$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}^{T}\tilde{\Sigma}^{-1}\tilde{Y}$$
epsilon = 1e-6
#dtritri -> L -> L_i
#dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
#((L.T*w)_i + I)f_hat = y_tilde
L = jitchol(self.K)
Li = chol_inv(L)
Lt_W = np.dot(L.T, self.W)
##Check it isn't singular!
if cond(Lt_W) > 1e14:
print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
#f.T(Ki + W)f
f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
+ mdot(self.f_hat.T, self.W, self.f_hat)
)
y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
ln_W_det = det_ln_diag(self.W)
Z_tilde = (- self.NORMAL_CONST
+ 0.5*self.ln_K_det
+ 0.5*ln_W_det
+ 0.5*self.ln_Ki_W_i_det
+ 0.5*f_Ki_W_f
+ 0.5*y_W_y
- y_W_f
+ self.ln_z_hat
)
#Z_tilde = (self.NORMAL_CONST
#- 0.5*self.ln_K_det
#- 0.5*ln_W_det
#- 0.5*self.ln_Ki_W_i_det
#- 0.5*f_Ki_W_f
#- 0.5*y_W_y
#+ y_W_f
#+ self.ln_z_hat
#)
##Check it isn't singular!
if cond(self.W) > 1e14:
print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
Sigma_tilde = inv(self.W)  # W is diagonal, so this inverse is just 1/W_ii on the diagonal
#Convert to float as its (1, 1) and Z must be a scalar
self.Z = np.float64(Z_tilde)
self.Y = Y_tilde
self.YYT = np.dot(self.Y, self.Y.T)
self.covariance_matrix = Sigma_tilde
self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
def fit_full(self, K):
"""
The laplace approximation algorithm
For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
:K: Covariance matrix
"""
self.K = K.copy()
if self.rasm:
self.f_hat = self.rasm_mode(K)
else:
self.f_hat = self.ncg_mode(K)
#At this point get the hessian matrix
self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))
if not self.likelihood_function.log_concave:
self.W[self.W < 0] = 1e-6  # FIXME-HACK: clip negative values since GPy cannot handle negative variances,
#which can occur when the likelihood is not log-concave. Ideally we would keep the negative
#'variance', which lets the posterior become less certain than the prior and likelihood;
#only non-log-concave likelihoods have that property.
#TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
self.Bi, _, _, B_det = pdinv(self.B)
Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
self.ln_Ki_W_i_det = pddet(Ki_W_i)  # log-determinant of (K^{-1} + W)^{-1}
b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
a = b - mdot(self.W_12, solve_chol)
self.f_Ki_f = np.dot(self.f_hat.T, a)
self.ln_K_det = pddet(self.K)
self.ln_z_hat = (- 0.5*self.f_Ki_f
- 0.5*self.ln_K_det
+ 0.5*self.ln_Ki_W_i_det
+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
)
return self._compute_GP_variables()
def _compute_B_statistics(self, K, W):
"""Rasmussen suggests the use of a numerically stable positive definite matrix B
Which has a positive diagonal element and can be easyily inverted
:K: Covariance matrix
:W: Negative hessian at a point (diagonal matrix)
:returns: (B, L)
"""
#W is diagnoal so its sqrt is just the sqrt of the diagonal elements
W_12 = np.sqrt(W)
#import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
L = jitchol(B)
return (B, L, W_12)
def ncg_mode(self, K):
"""
Find the mode using a standard NCG optimiser and explicit inversion of K (numerically unstable but intuitive)
:K: Covariance matrix
:returns: f_mode
"""
self.Ki, _, _, self.ln_K_det = pdinv(K)
f = np.zeros((self.N, 1))
#FIXME: Can we get rid of this horrible reshaping?
#ONLY WORKS FOR 1D DATA
def obj(f):
res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
+ self.NORMAL_CONST)
return float(res)
def obj_grad(f):
res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
return np.squeeze(res)
def obj_hess(f):
res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
return np.squeeze(res)
f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
return f_hat[:, None]
def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50):
"""
Rasmussen's numerically stable mode finding
For nomenclature see Rasmussen & Williams 2006
:K: Covariance matrix
:MAX_ITER: Maximum number of Newton-Raphson iterations before forcing the optimisation to finish
:MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
:returns: f_mode
"""
f = np.zeros((self.N, 1))
new_obj = -np.inf
old_obj = np.inf
def obj(a, f):
#Careful of shape of data!
return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
difference = np.inf
epsilon = 1e-6
step_size = 1
rs = 0
i = 0
while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
f_old = f.copy()
W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
if not self.likelihood_function.log_concave:
W[W < 0] = 1e-6  # FIXME-HACK: clip negative values since GPy cannot handle negative variances,
# which can occur when the likelihood is not log-concave. Ideally we would keep the negative
# 'variance', which lets the posterior become less certain than the prior and likelihood;
# only non-log-concave likelihoods have that property.
B, L, W_12 = self._compute_B_statistics(K, W)
W_f = np.dot(W, f)
grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None]
#Find K_i_f
b = W_f + grad
#a should be equal to Ki*f now so should be able to use it
c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
solve_L = cho_solve((L, True), np.dot(W_12, c))
f = c - np.dot(K, np.dot(W_12, solve_L))
solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b)))
a = b - np.dot(W_12, solve_L)
#f = np.dot(K, a)
tmp_old_obj = old_obj
old_obj = new_obj
new_obj = obj(a, f)
difference = new_obj - old_obj
if difference < 0:
#print "Objective function rose", difference
#If the objective function isn't rising, restart optimization
step_size *= 0.9
#print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
#objective function isn't increasing, try reducing step size
#f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
old_obj = tmp_old_obj
rs += 1
difference = abs(difference)
i += 1
self.i = i
return f
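To make the intended call pattern concrete, here is a short sketch (not part of the diff) that wires the Laplace object into a GP model exactly as the example file in this commit does; the import paths are assumptions copied from that example, since the file names are not shown in this view:
import numpy as np
import GPy
from coxGP.python.likelihoods.Laplace import Laplace                   # path taken from the example file's imports
from coxGP.python.likelihoods.likelihood_function import student_t     # assumed path, copied from the example file
X = np.linspace(0., 10., 30)[:, None]
Y = np.sin(X) + 0.1 * np.random.randn(*X.shape)
lik = Laplace(Y, student_t(deg_free=10, sigma=0.3), rasm=True)   # Laplace wrapper defined above
m = GPy.models.GP(X, lik, GPy.kern.rbf(1))                       # 2013-era GPy constructor, as used in the examples
m.ensure_default_constraints()
m.update_likelihood_approximation()                              # refits the Laplace approximation
m.optimize()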


@@ -0,0 +1,253 @@
from scipy.special import gammaln, gamma
from scipy import integrate
import numpy as np
from GPy.likelihoods.likelihood_functions import likelihood_function
from scipy import stats
class student_t(likelihood_function):
"""Student t likelihood distribution
For nomanclature see Bayesian Data Analysis 2003 p576
$$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
Laplace:
Needs functions to calculate
ln p(yi|fi)
dln p(yi|fi)_dfi
d2ln p(yi|fi)_d2fifj
"""
def __init__(self, deg_free, sigma=2):
self.v = deg_free
self.sigma = sigma
#FIXME: This should be in the superclass
self.log_concave = False
@property
def variance(self, extra_data=None):
return (self.v / float(self.v - 2)) * (self.sigma**2)
def link_function(self, y, f, extra_data=None):
"""link_function $\ln p(y|f)$
$$\ln p(y_{i}|f_{i}) = \ln \Gamma\left(\frac{v+1}{2}\right) - \ln\left(\Gamma\left(\frac{v}{2}\right)\sigma\sqrt{v\pi}\right) - \frac{v+1}{2}\ln\left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$
:y: data
:f: latent variables f
:extra_data: extra_data which is not used in student t distribution
:returns: float(likelihood evaluated for this point)
"""
y = np.squeeze(y)
f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
objective = (gammaln((self.v + 1) * 0.5)
- gammaln(self.v * 0.5)
- np.log(self.sigma * np.sqrt(self.v * np.pi))  # normalising constant enters negatively (see the docstring formula)
- (self.v + 1) * 0.5
* np.log(1 + ((e**2 / self.sigma**2) / self.v))
)
return np.sum(objective)
def link_grad(self, y, f, extra_data=None):
"""
Gradient of the link function at y, given f w.r.t f
$$\frac{d \ln p(y_{i}|f_{i})}{df_{i}} = \frac{(v + 1)(y_{i} - f_{i})}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
:y: data
:f: latent variables f
:extra_data: extra_data which is not used in student t distribution
:returns: gradient of likelihood evaluated at points
"""
y = np.squeeze(y)
f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
return np.squeeze(grad)
def link_hess(self, y, f, extra_data=None):
"""
Hessian at this point (considering only the link function, not the prior); the Hessian is 0 unless i == j,
i.e. the second derivative of link_function at y given f, w.r.t f_i and f_j.
Will return the diagonal of the Hessian, since everywhere else it is 0.
$$\frac{d^{2} \ln p(y_{i}|f_{i})}{df_{i}^{2}} = \frac{(v + 1)\left((y_{i} - f_{i})^{2} - v\sigma^{2}\right)}{\left(v\sigma^{2} + (y_{i} - f_{i})^{2}\right)^{2}}$$
:y: data
:f: latent variables f
:extra_data: extra_data which is not used in student t distribution
:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
"""
y = np.squeeze(y)
f = np.squeeze(f)
assert y.shape == f.shape
e = y - f
hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
return np.squeeze(hess)
def predictive_values(self, mu, var):
"""
Compute mean and confidence interval (percentiles 2.5 and 97.5) of the prediction
Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
(((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
"""
#We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
#Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
#Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
#Which was also given to us as (var)
#We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
#However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
true_var = var + self.variance
#Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now
#need the 2.5 and 97.5 percentiles.
#FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
p_025 = mu - 2.*np.sqrt(true_var)
p_975 = mu + 2.*np.sqrt(true_var)
return mu, np.nan*mu, p_025, p_975
def sample_predicted_values(self, mu, var):
""" Experimental sample approches and numerical integration """
#p_025 = stats.t.ppf(.025, mu)
#p_975 = stats.t.ppf(.975, mu)
num_test_points = mu.shape[0]
#Each mu is the latent point f* at the test point x*,
#and the var is the gaussian variance at this point
#Take lots of samples from this, so we have lots of possible values
#for latent point f* for each test point x* weighted by how likely we were to pick it
print "Taking %d samples of f*".format(num_test_points)
num_f_samples = 10
num_y_samples = 10
student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
print "Student t means shape: ", student_t_means.shape
#Now we have lots of f*, lets work out the likelihood of getting this by sampling
#from a student t centred on this point, sample many points from this distribution
#centred on f*
#for test_point, f in enumerate(student_t_means):
#print test_point
#print f.shape
#student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
#scale=self.sigma,
#size=(num_f_samples, num_y_samples))
#print student_t_samples.shape
student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
scale=self.sigma,
size=(num_test_points, num_y_samples, num_f_samples))
student_t_samples = np.reshape(student_t_samples,
(num_test_points, num_y_samples*num_f_samples))
#Now take the 2.5 and 97.5 percentiles of these points
p_025 = stats.scoreatpercentile(student_t_samples, 2.5, axis=1)[:, None]
p_975 = stats.scoreatpercentile(student_t_samples, 97.5, axis=1)[:, None]
##Alternatively we could sample from int p(y|f*)p(f*|x*) df*
def t_gaussian(f, mu, var):
return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
* ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
)
def t_gauss_int(mu, var):
print "Mu: ", mu
print "var: ", var
result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
print "Result: ", result
return result[0]
vec_t_gauss_int = np.vectorize(t_gauss_int)
p = vec_t_gauss_int(mu, var)
p_025 = mu - p
p_975 = mu + p
return mu, np.nan*mu, p_025, p_975
class weibull_survival(likelihood_function):
"""Weibull t likelihood distribution for survival analysis with censoring
For nomanclature see Bayesian Survival Analysis
Laplace:
Needs functions to calculate
ln p(yi|fi)
dln p(yi|fi)_dfi
d2ln p(yi|fi)_d2fifj
"""
def __init__(self, shape, scale):
self.shape = shape
self.scale = scale
#FIXME: This should be in the superclass
self.log_concave = True
def link_function(self, y, f, extra_data=None):
"""
link_function $\ln p(y|f)$, i.e. log likelihood
$$\ln p(y|f) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$
:y: time of event data
:f: latent variables f
:extra_data: the event indicator v, 1 if the event was observed, 0 if censored
:returns: float(likelihood evaluated for this point)
"""
y = np.squeeze(y)
f = np.squeeze(f)
assert y.shape == f.shape
v = extra_data
objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f) # FIXME: CHECK THIS WITH BOOK, wheres scale?
return np.sum(objective)
def link_grad(self, y, f, extra_data=None):
"""
Gradient of the link function at y, given f w.r.t f
$$\frac{d \ln p(y_{i}|f_{i})}{df_{i}} = v_{i} - y_{i}^{\alpha}\exp(f_{i})$$
:y: data
:f: latent variables f
:extra_data: the event indicator v, 1 if the event was observed, 0 if censored
:returns: gradient of likelihood evaluated at points
"""
y = np.squeeze(y)
f = np.squeeze(f)
assert y.shape == f.shape
v = extra_data
grad = v - (y**self.shape)*np.exp(f)
return np.squeeze(grad)
def link_hess(self, y, f, extra_data=None):
"""
Hessian at this point (considering only the link function, not the prior); the Hessian is 0 unless i == j,
i.e. the second derivative of link_function at y given f, w.r.t f_i and f_j.
Will return the diagonal of the Hessian, since everywhere else it is 0.
$$\frac{d^{2} \ln p(y_{i}|f_{i})}{df_{i}^{2}} = -y_{i}^{\alpha}\exp(f_{i})$$
:y: data
:f: latent variables f
:extra_data: not used by the Hessian
:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
"""
y = np.squeeze(y)
f = np.squeeze(f)
assert y.shape == f.shape
hess = -(y**self.shape)*np.exp(f)  # second derivative of the log-likelihood is negative; the Laplace code uses W = -diag(link_hess)
return np.squeeze(hess)
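A quick numerical sanity check for the derivatives above (a sketch, not part of the diff; it uses only the student_t class defined in this file):
import numpy as np
# Compare link_grad and link_hess against central finite differences of link_function.
lik = student_t(deg_free=5, sigma=0.5)
y = np.random.randn(10)
f = np.random.randn(10)
eps = 1e-5
for i in xrange(10):
    step = np.zeros(10)
    step[i] = eps
    num_grad = (lik.link_function(y, f + step) - lik.link_function(y, f - step)) / (2 * eps)
    num_hess = (lik.link_grad(y, f + step)[i] - lik.link_grad(y, f - step)[i]) / (2 * eps)
    assert abs(num_grad - lik.link_grad(y, f)[i]) < 1e-4
    assert abs(num_hess - lik.link_hess(y, f)[i]) < 1e-4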


python/models/coxGP.py Normal file

@@ -0,0 +1,19 @@
# Copyright (c) 2013, Alan Saul
from GPy.models import GP
from .. import likelihoods
from GPy import kern
class cox_GP_regression(GP):
"""
Cox Gaussian Process model for regression
"""
def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None):
if kernel is None:
kernel = kern.rbf(X.shape[1])
likelihood = likelihoods.cox_piecewise(Y, normalize=normalize_Y)
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices)
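Presumed usage of the wrapper above (a sketch, not part of the diff; note that likelihoods.cox_piecewise is referenced by the constructor but does not appear in this commit, so this only runs once that class exists):
import numpy as np
X = np.random.rand(50, 2)                         # covariates
Y = np.random.exponential(1.0, size=(50, 1))      # event times, placeholder data
m = cox_GP_regression(X, Y)                       # defaults to an RBF kernel, as in the constructor above
m.ensure_default_constraints()
m.optimize()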


@@ -0,0 +1,14 @@
# Copyright (c) 2013, Alan Saul
import unittest
import numpy as np
import GPy
class coxGPTests(unittest.TestCase):
def test_laplace_approx(self):
pass
if __name__ == "__main__":
print "Running unit tests, please be (very) patient..."
unittest.main()
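One way the placeholder test could eventually be filled in (a sketch, not part of the diff; the import paths for the classes added in this commit are assumptions copied from the example file):
from coxGP.python.likelihoods.Laplace import Laplace                   # assumed path (from the example file)
from coxGP.python.likelihoods.likelihood_function import student_t     # assumed path (from the example file)
def test_laplace_approx_sketch():
    # Fit the Student-t/Laplace model on clean data and check the fit produces finite pseudo-data.
    np.random.seed(0)
    X = np.linspace(0., 10., 20)[:, None]
    Y = np.sin(X) + 0.1 * np.random.randn(*X.shape)
    lik = Laplace(Y, student_t(deg_free=10, sigma=0.3), rasm=True)
    m = GPy.models.GP(X, lik, GPy.kern.rbf(1))
    m.ensure_default_constraints()
    m.update_likelihood_approximation()
    m.optimize()
    assert np.all(np.isfinite(m.likelihood.Y))    # pseudo-data set by _compute_GP_variables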