diff --git a/README.md b/README.md
index 09bc78f5..b15e04cc 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,4 @@
-GPy
-===
+coxGP
+=====
-A Gaussian processes framework in python
-
-* [Online documentation](https://gpy.readthedocs.org/en/latest/)
-* [Unit tests (Travis-CI)](https://travis-ci.org/SheffieldML/GPy)
-
-
-Continuous integration status: ![CI status](https://travis-ci.org/SheffieldML/GPy.png)
+Gaussian Process models of Cox proportional hazard models
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/__init__.py b/python/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/examples/__init__.py b/python/examples/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
new file mode 100644
index 00000000..5b1331b6
--- /dev/null
+++ b/python/examples/laplace_approximations.py
@@ -0,0 +1,220 @@
+import GPy
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.stats import t, norm
+from coxGP.python.likelihoods.Laplace import Laplace
+from coxGP.python.likelihoods.likelihood_function import student_t
+
+
+def timing():
+    real_var = 0.1
+    times = 1
+    deg_free = 10
+    real_sd = np.sqrt(real_var)
+    the_is = np.zeros(times)
+    X = np.linspace(0.0, 10.0, 300)[:, None]
+
+    for a in xrange(times):
+        Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+        Yc = Y.copy()
+
+        Yc[10] += 100
+        Yc[25] += 10
+        Yc[23] += 10
+        Yc[24] += 10
+        Yc[250] += 10
+        #Yc[4] += 10000
+
+        edited_real_sd = real_sd
+        kernel1 = GPy.kern.rbf(X.shape[1])
+
+        t_distribution = student_t(deg_free, sigma=edited_real_sd)
+        corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
+        m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
+        m.ensure_default_constraints()
+        m.update_likelihood_approximation()
+        m.optimize()
+        the_is[a] = m.likelihood.i
+
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    print the_is
+    print np.mean(the_is)
+
+
+def student_t_approx():
+    """
+    Example of regressing with a student t likelihood
+    """
+    real_var = 0.1
+    #Start with a function, any function
+    X = np.linspace(0.0, 10.0, 30)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+    Yc = Y.copy()
+
+    X_full = np.linspace(0.0, 10.0, 500)[:, None]
+    Y_full = np.sin(X_full)
+
+    #Y = Y/Y.max()
+
+    Yc[10] += 100
+    Yc[25] += 10
+    Yc[23] += 10
+    Yc[24] += 10
+    #Yc = Yc/Yc.max()
+
+    #Add student t random noise to datapoints
+    deg_free = 10
+    real_sd = np.sqrt(real_var)
+    #t_rv = t(deg_free, loc=0, scale=real_var)
+    #noise = t_rv.rvs(size=Y.shape)
+    #Y += noise
+
+    #Add some extreme value noise to some of the datapoints
+    #percent_corrupted = 0.15
+    #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
+    #indices = np.arange(Y.shape[0])
+    #np.random.shuffle(indices)
+    #corrupted_indices = indices[:corrupted_datums]
+    #print corrupted_indices
+    #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
+    #Y[corrupted_indices] += noise
+    plt.figure(1)
+    plt.suptitle('Gaussian likelihood')
+    # Kernel object
+    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()
+    kernel5 = kernel1.copy()
+    kernel6 = kernel1.copy()
+
+    print "Clean Gaussian"
+    #A Gaussian likelihood should completely break down on the corrupted data, since the outlying points get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+    # plot
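+    # (Subplot 211 below shows the Gaussian-likelihood fit to the clean data;
+    #  subplot 212 shows the same model fit to the corrupted copy Yc.)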
+    plt.subplot(211)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    print m
+
+    #Corrupt
+    print "Corrupt Gaussian"
+    m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    m.ensure_default_constraints()
+    m.optimize()
+    plt.subplot(212)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    print m
+
+    plt.figure(2)
+    plt.suptitle('Student-t likelihood')
+    edited_real_sd = real_sd
+
+    print "Clean student t, rasm"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(222)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Corrupt student t, rasm"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(224)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Clean student t, ncg"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(221)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Corrupt student t, ncg"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(223)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+
+    ###with a student t distribution, since it has heavy tails it should work well
+    ###likelihood_function = student_t(deg_free, sigma=real_var)
+    ###lap = Laplace(Y, likelihood_function)
+    ###cov = kernel.K(X)
+    ###lap.fit_full(cov)
+
+    ###test_range = np.arange(0, 10, 0.1)
+    ###plt.plot(test_range, t_rv.pdf(test_range))
+    ###for i in xrange(X.shape[0]):
+        ###mode = lap.f_hat[i]
+        ###covariance = lap.hess_hat_i[i,i]
+        ###scaling = np.exp(lap.ln_z_hat)
+        ###normalised_approx = norm(loc=mode, scale=covariance)
+        ###print "Normal with mode %f, and variance %f" % (mode, covariance)
+        ###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    ###plt.show()
+
+    return m
+
+
+def noisy_laplace_approx():
+    """
+    Example of a Gaussian likelihood breaking down on data with extreme outliers
+    (a student t likelihood should cope with these)
+    """
+    #Start with a function, any function
+    X = np.sort(np.random.uniform(0, 15, 70))[:, None]
+    Y = np.sin(X)
+
+    #Add some extreme value noise to some of the datapoints
+    percent_corrupted = 0.05
+    corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
+    indices = np.arange(Y.shape[0])
+    np.random.shuffle(indices)
+    corrupted_indices = indices[:corrupted_datums]
+    print corrupted_indices
+    noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1))
+    Y[corrupted_indices] += noise
+
+    #A GP with a Gaussian likelihood should completely break down here, since the outlying points get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y)
+
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+    # plot
+    m.plot()
+    print m
+
+    #with a student t distribution, since it has heavy tails it should work well
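+
+
+if __name__ == "__main__":
+    # Hypothetical entry point, not part of the original example file: run the
+    # Student-t comparison and display the figures (assumes an interactive
+    # matplotlib backend).
+    student_t_approx()
+    plt.show()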
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
new file mode 100644
index 00000000..4d94ba0f
--- /dev/null
+++ b/python/likelihoods/Laplace.py
@@ -0,0 +1,321 @@
+import numpy as np
+import scipy as sp
+import GPy
+from scipy.linalg import cholesky, eig, inv, cho_solve, det
+from numpy.linalg import cond
+from GPy.likelihoods.likelihood import likelihood
+from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
+from scipy.linalg.lapack import dtrtrs
+
+#TODO: Move this to utils
+
+
+def det_ln_diag(A):
+    """
+    Log determinant of a diagonal matrix
+    $$\ln |A| = \ln \prod_{i}{A_{ii}} = \sum_{i}{\ln A_{ii}}$$
+    """
+    return np.log(np.diagonal(A)).sum()
+
+
+def pddet(A):
+    """
+    Log determinant of a positive definite matrix
+    """
+    L = cholesky(A)
+    logdetA = 2*sum(np.log(np.diag(L)))
+    return logdetA
+
+
+class Laplace(likelihood):
+    """Laplace approximation to a posterior"""
+
+    def __init__(self, data, likelihood_function, extra_data=None, rasm=True):
+        """
+        Laplace Approximation
+
+        First find the mode \hat{f} and the hessian at this point (using Newton-Raphson),
+        then find the z^{prime} which allows this to be a normalised gaussian instead of a
+        non-normalised gaussian.
+
+        Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle}
+        which make a gaussian the same as the laplace approximation).
+
+        Arguments
+        ---------
+
+        :data: array of data the likelihood function is approximating
+        :likelihood_function: likelihood function - subclass of likelihood_function
+        :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
+        :rasm: flag of whether to use Rasmussen's numerically stable mode finding or simple ncg optimisation
+
+        """
+        self.data = data
+        self.likelihood_function = likelihood_function
+        self.extra_data = extra_data
+        self.rasm = rasm
+
+        #Initial values
+        self.N, self.D = self.data.shape
+        self.is_heteroscedastic = True
+        self.Nparams = 0
+
+        self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))
+
+        #Initial values for the GP variables
+        self.Y = np.zeros((self.N, 1))
+        self.covariance_matrix = np.eye(self.N)
+        self.precision = np.ones(self.N)[:, None]
+        self.Z = 0
+        self.YYT = None
+
+    def predictive_values(self, mu, var, full_cov):
+        if full_cov:
+            raise NotImplementedError("Cannot make correlated predictions with a Laplace likelihood")
+        return self.likelihood_function.predictive_values(mu, var)
+
+    def _get_params(self):
+        return np.zeros(0)
+
+    def _get_param_names(self):
+        return []
+
+    def _set_params(self, p):
+        pass  # TODO: Laplace likelihood might want to take some parameters...
+
+    def _gradients(self, partial):
+        return np.zeros(0)  # TODO: Laplace likelihood might want to take some parameters...
+        raise NotImplementedError
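+
+    # A possible shape for the TODOs above (a sketch only, not implemented in
+    # this patch): exposing the noise scale of the wrapped likelihood would
+    # look roughly like
+    #
+    #     def _get_param_names(self):
+    #         return ['noise_sigma']
+    #
+    #     def _get_params(self):
+    #         return np.asarray([self.likelihood_function.sigma])
+    #
+    #     def _set_params(self, p):
+    #         self.likelihood_function.sigma = float(p)
+    #
+    # together with the matching derivative in _gradients().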
+
+    def _compute_GP_variables(self):
+        """
+        Generates data Y which would give the normal distribution identical to the laplace approximation
+
+        GPy expects a likelihood to be gaussian, so we need to calculate the points Y^{squiggle} and Z^{squiggle}
+        that make the posterior match that found by a laplace approximation to a non-gaussian likelihood
+
+        Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal)
+        then we have a rescaled normal distribution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f)
+        due to the z rescaling.
+
+        At the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^-1)
+
+        This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^-1)
+        giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f)
+
+        $$\tilde{Y} = \tilde{\Sigma} Hf$$
+        where
+        $$\tilde{\Sigma}^{-1} = H - K^{-1}$$
+        i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log p(y|f))$$
+        since $diag(\nabla\nabla \log p(y|f)) = H - K^{-1}$
+        and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}^{T}\tilde{\Sigma}^{-1}\tilde{Y}$$
+        $$\tilde{\Sigma} = W^{-1}$$
+
+        """
+        epsilon = 1e-6
+
+        #dtritri -> L -> L_i
+        #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
+        #((L.T*W)_i + I)f_hat = y_tilde
+        L = jitchol(self.K)
+        Li = chol_inv(L)
+        Lt_W = np.dot(L.T, self.W)
+
+        ##Check it isn't singular!
+        if cond(Lt_W) > 1e14:
+            print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
+
+        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
+        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
+
+        #f.T(Ki + W)f
+        f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
+                    + mdot(self.f_hat.T, self.W, self.f_hat)
+                    )
+
+        y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
+        y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
+        ln_W_det = det_ln_diag(self.W)
+        Z_tilde = (- self.NORMAL_CONST
+                   + 0.5*self.ln_K_det
+                   + 0.5*ln_W_det
+                   + 0.5*self.ln_Ki_W_i_det
+                   + 0.5*f_Ki_W_f
+                   + 0.5*y_W_y
+                   - y_W_f
+                   + self.ln_z_hat
+                   )
+        #Z_tilde = (self.NORMAL_CONST
+                   #- 0.5*self.ln_K_det
+                   #- 0.5*ln_W_det
+                   #- 0.5*self.ln_Ki_W_i_det
+                   #- 0.5*f_Ki_W_f
+                   #- 0.5*y_W_y
+                   #+ y_W_f
+                   #+ self.ln_z_hat
+                   #)
+
+        ##Check it isn't singular!
+        if cond(self.W) > 1e14:
+            print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
+
+        Sigma_tilde = inv(self.W)  # Damn
+
+        #Convert to float as its (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
+        self.YYT = np.dot(self.Y, self.Y.T)
+        self.covariance_matrix = Sigma_tilde
+        self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
+
+    def fit_full(self, K):
+        """
+        The laplace approximation algorithm
+        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
+        :K: Covariance matrix
+        """
+        self.K = K.copy()
+        if self.rasm:
+            self.f_hat = self.rasm_mode(K)
+        else:
+            self.f_hat = self.ncg_mode(K)
+
+        #At this point get the negative hessian, W
+        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))
+
+        if not self.likelihood_function.log_concave:
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            #If the likelihood is non-log-concave we want to say that there is a negative variance,
+            #to cause the posterior to become less certain than the prior and likelihood.
+            #This is a property only held by non-log-concave likelihoods.
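+            #(For the student t likelihood defined in likelihood_function.py this
+            # happens exactly when (y_i - f_i)^2 > v*sigma^2, i.e. for the
+            # outlying points the heavy tails are meant to absorb.)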
+
+        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
+        self.Bi, _, _, B_det = pdinv(self.B)
+
+        Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
+        self.ln_Ki_W_i_det = np.log(np.linalg.det(Ki_W_i))  # log determinant of (K^{-1} + W)^{-1}
+
+        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
+        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
+        a = b - mdot(self.W_12, solve_chol)
+        self.f_Ki_f = np.dot(self.f_hat.T, a)
+        self.ln_K_det = pddet(self.K)
+
+        self.ln_z_hat = (- 0.5*self.f_Ki_f
+                         - 0.5*self.ln_K_det
+                         + 0.5*self.ln_Ki_W_i_det
+                         + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+                         )
+
+        return self._compute_GP_variables()
+
+    def _compute_B_statistics(self, K, W):
+        """Rasmussen suggests the use of a numerically stable positive definite matrix B
+        which has a positive diagonal and can be easily inverted
+
+        :K: Covariance matrix
+        :W: Negative hessian at a point (diagonal matrix)
+        :returns: (B, L, W_12)
+        """
+        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
+        W_12 = np.sqrt(W)
+        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
+        L = jitchol(B)
+        return (B, L, W_12)
+
+    def ncg_mode(self, K):
+        """
+        Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuitive)
+        :K: Covariance matrix
+        :returns: f_mode
+        """
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
+
+        f = np.zeros((self.N, 1))
+
+        #FIXME: Can we get rid of this horrible reshaping?
+        #ONLY WORKS FOR 1D DATA
+        def obj(f):
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f)) + self.NORMAL_CONST)
+            return float(res)
+
+        def obj_grad(f):
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
+            return np.squeeze(res)
+
+        def obj_hess(f):
+            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
+            return np.squeeze(res)
+
+        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        return f_hat[:, None]
+
+    def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50):
+        """
+        Rasmussen's numerically stable mode finding
+        For nomenclature see Rasmussen & Williams 2006
+
+        :K: Covariance matrix
+        :MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
+        :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
+        :returns: f_mode
+        """
+        f = np.zeros((self.N, 1))
+        new_obj = -np.inf
+        old_obj = np.inf
+
+        def obj(a, f):
+            #Careful of shape of data!
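+            #(a is maintained so that a = K^{-1} f, so this is the Rasmussen &
+            # Williams objective Psi(f) = -0.5 f^T K^{-1} f + ln p(y|f), up to a
+            # constant, evaluated without ever forming K^{-1} explicitly.)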
+            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
+
+        difference = np.inf
+        epsilon = 1e-6
+        step_size = 1
+        rs = 0
+        i = 0
+        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
+            f_old = f.copy()
+            W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
+            if not self.likelihood_function.log_concave:
+                W[W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                # If the likelihood is non-log-concave we want to say that there is a negative variance,
+                # to cause the posterior to become less certain than the prior and likelihood.
+                # This is a property only held by non-log-concave likelihoods.
+            B, L, W_12 = self._compute_B_statistics(K, W)
+
+            W_f = np.dot(W, f)
+            grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None]
+            #Find K_i_f
+            b = W_f + grad
+
+            #a should be equal to Ki*f now so should be able to use it
+            c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
+            solve_L = cho_solve((L, True), np.dot(W_12, c))
+            f = c - np.dot(K, np.dot(W_12, solve_L))
+
+            solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b)))
+            a = b - np.dot(W_12, solve_L)
+            #f = np.dot(K, a)
+
+            tmp_old_obj = old_obj
+            old_obj = new_obj
+            new_obj = obj(a, f)
+            difference = new_obj - old_obj
+            if difference < 0:
+                #print "Objective function fell", difference
+                #If the objective function isn't rising, reduce the step size and restart the optimisation
+                step_size *= 0.9
+                #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
+                old_obj = tmp_old_obj
+                rs += 1
+
+            difference = abs(difference)
+            i += 1
+
+        self.i = i
+        return f
diff --git a/python/likelihoods/__init__.py b/python/likelihoods/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
new file mode 100644
index 00000000..0d421882
--- /dev/null
+++ b/python/likelihoods/likelihood_function.py
@@ -0,0 +1,253 @@
+from scipy.special import gammaln, gamma
+from scipy import integrate
+import numpy as np
+from GPy.likelihoods.likelihood_functions import likelihood_function
+from scipy import stats
+
+
+class student_t(likelihood_function):
+    """Student t likelihood distribution
+    For nomenclature see Bayesian Data Analysis 2003 p576
+
+    $$\ln p(y_{i}|f_{i}) = \ln \Gamma\left(\frac{v+1}{2}\right) - \ln \left(\Gamma\left(\frac{v}{2}\right)\sigma\sqrt{v \pi}\right) - \frac{v+1}{2}\ln\left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, deg_free, sigma=2):
+        self.v = deg_free
+        self.sigma = sigma
+
+        #FIXME: This should be in the superclass
+        self.log_concave = False
+
+    @property
+    def variance(self, extra_data=None):
+        return (self.v / float(self.v - 2)) * (self.sigma**2)
+
+    def link_function(self, y, f, extra_data=None):
+        """link_function $\ln p(y|f)$
+        $$\ln p(y_{i}|f_{i}) = \ln \Gamma\left(\frac{v+1}{2}\right) - \ln \left(\Gamma\left(\frac{v}{2}\right)\sigma\sqrt{v \pi}\right) - \frac{v+1}{2}\ln\left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        e = y - f
+        #Note the normalising constant ln(sigma*sqrt(v*pi)) enters with a negative sign
+        objective = (gammaln((self.v + 1) * 0.5)
+                     - gammaln(self.v * 0.5)
+                     - np.log(self.sigma * np.sqrt(self.v * np.pi))
+                     - (self.v + 1) * 0.5
+                     * np.log(1 + ((e**2 / self.sigma**2) / self.v))
+                     )
+        return np.sum(objective)
+
+    def link_grad(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{d}{df_{i}}\ln p(y_{i}|f_{i}) = \frac{(v + 1)(y_{i} - f_{i})}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        return np.squeeze(grad)
+
+    def link_hess(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative of link_function at y given f_i, f_j w.r.t f_i and f_j
+
+        Will return the diagonal of the hessian, since everywhere else it is 0
+
+        $$\frac{d^{2}\ln p(y_{i}|f_{i})}{df_{i}^{2}} = \frac{(v + 1)\left((y_{i} - f_{i})^{2} - v\sigma^{2}\right)}{\left(v\sigma^{2} + (y_{i} - f_{i})^{2}\right)^{2}}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
+        return np.squeeze(hess)
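+
+    # A quick way to sanity-check link_function/link_grad/link_hess against
+    # each other (a sketch only, not part of the class; y0, f0 are illustrative
+    # scalars):
+    #
+    #     lik = student_t(10, sigma=0.3)
+    #     y0, f0, eps = 1.0, 0.2, 1e-6
+    #     num_grad = (lik.link_function(y0, f0 + eps) - lik.link_function(y0, f0 - eps)) / (2*eps)
+    #     assert np.allclose(num_grad, lik.link_grad(y0, f0), atol=1e-5)
+    #     num_hess = (lik.link_grad(y0, f0 + eps) - lik.link_grad(y0, f0 - eps)) / (2*eps)
+    #     assert np.allclose(num_hess, lik.link_hess(y0, f0), atol=1e-5)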
+
+    def predictive_values(self, mu, var):
+        """
+        Compute mean and confidence interval (percentiles 2.5 and 97.5) of the prediction
+
+        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        """
+
+        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
+        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
+        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this,
+        #which was also given to us as (var)
+        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
+        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
+        true_var = var + self.variance
+
+        #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we
+        #still need the 2.5 and 97.5 percentiles.
+        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles (mu +- 2 standard deviations)
+        p_025 = mu - 2.*np.sqrt(true_var)
+        p_975 = mu + 2.*np.sqrt(true_var)
+
+        return mu, np.nan*mu, p_025, p_975
+
+    def sample_predicted_values(self, mu, var):
+        """ Experimental sample approaches and numerical integration """
+        #p_025 = stats.t.ppf(.025, mu)
+        #p_975 = stats.t.ppf(.975, mu)
+
+        num_test_points = mu.shape[0]
+        #Each mu is the latent point f* at the test point x*,
+        #and the var is the gaussian variance at this point
+        #Take lots of samples from this, so we have lots of possible values
+        #for latent point f* for each test point x* weighted by how likely we were to pick it
+        print "Taking %d samples of f*" % num_test_points
+        num_f_samples = 10
+        num_y_samples = 10
+        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
+        print "Student t means shape: ", student_t_means.shape
+
+        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
+        #from a student t centred on this point, sample many points from this distribution
+        #centred on f*
+        #for test_point, f in enumerate(student_t_means):
+            #print test_point
+            #print f.shape
+            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
+                                            #scale=self.sigma,
+                                            #size=(num_f_samples, num_y_samples))
+            #print student_t_samples.shape
+
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
+                                        scale=self.sigma,
+                                        size=(num_test_points, num_y_samples, num_f_samples))
+        student_t_samples = np.reshape(student_t_samples,
+                                       (num_test_points, num_y_samples*num_f_samples))
+
+        #Now take the 97.5 and 2.5 percentiles of these points
+        p_025 = stats.scoreatpercentile(student_t_samples, 2.5, axis=1)[:, None]
+        p_975 = stats.scoreatpercentile(student_t_samples, 97.5, axis=1)[:, None]
+
+        ##Alternatively we could sample from int p(y|f*)p(f*|x*) df*
+        def t_gaussian(f, mu, var):
+            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
+                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var))*((mu-f)**2)))
+                    )
+
+        def t_gauss_int(mu, var):
+            print "Mu: ", mu
+            print "var: ", var
+            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
+            print "Result: ", result
+            return result[0]
+
+        vec_t_gauss_int = np.vectorize(t_gauss_int)
+
+        p = vec_t_gauss_int(mu, var)
+        p_025 = mu - p
+        p_975 = mu + p
+        return mu, np.nan*mu, p_025, p_975
+
+
+class weibull_survival(likelihood_function):
+    """Weibull likelihood distribution for survival analysis with censoring
+    For nomenclature see Bayesian Survival Analysis
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, shape, scale):
+        self.shape = shape
+        self.scale = scale
+
+        #FIXME: This should be in the superclass
+        self.log_concave = True
+
+    def link_function(self, y, f, extra_data=None):
+        """
+        link_function $\ln p(y|f)$, i.e. log likelihood
+
+        $$\ln p(y_{i}|f_{i}) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: time of event data
+        :f: latent variables f
+        :extra_data: the censoring indicator: 1 if the event was observed, 0 if censored
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f)  # FIXME: CHECK THIS WITH BOOK, wheres scale?
+        return np.sum(objective)
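+
+    # (Differentiating the objective above w.r.t. f_i gives
+    #  v_i - y_i^alpha * exp(f_i), which is what link_grad returns below, and
+    #  differentiating once more gives -y_i^alpha * exp(f_i) for link_hess.)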
+
+    def link_grad(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{d}{df_{i}} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: the censoring indicator: 1 if the event was observed, 0 if censored
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        grad = v - (y**self.shape)*np.exp(f)
+        return np.squeeze(grad)
+
+    def link_hess(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative of link_function at y given f_i, f_j w.r.t f_i and f_j
+
+        Will return the diagonal of the hessian, since everywhere else it is 0
+
+        $$\frac{d^{2}\ln p(y_{i}|f_{i})}{df_{i}^{2}} = -y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in the hessian
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        hess = -(y**self.shape)*np.exp(f)  # second derivative of the log-likelihood w.r.t. f
+        return np.squeeze(hess)
diff --git a/python/models/__init__.py b/python/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/models/coxGP.py b/python/models/coxGP.py
new file mode 100644
index 00000000..f61a8f46
--- /dev/null
+++ b/python/models/coxGP.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2013, Alan Saul
+
+from GPy.models import GP
+from .. import likelihoods
+from GPy import kern
+
+
+class cox_GP_regression(GP):
+    """
+    Cox Gaussian Process model for regression
+    """
+
+    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Xslices=None):
+        if kernel is None:
+            kernel = kern.rbf(X.shape[1])
+
+        likelihood = likelihoods.cox_piecewise(Y, normalize=normalize_Y)
+
+        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices)
diff --git a/python/testing/__init__.py b/python/testing/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/testing/cox_tests.py b/python/testing/cox_tests.py
new file mode 100644
index 00000000..526f5c92
--- /dev/null
+++ b/python/testing/cox_tests.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2013, Alan Saul
+
+import unittest
+import numpy as np
+import GPy
+
+class coxGPTests(unittest.TestCase):
+    def test_laplace_approx(self):
+        pass
+
+if __name__ == "__main__":
+    print "Running unit tests, please be (very) patient..."
+    unittest.main()
+
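+# A possible shape for test_laplace_approx above (a sketch only; the class and
+# function names are the ones used in python/examples/laplace_approximations.py
+# and python/likelihoods/, and would need importing here):
+#
+#     X = np.linspace(0, 10, 20)[:, None]
+#     Y = np.sin(X) + 0.05*np.random.randn(*X.shape)
+#     lik = Laplace(Y, student_t(10, sigma=0.3), rasm=True)
+#     m = GPy.models.GP(X, lik, GPy.kern.rbf(1))
+#     m.ensure_default_constraints()
+#     m.update_likelihood_approximation()
+#     self.assertTrue(np.all(np.isfinite(m.likelihood.f_hat)))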