merged in the GPy upstream
This commit is contained in: 203159eff5

12 changed files with 830 additions and 9 deletions
README.md (12 changed lines)

@@ -1,10 +1,4 @@
-GPy
-===
+coxGP
+=====
 
-A Gaussian processes framework in python
+Gaussian Process models of Cox proportional hazard models
 
-* [Online documentation](https://gpy.readthedocs.org/en/latest/)
-* [Unit tests (Travis-CI)](https://travis-ci.org/SheffieldML/GPy)
-
-
-Continuous integration status: [Travis-CI build status badge]
__init__.py (new file, 0 lines)

python/__init__.py (new file, 0 lines)

python/examples/__init__.py (new file, 0 lines)
python/examples/laplace_approximations.py (new file, 220 lines)

@@ -0,0 +1,220 @@
import GPy
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import t, norm
from coxGP.python.likelihoods.Laplace import Laplace
from coxGP.python.likelihoods.likelihood_function import student_t


def timing():
    real_var = 0.1
    times = 1
    deg_free = 10
    real_sd = np.sqrt(real_var)
    the_is = np.zeros(times)
    X = np.linspace(0.0, 10.0, 300)[:, None]

    for a in xrange(times):
        Y = np.sin(X) + np.random.randn(*X.shape)*real_var
        Yc = Y.copy()

        Yc[10] += 100
        Yc[25] += 10
        Yc[23] += 10
        Yc[24] += 10
        Yc[250] += 10
        #Yc[4] += 10000

        edited_real_sd = real_sd
        kernel1 = GPy.kern.rbf(X.shape[1])

        t_distribution = student_t(deg_free, sigma=edited_real_sd)
        corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
        m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
        m.ensure_default_constraints()
        m.update_likelihood_approximation()
        m.optimize()
        the_is[a] = m.likelihood.i

    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
    print the_is
    print np.mean(the_is)

def student_t_approx():
    """
    Example of regressing with a Student-t likelihood
    """
    real_var = 0.1
    #Start with a function, any function
    X = np.linspace(0.0, 10.0, 30)[:, None]
    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
    Yc = Y.copy()

    X_full = np.linspace(0.0, 10.0, 500)[:, None]
    Y_full = np.sin(X_full)

    #Y = Y/Y.max()

    Yc[10] += 100
    Yc[25] += 10
    Yc[23] += 10
    Yc[24] += 10
    #Yc = Yc/Yc.max()

    #Add Student-t random noise to the datapoints
    deg_free = 10
    real_sd = np.sqrt(real_var)
    #t_rv = t(deg_free, loc=0, scale=real_var)
    #noise = t_rv.rvs(size=Y.shape)
    #Y += noise

    #Add some extreme-value noise to some of the datapoints
    #percent_corrupted = 0.15
    #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
    #indices = np.arange(Y.shape[0])
    #np.random.shuffle(indices)
    #corrupted_indices = indices[:corrupted_datums]
    #print corrupted_indices
    #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
    #Y[corrupted_indices] += noise
    plt.figure(1)
    plt.suptitle('Gaussian likelihood')
    # Kernel object
    kernel1 = GPy.kern.rbf(X.shape[1])
    kernel2 = kernel1.copy()
    kernel3 = kernel1.copy()
    kernel4 = kernel1.copy()
    kernel5 = kernel1.copy()
    kernel6 = kernel1.copy()

    print "Clean Gaussian"
    #A plain GP should break down on the corrupted data, since the outlying points get a lot of weight
    # create simple GP model
    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
    # optimize
    m.ensure_default_constraints()
    m.optimize()
    # plot
    plt.subplot(211)
    m.plot()
    plt.plot(X_full, Y_full)
    print m

    #Corrupt
    print "Corrupt Gaussian"
    m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
    m.ensure_default_constraints()
    m.optimize()
    plt.subplot(212)
    m.plot()
    plt.plot(X_full, Y_full)
    print m

    plt.figure(2)
    plt.suptitle('Student-t likelihood')
    edited_real_sd = real_sd

    print "Clean student t, rasm"
    t_distribution = student_t(deg_free, sigma=edited_real_sd)
    stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
    m.ensure_default_constraints()
    m.update_likelihood_approximation()
    m.optimize()
    print(m)
    plt.subplot(222)
    m.plot()
    plt.plot(X_full, Y_full)
    plt.ylim(-2.5, 2.5)

    print "Corrupt student t, rasm"
    t_distribution = student_t(deg_free, sigma=edited_real_sd)
    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
    m.ensure_default_constraints()
    m.update_likelihood_approximation()
    m.optimize()
    print(m)
    plt.subplot(224)
    m.plot()
    plt.plot(X_full, Y_full)
    plt.ylim(-2.5, 2.5)

    print "Clean student t, ncg"
    t_distribution = student_t(deg_free, sigma=edited_real_sd)
    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
    m.ensure_default_constraints()
    m.update_likelihood_approximation()
    m.optimize()
    print(m)
    plt.subplot(221)
    m.plot()
    plt.plot(X_full, Y_full)
    plt.ylim(-2.5, 2.5)

    print "Corrupt student t, ncg"
    t_distribution = student_t(deg_free, sigma=edited_real_sd)
    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
    m.ensure_default_constraints()
    m.update_likelihood_approximation()
    m.optimize()
    print(m)
    plt.subplot(223)
    m.plot()
    plt.plot(X_full, Y_full)
    plt.ylim(-2.5, 2.5)

    ###With a Student-t distribution, since it has heavy tails, it should work well
    ###likelihood_function = student_t(deg_free, sigma=real_var)
    ###lap = Laplace(Y, likelihood_function)
    ###cov = kernel.K(X)
    ###lap.fit_full(cov)

    ###test_range = np.arange(0, 10, 0.1)
    ###plt.plot(test_range, t_rv.pdf(test_range))
    ###for i in xrange(X.shape[0]):
        ###mode = lap.f_hat[i]
        ###covariance = lap.hess_hat_i[i,i]
        ###scaling = np.exp(lap.ln_z_hat)
        ###normalised_approx = norm(loc=mode, scale=covariance)
        ###print "Normal with mode %f, and variance %f" % (mode, covariance)
        ###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
    ###plt.show()

    return m


def noisy_laplace_approx():
    """
    Example of regressing with a Student-t likelihood
    """
    #Start with a function, any function
    X = np.sort(np.random.uniform(0, 15, 70))[:, None]
    Y = np.sin(X)

    #Add some extreme-value noise to some of the datapoints
    percent_corrupted = 0.05
    corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
    indices = np.arange(Y.shape[0])
    np.random.shuffle(indices)
    corrupted_indices = indices[:corrupted_datums]
    print corrupted_indices
    noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1))
    Y[corrupted_indices] += noise

    #A plain GP should break down here, since the outlying points get a lot of weight
    # create simple GP model
    m = GPy.models.GP_regression(X, Y)

    # optimize
    m.ensure_default_constraints()
    m.optimize()
    # plot
    m.plot()
    print m

    #With a Student-t distribution, since it has heavy tails, it should work well
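Why the Student-t fits above survive the injected outliers while the Gaussian fits do not: the pull a single observation exerts on f is the gradient of its log-likelihood, which grows without bound for a Gaussian but is bounded for a Student-t. A minimal sketch, separate from the commit's code (the values of v, sigma and the residuals are arbitrary):

import numpy as np

v, sigma = 10, 0.3
residual = np.array([0.1, 1.0, 10.0, 100.0])   # y - f

# Gaussian: pull grows linearly with the residual, so one outlier dominates
gauss_pull = residual / sigma**2

# Student-t: pull peaks near sigma*sqrt(v) and then decays back toward zero,
# so a wildly corrupted point is effectively ignored (cf. link_grad below)
t_pull = (v + 1)*residual / (v*sigma**2 + residual**2)

print(gauss_pull)   # roughly [1.1,  11,  111, 1111]
print(t_pull)       # roughly [1.2, 5.8,  1.1, 0.11]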
python/likelihoods/Laplace.py (new file, 321 lines)

@@ -0,0 +1,321 @@
import numpy as np
import scipy as sp
import GPy
from scipy.linalg import cholesky, eig, inv, cho_solve, det
from numpy.linalg import cond
from GPy.likelihoods.likelihood import likelihood
from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
from scipy.linalg.lapack import dtrtrs

#TODO: Move this to utils


def det_ln_diag(A):
    """
    Log determinant of a diagonal matrix

    $$\ln |A| = \ln \prod_i{A_{ii}} = \sum_i{\ln A_{ii}}$$
    """
    return np.log(np.diagonal(A)).sum()


def pddet(A):
    """
    Log determinant of a positive definite matrix
    """
    L = cholesky(A)
    logdetA = 2*sum(np.log(np.diag(L)))
    return logdetA

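A quick numerical check of the two helpers (a standalone sketch, assuming only numpy and scipy): pddet should agree with numpy's slogdet on a random positive definite matrix, and det_ln_diag with the log determinant of a diagonal one.

import numpy as np
from scipy.linalg import cholesky

def det_ln_diag(A):
    return np.log(np.diagonal(A)).sum()

def pddet(A):
    L = cholesky(A)
    return 2*np.sum(np.log(np.diag(L)))

A = np.random.randn(5, 5)
A = np.dot(A, A.T) + 5*np.eye(5)        # make it positive definite
D = np.diag(np.random.rand(5) + 0.1)    # a positive diagonal matrix

print(np.allclose(pddet(A), np.linalg.slogdet(A)[1]))        # True
print(np.allclose(det_ln_diag(D), np.linalg.slogdet(D)[1]))  # True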
class Laplace(likelihood):
    """Laplace approximation to a posterior"""

    def __init__(self, data, likelihood_function, extra_data=None, rasm=True):
        """
        Laplace Approximation

        First find the mode \hat{f} and the Hessian at this point (using Newton-Raphson),
        then find the z' which makes this a normalised Gaussian instead of an
        unnormalised one.

        Finally we must compute the GP variables, i.e. generate some Y_tilde and z_tilde
        which make a Gaussian likelihood match the Laplace approximation.

        Arguments
        ---------

        :data: array of data the likelihood function is approximating
        :likelihood_function: likelihood function - subclass of likelihood_function
        :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
        :rasm: whether to use Rasmussen's numerically stable mode finding or simple ncg optimisation
        """
        self.data = data
        self.likelihood_function = likelihood_function
        self.extra_data = extra_data
        self.rasm = rasm

        #Initial values
        self.N, self.D = self.data.shape
        self.is_heteroscedastic = True
        self.Nparams = 0

        self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))

        #Initial values for the GP variables
        self.Y = np.zeros((self.N, 1))
        self.covariance_matrix = np.eye(self.N)
        self.precision = np.ones(self.N)[:, None]
        self.Z = 0
        self.YYT = None

    def predictive_values(self, mu, var, full_cov):
        if full_cov:
            raise NotImplementedError("Cannot make correlated predictions with a Laplace likelihood")
        return self.likelihood_function.predictive_values(mu, var)

    def _get_params(self):
        return np.zeros(0)

    def _get_param_names(self):
        return []

    def _set_params(self, p):
        pass # TODO: the Laplace likelihood might want to take some parameters...

    def _gradients(self, partial):
        return np.zeros(0) # TODO: the Laplace likelihood might want to take some parameters...

    def _compute_GP_variables(self):
        """
        Generates data Y which would give a normal distribution identical to the Laplace approximation.

        GPy expects a likelihood to be Gaussian, so we need to calculate the points Y_tilde and Z_tilde
        that make the posterior match the one found by a Laplace approximation to a non-Gaussian likelihood.

        Given that we are approximating $p(y|f)p(f)$ with a normal distribution (since $p(y|f)$ is not normal),
        we have a rescaled normal distribution $z N(f|\hat{f}, \hat{H}^{-1})$ with the same area as $p(y|f)p(f)$,
        due to the $z$ rescaling.

        This function finds the data $D = (\tilde{Y}, X)$ that would produce $z N(f|\hat{f}, \hat{H}^{-1})$,
        giving a normal approximation $\tilde{z}\, p(\tilde{Y}|f, X)p(f)$.

        $$\tilde{Y} = \tilde{\Sigma} \hat{H} \hat{f}$$
        where
        $$\tilde{\Sigma}^{-1} = \hat{H} - K^{-1} = diag(-\nabla\nabla \ln p(y|f)) = W$$
        so that
        $$\tilde{\Sigma} = W^{-1}$$
        and
        $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}^{\top}\tilde{\Sigma}^{-1}\tilde{Y}$$
        """
        #dtrtri -> L -> L_i
        #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
        #((L.T*W)_i*L_i + I)f_hat = Y_tilde
        L = jitchol(self.K)
        Li = chol_inv(L)
        Lt_W = np.dot(L.T, self.W)

        ##Check it isn't singular!
        if cond(Lt_W) > 1e14:
            print "WARNING: L.T*W matrix is singular,\nnumerical stability may be a problem"

        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)

        #f.T(Ki + W)f
        f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
                    + mdot(self.f_hat.T, self.W, self.f_hat)
                    )

        y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
        y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
        ln_W_det = det_ln_diag(self.W)
        Z_tilde = (- self.NORMAL_CONST
                   + 0.5*self.ln_K_det
                   + 0.5*ln_W_det
                   + 0.5*self.ln_Ki_W_i_det
                   + 0.5*f_Ki_W_f
                   + 0.5*y_W_y
                   - y_W_f
                   + self.ln_z_hat
                   )
        #Z_tilde = (self.NORMAL_CONST
                   #- 0.5*self.ln_K_det
                   #- 0.5*ln_W_det
                   #- 0.5*self.ln_Ki_W_i_det
                   #- 0.5*f_Ki_W_f
                   #- 0.5*y_W_y
                   #+ y_W_f
                   #+ self.ln_z_hat
                   #)

        ##Check it isn't singular!
        if cond(self.W) > 1e14:
            print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"

        Sigma_tilde = inv(self.W) # Damn

        #Convert to float as it is (1, 1) and Z must be a scalar
        self.Z = np.float64(Z_tilde)
        self.Y = Y_tilde
        self.YYT = np.dot(self.Y, self.Y.T)
        self.covariance_matrix = Sigma_tilde
        self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
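A minimal numeric check of the pseudo-data construction above (a standalone sketch with random stand-ins for K, W and f_hat, not the commit's code): regressing Y_tilde = (I + W^{-1}K^{-1}) f_hat under Gaussian noise W^{-1} with prior K must return f_hat as the posterior mean and (K^{-1} + W)^{-1} as the posterior covariance.

import numpy as np

N = 6
A = np.random.randn(N, N)
K = np.dot(A, A.T) + N*np.eye(N)       # stand-in positive definite prior covariance
W = np.diag(np.random.rand(N) + 0.5)   # stand-in positive diagonal negative hessian
f_hat = np.random.randn(N, 1)

Ki = np.linalg.inv(K)
Wi = np.linalg.inv(W)
Y_tilde = np.dot(np.eye(N) + np.dot(Wi, Ki), f_hat)

# standard GP regression posterior with noise covariance W^{-1}
post_mean = np.dot(K, np.linalg.solve(K + Wi, Y_tilde))
post_cov = K - np.dot(K, np.linalg.solve(K + Wi, K))

print(np.allclose(post_mean, f_hat))                  # True
print(np.allclose(post_cov, np.linalg.inv(Ki + W)))   # True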
    def fit_full(self, K):
        """
        The Laplace approximation algorithm
        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
        :K: Covariance matrix
        """
        self.K = K.copy()
        if self.rasm:
            self.f_hat = self.rasm_mode(K)
        else:
            self.f_hat = self.ncg_mode(K)

        #At this point get the hessian matrix
        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))

        if not self.likelihood_function.log_concave:
            self.W[self.W < 0] = 1e-6 # FIXME-HACK: a non-log-concave likelihood can give a negative variance here,
                                      # which would make the posterior less certain than the prior and likelihood
                                      # (a property only non-log-concave likelihoods have), but GPy can't handle
                                      # negative variances, so clip to a small positive value

        #TODO: Could save on computation when using rasm by returning these from the mode finder, though then it isn't just a "mode finder"
        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
        self.Bi, _, _, B_det = pdinv(self.B)

        Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
        self.ln_Ki_W_i_det = pddet(Ki_W_i)

        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
        a = b - mdot(self.W_12, solve_chol)
        self.f_Ki_f = np.dot(self.f_hat.T, a)
        self.ln_K_det = pddet(self.K)

        self.ln_z_hat = (- 0.5*self.f_Ki_f
                         - 0.5*self.ln_K_det
                         + 0.5*self.ln_Ki_W_i_det
                         + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
                         )

        return self._compute_GP_variables()
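In Rasmussen & Williams notation, what fit_full assembles in ln_z_hat is the Laplace estimate of the log marginal likelihood; the two determinant terms above are just $-\tfrac{1}{2}\ln|B|$ split apart, since $\ln|B| = \ln|K| + \ln|K^{-1}+W| = \ln|K| - \ln|(K^{-1}+W)^{-1}|$:

$$\ln \hat{z} \approx \ln q(\mathbf{y}|X) = -\tfrac{1}{2}\,\hat{f}^{\top}K^{-1}\hat{f} + \ln p(\mathbf{y}|\hat{f}) - \tfrac{1}{2}\ln|B|$$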
    def _compute_B_statistics(self, K, W):
        """Rasmussen suggests the use of a numerically stable positive definite matrix B
        which has a positive diagonal and can be easily inverted

        :K: Covariance matrix
        :W: Negative hessian at a point (diagonal matrix)
        :returns: (B, B_chol, W_12)
        """
        #W is diagonal so its square root is just the square root of the diagonal elements
        W_12 = np.sqrt(W)
        B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
        L = jitchol(B)
        return (B, L, W_12)
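A small check of why B = I + W^{1/2} K W^{1/2} is the numerically safe quantity to factorise (a standalone sketch): W^{1/2} K W^{1/2} is positive semi-definite, so every eigenvalue of B is at least 1 and the Cholesky factorisation cannot encounter a non-positive pivot.

import numpy as np

N = 8
A = np.random.randn(N, N)
K = np.dot(A, A.T)                          # positive semi-definite covariance
W_12 = np.diag(np.sqrt(np.random.rand(N)))  # square root of a positive diagonal W
B = np.eye(N) + np.dot(W_12, np.dot(K, W_12))

print(np.linalg.eigvalsh(B).min() >= 1 - 1e-10)   # True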
    def ncg_mode(self, K):
        """
        Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuitive)
        :K: Covariance matrix
        :returns: f_mode
        """
        self.Ki, _, _, self.ln_K_det = pdinv(K)

        f = np.zeros((self.N, 1))

        #FIXME: Can we get rid of this horrible reshaping?
        #ONLY WORKS FOR 1D DATA
        def obj(f):
            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data)
                        - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
                        + self.NORMAL_CONST)
            return float(res)

        def obj_grad(f):
            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
            return np.squeeze(res)

        def obj_hess(f):
            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
            return np.squeeze(res)

        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
        return f_hat[:, None]
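For reference, the quantity ncg_mode minimises is the negative unnormalised log posterior (hence the -1 factors in obj, obj_grad and obj_hess):

$$\Psi(f) = -\ln p(y|f) + \tfrac{1}{2} f^{\top} K^{-1} f + \tfrac{N}{2}\ln 2\pi, \qquad \nabla\Psi = K^{-1}f - \nabla \ln p(y|f), \qquad \nabla\nabla\Psi = K^{-1} + W$$

with $W = diag(-\nabla\nabla \ln p(y|f))$.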
    def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50):
        """
        Rasmussen's numerically stable mode finding
        For nomenclature see Rasmussen & Williams 2006

        :K: Covariance matrix
        :MAX_ITER: Maximum number of Newton-Raphson iterations before forcing the optimisation to finish
        :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing the optimisation to finish
        :returns: f_mode
        """
        f = np.zeros((self.N, 1))
        new_obj = -np.inf
        old_obj = np.inf

        def obj(a, f):
            #Careful of the shape of the data!
            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)

        difference = np.inf
        epsilon = 1e-6
        step_size = 1
        rs = 0
        i = 0
        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
            f_old = f.copy()
            W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
            if not self.likelihood_function.log_concave:
                W[W < 0] = 1e-6 # FIXME-HACK: see fit_full - GPy can't handle the negative variances
                                # that non-log-concave likelihoods produce, so clip them
            B, L, W_12 = self._compute_B_statistics(K, W)

            W_f = np.dot(W, f)
            grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None]
            b = W_f + grad

            #Take a damped Newton step in the numerically stable form
            c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
            solve_L = cho_solve((L, True), np.dot(W_12, c))
            f = c - np.dot(K, np.dot(W_12, solve_L))

            #a should now equal Ki*f, so it can be used in the objective
            solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b)))
            a = b - np.dot(W_12, solve_L)
            #f = np.dot(K, a)

            tmp_old_obj = old_obj
            old_obj = new_obj
            new_obj = obj(a, f)
            difference = new_obj - old_obj
            if difference < 0:
                #print "Objective function fell by", difference
                #The objective function isn't increasing: reduce the step size and restart the optimisation
                step_size *= 0.9
                #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                #f = f_old #it's actually faster not to go back to the old location and just zigzag across the mode
                old_obj = tmp_old_obj
                rs += 1

            difference = abs(difference)
            i += 1

        self.i = i
        return f
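The damped update in the loop above is, in exact arithmetic, the stable form of the Newton step from Rasmussen & Williams 2006 (Algorithm 3.1); a sketch of the identity with step_size = 1, using $B = I + W^{1/2} K W^{1/2}$ and the Woodbury identity on $(K^{-1}+W)^{-1}$:

$$f^{\mathrm{new}} = \left(K^{-1} + W\right)^{-1} b = K\left(b - W^{1/2} B^{-1} W^{1/2} K b\right) = Ka, \qquad b = Wf + \nabla \ln p(y|f)$$

so $a = K^{-1} f^{\mathrm{new}}$, which is why obj can use $-\tfrac{1}{2} a^{\top} f$ for the quadratic term without ever forming $K^{-1}$ explicitly.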
python/likelihoods/__init__.py (new file, 0 lines)

python/likelihoods/likelihood_function.py (new file, 253 lines)

@@ -0,0 +1,253 @@
from scipy.special import gammaln, gamma
from scipy import integrate
import numpy as np
from GPy.likelihoods.likelihood_functions import likelihood_function
from scipy import stats


class student_t(likelihood_function):
    """Student-t likelihood distribution
    For nomenclature see Bayesian Data Analysis 2003 p576

    $$\ln p(y_{i}|f_{i}) = \ln \Gamma\left(\frac{v+1}{2}\right) - \ln \left(\Gamma\left(\frac{v}{2}\right)\sigma\sqrt{v \pi}\right) - \frac{v+1}{2}\ln \left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$

    Laplace:
    Needs functions to calculate
    ln p(yi|fi)
    dln p(yi|fi)_dfi
    d2ln p(yi|fi)_d2fifj
    """
    def __init__(self, deg_free, sigma=2):
        self.v = deg_free
        self.sigma = sigma

        #FIXME: This should be in the superclass
        self.log_concave = False

    @property
    def variance(self, extra_data=None):
        return (self.v / float(self.v - 2)) * (self.sigma**2)
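A quick empirical check of the variance property (a standalone sketch; the sample size is arbitrary, chosen only to make the estimate stable): the sample variance of Student-t draws should approach v/(v-2) * sigma^2.

import numpy as np
from scipy import stats

v, sigma = 10, 0.3
samples = stats.t.rvs(v, loc=0, scale=sigma, size=500000)
print(samples.var())              # ~0.1125, up to Monte Carlo error
print((v / (v - 2.0))*sigma**2)   # 0.1125 exactly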
    def link_function(self, y, f, extra_data=None):
        """link_function $\ln p(y|f)$
        $$\ln p(y_{i}|f_{i}) = \ln \Gamma\left(\frac{v+1}{2}\right) - \ln \left(\Gamma\left(\frac{v}{2}\right)\sigma\sqrt{v \pi}\right) - \frac{v+1}{2}\ln \left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$

        :y: data
        :f: latent variables f
        :extra_data: extra_data which is not used by the student t distribution
        :returns: float(likelihood evaluated for this point)
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape

        e = y - f
        objective = (gammaln((self.v + 1) * 0.5)
                     - gammaln(self.v * 0.5)
                     - np.log(self.sigma * np.sqrt(self.v * np.pi))
                     - (self.v + 1) * 0.5
                     * np.log(1 + ((e**2 / self.sigma**2) / self.v))
                     )
        return np.sum(objective)

    def link_grad(self, y, f, extra_data=None):
        """
        Gradient of the link function at y, given f, w.r.t f

        $$\frac{d \ln p(y_{i}|f_{i})}{df_{i}} = \frac{(v + 1)(y_{i} - f_{i})}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$

        :y: data
        :f: latent variables f
        :extra_data: extra_data which is not used by the student t distribution
        :returns: gradient of likelihood evaluated at points
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape
        e = y - f
        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
        return np.squeeze(grad)

    def link_hess(self, y, f, extra_data=None):
        """
        Hessian at this point (since we are only looking at the link function, not the prior, the hessian is 0 unless i == j),
        i.e. the second derivative of the link_function at y given f, w.r.t f_i and f_j

        Returns only the diagonal of the hessian, since everywhere else it is 0

        $$\frac{d^{2} \ln p(y_{i}|f_{i})}{df_{i}^{2}} = \frac{(v + 1)\left((y_{i} - f_{i})^{2} - v \sigma^{2}\right)}{\left(v \sigma^{2} + (y_{i} - f_{i})^{2}\right)^{2}}$$

        :y: data
        :f: latent variables f
        :extra_data: extra_data which is not used by the student t distribution
        :returns: array which is the diagonal of the hessian (second derivative of likelihood evaluated at points)
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape
        e = y - f
        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
        return np.squeeze(hess)
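A finite-difference sanity check of link_grad and link_hess against link_function (a standalone sketch; y, f and eps are arbitrary test values):

import numpy as np
from scipy.special import gammaln

v, sigma = 10, 0.3
y, f, eps = 1.3, 0.9, 1e-4

def lnp(f):
    e = y - f
    return (gammaln((v + 1)*0.5) - gammaln(v*0.5)
            - np.log(sigma*np.sqrt(v*np.pi))
            - (v + 1)*0.5*np.log(1 + (e**2/sigma**2)/v))

grad = (v + 1)*(y - f) / (v*sigma**2 + (y - f)**2)
hess = (v + 1)*((y - f)**2 - v*sigma**2) / ((v*sigma**2 + (y - f)**2)**2)

print(np.allclose(grad, (lnp(f + eps) - lnp(f - eps)) / (2*eps)))            # True
print(np.allclose(hess, (lnp(f + eps) - 2*lnp(f) + lnp(f - eps)) / eps**2,
                  atol=1e-5))                                                # True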
    def predictive_values(self, mu, var):
        """
        Compute the mean and confidence interval (percentiles 2.5 and 97.5) of the prediction

        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*):
        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
        """
        #We want the variance around test points y, which comes from int p(y*|f*)p(f*) df*
        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
        #Since we are given f* (mu), which is our mean (expected) value of y*|f*, the first term is the variance
        #around it, which was also given to us as (var).
        #We also need the expected variance of y* around samples f*; this is the variance of the student t distribution.
        #However the variance of the student t distribution does not depend on f, only on sigma and the degrees of freedom.
        true_var = var + self.variance

        #Now we have an analytical solution for the variance of the distribution p(y*|f*)p(f*) around our test points,
        #but we still need the 2.5 and 97.5 percentiles.
        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
        p_025 = mu - 2.*np.sqrt(true_var)
        p_975 = mu + 2.*np.sqrt(true_var)

        return mu, np.nan*mu, p_025, p_975
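The comment block above is the law of total variance; for the Student-t observation model it reduces to (assuming v > 2, so the t variance exists):

$$\operatorname{Var}(y_{*}) = \operatorname{Var}\left(\mathbb{E}[y_{*}|f_{*}]\right) + \mathbb{E}\left[\operatorname{Var}(y_{*}|f_{*})\right] = \operatorname{Var}(f_{*}) + \frac{v}{v-2}\,\sigma^{2}$$

which is exactly true_var = var + self.variance in the code.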
    def sample_predicted_values(self, mu, var):
        """ Experimental sampling approaches and numerical integration """
        #p_025 = stats.t.ppf(.025, mu)
        #p_975 = stats.t.ppf(.975, mu)

        num_test_points = mu.shape[0]
        #Each mu is the latent point f* at the test point x*,
        #and var is the gaussian variance at this point.
        #Take lots of samples from this, so we have lots of possible values
        #for the latent point f* for each test point x*, weighted by how likely we were to pick them
        print "Taking %d samples of f*" % num_test_points
        num_f_samples = 10
        num_y_samples = 10
        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
        print "Student t means shape: ", student_t_means.shape

        #Now we have lots of f*; sample many points from a student t
        #centred on each f*
        #for test_point, f in enumerate(student_t_means):
            #print test_point
            #print f.shape
            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
                                            #scale=self.sigma,
                                            #size=(num_f_samples, num_y_samples))
            #print student_t_samples.shape

        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
                                        scale=self.sigma,
                                        size=(num_test_points, num_y_samples, num_f_samples))
        student_t_samples = np.reshape(student_t_samples,
                                       (num_test_points, num_y_samples*num_f_samples))

        #Now take the 2.5 and 97.5 percentiles of these points
        p_025 = stats.scoreatpercentile(student_t_samples, 2.5, axis=1)[:, None]
        p_975 = stats.scoreatpercentile(student_t_samples, 97.5, axis=1)[:, None]

        ##Alternatively we could sample from int p(y|f*)p(f*|x*) df*
        def t_gaussian(f, mu, var):
            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var))*((mu-f)**2)))
                    )

        def t_gauss_int(mu, var):
            print "Mu: ", mu
            print "var: ", var
            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
            print "Result: ", result
            return result[0]

        vec_t_gauss_int = np.vectorize(t_gauss_int)

        p = vec_t_gauss_int(mu, var)
        p_025 = mu - p
        p_975 = mu + p
        return mu, np.nan*mu, p_025, p_975

class weibull_survival(likelihood_function):
    """Weibull likelihood distribution for survival analysis with censoring
    For nomenclature see Bayesian Survival Analysis

    Laplace:
    Needs functions to calculate
    ln p(yi|fi)
    dln p(yi|fi)_dfi
    d2ln p(yi|fi)_d2fifj
    """
    def __init__(self, shape, scale):
        self.shape = shape
        self.scale = scale

        #FIXME: This should be in the superclass
        self.log_concave = True

    def link_function(self, y, f, extra_data=None):
        """
        link_function $\ln p(y|f)$, i.e. log likelihood

        $$\ln p(y_{i}|f_{i}) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$

        :y: time of event data
        :f: latent variables f
        :extra_data: the event indicator, 1 for an observed event, 0 for censored
        :returns: float(likelihood evaluated for this point)
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape

        v = extra_data
        objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f) # FIXME: CHECK THIS WITH BOOK, where's scale?
        return np.sum(objective)

    def link_grad(self, y, f, extra_data=None):
        """
        Gradient of the link function at y, given f, w.r.t f

        $$\frac{d \ln p(y_{i}|f_{i})}{df_{i}} = v_{i} - y_{i}^{\alpha}\exp(f_{i})$$

        :y: data
        :f: latent variables f
        :extra_data: the event indicator, 1 for an observed event, 0 for censored
        :returns: gradient of likelihood evaluated at points
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape

        v = extra_data
        grad = v - (y**self.shape)*np.exp(f)
        return np.squeeze(grad)

    def link_hess(self, y, f, extra_data=None):
        """
        Hessian at this point (since we are only looking at the link function, not the prior, the hessian is 0 unless i == j),
        i.e. the second derivative of the link_function at y given f, w.r.t f_i and f_j

        Returns only the diagonal of the hessian, since everywhere else it is 0

        $$\frac{d^{2} \ln p(y_{i}|f_{i})}{df_{i}^{2}} = -y_{i}^{\alpha}\exp(f_{i})$$

        :y: data
        :f: latent variables f
        :extra_data: extra_data which is not used by the hessian
        :returns: array which is the diagonal of the hessian (second derivative of likelihood evaluated at points)
        """
        y = np.squeeze(y)
        f = np.squeeze(f)
        assert y.shape == f.shape

        hess = -(y**self.shape)*np.exp(f)
        return np.squeeze(hess)
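For context, the survival model these three functions encode (hedged: the Weibull proportional-hazards form below is inferred from link_function, not stated in the commit): with hazard $h(y_i|f_i) = \alpha y_i^{\alpha-1} e^{f_i}$ and survival function $S(y_i|f_i) = \exp(-y_i^{\alpha} e^{f_i})$, the censored log likelihood is

$$\ln p(y_i|f_i) = v_i \ln h(y_i|f_i) + \ln S(y_i|f_i) = v_i\left(\ln\alpha + (\alpha-1)\ln y_i + f_i\right) - y_i^{\alpha} e^{f_i}$$

which matches the objective in the code, with $v_i = 1$ for an observed event and $v_i = 0$ for a censored one.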
python/models/__init__.py (new file, 0 lines)

python/models/coxGP.py (new file, 19 lines)

@@ -0,0 +1,19 @@
# Copyright (c) 2013, Alan Saul

from GPy.models import GP
from .. import likelihoods
from GPy import kern


class cox_GP_regression(GP):
    """
    Cox Gaussian Process model for regression
    """

    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Xslices=None):
        if kernel is None:
            kernel = kern.rbf(X.shape[1])

        likelihood = likelihoods.cox_piecewise(Y, normalize=normalize_Y)

        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices)
python/testing/__init__.py (new file, 0 lines)

python/testing/cox_tests.py (new file, 14 lines)

@@ -0,0 +1,14 @@
# Copyright (c) 2013, Alan Saul

import unittest
import numpy as np
import GPy

class coxGPTests(unittest.TestCase):
    def test_laplace_approx(self):
        pass

if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
    unittest.main()