diff --git a/README.md b/README.md
index 09bc78f5..b15e04cc 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,4 @@
-GPy
-===
+coxGP
+=====
-A Gaussian processes framework in python
-
-* [Online documentation](https://gpy.readthedocs.org/en/latest/)
-* [Unit tests (Travis-CI)](https://travis-ci.org/SheffieldML/GPy)
-
-
-Continuous integration status: ![CI status](https://travis-ci.org/SheffieldML/GPy.png)
+Gaussian Process models of Cox proportional hazard models
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/__init__.py b/python/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/examples/__init__.py b/python/examples/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
new file mode 100644
index 00000000..5b1331b6
--- /dev/null
+++ b/python/examples/laplace_approximations.py
@@ -0,0 +1,220 @@
+import GPy
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.stats import t, norm
+from coxGP.python.likelihoods.Laplace import Laplace
+from coxGP.python.likelihoods.likelihood_function import student_t
+
+
+def timing():
+    real_var = 0.1
+    times = 1
+    deg_free = 10
+    real_sd = np.sqrt(real_var)
+    the_is = np.zeros(times)
+    X = np.linspace(0.0, 10.0, 300)[:, None]
+
+    for a in xrange(times):
+        Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+        Yc = Y.copy()
+
+        Yc[10] += 100
+        Yc[25] += 10
+        Yc[23] += 10
+        Yc[24] += 10
+        Yc[250] += 10
+        #Yc[4] += 10000
+
+        edited_real_sd = real_sd
+        kernel1 = GPy.kern.rbf(X.shape[1])
+
+        t_distribution = student_t(deg_free, sigma=edited_real_sd)
+        corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
+        m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
+        m.ensure_default_constraints()
+        m.update_likelihood_approximation()
+        m.optimize()
+        the_is[a] = m.likelihood.i
+
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    print the_is
+    print np.mean(the_is)
+
+
+def student_t_approx():
+    """
+    Example of regressing with a student t likelihood
+    """
+    real_var = 0.1
+    #Start with a function, any function
+    X = np.linspace(0.0, 10.0, 30)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+    Yc = Y.copy()
+
+    X_full = np.linspace(0.0, 10.0, 500)[:, None]
+    Y_full = np.sin(X_full)
+
+    #Y = Y/Y.max()
+
+    Yc[10] += 100
+    Yc[25] += 10
+    Yc[23] += 10
+    Yc[24] += 10
+    #Yc = Yc/Yc.max()
+
+    #Add student t random noise to datapoints
+    deg_free = 10
+    real_sd = np.sqrt(real_var)
+    #t_rv = t(deg_free, loc=0, scale=real_var)
+    #noise = t_rv.rvs(size=Y.shape)
+    #Y += noise
+
+    #Add some extreme value noise to some of the datapoints
+    #percent_corrupted = 0.15
+    #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
+    #indices = np.arange(Y.shape[0])
+    #np.random.shuffle(indices)
+    #corrupted_indices = indices[:corrupted_datums]
+    #print corrupted_indices
+    #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
+    #Y[corrupted_indices] += noise
+    plt.figure(1)
+    plt.suptitle('Gaussian likelihood')
+    # Kernel object
+    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()
+    kernel5 = kernel1.copy()
+    kernel6 = kernel1.copy()
+
+    print "Clean Gaussian"
+    #A Gaussian likelihood should completely break down on the corrupted data, since the outlying points get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+    # plot
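+    # (Subplot 211 below shows the Gaussian-likelihood fit to the clean data;
+    #  subplot 212 shows the same model fit to the corrupted copy Yc.)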
+    plt.subplot(211)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    print m
+
+    #Corrupt
+    print "Corrupt Gaussian"
+    m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    m.ensure_default_constraints()
+    m.optimize()
+    plt.subplot(212)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    print m
+
+    plt.figure(2)
+    plt.suptitle('Student-t likelihood')
+    edited_real_sd = real_sd
+
+    print "Clean student t, rasm"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(222)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Corrupt student t, rasm"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(224)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Clean student t, ncg"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(221)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Corrupt student t, ncg"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(223)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+
+    ###with a student t distribution, since it has heavy tails it should work well
+    ###likelihood_function = student_t(deg_free, sigma=real_var)
+    ###lap = Laplace(Y, likelihood_function)
+    ###cov = kernel.K(X)
+    ###lap.fit_full(cov)
+
+    ###test_range = np.arange(0, 10, 0.1)
+    ###plt.plot(test_range, t_rv.pdf(test_range))
+    ###for i in xrange(X.shape[0]):
+        ###mode = lap.f_hat[i]
+        ###covariance = lap.hess_hat_i[i,i]
+        ###scaling = np.exp(lap.ln_z_hat)
+        ###normalised_approx = norm(loc=mode, scale=covariance)
+        ###print "Normal with mode %f, and variance %f" % (mode, covariance)
+        ###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    ###plt.show()
+
+    return m
+
+
+def noisy_laplace_approx():
+    """
+    Example of a Gaussian likelihood breaking down on data with extreme outliers
+    (a student t likelihood should cope with these)
+    """
+    #Start with a function, any function
+    X = np.sort(np.random.uniform(0, 15, 70))[:, None]
+    Y = np.sin(X)
+
+    #Add some extreme value noise to some of the datapoints
+    percent_corrupted = 0.05
+    corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
+    indices = np.arange(Y.shape[0])
+    np.random.shuffle(indices)
+    corrupted_indices = indices[:corrupted_datums]
+    print corrupted_indices
+    noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1))
+    Y[corrupted_indices] += noise
+
+    #A GP with a Gaussian likelihood should completely break down here, since the outlying points get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y)
+
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+    # plot
+    m.plot()
+    print m
+
+    #with a student t distribution, since it has heavy tails it should work well
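+
+
+if __name__ == "__main__":
+    # Hypothetical entry point, not part of the original example file: run the
+    # Student-t comparison and display the figures (assumes an interactive
+    # matplotlib backend).
+    student_t_approx()
+    plt.show()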
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
new file mode 100644
index 00000000..4d94ba0f
--- /dev/null
+++ b/python/likelihoods/Laplace.py
@@ -0,0 +1,321 @@
+import numpy as np
+import scipy as sp
+import GPy
+from scipy.linalg import cholesky, eig, inv, cho_solve, det
+from numpy.linalg import cond
+from GPy.likelihoods.likelihood import likelihood
+from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
+from scipy.linalg.lapack import dtrtrs
+
+#TODO: Move this to utils
+
+
+def det_ln_diag(A):
+    """
+    Log determinant of a diagonal matrix
+    $$\ln |A| = \ln \prod_{i}{A_{ii}} = \sum_{i}{\ln A_{ii}}$$
+    """
+    return np.log(np.diagonal(A)).sum()
+
+
+def pddet(A):
+    """
+    Log determinant of a positive definite matrix
+    """
+    L = cholesky(A)
+    logdetA = 2*sum(np.log(np.diag(L)))
+    return logdetA
+
+
+class Laplace(likelihood):
+    """Laplace approximation to a posterior"""
+
+    def __init__(self, data, likelihood_function, extra_data=None, rasm=True):
+        """
+        Laplace Approximation
+
+        First find the mode \hat{f} and the hessian at this point (using Newton-Raphson),
+        then find the z^{prime} which allows this to be a normalised gaussian instead of a
+        non-normalised gaussian.
+
+        Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle}
+        which make a gaussian the same as the laplace approximation).
+
+        Arguments
+        ---------
+
+        :data: array of data the likelihood function is approximating
+        :likelihood_function: likelihood function - subclass of likelihood_function
+        :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
+        :rasm: flag of whether to use Rasmussen's numerically stable mode finding or simple ncg optimisation
+
+        """
+        self.data = data
+        self.likelihood_function = likelihood_function
+        self.extra_data = extra_data
+        self.rasm = rasm
+
+        #Initial values
+        self.N, self.D = self.data.shape
+        self.is_heteroscedastic = True
+        self.Nparams = 0
+
+        self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))
+
+        #Initial values for the GP variables
+        self.Y = np.zeros((self.N, 1))
+        self.covariance_matrix = np.eye(self.N)
+        self.precision = np.ones(self.N)[:, None]
+        self.Z = 0
+        self.YYT = None
+
+    def predictive_values(self, mu, var, full_cov):
+        if full_cov:
+            raise NotImplementedError("Cannot make correlated predictions with a Laplace likelihood")
+        return self.likelihood_function.predictive_values(mu, var)
+
+    def _get_params(self):
+        return np.zeros(0)
+
+    def _get_param_names(self):
+        return []
+
+    def _set_params(self, p):
+        pass  # TODO: Laplace likelihood might want to take some parameters...
+
+    def _gradients(self, partial):
+        return np.zeros(0)  # TODO: Laplace likelihood might want to take some parameters...
+        raise NotImplementedError
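+
+    # A possible shape for the TODOs above (a sketch only, not implemented in
+    # this patch): exposing the noise scale of the wrapped likelihood would
+    # look roughly like
+    #
+    #     def _get_param_names(self):
+    #         return ['noise_sigma']
+    #
+    #     def _get_params(self):
+    #         return np.asarray([self.likelihood_function.sigma])
+    #
+    #     def _set_params(self, p):
+    #         self.likelihood_function.sigma = float(p)
+    #
+    # together with the matching derivative in _gradients().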
+
+    def _compute_GP_variables(self):
+        """
+        Generates data Y which would give the normal distribution identical to the laplace approximation
+
+        GPy expects a likelihood to be gaussian, so we need to calculate the points Y^{squiggle} and Z^{squiggle}
+        that make the posterior match that found by a laplace approximation to a non-gaussian likelihood
+
+        Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal)
+        then we have a rescaled normal distribution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f)
+        due to the z rescaling.
+
+        At the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^-1)
+
+        This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^-1)
+        giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f)
+
+        $$\tilde{Y} = \tilde{\Sigma} Hf$$
+        where
+        $$\tilde{\Sigma}^{-1} = H - K^{-1}$$
+        i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log p(y|f))$$
+        since $diag(\nabla\nabla \log p(y|f)) = H - K^{-1}$
+        and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}^{T}\tilde{\Sigma}^{-1}\tilde{Y}$$
+        $$\tilde{\Sigma} = W^{-1}$$
+
+        """
+        epsilon = 1e-6
+
+        #dtritri -> L -> L_i
+        #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
+        #((L.T*W)_i + I)f_hat = y_tilde
+        L = jitchol(self.K)
+        Li = chol_inv(L)
+        Lt_W = np.dot(L.T, self.W)
+
+        ##Check it isn't singular!
+        if cond(Lt_W) > 1e14:
+            print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
+
+        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
+        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
+
+        #f.T(Ki + W)f
+        f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
+                    + mdot(self.f_hat.T, self.W, self.f_hat)
+                    )
+
+        y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
+        y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
+        ln_W_det = det_ln_diag(self.W)
+        Z_tilde = (- self.NORMAL_CONST
+                   + 0.5*self.ln_K_det
+                   + 0.5*ln_W_det
+                   + 0.5*self.ln_Ki_W_i_det
+                   + 0.5*f_Ki_W_f
+                   + 0.5*y_W_y
+                   - y_W_f
+                   + self.ln_z_hat
+                   )
+        #Z_tilde = (self.NORMAL_CONST
+                   #- 0.5*self.ln_K_det
+                   #- 0.5*ln_W_det
+                   #- 0.5*self.ln_Ki_W_i_det
+                   #- 0.5*f_Ki_W_f
+                   #- 0.5*y_W_y
+                   #+ y_W_f
+                   #+ self.ln_z_hat
+                   #)
+
+        ##Check it isn't singular!
+        if cond(self.W) > 1e14:
+            print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
+
+        Sigma_tilde = inv(self.W)  # Damn
+
+        #Convert to float as its (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
+        self.YYT = np.dot(self.Y, self.Y.T)
+        self.covariance_matrix = Sigma_tilde
+        self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
+
+    def fit_full(self, K):
+        """
+        The laplace approximation algorithm
+        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
+        :K: Covariance matrix
+        """
+        self.K = K.copy()
+        if self.rasm:
+            self.f_hat = self.rasm_mode(K)
+        else:
+            self.f_hat = self.ncg_mode(K)
+
+        #At this point get the negative hessian, W
+        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))
+
+        if not self.likelihood_function.log_concave:
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            #If the likelihood is non-log-concave we want to say that there is a negative variance,
+            #to cause the posterior to become less certain than the prior and likelihood.
+            #This is a property only held by non-log-concave likelihoods.
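+            #(For the student t likelihood defined in likelihood_function.py this
+            # happens exactly when (y_i - f_i)^2 > v*sigma^2, i.e. for the
+            # outlying points the heavy tails are meant to absorb.)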
+
+        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
+        self.Bi, _, _, B_det = pdinv(self.B)
+
+        Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
+        self.ln_Ki_W_i_det = np.log(np.linalg.det(Ki_W_i))  # log determinant of (K^{-1} + W)^{-1}
+
+        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
+        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
+        a = b - mdot(self.W_12, solve_chol)
+        self.f_Ki_f = np.dot(self.f_hat.T, a)
+        self.ln_K_det = pddet(self.K)
+
+        self.ln_z_hat = (- 0.5*self.f_Ki_f
+                         - 0.5*self.ln_K_det
+                         + 0.5*self.ln_Ki_W_i_det
+                         + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+                         )
+
+        return self._compute_GP_variables()
+
+    def _compute_B_statistics(self, K, W):
+        """Rasmussen suggests the use of a numerically stable positive definite matrix B
+        which has a positive diagonal and can be easily inverted
+
+        :K: Covariance matrix
+        :W: Negative hessian at a point (diagonal matrix)
+        :returns: (B, L, W_12)
+        """
+        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
+        W_12 = np.sqrt(W)
+        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
+        L = jitchol(B)
+        return (B, L, W_12)
+
+    def ncg_mode(self, K):
+        """
+        Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuitive)
+        :K: Covariance matrix
+        :returns: f_mode
+        """
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
+
+        f = np.zeros((self.N, 1))
+
+        #FIXME: Can we get rid of this horrible reshaping?
+        #ONLY WORKS FOR 1D DATA
+        def obj(f):
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f)) + self.NORMAL_CONST)
+            return float(res)
+
+        def obj_grad(f):
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
+            return np.squeeze(res)
+
+        def obj_hess(f):
+            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
+            return np.squeeze(res)
+
+        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        return f_hat[:, None]
+
+    def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50):
+        """
+        Rasmussen's numerically stable mode finding
+        For nomenclature see Rasmussen & Williams 2006
+
+        :K: Covariance matrix
+        :MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
+        :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
+        :returns: f_mode
+        """
+        f = np.zeros((self.N, 1))
+        new_obj = -np.inf
+        old_obj = np.inf
+
+        def obj(a, f):
+            #Careful of shape of data!
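+            #(a is maintained so that a = K^{-1} f, so this is the Rasmussen &
+            # Williams objective Psi(f) = -0.5 f^T K^{-1} f + ln p(y|f), up to a
+            # constant, evaluated without ever forming K^{-1} explicitly.)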
+            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
+
+        difference = np.inf
+        epsilon = 1e-6
+        step_size = 1
+        rs = 0
+        i = 0
+        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
+            f_old = f.copy()
+            W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
+            if not self.likelihood_function.log_concave:
+                W[W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                # If the likelihood is non-log-concave we want to say that there is a negative variance,
+                # to cause the posterior to become less certain than the prior and likelihood.
+                # This is a property only held by non-log-concave likelihoods.
+            B, L, W_12 = self._compute_B_statistics(K, W)
+
+            W_f = np.dot(W, f)
+            grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None]
+            #Find K_i_f
+            b = W_f + grad
+
+            #a should be equal to Ki*f now so should be able to use it
+            c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
+            solve_L = cho_solve((L, True), np.dot(W_12, c))
+            f = c - np.dot(K, np.dot(W_12, solve_L))
+
+            solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b)))
+            a = b - np.dot(W_12, solve_L)
+            #f = np.dot(K, a)
+
+            tmp_old_obj = old_obj
+            old_obj = new_obj
+            new_obj = obj(a, f)
+            difference = new_obj - old_obj
+            if difference < 0:
+                #print "Objective function fell", difference
+                #If the objective function isn't rising, reduce the step size and restart the optimisation
+                step_size *= 0.9
+                #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
+                old_obj = tmp_old_obj
+                rs += 1
+
+            difference = abs(difference)
+            i += 1
+
+        self.i = i
+        return f
diff --git a/python/likelihoods/__init__.py b/python/likelihoods/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
new file mode 100644
index 00000000..0d421882
--- /dev/null
+++ b/python/likelihoods/likelihood_function.py
@@ -0,0 +1,253 @@
+from scipy.special import gammaln, gamma
+from scipy import integrate
+import numpy as np
+from GPy.likelihoods.likelihood_functions import likelihood_function
+from scipy import stats
+
+
+class student_t(likelihood_function):
+    """Student t likelihood distribution
+    For nomenclature see Bayesian Data Analysis 2003 p576
+
+    $$\ln p(y_{i}|f_{i}) = \ln \Gamma\left(\frac{v+1}{2}\right) - \ln \left(\Gamma\left(\frac{v}{2}\right)\sigma\sqrt{v \pi}\right) - \frac{v+1}{2}\ln\left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, deg_free, sigma=2):
+        self.v = deg_free
+        self.sigma = sigma
+
+        #FIXME: This should be in the superclass
+        self.log_concave = False
+
+    @property
+    def variance(self, extra_data=None):
+        return (self.v / float(self.v - 2)) * (self.sigma**2)
+
+    def link_function(self, y, f, extra_data=None):
+        """link_function $\ln p(y|f)$
+        $$\ln p(y_{i}|f_{i}) = \ln \Gamma\left(\frac{v+1}{2}\right) - \ln \left(\Gamma\left(\frac{v}{2}\right)\sigma\sqrt{v \pi}\right) - \frac{v+1}{2}\ln\left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        e = y - f
+        #Note the normalising constant ln(sigma*sqrt(v*pi)) enters with a negative sign
+        objective = (gammaln((self.v + 1) * 0.5)
+                     - gammaln(self.v * 0.5)
+                     - np.log(self.sigma * np.sqrt(self.v * np.pi))
+                     - (self.v + 1) * 0.5
+                     * np.log(1 + ((e**2 / self.sigma**2) / self.v))
+                     )
+        return np.sum(objective)
+
+    def link_grad(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{d}{df_{i}}\ln p(y_{i}|f_{i}) = \frac{(v + 1)(y_{i} - f_{i})}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        return np.squeeze(grad)
+
+    def link_hess(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative of link_function at y given f_i, f_j w.r.t f_i and f_j
+
+        Will return the diagonal of the hessian, since everywhere else it is 0
+
+        $$\frac{d^{2}\ln p(y_{i}|f_{i})}{df_{i}^{2}} = \frac{(v + 1)\left((y_{i} - f_{i})^{2} - v\sigma^{2}\right)}{\left(v\sigma^{2} + (y_{i} - f_{i})^{2}\right)^{2}}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
+        return np.squeeze(hess)
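+
+    # A quick way to sanity-check link_function/link_grad/link_hess against
+    # each other (a sketch only, not part of the class; y0, f0 are illustrative
+    # scalars):
+    #
+    #     lik = student_t(10, sigma=0.3)
+    #     y0, f0, eps = 1.0, 0.2, 1e-6
+    #     num_grad = (lik.link_function(y0, f0 + eps) - lik.link_function(y0, f0 - eps)) / (2*eps)
+    #     assert np.allclose(num_grad, lik.link_grad(y0, f0), atol=1e-5)
+    #     num_hess = (lik.link_grad(y0, f0 + eps) - lik.link_grad(y0, f0 - eps)) / (2*eps)
+    #     assert np.allclose(num_hess, lik.link_hess(y0, f0), atol=1e-5)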
+
+    def predictive_values(self, mu, var):
+        """
+        Compute mean and confidence interval (percentiles 2.5 and 97.5) of the prediction
+
+        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        """
+
+        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
+        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
+        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this,
+        #which was also given to us as (var)
+        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
+        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
+        true_var = var + self.variance
+
+        #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we
+        #still need the 2.5 and 97.5 percentiles.
+        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles (mu +- 2 standard deviations)
+        p_025 = mu - 2.*np.sqrt(true_var)
+        p_975 = mu + 2.*np.sqrt(true_var)
+
+        return mu, np.nan*mu, p_025, p_975
+
+    def sample_predicted_values(self, mu, var):
+        """ Experimental sample approaches and numerical integration """
+        #p_025 = stats.t.ppf(.025, mu)
+        #p_975 = stats.t.ppf(.975, mu)
+
+        num_test_points = mu.shape[0]
+        #Each mu is the latent point f* at the test point x*,
+        #and the var is the gaussian variance at this point
+        #Take lots of samples from this, so we have lots of possible values
+        #for latent point f* for each test point x* weighted by how likely we were to pick it
+        print "Taking %d samples of f*" % num_test_points
+        num_f_samples = 10
+        num_y_samples = 10
+        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
+        print "Student t means shape: ", student_t_means.shape
+
+        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
+        #from a student t centred on this point, sample many points from this distribution
+        #centred on f*
+        #for test_point, f in enumerate(student_t_means):
+            #print test_point
+            #print f.shape
+            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
+                                            #scale=self.sigma,
+                                            #size=(num_f_samples, num_y_samples))
+            #print student_t_samples.shape
+
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
+                                        scale=self.sigma,
+                                        size=(num_test_points, num_y_samples, num_f_samples))
+        student_t_samples = np.reshape(student_t_samples,
+                                       (num_test_points, num_y_samples*num_f_samples))
+
+        #Now take the 97.5 and 2.5 percentiles of these points
+        p_025 = stats.scoreatpercentile(student_t_samples, 2.5, axis=1)[:, None]
+        p_975 = stats.scoreatpercentile(student_t_samples, 97.5, axis=1)[:, None]
+
+        ##Alternatively we could sample from int p(y|f*)p(f*|x*) df*
+        def t_gaussian(f, mu, var):
+            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
+                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var))*((mu-f)**2)))
+                    )
+
+        def t_gauss_int(mu, var):
+            print "Mu: ", mu
+            print "var: ", var
+            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
+            print "Result: ", result
+            return result[0]
+
+        vec_t_gauss_int = np.vectorize(t_gauss_int)
+
+        p = vec_t_gauss_int(mu, var)
+        p_025 = mu - p
+        p_975 = mu + p
+        return mu, np.nan*mu, p_025, p_975
+
+
+class weibull_survival(likelihood_function):
+    """Weibull likelihood distribution for survival analysis with censoring
+    For nomenclature see Bayesian Survival Analysis
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, shape, scale):
+        self.shape = shape
+        self.scale = scale
+
+        #FIXME: This should be in the superclass
+        self.log_concave = True
+
+    def link_function(self, y, f, extra_data=None):
+        """
+        link_function $\ln p(y|f)$, i.e. log likelihood
+
+        $$\ln p(y_{i}|f_{i}) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: time of event data
+        :f: latent variables f
+        :extra_data: the censoring indicator: 1 if the event was observed, 0 if censored
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f)  # FIXME: CHECK THIS WITH BOOK, wheres scale?
+        return np.sum(objective)
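+
+    # (Differentiating the objective above w.r.t. f_i gives
+    #  v_i - y_i^alpha * exp(f_i), which is what link_grad returns below, and
+    #  differentiating once more gives -y_i^alpha * exp(f_i) for link_hess.)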
+
+    def link_grad(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{d}{df_{i}} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: the censoring indicator: 1 if the event was observed, 0 if censored
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        grad = v - (y**self.shape)*np.exp(f)
+        return np.squeeze(grad)
+
+    def link_hess(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative of link_function at y given f_i, f_j w.r.t f_i and f_j
+
+        Will return the diagonal of the hessian, since everywhere else it is 0
+
+        $$\frac{d^{2}\ln p(y_{i}|f_{i})}{df_{i}^{2}} = -y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in the hessian
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        hess = -(y**self.shape)*np.exp(f)  # second derivative of the log-likelihood w.r.t. f
+        return np.squeeze(hess)
diff --git a/python/models/__init__.py b/python/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/models/coxGP.py b/python/models/coxGP.py
new file mode 100644
index 00000000..f61a8f46
--- /dev/null
+++ b/python/models/coxGP.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2013, Alan Saul
+
+from GPy.models import GP
+from .. import likelihoods
+from GPy import kern
+
+
+class cox_GP_regression(GP):
+    """
+    Cox Gaussian Process model for regression
+    """
+
+    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Xslices=None):
+        if kernel is None:
+            kernel = kern.rbf(X.shape[1])
+
+        likelihood = likelihoods.cox_piecewise(Y, normalize=normalize_Y)
+
+        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices)
diff --git a/python/testing/__init__.py b/python/testing/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/testing/cox_tests.py b/python/testing/cox_tests.py
new file mode 100644
index 00000000..526f5c92
--- /dev/null
+++ b/python/testing/cox_tests.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2013, Alan Saul
+
+import unittest
+import numpy as np
+import GPy
+
+class coxGPTests(unittest.TestCase):
+    def test_laplace_approx(self):
+        pass
+
+if __name__ == "__main__":
+    print "Running unit tests, please be (very) patient..."
+    unittest.main()
+
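+# A possible shape for test_laplace_approx above (a sketch only; the class and
+# function names are the ones used in python/examples/laplace_approximations.py
+# and python/likelihoods/, and would need importing here):
+#
+#     X = np.linspace(0, 10, 20)[:, None]
+#     Y = np.sin(X) + 0.05*np.random.randn(*X.shape)
+#     lik = Laplace(Y, student_t(10, sigma=0.3), rasm=True)
+#     m = GPy.models.GP(X, lik, GPy.kern.rbf(1))
+#     m.ensure_default_constraints()
+#     m.update_likelihood_approximation()
+#     self.assertTrue(np.all(np.isfinite(m.likelihood.f_hat)))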