From 67248ab7c2b0becf471fe08638d35cf0786ee1a2 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Tue, 12 Mar 2013 03:16:33 -0700 Subject: [PATCH 01/22] Initial commit --- .gitignore | 35 +++++++++++++++++++++++++++++++++++ README.md | 4 ++++ 2 files changed, 39 insertions(+) create mode 100644 .gitignore create mode 100644 README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..d2d6f360 --- /dev/null +++ b/.gitignore @@ -0,0 +1,35 @@ +*.py[cod] + +# C extensions +*.so + +# Packages +*.egg +*.egg-info +dist +build +eggs +parts +bin +var +sdist +develop-eggs +.installed.cfg +lib +lib64 + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.tox +nosetests.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject diff --git a/README.md b/README.md new file mode 100644 index 00000000..317fa353 --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +coxGP +===== + +Gaussian Process models of Cox proportional hazard models \ No newline at end of file From 68eb83955c585b08cf93cbd659f749cff5b62bb3 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 12 Mar 2013 17:42:00 +0000 Subject: [PATCH 02/22] Initial commit, setting up the laplace approximation for a student t --- python/examples/laplace_approximations.py | 37 ++++++++++++++++ python/likelihoods/Laplace.py | 54 +++++++++++++++++++++++ python/likelihoods/likelihood_function.py | 51 +++++++++++++++++++++ python/models/coxGP.py | 19 ++++++++ python/testing/cox_tests.py | 14 ++++++ 5 files changed, 175 insertions(+) create mode 100644 python/examples/laplace_approximations.py create mode 100644 python/likelihoods/Laplace.py create mode 100644 python/likelihoods/likelihood_function.py create mode 100644 python/models/coxGP.py create mode 100644 python/testing/cox_tests.py diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py new file mode 100644 index 00000000..2f059831 --- /dev/null +++ b/python/examples/laplace_approximations.py @@ -0,0 +1,37 @@ +import GPy +import numpy as np +import scipy as sp +import scipy.stats +import matplotlib.pyplot as plt + + +def student_t_approx(): + """ + Example of regressing with a student t likelihood + """ + #Start a function, any function + X = np.sort(np.random.uniform(0, 15, 70))[:, None] + Y = np.sin(X) + + #Add some extreme value noise to some of the datapoints + percent_corrupted = 0.05 + corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) + indices = np.arange(Y.shape[0]) + np.random.shuffle(indices) + corrupted_indices = indices[:corrupted_datums] + print corrupted_indices + noise = np.random.uniform(-10,10,(len(corrupted_indices), 1)) + Y[corrupted_indices] += noise + + #A GP should completely break down due to the points as they get a lot of weight + # create simple GP model + m = GPy.models.GP_regression(X,Y) + + # optimize + m.ensure_default_constraints() + m.optimize() + # plot + m.plot() + print m + + #with a student t distribution, since it has heavy tails it should work well diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py new file mode 100644 index 00000000..a0dbc65c --- /dev/null +++ b/python/likelihoods/Laplace.py @@ -0,0 +1,54 @@ +import nump as np +import GPy +from GPy.util.linalg import jitchol + +class Laplace(GPy.likelihoods.likelihood): + """Laplace approximation to a posterior""" + + def __init__(self,data,likelihood_function): + """ + Laplace Approximation + + First find the moments \hat{f} and the hessian at this point (using 
Newton-Raphson) + then find the z^{prime} which allows this to be a normalised gaussian instead of a + non-normalized gaussian + + Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle} + which makes a gaussian the same as the laplace approximation + + Arguments + --------- + + :data: @todo + :likelihood_function: @todo + + """ + GPy.likelihoods.likelihood.__init__(self) + + self.data = data + self.likelihood_function = likelihood_function + + #Inital values + self.N, self.D = self.data.shape + + def _compute_GP_variables(self): + """ + Generates data Y which would give the normal distribution identical to the laplace approximation + + GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle} + that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood + """ + raise NotImplementedError + + def fit_full(self, K): + """ + The laplace approximation algorithm + For nomenclature see Rasmussen & Williams 2006 + :K: Covariance matrix + """ + self.f = np.zeros(self.N) + + #Find \hat(f) using a newton raphson optimizer for example + + #At this point get the hessian matrix + diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py new file mode 100644 index 00000000..fd19675b --- /dev/null +++ b/python/likelihoods/likelihood_function.py @@ -0,0 +1,51 @@ +import GPy +from scipy.special import gamma, gammaln + +class student_t(GPy.likelihoods.likelihood_function): + """Student t likelihood distribution + For nomanclature see Bayesian Data Analysis 2003 p576 + + Laplace: + Needs functions to calculate + ln p(yi|fi) + dln p(yi|fi)_dfi + d2ln p(yi|fi)_d2fi + """ + def __init__(self, deg_free, sigma=1): + self.v = deg_free + self.sigma = 1 + + def link_function(self, y_i, f_i): + """link_function $\ln p(y_i|f_i)$ + + :y_i: datum number i + :f_i: latent variable f_i + :returns: float(likelihood evaluated for this point) + + """ + e = y_i - f_i + return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) + + def link_grad(self, y_i, f_i): + """gradient of the link function at y_i, given f_i w.r.t f_i + + :y_i: datum number i + :f_i: latent variable f_i + :returns: float(gradient of likelihood evaluated at this point) + + """ + pass + + def link_hess(self, y_i, f_i, f_j): + """hessian at this point (the hessian will be 0 unless i == j) + i.e. second derivative w.r.t f_i and f_j + + :y_i: @todo + :f_i: @todo + :f_j: @todo + :returns: @todo + + """ + if f_i = + pass + diff --git a/python/models/coxGP.py b/python/models/coxGP.py new file mode 100644 index 00000000..f61a8f46 --- /dev/null +++ b/python/models/coxGP.py @@ -0,0 +1,19 @@ +# Copyright (c) 2013, Alan Saul + +from GPy.models import GP +from .. 
import likelihoods +from GPy import kern + + +class cox_GP_regression(GP): + """ + Cox Gaussian Process model for regression + """ + + def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None): + if kernel is None: + kernel = kern.rbf(X.shape[1]) + + likelihood = likelihoods.cox_piecewise(Y, normalize=normalize_Y) + + GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices) diff --git a/python/testing/cox_tests.py b/python/testing/cox_tests.py new file mode 100644 index 00000000..526f5c92 --- /dev/null +++ b/python/testing/cox_tests.py @@ -0,0 +1,14 @@ +# Copyright (c) 2013, Alan Saul + +import unittest +import numpy as np +import GPy + +class coxGPTests(unittest.TestCase): + def test_laplace_approx(self): + pass + +if __name__ == "__main__": + print "Running unit tests, please be (very) patient..." + unittest.main() + From ad2c266c65120e1fabf0cf1825fc0c661084611b Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 13 Mar 2013 11:54:33 +0000 Subject: [PATCH 03/22] Added some comments --- python/likelihoods/likelihood_function.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index fd19675b..5d4e51ce 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -5,6 +5,9 @@ class student_t(GPy.likelihoods.likelihood_function): """Student t likelihood distribution For nomanclature see Bayesian Data Analysis 2003 p576 + $$\ln(\frac{\Gamma(\frac{(v+1)}{2})}{\Gamma(\sqrt(v \pi \Gamma(\frac{v}{2}))})+ \ln(1+\frac{(y_i-f_i)^2}{\sigma v})^{-\frac{(v+1)}{2}}$$ + TODO:Double check this + Laplace: Needs functions to calculate ln p(yi|fi) @@ -17,6 +20,8 @@ class student_t(GPy.likelihoods.likelihood_function): def link_function(self, y_i, f_i): """link_function $\ln p(y_i|f_i)$ + $$\ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2}) - \ln \frac{v \pi \sigma}{2} - \frac{v+1}{2}\ln (1 + \frac{(y_{i} - f_{i})^{2}}{v\sigma})$$ + TODO: Double check this :y_i: datum number i :f_i: latent variable f_i @@ -24,11 +29,15 @@ class student_t(GPy.likelihoods.likelihood_function): """ e = y_i - f_i - return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) + return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) #Check the /v! def link_grad(self, y_i, f_i): """gradient of the link function at y_i, given f_i w.r.t f_i + derivative of log((gamma((v+1)/2)/gamma(sqrt(v*pi*gamma(v/2))))*(1+(t^2)/(a*v))^((-(v+1))/2)) with respect to t + $$\frac{(y_i - f_i)(v + 1)}{\sigma v (y_{i} - f_{i})^{2}}$$ + TODO: Double check this + :y_i: datum number i :f_i: latent variable f_i :returns: float(gradient of likelihood evaluated at this point) @@ -40,6 +49,8 @@ class student_t(GPy.likelihoods.likelihood_function): """hessian at this point (the hessian will be 0 unless i == j) i.e. 
second derivative w.r.t f_i and f_j + second derivative of + :y_i: @todo :f_i: @todo :f_j: @todo From 3f114aa020fb678b1c52eb441bb079d9a0b8cd00 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 13 Mar 2013 17:55:41 +0000 Subject: [PATCH 04/22] Got most of laplace approximation working --- __init__.py | 0 python/__init__.py | 0 python/examples/__init__.py | 0 python/examples/laplace_approximations.py | 44 +++++++++++-- python/likelihoods/Laplace.py | 45 +++++++++++-- python/likelihoods/__init__.py | 0 python/likelihoods/likelihood_function.py | 80 +++++++++++++---------- python/models/__init__.py | 0 python/testing/__init__.py | 0 9 files changed, 124 insertions(+), 45 deletions(-) create mode 100644 __init__.py create mode 100644 python/__init__.py create mode 100644 python/examples/__init__.py create mode 100644 python/likelihoods/__init__.py create mode 100644 python/models/__init__.py create mode 100644 python/testing/__init__.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/__init__.py b/python/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/examples/__init__.py b/python/examples/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 2f059831..0e1d3305 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -1,8 +1,9 @@ import GPy import numpy as np -import scipy as sp -import scipy.stats import matplotlib.pyplot as plt +from scipy.stats import t +from coxGP.python.likelihoods.Laplace import Laplace +from coxGP.python.likelihoods.likelihood_function import student_t def student_t_approx(): @@ -13,6 +14,41 @@ def student_t_approx(): X = np.sort(np.random.uniform(0, 15, 70))[:, None] Y = np.sin(X) + #Add student t random noise to datapoints + deg_free = 1 + noise = t.rvs(deg_free, loc=1.8, scale=1, size=Y.shape) + Y += noise + + # Kernel object + print X.shape + kernel = GPy.kern.rbf(X.shape[1]) + + #A GP should completely break down due to the points as they get a lot of weight + # create simple GP model + m = GPy.models.GP_regression(X, Y, kernel=kernel) + + # optimize + m.ensure_default_constraints() + m.optimize() + # plot + #m.plot() + print m + + #with a student t distribution, since it has heavy tails it should work well + likelihood_function = student_t(deg_free, sigma=1) + lap = Laplace(Y, likelihood_function) + cov = kernel.K(X) + lap.fit_full(cov) + + +def noisy_laplace_approx(): + """ + Example of regressing with a student t likelihood + """ + #Start a function, any function + X = np.sort(np.random.uniform(0, 15, 70))[:, None] + Y = np.sin(X) + #Add some extreme value noise to some of the datapoints percent_corrupted = 0.05 corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) @@ -20,12 +56,12 @@ def student_t_approx(): np.random.shuffle(indices) corrupted_indices = indices[:corrupted_datums] print corrupted_indices - noise = np.random.uniform(-10,10,(len(corrupted_indices), 1)) + noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1)) Y[corrupted_indices] += noise #A GP should completely break down due to the points as they get a lot of weight # create simple GP model - m = GPy.models.GP_regression(X,Y) + m = GPy.models.GP_regression(X, Y) # optimize m.ensure_default_constraints() diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index a0dbc65c..6efbfa30 100644 --- 
a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,8 +1,14 @@ -import nump as np +import numpy as np +import scipy as sp import GPy from GPy.util.linalg import jitchol +from functools import partial +from GPy.likelihoods.likelihood import likelihood +from GPy.util.linalg import pdinv,mdot -class Laplace(GPy.likelihoods.likelihood): + + +class Laplace(likelihood): """Laplace approximation to a posterior""" def __init__(self,data,likelihood_function): @@ -23,8 +29,6 @@ class Laplace(GPy.likelihoods.likelihood): :likelihood_function: @todo """ - GPy.likelihoods.likelihood.__init__(self) - self.data = data self.likelihood_function = likelihood_function @@ -38,7 +42,7 @@ class Laplace(GPy.likelihoods.likelihood): GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle} that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood """ - raise NotImplementedError + z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised def fit_full(self, K): """ @@ -46,9 +50,38 @@ class Laplace(GPy.likelihoods.likelihood): For nomenclature see Rasmussen & Williams 2006 :K: Covariance matrix """ - self.f = np.zeros(self.N) + f = np.zeros((self.N, 1)) + print K.shape + print f.shape + print self.data.shape + (Ki, _, _, log_Kdet) = pdinv(K) + obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2*np.pi)) #Find \hat(f) using a newton raphson optimizer for example + #TODO: Add newton-raphson as subclass of optimizer class + + #FIXME: Can we get rid of this horrible reshaping? + def obj(f): + f = f[:, None] + res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (Ki, f)) + obj_constant) + return float(res) + + def obj_grad(f): + f = f[:, None] + res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(Ki, f)) + return np.squeeze(res) + + def obj_hess(f): + f = f[:, None] + res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - Ki) + return np.squeeze(res) + + self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) #At this point get the hessian matrix + self.hess_hat = obj_hess(f_hat) + #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) + self.height_unnormalised = obj(f_hat) #FIXME: Is it -1? 
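        # [Editor's gloss, not part of the patch] On the FIXME above: with the
        # prior constant taken as -0.5*ln|K| (patch 07 later flips this sign),
        # obj() is the negative log of p(y|f) * N(f|0, K), so the unnormalised
        # height at the mode is exp(-obj(f_hat)) and height-matching gives
        #     ln z_hat = -obj(f_hat) + (N/2)*ln(2*pi) - 0.5*ln|hess_hat|,
        # i.e. the -1 is needed, which is essentially where patch 06 ends up.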
+ + return _compute_GP_variables() diff --git a/python/likelihoods/__init__.py b/python/likelihoods/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 5d4e51ce..78731199 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -1,62 +1,72 @@ -import GPy -from scipy.special import gamma, gammaln +from scipy.special import gammaln +import numpy as np +from GPy.likelihoods.likelihood_functions import likelihood_function -class student_t(GPy.likelihoods.likelihood_function): + +class student_t(likelihood_function): """Student t likelihood distribution For nomanclature see Bayesian Data Analysis 2003 p576 - $$\ln(\frac{\Gamma(\frac{(v+1)}{2})}{\Gamma(\sqrt(v \pi \Gamma(\frac{v}{2}))})+ \ln(1+\frac{(y_i-f_i)^2}{\sigma v})^{-\frac{(v+1)}{2}}$$ - TODO:Double check this + $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ Laplace: Needs functions to calculate ln p(yi|fi) dln p(yi|fi)_dfi - d2ln p(yi|fi)_d2fi + d2ln p(yi|fi)_d2fifj """ def __init__(self, deg_free, sigma=1): self.v = deg_free self.sigma = 1 - def link_function(self, y_i, f_i): - """link_function $\ln p(y_i|f_i)$ - $$\ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2}) - \ln \frac{v \pi \sigma}{2} - \frac{v+1}{2}\ln (1 + \frac{(y_{i} - f_{i})^{2}}{v\sigma})$$ - TODO: Double check this + def link_function(self, y, f): + """link_function $\ln p(y|f)$ + $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ - :y_i: datum number i - :f_i: latent variable f_i + :y: datum number i + :f: latent variable f :returns: float(likelihood evaluated for this point) """ - e = y_i - f_i - return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) #Check the /v! + e = y - f + #print "Link ", y.shape, f.shape, e.shape + objective = (gammaln((self.v + 1) * 0.5) + - gammaln(self.v * 0.5) + + np.log(self.sigma * np.sqrt(self.v * np.pi)) + - (self.v + 1) * 0.5 + * np.log(1 + ((e**2 / self.sigma**2) / self.v)) + ) + return np.sum(objective) - def link_grad(self, y_i, f_i): - """gradient of the link function at y_i, given f_i w.r.t f_i + def link_grad(self, y, f): + """ + Gradient of the link function at y, given f w.r.t f - derivative of log((gamma((v+1)/2)/gamma(sqrt(v*pi*gamma(v/2))))*(1+(t^2)/(a*v))^((-(v+1))/2)) with respect to t - $$\frac{(y_i - f_i)(v + 1)}{\sigma v (y_{i} - f_{i})^{2}}$$ - TODO: Double check this + $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ - :y_i: datum number i - :f_i: latent variable f_i + :y: datum number i + :f: latent variable f :returns: float(gradient of likelihood evaluated at this point) """ - pass - - def link_hess(self, y_i, f_i, f_j): - """hessian at this point (the hessian will be 0 unless i == j) - i.e. 
second derivative w.r.t f_i and f_j - - second derivative of - - :y_i: @todo - :f_i: @todo - :f_j: @todo - :returns: @todo + e = y - f + #print "Grad ", y.shape, f.shape, e.shape + grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) + return grad + def link_hess(self, y, f): """ - if f_i = - pass + Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j + i.e. second derivative link_function at y given f f_j w.r.t f and f_j + Will return diaganol of hessian, since every where else it is 0 + + $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ + + :y: datum number i + :f: latent variable f + :returns: float(second derivative of likelihood evaluated at this point) + """ + e = y - f + hess = ((self.v + 1) * e) / ((((self.sigma**2)*self.v) + e**2)**2) + return hess diff --git a/python/models/__init__.py b/python/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/testing/__init__.py b/python/testing/__init__.py new file mode 100644 index 00000000..e69de29b From f9535c858a653e08a32a8633fe37577c87812820 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 14 Mar 2013 15:30:22 +0000 Subject: [PATCH 05/22] Trying to 'debug' --- python/examples/laplace_approximations.py | 22 +++++++++++--- python/likelihoods/Laplace.py | 25 +++++++++------ python/likelihoods/likelihood_function.py | 37 ++++++++++++----------- 3 files changed, 52 insertions(+), 32 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 0e1d3305..5642d8a4 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -1,7 +1,7 @@ import GPy import numpy as np import matplotlib.pyplot as plt -from scipy.stats import t +from scipy.stats import t, norm from coxGP.python.likelihoods.Laplace import Laplace from coxGP.python.likelihoods.likelihood_function import student_t @@ -11,12 +11,13 @@ def student_t_approx(): Example of regressing with a student t likelihood """ #Start a function, any function - X = np.sort(np.random.uniform(0, 15, 70))[:, None] + X = np.sort(np.random.uniform(0, 15, 100))[:, None] Y = np.sin(X) #Add student t random noise to datapoints - deg_free = 1 - noise = t.rvs(deg_free, loc=1.8, scale=1, size=Y.shape) + deg_free = 2.5 + t_rv = t(deg_free, loc=5, scale=1) + noise = t_rv.rvs(size=Y.shape) Y += noise # Kernel object @@ -39,6 +40,19 @@ def student_t_approx(): lap = Laplace(Y, likelihood_function) cov = kernel.K(X) lap.fit_full(cov) + #Get one sample (just look at a single Y + mode = float(lap.f_hat[0]) + variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables + #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables + normalised_approx = norm(loc=mode, scale=variance) + print "Normal with mode %f, and variance %f" % (mode, variance) + print lap.height_unnormalised + + test_range = np.arange(0, 10, 0.1) + print np.diagonal(lap.hess_hat) + plt.plot(test_range, t_rv.pdf(test_range)) + plt.plot(test_range, normalised_approx.pdf(test_range)) + plt.show() def noisy_laplace_approx(): diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 6efbfa30..08ae0e6f 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -5,13 +5,13 @@ from GPy.util.linalg import jitchol from functools import partial from 
GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot - +from scipy.stats import norm class Laplace(likelihood): """Laplace approximation to a posterior""" - def __init__(self,data,likelihood_function): + def __init__(self, data, likelihood_function): """ Laplace Approximation @@ -42,7 +42,13 @@ class Laplace(likelihood): GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle} that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood """ - z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised + #z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised + normalised_approx = norm(loc=self.f_hat, scale=self.hess_hat) + self.Z = normalised_approx.pdf(self.f_hat)/self.height_unnormalised + #self.Y = + #self.YYT = + #self.covariance_matrix = + #self.precision = def fit_full(self, K): """ @@ -51,11 +57,9 @@ class Laplace(likelihood): :K: Covariance matrix """ f = np.zeros((self.N, 1)) - print K.shape - print f.shape - print self.data.shape + #K = np.diag(np.ones(self.N)) (Ki, _, _, log_Kdet) = pdinv(K) - obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2*np.pi)) + obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi)) #Find \hat(f) using a newton raphson optimizer for example #TODO: Add newton-raphson as subclass of optimizer class @@ -77,11 +81,12 @@ class Laplace(likelihood): return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) + print self.f_hat #At this point get the hessian matrix - self.hess_hat = obj_hess(f_hat) + self.hess_hat = obj_hess(self.f_hat) #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) - self.height_unnormalised = obj(f_hat) #FIXME: Is it -1? + self.height_unnormalised = obj(self.f_hat) #FIXME: Is it -1? 
- return _compute_GP_variables() + return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 78731199..46128de7 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -15,27 +15,27 @@ class student_t(likelihood_function): dln p(yi|fi)_dfi d2ln p(yi|fi)_d2fifj """ - def __init__(self, deg_free, sigma=1): + def __init__(self, deg_free, sigma=2): self.v = deg_free - self.sigma = 1 + self.sigma = sigma def link_function(self, y, f): """link_function $\ln p(y|f)$ $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ - :y: datum number i - :f: latent variable f + :y: data + :f: latent variables f :returns: float(likelihood evaluated for this point) """ + assert y.shape[0] == f.shape[0] e = y - f - #print "Link ", y.shape, f.shape, e.shape objective = (gammaln((self.v + 1) * 0.5) - - gammaln(self.v * 0.5) - + np.log(self.sigma * np.sqrt(self.v * np.pi)) - - (self.v + 1) * 0.5 - * np.log(1 + ((e**2 / self.sigma**2) / self.v)) - ) + - gammaln(self.v * 0.5) + + np.log(self.sigma * np.sqrt(self.v * np.pi)) + - (self.v + 1) * 0.5 + * np.log(1 + ((e**2 / self.sigma**2) / self.v)) + ) return np.sum(objective) def link_grad(self, y, f): @@ -44,13 +44,13 @@ class student_t(likelihood_function): $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ - :y: datum number i - :f: latent variable f - :returns: float(gradient of likelihood evaluated at this point) + :y: data + :f: latent variables f + :returns: gradient of likelihood evaluated at points """ + assert y.shape[0] == f.shape[0] e = y - f - #print "Grad ", y.shape, f.shape, e.shape grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) return grad @@ -63,10 +63,11 @@ class student_t(likelihood_function): $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$ - :y: datum number i - :f: latent variable f - :returns: float(second derivative of likelihood evaluated at this point) + :y: data + :f: latent variables f + :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ + assert y.shape[0] == f.shape[0] e = y - f - hess = ((self.v + 1) * e) / ((((self.sigma**2)*self.v) + e**2)**2) + hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) return hess From 34ae852eea8d5f6cdc48028d4f21457c7f0b5259 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 15 Mar 2013 17:38:13 +0000 Subject: [PATCH 06/22] got an idea of how to implement! written in docs --- python/likelihoods/Laplace.py | 38 ++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 08ae0e6f..568fcef0 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -41,10 +41,26 @@ class Laplace(likelihood): GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle} that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood + + Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal) + then we have a rescaled normal distibution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f) + due to the z rescaling. 
+ + at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1) + + This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1) + giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f) + + $$\tilde{Y} = \tilde{\Sigma} Hf$$ + where + $$\tilde{\Sigma}^{-1} = H - K^{-1}$$ + i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$ + since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$ + and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ + """ - #z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised - normalised_approx = norm(loc=self.f_hat, scale=self.hess_hat) - self.Z = normalised_approx.pdf(self.f_hat)/self.height_unnormalised + self.Sigma_tilde = self.hess_hat - + self.Z = #self.Y = #self.YYT = #self.covariance_matrix = @@ -58,8 +74,8 @@ class Laplace(likelihood): """ f = np.zeros((self.N, 1)) #K = np.diag(np.ones(self.N)) - (Ki, _, _, log_Kdet) = pdinv(K) - obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi)) + (self.Ki, _, _, self.log_Kdet) = pdinv(K) + obj_constant = (0.5 * self.log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi)) #Find \hat(f) using a newton raphson optimizer for example #TODO: Add newton-raphson as subclass of optimizer class @@ -67,17 +83,17 @@ class Laplace(likelihood): #FIXME: Can we get rid of this horrible reshaping? def obj(f): f = f[:, None] - res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (Ki, f)) + obj_constant) + res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (self.Ki, f)) + obj_constant) return float(res) def obj_grad(f): f = f[:, None] - res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(Ki, f)) + res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(self.Ki, f)) return np.squeeze(res) def obj_hess(f): f = f[:, None] - res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - Ki) + res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) @@ -87,6 +103,10 @@ class Laplace(likelihood): self.hess_hat = obj_hess(self.f_hat) #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) - self.height_unnormalised = obj(self.f_hat) #FIXME: Is it -1? + self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? 
+ #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to + #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode + #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) + self.z_hat = np.exp(-0.5*np.log(np.linalg.det(hess_hat)) + self.height_unnormalised) return self._compute_GP_variables() From 2bf1cf0eb6596773c2f75a06f152b3a7cfd66081 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 18 Mar 2013 15:59:12 +0000 Subject: [PATCH 07/22] following naming convention better, lots of inverses which should be able to get rid of one or two, unsure if it works --- python/examples/laplace_approximations.py | 17 +++++---- python/likelihoods/Laplace.py | 43 +++++++++++++---------- python/likelihoods/likelihood_function.py | 9 ++--- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 5642d8a4..aa8cdcb4 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -41,18 +41,21 @@ def student_t_approx(): cov = kernel.K(X) lap.fit_full(cov) #Get one sample (just look at a single Y - mode = float(lap.f_hat[0]) - variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables + #mode = float(lap.f_hat[0]) + #variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables - normalised_approx = norm(loc=mode, scale=variance) - print "Normal with mode %f, and variance %f" % (mode, variance) - print lap.height_unnormalised test_range = np.arange(0, 10, 0.1) - print np.diagonal(lap.hess_hat) plt.plot(test_range, t_rv.pdf(test_range)) - plt.plot(test_range, normalised_approx.pdf(test_range)) + for i in xrange(X.shape[0]): + mode = lap.f_hat[i] + covariance = lap.hess_hat_i[i,i] + scaling = np.exp(lap.ln_z_hat) + normalised_approx = norm(loc=mode, scale=covariance) + print "Normal with mode %f, and variance %f" % (mode, covariance) + plt.plot(test_range, normalised_approx.pdf(test_range)) plt.show() + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def noisy_laplace_approx(): diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 568fcef0..9d622b0d 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,12 +1,10 @@ import numpy as np import scipy as sp import GPy -from GPy.util.linalg import jitchol +#from GPy.util.linalg import jitchol from functools import partial from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot -from scipy.stats import norm - class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -35,6 +33,8 @@ class Laplace(likelihood): #Inital values self.N, self.D = self.data.shape + self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi)) + def _compute_GP_variables(self): """ Generates data Y which would give the normal distribution identical to the laplace approximation @@ -59,12 +59,15 @@ class Laplace(likelihood): and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ """ - self.Sigma_tilde = self.hess_hat - - self.Z = - #self.Y = - #self.YYT = - #self.covariance_matrix = - #self.precision = + self.Sigma_tilde_i = self.hess_hat + self.Ki + #Do we really need to inverse Sigma_tilde_i? 
:( + (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i) + Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess? + self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde, (self.Sigma_tilde_i, Y_tilde)))) + self.Y = Y_tilde + self.covariance_matrix = self.Sigma_tilde + self.precision = np.diag(self.Sigma_tilde)[:, None] + self.YYT = np.dot(self.Y, self.Y) def fit_full(self, K): """ @@ -75,38 +78,40 @@ class Laplace(likelihood): f = np.zeros((self.N, 1)) #K = np.diag(np.ones(self.N)) (self.Ki, _, _, self.log_Kdet) = pdinv(K) - obj_constant = (0.5 * self.log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi)) - + LOG_K_CONST = -(0.5 * self.log_Kdet) + OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST #Find \hat(f) using a newton raphson optimizer for example #TODO: Add newton-raphson as subclass of optimizer class #FIXME: Can we get rid of this horrible reshaping? def obj(f): - f = f[:, None] - res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (self.Ki, f)) + obj_constant) + #f = f[:, None] + res = -1 * (self.likelihood_function.link_function(self.data[:,0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST) return float(res) def obj_grad(f): - f = f[:, None] - res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(self.Ki, f)) + #f = f[:, None] + res = -1 * (self.likelihood_function.link_grad(self.data[:,0], f) - mdot(self.Ki, f)) return np.squeeze(res) def obj_hess(f): - f = f[:, None] - res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - self.Ki) + res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) print self.f_hat #At this point get the hessian matrix - self.hess_hat = obj_hess(self.f_hat) + self.hess_hat = -1*np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) #-1*obj_hess(self.f_hat) + self.Ki + #self.hess_hat = -1*obj_hess(self.f_hat) + self.Ki + (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat + self.Ki) #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? 
#z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) - self.z_hat = np.exp(-0.5*np.log(np.linalg.det(hess_hat)) + self.height_unnormalised) + self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) + self.height_unnormalised - self.NORMAL_CONST #Unsure whether its log_hess or log_hess_i + return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 46128de7..8adbf86c 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -28,7 +28,7 @@ class student_t(likelihood_function): :returns: float(likelihood evaluated for this point) """ - assert y.shape[0] == f.shape[0] + assert y.shape == f.shape e = y - f objective = (gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) @@ -49,7 +49,7 @@ class student_t(likelihood_function): :returns: gradient of likelihood evaluated at points """ - assert y.shape[0] == f.shape[0] + assert y.shape == f.shape e = y - f grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) return grad @@ -67,7 +67,8 @@ class student_t(likelihood_function): :f: latent variables f :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ - assert y.shape[0] == f.shape[0] + assert y.shape == f.shape e = y - f - hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) + #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) + hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2) return hess From 46d59c94b27cabe61056b71aa26d1293779c0697 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 19 Mar 2013 11:47:53 +0000 Subject: [PATCH 08/22] Just breaking some things... 
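Editor's note (not part of the original patch email): the commits so far find the
posterior mode f_hat with scipy's fmin_ncg plus hand-rolled obj/obj_grad/obj_hess
closures, which is where most of the sign confusion in the FIXMEs creeps in. For
reference, here is a minimal self-contained sketch of the same mode search written
as the Newton iteration of Rasmussen & Williams (2006), Algorithm 3.1. The names
laplace_mode, lik_grad and lik_hess_diag are illustrative stand-ins for the
link_grad/link_hess methods in likelihood_function.py, and the clipping of W is an
added assumption, needed because the Student-t likelihood is not log-concave and
W can go negative:

    import numpy as np

    def laplace_mode(K, y, lik_grad, lik_hess_diag, maxiter=100, tol=1e-9):
        """Newton search for f_hat = argmax_f  log p(y|f) - 0.5 f' K^{-1} f."""
        N = y.shape[0]
        f = np.zeros(N)
        for _ in range(maxiter):
            W = -lik_hess_diag(y, f)        # W = -diag(d2 log p(y|f) / df2)
            W = np.clip(W, 1e-10, np.inf)   # assumption: keep B positive definite
            sW = np.sqrt(W)
            B = np.eye(N) + sW[:, None] * K * sW[None, :]  # B = I + W^0.5 K W^0.5
            L = np.linalg.cholesky(B)
            b = W * f + lik_grad(y, f)
            a = b - sW * np.linalg.solve(L.T, np.linalg.solve(L, sW * np.dot(K, b)))
            f_new = np.dot(K, a)            # Newton step with no explicit K^{-1}
            if np.max(np.abs(f_new - f)) < tol:
                return f_new
            f = f_new
        return f

Working through B = I + W^0.5 K W^0.5 never forms K^{-1} explicitly, which
sidesteps the pdinv calls and the repeated "is it -1?" sign questions in obj_hess.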
--- python/examples/laplace_approximations.py | 88 +++++++++++++++-------- python/likelihoods/Laplace.py | 52 ++++++++++---- python/likelihoods/likelihood_function.py | 16 ++++- 3 files changed, 113 insertions(+), 43 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index aa8cdcb4..73c8f67f 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -16,47 +16,75 @@ def student_t_approx(): #Add student t random noise to datapoints deg_free = 2.5 - t_rv = t(deg_free, loc=5, scale=1) + t_rv = t(deg_free, loc=0, scale=1) noise = t_rv.rvs(size=Y.shape) Y += noise + #Add some extreme value noise to some of the datapoints + #percent_corrupted = 0.05 + #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) + #indices = np.arange(Y.shape[0]) + #np.random.shuffle(indices) + #corrupted_indices = indices[:corrupted_datums] + #print corrupted_indices + #noise = t_rv.rvs(size=(len(corrupted_indices), 1)) + #Y[corrupted_indices] += noise + # Kernel object - print X.shape - kernel = GPy.kern.rbf(X.shape[1]) + #print X.shape + #kernel = GPy.kern.rbf(X.shape[1]) - #A GP should completely break down due to the points as they get a lot of weight - # create simple GP model - m = GPy.models.GP_regression(X, Y, kernel=kernel) + ##A GP should completely break down due to the points as they get a lot of weight + ## create simple GP model + #m = GPy.models.GP_regression(X, Y, kernel=kernel) - # optimize - m.ensure_default_constraints() - m.optimize() - # plot - #m.plot() - print m + ## optimize + #m.ensure_default_constraints() + #m.optimize() + ## plot + ##m.plot() + #print m #with a student t distribution, since it has heavy tails it should work well - likelihood_function = student_t(deg_free, sigma=1) - lap = Laplace(Y, likelihood_function) - cov = kernel.K(X) - lap.fit_full(cov) - #Get one sample (just look at a single Y - #mode = float(lap.f_hat[0]) - #variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables - #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables + #likelihood_function = student_t(deg_free, sigma=1) + #lap = Laplace(Y, likelihood_function) + #cov = kernel.K(X) + #lap.fit_full(cov) - test_range = np.arange(0, 10, 0.1) - plt.plot(test_range, t_rv.pdf(test_range)) - for i in xrange(X.shape[0]): - mode = lap.f_hat[i] - covariance = lap.hess_hat_i[i,i] - scaling = np.exp(lap.ln_z_hat) - normalised_approx = norm(loc=mode, scale=covariance) - print "Normal with mode %f, and variance %f" % (mode, covariance) - plt.plot(test_range, normalised_approx.pdf(test_range)) - plt.show() + #test_range = np.arange(0, 10, 0.1) + #plt.plot(test_range, t_rv.pdf(test_range)) + #for i in xrange(X.shape[0]): + #mode = lap.f_hat[i] + #covariance = lap.hess_hat_i[i,i] + #scaling = np.exp(lap.ln_z_hat) + #normalised_approx = norm(loc=mode, scale=covariance) + #print "Normal with mode %f, and variance %f" % (mode, covariance) + #plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) + #plt.show() + #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + + # Likelihood object + t_distribution = student_t(deg_free, sigma=1) + stu_t_likelihood = Laplace(Y, t_distribution) + kernel = GPy.kern.rbf(X.shape[1]) + + m = GPy.models.GP(X, stu_t_likelihood, kernel) + m.ensure_default_constraints() + + m.update_likelihood_approximation() + print "NEW MODEL" + print(m) + + # optimize + 
#m.optimize() + print(m) + + # plot + m.plot() import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return m + def noisy_laplace_approx(): """ diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 9d622b0d..23db6abd 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -5,6 +5,7 @@ import GPy from functools import partial from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot +import numpy.testing.assert_array_equal class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -35,6 +36,29 @@ class Laplace(likelihood): self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi)) + #Initial values for the GP variables + self.Y = np.zeros((self.N,1)) + self.covariance_matrix = np.eye(self.N) + self.precision = np.ones(self.N)[:,None] + self.Z = 0 + self.YYT = None + + def predictive_values(self,mu,var): + return self.likelihood_function.predictive_values(mu,var) + + def _get_params(self): + return np.zeros(0) + + def _get_param_names(self): + return [] + + def _set_params(self,p): + pass # TODO: Laplace likelihood might want to take some parameters... + + def _gradients(self,partial): + raise NotImplementedError + #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... + def _compute_GP_variables(self): """ Generates data Y which would give the normal distribution identical to the laplace approximation @@ -63,11 +87,14 @@ class Laplace(likelihood): #Do we really need to inverse Sigma_tilde_i? :( (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i) Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess? - self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde, (self.Sigma_tilde_i, Y_tilde)))) + self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)))) + + self.Z = self.Z_tilde self.Y = Y_tilde self.covariance_matrix = self.Sigma_tilde - self.precision = np.diag(self.Sigma_tilde)[:, None] - self.YYT = np.dot(self.Y, self.Y) + self.precision = 1/np.diag(self.Sigma_tilde)[:, None] + self.YYT = np.dot(self.Y, self.Y.T) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def fit_full(self, K): """ @@ -76,7 +103,6 @@ class Laplace(likelihood): :K: Covariance matrix """ f = np.zeros((self.N, 1)) - #K = np.diag(np.ones(self.N)) (self.Ki, _, _, self.log_Kdet) = pdinv(K) LOG_K_CONST = -(0.5 * self.log_Kdet) OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST @@ -95,23 +121,25 @@ class Laplace(likelihood): return np.squeeze(res) def obj_hess(f): - res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki) + res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) - print self.f_hat #At this point get the hessian matrix - self.hess_hat = -1*np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) #-1*obj_hess(self.f_hat) + self.Ki - #self.hess_hat = -1*obj_hess(self.f_hat) + self.Ki - (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat + self.Ki) + self.hess_hat = np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) + self.Ki + (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat) + (self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat_i) + + np.testing.assert_array_equal(self.hess_hat, 
hess_hat_new) #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) - self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? + #self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) - self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) + self.height_unnormalised - self.NORMAL_CONST #Unsure whether its log_hess or log_hess_i - + #Unsure whether its log_hess or log_hess_i + self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(f.T, (self.Ki, f)) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 8adbf86c..e70cdc8d 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -1,7 +1,7 @@ from scipy.special import gammaln import numpy as np from GPy.likelihoods.likelihood_functions import likelihood_function - +from scipy import stats class student_t(likelihood_function): """Student t likelihood distribution @@ -72,3 +72,17 @@ class student_t(likelihood_function): #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2) return hess + + def predictive_values(self, mu, var): + """ + Compute mean, and conficence interval (percentiles 5 and 95) of the prediction + """ + mean = np.exp(mu) + p_025 = stats.t.ppf(025,mean) + p_975 = stats.t.ppf(975,mean) + + #p_025 = tmp[:,0] + #p_975 = tmp[:,1] + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + return mean,p_025,p_975 + From a9d555597653c24bc67812776514e29066216d66 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 19 Mar 2013 18:21:57 +0000 Subject: [PATCH 09/22] Worked out in terms of W, needs gradients implementing --- python/examples/laplace_approximations.py | 44 ++++++++++----------- python/likelihoods/Laplace.py | 48 +++++++++++++++-------- python/likelihoods/likelihood_function.py | 5 ++- 3 files changed, 57 insertions(+), 40 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index 73c8f67f..c8d06ab2 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -15,13 +15,13 @@ def student_t_approx(): Y = np.sin(X) #Add student t random noise to datapoints - deg_free = 2.5 + deg_free = 3.5 t_rv = t(deg_free, loc=0, scale=1) noise = t_rv.rvs(size=Y.shape) Y += noise #Add some extreme value noise to some of the datapoints - #percent_corrupted = 0.05 + #percent_corrupted = 0.15 #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted)) #indices = np.arange(Y.shape[0]) #np.random.shuffle(indices) @@ -31,11 +31,11 @@ def student_t_approx(): #Y[corrupted_indices] += noise # Kernel object - #print X.shape - #kernel = GPy.kern.rbf(X.shape[1]) + print X.shape + kernel = GPy.kern.rbf(X.shape[1]) - ##A GP should completely break down due to the points as they get a lot of weight - ## create simple GP model + #A GP should completely break down due to the points as 
they get a lot of weight + # create simple GP model #m = GPy.models.GP_regression(X, Y, kernel=kernel) ## optimize @@ -46,27 +46,27 @@ def student_t_approx(): #print m #with a student t distribution, since it has heavy tails it should work well - #likelihood_function = student_t(deg_free, sigma=1) - #lap = Laplace(Y, likelihood_function) - #cov = kernel.K(X) - #lap.fit_full(cov) + likelihood_function = student_t(deg_free, sigma=1) + lap = Laplace(Y, likelihood_function) + cov = kernel.K(X) + lap.fit_full(cov) - #test_range = np.arange(0, 10, 0.1) - #plt.plot(test_range, t_rv.pdf(test_range)) - #for i in xrange(X.shape[0]): - #mode = lap.f_hat[i] - #covariance = lap.hess_hat_i[i,i] - #scaling = np.exp(lap.ln_z_hat) - #normalised_approx = norm(loc=mode, scale=covariance) - #print "Normal with mode %f, and variance %f" % (mode, covariance) - #plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) - #plt.show() - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + test_range = np.arange(0, 10, 0.1) + plt.plot(test_range, t_rv.pdf(test_range)) + for i in xrange(X.shape[0]): + mode = lap.f_hat[i] + covariance = lap.hess_hat_i[i,i] + scaling = np.exp(lap.ln_z_hat) + normalised_approx = norm(loc=mode, scale=covariance) + print "Normal with mode %f, and variance %f" % (mode, covariance) + plt.plot(test_range, scaling*normalised_approx.pdf(test_range)) + plt.show() + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT # Likelihood object t_distribution = student_t(deg_free, sigma=1) stu_t_likelihood = Laplace(Y, t_distribution) - kernel = GPy.kern.rbf(X.shape[1]) + kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1]) m = GPy.models.GP(X, stu_t_likelihood, kernel) m.ensure_default_constraints() diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 23db6abd..84128e3a 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,11 +1,11 @@ import numpy as np import scipy as sp import GPy -#from GPy.util.linalg import jitchol +from scipy.linalg import cholesky, eig, inv from functools import partial from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot -import numpy.testing.assert_array_equal +#import numpy.testing.assert_array_equal class Laplace(likelihood): """Laplace approximation to a posterior""" @@ -56,8 +56,8 @@ class Laplace(likelihood): pass # TODO: Laplace likelihood might want to take some parameters... def _gradients(self,partial): + return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... raise NotImplementedError - #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... def _compute_GP_variables(self): """ @@ -83,16 +83,23 @@ class Laplace(likelihood): and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ """ - self.Sigma_tilde_i = self.hess_hat + self.Ki + self.Sigma_tilde_i = self.hess_hat_i #self.W #self.hess_hat_i - self.Ki #Do we really need to inverse Sigma_tilde_i? :( - (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i) - Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess? - self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)))) + if self.likelihood_function.log_concave: + (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i) + else: + self.Sigma_tilde = inv(self.Sigma_tilde_i) + #f_hat? 
should be f but we must have optimized for them I guess? + Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) + self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + - 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat) + + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) + ) self.Z = self.Z_tilde self.Y = Y_tilde self.covariance_matrix = self.Sigma_tilde - self.precision = 1/np.diag(self.Sigma_tilde)[:, None] + self.precision = 1 / np.diag(self.Sigma_tilde)[:, None] self.YYT = np.dot(self.Y, self.Y.T) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT @@ -112,34 +119,41 @@ class Laplace(likelihood): #FIXME: Can we get rid of this horrible reshaping? def obj(f): #f = f[:, None] - res = -1 * (self.likelihood_function.link_function(self.data[:,0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST) + res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST) return float(res) def obj_grad(f): #f = f[:, None] - res = -1 * (self.likelihood_function.link_grad(self.data[:,0], f) - mdot(self.Ki, f)) + res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f)) return np.squeeze(res) def obj_hess(f): - res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki) + res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki) return np.squeeze(res) self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) #At this point get the hessian matrix - self.hess_hat = np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) + self.Ki + self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat)) + self.hess_hat = self.Ki + self.W (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat) - (self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat_i) - np.testing.assert_array_equal(self.hess_hat, hess_hat_new) + #Check hess_hat is positive definite + try: + cholesky(self.hess_hat) + except: + raise ValueError("Must be positive definite") + + #Check its eigenvalues are positive + eigenvalues = eig(self.hess_hat) + if not np.all(eigenvalues > 0): + raise ValueError("Eigen values not positive") - #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...) - #self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1? 
#z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n) #Unsure whether its log_hess or log_hess_i - self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(f.T, (self.Ki, f)) + self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(self.f_hat.T, (self.Ki, self.f_hat)) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return self._compute_GP_variables() diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index e70cdc8d..c4823703 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -19,6 +19,9 @@ class student_t(likelihood_function): self.v = deg_free self.sigma = sigma + #FIXME: This should be in the superclass + self.log_concave = False + def link_function(self, y, f): """link_function $\ln p(y|f)$ $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$ @@ -70,7 +73,7 @@ class student_t(likelihood_function): assert y.shape == f.shape e = y - f #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2) - hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2) + hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2) return hess def predictive_values(self, mu, var): From 474d5484b06bdbceefa08fa573d28326bb3f8a92 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 21 Mar 2013 14:00:22 +0000 Subject: [PATCH 10/22] Changing definitions again... 
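Editor's note (not part of the original patch email): the _compute_GP_variables
rework below reduces the pseudo-data algebra to W alone. Under the definitions in
the patch-06 docstring, H = K^{-1} + W (hess_hat in the code) and
Sigma_tilde^{-1} = H - K^{-1} = W, a compact sketch of that algebra is given
here; the function name and the explicit inverse are illustrative, not the
patches' API:

    import numpy as np

    def gaussian_pseudo_data(f_hat, W, K):
        """Pseudo-data (Y_tilde, Sigma_tilde) whose Gaussian likelihood
        N(Y_tilde | f, Sigma_tilde), combined with the prior N(f | 0, K),
        has posterior mean f_hat and posterior precision H = K^{-1} + W."""
        H = np.linalg.inv(K) + np.diag(W)   # explicit inverse for clarity only
        Sigma_tilde = np.diag(1.0 / W)      # since Sigma_tilde^{-1} = W
        Y_tilde = np.dot(Sigma_tilde, np.dot(H, f_hat))  # Y_tilde = Sigma_tilde H f_hat
        return Y_tilde, Sigma_tilde

The posterior mean check is one line: (K^{-1} + W)^{-1} W Y_tilde = H^{-1} W
W^{-1} H f_hat = f_hat. Note Sigma_tilde is only a valid covariance when every
W_i > 0; for the Student-t likelihood some W_i can be negative at the mode, which
is exactly the non-log-concave trouble the log_concave flag and the
inv()-instead-of-pdinv() branch above are working around.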
--- python/examples/laplace_approximations.py | 15 +++++--- python/likelihoods/Laplace.py | 44 +++++++++++++++-------- python/likelihoods/likelihood_function.py | 10 ++---- 3 files changed, 43 insertions(+), 26 deletions(-) diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py index c8d06ab2..6f2b19aa 100644 --- a/python/examples/laplace_approximations.py +++ b/python/examples/laplace_approximations.py @@ -15,8 +15,9 @@ def student_t_approx(): Y = np.sin(X) #Add student t random noise to datapoints - deg_free = 3.5 - t_rv = t(deg_free, loc=0, scale=1) + deg_free = 100000.5 + real_var = 4 + t_rv = t(deg_free, loc=0, scale=real_var) noise = t_rv.rvs(size=Y.shape) Y += noise @@ -46,7 +47,7 @@ def student_t_approx(): #print m #with a student t distribution, since it has heavy tails it should work well - likelihood_function = student_t(deg_free, sigma=1) + likelihood_function = student_t(deg_free, sigma=real_var) lap = Laplace(Y, likelihood_function) cov = kernel.K(X) lap.fit_full(cov) @@ -64,7 +65,7 @@ def student_t_approx(): import ipdb; ipdb.set_trace() ### XXX BREAKPOINT # Likelihood object - t_distribution = student_t(deg_free, sigma=1) + t_distribution = student_t(deg_free, sigma=real_var) stu_t_likelihood = Laplace(Y, t_distribution) kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1]) @@ -77,12 +78,16 @@ def student_t_approx(): # optimize #m.optimize() - print(m) + #print(m) # plot m.plot() import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + m.optimize() + print(m) + + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return m diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 84128e3a..b002034d 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,7 +1,7 @@ import numpy as np import scipy as sp import GPy -from scipy.linalg import cholesky, eig, inv +from scipy.linalg import cholesky, eig, inv, det from functools import partial from GPy.likelihoods.likelihood import likelihood from GPy.util.linalg import pdinv,mdot @@ -43,8 +43,10 @@ class Laplace(likelihood): self.Z = 0 self.YYT = None - def predictive_values(self,mu,var): - return self.likelihood_function.predictive_values(mu,var) + def predictive_values(self, mu, var, full_cov): + if full_cov: + raise NotImplementedError("Cannot make correlated predictions with an EP likelihood") + return self.likelihood_function.predictive_values(mu, var) def _get_params(self): return np.zeros(0) @@ -52,10 +54,10 @@ class Laplace(likelihood): def _get_param_names(self): return [] - def _set_params(self,p): + def _set_params(self, p): pass # TODO: Laplace likelihood might want to take some parameters... - def _gradients(self,partial): + def _gradients(self, partial): return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters... raise NotImplementedError @@ -83,7 +85,13 @@ class Laplace(likelihood): and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$ """ - self.Sigma_tilde_i = self.hess_hat_i #self.W #self.hess_hat_i - self.Ki + self.Sigma_tilde_i = self.W #self.hess_hat_i + #Check it isn't singular! + epsilon = 1e-2 + """ + if np.abs(det(self.Sigma_tilde_i)) < epsilon: + raise ValueError("inverse covariance must be non-singular to inverse!") + """ #Do we really need to inverse Sigma_tilde_i? 
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 84128e3a..b002034d 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,7 +1,7 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv
+from scipy.linalg import cholesky, eig, inv, det
 from functools import partial
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv,mdot
@@ -43,8 +43,10 @@ class Laplace(likelihood):
         self.Z = 0
         self.YYT = None

-    def predictive_values(self,mu,var):
-        return self.likelihood_function.predictive_values(mu,var)
+    def predictive_values(self, mu, var, full_cov):
+        if full_cov:
+            raise NotImplementedError("Cannot make correlated predictions with a Laplace likelihood")
+        return self.likelihood_function.predictive_values(mu, var)

     def _get_params(self):
         return np.zeros(0)
@@ -52,10 +54,10 @@ class Laplace(likelihood):
     def _get_param_names(self):
         return []

-    def _set_params(self,p):
+    def _set_params(self, p):
         pass # TODO: Laplace likelihood might want to take some parameters...

-    def _gradients(self,partial):
+    def _gradients(self, partial):
         return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
@@ -83,7 +85,13 @@ class Laplace(likelihood):
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
         """
-        self.Sigma_tilde_i = self.hess_hat_i #self.W #self.hess_hat_i - self.Ki
+        self.Sigma_tilde_i = self.W #self.hess_hat_i
+        #Check it isn't singular!
+        epsilon = 1e-2
+        """
+        if np.abs(det(self.Sigma_tilde_i)) < epsilon:
+            raise ValueError("inverse covariance must be non-singular to invert!")
+        """
         #Do we really need to invert Sigma_tilde_i? :(
         if self.likelihood_function.log_concave:
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
@@ -91,12 +99,17 @@ class Laplace(likelihood):
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
         #f_hat? should be f but we must have optimized for them I guess?
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
-        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST
-                              - 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
-                              + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                              )
+        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                   #- 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                   #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                   #)
+        Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                   + 0.5*self.log_hess_hat_det
+                   + 0.5*mdot(self.f_hat, self.Ki , self.f_hat)
+                   + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                   )

-        self.Z = self.Z_tilde
+        self.Z = Z_tilde
         self.Y = Y_tilde
         self.covariance_matrix = self.Sigma_tilde
         self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
@@ -128,7 +141,7 @@ class Laplace(likelihood):
             return np.squeeze(res)

         def obj_hess(f):
-            res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
+            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
             return np.squeeze(res)

         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
@@ -153,7 +166,10 @@ class Laplace(likelihood):
         #the area of p(f)p(y|f). We do this by matching the height of the distributions at the mode
         #ln z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat + \sum_{n} ln p(y_n|f_n)
         #Unsure whether it's log_hess or log_hess_i
-        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(self.f_hat.T, (self.Ki, self.f_hat))
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        self.ln_z_hat = (-0.5*self.log_hess_hat_det
+                         - 0.5*self.log_Kdet
+                         + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+                         - mdot(self.f_hat.T, (self.Ki, self.f_hat))
+                         )

         return self._compute_GP_variables()

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index c4823703..a299fe3a 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -81,11 +81,7 @@ class student_t(likelihood_function):
         Compute mean, and confidence interval (percentiles 5 and 95) of the prediction
         """
         mean = np.exp(mu)
-        p_025 = stats.t.ppf(025,mean)
-        p_975 = stats.t.ppf(975,mean)
-
-        #p_025 = tmp[:,0]
-        #p_975 = tmp[:,1]
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-        return mean,p_025,p_975
+        p_025 = stats.t.ppf(.025, mean)
+        p_975 = stats.t.ppf(.975, mean)
+        return mean, np.nan*mean, p_025, p_975
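The Student-t derivatives used throughout these patches are easy to get wrong, which is why the
hess line keeps being touched. A standalone finite-difference check of the two closed forms
(assumes only numpy; v, sigma, y, f are arbitrary test values, not from the patches):

import numpy as np

v, sigma = 3.0, 0.7
y, f, h = 1.3, 0.2, 1e-5

def logp(f):
    # constant terms of ln p(y|f) dropped; they do not affect derivatives
    e = y - f
    return -(v + 1) * 0.5 * np.log(1 + (e / sigma)**2 / v)

e = y - f
grad = (v + 1) * e / (v * sigma**2 + e**2)
hess = (v + 1) * (e**2 - v * sigma**2) / ((v * sigma**2 + e**2)**2)

assert abs(grad - (logp(f + h) - logp(f - h)) / (2 * h)) < 1e-6
assert abs(hess - (logp(f + h) - 2 * logp(f) + logp(f - h)) / h**2) < 1e-4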
From 7b0d0550cb01f0c4eca567e80f950e7f54ecb7b2 Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Fri, 22 Mar 2013 12:50:47 +0000
Subject: [PATCH 11/22] Seemed to be working, now it's not

---
 python/examples/laplace_approximations.py | 118 +++++++++++++---------
 python/likelihoods/Laplace.py             |  37 +++----
 2 files changed, 92 insertions(+), 63 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 6f2b19aa..5fb39e08 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -11,15 +11,22 @@ def student_t_approx():
     Example of regressing with a student t likelihood
     """
     #Start a function, any function
-    X = np.sort(np.random.uniform(0, 15, 100))[:, None]
-    Y = np.sin(X)
+    X = np.linspace(0.0, 10.0, 100)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*0.1
+    Yc = Y.copy()
+
+    Y = Y/Y.max()
+
+    Yc[10] += 5
+    Yc[15] += 20
+    Yc = Yc/Yc.max()

     #Add student t random noise to datapoints
-    deg_free = 100000.5
-    real_var = 4
-    t_rv = t(deg_free, loc=0, scale=real_var)
-    noise = t_rv.rvs(size=Y.shape)
-    Y += noise
+    deg_free = 1000000 #100000.5
+    real_var = 0.1
+    #t_rv = t(deg_free, loc=0, scale=real_var)
+    #noise = t_rv.rvs(size=Y.shape)
+    #Y += noise

     #Add some extreme value noise to some of the datapoints
     #percent_corrupted = 0.15
@@ -30,64 +37,83 @@ def student_t_approx():
     #print corrupted_indices
     #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
     #Y[corrupted_indices] += noise
-
+    plt.figure(1)
     # Kernel object
-    print X.shape
-    kernel = GPy.kern.rbf(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()

-    #A GP should completely break down due to the points as they get a lot of weight
-    # create simple GP model
-    #m = GPy.models.GP_regression(X, Y, kernel=kernel)
-
-    ## optimize
+    #print "Clean Gaussian"
+    ##A GP should completely break down due to the points as they get a lot of weight
+    ## create simple GP model
+    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    ### optimize
     #m.ensure_default_constraints()
+    ##m.unconstrain('noise')
+    ##m.constrain_fixed('noise', 0.1)
     #m.optimize()
     ## plot
-    ##m.plot()
+    #plt.subplot(221)
+    #m.plot()
     #print m

+    ##Corrupt
+    #print "Corrupt Gaussian"
+    #m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    #m.ensure_default_constraints()
+    ##m.unconstrain('noise')
+    ##m.constrain_fixed('noise', 0.1)
+    #m.optimize()
+    #plt.subplot(222)
+    #m.plot()
+    #print m

-    #with a student t distribution, since it has heavy tails it should work well
-    likelihood_function = student_t(deg_free, sigma=real_var)
-    lap = Laplace(Y, likelihood_function)
-    cov = kernel.K(X)
-    lap.fit_full(cov)
+    ##with a student t distribution, since it has heavy tails it should work well
+    ##likelihood_function = student_t(deg_free, sigma=real_var)
+    ##lap = Laplace(Y, likelihood_function)
+    ##cov = kernel.K(X)
+    ##lap.fit_full(cov)

-    test_range = np.arange(0, 10, 0.1)
-    plt.plot(test_range, t_rv.pdf(test_range))
-    for i in xrange(X.shape[0]):
-        mode = lap.f_hat[i]
-        covariance = lap.hess_hat_i[i,i]
-        scaling = np.exp(lap.ln_z_hat)
-        normalised_approx = norm(loc=mode, scale=covariance)
-        print "Normal with mode %f, and variance %f" % (mode, covariance)
-        plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
-    plt.show()
+    ##test_range = np.arange(0, 10, 0.1)
+    ##plt.plot(test_range, t_rv.pdf(test_range))
+    ##for i in xrange(X.shape[0]):
+        ##mode = lap.f_hat[i]
+        ##covariance = lap.hess_hat_i[i,i]
+        ##scaling = np.exp(lap.ln_z_hat)
+        ##normalised_approx = norm(loc=mode, scale=covariance)
+        ##print "Normal with mode %f, and variance %f" % (mode, covariance)
+        ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    ##plt.show()

     # Likelihood object
-    t_distribution = student_t(deg_free, sigma=real_var)
+    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
     stu_t_likelihood = Laplace(Y, t_distribution)

-    kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1])
-    m = GPy.models.GP(X, stu_t_likelihood, kernel)
+    print "Clean student t"
+    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    print "NEW MODEL"
-    print(m)
-    # optimize
-    #m.optimize()
-    #print(m)
-
-    # plot
-    m.plot()
-
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-
-    m.optimize()
+    m.update_likelihood_approximation()
+    m.optimize()
     print(m)
+    # plot
+    plt.subplot(211)
+    m.plot_f()
+
+    print "Corrupt student t"
+    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
+    corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(212)
+    m.plot_f()

     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
     return m

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index b002034d..d86523d8 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -33,13 +33,15 @@ class Laplace(likelihood):
         #Initial values
         self.N, self.D = self.data.shape
+        self.is_heteroscedastic = True
+        self.Nparams = 0

         self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))

         #Initial values for the GP variables
-        self.Y = np.zeros((self.N,1))
+        self.Y = np.zeros((self.N, 1))
         self.covariance_matrix = np.eye(self.N)
-        self.precision = np.ones(self.N)[:,None]
+        self.precision = np.ones(self.N)[:, None]
         self.Z = 0
         self.YYT = None
@@ -58,6 +60,7 @@ class Laplace(likelihood):
         pass # TODO: Laplace likelihood might want to take some parameters...

     def _gradients(self, partial):
+        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
@@ -88,10 +91,8 @@ class Laplace(likelihood):
         self.Sigma_tilde_i = self.W #self.hess_hat_i
         #Check it isn't singular!
         epsilon = 1e-2
-        """
         if np.abs(det(self.Sigma_tilde_i)) < epsilon:
             raise ValueError("inverse covariance must be non-singular to invert!")
-        """
         #Do we really need to invert Sigma_tilde_i? :(
         if self.likelihood_function.log_concave:
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
@@ -99,21 +100,17 @@ class Laplace(likelihood):
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
         #f_hat? should be f but we must have optimized for them I guess?
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
-        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-        #- 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
-        #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-        #)
-        Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                   + 0.5*self.log_hess_hat_det
-                   + 0.5*mdot(self.f_hat, self.Ki , self.f_hat)
-                   + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                   )
+        Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                   + 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                   + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                   - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
+                   )

         self.Z = Z_tilde
-        self.Y = Y_tilde
+        self.Y = Y_tilde[:, None]
+        self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
         self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
-        self.YYT = np.dot(self.Y, self.Y.T)
         import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

@@ -122,6 +119,7 @@ class Laplace(likelihood):
         For nomenclature see Rasmussen & Williams 2006
         :K: Covariance matrix
         """
+        self.K = K.copy()
         f = np.zeros((self.N, 1))
         (self.Ki, _, _, self.log_Kdet) = pdinv(K)
         LOG_K_CONST = -(0.5 * self.log_Kdet)
@@ -148,6 +146,11 @@ class Laplace(likelihood):
         #At this point get the hessian matrix
         self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat))
+        if not self.likelihood_function.log_concave:
+            self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                      #if the likelihood is non-log-concave. We want to say that there is a negative variance
+                                      #to cause the posterior to become less certain than the prior and likelihood;
+                                      #this is a property only held by non-log-concave likelihoods
         self.hess_hat = self.Ki + self.W
         (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat)
@@ -166,10 +169,10 @@ class Laplace(likelihood):
         #the area of p(f)p(y|f). We do this by matching the height of the distributions at the mode
         #ln z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat + \sum_{n} ln p(y_n|f_n)
         #Unsure whether it's log_hess or log_hess_i
-        self.ln_z_hat = (-0.5*self.log_hess_hat_det
-                         - 0.5*self.log_Kdet
-                         + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat)
-                         - mdot(self.f_hat.T, (self.Ki, self.f_hat))
-                         )
+        self.ln_z_hat = (- 0.5*self.log_hess_hat_det
+                         + 0.5*self.log_Kdet
+                         + self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+                         - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
+                         )

         return self._compute_GP_variables()
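The Y_tilde manipulated above is the "pseudo-data" device: choose Gaussian targets Y_tilde with
noise covariance Sigma_tilde = W^{-1} so that a purely Gaussian GP conditioned on (Y_tilde,
Sigma_tilde) reproduces the Laplace posterior mean. Requiring the Gaussian posterior mean to equal
the mode, K(K + \tilde{\Sigma})^{-1}\tilde{Y} = \hat{f}, gives (a sketch of the algebra under the
\tilde{\Sigma} = W^{-1} assumption, not a quote from the patch):

$$\tilde{Y} = (K + W^{-1})K^{-1}\hat{f} = (I + W^{-1}K^{-1})\hat{f} = W^{-1}(K^{-1} + W)\hat{f}$$

which is exactly mdot(Sigma_tilde, hess_hat, f_hat) when hess_hat = K^{-1} + W and
Sigma_tilde = W^{-1}.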
From 15d5c2f22dff65a518a4f6a155e457a6516fca17 Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Thu, 28 Mar 2013 17:42:42 +0000
Subject: [PATCH 12/22] Working laplace, just needs predictive values

---
 python/examples/laplace_approximations.py | 80 +++++++++++++----------
 python/likelihoods/Laplace.py             | 15 +++--
 python/likelihoods/likelihood_function.py | 72 ++++++++++++++++++--
 3 files changed, 121 insertions(+), 46 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 5fb39e08..37681849 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -10,20 +10,23 @@ def student_t_approx():
     """
     Example of regressing with a student t likelihood
     """
+    real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 100)[:, None]
-    Y = np.sin(X) + np.random.randn(*X.shape)*0.1
+    X = np.linspace(0.0, 10.0, 30)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
     Yc = Y.copy()

-    Y = Y/Y.max()
+    #Y = Y/Y.max()

-    Yc[10] += 5
-    Yc[15] += 20
-    Yc = Yc/Yc.max()
+    #Yc[10] += 100
+    Yc[25] += 10
+    Yc[23] += 10
+    Yc[24] += 10
+    #Yc = Yc/Yc.max()

     #Add student t random noise to datapoints
-    deg_free = 1000000 #100000.5
-    real_var = 0.1
+    deg_free = 20 #100000.5
+    real_sd = np.sqrt(real_var)
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rv.rvs(size=Y.shape)
     #Y += noise
@@ -38,36 +41,37 @@ def student_t_approx():
     #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
     #Y[corrupted_indices] += noise
     plt.figure(1)
+    plt.suptitle('Gaussian likelihood')
     # Kernel object
     kernel1 = GPy.kern.rbf(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()

-    #print "Clean Gaussian"
-    ##A GP should completely break down due to the points as they get a lot of weight
-    ## create simple GP model
-    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
-    ### optimize
-    #m.ensure_default_constraints()
-    ##m.unconstrain('noise')
-    ##m.constrain_fixed('noise', 0.1)
-    #m.optimize()
-    ## plot
-    #plt.subplot(221)
-    #m.plot()
-    #print m
+    print "Clean Gaussian"
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    ## optimize
+    m.ensure_default_constraints()
+    #m.unconstrain('noise')
+    #m.constrain_fixed('noise', 0.1)
+    m.optimize()
+    # plot
+    plt.subplot(211)
+    m.plot()
+    print m

     ##Corrupt
-    #print "Corrupt Gaussian"
-    #m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
-    #m.ensure_default_constraints()
-    ##m.unconstrain('noise')
-    ##m.constrain_fixed('noise', 0.1)
-    #m.optimize()
-    #plt.subplot(222)
-    #m.plot()
-    #print m
+    print "Corrupt Gaussian"
+    m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    m.ensure_default_constraints()
+    #m.unconstrain('noise')
+    #m.constrain_fixed('noise', 0.1)
+    m.optimize()
+    plt.subplot(212)
+    m.plot()
+    print m

     ##with a student t distribution, since it has heavy tails it should work well
     ##likelihood_function = student_t(deg_free, sigma=real_var)
     ##lap = Laplace(Y, likelihood_function)
     ##cov = kernel.K(X)
     ##lap.fit_full(cov)

     ##test_range = np.arange(0, 10, 0.1)
     ##plt.plot(test_range, t_rv.pdf(test_range))
     ##for i in xrange(X.shape[0]):
         ##mode = lap.f_hat[i]
         ##covariance = lap.hess_hat_i[i,i]
         ##scaling = np.exp(lap.ln_z_hat)
         ##normalised_approx = norm(loc=mode, scale=covariance)
         ##print "Normal with mode %f, and variance %f" % (mode, covariance)
         ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
     ##plt.show()

+    plt.figure(2)
+    plt.suptitle('Student-t likelihood')
+    edited_real_sd = real_sd
+
     # Likelihood object
-    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
-    stu_t_likelihood = Laplace(Y, t_distribution)
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Yc, t_distribution)

     print "Clean student t"
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
     # plot
     plt.subplot(211)
     m.plot_f()
+    plt.ylim(-2.5,2.5)
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

     print "Corrupt student t"
-    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
     corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
     plt.subplot(212)
-    m.plot_f()
-
+    m.plot()
+    plt.ylim(-2.5,2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     return m

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index d86523d8..1411c22b 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -88,11 +88,12 @@ class Laplace(likelihood):
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
         """
-        self.Sigma_tilde_i = self.W #self.hess_hat_i
+        self.Sigma_tilde_i = self.W
         #Check it isn't singular!
-        epsilon = 1e-2
+        epsilon = 1e-6
         if np.abs(det(self.Sigma_tilde_i)) < epsilon:
-            raise ValueError("inverse covariance must be non-singular to invert!")
+            print "WARNING: Transformed covariance matrix is singular!"
+            #raise ValueError("inverse covariance must be non-singular to invert!")
         #Do we really need to invert Sigma_tilde_i? :(
         if self.likelihood_function.log_concave:
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
@@ -110,8 +111,12 @@ class Laplace(likelihood):
         self.Y = Y_tilde[:, None]
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        #if not self.likelihood_function.log_concave:
+            #self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            ##if the likelihood is non-log-concave. We want to say that there is a negative variance
+            ##to cause the posterior to become less certain than the prior and likelihood;
+            ##this is a property only held by non-log-concave likelihoods
+        self.precision = 1 / np.diag(self.covariance_matrix)[:, None]

     def fit_full(self, K):

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index a299fe3a..7ac9c661 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -1,4 +1,5 @@
-from scipy.special import gammaln
+from scipy.special import gammaln, gamma
+from scipy import integrate
 import numpy as np
 from GPy.likelihoods.likelihood_functions import likelihood_function
 from scipy import stats
@@ -79,9 +80,68 @@ class student_t(likelihood_function):
     def predictive_values(self, mu, var):
         """
         Compute mean, and confidence interval (percentiles 2.5 and 97.5) of the prediction

         Need to find what the variance is at the latent points for a student t*normal
         (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
         *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
         """
-        mean = np.exp(mu)
-        p_025 = stats.t.ppf(.025, mean)
-        p_975 = stats.t.ppf(.975, mean)
-        return mean, np.nan*mean, p_025, p_975
+        #p_025 = stats.t.ppf(.025, mu)
+        #p_975 = stats.t.ppf(.975, mu)
+
+        num_test_points = mu.shape[0]
+        #Each mu is the latent point f* at the test point x*,
+        #and the var is the gaussian variance at this point
+        #Take lots of samples from this, so we have lots of possible values
+        #for latent point f* for each test point x*, weighted by how likely we were to pick it
+        print "Taking samples of f* at %d test points" % num_test_points
+        num_f_samples = 10
+        num_y_samples = 10
+        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
+        print "Student t means shape: ", student_t_means.shape
+
+        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
+        #from a student t centred on this point, sample many points from this distribution
+        #centred on f*
+        #for test_point, f in enumerate(student_t_means):
+            #print test_point
+            #print f.shape
+            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
+                                            #scale=self.sigma,
+                                            #size=(num_f_samples, num_y_samples))
+            #print student_t_samples.shape
+
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:,None],
+                                        scale=self.sigma,
+                                        size=(num_test_points, num_y_samples, num_f_samples))
+        student_t_samples = np.reshape(student_t_samples,
+                                       (num_test_points, num_y_samples*num_f_samples))
+
+        #Now take the 97.5 and 2.5 percentiles of these points
+        p_025 = stats.scoreatpercentile(student_t_samples, 2.5, axis=1)[:, None]
+        p_975 = stats.scoreatpercentile(student_t_samples, 97.5, axis=1)[:, None]
+
+        p_025 = 1+p_025
+        p_975 = 1+p_975
+
+        ##Alternatively we could sample from int p(y|f*)p(f*|x*) df*
+        def t_gaussian(f, mu, var):
+            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
+                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
+                    )
+
+        def t_gauss_int(mu, var):
+            print "Mu: ", mu
+            print "var: ", var
+            result = integrate.quad(t_gaussian, -np.inf, 0.975, args=(mu, var))
+            print "Result: ", result
+            return result[0]
+
+        vec_t_gauss_int = np.vectorize(t_gauss_int)
+
+        p_025 = vec_t_gauss_int(mu, var)
+        p_975 = vec_t_gauss_int(mu, var)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+        return mu, np.nan*mu, p_025, p_975
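One detail worth flagging about the sampling scheme above: scipy's scoreatpercentile expects
percentages in [0, 100], not fractions, hence 2.5 and 97.5. The whole two-stage sample collapses
to a few lines with np.percentile (a standalone sketch, assuming only numpy/scipy; the function
name and sample counts are illustrative, not from the patch):

import numpy as np
from scipy import stats

def sampled_interval(mu, var, v, sigma, n_f=500, n_y=100):
    """Monte-Carlo 95% interval for y* ~ int p(y*|f*) N(f*; mu, var) df* at one test point."""
    f_star = np.random.normal(mu, np.sqrt(var), size=n_f)               # samples of f*
    y_star = stats.t.rvs(v, loc=f_star, scale=sigma, size=(n_y, n_f))   # y*|f* samples
    return np.percentile(y_star.ravel(), [2.5, 97.5])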
From ffc168c1d20f36b1e72501176c4a7bb88ff41614 Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Tue, 2 Apr 2013 12:33:01 +0100
Subject: [PATCH 13/22] Added predicted values for student t, works well

---
 python/examples/laplace_approximations.py | 48 +++++++++++------------
 python/likelihoods/likelihood_function.py | 41 ++++++++++++++-----
 2 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 37681849..6374a5fd 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -18,7 +18,7 @@ def student_t_approx():

     #Y = Y/Y.max()

-    #Yc[10] += 100
+    Yc[10] += 100
     Yc[25] += 10
     Yc[23] += 10
     Yc[24] += 10
@@ -52,51 +52,30 @@ def student_t_approx():
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
     m = GPy.models.GP_regression(X, Y, kernel=kernel1)
-    ## optimize
+    # optimize
     m.ensure_default_constraints()
-    #m.unconstrain('noise')
-    #m.constrain_fixed('noise', 0.1)
     m.optimize()
     # plot
     plt.subplot(211)
     m.plot()
     print m

-    ##Corrupt
+    #Corrupt
     print "Corrupt Gaussian"
     m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
     m.ensure_default_constraints()
-    #m.unconstrain('noise')
-    #m.constrain_fixed('noise', 0.1)
     m.optimize()
     plt.subplot(212)
     m.plot()
     print m

-    ##with a student t distribution, since it has heavy tails it should work well
-    ##likelihood_function = student_t(deg_free, sigma=real_var)
-    ##lap = Laplace(Y, likelihood_function)
-    ##cov = kernel.K(X)
-    ##lap.fit_full(cov)
-
-    ##test_range = np.arange(0, 10, 0.1)
-    ##plt.plot(test_range, t_rv.pdf(test_range))
-    ##for i in xrange(X.shape[0]):
-        ##mode = lap.f_hat[i]
-        ##covariance = lap.hess_hat_i[i,i]
-        ##scaling = np.exp(lap.ln_z_hat)
-        ##normalised_approx = norm(loc=mode, scale=covariance)
-        ##print "Normal with mode %f, and variance %f" % (mode, covariance)
-        ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
-    ##plt.show()
-
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd

     # Likelihood object
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Yc, t_distribution)
+    stu_t_likelihood = Laplace(Y, t_distribution)

     print "Clean student t"
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
@@ -86,7 +86,7 @@ def student_t_approx():
     print(m)
     # plot
     plt.subplot(211)
-    m.plot_f()
+    m.plot()
     plt.ylim(-2.5,2.5)
     #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
@@ -124,6 +103,23 @@ def student_t_approx():
     plt.ylim(-2.5,2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

+    ###with a student t distribution, since it has heavy tails it should work well
+    ###likelihood_function = student_t(deg_free, sigma=real_var)
+    ###lap = Laplace(Y, likelihood_function)
+    ###cov = kernel.K(X)
+    ###lap.fit_full(cov)
+
+    ###test_range = np.arange(0, 10, 0.1)
+    ###plt.plot(test_range, t_rv.pdf(test_range))
+    ###for i in xrange(X.shape[0]):
+        ###mode = lap.f_hat[i]
+        ###covariance = lap.hess_hat_i[i,i]
+        ###scaling = np.exp(lap.ln_z_hat)
+        ###normalised_approx = norm(loc=mode, scale=covariance)
+        ###print "Normal with mode %f, and variance %f" % (mode, covariance)
+        ###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    ###plt.show()
+
     return m

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 7ac9c661..61b5c427 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -23,6 +23,10 @@ class student_t(likelihood_function):
         #FIXME: This should be in the superclass
         self.log_concave = False

+    @property
+    def variance(self):
+        return (self.v / float(self.v - 2)) * (self.sigma**2)
+
     def link_function(self, y, f):
         """link_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln\left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$
@@ -79,14 +83,32 @@ class student_t(likelihood_function):

     def predictive_values(self, mu, var):
         """
-        Compute mean, and confidence interval (percentiles 2.5 and 97.5) of the prediction
-
-        Need to find what the variance is at the latent points for a student t*normal
-        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        Compute mean, and confidence interval (percentiles 2.5 and 97.5) of the prediction
+
+        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
         """

+        #We want the variance around test points y, which comes from int p(y*|f*)p(f*) df*
+        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
+        #We are given f* (mu) as the expected value of y*|f*, and its variance around this
+        #was also given to us as (var)
+        #We also need the expected variance of y* around samples f*; this is the variance of the student t distribution,
+        #and the variance of the student t distribution does not depend on f, only on sigma and the degrees of freedom
+        true_var = var + self.variance
+
+        #Now we have an analytical solution for the variance of the distribution p(y*|f*)p(f*) around our test points, but we still
+        #need the 97.5 and 2.5 percentiles.
+        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
+        p_025 = mu - 2.*np.sqrt(true_var)
+        p_975 = mu + 2.*np.sqrt(true_var)
+
+        return mu, np.nan*mu, p_025, p_975
+
+    def sample_predicted_values(self, mu, var):
+        """ Experimental sample approaches and numerical integration """
         #p_025 = stats.t.ppf(.025, mu)
         #p_975 = stats.t.ppf(.975, mu)
@@ -134,14 +156,13 @@ class student_t(likelihood_function):
         def t_gauss_int(mu, var):
             print "Mu: ", mu
             print "var: ", var
-            result = integrate.quad(t_gaussian, -np.inf, 0.975, args=(mu, var))
+            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
             print "Result: ", result
             return result[0]

         vec_t_gauss_int = np.vectorize(t_gauss_int)

-        p_025 = vec_t_gauss_int(mu, var)
-        p_975 = vec_t_gauss_int(mu, var)
+        p = vec_t_gauss_int(mu, var)
+        p_025 = mu - p
+        p_975 = mu + p
         import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
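The decomposition used above is the law of total variance: for y* = f* + noise with Student-t
noise of v degrees of freedom (a finite variance requires v > 2),

$$\operatorname{Var}(y_*) = \operatorname{E}[\operatorname{Var}(y_*|f_*)] + \operatorname{Var}(\operatorname{E}[y_*|f_*]) = \frac{v\sigma^2}{v-2} + \operatorname{Var}(f_*)$$

so true_var = var + v*sigma**2/(v - 2.0), matching the variance property; a Gaussian-style 95%
band is then roughly mu +/- 1.96*np.sqrt(true_var) (a sketch of the identity, not a quote from
the patch, and note the interval needs the standard deviation, not the variance).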
Gaussian" #A GP should completely break down due to the points as they get a lot of weight @@ -58,6 +63,7 @@ def student_t_approx(): # plot plt.subplot(211) m.plot() + plt.plot(X_full, Y_full) print m #Corrupt @@ -67,40 +73,64 @@ def student_t_approx(): m.optimize() plt.subplot(212) m.plot() + plt.plot(X_full, Y_full) print m plt.figure(2) plt.suptitle('Student-t likelihood') edited_real_sd = real_sd - # Likelihood object + print "Clean student t, ncg" t_distribution = student_t(deg_free, sigma=edited_real_sd) - stu_t_likelihood = Laplace(Y, t_distribution) - - print "Clean student t" + stu_t_likelihood = Laplace(Y, t_distribution, rasm=False) m = GPy.models.GP(X, stu_t_likelihood, kernel3) m.ensure_default_constraints() m.update_likelihood_approximation() - # optimize m.optimize() print(m) - # plot - plt.subplot(211) + plt.subplot(221) m.plot() - plt.ylim(-2.5,2.5) - #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) - print "Corrupt student t" + print "Corrupt student t, ncg" t_distribution = student_t(deg_free, sigma=edited_real_sd) - corrupt_stu_t_likelihood = Laplace(Yc, t_distribution) + corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False) + m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(223) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + + print "Clean student t, rasm" + t_distribution = student_t(deg_free, sigma=edited_real_sd) + stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True) + m = GPy.models.GP(X, stu_t_likelihood, kernel6) + m.ensure_default_constraints() + m.update_likelihood_approximation() + m.optimize() + print(m) + plt.subplot(222) + m.plot() + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) + + print "Corrupt student t, rasm" + t_distribution = student_t(deg_free, sigma=edited_real_sd) + corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True) m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4) m.ensure_default_constraints() m.update_likelihood_approximation() m.optimize() print(m) - plt.subplot(212) + plt.subplot(224) m.plot() - plt.ylim(-2.5,2.5) + plt.plot(X_full, Y_full) + plt.ylim(-2.5, 2.5) import ipdb; ipdb.set_trace() ### XXX BREAKPOINT ###with a student t distribution, since it has heavy tails it should work well diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 1411c22b..8eb69869 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -1,16 +1,15 @@ import numpy as np import scipy as sp import GPy -from scipy.linalg import cholesky, eig, inv, det -from functools import partial +from scipy.linalg import cholesky, eig, inv, det, cho_solve from GPy.likelihoods.likelihood import likelihood -from GPy.util.linalg import pdinv,mdot +from GPy.util.linalg import pdinv, mdot, jitchol #import numpy.testing.assert_array_equal class Laplace(likelihood): """Laplace approximation to a posterior""" - def __init__(self, data, likelihood_function): + def __init__(self, data, likelihood_function, rasm=True): """ Laplace Approximation @@ -30,6 +29,7 @@ class Laplace(likelihood): """ self.data = data self.likelihood_function = likelihood_function + self.rasm = rasm #Inital values self.N, self.D = self.data.shape @@ -102,20 +102,16 @@ class Laplace(likelihood): #f_hat? should be f but we must have optimized for them I guess? 
Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) Z_tilde = (self.ln_z_hat - self.NORMAL_CONST - + 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat) + + 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat)) + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde)) - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat)) ) - self.Z = Z_tilde - self.Y = Y_tilde[:, None] + #Convert to float as its (1, 1) and Z must be a scalar + self.Z = np.float64(Z_tilde) + self.Y = Y_tilde self.YYT = np.dot(self.Y, self.Y.T) self.covariance_matrix = self.Sigma_tilde - #if not self.likelihood_function.log_concave: - #self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur - ##If the likelihood is non-log-concave. We wan't to say that there is a negative variance - ##To cause the posterior to become less certain than the prior and likelihood, - ##This is a property only held by non-log-concave likelihoods self.precision = 1 / np.diag(self.covariance_matrix)[:, None] def fit_full(self, K): @@ -125,32 +121,15 @@ class Laplace(likelihood): :K: Covariance matrix """ self.K = K.copy() - f = np.zeros((self.N, 1)) - (self.Ki, _, _, self.log_Kdet) = pdinv(K) - LOG_K_CONST = -(0.5 * self.log_Kdet) - OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST - #Find \hat(f) using a newton raphson optimizer for example - #TODO: Add newton-raphson as subclass of optimizer class - - #FIXME: Can we get rid of this horrible reshaping? - def obj(f): - #f = f[:, None] - res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST) - return float(res) - - def obj_grad(f): - #f = f[:, None] - res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f)) - return np.squeeze(res) - - def obj_hess(f): - res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki) - return np.squeeze(res) - - self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) + self.Ki, _, _, self.log_Kdet = pdinv(K) + if self.rasm: + self.f_hat = self.rasm_mode(K) + else: + self.f_hat = self.ncg_mode(K) #At this point get the hessian matrix - self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat)) + self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat)) + if not self.likelihood_function.log_concave: self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur #If the likelihood is non-log-concave. We wan't to say that there is a negative variance @@ -176,8 +155,92 @@ class Laplace(likelihood): #Unsure whether its log_hess or log_hess_i self.ln_z_hat = (- 0.5*self.log_hess_hat_det + 0.5*self.log_Kdet - + self.likelihood_function.link_function(self.data[:,0], self.f_hat) + + self.likelihood_function.link_function(self.data, self.f_hat) + #+ self.likelihood_function.link_function(self.data, self.f_hat) - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat)) ) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT return self._compute_GP_variables() + + def ncg_mode(self, K): + """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative) + :K: Covariance matrix + :returns: f_mode + """ + self.K = K.copy() + f = np.zeros((self.N, 1)) + (self.Ki, _, _, self.log_Kdet) = pdinv(K) + LOG_K_CONST = -(0.5 * self.log_Kdet) + + #FIXME: Can we get rid of this horrible reshaping? 
+ def obj(f): + res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + + self.NORMAL_CONST + LOG_K_CONST) + return float(res) + + def obj_grad(f): + res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f)) + return np.squeeze(res) + + def obj_hess(f): + res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki) + return np.squeeze(res) + + f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) + return f_hat[:, None] + + def rasm_mode(self, K): + """ + Rasmussens numerically stable mode finding + For nomenclature see Rasmussen & Williams 2006 + + :K: Covariance matrix + :returns: f_mode + """ + f = np.zeros((self.N, 1)) + new_obj = -np.inf + old_obj = np.inf + + def obj(a, f): + #Careful of shape of data! + return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f) + + difference = np.inf + epsilon = 1e-16 + step_size = 1 + while difference > epsilon: + W = -np.diag(self.likelihood_function.link_hess(self.data, f)) + if not self.likelihood_function.log_concave: + #if np.any(W < 0): + #print "NEGATIVE VALUES :(" + #pass + W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur + #If the likelihood is non-log-concave. We wan't to say that there is a negative variance + #To cause the posterior to become less certain than the prior and likelihood, + #This is a property only held by non-log-concave likelihoods + #W is diagnoal so its sqrt is just the sqrt of the diagonal elements + W_12 = np.sqrt(W) + B = np.eye(self.N) + mdot(W_12, K, W_12) + L = jitchol(B) + b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)) + #TODO: Check L is lower + solve_L = cho_solve((L, True), mdot(W_12, (K, b))) + a = b - mdot(W_12, solve_L) + f = np.dot(K, a) + old_obj = new_obj + new_obj = obj(a, f) + difference = new_obj - old_obj + #print "Difference: ", new_obj - old_obj + if difference < 0: + #If the objective function isn't rising, restart optimization + print "Reducing step-size, restarting" + #objective function isn't increasing, try reducing step size + step_size *= 0.9 + f = np.zeros((self.N, 1)) + new_obj = -np.inf + old_obj = np.inf + + difference = abs(difference) + + return f diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py index 50f9b620..15859a81 100644 --- a/python/likelihoods/likelihood_function.py +++ b/python/likelihoods/likelihood_function.py @@ -36,7 +36,10 @@ class student_t(likelihood_function): :returns: float(likelihood evaluated for this point) """ + y = np.squeeze(y) + f = np.squeeze(f) assert y.shape == f.shape + e = y - f objective = (gammaln((self.v + 1) * 0.5) - gammaln(self.v * 0.5) @@ -44,6 +47,7 @@ class student_t(likelihood_function): - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v)) ) + print (e**2).shape return np.sum(objective) def link_grad(self, y, f): @@ -57,10 +61,12 @@ class student_t(likelihood_function): :returns: gradient of likelihood evaluated at points """ + y = np.squeeze(y) + f = np.squeeze(f) assert y.shape == f.shape e = y - f grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2)) - return grad + return np.squeeze(grad) def link_hess(self, y, f): """ @@ -75,11 +81,12 @@ class student_t(likelihood_function): :f: latent variables f :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points) """ + y = np.squeeze(y) + f = 
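The loop above performs one Newton step per iteration in the stable parameterisation of Rasmussen
& Williams (2006), Algorithm 3.1. Stripped of the restart logic it reduces to the following
(a condensed sketch, assuming numpy/scipy and 1-D vectors f, w; link_grad/link_hess are the
per-point derivatives of ln p(y|f) as above; not a drop-in replacement for the patch code):

import numpy as np
from scipy.linalg import cholesky, cho_solve

def newton_step(K, f, y, link_grad, link_hess):
    # W = -d2 ln p(y|f) / df2 as a vector, clipped so B stays positive definite
    w = np.clip(-link_hess(y, f), 1e-6, np.inf)
    w12 = np.sqrt(w)
    # B = I + W^0.5 K W^0.5 has eigenvalues >= 1, so its Cholesky needs no jitter
    B = np.eye(K.shape[0]) + w12[:, None] * K * w12[None, :]
    L = cholesky(B, lower=True)
    b = w * f + link_grad(y, f)
    # a = b - W^0.5 B^-1 W^0.5 K b; then f_new = K a maximises the local quadratic model
    a = b - w12 * cho_solve((L, True), w12 * K.dot(b))
    return K.dot(a)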
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 50f9b620..15859a81 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -36,7 +36,10 @@ class student_t(likelihood_function):
         :returns: float(likelihood evaluated for this point)

         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
+
         e = y - f
         objective = (gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - 0.5*np.log(self.sigma**2 * self.v * np.pi)
                      - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v))
                      )
+        print (e**2).shape
         return np.sum(objective)

     def link_grad(self, y, f):
@@ -57,10 +61,12 @@ class student_t(likelihood_function):
         :returns: gradient of likelihood evaluated at points

         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
-        return grad
+        return np.squeeze(grad)

     def link_hess(self, y, f):
@@ -75,11 +81,12 @@ class student_t(likelihood_function):
         :f: latent variables f
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
-        return hess
+        return np.squeeze(hess)

     def predictive_values(self, mu, var):
         """

From 2006a94caa859d195a7c2af1236eb84656b68cfc Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Wed, 3 Apr 2013 10:55:58 +0100
Subject: [PATCH 16/22] Fixed broadcasting bug, rasm now appears to work

---
 python/likelihoods/Laplace.py             | 16 ++++++++++------
 python/likelihoods/likelihood_function.py |  1 -
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 8eb69869..e967a743 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -159,7 +159,6 @@ class Laplace(likelihood):
                          #+ self.likelihood_function.link_function(self.data, self.f_hat)
                          - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
                          )
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

         return self._compute_GP_variables()
@@ -190,7 +189,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
         return f_hat[:, None]

-    def rasm_mode(self, K):
+    def rasm_mode(self, K, MAX_ITER=5000, MAX_RESTART=30):
         """
         Rasmussen's numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -209,7 +208,9 @@ class Laplace(likelihood):
         difference = np.inf
         epsilon = 1e-16
         step_size = 1
-        while difference > epsilon:
+        rs = 0
+        i = 0
+        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
                 #if np.any(W < 0):
@@ -223,7 +224,7 @@ class Laplace(likelihood):
             W_12 = np.sqrt(W)
             B = np.eye(self.N) + mdot(W_12, K, W_12)
             L = jitchol(B)
-            b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f))
+            b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)[:, None])
             #TODO: Check L is lower
             solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
             a = b - mdot(W_12, solve_L)
@@ -234,13 +235,16 @@ class Laplace(likelihood):
             #print "Difference: ", new_obj - old_obj
             if difference < 0:
                 #If the objective function isn't rising, restart optimization
                 step_size *= 0.9
-                print "Reducing step-size, restarting"
-                #objective function isn't increasing, try reducing step size
+                print "Objective function fell"
+                print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                #objective function isn't increasing, try reducing step size
                 f = np.zeros((self.N, 1))
                 new_obj = -np.inf
                 old_obj = np.inf
+                rs += 1

             difference = abs(difference)
+            i += 1

         return f

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 15859a81..49174ce7 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -47,7 +47,6 @@ class student_t(likelihood_function):
                      - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v))
                      )
-        print (e**2).shape
         return np.sum(objective)

     def link_grad(self, y, f):
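A note on the restart strategy just introduced: restarting from f = 0 whenever a step overshoots
discards all progress made so far. A more conventional damped-Newton loop backtracks only the
offending step (a sketch of the alternative, not what the patch implements; newton_direction and
objective are hypothetical callables standing in for the step and objective defined above):

def damped_newton_update(f, newton_direction, objective, min_step=1e-8):
    """Backtracking variant: halve the step until the objective stops decreasing."""
    f_new = newton_direction(f)
    step = 1.0
    while objective(f + step * (f_new - f)) < objective(f) and step > min_step:
        step *= 0.5
    return f + step * (f_new - f)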
From 4a14a82dfba4bd3c48d4175bb8a861bab24a0d10 Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Fri, 5 Apr 2013 17:34:11 +0100
Subject: [PATCH 17/22] Got the mode finding without computing Ki

---
 python/examples/laplace_approximations.py |  85 +++++++++-----
 python/likelihoods/Laplace.py             | 130 ++++++++++++++++------
 2 files changed, 152 insertions(+), 63 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index a1c71c71..7ab26406 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -6,6 +6,38 @@ from coxGP.python.likelihoods.Laplace import Laplace
 from coxGP.python.likelihoods.likelihood_function import student_t

+def timing():
+    real_var = 0.1
+    times = 1000
+    deg_free = 10
+    real_sd = np.sqrt(real_var)
+    the_is = np.zeros(times)
+    X = np.linspace(0.0, 10.0, 30)[:, None]
+    for a in xrange(times):
+        Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+        Yc = Y.copy()
+
+        Yc[10] += 100
+        Yc[25] += 10
+        Yc[23] += 10
+        Yc[24] += 10
+
+        edited_real_sd = real_sd
+        kernel1 = GPy.kern.rbf(X.shape[1])
+
+        t_distribution = student_t(deg_free, sigma=edited_real_sd)
+        corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
+        m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
+        m.ensure_default_constraints()
+        m.update_likelihood_approximation()
+        m.optimize()
+        the_is[a] = m.likelihood.i
+
+    print the_is
+    print np.mean(the_is)
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+
 def student_t_approx():
     """
     Example of regressing with a student t likelihood
@@ -80,32 +112,6 @@ def student_t_approx():
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd

-    print "Clean student t, ncg"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
-    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    plt.subplot(221)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
-
-    print "Corrupt student t, ncg"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
-    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    plt.subplot(223)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
-
     print "Clean student t, rasm"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
@@ -133,6 +139,33 @@ def student_t_approx():
     plt.ylim(-2.5, 2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT

+    print "Clean student t, ncg"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(221)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Corrupt student t, ncg"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(223)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
     ###with a student t distribution, since it has heavy tails it should work well
     ###likelihood_function = student_t(deg_free, sigma=real_var)
     ###lap = Laplace(Y, likelihood_function)
     ###cov = kernel.K(X)
     ###lap.fit_full(cov)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index e967a743..396a0bc7 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -100,12 +100,19 @@ class Laplace(likelihood):
         else:
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
         #f_hat? should be f but we must have optimized for them I guess?
-        Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
+        #Y_tilde = mdot(self.Sigma_tilde, self.hess_hat_i, self.f_hat)
+        Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
+        #KW = np.dot(self.K, self.W)
+        #KW_i, _, _, _ = pdinv(KW)
+        #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat)
-        Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                   + 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
-                   + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                   - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
-                   )
+        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                   #+ 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                   #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                   #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
+                   #)
+        _, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
+        f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
+        Z_tilde = -self.NORMAL_CONST + self.ln_z_hat - 0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f

         #Convert to float as it's (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -125,7 +132,7 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
-        self.Ki, _, _, self.log_Kdet = pdinv(K)
+        self.Ki, _, _, log_Kdet = pdinv(K)
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
             self.f_hat = self.ncg_mode(K)

         #At this point get the hessian matrix
         self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                       #if the likelihood is non-log-concave. We want to say that there is a negative variance
                                       #to cause the posterior to become less certain than the prior and likelihood;
                                       #this is a property only held by non-log-concave likelihoods

+        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
+        self.B, L, self.W_12 = self._compute_B_statistics(K, self.W)
+        self.Bi, _, _, B_det = pdinv(self.B)
+        #ln_W_det = np.linalg.det(self.W)
+        #ln_B_det = np.linalg.det(self.B)
+        ln_det = np.linalg.det(np.eye(self.N) - mdot(self.W_12, self.Bi, self.W_12, K))
+        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
+        #TODO: Check L is lower
+        solve_L = cho_solve((L, True), mdot(self.W_12, (K, b)))
+        a = b - mdot(self.W_12, solve_L)
+        self.f_Ki_f = np.dot(self.f_hat.T, a)

-        self.hess_hat = self.Ki + self.W
-        (self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat)
-
-        ##Check hess_hat is positive definite
-        #try:
-            #cholesky(self.hess_hat)
-        #except:
-            #raise ValueError("Must be positive definite")
-
-        ##Check its eigenvalues are positive
-        #eigenvalues = eig(self.hess_hat)
-        #if not np.all(eigenvalues > 0):
-            #raise ValueError("Eigen values not positive")

         #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
         #the area of p(f)p(y|f). We do this by matching the height of the distributions at the mode
         #ln z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat + \sum_{n} ln p(y_n|f_n)
         #Unsure whether it's log_hess or log_hess_i
-        #self.ln_z_hat = (- 0.5*self.log_hess_hat_i_det
-                         #+ 0.5*self.log_Kdet
-                         #+ self.likelihood_function.link_function(self.data, self.f_hat)
-                         #- 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
-                         #)
+        self.ln_z_hat = (- 0.5*log_Kdet
+                         - 0.5*self.f_Ki_f
+                         + self.likelihood_function.link_function(self.data, self.f_hat)
+                         + 0.5*ln_det
+                         )

         return self._compute_GP_variables()

+    def _compute_B_statistics(self, K, W):
+        """Rasmussen suggests the use of a numerically stable positive definite matrix B,
+        which has a positive diagonal and can be easily inverted
+
+        :K: Covariance matrix
+        :W: Negative hessian at a point (diagonal matrix)
+        :returns: (B, L)
+        """
+        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
+        W_12 = np.sqrt(W)
+        B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
+        L = jitchol(B)
+        return (B, L, W_12)
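# A useful check on the ln_det term used above (and note np.linalg.det returns the
# determinant itself, not its log): with B = I + W^{1/2} K W^{1/2}, the matrix
# W^{1/2} B^{-1} W^{1/2} K has the same spectrum as
# B^{-1} W^{1/2} K W^{1/2} = B^{-1}(B - I) = I - B^{-1}, so
#     det(I - W^{1/2} B^{-1} W^{1/2} K) = 1/det(B)
# and the stable way to get its log is -2*sum(log(diag(L))) for L = chol(B).
# (A sketch of the identity under the clipped diagonal W used here; these comment
# lines are editorial, not lines from the patch.)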
    def ncg_mode(self, K):
        """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuitive)
        :K: Covariance matrix
        :returns: f_mode
        """
        f = np.zeros((self.N, 1))
        LOG_K_CONST = -(0.5 * self.log_Kdet)

        #FIXME: Can we get rid of this horrible reshaping?
        def obj(f):
            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
                        + self.NORMAL_CONST + LOG_K_CONST)
            return float(res)

        def obj_grad(f):
            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
            return np.squeeze(res)

        def obj_hess(f):
            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
            return np.squeeze(res)

        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
        return f_hat[:, None]

    def rasm_mode(self, K, MAX_ITER=5000000000000000, MAX_RESTART=30):
        """
        Rasmussen's numerically stable mode finding
        For nomenclature see Rasmussen & Williams 2006

        :K: Covariance matrix
        :returns: f_mode
        """
        f = np.zeros((self.N, 1))
        new_obj = -np.inf
        old_obj = np.inf

        def obj(a, f):
            #Careful of shape of data!
            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)

        difference = np.inf
        epsilon = 1e-6
        step_size = 1
        rs = 0
        i = 0
        while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART:
            f_old = f.copy()
            W = -np.diag(self.likelihood_function.link_hess(self.data, f))
            if not self.likelihood_function.log_concave:
                #if np.any(W < 0):
                    #print "NEGATIVE VALUES :("
                    #pass
                W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                #if the likelihood is non-log-concave. We want to say that there is a negative variance
                                #to cause the posterior to become less certain than the prior and likelihood;
                                #this is a property only held by non-log-concave likelihoods
            B, L, W_12 = self._compute_B_statistics(K, W)

            W_f = np.dot(W, f)
            grad = self.likelihood_function.link_grad(self.data, f)[:, None]
            #Find K_i_f
            b = W_f + grad
            #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
            #TODO: Check L is lower
            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
            a = b - mdot(W_12, solve_L)
            #f = np.dot(K, a)

            #a should be equal to Ki*f now so should be able to use it
            c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
            solve_L = cho_solve((L, True), mdot(W_12, c))
            f = c - mdot(K, W_12, solve_L)

            #K_w_f = mdot(K, (W, f))
            #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
            #d = f + K_w_f + c
            #solve_L = cho_solve((L, True), mdot(W_12, d))
            #f = c - mdot(K, (W_12, solve_L))
            #a = mdot(self.Ki, f)

            tmp_old_obj = old_obj
            old_obj = new_obj
            new_obj = obj(a, f)
            difference = new_obj - old_obj
            #print "Difference: ", difference
            if difference < 0:
                #print "Objective function fell", difference
                #If the objective function isn't rising, restart optimization
                step_size *= 0.9
                #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                #objective function isn't increasing, try reducing step size
                #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
                old_obj = tmp_old_obj
                rs += 1

            difference = abs(difference)
            i += 1

        self.i = i
        print "{i} steps".format(i=i)
        return f

From 31d8faecf866307c69dcade761ddb77d628b773e Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Fri, 5 Apr 2013 17:56:02 +0100
Subject: [PATCH 18/22] Added timing and realised mdot can be faster as it's
 almost always a diagonal matrix it's multiplying with
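Concretely: with W stored as a vector w rather than a dense diagonal matrix, every product in the
mode-finding loop collapses to elementwise work. A sketch of the replacements this message refers
to (assuming numpy and the same shapes as above; not lines from the patch):

np.dot(W, f)          ->  w * f                             # O(N) instead of O(N^2)
mdot(W_12, K, W_12)   ->  w12[:, None] * K * w12[None, :]   # scale rows and columns
mdot(W_12, solve_L)   ->  w12 * solve_L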
np.mean(the_is) - import ipdb; ipdb.set_trace() ### XXX BREAKPOINT def student_t_approx(): diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py index 396a0bc7..734bf6c8 100644 --- a/python/likelihoods/Laplace.py +++ b/python/likelihoods/Laplace.py @@ -128,7 +128,9 @@ class Laplace(likelihood): :K: Covariance matrix """ self.K = K.copy() - self.Ki, _, _, log_Kdet = pdinv(K) + print "Inverting K" + #self.Ki, _, _, log_Kdet = pdinv(K) + print "K inverted, optimising" if self.rasm: self.f_hat = self.rasm_mode(K) else: @@ -196,6 +198,7 @@ class Laplace(likelihood): """ #W is diagnoal so its sqrt is just the sqrt of the diagonal elements W_12 = np.sqrt(W) + import ipdb; ipdb.set_trace() ### XXX BREAKPOINT B = np.eye(K.shape[0]) + mdot(W_12, K, W_12) L = jitchol(B) return (B, L, W_12) @@ -205,9 +208,7 @@ class Laplace(likelihood): :K: Covariance matrix :returns: f_mode """ - self.K = K.copy() f = np.zeros((self.N, 1)) - (self.Ki, _, _, self.log_Kdet) = pdinv(K) LOG_K_CONST = -(0.5 * self.log_Kdet) #FIXME: Can we get rid of this horrible reshaping? @@ -227,7 +228,7 @@ class Laplace(likelihood): f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess) return f_hat[:, None] - def rasm_mode(self, K, MAX_ITER=5000000000000000, MAX_RESTART=30): + def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50): """ Rasmussens numerically stable mode finding For nomenclature see Rasmussen & Williams 2006 @@ -249,6 +250,7 @@ class Laplace(likelihood): rs = 0 i = 0 while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART: + print "optimising" f_old = f.copy() W = -np.diag(self.likelihood_function.link_hess(self.data, f)) if not self.likelihood_function.log_concave: @@ -259,22 +261,25 @@ class Laplace(likelihood): #If the likelihood is non-log-concave. We wan't to say that there is a negative variance #To cause the posterior to become less certain than the prior and likelihood, #This is a property only held by non-log-concave likelihoods + print "Decomposing" B, L, W_12 = self._compute_B_statistics(K, W) + print "Finding f" - W_f = np.dot(W, f) + W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal! grad = self.likelihood_function.link_grad(self.data, f)[:, None] #Find K_i_f b = W_f + grad #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None] #TODO: Check L is lower - solve_L = cho_solve((L, True), mdot(W_12, (K, b))) - a = b - mdot(W_12, solve_L) + + solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal! + a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal! #f = np.dot(K, a) #a should be equal to Ki*f now so should be able to use it c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad) - solve_L = cho_solve((L, True), mdot(W_12, c)) - f = c - mdot(K, W_12, solve_L) + solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal! + f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal! 

             #K_w_f = mdot(K, (W, f))
             #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
             #d = f + K_w_f + c
             #solve_L = cho_solve((L, True), mdot(W_12, d))
             #f = c - mdot(K, (W_12, solve_L))
             #a = mdot(self.Ki, f)
@@ -302,5 +307,5 @@ class Laplace(likelihood):
             i += 1

         self.i = i
-        print "{i} steps".format(i=i)
+        #print "{i} steps".format(i=i)
         return f
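The commit message's point about mdot can be made concrete. When one factor is diagonal, building it as a dense matrix turns an O(N^2) scaling into an O(N^3) matrix product; broadcasting against the stored diagonal avoids that. A hedged illustration (plain numpy, sizes invented), which is presumably what the FIXME comments in this patch are after:

import numpy as np

N = 200
K = np.random.randn(N, N)
W_12 = np.random.rand(N)            # diagonal of W^{1/2}, stored as a vector

dense = np.dot(np.diag(W_12), np.dot(K, np.diag(W_12)))   # two O(N^3) products
fast = W_12[:, None] * K * W_12[None, :]                  # one O(N^2) scaling

assert np.allclose(dense, fast)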
From 431f93ef231875aeb6adbe6be2c70ea807aafdce Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Mon, 8 Apr 2013 18:09:07 +0100
Subject: [PATCH 19/22] Stabilised most of the algorithm (apart from the end
 inversion which is impossible)

---
 python/likelihoods/Laplace.py | 132 ++++++++++++++++++----------------
 1 file changed, 72 insertions(+), 60 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 734bf6c8..77359769 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -3,9 +3,15 @@ import numpy as np
 import scipy as sp
 import GPy
 from scipy.linalg import cholesky, eig, inv, det, cho_solve
 from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv, mdot, jitchol
+from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
+from scipy.linalg.lapack import dtrtrs
 #import numpy.testing.assert_array_equal

+#TODO: Move this to utils
+def det_ln_diag(A):
+    return np.log(np.diagonal(A)).sum()
+
+
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""

@@ -60,7 +66,6 @@ class Laplace(likelihood):
         pass # TODO: Laplace likelihood might want to take some parameters...

     def _gradients(self, partial):
-        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError

@@ -99,9 +104,26 @@ class Laplace(likelihood):
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
         else:
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
-        #f_hat? should be f but we must have optimized for them I guess?
-        #Y_tilde = mdot(self.Sigma_tilde, self.hess_hat_i, self.f_hat)
         Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
+
+        #dtrtri -> L -> L_i
+        #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
+        #((L.T*W)_i + I)f_hat = y_tilde
+        L = jitchol(self.K)
+        Li = chol_inv(L)
+        Lt_W = np.dot(L.T, self.W)
+        if np.abs(det(Lt_W)) < epsilon:
+            print "WARNING: Transformed covariance matrix is singular!"
+        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
+        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
+        import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
+
+        #if np.abs(det(KW)) < epsilon:
+            #print "WARNING: Transformed covariance matrix is singular!"
+        #KW_i = inv(KW)
+        #Y_tilde = mdot(KW_i + np.eye(self.N), self.f_hat)
+
+        #Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
         #KW = np.dot(self.K, self.W)
         #KW_i, _, _, _ = pdinv(KW)
         #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat)
@@ -110,16 +132,38 @@ class Laplace(likelihood):
         #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
         #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
         #)
-        _, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
-        f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
-        Z_tilde = -self.NORMAL_CONST + self.ln_z_hat - 0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f
+        #_, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
+        #f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
+        #Z_tilde = -self.NORMAL_CONST + self.ln_z_hat - 0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f
+
+        #f_W_f = mdot(self.f_hat.T, self.W, self.f_hat)
+        #f_Y_f = mdot(Y_tilde, self.W, Y_tilde)
+        #Z_tilde = (np.dot(self.W, self.f_hat) - 0.5*y_W_y + self.ln_z_hat
+        #- 0.5*mdot(self.f_hat, (
+
+        f_Ki_W_f = mdot(self.f_hat.T, (self.Ki + self.W), self.f_hat)
+        y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
+        y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
+        self.ln_W_det = det_ln_diag(self.W)
+        Z_tilde = (self.NORMAL_CONST
+                   - 0.5*self.ln_K_det
+                   - 0.5*self.ln_W_det
+                   - 0.5*self.ln_Ki_W_i_det
+                   - 0.5*f_Ki_W_f
+                   - 0.5*y_W_y
+                   + y_W_f
+                   + self.ln_z_hat
+                   )
+
+        Sigma_tilde = inv(self.W)  # Damn

         #Convert to float as it's (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = self.Sigma_tilde
+        self.covariance_matrix = Sigma_tilde
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
+        import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT

     def fit_full(self, K):
         """
@@ -128,9 +172,7 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
-        print "Inverting K"
-        #self.Ki, _, _, log_Kdet = pdinv(K)
-        print "K inverted, optimising"
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
@@ -144,46 +186,24 @@ class Laplace(likelihood):
                 #If the likelihood is non-log-concave. We want to say that there is a negative variance
                #To cause the posterior to become less certain than the prior and likelihood,
                #This is a property only held by non-log-concave likelihoods

+        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
-        self.B, L, self.W_12 = self._compute_B_statistics(K, self.W)
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
-        #ln_W_det = np.linalg.det(self.W)
-        #ln_B_det = np.linalg.det(self.B)
-        ln_det = np.linalg.det(np.eye(self.N) - mdot(self.W_12, self.Bi, self.W_12, K))
+
+        Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
+        self.ln_Ki_W_i_det = np.log(np.linalg.det(Ki_W_i))  # the log determinant, not the raw determinant
+
         b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
-        #TODO: Check L is lower
-        solve_L = cho_solve((L, True), mdot(self.W_12, (K, b)))
-        a = b - mdot(self.W_12, solve_L)
+        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
+        a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)

-        #self.hess_hat = self.Ki + self.W
-        #(self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat)
-
-        ##Check hess_hat is positive definite
-        #try:
-            #cholesky(self.hess_hat)
-        #except:
-            #raise ValueError("Must be positive definite")
-
-        ##Check its eigenvalues are positive
-        #eigenvalues = eig(self.hess_hat)
-        #if not np.all(eigenvalues > 0):
-            #raise ValueError("Eigen values not positive")
-
-        #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
-        #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
-        #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
-        #Unsure whether it's log_hess or log_hess_i
-        #self.ln_z_hat = (- 0.5*self.log_hess_hat_i_det
                          #+ 0.5*self.log_Kdet
                          #+ self.likelihood_function.link_function(self.data, self.f_hat)
                          ##+ self.likelihood_function.link_function(self.data, self.f_hat)
                          #- 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
                          #)
-        self.ln_z_hat = (- 0.5*log_Kdet
+        self.ln_z_hat = (self.NORMAL_CONST
                         - 0.5*self.f_Ki_f
+                        - 0.5*self.ln_K_det
+                        + 0.5*self.ln_Ki_W_i_det
                         + self.likelihood_function.link_function(self.data, self.f_hat)
-                        + 0.5*ln_det
                         )

         return self._compute_GP_variables()
@@ -198,7 +218,7 @@ class Laplace(likelihood):
         """
         #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
+        #import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
         B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
         L = jitchol(B)
         return (B, L, W_12)
@@ -209,12 +229,12 @@ class Laplace(likelihood):
         :K: Covariance matrix
         :returns: f_mode
         """
         f = np.zeros((self.N, 1))
-        LOG_K_CONST = -(0.5 * self.log_Kdet)

         #FIXME: Can we get rid of this horrible reshaping?
         #ONLY WORKS FOR 1D DATA
         def obj(f):
             res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
-                        + self.NORMAL_CONST + LOG_K_CONST)
+                        + self.NORMAL_CONST)
             return float(res)

         def obj_grad(f):
@@ -249,21 +269,15 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
-        while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART:
-            print "optimising"
+        while difference > epsilon:  # and i < MAX_ITER and rs < MAX_RESTART:
             f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
-                #if np.any(W < 0):
                     #print "NEGATIVE VALUES :("
                     #pass
                 W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                 #If the likelihood is non-log-concave. We want to say that there is a negative variance
                 #To cause the posterior to become less certain than the prior and likelihood,
                 #This is a property only held by non-log-concave likelihoods
-            print "Decomposing"
             B, L, W_12 = self._compute_B_statistics(K, W)
-            print "Finding f"

             W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal!
             grad = self.likelihood_function.link_grad(self.data, f)[:, None]
             #Find K_i_f
             b = W_f + grad
             #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
             #TODO: Check L is lower
-            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
-            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
-            #f = np.dot(K, a)
-
             #a should be equal to Ki*f now so should be able to use it
             c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
             solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal!
             f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!

+            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
+            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
+            #f = np.dot(K, a)
+
             #K_w_f = mdot(K, (W, f))
             #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
             #d = f + K_w_f + c
             #solve_L = cho_solve((L, True), mdot(W_12, d))
             #f = c - mdot(K, (W_12, solve_L))
             #a = mdot(self.Ki, f)

             tmp_old_obj = old_obj
             old_obj = new_obj
             new_obj = obj(a, f)
             difference = new_obj - old_obj
-            #print "Difference: ", difference
             if difference < 0:
                 #print "Objective function rose", difference
                 #If the objective function isn't rising, restart optimization
@@ -307,5 +320,4 @@ class Laplace(likelihood):
             i += 1

         self.i = i
-        #print "{i} steps".format(i=i)
         return f
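Before the next patch builds on it, the pseudo-target trick introduced here can be checked in isolation: with K = L L^T, the quantity ((L^T W)^{-1} L^{-1} + I) f equals W^{-1}(K^{-1} + W) f, so the "Gaussianised" targets come out of two triangular solves rather than an inversion of K. A sketch under stated assumptions (K symmetric positive definite, W diagonal and positive; names are invented, only numpy/scipy assumed):

import numpy as np
from scipy.linalg import cholesky, solve_triangular

np.random.seed(1)
N = 6
A = np.random.randn(N, N)
K = np.dot(A, A.T) + N * np.eye(N)
W = np.diag(np.random.rand(N) + 0.5)
f_hat = np.random.randn(N, 1)

L = cholesky(K, lower=True)
Li_f = solve_triangular(L, f_hat, lower=True)              # L^{-1} f_hat
tmp = solve_triangular(np.dot(L.T, W), Li_f, lower=False)  # (L^T W)^{-1} L^{-1} f_hat
Y_tilde = tmp + f_hat                                      # = W^{-1}(K^{-1} + W) f_hat

naive = np.dot(np.linalg.inv(W), np.dot(np.linalg.inv(K) + W, f_hat))
assert np.allclose(Y_tilde, naive)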
From e0c1e4a4df600d24f075cc13a359a4bc77dfcff3 Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Mon, 8 Apr 2013 19:58:54 +0100
Subject: [PATCH 20/22] Fixed the Laplace approximation and made it more
 numerically stable with Cholesky decompositions, and commented

---
 python/examples/laplace_approximations.py |   1 -
 python/likelihoods/Laplace.py             | 142 ++++++++++------------
 2 files changed, 65 insertions(+), 78 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 28a92c61..0500ba02 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -140,7 +140,6 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
-    import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT

     print "Clean student t, ncg"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 77359769..27ab7613 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,17 +1,32 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, det, cho_solve
+from scipy.linalg import cholesky, eig, inv, cho_solve
+from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
 from scipy.linalg.lapack import dtrtrs
-#import numpy.testing.assert_array_equal

 #TODO: Move this to utils
+
+
 def det_ln_diag(A):
+    """
+    log determinant of a diagonal matrix
+    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
+    """
     return np.log(np.diagonal(A)).sum()


+def pddet(A):
+    """
+    Log determinant of a positive definite matrix, computed via Cholesky
+    """
+    L = cholesky(A)
+    logdetA = 2*sum(np.log(np.diag(L)))
+    return logdetA
+
+
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""

@@ -30,7 +45,8 @@ class Laplace(likelihood):
         ---------

         :data: @todo
-        :likelihood_function: @todo
+        :likelihood_function: likelihood function - subclass of likelihood_function
+        :rasm: Flag of whether to use Rasmussen's numerically stable mode finding or simple ncg optimisation

         """
         self.data = data
@@ -63,10 +79,10 @@ class Laplace(likelihood):
         return []

     def _set_params(self, p):
-        pass # TODO: Laplace likelihood might want to take some parameters...
+        pass  # TODO: Laplace likelihood might want to take some parameters...

     def _gradients(self, partial):
-        return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
+        return np.zeros(0)  # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError

@@ -91,20 +107,10 @@ class Laplace(likelihood):
         i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log p(y|f))$$
         since $diag(\nabla\nabla \log p(y|f)) = H - K^{-1}$
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
+        $$\tilde{\Sigma} = W^{-1}$$
         """
-        self.Sigma_tilde_i = self.W
-        #Check it isn't singular!
         epsilon = 1e-6
-        if np.abs(det(self.Sigma_tilde_i)) < epsilon:
-            print "WARNING: Transformed covariance matrix is singular!"
-            #raise ValueError("inverse covariance must be non-singular to invert!")
-        #Do we really need to inverse Sigma_tilde_i? :(
-        if self.likelihood_function.log_concave:
-            (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
-        else:
-            self.Sigma_tilde = inv(self.Sigma_tilde_i)
-        Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)

         #dtrtri -> L -> L_i
         #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
         #((L.T*W)_i + I)f_hat = y_tilde
         L = jitchol(self.K)
         Li = chol_inv(L)
         Lt_W = np.dot(L.T, self.W)
-        if np.abs(det(Lt_W)) < epsilon:
-            print "WARNING: Transformed covariance matrix is singular!"
+
+        ##Check it isn't singular!
+        if cond(Lt_W) > 1e14:
+            print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
+
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
         Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
-        import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT

-        #if np.abs(det(KW)) < epsilon:
-            #print "WARNING: Transformed covariance matrix is singular!"
-        #KW_i = inv(KW)
-        #Y_tilde = mdot(KW_i + np.eye(self.N), self.f_hat)
+        #f.T(Ki + W)f
+        f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
+                    + mdot(self.f_hat.T, self.W, self.f_hat)
+                    )

-        #Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
-        f_Ki_W_f = mdot(self.f_hat.T, (self.Ki + self.W), self.f_hat)
         y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
         y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
-        self.ln_W_det = det_ln_diag(self.W)
+        ln_W_det = det_ln_diag(self.W)
         Z_tilde = (self.NORMAL_CONST
                    - 0.5*self.ln_K_det
-                   - 0.5*self.ln_W_det
+                   - 0.5*ln_W_det
                    - 0.5*self.ln_Ki_W_i_det
                    - 0.5*f_Ki_W_f
                    - 0.5*y_W_y
                    + y_W_f
                    + self.ln_z_hat
                    )

         ##Check it isn't singular!
+        if cond(self.W) > 1e14:
+            print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
+
+        Sigma_tilde = inv(self.W)  # Damn

         #Convert to float as it's (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = Sigma_tilde
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
-        import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT

     def fit_full(self, K):
         """
         The laplace approximation algorithm
-        For nomenclature see Rasmussen & Williams 2006
+        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
         :K: Covariance matrix
         """
         self.K = K.copy()
-        self.Ki, _, _, self.ln_K_det = pdinv(K)
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
@@ -182,10 +173,10 @@ class Laplace(likelihood):
         self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))

         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-            #If the likelihood is non-log-concave. We want to say that there is a negative variance
-            #To cause the posterior to become less certain than the prior and likelihood,
-            #This is a property only held by non-log-concave likelihoods
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            #If the likelihood is non-log-concave. We want to say that there is a negative variance
+            #To cause the posterior to become less certain than the prior and likelihood,
+            #This is a property only held by non-log-concave likelihoods

         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)

         Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
         self.ln_Ki_W_i_det = np.log(np.linalg.det(Ki_W_i))  # the log determinant, not the raw determinant

         b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
         a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)
+        self.ln_K_det = pddet(self.K)

-        self.ln_z_hat = ( self.NORMAL_CONST
+        self.ln_z_hat = (self.NORMAL_CONST
                         - 0.5*self.f_Ki_f
                         - 0.5*self.ln_K_det
                         + 0.5*self.ln_Ki_W_i_det
                         + self.likelihood_function.link_function(self.data, self.f_hat)
                         )

         return self._compute_GP_variables()
@@ -219,26 +211,29 @@ class Laplace(likelihood):
         #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
         #import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
-        B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
+        B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
         L = jitchol(B)
         return (B, L, W_12)

     def ncg_mode(self, K):
-        """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuitive)
+        """
+        Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuitive)
         :K: Covariance matrix
         :returns: f_mode
         """
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
+
         f = np.zeros((self.N, 1))

         #FIXME: Can we get rid of this horrible reshaping?
         #ONLY WORKS FOR 1D DATA
         def obj(f):
-            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
                         + self.NORMAL_CONST)
             return float(res)

         def obj_grad(f):
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - np.dot(self.Ki, f))
             return np.squeeze(res)

         def obj_hess(f):
@@ -254,6 +249,8 @@ class Laplace(likelihood):
         For nomenclature see Rasmussen & Williams 2006

         :K: Covariance matrix
+        :MAX_ITER: Maximum number of iterations of Newton-Raphson before forcing finish of optimisation
+        :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
         :returns: f_mode
         """
         f = np.zeros((self.N, 1))
@@ -269,39 +266,30 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
-        while difference > epsilon:  # and i < MAX_ITER and rs < MAX_RESTART:
+        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-                #If the likelihood is non-log-concave. We want to say that there is a negative variance
-                #To cause the posterior to become less certain than the prior and likelihood,
-                #This is a property only held by non-log-concave likelihoods
+                W[W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                # If the likelihood is non-log-concave. We want to say that there is a negative variance
+                # To cause the posterior to become less certain than the prior and likelihood,
+                # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)

-            W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal!
+            W_f = np.dot(W, f)
             grad = self.likelihood_function.link_grad(self.data, f)[:, None]
             #Find K_i_f
             b = W_f + grad
-            #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
-            #TODO: Check L is lower

             #a should be equal to Ki*f now so should be able to use it
-            c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-            solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal!
-            f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
+            c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
+            solve_L = cho_solve((L, True), np.dot(W_12, c))
+            f = c - np.dot(K, np.dot(W_12, solve_L))

-            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
-            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
+            solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b)))
+            a = b - np.dot(W_12, solve_L)
             #f = np.dot(K, a)

-            #K_w_f = mdot(K, (W, f))
-            #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
-            #d = f + K_w_f + c
-            #solve_L = cho_solve((L, True), mdot(W_12, d))
-            #f = c - mdot(K, (W_12, solve_L))
-            #a = mdot(self.Ki, f)
-
             tmp_old_obj = old_obj
             old_obj = new_obj
             new_obj = obj(a, f)

From 65481d7a73b8fe965a99b82126431ae2668958db Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Wed, 10 Apr 2013 13:43:13 +0100
Subject: [PATCH 21/22] Fixed the z scalings

---
 python/examples/laplace_approximations.py |  8 +++----
 python/likelihoods/Laplace.py             | 28 +++++++++++++++--------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 0500ba02..5b1331b6 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -12,7 +12,7 @@ def timing():
     deg_free = 10
     real_sd = np.sqrt(real_var)
     the_is = np.zeros(times)
-    X = np.linspace(0.0, 10.0, 500)[:, None]
+    X = np.linspace(0.0, 10.0, 300)[:, None]

     for a in xrange(times):
         Y = np.sin(X) + np.random.randn(*X.shape)*real_var
@@ -22,8 +22,8 @@ def timing():
         Yc[25] += 10
         Yc[23] += 10
         Yc[24] += 10
-        Yc[300] += 10
-        Yc[400] += 10000
+        Yc[250] += 10
+        #Yc[4] += 10000
         edited_real_sd = real_sd

         kernel1 = GPy.kern.rbf(X.shape[1])
@@ -36,7 +36,7 @@ def timing():
         m.optimize()
         the_is[a] = m.likelihood.i

-    import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
+    #import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
     print the_is
     print np.mean(the_is)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 27ab7613..8ef8fb62 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,7 +1,7 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, cho_solve
+from scipy.linalg import cholesky, eig, inv, cho_solve, det
 from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
@@ -134,15 +134,24 @@ class Laplace(likelihood):
         y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
         y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
         ln_W_det = det_ln_diag(self.W)
-        Z_tilde = (self.NORMAL_CONST
-                   - 0.5*self.ln_K_det
-                   - 0.5*ln_W_det
-                   - 0.5*self.ln_Ki_W_i_det
-                   - 0.5*f_Ki_W_f
-                   - 0.5*y_W_y
-                   + y_W_f
+        Z_tilde = (- self.NORMAL_CONST
+                   + 0.5*self.ln_K_det
+                   + 0.5*ln_W_det
+                   + 0.5*self.ln_Ki_W_i_det
+                   + 0.5*f_Ki_W_f
+                   + 0.5*y_W_y
+                   - y_W_f
                    + self.ln_z_hat
                    )
+        #Z_tilde = (self.NORMAL_CONST
                    #- 0.5*self.ln_K_det
                    #- 0.5*ln_W_det
                    #- 0.5*self.ln_Ki_W_i_det
                    #- 0.5*f_Ki_W_f
                    #- 0.5*y_W_y
                    #+ y_W_f
                    #+ self.ln_z_hat
                    #)
@@ -191,8 +200,7 @@ class Laplace(likelihood):
         self.f_Ki_f = np.dot(self.f_hat.T, a)
         self.ln_K_det = pddet(self.K)

-        self.ln_z_hat = (self.NORMAL_CONST
-                        - 0.5*self.f_Ki_f
+        self.ln_z_hat = (- 0.5*self.f_Ki_f
                         - 0.5*self.ln_K_det
                         + 0.5*self.ln_Ki_W_i_det
                         + self.likelihood_function.link_function(self.data, self.f_hat)
                         )
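The scaling this patch settles on can be written in one place. The following is a transcription of the code above rather than an independent derivation, with $C$ standing for the class's NORMAL_CONST (which the earlier docstring suggests is $\frac{N}{2}\ln 2\pi$):

$$\ln \tilde{Z} = \ln \hat{z} - C + \tfrac{1}{2}\ln|K| + \tfrac{1}{2}\ln|W| + \tfrac{1}{2}\ln\left|(K^{-1} + W)^{-1}\right| + \tfrac{1}{2}\hat{f}^{\top}(K^{-1} + W)\hat{f} + \tfrac{1}{2}\tilde{Y}^{\top}W\tilde{Y} - \tilde{Y}^{\top}W\hat{f}$$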
From 9bbb11b825f7c395a040e2385d6a2c88aa1c143e Mon Sep 17 00:00:00 2001
From: Alan Saul
Date: Wed, 10 Apr 2013 15:43:31 +0100
Subject: [PATCH 22/22] Adding Weibull likelihood, requires 'extra_data' to be
 passed to the likelihood, i.e. the censoring information

---
 python/likelihoods/Laplace.py             | 24 +++---
 python/likelihoods/likelihood_function.py | 99 +++++++++++++++++++++--
 2 files changed, 104 insertions(+), 19 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 8ef8fb62..4d94ba0f 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -30,7 +30,7 @@ class Laplace(likelihood):
     """Laplace approximation to a posterior"""

-    def __init__(self, data, likelihood_function, rasm=True):
+    def __init__(self, data, likelihood_function, extra_data=None, rasm=True):
         """
         Laplace Approximation

@@ -44,13 +44,15 @@ class Laplace(likelihood):
         Arguments
         ---------

-        :data: @todo
+        :data: array of data the likelihood function is approximating
         :likelihood_function: likelihood function - subclass of likelihood_function
+        :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
         :rasm: Flag of whether to use Rasmussen's numerically stable mode finding or simple ncg optimisation

         """
         self.data = data
         self.likelihood_function = likelihood_function
+        self.extra_data = extra_data
         self.rasm = rasm

         #Initial values
@@ -179,7 +181,7 @@ class Laplace(likelihood):
             self.f_hat = self.ncg_mode(K)

         #At this point get the hessian matrix
-        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
+        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))

         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
@@ -194,7 +196,7 @@ class Laplace(likelihood):
         Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
         self.ln_Ki_W_i_det = np.log(np.linalg.det(Ki_W_i))  # the log determinant, not the raw determinant

-        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
+        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
         a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)
@@ -203,7 +205,7 @@ class Laplace(likelihood):
         self.ln_z_hat = (- 0.5*self.f_Ki_f
                         - 0.5*self.ln_K_det
                         + 0.5*self.ln_Ki_W_i_det
-                        + self.likelihood_function.link_function(self.data, self.f_hat)
+                        + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
                         )

         return self._compute_GP_variables()
@@ -236,16 +238,16 @@ class Laplace(likelihood):
         #FIXME: Can we get rid of this horrible reshaping?
         #ONLY WORKS FOR 1D DATA
         def obj(f):
-            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
                         + self.NORMAL_CONST)
             return float(res)

         def obj_grad(f):
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - np.dot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
             return np.squeeze(res)

         def obj_hess(f):
-            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
+            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
             return np.squeeze(res)

         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
@@ -267,7 +269,7 @@ class Laplace(likelihood):

         def obj(a, f):
             #Careful of shape of data!
-            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)
+            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)

         difference = np.inf
         epsilon = 1e-6
@@ -276,7 +278,7 @@ class Laplace(likelihood):
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             f_old = f.copy()
-            W = -np.diag(self.likelihood_function.link_hess(self.data, f))
+            W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
             if not self.likelihood_function.log_concave:
                 W[W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                 # If the likelihood is non-log-concave. We want to say that there is a negative variance
                 # To cause the posterior to become less certain than the prior and likelihood,
                 # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)

             W_f = np.dot(W, f)
-            grad = self.likelihood_function.link_grad(self.data, f)[:, None]
+            grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None]
             #Find K_i_f
             b = W_f + grad
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 49174ce7..0d421882 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -4,6 +4,7 @@ import numpy as np
 from GPy.likelihoods.likelihood_functions import likelihood_function
 from scipy import stats

+
 class student_t(likelihood_function):
     """Student t likelihood distribution
     For nomenclature see Bayesian Data Analysis 2003 p576

     Laplace:
     Needs functions to calculate
     ln p(yi|fi)
     dln p(yi|fi)_dfi
     d2ln p(yi|fi)_d2fi
@@ -24,15 +25,16 @@ class student_t(likelihood_function):
         self.log_concave = False

     @property
-    def variance(self):
+    def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * (self.sigma**2)

-    def link_function(self, y, f):
+    def link_function(self, y, f, extra_data=None):
         """link_function $\ln p(y|f)$

         $$\ln p(y_{i}|f_{i}) = \ln \Gamma\left(\frac{v+1}{2}\right) - \ln\left(\Gamma\left(\frac{v}{2}\right)\sqrt{v \pi}\sigma\right) - \frac{v+1}{2}\ln\left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^{2}\right)$$

         :y: data
         :f: latent variables f
+        :extra_data: extra_data which is not used in the student t distribution
         :returns: float(likelihood evaluated for this point)

         """
@@ -49,7 +51,7 @@ class student_t(likelihood_function):
                     )
         return np.sum(objective)

-    def link_grad(self, y, f):
+    def link_grad(self, y, f, extra_data=None):
         """
         Gradient of the link function at y, given f w.r.t f

         $$\frac{d}{df_{i}} \ln p(y_{i}|f_{i}) = \frac{(v + 1)(y_{i} - f_{i})}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$

         :y: data
         :f: latent variables f
+        :extra_data: extra_data which is not used in the student t distribution
         :returns: gradient of likelihood evaluated at points

         """
@@ -67,17 +70,18 @@ class student_t(likelihood_function):
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
         return np.squeeze(grad)

-    def link_hess(self, y, f):
+    def link_hess(self, y, f, extra_data=None):
         """
         Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
         i.e. second derivative link_function at y given f f_j w.r.t f and f_j

-        Will return diaganol of hessian, since every where else it is 0
+        Will return diagonal of hessian, since everywhere else it is 0

         $$\frac{d^{2}\ln p(y_{i}|f_{i})}{df_{i}^{2}} = \frac{(v + 1)\left((y_{i} - f_{i})^{2} - v\sigma^{2}\right)}{\left(v \sigma^{2} + (y_{i} - f_{i})^{2}\right)^{2}}$$

         :y: data
         :f: latent variables f
+        :extra_data: extra_data which is not used in the student t distribution
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
         y = np.squeeze(y)
@@ -139,7 +143,7 @@ class student_t(likelihood_function):
                                           #size=(num_f_samples, num_y_samples))
         #print student_t_samples.shape

-        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:,None],
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
                                         scale=self.sigma,
                                         size=(num_test_points, num_y_samples, num_f_samples))
         student_t_samples = np.reshape(student_t_samples,
@@ -152,7 +156,7 @@ class student_t(likelihood_function):
         ##Alternatively we could sample from int p(y|f*)p(f*|x*) df*
         def t_gaussian(f, mu, var):
             return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
-                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
+                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) * ((mu-f)**2)))
                     )

         def t_gauss_int(mu, var):
@@ -167,4 +171,83 @@ class student_t(likelihood_function):
         p = vec_t_gauss_int(mu, var)
         p_025 = mu - p
         p_975 = mu + p
-        import ipdb; ipdb.set_trace()  ### XXX BREAKPOINT
+        return mu, np.nan*mu, p_025, p_975
+
+
+class weibull_survival(likelihood_function):
+    """Weibull likelihood distribution for survival analysis with censoring
+    For nomenclature see Bayesian Survival Analysis
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, shape, scale):
+        self.shape = shape
+        self.scale = scale
+
+        #FIXME: This should be in the superclass
+        self.log_concave = True
+
+    def link_function(self, y, f, extra_data=None):
+        """
+        link_function $\ln p(y|f)$, i.e. log likelihood
+
+        $$\ln p(y_{i}|f_{i}) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: time of event data
+        :f: latent variables f
+        :extra_data: the event indicator $v_{i}$, 1 if the event was observed, 0 if censored
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f)  # FIXME: CHECK THIS WITH BOOK, where's scale?
+        return np.sum(objective)
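Since the FIXME above questions the formula, a quick finite-difference check of this log likelihood against the gradient implemented just below is cheap to run standalone (pure numpy; the shape parameter and data are made up for the check):

import numpy as np

alpha = 1.5
y = np.array([0.7, 2.1, 3.3])
v = np.array([1.0, 0.0, 1.0])        # event indicator, middle point censored
f = np.array([0.2, -0.4, 0.1])

def loglik(f):
    return np.sum(v*(np.log(alpha) + (alpha - 1)*np.log(y) + f) - (y**alpha)*np.exp(f))

grad = v - (y**alpha)*np.exp(f)      # analytic gradient, as in link_grad below
eps = 1e-6
for i in range(len(f)):
    df = np.zeros_like(f); df[i] = eps
    num = (loglik(f + df) - loglik(f - df)) / (2*eps)
    assert abs(num - grad[i]) < 1e-5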
+
+    def link_grad(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{d}{df_{i}} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: the event indicator, 1 if the event was observed, 0 if censored
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        grad = v - (y**self.shape)*np.exp(f)
+        return np.squeeze(grad)
+
+    def link_hess(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j w.r.t f and f_j
+
+        Will return diagonal of hessian, since everywhere else it is 0
+
+        $$\frac{d^{2}\ln p(y_{i}|f_{i})}{df_{i}^{2}} = -y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: the event indicator, which is not used in the hessian
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        hess = -(y**self.shape)*np.exp(f)  # always negative, so this likelihood is log-concave in f
+        return np.squeeze(hess)
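To close the series, this is the shape of data the new likelihood consumes: event times y paired with an event indicator v passed through extra_data. A self-contained simulation sketch (pure numpy; parameter values are illustrative only, and nothing here touches the repository's classes):

import numpy as np

np.random.seed(2)
alpha, n = 1.5, 200
t = np.random.weibull(alpha, size=n)          # true event times
c = np.random.uniform(0.5, 2.0, size=n)       # independent censoring times
y = np.minimum(t, c)                          # observed time is the earlier of the two
v = (t <= c).astype(float)                    # 1 if the event was observed before censoring

f = np.zeros(n)                               # latent log hazard scale, here just zeros
loglik = np.sum(v*(np.log(alpha) + (alpha - 1)*np.log(y) + f)
                - (y**alpha)*np.exp(f))
print("censoring rate: %.2f, loglik: %.1f" % (1 - v.mean(), loglik))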