From 67248ab7c2b0becf471fe08638d35cf0786ee1a2 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 12 Mar 2013 03:16:33 -0700
Subject: [PATCH 01/71] Initial commit

---
 .gitignore | 35 +++++++++++++++++++++++++++++++++++
 README.md  |  4 ++++
 2 files changed, 39 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..d2d6f360
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,35 @@
+*.py[cod]
+
+# C extensions
+*.so
+
+# Packages
+*.egg
+*.egg-info
+dist
+build
+eggs
+parts
+bin
+var
+sdist
+develop-eggs
+.installed.cfg
+lib
+lib64
+
+# Installer logs
+pip-log.txt
+
+# Unit test / coverage reports
+.coverage
+.tox
+nosetests.xml
+
+# Translations
+*.mo
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..317fa353
--- /dev/null
+++ b/README.md
@@ -0,0 +1,4 @@
+coxGP
+=====
+
+Gaussian Process models of Cox proportional hazard models
\ No newline at end of file

From 68eb83955c585b08cf93cbd659f749cff5b62bb3 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 12 Mar 2013 17:42:00 +0000
Subject: [PATCH 02/71] Initial commit, setting up the Laplace approximation
 for a Student-t likelihood

---
 python/examples/laplace_approximations.py | 37 ++++++++++++++++
 python/likelihoods/Laplace.py             | 54 +++++++++++++++++++++++
 python/likelihoods/likelihood_function.py | 51 +++++++++++++++++++++
 python/models/coxGP.py                    | 19 ++++++++
 python/testing/cox_tests.py               | 14 ++++++
 5 files changed, 175 insertions(+)
 create mode 100644 python/examples/laplace_approximations.py
 create mode 100644 python/likelihoods/Laplace.py
 create mode 100644 python/likelihoods/likelihood_function.py
 create mode 100644 python/models/coxGP.py
 create mode 100644 python/testing/cox_tests.py

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
new file mode 100644
index 00000000..2f059831
--- /dev/null
+++ b/python/examples/laplace_approximations.py
@@ -0,0 +1,37 @@
+import GPy
+import numpy as np
+import scipy as sp
+import scipy.stats
+import matplotlib.pyplot as plt
+
+
+def student_t_approx():
+    """
+    Example of regressing with a student t likelihood
+    """
+    #Start a function, any function
+    X = np.sort(np.random.uniform(0, 15, 70))[:, None]
+    Y = np.sin(X)
+
+    #Add some extreme value noise to some of the datapoints
+    percent_corrupted = 0.05
+    corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
+    indices = np.arange(Y.shape[0])
+    np.random.shuffle(indices)
+    corrupted_indices = indices[:corrupted_datums]
+    print corrupted_indices
+    noise = np.random.uniform(-10,10,(len(corrupted_indices), 1))
+    Y[corrupted_indices] += noise
+
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X,Y)
+
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+    # plot
+    m.plot()
+    print m
+
+    #with a student t distribution, since it has heavy tails it should work well
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
new file mode 100644
index 00000000..a0dbc65c
--- /dev/null
+++ b/python/likelihoods/Laplace.py
@@ -0,0 +1,54 @@
+import nump as np
+import GPy
+from GPy.util.linalg import jitchol
+
+class Laplace(GPy.likelihoods.likelihood):
+    """Laplace approximation to a posterior"""
+
+    def __init__(self,data,likelihood_function):
+        """
+        Laplace Approximation
+
+        First find the moments \hat{f} and the hessian at this point (using Newton-Raphson)
+        then find the z^{prime} which allows this to be a normalised gaussian instead of a
+        non-normalized gaussian
+
+        Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle}
+        which makes a gaussian the same as the laplace approximation
+
+        Arguments
+        ---------
+
+        :data: @todo
+        :likelihood_function: @todo
+
+        """
+        GPy.likelihoods.likelihood.__init__(self)
+
+        self.data = data
+        self.likelihood_function = likelihood_function
+
+        #Inital values
+        self.N, self.D = self.data.shape
+
+    def _compute_GP_variables(self):
+        """
+        Generates data Y which would give the normal distribution identical to the laplace approximation
+
+        GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
+        that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
+        """
+        raise NotImplementedError
+
+    def fit_full(self, K):
+        """
+        The laplace approximation algorithm
+        For nomenclature see Rasmussen & Williams 2006
+        :K: Covariance matrix
+        """
+        self.f = np.zeros(self.N)
+
+        #Find \hat(f) using a newton raphson optimizer for example
+
+        #At this point get the hessian matrix
+
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
new file mode 100644
index 00000000..fd19675b
--- /dev/null
+++ b/python/likelihoods/likelihood_function.py
@@ -0,0 +1,51 @@
+import GPy
+from scipy.special import gamma, gammaln
+
+class student_t(GPy.likelihoods.likelihood_function):
+    """Student t likelihood distribution
+    For nomanclature see Bayesian Data Analysis 2003 p576
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fi
+    """
+    def __init__(self, deg_free, sigma=1):
+        self.v = deg_free
+        self.sigma = 1
+
+    def link_function(self, y_i, f_i):
+        """link_function $\ln p(y_i|f_i)$
+
+        :y_i: datum number i
+        :f_i: latent variable f_i
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        e = y_i - f_i
+        return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v)
+
+    def link_grad(self, y_i, f_i):
+        """gradient of the link function at y_i, given f_i w.r.t f_i
+
+        :y_i: datum number i
+        :f_i: latent variable f_i
+        :returns: float(gradient of likelihood evaluated at this point)
+
+        """
+        pass
+
+    def link_hess(self, y_i, f_i, f_j):
+        """hessian at this point (the hessian will be 0 unless i == j)
+        i.e. second derivative w.r.t f_i and f_j
+
+        :y_i: @todo
+        :f_i: @todo
+        :f_j: @todo
+        :returns: @todo
+
+        """
+        if f_i =
+        pass
+
diff --git a/python/models/coxGP.py b/python/models/coxGP.py
new file mode 100644
index 00000000..f61a8f46
--- /dev/null
+++ b/python/models/coxGP.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2013, Alan Saul
+
+from GPy.models import GP
+from .. import likelihoods
+from GPy import kern
+
+
+class cox_GP_regression(GP):
+    """
+    Cox Gaussian Process model for regression
+    """
+
+    def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None):
+        if kernel is None:
+            kernel = kern.rbf(X.shape[1])
+
+        likelihood = likelihoods.cox_piecewise(Y, normalize=normalize_Y)
+
+        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices)
diff --git a/python/testing/cox_tests.py b/python/testing/cox_tests.py
new file mode 100644
index 00000000..526f5c92
--- /dev/null
+++ b/python/testing/cox_tests.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2013, Alan Saul
+
+import unittest
+import numpy as np
+import GPy
+
+class coxGPTests(unittest.TestCase):
+    def test_laplace_approx(self):
+        pass
+
+if __name__ == "__main__":
+    print "Running unit tests, please be (very) patient..."
+    unittest.main()
+

From ad2c266c65120e1fabf0cf1825fc0c661084611b Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 13 Mar 2013 11:54:33 +0000
Subject: [PATCH 03/71] Added some comments

---
 python/likelihoods/likelihood_function.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index fd19675b..5d4e51ce 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -5,6 +5,9 @@ class student_t(GPy.likelihoods.likelihood_function):
     """Student t likelihood distribution
     For nomanclature see Bayesian Data Analysis 2003 p576
 
+    $$\ln(\frac{\Gamma(\frac{(v+1)}{2})}{\Gamma(\sqrt(v \pi \Gamma(\frac{v}{2}))})+ \ln(1+\frac{(y_i-f_i)^2}{\sigma v})^{-\frac{(v+1)}{2}}$$
+    TODO:Double check this
+
     Laplace:
     Needs functions to calculate
     ln p(yi|fi)
@@ -17,6 +20,8 @@ class student_t(GPy.likelihoods.likelihood_function):
 
     def link_function(self, y_i, f_i):
         """link_function $\ln p(y_i|f_i)$
+        $$\ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2}) - \ln \frac{v \pi \sigma}{2} - \frac{v+1}{2}\ln (1 + \frac{(y_{i} - f_{i})^{2}}{v\sigma})$$
+        TODO: Double check this
 
         :y_i: datum number i
         :f_i: latent variable f_i
@@ -24,11 +29,15 @@ class student_t(GPy.likelihoods.likelihood_function):
 
         """
         e = y_i - f_i
-        return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v)
+        return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) #Check the /v!
 
     def link_grad(self, y_i, f_i):
         """gradient of the link function at y_i, given f_i w.r.t f_i
 
+        derivative of log((gamma((v+1)/2)/gamma(sqrt(v*pi*gamma(v/2))))*(1+(t^2)/(a*v))^((-(v+1))/2)) with respect to t
+        $$\frac{(y_i - f_i)(v + 1)}{\sigma v (y_{i} - f_{i})^{2}}$$
+        TODO: Double check this
+
         :y_i: datum number i
         :f_i: latent variable f_i
         :returns: float(gradient of likelihood evaluated at this point)
@@ -40,6 +49,8 @@ class student_t(GPy.likelihoods.likelihood_function):
         """hessian at this point (the hessian will be 0 unless i == j)
         i.e. second derivative w.r.t f_i and f_j
 
+        second derivative of
+
         :y_i: @todo
         :f_i: @todo
         :f_j: @todo

From 3f114aa020fb678b1c52eb441bb079d9a0b8cd00 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 13 Mar 2013 17:55:41 +0000
Subject: [PATCH 04/71] Got most of the Laplace approximation working

---
 __init__.py                               |  0
 python/__init__.py                        |  0
 python/examples/__init__.py               |  0
 python/examples/laplace_approximations.py | 44 +++++++++++--
 python/likelihoods/Laplace.py             | 45 +++++++++++--
 python/likelihoods/__init__.py            |  0
 python/likelihoods/likelihood_function.py | 80 +++++++++++++----------
 python/models/__init__.py                 |  0
 python/testing/__init__.py                |  0
 9 files changed, 124 insertions(+), 45 deletions(-)
 create mode 100644 __init__.py
 create mode 100644 python/__init__.py
 create mode 100644 python/examples/__init__.py
 create mode 100644 python/likelihoods/__init__.py
 create mode 100644 python/models/__init__.py
 create mode 100644 python/testing/__init__.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/__init__.py b/python/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/examples/__init__.py b/python/examples/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 2f059831..0e1d3305 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -1,8 +1,9 @@
 import GPy
 import numpy as np
-import scipy as sp
-import scipy.stats
 import matplotlib.pyplot as plt
+from scipy.stats import t
+from coxGP.python.likelihoods.Laplace import Laplace
+from coxGP.python.likelihoods.likelihood_function import student_t
 
 
 def student_t_approx():
@@ -13,6 +14,41 @@ def student_t_approx():
     X = np.sort(np.random.uniform(0, 15, 70))[:, None]
     Y = np.sin(X)
 
+    #Add student t random noise to datapoints
+    deg_free = 1
+    noise = t.rvs(deg_free, loc=1.8, scale=1, size=Y.shape)
+    Y += noise
+
+    # Kernel object
+    print X.shape
+    kernel = GPy.kern.rbf(X.shape[1])
+
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y, kernel=kernel)
+
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+    # plot
+    #m.plot()
+    print m
+
+    #with a student t distribution, since it has heavy tails it should work well
+    likelihood_function = student_t(deg_free, sigma=1)
+    lap = Laplace(Y, likelihood_function)
+    cov = kernel.K(X)
+    lap.fit_full(cov)
+
+
+def noisy_laplace_approx():
+    """
+    Example of regressing with a student t likelihood
+    """
+    #Start a function, any function
+    X = np.sort(np.random.uniform(0, 15, 70))[:, None]
+    Y = np.sin(X)
+
     #Add some extreme value noise to some of the datapoints
     percent_corrupted = 0.05
     corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
@@ -20,12 +56,12 @@ def student_t_approx():
     np.random.shuffle(indices)
     corrupted_indices = indices[:corrupted_datums]
     print corrupted_indices
-    noise = np.random.uniform(-10,10,(len(corrupted_indices), 1))
+    noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1))
     Y[corrupted_indices] += noise
 
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
-    m = GPy.models.GP_regression(X,Y)
+    m = GPy.models.GP_regression(X, Y)
 
     # optimize
     m.ensure_default_constraints()
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index a0dbc65c..6efbfa30 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,8 +1,14 @@
-import nump as np
+import numpy as np
+import scipy as sp
 import GPy
 from GPy.util.linalg import jitchol
+from functools import partial
+from GPy.likelihoods.likelihood import likelihood
+from GPy.util.linalg import pdinv,mdot
 
-class Laplace(GPy.likelihoods.likelihood):
+
+
+class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
     def __init__(self,data,likelihood_function):
@@ -23,8 +29,6 @@ class Laplace(GPy.likelihoods.likelihood):
         :likelihood_function: @todo
 
         """
-        GPy.likelihoods.likelihood.__init__(self)
-
         self.data = data
         self.likelihood_function = likelihood_function
 
@@ -38,7 +42,7 @@ class Laplace(GPy.likelihoods.likelihood):
         GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
         that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
         """
-        raise NotImplementedError
+        z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised
 
     def fit_full(self, K):
         """
@@ -46,9 +50,38 @@ class Laplace(GPy.likelihoods.likelihood):
         For nomenclature see Rasmussen & Williams 2006
         :K: Covariance matrix
         """
-        self.f = np.zeros(self.N)
+        f = np.zeros((self.N, 1))
+        print K.shape
+        print f.shape
+        print self.data.shape
+        (Ki, _, _, log_Kdet) = pdinv(K)
+        obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2*np.pi))
 
         #Find \hat(f) using a newton raphson optimizer for example
+        #TODO: Add newton-raphson as subclass of optimizer class
+
+        #FIXME: Can we get rid of this horrible reshaping?
+        def obj(f):
+            f = f[:, None]
+            res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (Ki, f)) + obj_constant)
+            return float(res)
+
+        def obj_grad(f):
+            f = f[:, None]
+            res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(Ki, f))
+            return np.squeeze(res)
+
+        def obj_hess(f):
+            f = f[:, None]
+            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - Ki)
+            return np.squeeze(res)
+
+        self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
 
         #At this point get the hessian matrix
+        self.hess_hat = obj_hess(f_hat)
 
+        #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...)
+        self.height_unnormalised = obj(f_hat) #FIXME: Is it -1?
+
+        return _compute_GP_variables()
diff --git a/python/likelihoods/__init__.py b/python/likelihoods/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 5d4e51ce..78731199 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -1,62 +1,72 @@
-import GPy
-from scipy.special import gamma, gammaln
+from scipy.special import gammaln
+import numpy as np
+from GPy.likelihoods.likelihood_functions import likelihood_function
 
-class student_t(GPy.likelihoods.likelihood_function):
+
+class student_t(likelihood_function):
     """Student t likelihood distribution
     For nomanclature see Bayesian Data Analysis 2003 p576
 
-    $$\ln(\frac{\Gamma(\frac{(v+1)}{2})}{\Gamma(\sqrt(v \pi \Gamma(\frac{v}{2}))})+ \ln(1+\frac{(y_i-f_i)^2}{\sigma v})^{-\frac{(v+1)}{2}}$$
-    TODO:Double check this
+    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
 
     Laplace:
     Needs functions to calculate
     ln p(yi|fi)
     dln p(yi|fi)_dfi
-    d2ln p(yi|fi)_d2fi
+    d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free, sigma=1):
         self.v = deg_free
         self.sigma = 1
 
-    def link_function(self, y_i, f_i):
-        """link_function $\ln p(y_i|f_i)$
-        $$\ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2}) - \ln \frac{v \pi \sigma}{2} - \frac{v+1}{2}\ln (1 + \frac{(y_{i} - f_{i})^{2}}{v\sigma})$$
-        TODO: Double check this
+    def link_function(self, y, f):
+        """link_function $\ln p(y|f)$
+        $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
 
-        :y_i: datum number i
-        :f_i: latent variable f_i
+        :y: datum number i
+        :f: latent variable f
         :returns: float(likelihood evaluated for this point)
 
         """
-        e = y_i - f_i
-        return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) #Check the /v!
+        e = y - f
+        #print "Link ", y.shape, f.shape, e.shape
+        objective = (gammaln((self.v + 1) * 0.5)
+                - gammaln(self.v * 0.5)
+                + np.log(self.sigma * np.sqrt(self.v * np.pi))
+                - (self.v + 1) * 0.5
+                * np.log(1 + ((e**2 / self.sigma**2) / self.v))
+                )
+        return np.sum(objective)
 
-    def link_grad(self, y_i, f_i):
-        """gradient of the link function at y_i, given f_i w.r.t f_i
+    def link_grad(self, y, f):
+        """
+        Gradient of the link function at y, given f w.r.t f
 
-        derivative of log((gamma((v+1)/2)/gamma(sqrt(v*pi*gamma(v/2))))*(1+(t^2)/(a*v))^((-(v+1))/2)) with respect to t
-        $$\frac{(y_i - f_i)(v + 1)}{\sigma v (y_{i} - f_{i})^{2}}$$
-        TODO: Double check this
+        $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
 
-        :y_i: datum number i
-        :f_i: latent variable f_i
+        :y: datum number i
+        :f: latent variable f
         :returns: float(gradient of likelihood evaluated at this point)
 
         """
-        pass
-
-    def link_hess(self, y_i, f_i, f_j):
-        """hessian at this point (the hessian will be 0 unless i == j)
-        i.e. second derivative w.r.t f_i and f_j
-
-        second derivative of
-
-        :y_i: @todo
-        :f_i: @todo
-        :f_j: @todo
-        :returns: @todo
+        e = y - f
+        #print "Grad ", y.shape, f.shape, e.shape
+        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        return grad
 
+    def link_hess(self, y, f):
         """
-        if f_i =
-        pass
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
 
+        Will return diaganol of hessian, since every where else it is 0
+
+        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+
+        :y: datum number i
+        :f: latent variable f
+        :returns: float(second derivative of likelihood evaluated at this point)
+        """
+        e = y - f
+        hess = ((self.v + 1) * e) / ((((self.sigma**2)*self.v) + e**2)**2)
+        return hess
diff --git a/python/models/__init__.py b/python/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/testing/__init__.py b/python/testing/__init__.py
new file mode 100644
index 00000000..e69de29b

From f9535c858a653e08a32a8633fe37577c87812820 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 14 Mar 2013 15:30:22 +0000
Subject: [PATCH 05/71] Trying to 'debug'

---
 python/examples/laplace_approximations.py | 22 +++++++++++---
 python/likelihoods/Laplace.py             | 25 +++++++++------
 python/likelihoods/likelihood_function.py | 37 ++++++++++++-----------
 3 files changed, 52 insertions(+), 32 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 0e1d3305..5642d8a4 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -1,7 +1,7 @@
 import GPy
 import numpy as np
 import matplotlib.pyplot as plt
-from scipy.stats import t
+from scipy.stats import t, norm
 from coxGP.python.likelihoods.Laplace import Laplace
 from coxGP.python.likelihoods.likelihood_function import student_t
 
@@ -11,12 +11,13 @@ def student_t_approx():
     Example of regressing with a student t likelihood
     """
     #Start a function, any function
-    X = np.sort(np.random.uniform(0, 15, 70))[:, None]
+    X = np.sort(np.random.uniform(0, 15, 100))[:, None]
     Y = np.sin(X)
 
     #Add student t random noise to datapoints
-    deg_free = 1
-    noise = t.rvs(deg_free, loc=1.8, scale=1, size=Y.shape)
+    deg_free = 2.5
+    t_rv = t(deg_free, loc=5, scale=1)
+    noise = t_rv.rvs(size=Y.shape)
     Y += noise
 
     # Kernel object
@@ -39,6 +40,19 @@ def student_t_approx():
     lap = Laplace(Y, likelihood_function)
     cov = kernel.K(X)
     lap.fit_full(cov)
+    #Get one sample (just look at a single Y
+    mode = float(lap.f_hat[0])
+    variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables
+    #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables
+    normalised_approx = norm(loc=mode, scale=variance)
+    print "Normal with mode %f, and variance %f" % (mode, variance)
+    print lap.height_unnormalised
+
+    test_range = np.arange(0, 10, 0.1)
+    print np.diagonal(lap.hess_hat)
+    plt.plot(test_range, t_rv.pdf(test_range))
+    plt.plot(test_range, normalised_approx.pdf(test_range))
+    plt.show()
 
 
 def noisy_laplace_approx():
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 6efbfa30..08ae0e6f 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -5,13 +5,13 @@ from GPy.util.linalg import jitchol
 from functools import partial
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv,mdot
-
+from scipy.stats import norm
 
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
-    def __init__(self,data,likelihood_function):
+    def __init__(self, data, likelihood_function):
         """
         Laplace Approximation
 
@@ -42,7 +42,13 @@ class Laplace(likelihood):
         GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
         that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
         """
-        z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised
+        #z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised
+        normalised_approx = norm(loc=self.f_hat, scale=self.hess_hat)
+        self.Z = normalised_approx.pdf(self.f_hat)/self.height_unnormalised
+        #self.Y =
+        #self.YYT =
+        #self.covariance_matrix =
+        #self.precision =
 
     def fit_full(self, K):
         """
@@ -51,11 +57,9 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         f = np.zeros((self.N, 1))
-        print K.shape
-        print f.shape
-        print self.data.shape
+        #K = np.diag(np.ones(self.N))
         (Ki, _, _, log_Kdet) = pdinv(K)
-        obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2*np.pi))
+        obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi))
 
         #Find \hat(f) using a newton raphson optimizer for example
         #TODO: Add newton-raphson as subclass of optimizer class
@@ -77,11 +81,12 @@ class Laplace(likelihood):
             return np.squeeze(res)
 
         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        print self.f_hat
 
         #At this point get the hessian matrix
-        self.hess_hat = obj_hess(f_hat)
+        self.hess_hat = obj_hess(self.f_hat)
 
         #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...)
-        self.height_unnormalised = obj(f_hat) #FIXME: Is it -1?
+        self.height_unnormalised = obj(self.f_hat) #FIXME: Is it -1?
 
-        return _compute_GP_variables()
+        return self._compute_GP_variables()
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 78731199..46128de7 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -15,27 +15,27 @@ class student_t(likelihood_function):
     dln p(yi|fi)_dfi
     d2ln p(yi|fi)_d2fifj
     """
-    def __init__(self, deg_free, sigma=1):
+    def __init__(self, deg_free, sigma=2):
         self.v = deg_free
-        self.sigma = 1
+        self.sigma = sigma
 
     def link_function(self, y, f):
         """link_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
 
-        :y: datum number i
-        :f: latent variable f
+        :y: data
+        :f: latent variables f
         :returns: float(likelihood evaluated for this point)
 
         """
+        assert y.shape[0] == f.shape[0]
         e = y - f
-        #print "Link ", y.shape, f.shape, e.shape
         objective = (gammaln((self.v + 1) * 0.5)
-                - gammaln(self.v * 0.5)
-                + np.log(self.sigma * np.sqrt(self.v * np.pi))
-                - (self.v + 1) * 0.5
-                * np.log(1 + ((e**2 / self.sigma**2) / self.v))
-                )
+                     - gammaln(self.v * 0.5)
+                     + np.log(self.sigma * np.sqrt(self.v * np.pi))
+                     - (self.v + 1) * 0.5
+                     * np.log(1 + ((e**2 / self.sigma**2) / self.v))
+                     )
         return np.sum(objective)
 
     def link_grad(self, y, f):
@@ -44,13 +44,13 @@ class student_t(likelihood_function):
 
         $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
 
-        :y: datum number i
-        :f: latent variable f
-        :returns: float(gradient of likelihood evaluated at this point)
+        :y: data
+        :f: latent variables f
+        :returns: gradient of likelihood evaluated at points
 
         """
+        assert y.shape[0] == f.shape[0]
         e = y - f
-        #print "Grad ", y.shape, f.shape, e.shape
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
         return grad
 
@@ -63,10 +63,11 @@ class student_t(likelihood_function):
 
         $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
 
-        :y: datum number i
-        :f: latent variable f
-        :returns: float(second derivative of likelihood evaluated at this point)
+        :y: data
+        :f: latent variables f
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
+        assert y.shape[0] == f.shape[0]
         e = y - f
-        hess = ((self.v + 1) * e) / ((((self.sigma**2)*self.v) + e**2)**2)
+        hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
         return hess

From 34ae852eea8d5f6cdc48028d4f21457c7f0b5259 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 15 Mar 2013 17:38:13 +0000
Subject: [PATCH 06/71] Got an idea of how to implement it; written up in the docstring

---
 python/likelihoods/Laplace.py | 38 ++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 08ae0e6f..568fcef0 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -41,10 +41,26 @@ class Laplace(likelihood):
 
         GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
         that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
+
+        Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal)
+        then we have a rescaled normal distibution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f)
+        due to the z rescaling.
+
+        at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1)
+
+        This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1)
+        giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f)
+
+        $$\tilde{Y} = \tilde{\Sigma} Hf$$
+        where
+        $$\tilde{\Sigma}^{-1} = H - K^{-1}$$
+        i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$
+        since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$
+        and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
+
         """
-        #z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised
-        normalised_approx = norm(loc=self.f_hat, scale=self.hess_hat)
-        self.Z = normalised_approx.pdf(self.f_hat)/self.height_unnormalised
+        self.Sigma_tilde = self.hess_hat -
+        self.Z =
         #self.Y =
         #self.YYT =
         #self.covariance_matrix =
@@ -58,8 +74,8 @@ class Laplace(likelihood):
         """
         f = np.zeros((self.N, 1))
         #K = np.diag(np.ones(self.N))
-        (Ki, _, _, log_Kdet) = pdinv(K)
-        obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi))
+        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
+        obj_constant = (0.5 * self.log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi))
 
         #Find \hat(f) using a newton raphson optimizer for example
         #TODO: Add newton-raphson as subclass of optimizer class
@@ -67,17 +83,17 @@ class Laplace(likelihood):
         #FIXME: Can we get rid of this horrible reshaping?
         def obj(f):
             f = f[:, None]
-            res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (Ki, f)) + obj_constant)
+            res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (self.Ki, f)) + obj_constant)
             return float(res)
 
         def obj_grad(f):
             f = f[:, None]
-            res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(Ki, f))
+            res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(self.Ki, f))
             return np.squeeze(res)
 
         def obj_hess(f):
             f = f[:, None]
-            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - Ki)
+            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - self.Ki)
             return np.squeeze(res)
 
         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
@@ -87,6 +103,10 @@ class Laplace(likelihood):
         self.hess_hat = obj_hess(self.f_hat)
 
         #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...)
-        self.height_unnormalised = obj(self.f_hat) #FIXME: Is it -1?
+        self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1?
+        #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
+        #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
+        #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
+        self.z_hat = np.exp(-0.5*np.log(np.linalg.det(hess_hat)) + self.height_unnormalised)
 
         return self._compute_GP_variables()

From 2bf1cf0eb6596773c2f75a06f152b3a7cfd66081 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 18 Mar 2013 15:59:12 +0000
Subject: [PATCH 07/71] Follow naming convention better; lots of inverses,
 one or two of which should be removable; unsure if it works

---
 python/examples/laplace_approximations.py | 17 +++++----
 python/likelihoods/Laplace.py             | 43 +++++++++++++----------
 python/likelihoods/likelihood_function.py |  9 ++---
 3 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 5642d8a4..aa8cdcb4 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -41,18 +41,21 @@ def student_t_approx():
     cov = kernel.K(X)
     lap.fit_full(cov)
     #Get one sample (just look at a single Y
-    mode = float(lap.f_hat[0])
-    variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables
+    #mode = float(lap.f_hat[0])
+    #variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables
     #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables
-    normalised_approx = norm(loc=mode, scale=variance)
-    print "Normal with mode %f, and variance %f" % (mode, variance)
-    print lap.height_unnormalised
 
     test_range = np.arange(0, 10, 0.1)
-    print np.diagonal(lap.hess_hat)
     plt.plot(test_range, t_rv.pdf(test_range))
-    plt.plot(test_range, normalised_approx.pdf(test_range))
+    for i in xrange(X.shape[0]):
+        mode = lap.f_hat[i]
+        covariance = lap.hess_hat_i[i,i]
+        scaling = np.exp(lap.ln_z_hat)
+        normalised_approx = norm(loc=mode, scale=covariance)
+        print "Normal with mode %f, and variance %f" % (mode, covariance)
+        plt.plot(test_range, normalised_approx.pdf(test_range))
     plt.show()
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
 
 def noisy_laplace_approx():
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 568fcef0..9d622b0d 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,12 +1,10 @@
 import numpy as np
 import scipy as sp
 import GPy
-from GPy.util.linalg import jitchol
+#from GPy.util.linalg import jitchol
 from functools import partial
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv,mdot
-from scipy.stats import norm
-
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
@@ -35,6 +33,8 @@ class Laplace(likelihood):
         #Inital values
         self.N, self.D = self.data.shape
 
+        self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))
+
     def _compute_GP_variables(self):
         """
         Generates data Y which would give the normal distribution identical to the laplace approximation
@@ -59,12 +59,15 @@ class Laplace(likelihood):
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
 
         """
-        self.Sigma_tilde = self.hess_hat -
-        self.Z =
-        #self.Y =
-        #self.YYT =
-        #self.covariance_matrix =
-        #self.precision =
+        self.Sigma_tilde_i = self.hess_hat + self.Ki
+        #Do we really need to inverse Sigma_tilde_i? :(
+        (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i)
+        Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess?
+        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde, (self.Sigma_tilde_i, Y_tilde))))
+        self.Y = Y_tilde
+        self.covariance_matrix = self.Sigma_tilde
+        self.precision = np.diag(self.Sigma_tilde)[:, None]
+        self.YYT = np.dot(self.Y, self.Y)
 
     def fit_full(self, K):
         """
@@ -75,38 +78,40 @@ class Laplace(likelihood):
         f = np.zeros((self.N, 1))
         #K = np.diag(np.ones(self.N))
         (self.Ki, _, _, self.log_Kdet) = pdinv(K)
-        obj_constant = (0.5 * self.log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi))
-
+        LOG_K_CONST = -(0.5 * self.log_Kdet)
+        OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST
         #Find \hat(f) using a newton raphson optimizer for example
         #TODO: Add newton-raphson as subclass of optimizer class
 
         #FIXME: Can we get rid of this horrible reshaping?
         def obj(f):
-            f = f[:, None]
-            res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (self.Ki, f)) + obj_constant)
+            #f = f[:, None]
+            res = -1 * (self.likelihood_function.link_function(self.data[:,0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST)
             return float(res)
 
         def obj_grad(f):
-            f = f[:, None]
-            res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(self.Ki, f))
+            #f = f[:, None]
+            res = -1 * (self.likelihood_function.link_grad(self.data[:,0], f) - mdot(self.Ki, f))
             return np.squeeze(res)
 
         def obj_hess(f):
-            f = f[:, None]
-            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - self.Ki)
+            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki)
             return np.squeeze(res)
 
         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
         print self.f_hat
 
         #At this point get the hessian matrix
-        self.hess_hat = obj_hess(self.f_hat)
+        self.hess_hat = -1*np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) #-1*obj_hess(self.f_hat) + self.Ki
+        #self.hess_hat = -1*obj_hess(self.f_hat) + self.Ki
+        (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat + self.Ki)
 
         #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...)
         self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1?
         #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
         #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
         #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
-        self.z_hat = np.exp(-0.5*np.log(np.linalg.det(hess_hat)) + self.height_unnormalised)
+        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) + self.height_unnormalised - self.NORMAL_CONST #Unsure whether its log_hess or log_hess_i
+
 
         return self._compute_GP_variables()
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 46128de7..8adbf86c 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -28,7 +28,7 @@ class student_t(likelihood_function):
         :returns: float(likelihood evaluated for this point)
 
         """
-        assert y.shape[0] == f.shape[0]
+        assert y.shape == f.shape
         e = y - f
         objective = (gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
@@ -49,7 +49,7 @@ class student_t(likelihood_function):
         :returns: gradient of likelihood evaluated at points
 
         """
-        assert y.shape[0] == f.shape[0]
+        assert y.shape == f.shape
         e = y - f
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
         return grad
@@ -67,7 +67,8 @@ class student_t(likelihood_function):
         :f: latent variables f
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
-        assert y.shape[0] == f.shape[0]
+        assert y.shape == f.shape
         e = y - f
-        hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
+        #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
+        hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2)
         return hess

From 46d59c94b27cabe61056b71aa26d1293779c0697 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 19 Mar 2013 11:47:53 +0000
Subject: [PATCH 08/71] Just breaking some things...

---
 python/examples/laplace_approximations.py | 88 +++++++++++++++--------
 python/likelihoods/Laplace.py             | 52 ++++++++++----
 python/likelihoods/likelihood_function.py | 16 ++++-
 3 files changed, 113 insertions(+), 43 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index aa8cdcb4..73c8f67f 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -16,47 +16,75 @@ def student_t_approx():
 
     #Add student t random noise to datapoints
     deg_free = 2.5
-    t_rv = t(deg_free, loc=5, scale=1)
+    t_rv = t(deg_free, loc=0, scale=1)
     noise = t_rv.rvs(size=Y.shape)
     Y += noise
 
+    #Add some extreme value noise to some of the datapoints
+    #percent_corrupted = 0.05
+    #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
+    #indices = np.arange(Y.shape[0])
+    #np.random.shuffle(indices)
+    #corrupted_indices = indices[:corrupted_datums]
+    #print corrupted_indices
+    #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
+    #Y[corrupted_indices] += noise
+
     # Kernel object
-    print X.shape
-    kernel = GPy.kern.rbf(X.shape[1])
+    #print X.shape
+    #kernel = GPy.kern.rbf(X.shape[1])
 
-    #A GP should completely break down due to the points as they get a lot of weight
-    # create simple GP model
-    m = GPy.models.GP_regression(X, Y, kernel=kernel)
+    ##A GP should completely break down due to the points as they get a lot of weight
+    ## create simple GP model
+    #m = GPy.models.GP_regression(X, Y, kernel=kernel)
 
-    # optimize
-    m.ensure_default_constraints()
-    m.optimize()
-    # plot
-    #m.plot()
-    print m
+    ## optimize
+    #m.ensure_default_constraints()
+    #m.optimize()
+    ## plot
+    ##m.plot()
+    #print m
 
     #with a student t distribution, since it has heavy tails it should work well
-    likelihood_function = student_t(deg_free, sigma=1)
-    lap = Laplace(Y, likelihood_function)
-    cov = kernel.K(X)
-    lap.fit_full(cov)
-    #Get one sample (just look at a single Y
-    #mode = float(lap.f_hat[0])
-    #variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables
-    #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables
+    #likelihood_function = student_t(deg_free, sigma=1)
+    #lap = Laplace(Y, likelihood_function)
+    #cov = kernel.K(X)
+    #lap.fit_full(cov)
 
-    test_range = np.arange(0, 10, 0.1)
-    plt.plot(test_range, t_rv.pdf(test_range))
-    for i in xrange(X.shape[0]):
-        mode = lap.f_hat[i]
-        covariance = lap.hess_hat_i[i,i]
-        scaling = np.exp(lap.ln_z_hat)
-        normalised_approx = norm(loc=mode, scale=covariance)
-        print "Normal with mode %f, and variance %f" % (mode, covariance)
-        plt.plot(test_range, normalised_approx.pdf(test_range))
-    plt.show()
+    #test_range = np.arange(0, 10, 0.1)
+    #plt.plot(test_range, t_rv.pdf(test_range))
+    #for i in xrange(X.shape[0]):
+        #mode = lap.f_hat[i]
+        #covariance = lap.hess_hat_i[i,i]
+        #scaling = np.exp(lap.ln_z_hat)
+        #normalised_approx = norm(loc=mode, scale=covariance)
+        #print "Normal with mode %f, and variance %f" % (mode, covariance)
+        #plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    #plt.show()
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+    # Likelihood object
+    t_distribution = student_t(deg_free, sigma=1)
+    stu_t_likelihood = Laplace(Y, t_distribution)
+    kernel = GPy.kern.rbf(X.shape[1])
+
+    m = GPy.models.GP(X, stu_t_likelihood, kernel)
+    m.ensure_default_constraints()
+
+    m.update_likelihood_approximation()
+    print "NEW MODEL"
+    print(m)
+
+    # optimize
+    #m.optimize()
+    print(m)
+
+    # plot
+    m.plot()
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
+    return m
+
 
 def noisy_laplace_approx():
     """
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 9d622b0d..23db6abd 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -5,6 +5,7 @@ import GPy
 from functools import partial
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv,mdot
+import numpy.testing.assert_array_equal
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
@@ -35,6 +36,29 @@ class Laplace(likelihood):
 
         self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))
 
+        #Initial values for the GP variables
+        self.Y = np.zeros((self.N,1))
+        self.covariance_matrix = np.eye(self.N)
+        self.precision = np.ones(self.N)[:,None]
+        self.Z = 0
+        self.YYT = None
+
+    def predictive_values(self,mu,var):
+        return self.likelihood_function.predictive_values(mu,var)
+
+    def _get_params(self):
+        return np.zeros(0)
+
+    def _get_param_names(self):
+        return []
+
+    def _set_params(self,p):
+        pass # TODO: Laplace likelihood might want to take some parameters...
+
+    def _gradients(self,partial):
+        raise NotImplementedError
+        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
+
     def _compute_GP_variables(self):
         """
         Generates data Y which would give the normal distribution identical to the laplace approximation
@@ -63,11 +87,14 @@ class Laplace(likelihood):
         #Do we really need to inverse Sigma_tilde_i? :(
         (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i)
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess?
-        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde, (self.Sigma_tilde_i, Y_tilde))))
+        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))))
+
+        self.Z = self.Z_tilde
         self.Y = Y_tilde
         self.covariance_matrix = self.Sigma_tilde
-        self.precision = np.diag(self.Sigma_tilde)[:, None]
-        self.YYT = np.dot(self.Y, self.Y)
+        self.precision = 1/np.diag(self.Sigma_tilde)[:, None]
+        self.YYT = np.dot(self.Y, self.Y.T)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     def fit_full(self, K):
         """
@@ -76,7 +103,6 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         f = np.zeros((self.N, 1))
-        #K = np.diag(np.ones(self.N))
         (self.Ki, _, _, self.log_Kdet) = pdinv(K)
         LOG_K_CONST = -(0.5 * self.log_Kdet)
         OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST
@@ -95,23 +121,25 @@ class Laplace(likelihood):
             return np.squeeze(res)
 
         def obj_hess(f):
-            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki)
+            res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki)
             return np.squeeze(res)
 
         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
-        print self.f_hat
 
         #At this point get the hessian matrix
-        self.hess_hat = -1*np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) #-1*obj_hess(self.f_hat) + self.Ki
-        #self.hess_hat = -1*obj_hess(self.f_hat) + self.Ki
-        (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat + self.Ki)
+        self.hess_hat = np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) + self.Ki
+        (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat)
+        (self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat_i)
+
+        np.testing.assert_array_equal(self.hess_hat, hess_hat_new)
 
         #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...)
-        self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1?
+        #self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1?
         #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
         #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
         #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
-        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) + self.height_unnormalised - self.NORMAL_CONST #Unsure whether its log_hess or log_hess_i
-
+        #Unsure whether its log_hess or log_hess_i
+        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(f.T, (self.Ki, f))
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return self._compute_GP_variables()
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 8adbf86c..e70cdc8d 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -1,7 +1,7 @@
 from scipy.special import gammaln
 import numpy as np
 from GPy.likelihoods.likelihood_functions import likelihood_function
-
+from scipy import stats
 
 class student_t(likelihood_function):
     """Student t likelihood distribution
@@ -72,3 +72,17 @@ class student_t(likelihood_function):
         #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
         hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2)
         return hess
+
+    def predictive_values(self, mu, var):
+        """
+        Compute  mean, and conficence interval (percentiles 5 and 95) of the  prediction
+        """
+        mean = np.exp(mu)
+        p_025 = stats.t.ppf(025,mean)
+        p_975 = stats.t.ppf(975,mean)
+
+        #p_025 = tmp[:,0]
+        #p_975 = tmp[:,1]
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        return mean,p_025,p_975
+

From a9d555597653c24bc67812776514e29066216d66 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 19 Mar 2013 18:21:57 +0000
Subject: [PATCH 09/71] Worked out in terms of W; gradients still need implementing

---
 python/examples/laplace_approximations.py | 44 ++++++++++-----------
 python/likelihoods/Laplace.py             | 48 +++++++++++++++--------
 python/likelihoods/likelihood_function.py |  5 ++-
 3 files changed, 57 insertions(+), 40 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 73c8f67f..c8d06ab2 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -15,13 +15,13 @@ def student_t_approx():
     Y = np.sin(X)
 
     #Add student t random noise to datapoints
-    deg_free = 2.5
+    deg_free = 3.5
     t_rv = t(deg_free, loc=0, scale=1)
     noise = t_rv.rvs(size=Y.shape)
     Y += noise
 
     #Add some extreme value noise to some of the datapoints
-    #percent_corrupted = 0.05
+    #percent_corrupted = 0.15
     #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
     #indices = np.arange(Y.shape[0])
     #np.random.shuffle(indices)
@@ -31,11 +31,11 @@ def student_t_approx():
     #Y[corrupted_indices] += noise
 
     # Kernel object
-    #print X.shape
-    #kernel = GPy.kern.rbf(X.shape[1])
+    print X.shape
+    kernel = GPy.kern.rbf(X.shape[1])
 
-    ##A GP should completely break down due to the points as they get a lot of weight
-    ## create simple GP model
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
     #m = GPy.models.GP_regression(X, Y, kernel=kernel)
 
     ## optimize
@@ -46,27 +46,27 @@ def student_t_approx():
     #print m
 
     #with a student t distribution, since it has heavy tails it should work well
-    #likelihood_function = student_t(deg_free, sigma=1)
-    #lap = Laplace(Y, likelihood_function)
-    #cov = kernel.K(X)
-    #lap.fit_full(cov)
+    likelihood_function = student_t(deg_free, sigma=1)
+    lap = Laplace(Y, likelihood_function)
+    cov = kernel.K(X)
+    lap.fit_full(cov)
 
-    #test_range = np.arange(0, 10, 0.1)
-    #plt.plot(test_range, t_rv.pdf(test_range))
-    #for i in xrange(X.shape[0]):
-        #mode = lap.f_hat[i]
-        #covariance = lap.hess_hat_i[i,i]
-        #scaling = np.exp(lap.ln_z_hat)
-        #normalised_approx = norm(loc=mode, scale=covariance)
-        #print "Normal with mode %f, and variance %f" % (mode, covariance)
-        #plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
-    #plt.show()
-    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    test_range = np.arange(0, 10, 0.1)
+    plt.plot(test_range, t_rv.pdf(test_range))
+    for i in xrange(X.shape[0]):
+        mode = lap.f_hat[i]
+        covariance = lap.hess_hat_i[i,i]
+        scaling = np.exp(lap.ln_z_hat)
+        normalised_approx = norm(loc=mode, scale=covariance)
+        print "Normal with mode %f, and variance %f" % (mode, covariance)
+        plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    plt.show()
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     # Likelihood object
     t_distribution = student_t(deg_free, sigma=1)
     stu_t_likelihood = Laplace(Y, t_distribution)
-    kernel = GPy.kern.rbf(X.shape[1])
+    kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1])
 
     m = GPy.models.GP(X, stu_t_likelihood, kernel)
     m.ensure_default_constraints()
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 23db6abd..84128e3a 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,11 +1,11 @@
 import numpy as np
 import scipy as sp
 import GPy
-#from GPy.util.linalg import jitchol
+from scipy.linalg import cholesky, eig, inv
 from functools import partial
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv,mdot
-import numpy.testing.assert_array_equal
+#import numpy.testing.assert_array_equal
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
@@ -56,8 +56,8 @@ class Laplace(likelihood):
         pass # TODO: Laplace likelihood might want to take some parameters...
 
     def _gradients(self,partial):
+        return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
-        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
 
     def _compute_GP_variables(self):
         """
@@ -83,16 +83,23 @@ class Laplace(likelihood):
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
 
         """
-        self.Sigma_tilde_i = self.hess_hat + self.Ki
+        self.Sigma_tilde_i = self.hess_hat_i #self.W #self.hess_hat_i - self.Ki
         #Do we really need to inverse Sigma_tilde_i? :(
-        (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i)
-        Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess?
-        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))))
+        if self.likelihood_function.log_concave:
+            (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
+        else:
+            self.Sigma_tilde = inv(self.Sigma_tilde_i)
+        #f_hat? should be f but we must have optimized for them I guess?
+        Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
+        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST
+                              - 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                              + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                              )
 
         self.Z = self.Z_tilde
         self.Y = Y_tilde
         self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1/np.diag(self.Sigma_tilde)[:, None]
+        self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
         self.YYT = np.dot(self.Y, self.Y.T)
         import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
@@ -112,34 +119,41 @@ class Laplace(likelihood):
         #FIXME: Can we get rid of this horrible reshaping?
         def obj(f):
             #f = f[:, None]
-            res = -1 * (self.likelihood_function.link_function(self.data[:,0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST)
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST)
             return float(res)
 
         def obj_grad(f):
             #f = f[:, None]
-            res = -1 * (self.likelihood_function.link_grad(self.data[:,0], f) - mdot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
             return np.squeeze(res)
 
         def obj_hess(f):
-            res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki)
+            res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
             return np.squeeze(res)
 
         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
 
         #At this point get the hessian matrix
-        self.hess_hat = np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) + self.Ki
+        self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat))
+        self.hess_hat = self.Ki + self.W
         (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat)
-        (self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat_i)
 
-        np.testing.assert_array_equal(self.hess_hat, hess_hat_new)
+        #Check hess_hat is positive definite
+        try:
+            cholesky(self.hess_hat)
+        except:
+            raise ValueError("Must be positive definite")
+
+        #Check its eigenvalues are positive
+        eigenvalues = eig(self.hess_hat)
+        if not np.all(eigenvalues > 0):
+            raise ValueError("Eigen values not positive")
 
-        #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...)
-        #self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1?
         #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
         #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
         #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
         #Unsure whether its log_hess or log_hess_i
-        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(f.T, (self.Ki, f))
+        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(self.f_hat.T, (self.Ki, self.f_hat))
         import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return self._compute_GP_variables()
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index e70cdc8d..c4823703 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -19,6 +19,9 @@ class student_t(likelihood_function):
         self.v = deg_free
         self.sigma = sigma
 
+        #FIXME: This should be in the superclass
+        self.log_concave = False
+
     def link_function(self, y, f):
         """link_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
@@ -70,7 +73,7 @@ class student_t(likelihood_function):
         assert y.shape == f.shape
         e = y - f
         #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
-        hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2)
+        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
         return hess
 
     def predictive_values(self, mu, var):

From 474d5484b06bdbceefa08fa573d28326bb3f8a92 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 21 Mar 2013 14:00:22 +0000
Subject: [PATCH 10/71] Changing definitions again...

---
 python/examples/laplace_approximations.py | 15 +++++---
 python/likelihoods/Laplace.py             | 44 +++++++++++++++--------
 python/likelihoods/likelihood_function.py | 10 ++----
 3 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index c8d06ab2..6f2b19aa 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -15,8 +15,9 @@ def student_t_approx():
     Y = np.sin(X)
 
     #Add student t random noise to datapoints
-    deg_free = 3.5
-    t_rv = t(deg_free, loc=0, scale=1)
+    deg_free = 100000.5
+    real_var = 4
+    t_rv = t(deg_free, loc=0, scale=real_var)
     noise = t_rv.rvs(size=Y.shape)
     Y += noise
 
@@ -46,7 +47,7 @@ def student_t_approx():
     #print m
 
     #with a student t distribution, since it has heavy tails it should work well
-    likelihood_function = student_t(deg_free, sigma=1)
+    likelihood_function = student_t(deg_free, sigma=real_var)
     lap = Laplace(Y, likelihood_function)
     cov = kernel.K(X)
     lap.fit_full(cov)
@@ -64,7 +65,7 @@ def student_t_approx():
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     # Likelihood object
-    t_distribution = student_t(deg_free, sigma=1)
+    t_distribution = student_t(deg_free, sigma=real_var)
     stu_t_likelihood = Laplace(Y, t_distribution)
     kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1])
 
@@ -77,12 +78,16 @@ def student_t_approx():
 
     # optimize
     #m.optimize()
-    print(m)
+    #print(m)
 
     # plot
     m.plot()
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
+    m.optimize()
+    print(m)
+
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     return m
 
 
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 84128e3a..b002034d 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,7 +1,7 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv
+from scipy.linalg import cholesky, eig, inv, det
 from functools import partial
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv,mdot
@@ -43,8 +43,10 @@ class Laplace(likelihood):
         self.Z = 0
         self.YYT = None
 
-    def predictive_values(self,mu,var):
-        return self.likelihood_function.predictive_values(mu,var)
+    def predictive_values(self, mu, var, full_cov):
+        if full_cov:
+            raise NotImplementedError("Cannot make correlated predictions with an EP likelihood")
+        return self.likelihood_function.predictive_values(mu, var)
 
     def _get_params(self):
         return np.zeros(0)
@@ -52,10 +54,10 @@ class Laplace(likelihood):
     def _get_param_names(self):
         return []
 
-    def _set_params(self,p):
+    def _set_params(self, p):
         pass # TODO: Laplace likelihood might want to take some parameters...
 
-    def _gradients(self,partial):
+    def _gradients(self, partial):
         return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
 
@@ -83,7 +85,13 @@ class Laplace(likelihood):
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
 
         """
-        self.Sigma_tilde_i = self.hess_hat_i #self.W #self.hess_hat_i - self.Ki
+        self.Sigma_tilde_i = self.W #self.hess_hat_i
+        #Check it isn't singular!
+        epsilon = 1e-2
+        """
+        if np.abs(det(self.Sigma_tilde_i)) < epsilon:
+            raise ValueError("inverse covariance must be non-singular to inverse!")
+        """
         #Do we really need to inverse Sigma_tilde_i? :(
         if self.likelihood_function.log_concave:
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
@@ -91,12 +99,17 @@ class Laplace(likelihood):
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
         #f_hat? should be f but we must have optimized for them I guess?
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
-        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST
-                              - 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
-                              + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                              )
+        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                        #- 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                        #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                   #)
+        Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                   + 0.5*self.log_hess_hat_det
+                   + 0.5*mdot(self.f_hat, self.Ki , self.f_hat)
+                   + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                   )
 
-        self.Z = self.Z_tilde
+        self.Z = Z_tilde
         self.Y = Y_tilde
         self.covariance_matrix = self.Sigma_tilde
         self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
@@ -128,7 +141,7 @@ class Laplace(likelihood):
             return np.squeeze(res)
 
         def obj_hess(f):
-            res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
+            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
             return np.squeeze(res)
 
         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
@@ -153,7 +166,10 @@ class Laplace(likelihood):
         #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
         #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
         #Unsure whether its log_hess or log_hess_i
-        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(self.f_hat.T, (self.Ki, self.f_hat))
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        self.ln_z_hat = (-0.5*self.log_hess_hat_det
+                         - 0.5*self.log_Kdet
+                         -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+                         - mdot(self.f_hat.T, (self.Ki, self.f_hat))
+                         )
 
         return self._compute_GP_variables()
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index c4823703..a299fe3a 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -81,11 +81,7 @@ class student_t(likelihood_function):
         Compute  mean, and conficence interval (percentiles 5 and 95) of the  prediction
         """
         mean = np.exp(mu)
-        p_025 = stats.t.ppf(025,mean)
-        p_975 = stats.t.ppf(975,mean)
-
-        #p_025 = tmp[:,0]
-        #p_975 = tmp[:,1]
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-        return mean,p_025,p_975
+        p_025 = stats.t.ppf(.025, mean)
+        p_975 = stats.t.ppf(.975, mean)
 
+        return mean, np.nan*mean, p_025, p_975

From 7b0d0550cb01f0c4eca567e80f950e7f54ecb7b2 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 22 Mar 2013 12:50:47 +0000
Subject: [PATCH 11/71] Seemed to be working, now it's not

---
 python/examples/laplace_approximations.py | 118 +++++++++++++---------
 python/likelihoods/Laplace.py             |  37 +++----
 2 files changed, 92 insertions(+), 63 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 6f2b19aa..5fb39e08 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -11,15 +11,22 @@ def student_t_approx():
     Example of regressing with a student t likelihood
     """
     #Start a function, any function
-    X = np.sort(np.random.uniform(0, 15, 100))[:, None]
-    Y = np.sin(X)
+    X = np.linspace(0.0, 10.0, 100)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*0.1
+    Yc = Y.copy()
+
+    Y = Y/Y.max()
+
+    Yc[10] += 5
+    Yc[15] += 20
+    Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 100000.5
-    real_var = 4
-    t_rv = t(deg_free, loc=0, scale=real_var)
-    noise = t_rv.rvs(size=Y.shape)
-    Y += noise
+    deg_free = 1000000 #100000.5
+    real_var = 0.1
+    #t_rv = t(deg_free, loc=0, scale=real_var)
+    #noise = t_rvrvs(size=Y.shape)
+    #Y += noise
 
     #Add some extreme value noise to some of the datapoints
     #percent_corrupted = 0.15
@@ -30,64 +37,83 @@ def student_t_approx():
     #print corrupted_indices
     #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
     #Y[corrupted_indices] += noise
-
+    plt.figure(1)
     # Kernel object
-    print X.shape
-    kernel = GPy.kern.rbf(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()
 
-    #A GP should completely break down due to the points as they get a lot of weight
-    # create simple GP model
-    #m = GPy.models.GP_regression(X, Y, kernel=kernel)
-
-    ## optimize
+    #print "Clean Gaussian"
+    ##A GP should completely break down due to the points as they get a lot of weight
+    ## create simple GP model
+    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    ### optimize
     #m.ensure_default_constraints()
+    ##m.unconstrain('noise')
+    ##m.constrain_fixed('noise', 0.1)
     #m.optimize()
     ## plot
-    ##m.plot()
+    #plt.subplot(221)
+    #m.plot()
     #print m
 
-    #with a student t distribution, since it has heavy tails it should work well
-    likelihood_function = student_t(deg_free, sigma=real_var)
-    lap = Laplace(Y, likelihood_function)
-    cov = kernel.K(X)
-    lap.fit_full(cov)
+    ##Corrupt
+    #print "Corrupt Gaussian"
+    #m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    #m.ensure_default_constraints()
+    ##m.unconstrain('noise')
+    ##m.constrain_fixed('noise', 0.1)
+    #m.optimize()
+    #plt.subplot(222)
+    #m.plot()
+    #print m
 
-    test_range = np.arange(0, 10, 0.1)
-    plt.plot(test_range, t_rv.pdf(test_range))
-    for i in xrange(X.shape[0]):
-        mode = lap.f_hat[i]
-        covariance = lap.hess_hat_i[i,i]
-        scaling = np.exp(lap.ln_z_hat)
-        normalised_approx = norm(loc=mode, scale=covariance)
-        print "Normal with mode %f, and variance %f" % (mode, covariance)
-        plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
-    plt.show()
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    ##with a student t distribution, since it has heavy tails it should work well
+    ##likelihood_function = student_t(deg_free, sigma=real_var)
+    ##lap = Laplace(Y, likelihood_function)
+    ##cov = kernel.K(X)
+    ##lap.fit_full(cov)
+
+    ##test_range = np.arange(0, 10, 0.1)
+    ##plt.plot(test_range, t_rv.pdf(test_range))
+    ##for i in xrange(X.shape[0]):
+        ##mode = lap.f_hat[i]
+        ##covariance = lap.hess_hat_i[i,i]
+        ##scaling = np.exp(lap.ln_z_hat)
+        ##normalised_approx = norm(loc=mode, scale=covariance)
+        ##print "Normal with mode %f, and variance %f" % (mode, covariance)
+        ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    ##plt.show()
 
     # Likelihood object
-    t_distribution = student_t(deg_free, sigma=real_var)
+    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
     stu_t_likelihood = Laplace(Y, t_distribution)
-    kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1])
 
-    m = GPy.models.GP(X, stu_t_likelihood, kernel)
+    print "Clean student t"
+    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     m.ensure_default_constraints()
-
     m.update_likelihood_approximation()
-    print "NEW MODEL"
-    print(m)
-
     # optimize
-    #m.optimize()
-    #print(m)
-
-    # plot
-    m.plot()
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-
     m.optimize()
     print(m)
+    # plot
+    plt.subplot(211)
+    m.plot_f()
+
+    print "Corrupt student t"
+    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
+    corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(212)
+    m.plot_f()
 
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
     return m
 
 
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index b002034d..d86523d8 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -33,13 +33,15 @@ class Laplace(likelihood):
 
         #Inital values
         self.N, self.D = self.data.shape
+        self.is_heteroscedastic = True
+        self.Nparams = 0
 
         self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))
 
         #Initial values for the GP variables
-        self.Y = np.zeros((self.N,1))
+        self.Y = np.zeros((self.N, 1))
         self.covariance_matrix = np.eye(self.N)
-        self.precision = np.ones(self.N)[:,None]
+        self.precision = np.ones(self.N)[:, None]
         self.Z = 0
         self.YYT = None
 
@@ -58,6 +60,7 @@ class Laplace(likelihood):
         pass # TODO: Laplace likelihood might want to take some parameters...
 
     def _gradients(self, partial):
+        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
 
@@ -88,10 +91,8 @@ class Laplace(likelihood):
         self.Sigma_tilde_i = self.W #self.hess_hat_i
         #Check it isn't singular!
         epsilon = 1e-2
-        """
         if np.abs(det(self.Sigma_tilde_i)) < epsilon:
             raise ValueError("inverse covariance must be non-singular to inverse!")
-        """
         #Do we really need to inverse Sigma_tilde_i? :(
         if self.likelihood_function.log_concave:
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
@@ -99,21 +100,17 @@ class Laplace(likelihood):
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
         #f_hat? should be f but we must have optimized for them I guess?
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
-        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                        #- 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
-                        #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                   #)
         Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                   + 0.5*self.log_hess_hat_det
-                   + 0.5*mdot(self.f_hat, self.Ki , self.f_hat)
-                   + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                    + 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                    + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                    - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
                    )
 
         self.Z = Z_tilde
-        self.Y = Y_tilde
+        self.Y = Y_tilde[:, None]
+        self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
         self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
-        self.YYT = np.dot(self.Y, self.Y.T)
         import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     def fit_full(self, K):
@@ -122,6 +119,7 @@ class Laplace(likelihood):
         For nomenclature see Rasmussen & Williams 2006
         :K: Covariance matrix
         """
+        self.K = K.copy()
         f = np.zeros((self.N, 1))
         (self.Ki, _, _, self.log_Kdet) = pdinv(K)
         LOG_K_CONST = -(0.5 * self.log_Kdet)
@@ -148,6 +146,11 @@ class Laplace(likelihood):
 
         #At this point get the hessian matrix
         self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat))
+        if not self.likelihood_function.log_concave:
+            self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                   #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                   #To cause the posterior to become less certain than the prior and likelihood,
+                                   #This is a property only held by non-log-concave likelihoods
         self.hess_hat = self.Ki + self.W
         (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat)
 
@@ -166,10 +169,10 @@ class Laplace(likelihood):
         #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
         #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
         #Unsure whether its log_hess or log_hess_i
-        self.ln_z_hat = (-0.5*self.log_hess_hat_det
-                         - 0.5*self.log_Kdet
-                         -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat)
-                         - mdot(self.f_hat.T, (self.Ki, self.f_hat))
+        self.ln_z_hat = (- 0.5*self.log_hess_hat_det
+                         + 0.5*self.log_Kdet
+                         + self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+                         - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
                          )
 
         return self._compute_GP_variables()

From 15d5c2f22dff65a518a4f6a155e457a6516fca17 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 28 Mar 2013 17:42:42 +0000
Subject: [PATCH 12/71] Working laplace, just needs predictive values

---
 python/examples/laplace_approximations.py | 80 +++++++++++++----------
 python/likelihoods/Laplace.py             | 15 +++--
 python/likelihoods/likelihood_function.py | 72 ++++++++++++++++++--
 3 files changed, 121 insertions(+), 46 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 5fb39e08..37681849 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -10,20 +10,23 @@ def student_t_approx():
     """
     Example of regressing with a student t likelihood
     """
+    real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 100)[:, None]
-    Y = np.sin(X) + np.random.randn(*X.shape)*0.1
+    X = np.linspace(0.0, 10.0, 30)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
     Yc = Y.copy()
 
-    Y = Y/Y.max()
+    #Y = Y/Y.max()
 
-    Yc[10] += 5
-    Yc[15] += 20
-    Yc = Yc/Yc.max()
+    #Yc[10] += 100
+    Yc[25] += 10
+    Yc[23] += 10
+    Yc[24] += 10
+    #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 1000000 #100000.5
-    real_var = 0.1
+    deg_free = 20 #100000.5
+    real_sd = np.sqrt(real_var)
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
@@ -38,36 +41,37 @@ def student_t_approx():
     #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
     #Y[corrupted_indices] += noise
     plt.figure(1)
+    plt.suptitle('Gaussian likelihood')
     # Kernel object
     kernel1 = GPy.kern.rbf(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
 
-    #print "Clean Gaussian"
-    ##A GP should completely break down due to the points as they get a lot of weight
-    ## create simple GP model
-    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
-    ### optimize
-    #m.ensure_default_constraints()
-    ##m.unconstrain('noise')
-    ##m.constrain_fixed('noise', 0.1)
-    #m.optimize()
-    ## plot
-    #plt.subplot(221)
-    #m.plot()
-    #print m
+    print "Clean Gaussian"
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    ## optimize
+    m.ensure_default_constraints()
+    #m.unconstrain('noise')
+    #m.constrain_fixed('noise', 0.1)
+    m.optimize()
+    # plot
+    plt.subplot(211)
+    m.plot()
+    print m
 
     ##Corrupt
-    #print "Corrupt Gaussian"
-    #m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
-    #m.ensure_default_constraints()
-    ##m.unconstrain('noise')
-    ##m.constrain_fixed('noise', 0.1)
-    #m.optimize()
-    #plt.subplot(222)
-    #m.plot()
-    #print m
+    print "Corrupt Gaussian"
+    m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    m.ensure_default_constraints()
+    #m.unconstrain('noise')
+    #m.constrain_fixed('noise', 0.1)
+    m.optimize()
+    plt.subplot(212)
+    m.plot()
+    print m
 
     ##with a student t distribution, since it has heavy tails it should work well
     ##likelihood_function = student_t(deg_free, sigma=real_var)
@@ -86,9 +90,13 @@ def student_t_approx():
         ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
     ##plt.show()
 
+    plt.figure(2)
+    plt.suptitle('Student-t likelihood')
+    edited_real_sd = real_sd
+
     # Likelihood object
-    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
-    stu_t_likelihood = Laplace(Y, t_distribution)
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Yc, t_distribution)
 
     print "Clean student t"
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
@@ -100,9 +108,11 @@ def student_t_approx():
     # plot
     plt.subplot(211)
     m.plot_f()
+    plt.ylim(-2.5,2.5)
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     print "Corrupt student t"
-    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
     corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
@@ -110,8 +120,8 @@ def student_t_approx():
     m.optimize()
     print(m)
     plt.subplot(212)
-    m.plot_f()
-
+    m.plot()
+    plt.ylim(-2.5,2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     return m
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index d86523d8..1411c22b 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -88,11 +88,12 @@ class Laplace(likelihood):
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
 
         """
-        self.Sigma_tilde_i = self.W #self.hess_hat_i
+        self.Sigma_tilde_i = self.W
         #Check it isn't singular!
-        epsilon = 1e-2
+        epsilon = 1e-6
         if np.abs(det(self.Sigma_tilde_i)) < epsilon:
-            raise ValueError("inverse covariance must be non-singular to inverse!")
+            print "WARNING: Transformed covariance matrix is signular!"
+            #raise ValueError("inverse covariance must be non-singular to invert!")
         #Do we really need to inverse Sigma_tilde_i? :(
         if self.likelihood_function.log_concave:
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
@@ -110,8 +111,12 @@ class Laplace(likelihood):
         self.Y = Y_tilde[:, None]
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        #if not self.likelihood_function.log_concave:
+            #self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                   ##If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                   ##To cause the posterior to become less certain than the prior and likelihood,
+                                   ##This is a property only held by non-log-concave likelihoods
+        self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
 
     def fit_full(self, K):
         """
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index a299fe3a..7ac9c661 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -1,4 +1,5 @@
-from scipy.special import gammaln
+from scipy.special import gammaln, gamma
+from scipy import integrate
 import numpy as np
 from GPy.likelihoods.likelihood_functions import likelihood_function
 from scipy import stats
@@ -79,9 +80,68 @@ class student_t(likelihood_function):
     def predictive_values(self, mu, var):
         """
         Compute  mean, and conficence interval (percentiles 5 and 95) of the  prediction
-        """
-        mean = np.exp(mu)
-        p_025 = stats.t.ppf(.025, mean)
-        p_975 = stats.t.ppf(.975, mean)
 
-        return mean, np.nan*mean, p_025, p_975
+        Need to find what the variance is at the latent points for a student t*normal
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+
+(((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        """
+        #p_025 = stats.t.ppf(.025, mu)
+        #p_975 = stats.t.ppf(.975, mu)
+
+        num_test_points = mu.shape[0]
+        #Each mu is the latent point f* at the test point x*,
+        #and the var is the gaussian variance at this point
+        #Take lots of samples from this, so we have lots of possible values
+        #for latent point f* for each test point x* weighted by how likely we were to pick it
+        print "Taking %d samples of f*".format(num_test_points)
+        num_f_samples = 10
+        num_y_samples = 10
+        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
+        print "Student t means shape: ", student_t_means.shape
+
+        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
+        #from a student t centred on this point, sample many points from this distribution
+        #centred on f*
+        #for test_point, f in enumerate(student_t_means):
+            #print test_point
+            #print f.shape
+            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
+                                            #scale=self.sigma,
+                                            #size=(num_f_samples, num_y_samples))
+            #print student_t_samples.shape
+
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:,None],
+                                        scale=self.sigma,
+                                        size=(num_test_points, num_y_samples, num_f_samples))
+        student_t_samples = np.reshape(student_t_samples,
+                                       (num_test_points, num_y_samples*num_f_samples))
+
+        #Now take the 97.5 and 0.25 percentile of these points
+        p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None]
+        p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None]
+
+        p_025 = 1+p_025
+        p_975 = 1+p_975
+
+        ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
+        def t_gaussian(f, mu, var):
+            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
+                        * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
+                    )
+
+        def t_gauss_int(mu, var):
+            print "Mu: ", mu
+            print "var: ", var
+            result = integrate.quad(t_gaussian, -np.inf, 0.975, args=(mu, var))
+            print "Result: ", result
+            return result[0]
+
+        vec_t_gauss_int = np.vectorize(t_gauss_int)
+
+        p_025 = vec_t_gauss_int(mu, var)
+        p_975 = vec_t_gauss_int(mu, var)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+        return mu, np.nan*mu, p_025, p_975

From ffc168c1d20f36b1e72501176c4a7bb88ff41614 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 2 Apr 2013 12:33:01 +0100
Subject: [PATCH 13/71] Added predicted values for student t, works well

---
 python/examples/laplace_approximations.py | 48 +++++++++++------------
 python/likelihoods/likelihood_function.py | 41 ++++++++++++++-----
 2 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 37681849..6374a5fd 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -18,7 +18,7 @@ def student_t_approx():
 
     #Y = Y/Y.max()
 
-    #Yc[10] += 100
+    Yc[10] += 100
     Yc[25] += 10
     Yc[23] += 10
     Yc[24] += 10
@@ -52,51 +52,30 @@ def student_t_approx():
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
     m = GPy.models.GP_regression(X, Y, kernel=kernel1)
-    ## optimize
+    # optimize
     m.ensure_default_constraints()
-    #m.unconstrain('noise')
-    #m.constrain_fixed('noise', 0.1)
     m.optimize()
     # plot
     plt.subplot(211)
     m.plot()
     print m
 
-    ##Corrupt
+    #Corrupt
     print "Corrupt Gaussian"
     m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
     m.ensure_default_constraints()
-    #m.unconstrain('noise')
-    #m.constrain_fixed('noise', 0.1)
     m.optimize()
     plt.subplot(212)
     m.plot()
     print m
 
-    ##with a student t distribution, since it has heavy tails it should work well
-    ##likelihood_function = student_t(deg_free, sigma=real_var)
-    ##lap = Laplace(Y, likelihood_function)
-    ##cov = kernel.K(X)
-    ##lap.fit_full(cov)
-
-    ##test_range = np.arange(0, 10, 0.1)
-    ##plt.plot(test_range, t_rv.pdf(test_range))
-    ##for i in xrange(X.shape[0]):
-        ##mode = lap.f_hat[i]
-        ##covariance = lap.hess_hat_i[i,i]
-        ##scaling = np.exp(lap.ln_z_hat)
-        ##normalised_approx = norm(loc=mode, scale=covariance)
-        ##print "Normal with mode %f, and variance %f" % (mode, covariance)
-        ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
-    ##plt.show()
-
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd
 
     # Likelihood object
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Yc, t_distribution)
+    stu_t_likelihood = Laplace(Y, t_distribution)
 
     print "Clean student t"
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
@@ -107,7 +86,7 @@ def student_t_approx():
     print(m)
     # plot
     plt.subplot(211)
-    m.plot_f()
+    m.plot()
     plt.ylim(-2.5,2.5)
     #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
@@ -124,6 +103,23 @@ def student_t_approx():
     plt.ylim(-2.5,2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
+    ###with a student t distribution, since it has heavy tails it should work well
+    ###likelihood_function = student_t(deg_free, sigma=real_var)
+    ###lap = Laplace(Y, likelihood_function)
+    ###cov = kernel.K(X)
+    ###lap.fit_full(cov)
+
+    ###test_range = np.arange(0, 10, 0.1)
+    ###plt.plot(test_range, t_rv.pdf(test_range))
+    ###for i in xrange(X.shape[0]):
+        ###mode = lap.f_hat[i]
+        ###covariance = lap.hess_hat_i[i,i]
+        ###scaling = np.exp(lap.ln_z_hat)
+        ###normalised_approx = norm(loc=mode, scale=covariance)
+        ###print "Normal with mode %f, and variance %f" % (mode, covariance)
+        ###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    ###plt.show()
+
     return m
 
 
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 7ac9c661..61b5c427 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -23,6 +23,10 @@ class student_t(likelihood_function):
         #FIXME: This should be in the superclass
         self.log_concave = False
 
+    @property
+    def variance(self):
+        return (self.v / float(self.v - 2)) * (self.sigma**2)
+
     def link_function(self, y, f):
         """link_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
@@ -79,14 +83,32 @@ class student_t(likelihood_function):
 
     def predictive_values(self, mu, var):
         """
-        Compute  mean, and conficence interval (percentiles 5 and 95) of the  prediction
+        Compute  mean, and conficence interval (percentiles 5 and 95) of the prediction
 
-        Need to find what the variance is at the latent points for a student t*normal
-        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
 
-(((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
-*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
         """
+
+        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
+        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
+        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
+        #Which was also given to us as (var)
+        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
+        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
+        true_var = var + self.variance
+
+        #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now
+        #need the 95 and 5 percentiles.
+        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
+        p_025 = mu - 2.*true_var
+        p_975 = mu + 2.*true_var
+
+        return mu, np.nan*mu, p_025, p_975
+
+    def sample_predicted_values(self, mu, var):
+        """ Experimental sample approches and numerical integration """
         #p_025 = stats.t.ppf(.025, mu)
         #p_975 = stats.t.ppf(.975, mu)
 
@@ -134,14 +156,13 @@ class student_t(likelihood_function):
         def t_gauss_int(mu, var):
             print "Mu: ", mu
             print "var: ", var
-            result = integrate.quad(t_gaussian, -np.inf, 0.975, args=(mu, var))
+            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
             print "Result: ", result
             return result[0]
 
         vec_t_gauss_int = np.vectorize(t_gauss_int)
 
-        p_025 = vec_t_gauss_int(mu, var)
-        p_975 = vec_t_gauss_int(mu, var)
+        p = vec_t_gauss_int(mu, var)
+        p_025 = mu - p
+        p_975 = mu + p
         import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-
-        return mu, np.nan*mu, p_025, p_975

From afa5b1f9561189b3774a895b765d708186c10f5c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 2 Apr 2013 12:39:57 +0100
Subject: [PATCH 14/71] Tidying up

---
 python/likelihoods/likelihood_function.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 61b5c427..50f9b620 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -88,7 +88,6 @@ class student_t(likelihood_function):
         Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
         (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
         *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
-
         """
 
         #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
@@ -144,9 +143,6 @@ class student_t(likelihood_function):
         p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None]
         p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None]
 
-        p_025 = 1+p_025
-        p_975 = 1+p_975
-
         ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
         def t_gaussian(f, mu, var):
             return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))

From 0312f319ad4eef37f0c173120d80cc373d149519 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 2 Apr 2013 20:00:31 +0100
Subject: [PATCH 15/71] Still working on rasmussen, link function needs
 vectorizing I think

---
 python/examples/laplace_approximations.py |  58 ++++++---
 python/likelihoods/Laplace.py             | 137 ++++++++++++++++------
 python/likelihoods/likelihood_function.py |  13 +-
 3 files changed, 154 insertions(+), 54 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 6374a5fd..a1c71c71 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -16,6 +16,9 @@ def student_t_approx():
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
     Yc = Y.copy()
 
+    X_full = np.linspace(0.0, 10.0, 500)[:, None]
+    Y_full = np.sin(X_full)
+
     #Y = Y/Y.max()
 
     Yc[10] += 100
@@ -25,7 +28,7 @@ def student_t_approx():
     #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 20 #100000.5
+    deg_free = 10
     real_sd = np.sqrt(real_var)
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
@@ -47,6 +50,8 @@ def student_t_approx():
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
+    kernel5 = kernel1.copy()
+    kernel6 = kernel1.copy()
 
     print "Clean Gaussian"
     #A GP should completely break down due to the points as they get a lot of weight
@@ -58,6 +63,7 @@ def student_t_approx():
     # plot
     plt.subplot(211)
     m.plot()
+    plt.plot(X_full, Y_full)
     print m
 
     #Corrupt
@@ -67,40 +73,64 @@ def student_t_approx():
     m.optimize()
     plt.subplot(212)
     m.plot()
+    plt.plot(X_full, Y_full)
     print m
 
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd
 
-    # Likelihood object
+    print "Clean student t, ncg"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Y, t_distribution)
-
-    print "Clean student t"
+    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
-    # optimize
     m.optimize()
     print(m)
-    # plot
-    plt.subplot(211)
+    plt.subplot(221)
     m.plot()
-    plt.ylim(-2.5,2.5)
-    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
 
-    print "Corrupt student t"
+    print "Corrupt student t, ncg"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(223)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Clean student t, rasm"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(222)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Corrupt student t, rasm"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
-    plt.subplot(212)
+    plt.subplot(224)
     m.plot()
-    plt.ylim(-2.5,2.5)
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     ###with a student t distribution, since it has heavy tails it should work well
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 1411c22b..8eb69869 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,16 +1,15 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, det
-from functools import partial
+from scipy.linalg import cholesky, eig, inv, det, cho_solve
 from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv,mdot
+from GPy.util.linalg import pdinv, mdot, jitchol
 #import numpy.testing.assert_array_equal
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
-    def __init__(self, data, likelihood_function):
+    def __init__(self, data, likelihood_function, rasm=True):
         """
         Laplace Approximation
 
@@ -30,6 +29,7 @@ class Laplace(likelihood):
         """
         self.data = data
         self.likelihood_function = likelihood_function
+        self.rasm = rasm
 
         #Inital values
         self.N, self.D = self.data.shape
@@ -102,20 +102,16 @@ class Laplace(likelihood):
         #f_hat? should be f but we must have optimized for them I guess?
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
         Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                    + 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                    + 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
                     + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
                     - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
                    )
 
-        self.Z = Z_tilde
-        self.Y = Y_tilde[:, None]
+        #Convert to float as its (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
-        #if not self.likelihood_function.log_concave:
-            #self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-                                   ##If the likelihood is non-log-concave. We wan't to say that there is a negative variance
-                                   ##To cause the posterior to become less certain than the prior and likelihood,
-                                   ##This is a property only held by non-log-concave likelihoods
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
 
     def fit_full(self, K):
@@ -125,32 +121,15 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
-        f = np.zeros((self.N, 1))
-        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
-        LOG_K_CONST = -(0.5 * self.log_Kdet)
-        OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST
-        #Find \hat(f) using a newton raphson optimizer for example
-        #TODO: Add newton-raphson as subclass of optimizer class
-
-        #FIXME: Can we get rid of this horrible reshaping?
-        def obj(f):
-            #f = f[:, None]
-            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST)
-            return float(res)
-
-        def obj_grad(f):
-            #f = f[:, None]
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
-            return np.squeeze(res)
-
-        def obj_hess(f):
-            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
-            return np.squeeze(res)
-
-        self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        self.Ki, _, _, self.log_Kdet = pdinv(K)
+        if self.rasm:
+            self.f_hat = self.rasm_mode(K)
+        else:
+            self.f_hat = self.ncg_mode(K)
 
         #At this point get the hessian matrix
-        self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat))
+        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
+
         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                    #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@@ -176,8 +155,92 @@ class Laplace(likelihood):
         #Unsure whether its log_hess or log_hess_i
         self.ln_z_hat = (- 0.5*self.log_hess_hat_det
                          + 0.5*self.log_Kdet
-                         + self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+                         + self.likelihood_function.link_function(self.data, self.f_hat)
+                         #+ self.likelihood_function.link_function(self.data, self.f_hat)
                          - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
                          )
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return self._compute_GP_variables()
+
+    def ncg_mode(self, K):
+        """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
+        :K: Covariance matrix
+        :returns: f_mode
+        """
+        self.K = K.copy()
+        f = np.zeros((self.N, 1))
+        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
+        LOG_K_CONST = -(0.5 * self.log_Kdet)
+
+        #FIXME: Can we get rid of this horrible reshaping?
+        def obj(f):
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
+                        + self.NORMAL_CONST + LOG_K_CONST)
+            return float(res)
+
+        def obj_grad(f):
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
+            return np.squeeze(res)
+
+        def obj_hess(f):
+            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
+            return np.squeeze(res)
+
+        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        return f_hat[:, None]
+
+    def rasm_mode(self, K):
+        """
+        Rasmussens numerically stable mode finding
+        For nomenclature see Rasmussen & Williams 2006
+
+        :K: Covariance matrix
+        :returns: f_mode
+        """
+        f = np.zeros((self.N, 1))
+        new_obj = -np.inf
+        old_obj = np.inf
+
+        def obj(a, f):
+            #Careful of shape of data!
+            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)
+
+        difference = np.inf
+        epsilon = 1e-16
+        step_size = 1
+        while difference > epsilon:
+            W = -np.diag(self.likelihood_function.link_hess(self.data, f))
+            if not self.likelihood_function.log_concave:
+                #if np.any(W < 0):
+                    #print "NEGATIVE VALUES :("
+                    #pass
+                W[W < 0] = 1e-6     #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                    #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                    #To cause the posterior to become less certain than the prior and likelihood,
+                                    #This is a property only held by non-log-concave likelihoods
+            #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
+            W_12 = np.sqrt(W)
+            B = np.eye(self.N) + mdot(W_12, K, W_12)
+            L = jitchol(B)
+            b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f))
+            #TODO: Check L is lower
+            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
+            a = b - mdot(W_12, solve_L)
+            f = np.dot(K, a)
+            old_obj = new_obj
+            new_obj = obj(a, f)
+            difference = new_obj - old_obj
+            #print "Difference: ", new_obj - old_obj
+            if difference < 0:
+                #If the objective function isn't rising, restart optimization
+                print "Reducing step-size, restarting"
+                #objective function isn't increasing, try reducing step size
+                step_size *= 0.9
+                f = np.zeros((self.N, 1))
+                new_obj = -np.inf
+                old_obj = np.inf
+
+            difference = abs(difference)
+
+        return f
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 50f9b620..15859a81 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -36,7 +36,10 @@ class student_t(likelihood_function):
         :returns: float(likelihood evaluated for this point)
 
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
+
         e = y - f
         objective = (gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
@@ -44,6 +47,7 @@ class student_t(likelihood_function):
                      - (self.v + 1) * 0.5
                      * np.log(1 + ((e**2 / self.sigma**2) / self.v))
                      )
+        print (e**2).shape
         return np.sum(objective)
 
     def link_grad(self, y, f):
@@ -57,10 +61,12 @@ class student_t(likelihood_function):
         :returns: gradient of likelihood evaluated at points
 
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
-        return grad
+        return np.squeeze(grad)
 
     def link_hess(self, y, f):
         """
@@ -75,11 +81,12 @@ class student_t(likelihood_function):
         :f: latent variables f
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
-        return hess
+        return np.squeeze(hess)
 
     def predictive_values(self, mu, var):
         """

From 2006a94caa859d195a7c2af1236eb84656b68cfc Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 3 Apr 2013 10:55:58 +0100
Subject: [PATCH 16/71] Fixed broadcasting bug, rasm now appears to work

---
 python/likelihoods/Laplace.py             | 16 ++++++++++------
 python/likelihoods/likelihood_function.py |  1 -
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 8eb69869..e967a743 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -159,7 +159,6 @@ class Laplace(likelihood):
                          #+ self.likelihood_function.link_function(self.data, self.f_hat)
                          - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
                          )
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return self._compute_GP_variables()
 
@@ -190,7 +189,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
         return f_hat[:, None]
 
-    def rasm_mode(self, K):
+    def rasm_mode(self, K, MAX_ITER=5000, MAX_RESTART=30):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -209,7 +208,9 @@ class Laplace(likelihood):
         difference = np.inf
         epsilon = 1e-16
         step_size = 1
-        while difference > epsilon:
+        rs = 0
+        i = 0
+        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
                 #if np.any(W < 0):
@@ -223,7 +224,7 @@ class Laplace(likelihood):
             W_12 = np.sqrt(W)
             B = np.eye(self.N) + mdot(W_12, K, W_12)
             L = jitchol(B)
-            b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f))
+            b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)[:, None])
             #TODO: Check L is lower
             solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
             a = b - mdot(W_12, solve_L)
@@ -234,13 +235,16 @@ class Laplace(likelihood):
             #print "Difference: ", new_obj - old_obj
             if difference < 0:
                 #If the objective function isn't rising, restart optimization
-                print "Reducing step-size, restarting"
-                #objective function isn't increasing, try reducing step size
                 step_size *= 0.9
+                print "Objective function rose"
+                print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                #objective function isn't increasing, try reducing step size
                 f = np.zeros((self.N, 1))
                 new_obj = -np.inf
                 old_obj = np.inf
+                rs += 1
 
             difference = abs(difference)
+            i += 1
 
         return f
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 15859a81..49174ce7 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -47,7 +47,6 @@ class student_t(likelihood_function):
                      - (self.v + 1) * 0.5
                      * np.log(1 + ((e**2 / self.sigma**2) / self.v))
                      )
-        print (e**2).shape
         return np.sum(objective)
 
     def link_grad(self, y, f):

From 4a14a82dfba4bd3c48d4175bb8a861bab24a0d10 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 5 Apr 2013 17:34:11 +0100
Subject: [PATCH 17/71] Got the mode finding without computing Ki

---
 python/examples/laplace_approximations.py |  85 +++++++++-----
 python/likelihoods/Laplace.py             | 130 ++++++++++++++++------
 2 files changed, 152 insertions(+), 63 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index a1c71c71..7ab26406 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -6,6 +6,38 @@ from coxGP.python.likelihoods.Laplace import Laplace
 from coxGP.python.likelihoods.likelihood_function import student_t
 
 
+def timing():
+    real_var = 0.1
+    times = 1000
+    deg_free = 10
+    real_sd = np.sqrt(real_var)
+    the_is = np.zeros(times)
+    X = np.linspace(0.0, 10.0, 30)[:, None]
+    for a in xrange(times):
+        Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+        Yc = Y.copy()
+
+        Yc[10] += 100
+        Yc[25] += 10
+        Yc[23] += 10
+        Yc[24] += 10
+
+        edited_real_sd = real_sd
+        kernel1 = GPy.kern.rbf(X.shape[1])
+
+        t_distribution = student_t(deg_free, sigma=edited_real_sd)
+        corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
+        m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
+        m.ensure_default_constraints()
+        m.update_likelihood_approximation()
+        m.optimize()
+        the_is[a] = m.likelihood.i
+
+    print the_is
+    print np.mean(the_is)
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+
 def student_t_approx():
     """
     Example of regressing with a student t likelihood
@@ -80,32 +112,6 @@ def student_t_approx():
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd
 
-    print "Clean student t, ncg"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
-    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    plt.subplot(221)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
-
-    print "Corrupt student t, ncg"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
-    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    plt.subplot(223)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
-
     print "Clean student t, rasm"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
@@ -133,6 +139,33 @@ def student_t_approx():
     plt.ylim(-2.5, 2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
+    print "Clean student t, ncg"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(221)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Corrupt student t, ncg"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(223)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+
     ###with a student t distribution, since it has heavy tails it should work well
     ###likelihood_function = student_t(deg_free, sigma=real_var)
     ###lap = Laplace(Y, likelihood_function)
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index e967a743..396a0bc7 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -100,12 +100,19 @@ class Laplace(likelihood):
         else:
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
         #f_hat? should be f but we must have optimized for them I guess?
-        Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
-        Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                    + 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
-                    + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                    - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
-                   )
+        #Y_tilde = mdot(self.Sigma_tilde, self.hess_hat_i, self.f_hat)
+        Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
+        #KW = np.dot(self.K, self.W)
+        #KW_i, _, _, _ = pdinv(KW)
+        #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat)
+        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                    #+ 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
+                    #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                    #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
+                   #)
+        _, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
+        f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
+        Z_tilde = -self.NORMAL_CONST + self.ln_z_hat -0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -121,7 +128,7 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
-        self.Ki, _, _, self.log_Kdet = pdinv(K)
+        self.Ki, _, _, log_Kdet = pdinv(K)
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
@@ -135,33 +142,64 @@ class Laplace(likelihood):
                                    #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                    #To cause the posterior to become less certain than the prior and likelihood,
                                    #This is a property only held by non-log-concave likelihoods
-        self.hess_hat = self.Ki + self.W
-        (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat)
+        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
+        self.B, L, self.W_12 = self._compute_B_statistics(K, self.W)
+        self.Bi, _, _, B_det = pdinv(self.B)
+        #ln_W_det = np.linalg.det(self.W)
+        #ln_B_det = np.linalg.det(self.B)
+        ln_det = np.linalg.det(np.eye(self.N) - mdot(self.W_12, self.Bi, self.W_12, K))
+        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
+        #TODO: Check L is lower
+        solve_L = cho_solve((L, True), mdot(self.W_12, (K, b)))
+        a = b - mdot(self.W_12, solve_L)
+        self.f_Ki_f = np.dot(self.f_hat.T, a)
 
-        #Check hess_hat is positive definite
-        try:
-            cholesky(self.hess_hat)
-        except:
-            raise ValueError("Must be positive definite")
+        #self.hess_hat = self.Ki + self.W
+        #(self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat)
 
-        #Check its eigenvalues are positive
-        eigenvalues = eig(self.hess_hat)
-        if not np.all(eigenvalues > 0):
-            raise ValueError("Eigen values not positive")
+        ##Check hess_hat is positive definite
+        #try:
+            #cholesky(self.hess_hat)
+        #except:
+            #raise ValueError("Must be positive definite")
+
+        ##Check its eigenvalues are positive
+        #eigenvalues = eig(self.hess_hat)
+        #if not np.all(eigenvalues > 0):
+            #raise ValueError("Eigen values not positive")
 
         #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
         #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
         #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
         #Unsure whether its log_hess or log_hess_i
-        self.ln_z_hat = (- 0.5*self.log_hess_hat_det
-                         + 0.5*self.log_Kdet
-                         + self.likelihood_function.link_function(self.data, self.f_hat)
+        #self.ln_z_hat = (- 0.5*self.log_hess_hat_i_det
+                         #+ 0.5*self.log_Kdet
                          #+ self.likelihood_function.link_function(self.data, self.f_hat)
-                         - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
+                         ##+ self.likelihood_function.link_function(self.data, self.f_hat)
+                         #- 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
+                         #)
+        self.ln_z_hat = (- 0.5*log_Kdet
+                         - 0.5*self.f_Ki_f
+                         + self.likelihood_function.link_function(self.data, self.f_hat)
+                         + 0.5*ln_det
                          )
 
         return self._compute_GP_variables()
 
+    def _compute_B_statistics(self, K, W):
+        """Rasmussen suggests the use of a numerically stable positive definite matrix B
+        Which has a positive diagonal element and can be easyily inverted
+
+        :K: Covariance matrix
+        :W: Negative hessian at a point (diagonal matrix)
+        :returns: (B, L)
+        """
+        #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
+        W_12 = np.sqrt(W)
+        B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
+        L = jitchol(B)
+        return (B, L, W_12)
+
     def ncg_mode(self, K):
         """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
         :K: Covariance matrix
@@ -189,7 +227,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=5000, MAX_RESTART=30):
+    def rasm_mode(self, K, MAX_ITER=5000000000000000, MAX_RESTART=30):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -206,11 +244,12 @@ class Laplace(likelihood):
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)
 
         difference = np.inf
-        epsilon = 1e-16
+        epsilon = 1e-6
         step_size = 1
         rs = 0
         i = 0
-        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
+        while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART:
+            f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
                 #if np.any(W < 0):
@@ -220,31 +259,48 @@ class Laplace(likelihood):
                                     #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     #To cause the posterior to become less certain than the prior and likelihood,
                                     #This is a property only held by non-log-concave likelihoods
-            #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
-            W_12 = np.sqrt(W)
-            B = np.eye(self.N) + mdot(W_12, K, W_12)
-            L = jitchol(B)
-            b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)[:, None])
+            B, L, W_12 = self._compute_B_statistics(K, W)
+
+            W_f = np.dot(W, f)
+            grad = self.likelihood_function.link_grad(self.data, f)[:, None]
+            #Find K_i_f
+            b = W_f + grad
+            #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
             #TODO: Check L is lower
             solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
             a = b - mdot(W_12, solve_L)
-            f = np.dot(K, a)
+            #f = np.dot(K, a)
+
+            #a should be equal to Ki*f now so should be able to use it
+            c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
+            solve_L = cho_solve((L, True), mdot(W_12, c))
+            f = c - mdot(K, W_12, solve_L)
+
+            #K_w_f = mdot(K, (W, f))
+            #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
+            #d = f + K_w_f + c
+            #solve_L = cho_solve((L, True), mdot(W_12, d))
+            #f = c - mdot(K, (W_12, solve_L))
+            #a = mdot(self.Ki, f)
+
+            tmp_old_obj = old_obj
             old_obj = new_obj
             new_obj = obj(a, f)
             difference = new_obj - old_obj
-            #print "Difference: ", new_obj - old_obj
+            #print "Difference: ", difference
             if difference < 0:
+                #print "Objective function rose", difference
                 #If the objective function isn't rising, restart optimization
                 step_size *= 0.9
-                print "Objective function rose"
-                print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                 #objective function isn't increasing, try reducing step size
-                f = np.zeros((self.N, 1))
-                new_obj = -np.inf
-                old_obj = np.inf
+                #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
+                old_obj = tmp_old_obj
                 rs += 1
 
             difference = abs(difference)
             i += 1
 
+        self.i = i
+        print "{i} steps".format(i=i)
         return f

From 31d8faecf866307c69dcade761ddb77d628b773e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 5 Apr 2013 17:56:02 +0100
Subject: [PATCH 18/71] Added timing and realised mdot can be faster as it's
 almost always a diagonal matrix it's multiplying with

---
 python/examples/laplace_approximations.py |  9 +++++---
 python/likelihoods/Laplace.py             | 25 ++++++++++++++---------
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 7ab26406..28a92c61 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -8,11 +8,12 @@ from coxGP.python.likelihoods.likelihood_function import student_t
 
 def timing():
     real_var = 0.1
-    times = 1000
+    times = 1
     deg_free = 10
     real_sd = np.sqrt(real_var)
     the_is = np.zeros(times)
-    X = np.linspace(0.0, 10.0, 30)[:, None]
+    X = np.linspace(0.0, 10.0, 500)[:, None]
+
     for a in xrange(times):
         Y = np.sin(X) + np.random.randn(*X.shape)*real_var
         Yc = Y.copy()
@@ -21,6 +22,8 @@ def timing():
         Yc[25] += 10
         Yc[23] += 10
         Yc[24] += 10
+        Yc[300] += 10
+        Yc[400] += 10000
 
         edited_real_sd = real_sd
         kernel1 = GPy.kern.rbf(X.shape[1])
@@ -33,9 +36,9 @@ def timing():
         m.optimize()
         the_is[a] = m.likelihood.i
 
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print the_is
     print np.mean(the_is)
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
 
 def student_t_approx():
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 396a0bc7..734bf6c8 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -128,7 +128,9 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
-        self.Ki, _, _, log_Kdet = pdinv(K)
+        print "Inverting K"
+        #self.Ki, _, _, log_Kdet = pdinv(K)
+        print "K inverted, optimising"
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
@@ -196,6 +198,7 @@ class Laplace(likelihood):
         """
         #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
         L = jitchol(B)
         return (B, L, W_12)
@@ -205,9 +208,7 @@ class Laplace(likelihood):
         :K: Covariance matrix
         :returns: f_mode
         """
-        self.K = K.copy()
         f = np.zeros((self.N, 1))
-        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
         LOG_K_CONST = -(0.5 * self.log_Kdet)
 
         #FIXME: Can we get rid of this horrible reshaping?
@@ -227,7 +228,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=5000000000000000, MAX_RESTART=30):
+    def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -249,6 +250,7 @@ class Laplace(likelihood):
         rs = 0
         i = 0
         while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART:
+            print "optimising"
             f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
@@ -259,22 +261,25 @@ class Laplace(likelihood):
                                     #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     #To cause the posterior to become less certain than the prior and likelihood,
                                     #This is a property only held by non-log-concave likelihoods
+            print "Decomposing"
             B, L, W_12 = self._compute_B_statistics(K, W)
+            print "Finding f"
 
-            W_f = np.dot(W, f)
+            W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal!
             grad = self.likelihood_function.link_grad(self.data, f)[:, None]
             #Find K_i_f
             b = W_f + grad
             #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
             #TODO: Check L is lower
-            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
-            a = b - mdot(W_12, solve_L)
+
+            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
+            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
             #f = np.dot(K, a)
 
             #a should be equal to Ki*f now so should be able to use it
             c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-            solve_L = cho_solve((L, True), mdot(W_12, c))
-            f = c - mdot(K, W_12, solve_L)
+            solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal!
+            f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
 
             #K_w_f = mdot(K, (W, f))
             #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
@@ -302,5 +307,5 @@ class Laplace(likelihood):
             i += 1
 
         self.i = i
-        print "{i} steps".format(i=i)
+        #print "{i} steps".format(i=i)
         return f

From 431f93ef231875aeb6adbe6be2c70ea807aafdce Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 8 Apr 2013 18:09:07 +0100
Subject: [PATCH 19/71] Stabilised most of the algorithm (apart from the end
 inversion which is impossible)

---
 python/likelihoods/Laplace.py | 132 ++++++++++++++++++----------------
 1 file changed, 72 insertions(+), 60 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 734bf6c8..77359769 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -3,9 +3,15 @@ import scipy as sp
 import GPy
 from scipy.linalg import cholesky, eig, inv, det, cho_solve
 from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv, mdot, jitchol
+from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
+from scipy.linalg.lapack import dtrtrs
 #import numpy.testing.assert_array_equal
 
+#TODO: Move this to utils
+def det_ln_diag(A):
+    return np.log(np.diagonal(A)).sum()
+
+
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
@@ -60,7 +66,6 @@ class Laplace(likelihood):
         pass # TODO: Laplace likelihood might want to take some parameters...
 
     def _gradients(self, partial):
-        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
 
@@ -99,9 +104,26 @@ class Laplace(likelihood):
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
         else:
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
-        #f_hat? should be f but we must have optimized for them I guess?
-        #Y_tilde = mdot(self.Sigma_tilde, self.hess_hat_i, self.f_hat)
         Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
+
+        #dtritri -> L -> L_i
+        #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
+        #((L.T*w)_i + I)f_hat = y_tilde
+        L = jitchol(self.K)
+        Li = chol_inv(L)
+        Lt_W = np.dot(L.T, self.W)
+        if np.abs(det(Lt_W)) < epsilon:
+            print "WARNING: Transformed covariance matrix is signular!"
+        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
+        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+        #if np.abs(det(KW)) < epsilon:
+            #print "WARNING: Transformed covariance matrix is signular!"
+        #KW_i = inv(KW)
+        #Y_tilde = mdot(KW_i + np.eye(self.N), self.f_hat)
+
+        #Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
         #KW = np.dot(self.K, self.W)
         #KW_i, _, _, _ = pdinv(KW)
         #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat)
@@ -110,16 +132,38 @@ class Laplace(likelihood):
                     #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
                     #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
                    #)
-        _, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
-        f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
-        Z_tilde = -self.NORMAL_CONST + self.ln_z_hat -0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f
+        #_, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
+        #f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
+        #Z_tilde = -self.NORMAL_CONST + self.ln_z_hat -0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f
+
+        #f_W_f = mdot(self.f_hat.T, self.W, self.f_hat)
+        #f_Y_f = mdot(Y_tilde, self.W, Y_tilde)
+        #Z_tilde = (np.dot(self.W, self.f_hat) - 0.5*y_W_y + self.ln_z_hat
+                   #- 0.5*mdot(self.f_hat, (
+
+        f_Ki_W_f = mdot(self.f_hat.T, (self.Ki + self.W), self.f_hat)
+        y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
+        y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
+        self.ln_W_det = det_ln_diag(self.W)
+        Z_tilde = (self.NORMAL_CONST
+                   - 0.5*self.ln_K_det
+                   - 0.5*self.ln_W_det
+                   - 0.5*self.ln_Ki_W_i_det
+                   - 0.5*f_Ki_W_f
+                   - 0.5*y_W_y
+                   + y_W_f
+                   + self.ln_z_hat
+                   )
+
+        Sigma_tilde = inv(self.W) # Damn
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = self.Sigma_tilde
+        self.covariance_matrix = Sigma_tilde
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     def fit_full(self, K):
         """
@@ -128,9 +172,7 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
-        print "Inverting K"
-        #self.Ki, _, _, log_Kdet = pdinv(K)
-        print "K inverted, optimising"
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
@@ -144,46 +186,24 @@ class Laplace(likelihood):
                                    #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                    #To cause the posterior to become less certain than the prior and likelihood,
                                    #This is a property only held by non-log-concave likelihoods
+
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
-        self.B, L, self.W_12 = self._compute_B_statistics(K, self.W)
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
-        #ln_W_det = np.linalg.det(self.W)
-        #ln_B_det = np.linalg.det(self.B)
-        ln_det = np.linalg.det(np.eye(self.N) - mdot(self.W_12, self.Bi, self.W_12, K))
+
+        Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
+        self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i)
+
         b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
-        #TODO: Check L is lower
-        solve_L = cho_solve((L, True), mdot(self.W_12, (K, b)))
-        a = b - mdot(self.W_12, solve_L)
+        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
+        a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)
 
-        #self.hess_hat = self.Ki + self.W
-        #(self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat)
-
-        ##Check hess_hat is positive definite
-        #try:
-            #cholesky(self.hess_hat)
-        #except:
-            #raise ValueError("Must be positive definite")
-
-        ##Check its eigenvalues are positive
-        #eigenvalues = eig(self.hess_hat)
-        #if not np.all(eigenvalues > 0):
-            #raise ValueError("Eigen values not positive")
-
-        #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
-        #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
-        #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
-        #Unsure whether its log_hess or log_hess_i
-        #self.ln_z_hat = (- 0.5*self.log_hess_hat_i_det
-                         #+ 0.5*self.log_Kdet
-                         #+ self.likelihood_function.link_function(self.data, self.f_hat)
-                         ##+ self.likelihood_function.link_function(self.data, self.f_hat)
-                         #- 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
-                         #)
-        self.ln_z_hat = (- 0.5*log_Kdet
+        self.ln_z_hat = (  self.NORMAL_CONST
                          - 0.5*self.f_Ki_f
+                         - 0.5*self.ln_K_det
+                         + 0.5*self.ln_Ki_W_i_det
                          + self.likelihood_function.link_function(self.data, self.f_hat)
-                         + 0.5*ln_det
                          )
 
         return self._compute_GP_variables()
@@ -198,7 +218,7 @@ class Laplace(likelihood):
         """
         #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
         L = jitchol(B)
         return (B, L, W_12)
@@ -209,12 +229,12 @@ class Laplace(likelihood):
         :returns: f_mode
         """
         f = np.zeros((self.N, 1))
-        LOG_K_CONST = -(0.5 * self.log_Kdet)
 
         #FIXME: Can we get rid of this horrible reshaping?
+        #ONLY WORKS FOR 1D DATA
         def obj(f):
             res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
-                        + self.NORMAL_CONST + LOG_K_CONST)
+                        + self.NORMAL_CONST)
             return float(res)
 
         def obj_grad(f):
@@ -249,21 +269,15 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
-        while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART:
-            print "optimising"
+        while difference > epsilon:  # and i < MAX_ITER and rs < MAX_RESTART:
             f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
-                #if np.any(W < 0):
-                    #print "NEGATIVE VALUES :("
-                    #pass
                 W[W < 0] = 1e-6     #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     #To cause the posterior to become less certain than the prior and likelihood,
                                     #This is a property only held by non-log-concave likelihoods
-            print "Decomposing"
             B, L, W_12 = self._compute_B_statistics(K, W)
-            print "Finding f"
 
             W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal!
             grad = self.likelihood_function.link_grad(self.data, f)[:, None]
@@ -272,15 +286,15 @@ class Laplace(likelihood):
             #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
             #TODO: Check L is lower
 
-            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
-            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
-            #f = np.dot(K, a)
-
             #a should be equal to Ki*f now so should be able to use it
             c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
             solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal!
             f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
 
+            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
+            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
+            #f = np.dot(K, a)
+
             #K_w_f = mdot(K, (W, f))
             #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
             #d = f + K_w_f + c
@@ -292,7 +306,6 @@ class Laplace(likelihood):
             old_obj = new_obj
             new_obj = obj(a, f)
             difference = new_obj - old_obj
-            #print "Difference: ", difference
             if difference < 0:
                 #print "Objective function rose", difference
                 #If the objective function isn't rising, restart optimization
@@ -307,5 +320,4 @@ class Laplace(likelihood):
             i += 1
 
         self.i = i
-        #print "{i} steps".format(i=i)
         return f

From e0c1e4a4df600d24f075cc13a359a4bc77dfcff3 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 8 Apr 2013 19:58:54 +0100
Subject: [PATCH 20/71] Fixed laplace approximation and made more numerically
 stable with cholesky decompositions, and commented

---
 python/examples/laplace_approximations.py |   1 -
 python/likelihoods/Laplace.py             | 142 ++++++++++------------
 2 files changed, 65 insertions(+), 78 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 28a92c61..0500ba02 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -140,7 +140,6 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     print "Clean student t, ncg"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 77359769..27ab7613 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,17 +1,32 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, det, cho_solve
+from scipy.linalg import cholesky, eig, inv, cho_solve
+from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
 from scipy.linalg.lapack import dtrtrs
-#import numpy.testing.assert_array_equal
 
 #TODO: Move this to utils
+
+
 def det_ln_diag(A):
+    """
+    log determinant of a diagonal matrix
+    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
+    """
     return np.log(np.diagonal(A)).sum()
 
 
+def pddet(A):
+    """
+    Determinant of a positive definite matrix
+    """
+    L = cholesky(A)
+    logdetA = 2*sum(np.log(np.diag(L)))
+    return logdetA
+
+
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
@@ -30,7 +45,8 @@ class Laplace(likelihood):
         ---------
 
         :data: @todo
-        :likelihood_function: @todo
+        :likelihood_function: likelihood function - subclass of likelihood_function
+        :rasm: Flag of whether to use rasmussens numerically stable mode finding or simple ncg optimisation
 
         """
         self.data = data
@@ -63,10 +79,10 @@ class Laplace(likelihood):
         return []
 
     def _set_params(self, p):
-        pass # TODO: Laplace likelihood might want to take some parameters...
+        pass  # TODO: Laplace likelihood might want to take some parameters...
 
     def _gradients(self, partial):
-        return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
+        return np.zeros(0)  # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
 
     def _compute_GP_variables(self):
@@ -91,20 +107,10 @@ class Laplace(likelihood):
         i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$
         since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
+        $$\tilde{\Sigma} = W^{-1}$$
 
         """
-        self.Sigma_tilde_i = self.W
-        #Check it isn't singular!
         epsilon = 1e-6
-        if np.abs(det(self.Sigma_tilde_i)) < epsilon:
-            print "WARNING: Transformed covariance matrix is signular!"
-            #raise ValueError("inverse covariance must be non-singular to invert!")
-        #Do we really need to inverse Sigma_tilde_i? :(
-        if self.likelihood_function.log_concave:
-            (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
-        else:
-            self.Sigma_tilde = inv(self.Sigma_tilde_i)
-        Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
 
         #dtritri -> L -> L_i
         #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
@@ -112,42 +118,25 @@ class Laplace(likelihood):
         L = jitchol(self.K)
         Li = chol_inv(L)
         Lt_W = np.dot(L.T, self.W)
-        if np.abs(det(Lt_W)) < epsilon:
-            print "WARNING: Transformed covariance matrix is signular!"
+
+        ##Check it isn't singular!
+        if cond(Lt_W) > 1e14:
+            print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
+
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
         Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
-        #if np.abs(det(KW)) < epsilon:
-            #print "WARNING: Transformed covariance matrix is signular!"
-        #KW_i = inv(KW)
-        #Y_tilde = mdot(KW_i + np.eye(self.N), self.f_hat)
+        #f.T(Ki + W)f
+        f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
+                    + mdot(self.f_hat.T, self.W, self.f_hat)
+                    )
 
-        #Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
-        #KW = np.dot(self.K, self.W)
-        #KW_i, _, _, _ = pdinv(KW)
-        #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat)
-        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                    #+ 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
-                    #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                    #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
-                   #)
-        #_, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
-        #f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
-        #Z_tilde = -self.NORMAL_CONST + self.ln_z_hat -0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f
-
-        #f_W_f = mdot(self.f_hat.T, self.W, self.f_hat)
-        #f_Y_f = mdot(Y_tilde, self.W, Y_tilde)
-        #Z_tilde = (np.dot(self.W, self.f_hat) - 0.5*y_W_y + self.ln_z_hat
-                   #- 0.5*mdot(self.f_hat, (
-
-        f_Ki_W_f = mdot(self.f_hat.T, (self.Ki + self.W), self.f_hat)
         y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
         y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
-        self.ln_W_det = det_ln_diag(self.W)
+        ln_W_det = det_ln_diag(self.W)
         Z_tilde = (self.NORMAL_CONST
                    - 0.5*self.ln_K_det
-                   - 0.5*self.ln_W_det
+                   - 0.5*ln_W_det
                    - 0.5*self.ln_Ki_W_i_det
                    - 0.5*f_Ki_W_f
                    - 0.5*y_W_y
@@ -155,7 +144,11 @@ class Laplace(likelihood):
                    + self.ln_z_hat
                    )
 
-        Sigma_tilde = inv(self.W) # Damn
+        ##Check it isn't singular!
+        if cond(self.W) > 1e14:
+            print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
+
+        Sigma_tilde = inv(self.W)  # Damn
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -163,16 +156,14 @@ class Laplace(likelihood):
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = Sigma_tilde
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     def fit_full(self, K):
         """
         The laplace approximation algorithm
-        For nomenclature see Rasmussen & Williams 2006
+        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
         :K: Covariance matrix
         """
         self.K = K.copy()
-        self.Ki, _, _, self.ln_K_det = pdinv(K)
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
@@ -182,10 +173,10 @@ class Laplace(likelihood):
         self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-                                   #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
-                                   #To cause the posterior to become less certain than the prior and likelihood,
-                                   #This is a property only held by non-log-concave likelihoods
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                       #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                       #To cause the posterior to become less certain than the prior and likelihood,
+                                       #This is a property only held by non-log-concave likelihoods
 
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
@@ -198,8 +189,9 @@ class Laplace(likelihood):
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
         a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)
+        self.ln_K_det = pddet(self.K)
 
-        self.ln_z_hat = (  self.NORMAL_CONST
+        self.ln_z_hat = (self.NORMAL_CONST
                          - 0.5*self.f_Ki_f
                          - 0.5*self.ln_K_det
                          + 0.5*self.ln_Ki_W_i_det
@@ -219,26 +211,29 @@ class Laplace(likelihood):
         #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
         #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-        B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
+        B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
         L = jitchol(B)
         return (B, L, W_12)
 
     def ncg_mode(self, K):
-        """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
+        """
+        Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
         :K: Covariance matrix
         :returns: f_mode
         """
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
+
         f = np.zeros((self.N, 1))
 
         #FIXME: Can we get rid of this horrible reshaping?
         #ONLY WORKS FOR 1D DATA
         def obj(f):
-            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
                         + self.NORMAL_CONST)
             return float(res)
 
         def obj_grad(f):
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - np.dot(self.Ki, f))
             return np.squeeze(res)
 
         def obj_hess(f):
@@ -254,6 +249,8 @@ class Laplace(likelihood):
         For nomenclature see Rasmussen & Williams 2006
 
         :K: Covariance matrix
+        :MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
+        :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
         :returns: f_mode
         """
         f = np.zeros((self.N, 1))
@@ -269,39 +266,30 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
-        while difference > epsilon:  # and i < MAX_ITER and rs < MAX_RESTART:
+        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6     #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-                                    #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
-                                    #To cause the posterior to become less certain than the prior and likelihood,
-                                    #This is a property only held by non-log-concave likelihoods
+                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                    # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                    # To cause the posterior to become less certain than the prior and likelihood,
+                                    # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)
 
-            W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal!
+            W_f = np.dot(W, f)
             grad = self.likelihood_function.link_grad(self.data, f)[:, None]
             #Find K_i_f
             b = W_f + grad
-            #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
-            #TODO: Check L is lower
 
             #a should be equal to Ki*f now so should be able to use it
-            c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-            solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal!
-            f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
+            c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
+            solve_L = cho_solve((L, True), np.dot(W_12, c))
+            f = c - np.dot(K, np.dot(W_12, solve_L))
 
-            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
-            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
+            solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b)))
+            a = b - np.dot(W_12, solve_L)
             #f = np.dot(K, a)
 
-            #K_w_f = mdot(K, (W, f))
-            #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
-            #d = f + K_w_f + c
-            #solve_L = cho_solve((L, True), mdot(W_12, d))
-            #f = c - mdot(K, (W_12, solve_L))
-            #a = mdot(self.Ki, f)
-
             tmp_old_obj = old_obj
             old_obj = new_obj
             new_obj = obj(a, f)

From 65481d7a73b8fe965a99b82126431ae2668958db Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 10 Apr 2013 13:43:13 +0100
Subject: [PATCH 21/71] Fixed the z scalings

---
 python/examples/laplace_approximations.py |  8 +++----
 python/likelihoods/Laplace.py             | 28 +++++++++++++++--------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 0500ba02..5b1331b6 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -12,7 +12,7 @@ def timing():
     deg_free = 10
     real_sd = np.sqrt(real_var)
     the_is = np.zeros(times)
-    X = np.linspace(0.0, 10.0, 500)[:, None]
+    X = np.linspace(0.0, 10.0, 300)[:, None]
 
     for a in xrange(times):
         Y = np.sin(X) + np.random.randn(*X.shape)*real_var
@@ -22,8 +22,8 @@ def timing():
         Yc[25] += 10
         Yc[23] += 10
         Yc[24] += 10
-        Yc[300] += 10
-        Yc[400] += 10000
+        Yc[250] += 10
+        #Yc[4] += 10000
 
         edited_real_sd = real_sd
         kernel1 = GPy.kern.rbf(X.shape[1])
@@ -36,7 +36,7 @@ def timing():
         m.optimize()
         the_is[a] = m.likelihood.i
 
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print the_is
     print np.mean(the_is)
 
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 27ab7613..8ef8fb62 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,7 +1,7 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, cho_solve
+from scipy.linalg import cholesky, eig, inv, cho_solve, det
 from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
@@ -134,15 +134,24 @@ class Laplace(likelihood):
         y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
         y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
         ln_W_det = det_ln_diag(self.W)
-        Z_tilde = (self.NORMAL_CONST
-                   - 0.5*self.ln_K_det
-                   - 0.5*ln_W_det
-                   - 0.5*self.ln_Ki_W_i_det
-                   - 0.5*f_Ki_W_f
-                   - 0.5*y_W_y
-                   + y_W_f
+        Z_tilde = (- self.NORMAL_CONST
+                   + 0.5*self.ln_K_det
+                   + 0.5*ln_W_det
+                   + 0.5*self.ln_Ki_W_i_det
+                   + 0.5*f_Ki_W_f
+                   + 0.5*y_W_y
+                   - y_W_f
                    + self.ln_z_hat
                    )
+        #Z_tilde = (self.NORMAL_CONST
+                   #- 0.5*self.ln_K_det
+                   #- 0.5*ln_W_det
+                   #- 0.5*self.ln_Ki_W_i_det
+                   #- 0.5*f_Ki_W_f
+                   #- 0.5*y_W_y
+                   #+ y_W_f
+                   #+ self.ln_z_hat
+                   #)
 
         ##Check it isn't singular!
         if cond(self.W) > 1e14:
@@ -191,8 +200,7 @@ class Laplace(likelihood):
         self.f_Ki_f = np.dot(self.f_hat.T, a)
         self.ln_K_det = pddet(self.K)
 
-        self.ln_z_hat = (self.NORMAL_CONST
-                         - 0.5*self.f_Ki_f
+        self.ln_z_hat = (- 0.5*self.f_Ki_f
                          - 0.5*self.ln_K_det
                          + 0.5*self.ln_Ki_W_i_det
                          + self.likelihood_function.link_function(self.data, self.f_hat)

From 9bbb11b825f7c395a040e2385d6a2c88aa1c143e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 10 Apr 2013 15:43:31 +0100
Subject: [PATCH 22/71] Adding weibull likelihood, requires 'extra_data' to be
 passed to likelihood, i.e. the censoring information

---
 python/likelihoods/Laplace.py             | 24 +++---
 python/likelihoods/likelihood_function.py | 99 +++++++++++++++++++++--
 2 files changed, 104 insertions(+), 19 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 8ef8fb62..4d94ba0f 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -30,7 +30,7 @@ def pddet(A):
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
-    def __init__(self, data, likelihood_function, rasm=True):
+    def __init__(self, data, likelihood_function, extra_data=None, rasm=True):
         """
         Laplace Approximation
 
@@ -44,13 +44,15 @@ class Laplace(likelihood):
         Arguments
         ---------
 
-        :data: @todo
+        :data: array of data the likelihood function is approximating
         :likelihood_function: likelihood function - subclass of likelihood_function
+        :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
         :rasm: Flag of whether to use rasmussens numerically stable mode finding or simple ncg optimisation
 
         """
         self.data = data
         self.likelihood_function = likelihood_function
+        self.extra_data = extra_data
         self.rasm = rasm
 
         #Inital values
@@ -179,7 +181,7 @@ class Laplace(likelihood):
             self.f_hat = self.ncg_mode(K)
 
         #At this point get the hessian matrix
-        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
+        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))
 
         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
@@ -194,7 +196,7 @@ class Laplace(likelihood):
         Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
         self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i)
 
-        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
+        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
         a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)
@@ -203,7 +205,7 @@ class Laplace(likelihood):
         self.ln_z_hat = (- 0.5*self.f_Ki_f
                          - 0.5*self.ln_K_det
                          + 0.5*self.ln_Ki_W_i_det
-                         + self.likelihood_function.link_function(self.data, self.f_hat)
+                         + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
                          )
 
         return self._compute_GP_variables()
@@ -236,16 +238,16 @@ class Laplace(likelihood):
         #FIXME: Can we get rid of this horrible reshaping?
         #ONLY WORKS FOR 1D DATA
         def obj(f):
-            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
                         + self.NORMAL_CONST)
             return float(res)
 
         def obj_grad(f):
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - np.dot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
             return np.squeeze(res)
 
         def obj_hess(f):
-            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
+            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
             return np.squeeze(res)
 
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
@@ -267,7 +269,7 @@ class Laplace(likelihood):
 
         def obj(a, f):
             #Careful of shape of data!
-            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)
+            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
         epsilon = 1e-6
@@ -276,7 +278,7 @@ class Laplace(likelihood):
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             f_old = f.copy()
-            W = -np.diag(self.likelihood_function.link_hess(self.data, f))
+            W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
             if not self.likelihood_function.log_concave:
                 W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@@ -285,7 +287,7 @@ class Laplace(likelihood):
             B, L, W_12 = self._compute_B_statistics(K, W)
 
             W_f = np.dot(W, f)
-            grad = self.likelihood_function.link_grad(self.data, f)[:, None]
+            grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None]
             #Find K_i_f
             b = W_f + grad
 
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 49174ce7..0d421882 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -4,6 +4,7 @@ import numpy as np
 from GPy.likelihoods.likelihood_functions import likelihood_function
 from scipy import stats
 
+
 class student_t(likelihood_function):
     """Student t likelihood distribution
     For nomanclature see Bayesian Data Analysis 2003 p576
@@ -24,15 +25,16 @@ class student_t(likelihood_function):
         self.log_concave = False
 
     @property
-    def variance(self):
+    def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * (self.sigma**2)
 
-    def link_function(self, y, f):
+    def link_function(self, y, f, extra_data=None):
         """link_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
 
         :y: data
         :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
         :returns: float(likelihood evaluated for this point)
 
         """
@@ -49,7 +51,7 @@ class student_t(likelihood_function):
                      )
         return np.sum(objective)
 
-    def link_grad(self, y, f):
+    def link_grad(self, y, f, extra_data=None):
         """
         Gradient of the link function at y, given f w.r.t f
 
@@ -57,6 +59,7 @@ class student_t(likelihood_function):
 
         :y: data
         :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
         :returns: gradient of likelihood evaluated at points
 
         """
@@ -67,17 +70,18 @@ class student_t(likelihood_function):
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
         return np.squeeze(grad)
 
-    def link_hess(self, y, f):
+    def link_hess(self, y, f, extra_data=None):
         """
         Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
         i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
 
-        Will return diaganol of hessian, since every where else it is 0
+        Will return diagonal of hessian, since every where else it is 0
 
         $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
 
         :y: data
         :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
         y = np.squeeze(y)
@@ -139,7 +143,7 @@ class student_t(likelihood_function):
                                             #size=(num_f_samples, num_y_samples))
             #print student_t_samples.shape
 
-        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:,None],
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
                                         scale=self.sigma,
                                         size=(num_test_points, num_y_samples, num_f_samples))
         student_t_samples = np.reshape(student_t_samples,
@@ -152,7 +156,7 @@ class student_t(likelihood_function):
         ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
         def t_gaussian(f, mu, var):
             return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
-                        * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
+                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
                     )
 
         def t_gauss_int(mu, var):
@@ -167,4 +171,83 @@ class student_t(likelihood_function):
         p = vec_t_gauss_int(mu, var)
         p_025 = mu - p
         p_975 = mu + p
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        return mu, np.nan*mu, p_025, p_975
+
+
+class weibull_survival(likelihood_function):
+    """Weibull t likelihood distribution for survival analysis with censoring
+        For nomanclature see Bayesian Survival Analysis
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, shape, scale):
+        self.shape = shape
+        self.scale = scale
+
+        #FIXME: This should be in the superclass
+        self.log_concave = True
+
+    def link_function(self, y, f, extra_data=None):
+        """
+        link_function $\ln p(y|f)$, i.e. log likelihood
+
+        $$\ln p(y|f) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: time of event data
+        :f: latent variables f
+        :extra_data: the censoring indicator, 1 for censored, 0 for not
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f)  # FIXME: CHECK THIS WITH BOOK, wheres scale?
+        return np.sum(objective)
+
+    def link_grad(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{d}{df} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}\exp(f_{i})
+
+        :y: data
+        :f: latent variables f
+        :extra_data: the censoring indicator, 1 for censored, 0 for not
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        grad = v - (y**self.shape)*np.exp(f)
+        return np.squeeze(grad)
+
+    def link_hess(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+
+        Will return diagonal of hessian, since every where else it is 0
+
+        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used hessian
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        hess = (y**self.shape)*np.exp(f)
+        return np.squeeze(hess)

From 296c093611f46c8632a7235f7d414581f5969294 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 15 Apr 2013 12:08:22 +0100
Subject: [PATCH 23/71] Tidy up comments

---
 python/likelihoods/likelihood_function.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 0d421882..f14faf33 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -9,7 +9,7 @@ class student_t(likelihood_function):
     """Student t likelihood distribution
     For nomanclature see Bayesian Data Analysis 2003 p576
 
-    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
+    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2)$$
 
     Laplace:
     Needs functions to calculate

From 1e707f125c7e9313b4444b23811425ddc555dba3 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 15 Apr 2013 12:10:42 +0100
Subject: [PATCH 24/71] Make directory structure match that of GPy

---
 {python => GPy}/__init__.py                        | 0
 {python => GPy}/examples/__init__.py               | 0
 {python => GPy}/examples/laplace_approximations.py | 0
 {python => GPy}/likelihoods/Laplace.py             | 0
 {python => GPy}/likelihoods/__init__.py            | 0
 {python => GPy}/likelihoods/likelihood_function.py | 0
 {python => GPy}/models/__init__.py                 | 0
 {python => GPy}/models/coxGP.py                    | 0
 {python => GPy}/testing/__init__.py                | 0
 {python => GPy}/testing/cox_tests.py               | 0
 10 files changed, 0 insertions(+), 0 deletions(-)
 rename {python => GPy}/__init__.py (100%)
 rename {python => GPy}/examples/__init__.py (100%)
 rename {python => GPy}/examples/laplace_approximations.py (100%)
 rename {python => GPy}/likelihoods/Laplace.py (100%)
 rename {python => GPy}/likelihoods/__init__.py (100%)
 rename {python => GPy}/likelihoods/likelihood_function.py (100%)
 rename {python => GPy}/models/__init__.py (100%)
 rename {python => GPy}/models/coxGP.py (100%)
 rename {python => GPy}/testing/__init__.py (100%)
 rename {python => GPy}/testing/cox_tests.py (100%)

diff --git a/python/__init__.py b/GPy/__init__.py
similarity index 100%
rename from python/__init__.py
rename to GPy/__init__.py
diff --git a/python/examples/__init__.py b/GPy/examples/__init__.py
similarity index 100%
rename from python/examples/__init__.py
rename to GPy/examples/__init__.py
diff --git a/python/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
similarity index 100%
rename from python/examples/laplace_approximations.py
rename to GPy/examples/laplace_approximations.py
diff --git a/python/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
similarity index 100%
rename from python/likelihoods/Laplace.py
rename to GPy/likelihoods/Laplace.py
diff --git a/python/likelihoods/__init__.py b/GPy/likelihoods/__init__.py
similarity index 100%
rename from python/likelihoods/__init__.py
rename to GPy/likelihoods/__init__.py
diff --git a/python/likelihoods/likelihood_function.py b/GPy/likelihoods/likelihood_function.py
similarity index 100%
rename from python/likelihoods/likelihood_function.py
rename to GPy/likelihoods/likelihood_function.py
diff --git a/python/models/__init__.py b/GPy/models/__init__.py
similarity index 100%
rename from python/models/__init__.py
rename to GPy/models/__init__.py
diff --git a/python/models/coxGP.py b/GPy/models/coxGP.py
similarity index 100%
rename from python/models/coxGP.py
rename to GPy/models/coxGP.py
diff --git a/python/testing/__init__.py b/GPy/testing/__init__.py
similarity index 100%
rename from python/testing/__init__.py
rename to GPy/testing/__init__.py
diff --git a/python/testing/cox_tests.py b/GPy/testing/cox_tests.py
similarity index 100%
rename from python/testing/cox_tests.py
rename to GPy/testing/cox_tests.py

From 589aeda88cc938a537ecb5a5df34dd276bae5a37 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 15 Apr 2013 15:44:29 +0100
Subject: [PATCH 25/71] Should be working now, needed to change relative path
 names

---
 GPy/examples/classification.py         |  3 +--
 GPy/examples/laplace_approximations.py | 29 +++++++++++---------------
 GPy/likelihoods/__init__.py            |  2 +-
 3 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index 5df019e4..4899e75e 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -17,8 +17,7 @@ def crescent_data(seed=default_seed): #FIXME
     :param seed : seed value for data generation.
     :type seed: int
     :param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
-    :type inducing: int
-    """
+    :type inducing: int """
 
     data = GPy.util.datasets.crescent_data(seed=seed)
 
diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 5b1331b6..07801150 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -1,10 +1,6 @@
 import GPy
 import numpy as np
 import matplotlib.pyplot as plt
-from scipy.stats import t, norm
-from coxGP.python.likelihoods.Laplace import Laplace
-from coxGP.python.likelihoods.likelihood_function import student_t
-
 
 def timing():
     real_var = 0.1
@@ -28,15 +24,14 @@ def timing():
         edited_real_sd = real_sd
         kernel1 = GPy.kern.rbf(X.shape[1])
 
-        t_distribution = student_t(deg_free, sigma=edited_real_sd)
-        corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
+        t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+        corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=True)
         m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
         m.ensure_default_constraints()
         m.update_likelihood_approximation()
         m.optimize()
         the_is[a] = m.likelihood.i
 
-    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print the_is
     print np.mean(the_is)
 
@@ -116,8 +111,8 @@ def student_t_approx():
     edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
@@ -129,8 +124,8 @@ def student_t_approx():
     plt.ylim(-2.5, 2.5)
 
     print "Corrupt student t, rasm"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
@@ -142,8 +137,8 @@ def student_t_approx():
     plt.ylim(-2.5, 2.5)
 
     print "Clean student t, ncg"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
@@ -155,8 +150,8 @@ def student_t_approx():
     plt.ylim(-2.5, 2.5)
 
     print "Corrupt student t, ncg"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=False)
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
@@ -169,8 +164,8 @@ def student_t_approx():
 
 
     ###with a student t distribution, since it has heavy tails it should work well
-    ###likelihood_function = student_t(deg_free, sigma=real_var)
-    ###lap = Laplace(Y, likelihood_function)
+    ###likelihood_functions = student_t(deg_free, sigma=real_var)
+    ###lap = Laplace(Y, likelihood_functions)
     ###cov = kernel.K(X)
     ###lap.fit_full(cov)
 
diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py
index 83413255..9becb1b1 100644
--- a/GPy/likelihoods/__init__.py
+++ b/GPy/likelihoods/__init__.py
@@ -1,4 +1,4 @@
 from EP import EP
 from Gaussian import Gaussian
-# TODO: from Laplace import Laplace
+from Laplace import Laplace
 import likelihood_functions as functions

From 01671b6c570b7c40a2b1a326ab2c68606834c674 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 16 Apr 2013 16:34:26 +0100
Subject: [PATCH 26/71] Merged likelihood functions

---
 GPy/examples/laplace_approximations.py  |   4 +-
 GPy/likelihoods/likelihood_function.py  | 253 -----------------------
 GPy/likelihoods/likelihood_functions.py | 254 +++++++++++++++++++++++-
 3 files changed, 254 insertions(+), 257 deletions(-)
 delete mode 100644 GPy/likelihoods/likelihood_function.py

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 07801150..5d1c1224 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -164,8 +164,8 @@ def student_t_approx():
 
 
     ###with a student t distribution, since it has heavy tails it should work well
-    ###likelihood_functions = student_t(deg_free, sigma=real_var)
-    ###lap = Laplace(Y, likelihood_functions)
+    ###likelihood_function = student_t(deg_free, sigma=real_var)
+    ###lap = Laplace(Y, likelihood_function)
     ###cov = kernel.K(X)
     ###lap.fit_full(cov)
 
diff --git a/GPy/likelihoods/likelihood_function.py b/GPy/likelihoods/likelihood_function.py
deleted file mode 100644
index f14faf33..00000000
--- a/GPy/likelihoods/likelihood_function.py
+++ /dev/null
@@ -1,253 +0,0 @@
-from scipy.special import gammaln, gamma
-from scipy import integrate
-import numpy as np
-from GPy.likelihoods.likelihood_functions import likelihood_function
-from scipy import stats
-
-
-class student_t(likelihood_function):
-    """Student t likelihood distribution
-    For nomanclature see Bayesian Data Analysis 2003 p576
-
-    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2)$$
-
-    Laplace:
-    Needs functions to calculate
-    ln p(yi|fi)
-    dln p(yi|fi)_dfi
-    d2ln p(yi|fi)_d2fifj
-    """
-    def __init__(self, deg_free, sigma=2):
-        self.v = deg_free
-        self.sigma = sigma
-
-        #FIXME: This should be in the superclass
-        self.log_concave = False
-
-    @property
-    def variance(self, extra_data=None):
-        return (self.v / float(self.v - 2)) * (self.sigma**2)
-
-    def link_function(self, y, f, extra_data=None):
-        """link_function $\ln p(y|f)$
-        $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: float(likelihood evaluated for this point)
-
-        """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
-        assert y.shape == f.shape
-
-        e = y - f
-        objective = (gammaln((self.v + 1) * 0.5)
-                     - gammaln(self.v * 0.5)
-                     + np.log(self.sigma * np.sqrt(self.v * np.pi))
-                     - (self.v + 1) * 0.5
-                     * np.log(1 + ((e**2 / self.sigma**2) / self.v))
-                     )
-        return np.sum(objective)
-
-    def link_grad(self, y, f, extra_data=None):
-        """
-        Gradient of the link function at y, given f w.r.t f
-
-        $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: gradient of likelihood evaluated at points
-
-        """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
-        assert y.shape == f.shape
-        e = y - f
-        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
-        return np.squeeze(grad)
-
-    def link_hess(self, y, f, extra_data=None):
-        """
-        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
-        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
-
-        Will return diagonal of hessian, since every where else it is 0
-
-        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
-        """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
-        assert y.shape == f.shape
-        e = y - f
-        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
-        return np.squeeze(hess)
-
-    def predictive_values(self, mu, var):
-        """
-        Compute  mean, and conficence interval (percentiles 5 and 95) of the prediction
-
-        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
-        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
-        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
-        """
-
-        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
-        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
-        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
-        #Which was also given to us as (var)
-        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
-        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
-        true_var = var + self.variance
-
-        #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now
-        #need the 95 and 5 percentiles.
-        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
-        p_025 = mu - 2.*true_var
-        p_975 = mu + 2.*true_var
-
-        return mu, np.nan*mu, p_025, p_975
-
-    def sample_predicted_values(self, mu, var):
-        """ Experimental sample approches and numerical integration """
-        #p_025 = stats.t.ppf(.025, mu)
-        #p_975 = stats.t.ppf(.975, mu)
-
-        num_test_points = mu.shape[0]
-        #Each mu is the latent point f* at the test point x*,
-        #and the var is the gaussian variance at this point
-        #Take lots of samples from this, so we have lots of possible values
-        #for latent point f* for each test point x* weighted by how likely we were to pick it
-        print "Taking %d samples of f*".format(num_test_points)
-        num_f_samples = 10
-        num_y_samples = 10
-        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
-        print "Student t means shape: ", student_t_means.shape
-
-        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
-        #from a student t centred on this point, sample many points from this distribution
-        #centred on f*
-        #for test_point, f in enumerate(student_t_means):
-            #print test_point
-            #print f.shape
-            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
-                                            #scale=self.sigma,
-                                            #size=(num_f_samples, num_y_samples))
-            #print student_t_samples.shape
-
-        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
-                                        scale=self.sigma,
-                                        size=(num_test_points, num_y_samples, num_f_samples))
-        student_t_samples = np.reshape(student_t_samples,
-                                       (num_test_points, num_y_samples*num_f_samples))
-
-        #Now take the 97.5 and 0.25 percentile of these points
-        p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None]
-        p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None]
-
-        ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
-        def t_gaussian(f, mu, var):
-            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
-                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
-                    )
-
-        def t_gauss_int(mu, var):
-            print "Mu: ", mu
-            print "var: ", var
-            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
-            print "Result: ", result
-            return result[0]
-
-        vec_t_gauss_int = np.vectorize(t_gauss_int)
-
-        p = vec_t_gauss_int(mu, var)
-        p_025 = mu - p
-        p_975 = mu + p
-        return mu, np.nan*mu, p_025, p_975
-
-
-class weibull_survival(likelihood_function):
-    """Weibull t likelihood distribution for survival analysis with censoring
-        For nomanclature see Bayesian Survival Analysis
-
-    Laplace:
-    Needs functions to calculate
-    ln p(yi|fi)
-    dln p(yi|fi)_dfi
-    d2ln p(yi|fi)_d2fifj
-    """
-    def __init__(self, shape, scale):
-        self.shape = shape
-        self.scale = scale
-
-        #FIXME: This should be in the superclass
-        self.log_concave = True
-
-    def link_function(self, y, f, extra_data=None):
-        """
-        link_function $\ln p(y|f)$, i.e. log likelihood
-
-        $$\ln p(y|f) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$
-
-        :y: time of event data
-        :f: latent variables f
-        :extra_data: the censoring indicator, 1 for censored, 0 for not
-        :returns: float(likelihood evaluated for this point)
-
-        """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
-        assert y.shape == f.shape
-
-        v = extra_data
-        objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f)  # FIXME: CHECK THIS WITH BOOK, wheres scale?
-        return np.sum(objective)
-
-    def link_grad(self, y, f, extra_data=None):
-        """
-        Gradient of the link function at y, given f w.r.t f
-
-        $$\frac{d}{df} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}\exp(f_{i})
-
-        :y: data
-        :f: latent variables f
-        :extra_data: the censoring indicator, 1 for censored, 0 for not
-        :returns: gradient of likelihood evaluated at points
-
-        """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
-        assert y.shape == f.shape
-
-        v = extra_data
-        grad = v - (y**self.shape)*np.exp(f)
-        return np.squeeze(grad)
-
-    def link_hess(self, y, f, extra_data=None):
-        """
-        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
-        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
-
-        Will return diagonal of hessian, since every where else it is 0
-
-        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used hessian
-        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
-        """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
-        assert y.shape == f.shape
-
-        hess = (y**self.shape)*np.exp(f)
-        return np.squeeze(hess)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 4b8e7013..c759e15f 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -1,12 +1,14 @@
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-
 import numpy as np
-from scipy import stats
+from scipy import stats, integrate
 import scipy as sp
 import pylab as pb
 from ..util.plot import gpplot
+from scipy.special import gammaln, gamma
+#from GPy.likelihoods.likelihood_functions import likelihood_function
+
 
 class likelihood_function:
     """
@@ -132,3 +134,251 @@ class Poisson(likelihood_function):
         p_025 = tmp[:,0]
         p_975 = tmp[:,1]
         return mean,np.nan*mean,p_025,p_975 # better variance here TODO
+
+
+class student_t(likelihood_function):
+    """Student t likelihood distribution
+    For nomanclature see Bayesian Data Analysis 2003 p576
+
+    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2)$$
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, deg_free, sigma=2):
+        self.v = deg_free
+        self.sigma = sigma
+
+        #FIXME: This should be in the superclass
+        self.log_concave = False
+
+    @property
+    def variance(self, extra_data=None):
+        return (self.v / float(self.v - 2)) * (self.sigma**2)
+
+    def link_function(self, y, f, extra_data=None):
+        """Log-likelihood $\ln p(y|f)$ of the data, summed over all points
+        $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2}) - \ln(\sqrt{v \pi}\sigma) - \frac{v+1}{2}\ln \left(1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2\right)$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: sum over data points of the log-likelihood (scalar)
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        e = y - f
+        # The normaliser sigma*sqrt(v*pi) sits in the denominator of the density, so its log is subtracted
+        objective = (gammaln((self.v + 1) * 0.5)
+                     - gammaln(self.v * 0.5)
+                     - np.log(self.sigma * np.sqrt(self.v * np.pi))
+                     - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v))
+                     )
+        return np.sum(objective)
+
+    def link_grad(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        return np.squeeze(grad)
+
+    def link_hess(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+
+        Will return diagonal of hessian, since every where else it is 0
+
+        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
+        return np.squeeze(hess)
+
+    def predictive_values(self, mu, var):
+        """
+        Compute  mean, and conficence interval (percentiles 5 and 95) of the prediction
+
+        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        """
+
+        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
+        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
+        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
+        #Which was also given to us as (var)
+        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
+        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
+        true_var = var + self.variance
+
+        #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now
+        #need the 2.5 and 97.5 percentiles.
+        #FIXME: Hack, pretend p(y*|f*)p(f*) is a gaussian: mean +/- 2 standard deviations (sqrt of the variance, not the variance)
+        p_025 = mu - 2.*np.sqrt(true_var)
+        p_975 = mu + 2.*np.sqrt(true_var)
+        #Returned as (mean, NaN placeholder for the variance, lower bound, upper bound)
+        return mu, np.nan*mu, p_025, p_975
+
+    def sample_predicted_values(self, mu, var):
+        """ Experimental sample approches and numerical integration """
+        #p_025 = stats.t.ppf(.025, mu)
+        #p_975 = stats.t.ppf(.975, mu)
+
+        num_test_points = mu.shape[0]
+        #Each mu is the latent point f* at the test point x*,
+        #and the var is the gaussian variance at this point
+        #Take lots of samples from this, so we have lots of possible values
+        #for latent point f* for each test point x* weighted by how likely we were to pick it
+        print "Taking %d samples of f*".format(num_test_points)
+        num_f_samples = 10
+        num_y_samples = 10
+        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
+        print "Student t means shape: ", student_t_means.shape
+
+        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
+        #from a student t centred on this point, sample many points from this distribution
+        #centred on f*
+        #for test_point, f in enumerate(student_t_means):
+            #print test_point
+            #print f.shape
+            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
+                                            #scale=self.sigma,
+                                            #size=(num_f_samples, num_y_samples))
+            #print student_t_samples.shape
+
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
+                                        scale=self.sigma,
+                                        size=(num_test_points, num_y_samples, num_f_samples))
+        student_t_samples = np.reshape(student_t_samples,
+                                       (num_test_points, num_y_samples*num_f_samples))
+
+        #Now take the 97.5 and 0.25 percentile of these points
+        p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None]
+        p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None]
+
+        ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
+        def t_gaussian(f, mu, var):
+            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
+                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
+                    )
+
+        def t_gauss_int(mu, var):
+            print "Mu: ", mu
+            print "var: ", var
+            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
+            print "Result: ", result
+            return result[0]
+
+        vec_t_gauss_int = np.vectorize(t_gauss_int)
+
+        p = vec_t_gauss_int(mu, var)
+        p_025 = mu - p
+        p_975 = mu + p
+        return mu, np.nan*mu, p_025, p_975
+
+
+class weibull_survival(likelihood_function):
+    """Weibull t likelihood distribution for survival analysis with censoring
+        For nomanclature see Bayesian Survival Analysis
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, shape, scale):
+        self.shape = shape
+        self.scale = scale
+
+        #FIXME: This should be in the superclass
+        self.log_concave = True
+
+    def link_function(self, y, f, extra_data=None):
+        """
+        link_function $\ln p(y|f)$, i.e. log likelihood
+
+        $$\ln p(y|f) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: time of event data
+        :f: latent variables f
+        :extra_data: the censoring indicator, 1 for censored, 0 for not
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f)  # FIXME: CHECK THIS WITH BOOK, wheres scale?
+        return np.sum(objective)
+
+    def link_grad(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{d}{df} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: the censoring indicator, 1 for censored, 0 for not
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        grad = v - (y**self.shape)*np.exp(f)  # alpha in the formula above is self.shape
+        return np.squeeze(grad)
+
+    def link_hess(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+
+        Will return diagonal of hessian, since every where else it is 0
+
+        $$\frac{d^{2} \ln p(y_{i}|f_{i})}{df_{i}^{2}} = -y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in the hessian
+        :returns: array which is the diagonal of the hessian (second derivative of log-likelihood evaluated at points)
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        #Derivative of link_grad (v - y^alpha*exp(f)) w.r.t f: the censoring term v drops out and the sign is negative
+        hess = -(y**self.shape)*np.exp(f)
+        return np.squeeze(hess)

From 1420aa532c5df8eaf4e6db5b89e77f4b375ebf1c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 19 Apr 2013 12:23:00 +0100
Subject: [PATCH 27/71] Attempted to introduce gradient methods, won't work yet
 I doubt

---
 GPy/examples/__init__.py                |   1 +
 GPy/likelihoods/Laplace.py              | 120 ++++++++++++++++++------
 GPy/likelihoods/likelihood_functions.py |  58 +++++++++++-
 GPy/models/GP.py                        |  16 +++-
 GPy/util/linalg.py                      |  19 +++-
 5 files changed, 177 insertions(+), 37 deletions(-)

diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py
index 551bff54..68832e77 100644
--- a/GPy/examples/__init__.py
+++ b/GPy/examples/__init__.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
+import laplace_approximations
 import classification
 import regression
 import dimensionality_reduction
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 4d94ba0f..b1b41957 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -4,28 +4,9 @@ import GPy
 from scipy.linalg import cholesky, eig, inv, cho_solve, det
 from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
+from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.lapack import dtrtrs
 
-#TODO: Move this to utils
-
-
-def det_ln_diag(A):
-    """
-    log determinant of a diagonal matrix
-    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
-    """
-    return np.log(np.diagonal(A)).sum()
-
-
-def pddet(A):
-    """
-    Determinant of a positive definite matrix
-    """
-    L = cholesky(A)
-    logdetA = 2*sum(np.log(np.diag(L)))
-    return logdetA
-
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
@@ -75,17 +56,92 @@ class Laplace(likelihood):
         return self.likelihood_function.predictive_values(mu, var)
 
     def _get_params(self):
-        return np.zeros(0)
+        return np.asarray(self.likelihood_function._get_params())
 
     def _get_param_names(self):
-        return []
+        return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
-        pass  # TODO: Laplace likelihood might want to take some parameters...
+        return self.likelihood_function._set_params(p)  # forward p: the wrapped likelihood function owns the parameters
+
+    def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
+        """
+        Find the gradients of the marginal likelihood w.r.t both thetaK and thetaL
+
+        dL_dthetaK differs from that of normal likelihoods as it has additional terms coming from
+        changes to y_tilde and changes to Sigma_tilde when the kernel parameters are adjusted.
+        Similar terms arise when finding the gradients with respect to changes in the likelihood parameters
+
+        :returns: tuple (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
+        """
+        return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
+
+    def _shared_gradients_components(self):
+        dL_dytil = -np.dot((self.K+self.Sigma_tilde), self.Y)
+        dytil_dfhat = np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        return dL_dytil, dytil_dfhat
+
+    def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
+        """
+                           #explicit                #implicit                     #implicit
+        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
+        :param dL_d_K_Sigma: Derivative of marginal with respect to K_prior+Sigma_tilde (posterior covariance)
+        :param dK_dthetaK: explicit derivative of kernel with respect to its hyperparameters
+        :returns: only the two implicit terms of dL_dthetaK above (the explicit dL_dK * dK_dthetaK term is not added here)
+        """
+        dL_dytil, dytil_dfhat = self._shared_gradients_components()
+
+        I_KW_i, _, _, _ = pdinv(np.eye(self.N) + np.dot(self.K, self.W))
+        #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
+        dfhat_dthetaK = I_KW_i*dK_dthetaK*self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
+
+        dytil_dthetaK = dytil_dfhat*dfhat_dthetaK
+
+        #FIXME: Careful dL_dK = dL_d_K_Sigma
+        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
+        dL_dSigma = dL_d_K_Sigma
+        d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+                     #explicit           #implicit
+        dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! SHOULD SUM OVER FHAT NOT THETAS
+        dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde)
+
+        dL_dthetaK_implicit = dL_dytil*dytil_dthetaK + dL_dSigma*dSigma_dthetaK
+        return dL_dthetaK_implicit
 
     def _gradients(self, partial):
-        return np.zeros(0)  # TODO: Laplace likelihood might want to take some parameters...
-        raise NotImplementedError
+        """
+        Gradients with respect to likelihood parameters
+
+        Complicated, it differs for parameters of the kernel \theta_{K}, and
+        parameters of the likelihood, \theta_{L}
+
+        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
+        dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL)
+        dL_dK*dK_dthetaL = 0
+
+        dytil_dthetaX = dytil_dfhat * dfhat_dthetaX
+        dytil_dfhat = Sigma*Ki + I
+
+        fhat = K*log_p(y|fhat)                                          from rasm p125
+        dfhat_dthetaK = (I + KW)i * dK_dthetaK * log_p(y|fhat)          from rasm p125
+
+        dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi
+        dW_dthetaX = d_dthetaX[d2phi_d2fhat]
+        d2phi_d2fhat = Hessian function of likelihood
+
+        partial = dL_dK
+        """
+        dL_dytil, dytil_dfhat = self._shared_gradients_components()
+        dfhat_dthetaL = self.likelihood_function.df_dtheta()
+
+        dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
+        dL_dSigma = partial # partial is dL_dK but K here is K+Sigma_tilde.... which is fine in this case
+
+        dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
+        dL_dthetaL = 0 + dL_dytil*dytil_dthetaL + dL_dSigma*dSigma_dthetaL
+        return dL_dthetaL
+        #return np.zeros(0)  # TODO: Laplace likelihood might want to take some parameters...
 
     def _compute_GP_variables(self):
         """
@@ -112,8 +168,9 @@ class Laplace(likelihood):
         $$\tilde{\Sigma} = W^{-1}$$
 
         """
-        epsilon = 1e-6
+        epsilon = 1e14
 
+        #Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
         #dtritri -> L -> L_i
         #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
         #((L.T*w)_i + I)f_hat = y_tilde
@@ -122,11 +179,12 @@ class Laplace(likelihood):
         Lt_W = np.dot(L.T, self.W)
 
         ##Check it isn't singular!
-        if cond(Lt_W) > 1e14:
+        if cond(Lt_W) > epsilon:
             print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
 
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
-        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
+        self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
+        Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
         #f.T(Ki + W)f
         f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
@@ -156,16 +214,16 @@ class Laplace(likelihood):
                    #)
 
         ##Check it isn't singular!
-        if cond(self.W) > 1e14:
+        if cond(self.W) > epsilon:
             print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
 
-        Sigma_tilde = inv(self.W)  # Damn
+        self.Sigma_tilde = inv(self.W)  # Damn
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = Sigma_tilde
+        self.covariance_matrix = self.Sigma_tilde
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
 
     def fit_full(self, K):
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index c759e15f..6e72b029 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -20,6 +20,16 @@ class likelihood_function:
     def __init__(self,location=0,scale=1):
         self.location = location
         self.scale = scale
+        self.log_concave = True
+
+    def _get_params(self):
+        return np.zeros(0)
+
+    def _get_param_names(self):
+        return []
+
+    def _set_params(self, p):
+        pass
 
 class probit(likelihood_function):
     """
@@ -149,12 +159,22 @@ class student_t(likelihood_function):
     d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free, sigma=2):
+        super(student_t, self).__init__()
         self.v = deg_free
         self.sigma = sigma
-
-        #FIXME: This should be in the superclass
         self.log_concave = False
 
+    def _get_params(self):
+        return np.asarray(self.sigma)
+
+    def _get_param_names(self):
+        return ["t_noise_variance"]
+
+    def _set_params(self, x):
+        self.sigma = float(x)
+        #self.covariance_matrix = np.eye(self.N)*self._variance
+        #self.precision = 1./self._variance
+
     @property
     def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * (self.sigma**2)
@@ -222,6 +242,40 @@ class student_t(likelihood_function):
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
         return np.squeeze(hess)
 
+    def d3link(self, y, f, extra_data=None):
+        """
+        Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+
+        $$\frac{-2(v+1)((f-y)^{3} - 3\sigma^{2}v(f-y))}{((f-y)^{2} + \sigma^{2}v)^{3}}$$
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        #NB f-y not y-f
+        e = f - y
+        d3link_d3f = (  (-2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
+                      / ((e**2 + (self.sigma**2)*self.v)**3)
+                     )
+        return d3link_d3f
+
+    def link_hess_grad_sigma(self, y, f, extra_data=None):
+        """
+        Gradient of the hessian w.r.t sigma parameter
+
+        $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}}
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        hess_grad_sigma = (  (2*self.sigma*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
+                           / ((e**2 + (self.sigma**2)*self.v)**3)
+                          )
+        return hess_grad_sigma
+
+    def _gradients(self, y, f, extra_data=None):
+        return [self.link_hess_grad_sigma] # list as we might learn many parameters
+
     def predictive_values(self, mu, var):
         """
         Compute  mean, and conficence interval (percentiles 5 and 95) of the prediction
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index cfda0cfe..1024b5ef 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -8,7 +8,7 @@ from .. import kern
 from ..core import model
 from ..util.linalg import pdinv,mdot
 from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango
-from ..likelihoods import EP
+from ..likelihoods import EP, Laplace
 
 class GP(model):
     """
@@ -128,7 +128,19 @@ class GP(model):
 
         For the likelihood parameters, pass in alpha = K^-1 y
         """
-        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK,X=self.X,slices1=self.Xslices,slices2=self.Xslices), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+        if isinstance(self.likelihood, Laplace):
+            dL_dthetaK_explicit = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+            #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
+            fake_dL_dKs = np.ones(self.dL_dK.shape)
+            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+
+            dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
+            dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        else:
+            dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        return np.hstack((dL_dthetaK, dL_dthetaL))
 
     def _raw_predict(self,_Xnew,slices=None, full_cov=False):
         """
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index f88099a4..cb899397 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -14,6 +14,21 @@ import types
 #import scipy.lib.lapack.flapack
 import scipy as sp
 
+def det_ln_diag(A):
+    """
+    log determinant of a diagonal matrix
+    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
+    """
+    return np.log(np.diagonal(A)).sum()
+
+def pddet(A):
+    """
+    Determinant of a positive definite matrix
+    """
+    L = cholesky(A)
+    logdetA = 2*sum(np.log(np.diag(L)))
+    return logdetA
+
 def trace_dot(a,b):
     """
     efficiently compute the trace of the matrix product of a and b
@@ -166,8 +181,8 @@ def PCA(Y, Q):
     """
     if not np.allclose(Y.mean(axis=0), 0.0):
         print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"
-        
-        #Y -= Y.mean(axis=0) 
+
+        #Y -= Y.mean(axis=0)
 
     Z = linalg.svd(Y-Y.mean(axis=0), full_matrices = False)
     [X, W] = [Z[0][:,0:Q], np.dot(np.diag(Z[1]), Z[2]).T[:,0:Q]]

From 267a8e427c147aa5ac98e3f42c58d90492e53b4c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 19 Apr 2013 17:41:01 +0100
Subject: [PATCH 28/71] Adding gradients, shapes starting to make sense

---
 GPy/likelihoods/Laplace.py              | 53 ++++++++++++++++---------
 GPy/likelihoods/likelihood_functions.py | 28 +++++++++----
 GPy/models/GP.py                        |  6 +--
 GPy/util/linalg.py                      |  2 +-
 4 files changed, 60 insertions(+), 29 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index b1b41957..b5c0bdfe 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -1,11 +1,12 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, cho_solve, det
+from scipy.linalg import inv, cho_solve, det
 from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.lapack import dtrtrs
+import pylab as plt
 
 
 class Laplace(likelihood):
@@ -62,7 +63,7 @@ class Laplace(likelihood):
         return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
-        return self.likelihood_function._set_params()
+        return self.likelihood_function._set_params(p)
 
     def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
         """
@@ -77,8 +78,8 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        dL_dytil = -np.dot((self.K+self.Sigma_tilde), self.Y)
-        dytil_dfhat = np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde))
+        dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -91,12 +92,18 @@ class Laplace(likelihood):
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
-        I_KW_i, _, _, _ = pdinv(np.eye(self.N) + np.dot(self.K, self.W))
+        A = np.eye(self.N) + np.dot(self.K, self.W)
+        plt.imshow(A)
+        plt.show()
+        I_KW_i, _, _, _ = pdinv(A)
+
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
-        dfhat_dthetaK = I_KW_i*dK_dthetaK*self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-
-        dytil_dthetaK = dytil_dfhat*dfhat_dthetaK
+        #Derivative for each f dimension, for each of K's hyper parameters
+        dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
+        for ind_j, thetaj in enumerate(dK_dthetaK):
+            dfhat_dthetaK[:, ind_j] = mdot(I_KW_i, thetaj, self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data))
 
+        dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
         #FIXME: Careful dL_dK = dL_d_K_Sigma
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
         dL_dSigma = dL_d_K_Sigma
@@ -105,8 +112,9 @@ class Laplace(likelihood):
         dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! SHOULD SUM OVER FHAT NOT THETAS
         dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde)
 
-        dL_dthetaK_implicit = dL_dytil*dytil_dthetaK + dL_dSigma*dSigma_dthetaK
-        return dL_dthetaK_implicit
+        dL_dthetaK_implicit = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)# + np.dot(dL_dSigma, dSigma_dthetaK)
+        #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
+        return np.squeeze(dL_dthetaK_implicit)
 
     def _gradients(self, partial):
         """
@@ -132,16 +140,25 @@ class Laplace(likelihood):
         partial = dL_dK
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
-        dfhat_dthetaL = self.likelihood_function.df_dtheta()
+        dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
 
-        dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-        dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
+        #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        #Derivative for each f dimension, for each of K's hyper parameters
+        dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
+        for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
+            dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
+                                             dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
+                                             self.Sigma_tilde
+                                             )
+
+        #TODO: This is Wi*A*Wi, can be more numerically stable with a trick
+        #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
         dL_dSigma = partial # partial is dL_dK but K here is K+Sigma_tilde.... which is fine in this case
 
-        dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
-        dL_dthetaL = 0 + dL_dytil*dytil_dthetaL + dL_dSigma*dSigma_dthetaL
-        return dL_dthetaL
-        #return np.zeros(0)  # TODO: Laplace likelihood might want to take some parameters...
+        #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
+        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
+        dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
+        return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
         """
@@ -335,7 +352,7 @@ class Laplace(likelihood):
         rs = 0
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
-            f_old = f.copy()
+            #f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
             if not self.likelihood_function.log_concave:
                 W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 6e72b029..64791047 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -159,10 +159,10 @@ class student_t(likelihood_function):
     d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free, sigma=2):
-        super(student_t, self).__init__()
         self.v = deg_free
         self.sigma = sigma
         self.log_concave = False
+        #super(student_t, self).__init__()
 
     def _get_params(self):
         return np.asarray(self.sigma)
@@ -258,9 +258,9 @@ class student_t(likelihood_function):
                      )
         return d3link_d3f
 
-    def link_hess_grad_sigma(self, y, f, extra_data=None):
+    def link_hess_grad_std(self, y, f, extra_data=None):
         """
-        Gradient of the hessian w.r.t sigma parameter
+        Gradient of the hessian w.r.t sigma parameter (standard deviation)
 
         $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}}
         """
@@ -273,8 +273,24 @@ class student_t(likelihood_function):
                           )
         return hess_grad_sigma
 
+    def link_grad_std(self, y, f, extra_data=None):
+        """
+        Gradient of the likelihood w.r.t sigma parameter (standard deviation)
+
+        $$\frac{-2\sigma(v+1)(y-f)}{(v\sigma^{2} + (y-f)^{2})^{2}}$$
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
+                      / ((self.v*(self.sigma**2) + e**2)**2)
+                     )
+        return grad_sigma
+
     def _gradients(self, y, f, extra_data=None):
-        return [self.link_hess_grad_sigma] # list as we might learn many parameters
+        return [self.link_grad_std(y, f, extra_data=extra_data)[:, None],
+                self.link_hess_grad_std(y, f, extra_data=extra_data)[:, None]] # list as we might learn many parameters
 
     def predictive_values(self, mu, var):
         """
@@ -372,9 +388,7 @@ class weibull_survival(likelihood_function):
     def __init__(self, shape, scale):
         self.shape = shape
         self.scale = scale
-
-        #FIXME: This should be in the superclass
-        self.log_concave = True
+        self.log_concave = True # Or false?
 
     def link_function(self, y, f, extra_data=None):
         """
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 1024b5ef..24037afe 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -128,17 +128,17 @@ class GP(model):
 
         For the likelihood parameters, pass in alpha = K^-1 y
         """
+        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
         if isinstance(self.likelihood, Laplace):
-            dL_dthetaK_explicit = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+            dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
             fake_dL_dKs = np.ones(self.dL_dK.shape)
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
 
             dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
             dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
-            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            dL_dthetaL = self.likelihood._gradients(partial=self.dL_dK)
         else:
-            dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
         return np.hstack((dL_dthetaK, dL_dthetaL))
 
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index cb899397..20293ed8 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -25,7 +25,7 @@ def pddet(A):
     """
     Determinant of a positive definite matrix
     """
-    L = cholesky(A)
+    L = jitchol(A)
     logdetA = 2*sum(np.log(np.diag(L)))
     return logdetA
 

From 9de0b23f65470dfa3ec2fad756f2ab901f29ef0c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 29 Apr 2013 18:08:46 +0100
Subject: [PATCH 29/71] Plotting problematic kernel

---
 GPy/likelihoods/Laplace.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index b5c0bdfe..9cacb0e1 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -92,9 +92,12 @@ class Laplace(likelihood):
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
-        A = np.eye(self.N) + np.dot(self.K, self.W)
-        plt.imshow(A)
-        plt.show()
+        print "Computing K gradients"
+        I = np.eye(self.N)
+        C = np.dot(self.K, self.W)
+        A = I + C
+        #plt.imshow(A)
+        #plt.show()
         I_KW_i, _, _, _ = pdinv(A)
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
@@ -250,6 +253,8 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
+        #assert np.all(self.K.T == self.K)
+        #self.K_safe = K.copy()
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:

From f95666a8f9cb07209d80226ed1c5b0352b9eed75 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 6 May 2013 10:15:39 +0100
Subject: [PATCH 30/71] Merging

---
 GPy/likelihoods/Laplace.py |  1 +
 GPy/models/GP.py           | 15 +++++----------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 9cacb0e1..5e28212e 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -98,6 +98,7 @@ class Laplace(likelihood):
         A = I + C
         #plt.imshow(A)
         #plt.show()
+        ki, _, _, _ = pdinv(self.K)
         I_KW_i, _, _, _ = pdinv(A)
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index d353e5dd..96ec6582 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -6,15 +6,9 @@ import numpy as np
 import pylab as pb
 from .. import kern
 from ..core import model
-<<<<<<< HEAD
-from ..util.linalg import pdinv,mdot
-from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango
-from ..likelihoods import EP, Laplace
-=======
 from ..util.linalg import pdinv, mdot
 from ..util.plot import gpplot, x_frame1D, x_frame2D, Tango
-from ..likelihoods import EP
->>>>>>> upstream/devel
+from ..likelihoods import EP, Laplace
 
 class GP(model):
     """
@@ -34,6 +28,7 @@ class GP(model):
 
     """
     def __init__(self, X, likelihood, kernel, normalize_X=False):
+        self.has_uncertain_inputs=False
 
         # parse arguments
         self.X = X
@@ -128,12 +123,12 @@ class GP(model):
 
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
-        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
         if isinstance(self.likelihood, Laplace):
             dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
             fake_dL_dKs = np.ones(self.dL_dK.shape)
-            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
             dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
             dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
@@ -251,7 +246,7 @@ class GP(model):
         else:
             raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
 
-    def plot(self, samples=0, plot_limits=None, which_data='all', which_functions='all', resolution=None, levels=20):
+    def plot(self, samples=0, plot_limits=None, which_data='all', which_functions='all', which_parts='all', resolution=None, levels=20):
         """
         TODO: Docstrings!
         :param levels: for 2D plotting, the number of contour levels to use

From a52c20f47008233495e20d96b4ab50be8eb7d4a3 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 7 May 2013 13:35:47 +0100
Subject: [PATCH 31/71] Added a debug examples

---
 GPy/examples/laplace_approximations.py | 84 +++++++++++++++++++++++++-
 GPy/likelihoods/Laplace.py             | 23 +++++--
 GPy/models/GP.py                       |  6 +-
 3 files changed, 104 insertions(+), 9 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 5d1c1224..7e5c55bf 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -35,12 +35,86 @@ def timing():
     print the_is
     print np.mean(the_is)
 
+def debug_student_t_noise_approx():
+    real_var = 0.2
+    #Start a function, any function
+    X = np.linspace(0.0, 10.0, 30)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+
+    X_full = np.linspace(0.0, 10.0, 500)[:, None]
+    Y_full = np.sin(X_full)
+
+    #Y = Y/Y.max()
+
+    #Add student t random noise to datapoints
+    deg_free = 10000
+    real_sd = np.sqrt(real_var)
+    print "Real noise: ", real_sd
+
+    initial_var_guess = 0.01
+    #t_rv = t(deg_free, loc=0, scale=real_var)
+    #noise = t_rvrvs(size=Y.shape)
+    #Y += noise
+
+    plt.figure(1)
+    plt.suptitle('Gaussian likelihood')
+    # Kernel object
+    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()
+    kernel5 = kernel1.copy()
+    kernel6 = kernel1.copy()
+
+    print "Clean Gaussian"
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+    # plot
+    plt.subplot(131)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    print m
+
+    plt.suptitle('Student-t likelihood')
+    edited_real_sd = initial_var_guess #real_sd
+
+    print "Clean student t, rasm"
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(132)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Clean student t, ncg"
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(133)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    plt.show()
 
 def student_t_approx():
     """
     Example of regressing with a student t likelihood
     """
-    real_var = 0.1
+    real_var = 0.2
     #Start a function, any function
     X = np.linspace(0.0, 10.0, 30)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
@@ -58,8 +132,11 @@ def student_t_approx():
     #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10
+    deg_free = 1000000000000
     real_sd = np.sqrt(real_var)
+    print "Real noise: ", real_sd
+
+    initial_var_guess = 0.01
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
@@ -73,6 +150,7 @@ def student_t_approx():
     #print corrupted_indices
     #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
     #Y[corrupted_indices] += noise
+
     plt.figure(1)
     plt.suptitle('Gaussian likelihood')
     # Kernel object
@@ -108,7 +186,7 @@ def student_t_approx():
 
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
-    edited_real_sd = real_sd
+    edited_real_sd = initial_var_guess #real_sd
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 5e28212e..02f2c93f 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -5,7 +5,7 @@ from scipy.linalg import inv, cho_solve, det
 from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
-from scipy.linalg.lapack import dtrtrs
+from scipy.linalg.flapack import dtrtrs
 import pylab as plt
 
 
@@ -63,6 +63,7 @@ class Laplace(likelihood):
         return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
+        print "Setting noise sd: ", p
         return self.likelihood_function._set_params(p)
 
     def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -79,7 +80,9 @@ class Laplace(likelihood):
 
     def _shared_gradients_components(self):
         dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde))
-        dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        #dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        Ki = inv(self.K)
+        dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -93,19 +96,26 @@ class Laplace(likelihood):
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
         print "Computing K gradients"
+        print "dytil_dfhat: ", np.mean(dytil_dfhat)
         I = np.eye(self.N)
         C = np.dot(self.K, self.W)
         A = I + C
         #plt.imshow(A)
         #plt.show()
-        ki, _, _, _ = pdinv(self.K)
-        I_KW_i, _, _, _ = pdinv(A)
+
+        #FIXME: K ISNT SYMMETRIC SO NEITHER IS A AND IT MAKES IT NON-PD!
+        #ki, _, _, _ = pdinv(self.K)
+        #I_KW_i, _, _, _ = pdinv(A)
+
+        I_KW_i = inv(A)
+
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
         #Derivative for each f dimension, for each of K's hyper parameters
         dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
+        grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
         for ind_j, thetaj in enumerate(dK_dthetaK):
-            dfhat_dthetaK[:, ind_j] = mdot(I_KW_i, thetaj, self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data))
+            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad))
 
         dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
         #FIXME: Careful dL_dK = dL_d_K_Sigma
@@ -116,8 +126,11 @@ class Laplace(likelihood):
         dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! SHOULD SUM OVER FHAT NOT THETAS
         dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde)
 
+        print "dL_dytil: ", np.mean(dL_dytil)
+        print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
         dL_dthetaK_implicit = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)# + np.dot(dL_dSigma, dSigma_dthetaK)
         #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
+        import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
         return np.squeeze(dL_dthetaK_implicit)
 
     def _gradients(self, partial):
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 96ec6582..07c7a708 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -116,7 +116,6 @@ class GP(model):
         """
         return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
 
-
     def _log_likelihood_gradients(self):
         """
         The gradient of all parameters.
@@ -132,9 +131,14 @@ class GP(model):
 
             dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
             dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
+
+            print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
+
             dL_dthetaL = self.likelihood._gradients(partial=self.dL_dK)
         else:
+            print "dL_dthetaK: ", dL_dthetaK
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        print "dL_dthetaL: ", dL_dthetaL
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
 

From 84f12c1079a10db7dfe0737c5de1ca5b74d3b2d0 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 8 May 2013 12:36:31 +0100
Subject: [PATCH 32/71] Scale and switch KW+I

---
 GPy/examples/laplace_approximations.py |  5 ++--
 GPy/likelihoods/Laplace.py             | 37 +++++++++++++++-----------
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 7e5c55bf..704297ef 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -36,7 +36,7 @@ def timing():
     print np.mean(the_is)
 
 def debug_student_t_noise_approx():
-    real_var = 0.2
+    real_var = 0.1
     #Start a function, any function
     X = np.linspace(0.0, 10.0, 30)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
@@ -44,7 +44,7 @@ def debug_student_t_noise_approx():
     X_full = np.linspace(0.0, 10.0, 500)[:, None]
     Y_full = np.sin(X_full)
 
-    #Y = Y/Y.max()
+    Y = Y/Y.max()
 
     #Add student t random noise to datapoints
     deg_free = 10000
@@ -56,6 +56,7 @@ def debug_student_t_noise_approx():
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
 
+    plt.close('all')
     plt.figure(1)
     plt.suptitle('Gaussian likelihood')
     # Kernel object
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 02f2c93f..934b2a90 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -3,8 +3,8 @@ import scipy as sp
 import GPy
 from scipy.linalg import inv, cho_solve, det
 from numpy.linalg import cond
-from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
+from likelihood import likelihood
+from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.flapack import dtrtrs
 import pylab as plt
 
@@ -79,10 +79,10 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde))
-        #dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
-        Ki = inv(self.K)
-        dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5?
+        dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        #Ki = inv(self.K)
+        #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -95,6 +95,10 @@ class Laplace(likelihood):
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
+        d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+
+        dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
+
         print "Computing K gradients"
         print "dytil_dfhat: ", np.mean(dytil_dfhat)
         I = np.eye(self.N)
@@ -103,12 +107,7 @@ class Laplace(likelihood):
         #plt.imshow(A)
         #plt.show()
 
-        #FIXME: K ISNT SYMMETRIC SO NEITHER IS A AND IT MAKES IT NON-PD!
-        #ki, _, _, _ = pdinv(self.K)
-        #I_KW_i, _, _, _ = pdinv(A)
-
-        I_KW_i = inv(A)
-
+        I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
         #Derivative for each f dimension, for each of K's hyper parameters
@@ -121,14 +120,20 @@ class Laplace(likelihood):
         #FIXME: Careful dL_dK = dL_d_K_Sigma
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
         dL_dSigma = dL_d_K_Sigma
-        d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+        #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
                      #explicit           #implicit
-        dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! SHOULD SUM OVER FHAT NOT THETAS
-        dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde)
+        dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
+        dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
+        for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
+            dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
 
         print "dL_dytil: ", np.mean(dL_dytil)
         print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
-        dL_dthetaK_implicit = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)# + np.dot(dL_dSigma, dSigma_dthetaK)
+
+        #FIXME: Won't handle multi dimensional data
+        dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
+        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=(0,1))
+        dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
         #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
         import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
         return np.squeeze(dL_dthetaK_implicit)

From 6c4866662c9f20dbc3a9a5d08aab85bf95e1e84d Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 8 May 2013 16:05:01 +0100
Subject: [PATCH 33/71] Seem to have gradients much closer now

---
 GPy/examples/laplace_approximations.py  | 34 +++++----
 GPy/likelihoods/Laplace.py              | 99 ++++++++++++++++++-------
 GPy/likelihoods/likelihood_functions.py | 19 +++--
 GPy/models/GP.py                        | 18 +++--
 4 files changed, 110 insertions(+), 60 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 704297ef..57ae9be7 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -36,6 +36,7 @@ def timing():
     print np.mean(the_is)
 
 def debug_student_t_noise_approx():
+    plot = False
     real_var = 0.1
     #Start a function, any function
     X = np.linspace(0.0, 10.0, 30)[:, None]
@@ -57,8 +58,6 @@ def debug_student_t_noise_approx():
     #Y += noise
 
     plt.close('all')
-    plt.figure(1)
-    plt.suptitle('Gaussian likelihood')
     # Kernel object
     kernel1 = GPy.kern.rbf(X.shape[1])
     kernel2 = kernel1.copy()
@@ -75,12 +74,14 @@ def debug_student_t_noise_approx():
     m.ensure_default_constraints()
     m.optimize()
     # plot
-    plt.subplot(131)
-    m.plot()
-    plt.plot(X_full, Y_full)
+    if plot:
+        plt.figure(1)
+        plt.suptitle('Gaussian likelihood')
+        plt.subplot(131)
+        m.plot()
+        plt.plot(X_full, Y_full)
     print m
 
-    plt.suptitle('Student-t likelihood')
     edited_real_sd = initial_var_guess #real_sd
 
     print "Clean student t, rasm"
@@ -91,10 +92,12 @@ def debug_student_t_noise_approx():
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
-    plt.subplot(132)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
+    if plot:
+        plt.suptitle('Student-t likelihood')
+        plt.subplot(132)
+        m.plot()
+        plt.plot(X_full, Y_full)
+        plt.ylim(-2.5, 2.5)
 
     print "Clean student t, ncg"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -104,12 +107,13 @@ def debug_student_t_noise_approx():
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
-    plt.subplot(133)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
+    if plot:
+        plt.subplot(133)
+        m.plot()
+        plt.plot(X_full, Y_full)
+        plt.ylim(-2.5, 2.5)
 
-    plt.show()
+    #plt.show()
 
 def student_t_approx():
     """
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 934b2a90..566e4e25 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -5,8 +5,8 @@ from scipy.linalg import inv, cho_solve, det
 from numpy.linalg import cond
 from likelihood import likelihood
 from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
-from scipy.linalg.flapack import dtrtrs
-import pylab as plt
+from scipy.linalg.lapack import dtrtrs
+#import pylab as plt
 
 
 class Laplace(likelihood):
@@ -79,9 +79,9 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5?
+        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
         dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
-        #Ki = inv(self.K)
+        #Ki, _, _, _ = pdinv(self.K)
         #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
         return dL_dytil, dytil_dfhat
 
@@ -95,9 +95,8 @@ class Laplace(likelihood):
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
-        d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
 
-        dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
+        #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
 
         print "Computing K gradients"
         print "dytil_dfhat: ", np.mean(dytil_dfhat)
@@ -107,7 +106,8 @@ class Laplace(likelihood):
         #plt.imshow(A)
         #plt.show()
 
-        I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
+        #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
+        I_KW_i = self.Bi # could use self.B_chol??
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
         #Derivative for each f dimension, for each of K's hyper parameters
@@ -117,25 +117,44 @@ class Laplace(likelihood):
             dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad))
 
         dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
-        #FIXME: Careful dL_dK = dL_d_K_Sigma
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
         dL_dSigma = dL_d_K_Sigma
         #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
                      #explicit           #implicit
-        dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
-        dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
-        for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
-            dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
-
-        print "dL_dytil: ", np.mean(dL_dytil)
-        print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
+        #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
+        #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
+        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+        Wi = np.diagonal(self.Sigma_tilde) #Convenience
+        dSigma_dthetaK_explicit = 0
+        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
+        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
+        #dSigma_dthetaK_implicit = -np.sum(np.dot(dWi_dfhat, dfhat_dthetaK), axis=0)
+        dSigma_dthetaK_implicit = np.dot(dWi_dfhat, dfhat_dthetaK)
+        dSigma_dthetaK = dSigma_dthetaK_explicit + dSigma_dthetaK_implicit
+        #dSigma_dthetaK = 0 + np.dot(, dfhat_dthetaK)
+        #for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
+            #dSigma_dthetaK_explicit = 0
+            #dSigma_dthetaK_implicit = -np.dot(Wi, dW_dfhat
+            #dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
 
         #FIXME: Won't handle multi dimensional data
         dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
-        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=(0,1))
+        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
         dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
         #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
-        import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+
+        #print "\n"
+        #print "dL_dytil: ", np.mean(dL_dytil)
+        #print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
+        #print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
+        #print "\n"
+        #print "dL_dSigma: ", np.mean(dL_dSigma)
+        #print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK)
+        #print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
+        #print "\n"
+        #print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
+        #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+
         return np.squeeze(dL_dthetaK_implicit)
 
     def _gradients(self, partial):
@@ -159,27 +178,51 @@ class Laplace(likelihood):
         dW_dthetaX = d_dthetaX[d2phi_d2fhat]
         d2phi_d2fhat = Hessian function of likelihood
 
-        partial = dL_dK
+        partial = dL_d_K_Sigma
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
-        dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+
+        dlikelihood_dthetaL_explicit, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        dlikelihood_dfhat = self.likelihood_function.link_hess(self.data, self.f_hat, self.extra_data)
+        dfhat_dthetaL_cyclic = 0 #what is this? how can dfhat_dthetaL be used in the value of itself?
+        dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
+        dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
+        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
+
+        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
+        dL_dSigma = partial #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
+
+        Wi = np.diagonal(self.Sigma_tilde) #Convenience
+        #-1 as we are looking at W which is -1*d2log p(y|f)
+        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
+        dSigma_dthetaL_explicit = np.diagflat(-(Wi*(-1*d2likelihood_dthetaL)*Wi))
+
+        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
+        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL_cyclic)
+        dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
 
         #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
         #Derivative for each f dimension, for each of K's hyper parameters
-        dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
-        for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
-            dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
-                                             dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
-                                             self.Sigma_tilde
-                                             )
+        #dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
+        #for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
+            #dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
+                                             #dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
+                                             #self.Sigma_tilde
+                                             #)
 
         #TODO: This is Wi*A*Wi, can be more numerically stable with a trick
         #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
-        dL_dSigma = partial # partial is dL_dK but K here is K+Sigma_tilde.... which is fine in this case
 
         #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
-        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
-        dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
+        #dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
+        #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
+
+        dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
+        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
+        dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
+
         return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index cd6467d7..2176aac0 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -248,17 +248,16 @@ class student_t(likelihood_function):
         """
         Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
 
-        $$\frac{-2(v+1)((f-y)^{3} - 3\sigma^{2}v(f-y))}{((f-y)^{2} + \sigma^{2}v)^{3}}$$
+        $$\frac{2(v+1)((y-f)^{3} - 3\sigma^{2}v(y-f))}{((y-f)^{2} + \sigma^{2}v)^{3}}$$
         """
         y = np.squeeze(y)
         f = np.squeeze(f)
         assert y.shape == f.shape
-        #NB f-y not y-f
-        e = f - y
-        d3link_d3f = (  (-2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
+        e = y - f
+        d3link_d3f = (  (2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
                       / ((e**2 + (self.sigma**2)*self.v)**3)
                      )
-        return d3link_d3f
+        return np.squeeze(d3link_d3f)
 
     def link_hess_grad_std(self, y, f, extra_data=None):
         """
@@ -270,10 +269,10 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        hess_grad_sigma = (  (2*self.sigma*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
+        hess_grad_sigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
                            / ((e**2 + (self.sigma**2)*self.v)**3)
                           )
-        return hess_grad_sigma
+        return np.squeeze(hess_grad_sigma)
 
     def link_grad_std(self, y, f, extra_data=None):
         """
@@ -288,11 +287,11 @@ class student_t(likelihood_function):
         grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
                       / ((self.v*(self.sigma**2) + e**2)**2)
                      )
-        return grad_sigma
+        return np.squeeze(grad_sigma)
 
     def _gradients(self, y, f, extra_data=None):
-        return [self.link_grad_std(y, f, extra_data=extra_data)[:, None],
-                self.link_hess_grad_std(y, f, extra_data=extra_data)[:, None]] # list as we might learn many parameters
+        return [self.link_grad_std(y, f, extra_data=extra_data),
+                self.link_hess_grad_std(y, f, extra_data=extra_data)] # list as we might learn many parameters
 
     def predictive_values(self, mu, var):
         """
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index a346b47b..1682ee6c 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -125,19 +125,23 @@ class GP(model):
         if isinstance(self.likelihood, Laplace):
             dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            fake_dL_dKs = np.ones(self.dL_dK.shape)
+            fake_dL_dKs = np.eye(self.dL_dK.shape[0])
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
-            dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
+            #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
+            dL_dthetaK_implicit = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
             dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
 
-            print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
+            #print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
 
-            dL_dthetaL = self.likelihood._gradients(partial=self.dL_dK)
-        else:
-            print "dL_dthetaK: ", dL_dthetaK
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-        print "dL_dthetaL: ", dL_dthetaL
+            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        else:
+            #print "dL_dthetaK: ", dL_dthetaK
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+        #print "dL_dthetaL: ", dL_dthetaL
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
 

From 9500b12b532e2f9abd68621a0ce8662e4553cb2c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 8 May 2013 20:53:23 +0100
Subject: [PATCH 34/71] Working on adding an optimizer callback that updates
 the Laplace approximation

---
 GPy/inference/optimization.py           | 13 ++++++++++++-
 GPy/likelihoods/Laplace.py              |  1 -
 GPy/likelihoods/likelihood_functions.py |  4 ++++
 GPy/models/GP.py                        | 10 ++++++++++
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/GPy/inference/optimization.py b/GPy/inference/optimization.py
index 75cd94ba..1445eed0 100644
--- a/GPy/inference/optimization.py
+++ b/GPy/inference/optimization.py
@@ -29,7 +29,7 @@ class Optimizer():
     :rtype: optimizer object.
 
     """
-    def __init__(self, x_init, messages=False, model = None, max_f_eval=1e4, max_iters = 1e3, ftol=None, gtol=None, xtol=None):
+    def __init__(self, x_init, messages=False, model = None, max_f_eval=1e4, max_iters = 1e3, ftol=None, gtol=None, xtol=None, callback=None):
         self.opt_name = None
         self.x_init = x_init
         self.messages = messages
@@ -45,6 +45,7 @@ class Optimizer():
         self.gtol = gtol
         self.ftol = ftol
         self.model = model
+        self.callback = callback
 
     def run(self, **kwargs):
         start = dt.datetime.now()
@@ -94,6 +95,8 @@ class opt_tnc(Optimizer):
             opt_dict['ftol'] = self.ftol
         if self.gtol is not None:
             opt_dict['pgtol'] = self.gtol
+        if self.callback is not None:
+            opt_dict['callback'] = self.callback
 
         opt_result = optimize.fmin_tnc(f_fp, self.x_init, messages = self.messages,
                        maxfun = self.max_f_eval, **opt_dict)
@@ -128,6 +131,8 @@ class opt_lbfgsb(Optimizer):
             print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it"
         if self.gtol is not None:
             opt_dict['pgtol'] = self.gtol
+        if self.callback is not None:
+            opt_dict['callback'] = self.callback
 
         opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint = iprint,
                                             maxfun = self.max_f_eval, **opt_dict)
@@ -155,6 +160,8 @@ class opt_simplex(Optimizer):
             opt_dict['ftol'] = self.ftol
         if self.gtol is not None:
             print "WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it"
+        if self.callback is not None:
+            opt_dict['callback'] = self.callback
 
         opt_result = optimize.fmin(f, self.x_init, (), disp = self.messages,
                    maxfun = self.max_f_eval, full_output=True, **opt_dict)
@@ -187,6 +194,8 @@ class opt_rasm(Optimizer):
             print "WARNING: minimize doesn't have an ftol arg, so I'm going to ignore it"
         if self.gtol is not None:
             print "WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it"
+        if self.callback is not None:
+            print "WARNING: minimize doesn't have a callback arg, so I'm going to ignore it"
 
         opt_result = rasm.minimize(self.x_init, f_fp, (), messages = self.messages,
                                    maxnumfuneval = self.max_f_eval)
@@ -205,6 +214,8 @@ class opt_SCG(Optimizer):
     def opt(self, f_fp = None, f = None, fp = None):
         assert not f is None
         assert not fp is None
+        if self.callback is not None:
+            print "WARNING: SCG doesn't have a callback arg, so I'm going to ignore it"
         opt_result = SCG(f,fp,self.x_init, display=self.messages, maxiters=self.max_iters, max_f_eval=self.max_f_eval, xtol=self.xtol, ftol=self.ftol)
         self.x_opt = opt_result[0]
         self.trace = opt_result[1]
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 566e4e25..208b1102 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -63,7 +63,6 @@ class Laplace(likelihood):
         return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
-        print "Setting noise sd: ", p
         return self.likelihood_function._set_params(p)
 
     def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 2176aac0..61c79385 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -166,6 +166,8 @@ class student_t(likelihood_function):
         self.log_concave = False
         #super(student_t, self).__init__()
 
+        self._set_params(np.asarray(sigma))
+
     def _get_params(self):
         return np.asarray(self.sigma)
 
@@ -174,6 +176,8 @@ class student_t(likelihood_function):
 
     def _set_params(self, x):
         self.sigma = float(x)
+        print "Setting student t sigma: ", x
+        print x
         #self.covariance_matrix = np.eye(self.N)*self._variance
         #self.precision = 1./self._variance
 
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 1682ee6c..79284b59 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -86,6 +86,16 @@ class GP(model):
     def _get_param_names(self):
         return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
 
+    def _update_params_callback(self, p):
+        #FIXME:Check the transforming
+        #Set the new parameters of the kernel and likelihood within the optimization
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
+        self.likelihood._set_params(p[self.kern.Nparam_transformed():])
+        #update the likelihood approximation within the optimisation with the current parameters
+        self.update_likelihood_approximation()
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
     def update_likelihood_approximation(self):
         """
         Approximates a non-gaussian likelihood using Expectation Propagation

From 5472c5c6ba445c49fcdb98ccef4635f17a801b28 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 13 May 2013 18:36:02 +0100
Subject: [PATCH 35/71] Almost have likelihood gradients working but kernels
 still way off

---
 GPy/examples/laplace_approximations.py  | 39 ++++++-----
 GPy/likelihoods/Laplace.py              | 88 ++++++++++++++++---------
 GPy/likelihoods/likelihood_functions.py |  4 +-
 GPy/models/GP.py                        | 20 +++---
 4 files changed, 91 insertions(+), 60 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 57ae9be7..2054881c 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -52,7 +52,7 @@ def debug_student_t_noise_approx():
     real_sd = np.sqrt(real_var)
     print "Real noise: ", real_sd
 
-    initial_var_guess = 0.01
+    initial_var_guess = 1
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
@@ -84,14 +84,21 @@ def debug_student_t_noise_approx():
 
     edited_real_sd = initial_var_guess #real_sd
 
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
-    m.ensure_default_constraints()
+    #m.constrain_positive('rbf')
+    m.constrain_fixed('rbf_v', 1.0898)
+    m.constrain_fixed('rbf_l', 1.8651)
+    m.constrain_positive('t_noi')
+    #m.constrain_fixed('t_noise_variance', real_sd)
     m.update_likelihood_approximation()
-    m.optimize()
+    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
+    m.optimize('scg', messages=True)
     print(m)
+    return m
     if plot:
         plt.suptitle('Student-t likelihood')
         plt.subplot(132)
@@ -99,19 +106,19 @@ def debug_student_t_noise_approx():
         plt.plot(X_full, Y_full)
         plt.ylim(-2.5, 2.5)
 
-    print "Clean student t, ncg"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
-    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    if plot:
-        plt.subplot(133)
-        m.plot()
-        plt.plot(X_full, Y_full)
-        plt.ylim(-2.5, 2.5)
+    #print "Clean student t, ncg"
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
+    #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    #m.ensure_default_constraints()
+    #m.update_likelihood_approximation()
+    #m.optimize()
+    #print(m)
+    #if plot:
+        #plt.subplot(133)
+        #m.plot()
+        #plt.plot(X_full, Y_full)
+        #plt.ylim(-2.5, 2.5)
 
     #plt.show()
 
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 208b1102..5b3e8f43 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -63,6 +63,7 @@ class Laplace(likelihood):
         return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
+        #print "Setting laplace param with: ", p
         return self.likelihood_function._set_params(p)
 
     def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -78,10 +79,24 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
-        dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
-        #Ki, _, _, _ = pdinv(self.K)
-        #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        dL_dytil = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
+
+        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+        Wi = np.diagonal(self.Sigma_tilde) #Convenience
+        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
+        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
+
+        Ki, _, _, _ = pdinv(self.K)
+        #dytil_dfhat_implicit = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
+        #dytil_dfhat = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
+
+        #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
+        #dytil_dfhat_explicit = self.Wi__Ki_W
+        #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
+        #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
+
+        a = mdot(dWi_dfhat, Ki, self.f_hat)
+        dytil_dfhat = mdot(dWi_dfhat, Ki, self.f_hat) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -94,18 +109,18 @@ class Laplace(likelihood):
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
-
         #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
 
-        print "Computing K gradients"
-        print "dytil_dfhat: ", np.mean(dytil_dfhat)
-        I = np.eye(self.N)
-        C = np.dot(self.K, self.W)
-        A = I + C
+        #print "Computing K gradients"
+        #print "dytil_dfhat: ", np.mean(dytil_dfhat)
+        #I = np.eye(self.N)
+        #C = np.dot(self.K, self.W)
+        #A = I + C
         #plt.imshow(A)
         #plt.show()
 
         #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
+        #B = I + w12*K*w12
         I_KW_i = self.Bi # could use self.B_chol??
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
@@ -113,15 +128,22 @@ class Laplace(likelihood):
         dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
         grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
         for ind_j, thetaj in enumerate(dK_dthetaK):
-            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad))
+            #dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad)))
+            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad)
 
+        print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat)
+        print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK)
         dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
+        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
+        print "\n"
+
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
         dL_dSigma = dL_d_K_Sigma
         #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
                      #explicit           #implicit
         #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
         #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
+
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         Wi = np.diagonal(self.Sigma_tilde) #Convenience
         dSigma_dthetaK_explicit = 0
@@ -140,19 +162,16 @@ class Laplace(likelihood):
         dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
         dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
         dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
-        #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
 
-        #print "\n"
-        #print "dL_dytil: ", np.mean(dL_dytil)
-        #print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
-        #print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
-        #print "\n"
-        #print "dL_dSigma: ", np.mean(dL_dSigma)
-        #print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK)
-        #print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
-        #print "\n"
-        #print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
-        #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+        print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil)
+        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
+        print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
+        print "\n"
+        print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma)
+        print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK)
+        print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
+        print "\n"
+        print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
 
         return np.squeeze(dL_dthetaK_implicit)
 
@@ -182,11 +201,15 @@ class Laplace(likelihood):
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
         #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
 
-        dlikelihood_dthetaL_explicit, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-        dlikelihood_dfhat = self.likelihood_function.link_hess(self.data, self.f_hat, self.extra_data)
-        dfhat_dthetaL_cyclic = 0 #what is this? how can dfhat_dthetaL be used in the value of itself?
-        dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
-        dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
+        dlikelihood_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
+        #dfhat_dthetaL_cyclic = 0 #FIXME: what is this? how can dfhat_dthetaL be used in the value of itself?
+        #dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
+        #dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
+        #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
+        KW_I_i = self.Bi # could use self.B_chol??
+        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihood_dfhat))
+
         dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
 
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
@@ -199,7 +222,7 @@ class Laplace(likelihood):
 
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL_cyclic)
+        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL)
         dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
 
         #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
@@ -219,8 +242,10 @@ class Laplace(likelihood):
         #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
 
         dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
-        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
+        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma[:, None].T, dSigma_dthetaL), axis=0)
         dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
+        dL_dthetaL_via_Sigma_old = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
@@ -257,7 +282,7 @@ class Laplace(likelihood):
         #((L.T*w)_i + I)f_hat = y_tilde
         L = jitchol(self.K)
         Li = chol_inv(L)
-        Lt_W = np.dot(L.T, self.W)
+        Lt_W = np.dot(L.T, self.W) #FIXME: Can make Faster
 
         ##Check it isn't singular!
         if cond(Lt_W) > epsilon:
@@ -361,7 +386,6 @@ class Laplace(likelihood):
         """
         #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
         L = jitchol(B)
         return (B, L, W_12)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 61c79385..6eef9f33 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -176,8 +176,6 @@ class student_t(likelihood_function):
 
     def _set_params(self, x):
         self.sigma = float(x)
-        print "Setting student t sigma: ", x
-        print x
         #self.covariance_matrix = np.eye(self.N)*self._variance
         #self.precision = 1./self._variance
 
@@ -288,7 +286,7 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
+        grad_sigma = (  (2*self.sigma*self.v*(self.v + 1)*e)
                       / ((self.v*(self.sigma**2) + e**2)**2)
                      )
         return np.squeeze(grad_sigma)
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 79284b59..ff852766 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -66,6 +66,10 @@ class GP(model):
         # self.likelihood._set_params(p[self.kern.Nparam:])               # test by Nicolas
         self.likelihood._set_params(p[self.kern.Nparam_transformed():])  # test by Nicolas
 
+        if isinstance(self.likelihood, Laplace):
+            print "Updating approx: ", p
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
 
         self.K = self.kern.K(self.X)
         self.K += self.likelihood.covariance_matrix
@@ -87,14 +91,12 @@ class GP(model):
         return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
 
     def _update_params_callback(self, p):
-        #FIXME:Check the transforming
-        #Set the new parameters of the kernel and likelihood within the optimization
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        #parameters will be in transformed space
         self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
+        #set_params_transformed for likelihood doesn't exist?
         self.likelihood._set_params(p[self.kern.Nparam_transformed():])
         #update the likelihood approximation within the optimisation with the current parameters
         self.update_likelihood_approximation()
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     def update_likelihood_approximation(self):
         """
@@ -123,7 +125,9 @@ class GP(model):
         model for a new variable Y* = v_tilde/tau_tilde, with a covariance
         matrix K* = K + diag(1./tau_tilde) plus a normalization term.
         """
-        return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        print "Log likelihood: ", l
+        return l
 
     def _log_likelihood_gradients(self):
         """
@@ -135,7 +139,7 @@ class GP(model):
         if isinstance(self.likelihood, Laplace):
             dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            fake_dL_dKs = np.eye(self.dL_dK.shape[0])
+            fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
             #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
@@ -145,13 +149,11 @@ class GP(model):
             #print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
 
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            #print "dL_dthetaL: ", dL_dthetaL
             print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         else:
-            #print "dL_dthetaK: ", dL_dthetaK
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
             print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-        #print "dL_dthetaL: ", dL_dthetaL
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
 

From 787a038401ee959fbbd8bfe354c84c1d4cbd56fa Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 14 May 2013 16:23:18 +0100
Subject: [PATCH 36/71] Still getting closer to grads for likelihood

---
 GPy/examples/laplace_approximations.py  |  4 ++--
 GPy/likelihoods/Laplace.py              | 16 ++++++----------
 GPy/likelihoods/likelihood_functions.py |  4 ++--
 3 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 2054881c..eb725b53 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -95,10 +95,10 @@ def debug_student_t_noise_approx():
     m.constrain_positive('t_noi')
     #m.constrain_fixed('t_noise_variance', real_sd)
     m.update_likelihood_approximation()
-    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
-    m.optimize('scg', messages=True)
     print(m)
     return m
+    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
+    m.optimize('scg', messages=True)
     if plot:
         plt.suptitle('Student-t likelihood')
         plt.subplot(132)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 5b3e8f43..2af51f2b 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -201,24 +201,22 @@ class Laplace(likelihood):
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
         #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
 
-        dlikelihood_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
         dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-        #dfhat_dthetaL_cyclic = 0 #FIXME: what is this? how can dfhat_dthetaL be used in the value of itself?
-        #dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
-        #dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
         #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
         KW_I_i = self.Bi # could use self.B_chol??
-        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihood_dfhat))
+        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL))
+        #dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
 
         dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
 
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
-        dL_dSigma = partial #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
+        dL_dSigma = np.diagflat(partial) #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
 
         Wi = np.diagonal(self.Sigma_tilde) #Convenience
         #-1 as we are looking at W which is -1*d2log p(y|f)
         #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
-        dSigma_dthetaL_explicit = np.diagflat(-(Wi*(-1*d2likelihood_dthetaL)*Wi))
+        dSigma_dthetaL_explicit = np.diagflat(-1*(Wi*(-1*d2likelihood_dthetaL)*Wi))
 
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
@@ -242,10 +240,8 @@ class Laplace(likelihood):
         #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
 
         dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
-        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma[:, None].T, dSigma_dthetaL), axis=0)
+        dL_dthetaL_via_Sigma = np.sum(np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0))
         dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
-        dL_dthetaL_via_Sigma_old = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 6eef9f33..1a9dac75 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -256,7 +256,7 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        d3link_d3f = (  (2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
+        d3link_d3f = (  (2*(self.v + 1)*(-1*e)*(e**2 - 3*(self.sigma**2)*self.v))
                       / ((e**2 + (self.sigma**2)*self.v)**3)
                      )
         return np.squeeze(d3link_d3f)
@@ -286,7 +286,7 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        grad_sigma = (  (2*self.sigma*self.v*(self.v + 1)*e)
+        grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
                       / ((self.v*(self.sigma**2) + e**2)**2)
                      )
         return np.squeeze(grad_sigma)

From 569311b5107c6ec6cb2cc41587701f5526fb70dd Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 15 May 2013 19:25:55 +0100
Subject: [PATCH 37/71] Gradients almost there for dytil_dfhat, diagonal terms
 are right

---
 GPy/likelihoods/Laplace.py              |  21 ++--
 GPy/likelihoods/likelihood_functions.py |   4 +-
 GPy/testing/laplace_approx.tests.py     | 123 ++++++++++++++++++++++++
 3 files changed, 140 insertions(+), 8 deletions(-)
 create mode 100644 GPy/testing/laplace_approx.tests.py

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 2af51f2b..ce3f870f 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -96,7 +96,10 @@ class Laplace(likelihood):
         #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
 
         a = mdot(dWi_dfhat, Ki, self.f_hat)
-        dytil_dfhat = mdot(dWi_dfhat, Ki, self.f_hat) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
+        b = np.dot(self.Sigma_tilde, Ki)
+        dytil_dfhat = - np.dot(dWi_dfhat, np.dot(Ki, self.f_hat)) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
+        #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N)
+        self.dytil_dfhat = dytil_dfhat
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -330,19 +333,25 @@ class Laplace(likelihood):
 
     def fit_full(self, K):
         """
-        The laplace approximation algorithm
+        The laplace approximation algorithm, find K and expand hessian
         For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
         :K: Covariance matrix
         """
         self.K = K.copy()
-        #assert np.all(self.K.T == self.K)
-        #self.K_safe = K.copy()
+
+        #Find mode
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
             self.f_hat = self.ncg_mode(K)
 
+        #Compute hessian and other variables at mode
+        self._compute_likelihood_variables()
+
+    def _compute_likelihood_variables(self):
         #At this point get the hessian matrix
+        #print "Data: ", self.data
+        #print "fhat: ", self.f_hat
         self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))
 
         if not self.likelihood_function.log_concave:
@@ -352,14 +361,14 @@ class Laplace(likelihood):
                                        #This is a property only held by non-log-concave likelihoods
 
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
-        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
 
         Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
         self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i)
 
         b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
-        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
+        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b)))
         a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)
         self.ln_K_det = pddet(self.K)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 0d194c01..646293d2 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -10,8 +10,7 @@ from scipy.special import gammaln, gamma
 from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 
 class likelihood_function:
-    """
-    Likelihood class for doing Expectation propagation
+    """ Likelihood class for doing Expectation propagation
 
     :param Y: observed output (Nx1 numpy.darray)
     ..Note:: Y values allowed depend on the likelihood_function used
@@ -241,6 +240,7 @@ class student_t(likelihood_function):
         y = np.squeeze(y)
         f = np.squeeze(f)
         assert y.shape == f.shape
+
         e = y - f
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
         return np.squeeze(hess)
diff --git a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx.tests.py
new file mode 100644
index 00000000..394950d5
--- /dev/null
+++ b/GPy/testing/laplace_approx.tests.py
@@ -0,0 +1,123 @@
+import unittest
+import numpy as np
+
+import GPy
+from GPy.models import GP
+from GPy.util.linalg import pdinv, tdot
+from scipy import linalg
+
+class LikelihoodGradParam(GP):
+    def __init__(self, X, likelihood_function, kernel, param_name=None, function=None, **kwargs):
+        super(LikelihoodGradParam, self).__init__(X, likelihood_function, kernel)
+        self.param_name = param_name
+        self.func = function
+        #self.func_params = kwargs
+        #self.parameter = self.likelihood.__getattribute__(self.param_name)
+
+    def _get_param_names(self):
+        f_hats = ["f_{}".format(i) for i in range(len(self.likelihood.f_hat))]
+        return f_hats
+
+    def _get_params(self):
+        return np.hstack([np.squeeze(self.likelihood.f_hat)])
+        #return np.hstack([self.likelihood.__getattribute__(self.param_name)])
+
+    def hack_dL_dK(self):
+        self.K = self.kern.K(self.X)
+        self.K += self.likelihood.covariance_matrix
+
+        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
+
+        # the gradient of the likelihood wrt the covariance matrix
+        if self.likelihood.YYT is None:
+            alpha, _ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y, lower=1)
+            self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki)
+        else:
+            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
+            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
+            self.dL_dK = 0.5 * (tmp - self.D * self.Ki)
+
+    def _set_params(self, x):
+        self.likelihood.f_hat = x.reshape(self.N, 1)
+        self.likelihood._compute_likelihood_variables()
+        self.hack_dL_dK()
+
+    def log_likelihood(self):
+        return self.func(self.likelihood)[0, 0]
+
+    def _log_likelihood_gradients(self):
+        #gradient = self.likelihood.__getattribute__(self.param_name)
+        self.likelihood._compute_likelihood_variables()
+        self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        gradient = getattr(self.likelihood, self.param_name)
+        #Need to sum over fhats? For dytil_dfhat...
+        #gradient = np.flatten(gradient, axis=0)
+        #return gradient[:, 0]
+        return gradient[0, :]
+
+
+class LaplaceTests(unittest.TestCase):
+    def setUp(self):
+        real_var = 0.1
+        #Start a function, any function
+        #self.X = np.linspace(0.0, 10.0, 30)[:, None]
+        self.X = np.random.randn(2,1)
+        #self.X = np.ones((10,1))
+        Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var
+        self.Y = Y/Y.max()
+        self.kernel = GPy.kern.rbf(self.X.shape[1])
+
+        deg_free = 10000
+        real_sd = np.sqrt(real_var)
+        initial_sd_guess = 1
+
+        t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=initial_sd_guess)
+        self.stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
+        self.stu_t_likelihood.fit_full(self.kernel.K(self.X))
+        self.m = LikelihoodGradParam(self.X, self.stu_t_likelihood, self.kernel, None, None)
+        self.m.constrain_fixed('rbf_v', 1.0898)
+        self.m.constrain_fixed('rbf_l', 1.8651)
+
+    def tearDown(self):
+        self.m = None
+
+    def test_dy_dfhat(self):
+        def ytil(likelihood):
+            Sigma_tilde = likelihood.Sigma_tilde
+            K = likelihood.K
+            Ki, _, _, _ = pdinv(K)
+            f_hat = likelihood.f_hat
+            Sigma, _, _, _ = pdinv(Sigma_tilde)
+            return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat)
+
+        self.m.func = ytil
+        self.m.param_name = 'dytil_dfhat'
+        self.m.randomize()
+        #try:
+        self.m.checkgrad(verbose=1)
+        assert self.m.checkgrad()
+        #except:
+            #import ipdb;ipdb.set_trace()
+
+
+    #def test_dL_dytil(self):
+        #def L(likelihood):
+            ##-0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+            #Sigma_tilde = likelihood.Sigma_tilde
+            #Ki = likelihood.K
+            #f_hat = likelihood.f_hat
+            #Sigma, _, _, _ = pdinv(Sigma_tilde)
+            #return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat)
+
+        #self.m.func = L
+        #self.m.param_name = 'dL_dytil'
+        #m.randomize()
+        ##try:
+        #m.checkgrad(verbose=1)
+        #assert m.checkgrad()
+        #except:
+            #import ipdb;ipdb.set_trace()
+
+if __name__ == "__main__":
+    unittest.main()
+

From 21ae81de29c36ad94d8d7fc412db869c7926719a Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 16 May 2013 12:00:15 +0100
Subject: [PATCH 38/71] Working on doing explicit gradients

---
 GPy/likelihoods/Laplace.py          | 13 +++++++++++++
 GPy/testing/laplace_approx.tests.py |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index ce3f870f..f2197e55 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -97,6 +97,19 @@ class Laplace(likelihood):
 
         a = mdot(dWi_dfhat, Ki, self.f_hat)
         b = np.dot(self.Sigma_tilde, Ki)
+        #dytil_dfhat = np.zeros(self.K.shape)
+        #for col in range(self.N):
+            #for row in range(self.N):
+                #t1 = 0
+                #for l in range(self.N):
+                    #t1 += dWi_dfhat[col, col]*Ki[col,l]*self.f_hat[l, 0]
+                ##t2 = np.zeros((1, self.N))
+                #t2 = np.dot(self.Sigma_tilde, Ki[:, col])
+                ##for k in range(self.N):
+                    ##t2[:] += self.Sigma_tilde[k, k]*Ki[k, col]
+                #dytil_dfhat[row, col] = (t1 + t2)[row]
+        #dytil_dfhat += np.eye(self.N)
+
         dytil_dfhat = - np.dot(dWi_dfhat, np.dot(Ki, self.f_hat)) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
         #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N)
         self.dytil_dfhat = dytil_dfhat
diff --git a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx.tests.py
index 394950d5..73dfbfd6 100644
--- a/GPy/testing/laplace_approx.tests.py
+++ b/GPy/testing/laplace_approx.tests.py
@@ -61,7 +61,7 @@ class LaplaceTests(unittest.TestCase):
         real_var = 0.1
         #Start a function, any function
         #self.X = np.linspace(0.0, 10.0, 30)[:, None]
-        self.X = np.random.randn(2,1)
+        self.X = np.random.randn(9,1)
         #self.X = np.ones((10,1))
         Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var
         self.Y = Y/Y.max()

From e5d7ee972848e5eb5ec1186c3150d9720328076f Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 16 May 2013 12:06:09 +0100
Subject: [PATCH 39/71] FIXED DYTIL_DFHAT

---
 GPy/likelihoods/Laplace.py          | 6 +++---
 GPy/testing/laplace_approx.tests.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index f2197e55..42897f80 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -105,12 +105,12 @@ class Laplace(likelihood):
                     #t1 += dWi_dfhat[col, col]*Ki[col,l]*self.f_hat[l, 0]
                 ##t2 = np.zeros((1, self.N))
                 #t2 = np.dot(self.Sigma_tilde, Ki[:, col])
-                ##for k in range(self.N):
-                    ##t2[:] += self.Sigma_tilde[k, k]*Ki[k, col]
+                ###for k in range(self.N):
+                    ###t2[:] += self.Sigma_tilde[k, k]*Ki[k, col]
                 #dytil_dfhat[row, col] = (t1 + t2)[row]
         #dytil_dfhat += np.eye(self.N)
 
-        dytil_dfhat = - np.dot(dWi_dfhat, np.dot(Ki, self.f_hat)) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
+        dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
         #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N)
         self.dytil_dfhat = dytil_dfhat
         return dL_dytil, dytil_dfhat
diff --git a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx.tests.py
index 73dfbfd6..2b3af2ad 100644
--- a/GPy/testing/laplace_approx.tests.py
+++ b/GPy/testing/laplace_approx.tests.py
@@ -60,8 +60,8 @@ class LaplaceTests(unittest.TestCase):
     def setUp(self):
         real_var = 0.1
         #Start a function, any function
-        #self.X = np.linspace(0.0, 10.0, 30)[:, None]
-        self.X = np.random.randn(9,1)
+        self.X = np.linspace(0.0, 10.0, 30)[:, None]
+        #self.X = np.random.randn(,1)
         #self.X = np.ones((10,1))
         Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var
         self.Y = Y/Y.max()

From 48d693791eabf51e64b28706910a9a9444457825 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 16 May 2013 12:22:37 +0100
Subject: [PATCH 40/71] changed name

---
 GPy/examples/laplace_approximations.py        |  2 +-
 GPy/likelihoods/Laplace.py                    | 25 ++++---------------
 ...pprox.tests.py => laplace_approx_tests.py} |  0
 3 files changed, 6 insertions(+), 21 deletions(-)
 rename GPy/testing/{laplace_approx.tests.py => laplace_approx_tests.py} (100%)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index eb725b53..4d8e96b8 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -39,7 +39,7 @@ def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 30)[:, None]
+    X = np.linspace(0.0, 10.0, 2)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
     X_full = np.linspace(0.0, 10.0, 500)[:, None]
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 42897f80..b0dde03f 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -95,23 +95,7 @@ class Laplace(likelihood):
         #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
         #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
 
-        a = mdot(dWi_dfhat, Ki, self.f_hat)
-        b = np.dot(self.Sigma_tilde, Ki)
-        #dytil_dfhat = np.zeros(self.K.shape)
-        #for col in range(self.N):
-            #for row in range(self.N):
-                #t1 = 0
-                #for l in range(self.N):
-                    #t1 += dWi_dfhat[col, col]*Ki[col,l]*self.f_hat[l, 0]
-                ##t2 = np.zeros((1, self.N))
-                #t2 = np.dot(self.Sigma_tilde, Ki[:, col])
-                ###for k in range(self.N):
-                    ###t2[:] += self.Sigma_tilde[k, k]*Ki[k, col]
-                #dytil_dfhat[row, col] = (t1 + t2)[row]
-        #dytil_dfhat += np.eye(self.N)
-
         dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
-        #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N)
         self.dytil_dfhat = dytil_dfhat
         return dL_dytil, dytil_dfhat
 
@@ -219,10 +203,10 @@ class Laplace(likelihood):
 
         dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
         dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-        #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
-        KW_I_i = self.Bi # could use self.B_chol??
+        KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
+        #KW_I_i = self.Bi # could use self.B_chol??
         dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL))
-        #dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
+        dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
 
         dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
 
@@ -383,7 +367,8 @@ class Laplace(likelihood):
         b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b)))
         a = b - mdot(self.W_12, solve_chol)
-        self.f_Ki_f = np.dot(self.f_hat.T, a)
+        self.Ki_f = a
+        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
         self.ln_K_det = pddet(self.K)
 
         self.ln_z_hat = (- 0.5*self.f_Ki_f
diff --git a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx_tests.py
similarity index 100%
rename from GPy/testing/laplace_approx.tests.py
rename to GPy/testing/laplace_approx_tests.py

From 146d7e2458cbfc69f8303b0b413e50cebf7fd7f7 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 17 May 2013 17:42:00 +0100
Subject: [PATCH 41/71] Trying to fix dL_dytil gradient

---
 GPy/likelihoods/Laplace.py          |  23 +++++-
 GPy/testing/laplace_approx_tests.py | 109 +++++++++++++++++-----------
 2 files changed, 84 insertions(+), 48 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index b0dde03f..af20d36a 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -79,16 +79,29 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        dL_dytil = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
+        Ki, _, _, _ = pdinv(self.K)
+
+        #Y__KS_i = np.dot(self.Y.T, inv(self.K+self.Sigma_tilde))
+        #dL_dytil = -0.5*Y__KS_i #or *0.5? Shouldn't this be -y*R
+        #dL_dytil = -0.5*np.trace(np.dot(inv(self.K+self.Sigma_tilde), (np.dot(self.Y, self.Y.T) + self.Y.T)))
+        #dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde),
+        #dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y)
+        c = inv(self.K+self.Sigma_tilde)
+        dL_dytil_simple_term =  -0.5*np.diag(np.dot(c, self.Y) + np.dot(self.Y.T, c))
+
+        P = np.diagflat(1/np.dot(Ki, self.f_hat))
+        K_Wi_i = inv(self.K+self.Sigma_tilde)
+
+        dL_dytil_difficult_term = np.diag(( -0.5*(np.dot(self.K + self.Sigma_tilde, P))
+                                            +0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P)
+                                           ) * np.eye(self.N))
+        dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term
 
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         Wi = np.diagonal(self.Sigma_tilde) #Convenience
         #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
         dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
 
-        Ki, _, _, _ = pdinv(self.K)
-        #dytil_dfhat_implicit = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
-        #dytil_dfhat = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
 
         #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
         #dytil_dfhat_explicit = self.Wi__Ki_W
@@ -97,6 +110,8 @@ class Laplace(likelihood):
 
         dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
         self.dytil_dfhat = dytil_dfhat
+        #dytil_dfhat = np.eye(dytil_dfhat.shape[0])
+        self.dL_dfhat = np.dot(dL_dytil, dytil_dfhat) #FIXME: Purely for checkgradding....
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
diff --git a/GPy/testing/laplace_approx_tests.py b/GPy/testing/laplace_approx_tests.py
index 2b3af2ad..acb1c822 100644
--- a/GPy/testing/laplace_approx_tests.py
+++ b/GPy/testing/laplace_approx_tests.py
@@ -1,26 +1,29 @@
 import unittest
 import numpy as np
+np.random.seed(82)
 
 import GPy
 from GPy.models import GP
 from GPy.util.linalg import pdinv, tdot
 from scipy import linalg
 
-class LikelihoodGradParam(GP):
-    def __init__(self, X, likelihood_function, kernel, param_name=None, function=None, **kwargs):
-        super(LikelihoodGradParam, self).__init__(X, likelihood_function, kernel)
+class LikelihoodParamGrad(GP):
+    def __init__(self, X=None, likelihood_function=None, kernel=None, param_name=None, function=None, dparam_name=None, **kwargs):
         self.param_name = param_name
+        self.dparam_name = dparam_name
         self.func = function
+        super(LikelihoodParamGrad, self).__init__(X, likelihood_function, kernel)
         #self.func_params = kwargs
         #self.parameter = self.likelihood.__getattribute__(self.param_name)
 
     def _get_param_names(self):
-        f_hats = ["f_{}".format(i) for i in range(len(self.likelihood.f_hat))]
-        return f_hats
+        params = getattr(self.likelihood, self.dparam_name)
+        params_names = ["{}_{}".format(self.dparam_name, i) for i in range(len(params))]
+        return params_names
 
     def _get_params(self):
-        return np.hstack([np.squeeze(self.likelihood.f_hat)])
-        #return np.hstack([self.likelihood.__getattribute__(self.param_name)])
+        params = getattr(self.likelihood, self.dparam_name)
+        return np.hstack([params])
 
     def hack_dL_dK(self):
         self.K = self.kern.K(self.X)
@@ -38,29 +41,56 @@ class LikelihoodGradParam(GP):
             self.dL_dK = 0.5 * (tmp - self.D * self.Ki)
 
     def _set_params(self, x):
-        self.likelihood.f_hat = x.reshape(self.N, 1)
+        raise NotImplementedError
+
+    def log_likelihood(self):
+        raise NotImplementedError
+
+    def _log_likelihood_gradients(self):
+        raise NotImplementedError
+
+
+class Likelihood_F_Grad(LikelihoodParamGrad):
+    def __init__(self, **kwargs):
+        super(Likelihood_F_Grad, self).__init__(**kwargs)
+
+    def _set_params(self, x):
+        params = getattr(self.likelihood, self.dparam_name)
+        setattr(self.likelihood, self.dparam_name, x.reshape(*params.shape))
         self.likelihood._compute_likelihood_variables()
         self.hack_dL_dK()
 
     def log_likelihood(self):
-        return self.func(self.likelihood)[0, 0]
+        ll = self.func(self)
+        if self.param_name == "dL_dfhat_":
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        if len(ll.shape) == 0 or len(ll.shape) == 1:
+            return ll.sum()
+        elif len(ll.shape) == 2:
+            #print "Only checking first likelihood"
+            return ll[0, 0]
+        else:
+            raise ValueError('Not implemented for larger matricies yet')
+        return ll
 
     def _log_likelihood_gradients(self):
-        #gradient = self.likelihood.__getattribute__(self.param_name)
         self.likelihood._compute_likelihood_variables()
         self.likelihood._gradients(partial=np.diag(self.dL_dK))
         gradient = getattr(self.likelihood, self.param_name)
-        #Need to sum over fhats? For dytil_dfhat...
-        #gradient = np.flatten(gradient, axis=0)
-        #return gradient[:, 0]
-        return gradient[0, :]
+        if len(gradient.shape) == 1:
+            return gradient
+        elif len(gradient.shape) == 2:
+            #print "Only checking first gradients"
+            return gradient[0,: ]
+        else:
+            raise ValueError('Not implemented for larger matricies yet')
 
 
 class LaplaceTests(unittest.TestCase):
     def setUp(self):
         real_var = 0.1
         #Start a function, any function
-        self.X = np.linspace(0.0, 10.0, 30)[:, None]
+        self.X = np.linspace(0.0, 10.0, 4)[:, None]
         #self.X = np.random.randn(,1)
         #self.X = np.ones((10,1))
         Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var
@@ -74,49 +104,40 @@ class LaplaceTests(unittest.TestCase):
         t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=initial_sd_guess)
         self.stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
         self.stu_t_likelihood.fit_full(self.kernel.K(self.X))
-        self.m = LikelihoodGradParam(self.X, self.stu_t_likelihood, self.kernel, None, None)
-        self.m.constrain_fixed('rbf_v', 1.0898)
-        self.m.constrain_fixed('rbf_l', 1.8651)
 
     def tearDown(self):
         self.m = None
 
     def test_dy_dfhat(self):
-        def ytil(likelihood):
-            Sigma_tilde = likelihood.Sigma_tilde
-            K = likelihood.K
+        def ytil(self):
+            Sigma_tilde = self.likelihood.Sigma_tilde
+            K = self.likelihood.K
             Ki, _, _, _ = pdinv(K)
-            f_hat = likelihood.f_hat
+            f_hat = self.likelihood.f_hat
             Sigma, _, _, _ = pdinv(Sigma_tilde)
             return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat)
 
-        self.m.func = ytil
-        self.m.param_name = 'dytil_dfhat'
+        self.m = Likelihood_F_Grad(X=self.X, likelihood_function=self.stu_t_likelihood,
+                                   kernel=self.kernel, param_name='dytil_dfhat',
+                                   function=ytil, dparam_name='f_hat')
+        #self.m.constrain_fixed('rbf_v', 1.0898)
+        #self.m.constrain_fixed('rbf_l', 1.8651)
         self.m.randomize()
-        #try:
         self.m.checkgrad(verbose=1)
         assert self.m.checkgrad()
-        #except:
-            #import ipdb;ipdb.set_trace()
 
+    def test_dL_dfhat(self):
+        def L(self):
+            return np.array(-0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z)
 
-    #def test_dL_dytil(self):
-        #def L(likelihood):
-            ##-0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
-            #Sigma_tilde = likelihood.Sigma_tilde
-            #Ki = likelihood.K
-            #f_hat = likelihood.f_hat
-            #Sigma, _, _, _ = pdinv(Sigma_tilde)
-            #return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat)
-
-        #self.m.func = L
-        #self.m.param_name = 'dL_dytil'
-        #m.randomize()
-        ##try:
-        #m.checkgrad(verbose=1)
-        #assert m.checkgrad()
-        #except:
-            #import ipdb;ipdb.set_trace()
+        self.m = Likelihood_F_Grad(X=self.X, likelihood_function=self.stu_t_likelihood,
+                                    kernel=self.kernel, param_name='dL_dfhat',
+                                    function=L, dparam_name='f_hat')
+        self.m.constrain_fixed('rbf_v', 1.0898)
+        self.m.constrain_fixed('rbf_l', 1.8651)
+        self.m.randomize()
+        self.m.checkgrad(verbose=1)
+        assert self.m.checkgrad()
 
 if __name__ == "__main__":
     unittest.main()

From d63d370641846642bdc02f0295177f7f37b5f5fb Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 29 May 2013 13:46:55 +0100
Subject: [PATCH 42/71] About to rip out old chain rule method of learning
 gradients

---
 GPy/likelihoods/Laplace.py          | 4 +++-
 GPy/testing/laplace_approx_tests.py | 3 +--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index af20d36a..666fa227 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -87,7 +87,7 @@ class Laplace(likelihood):
         #dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde),
         #dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y)
         c = inv(self.K+self.Sigma_tilde)
-        dL_dytil_simple_term =  -0.5*np.diag(np.dot(c, self.Y) + np.dot(self.Y.T, c))
+        dL_dytil_simple_term = -0.5*np.diag(2*np.dot(c, self.Y))
 
         P = np.diagflat(1/np.dot(Ki, self.f_hat))
         K_Wi_i = inv(self.K+self.Sigma_tilde)
@@ -96,6 +96,7 @@ class Laplace(likelihood):
                                             +0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P)
                                            ) * np.eye(self.N))
         dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term
+        dL_dytil = dL_dytil.reshape(1, self.N)
 
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         Wi = np.diagonal(self.Sigma_tilde) #Convenience
@@ -329,6 +330,7 @@ class Laplace(likelihood):
                    #+ y_W_f
                    #+ self.ln_z_hat
                    #)
+        self.Z_tilde = 0
 
         ##Check it isn't singular!
         if cond(self.W) > epsilon:
diff --git a/GPy/testing/laplace_approx_tests.py b/GPy/testing/laplace_approx_tests.py
index acb1c822..15d84c9c 100644
--- a/GPy/testing/laplace_approx_tests.py
+++ b/GPy/testing/laplace_approx_tests.py
@@ -62,8 +62,6 @@ class Likelihood_F_Grad(LikelihoodParamGrad):
 
     def log_likelihood(self):
         ll = self.func(self)
-        if self.param_name == "dL_dfhat_":
-            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         if len(ll.shape) == 0 or len(ll.shape) == 1:
             return ll.sum()
         elif len(ll.shape) == 2:
@@ -128,6 +126,7 @@ class LaplaceTests(unittest.TestCase):
 
     def test_dL_dfhat(self):
         def L(self):
+            #return np.array(-0.5 * self.D * self.K_logdet + self._model_fit_term()) #Ignore Z for now
             return np.array(-0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z)
 
         self.m = Likelihood_F_Grad(X=self.X, likelihood_function=self.stu_t_likelihood,

From 117c377d13efe81b2df567936ff48e85f918efcd Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 29 May 2013 14:02:03 +0100
Subject: [PATCH 43/71] Ripped out all things Laplace parameter estimation,
 starting again with new tactic

---
 GPy/likelihoods/Laplace.py | 175 +------------------------------------
 GPy/models/GP.py           |   8 +-
 2 files changed, 4 insertions(+), 179 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 666fa227..69c0876b 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -79,187 +79,18 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        Ki, _, _, _ = pdinv(self.K)
-
-        #Y__KS_i = np.dot(self.Y.T, inv(self.K+self.Sigma_tilde))
-        #dL_dytil = -0.5*Y__KS_i #or *0.5? Shouldn't this be -y*R
-        #dL_dytil = -0.5*np.trace(np.dot(inv(self.K+self.Sigma_tilde), (np.dot(self.Y, self.Y.T) + self.Y.T)))
-        #dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde),
-        #dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y)
-        c = inv(self.K+self.Sigma_tilde)
-        dL_dytil_simple_term = -0.5*np.diag(2*np.dot(c, self.Y))
-
-        P = np.diagflat(1/np.dot(Ki, self.f_hat))
-        K_Wi_i = inv(self.K+self.Sigma_tilde)
-
-        dL_dytil_difficult_term = np.diag(( -0.5*(np.dot(self.K + self.Sigma_tilde, P))
-                                            +0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P)
-                                           ) * np.eye(self.N))
-        dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term
-        dL_dytil = dL_dytil.reshape(1, self.N)
-
-        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
-        Wi = np.diagonal(self.Sigma_tilde) #Convenience
-        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
-        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-
-
-        #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
-        #dytil_dfhat_explicit = self.Wi__Ki_W
-        #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
-        #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
-
-        dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
-        self.dytil_dfhat = dytil_dfhat
-        #dytil_dfhat = np.eye(dytil_dfhat.shape[0])
-        self.dL_dfhat = np.dot(dL_dytil, dytil_dfhat) #FIXME: Purely for checkgradding....
-        return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
         """
-                           #explicit                #implicit                     #implicit
-        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
-        :param dL_d_K_Sigma: Derivative of marginal with respect to K_prior+Sigma_tilde (posterior covariance)
-        :param dK_dthetaK: explcit derivative of kernel with respect to its hyper paramers
-        :returns: dL_dthetaK - gradients of marginal likelihood w.r.t changes in K hyperparameters
+        Gradients with respect to prior kernel parameters
         """
-        dL_dytil, dytil_dfhat = self._shared_gradients_components()
-
-        #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
-
-        #print "Computing K gradients"
-        #print "dytil_dfhat: ", np.mean(dytil_dfhat)
-        #I = np.eye(self.N)
-        #C = np.dot(self.K, self.W)
-        #A = I + C
-        #plt.imshow(A)
-        #plt.show()
-
-        #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
-        #B = I + w12*K*w12
-        I_KW_i = self.Bi # could use self.B_chol??
-
-        #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
-        #Derivative for each f dimension, for each of K's hyper parameters
-        dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
-        grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-        for ind_j, thetaj in enumerate(dK_dthetaK):
-            #dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad)))
-            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad)
-
-        print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat)
-        print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK)
-        dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
-        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
-        print "\n"
-
-        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
-        dL_dSigma = dL_d_K_Sigma
-        #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
-                     #explicit           #implicit
-        #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
-        #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
-
-        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
-        Wi = np.diagonal(self.Sigma_tilde) #Convenience
-        dSigma_dthetaK_explicit = 0
-        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
-        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-        #dSigma_dthetaK_implicit = -np.sum(np.dot(dWi_dfhat, dfhat_dthetaK), axis=0)
-        dSigma_dthetaK_implicit = np.dot(dWi_dfhat, dfhat_dthetaK)
-        dSigma_dthetaK = dSigma_dthetaK_explicit + dSigma_dthetaK_implicit
-        #dSigma_dthetaK = 0 + np.dot(, dfhat_dthetaK)
-        #for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
-            #dSigma_dthetaK_explicit = 0
-            #dSigma_dthetaK_implicit = -np.dot(Wi, dW_dfhat
-            #dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
-
-        #FIXME: Won't handle multi dimensional data
-        dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
-        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
-        dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
-
-        print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil)
-        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
-        print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
-        print "\n"
-        print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma)
-        print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK)
-        print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
-        print "\n"
-        print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
-
-        return np.squeeze(dL_dthetaK_implicit)
+        return dL_dthetaK
 
     def _gradients(self, partial):
         """
         Gradients with respect to likelihood parameters
-
-        Complicated, it differs for parameters of the kernel \theta_{K}, and
-        parameters of the likelihood, \theta_{L}
-
-        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
-        dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL)
-        dL_dK*dK_dthetaL = 0
-
-        dytil_dthetaX = dytil_dfhat * dfhat_dthetaX
-        dytil_dfhat = Sigma*Ki + I
-
-        fhat = K*log_p(y|fhat)                                          from rasm p125
-        dfhat_dthetaK = (I + KW)i * dK_dthetaK * log_p(y|fhat)          from rasm p125
-
-        dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi
-        dW_dthetaX = d_dthetaX[d2phi_d2fhat]
-        d2phi_d2fhat = Hessian function of likelihood
-
-        partial = dL_d_K_Sigma
         """
-        dL_dytil, dytil_dfhat = self._shared_gradients_components()
-        #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-
-        dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-        dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-        KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
-        #KW_I_i = self.Bi # could use self.B_chol??
-        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL))
-        dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
-
-        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
-
-        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
-        dL_dSigma = np.diagflat(partial) #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
-
-        Wi = np.diagonal(self.Sigma_tilde) #Convenience
-        #-1 as we are looking at W which is -1*d2log p(y|f)
-        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
-        dSigma_dthetaL_explicit = np.diagflat(-1*(Wi*(-1*d2likelihood_dthetaL)*Wi))
-
-        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
-        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL)
-        dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
-
-        #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-        #Derivative for each f dimension, for each of K's hyper parameters
-        #dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
-        #for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
-            #dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
-                                             #dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
-                                             #self.Sigma_tilde
-                                             #)
-
-        #TODO: This is Wi*A*Wi, can be more numerically stable with a trick
-        #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
-
-        #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
-        #dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
-        #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
-
-        dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
-        dL_dthetaL_via_Sigma = np.sum(np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0))
-        dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
-
-        return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
+        return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
         """
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 17e2a1b1..da379eb1 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -150,14 +150,8 @@ class GP(model):
             fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
-            #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
-            dL_dthetaK_implicit = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
-            dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
-
-            #print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
-
+            dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            #print "dL_dthetaL: ", dL_dthetaL
             print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From 23ed2a2d15c28fe5d868639ad1358024808a328f Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 29 May 2013 17:33:06 +0100
Subject: [PATCH 44/71] Lots of name changing and went through all likelihood
 gradients again

---
 GPy/examples/laplace_approximations.py  | 27 ++++---
 GPy/likelihoods/Laplace.py              | 35 +++++++--
 GPy/likelihoods/likelihood_functions.py | 96 +++++++++++++++----------
 GPy/models/GP.py                        |  2 +-
 4 files changed, 103 insertions(+), 57 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 4d8e96b8..27f063dc 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -69,22 +69,21 @@ def debug_student_t_noise_approx():
     print "Clean Gaussian"
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
-    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
-    # optimize
-    m.ensure_default_constraints()
-    m.optimize()
-    # plot
-    if plot:
-        plt.figure(1)
-        plt.suptitle('Gaussian likelihood')
-        plt.subplot(131)
-        m.plot()
-        plt.plot(X_full, Y_full)
-    print m
+    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    ## optimize
+    #m.ensure_default_constraints()
+    #m.optimize()
+    ## plot
+    #if plot:
+        #plt.figure(1)
+        #plt.suptitle('Gaussian likelihood')
+        #plt.subplot(131)
+        #m.plot()
+        #plt.plot(X_full, Y_full)
+    #print m
 
     edited_real_sd = initial_var_guess #real_sd
 
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
@@ -95,10 +94,10 @@ def debug_student_t_noise_approx():
     m.constrain_positive('t_noi')
     #m.constrain_fixed('t_noise_variance', real_sd)
     m.update_likelihood_approximation()
+    m.optimize('scg', messages=True)
     print(m)
     return m
     #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
-    m.optimize('scg', messages=True)
     if plot:
         plt.suptitle('Student-t likelihood')
         plt.subplot(132)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 69c0876b..f8ba25f1 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -79,17 +79,40 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
+        Ki, _, _, _ = pdinv(self.K)
+        Ki_W_i = inv(Ki + self.W) #Do it non numerically stable for now
+        d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
+        dL_dfhat = -0.5*np.dot(np.diag(Ki_W_i), d3lik_d3fhat)
+        KW = np.dot(self.K, self.W)
+        I_KW_i = inv(np.eye(KW.shape[0]) + KW)
+        return dL_dfhat, Ki, I_KW_i
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
         """
         Gradients with respect to prior kernel parameters
         """
+        dL_dfhat, Ki, I_KW_i = self._shared_gradients_components()
+        K_Wi_i = inv(self.K + inv(self.W))
+        dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
+
+        dL_dthetaK = np.zeros(dK_dthetaK.shape)
+        for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
+            #Explicit
+            dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(np.dot(K_Wi_i, dK_dthetaK_i))
+            #Implicit
+            df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK, dlp)
+            dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
+
         return dL_dthetaK
 
     def _gradients(self, partial):
         """
         Gradients with respect to likelihood parameters
         """
+        dL_dfhat, Ki, I_KW_i = self._shared_gradients_components()
+        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
+        dL_dthetaL = np.zeros(dlik_dthetaL.shape)
+
         return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
@@ -197,7 +220,7 @@ class Laplace(likelihood):
         #At this point get the hessian matrix
         #print "Data: ", self.data
         #print "fhat: ", self.f_hat
-        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))
+        self.W = -np.diag(self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data))
 
         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
@@ -212,7 +235,7 @@ class Laplace(likelihood):
         Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
         self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i)
 
-        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
+        b = np.dot(self.W, self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b)))
         a = b - mdot(self.W_12, solve_chol)
         self.Ki_f = a
@@ -259,11 +282,11 @@ class Laplace(likelihood):
             return float(res)
 
         def obj_grad(f):
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
+            res = -1 * (self.likelihood_function.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
             return np.squeeze(res)
 
         def obj_hess(f):
-            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
+            res = -1 * (--np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
             return np.squeeze(res)
 
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
@@ -294,7 +317,7 @@ class Laplace(likelihood):
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             #f_old = f.copy()
-            W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
+            W = -np.diag(self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data))
             if not self.likelihood_function.log_concave:
                 W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@@ -303,7 +326,7 @@ class Laplace(likelihood):
             B, L, W_12 = self._compute_B_statistics(K, W)
 
             W_f = np.dot(W, f)
-            grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None]
+            grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)[:, None]
             #Find K_i_f
             b = W_f + grad
 
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 646293d2..d75e7218 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -159,10 +159,10 @@ class student_t(likelihood_function):
     d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free, sigma=2):
+        #super(student_t, self).__init__()
         self.v = deg_free
         self.sigma = sigma
         self.log_concave = False
-        #super(student_t, self).__init__()
 
         self._set_params(np.asarray(sigma))
 
@@ -174,8 +174,6 @@ class student_t(likelihood_function):
 
     def _set_params(self, x):
         self.sigma = float(x)
-        #self.covariance_matrix = np.eye(self.N)*self._variance
-        #self.precision = 1./self._variance
 
     @property
     def variance(self, extra_data=None):
@@ -185,6 +183,8 @@ class student_t(likelihood_function):
         """link_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
 
+        For wolfram alpha import parts for derivative of sigma are -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+
         :y: data
         :f: latent variables f
         :extra_data: extra_data which is not used in student t distribution
@@ -198,17 +198,16 @@ class student_t(likelihood_function):
         e = y - f
         objective = (gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
-                     + np.log(self.sigma * np.sqrt(self.v * np.pi))
-                     - (self.v + 1) * 0.5
-                     * np.log(1 + ((e**2 / self.sigma**2) / self.v))
-                     )
+                     - np.log(self.sigma * np.sqrt(self.v * np.pi))
+                     - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v))
+                    )
         return np.sum(objective)
 
-    def link_grad(self, y, f, extra_data=None):
+    def dlik_df(self, y, f, extra_data=None):
         """
         Gradient of the link function at y, given f w.r.t f
 
-        $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{-(v+1)(f_{i}-y_{i})}{(f_{i}-y_{i})^{2} + \sigma^{2}v}$$
 
         :y: data
         :f: latent variables f
@@ -220,17 +219,17 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        grad = -((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
         return np.squeeze(grad)
 
-    def link_hess(self, y, f, extra_data=None):
+    def d2lik_d2f(self, y, f, extra_data=None):
         """
         Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
         i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
 
         Will return diagonal of hessian, since every where else it is 0
 
-        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((f_{i}-y_{i})^{2} - \sigma^{2}v)}{((f_{i}-y_{i})^{2} + \sigma^{2}v)^{2}}$$
 
         :y: data
         :f: latent variables f
@@ -245,54 +244,79 @@ class student_t(likelihood_function):
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
         return np.squeeze(hess)
 
-    def d3link(self, y, f, extra_data=None):
+    def d3lik_d3f(self, y, f, extra_data=None):
         """
         Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
 
-        $$\frac{2(v+1)((y-f)^{3} - 3\sigma^{2}v(y-f))}{((y-f)^{2} + \sigma^{2}v)^{3}}$$
+        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((f_{i} - y_{i})^3 - 3(f_{i} - y_{i}) \sigma^{2} v))}{((f_{i} - y_{i}) + \sigma^{2} v)^3}$$
         """
         y = np.squeeze(y)
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        d3link_d3f = (  (2*(self.v + 1)*(-1*e)*(e**2 - 3*(self.sigma**2)*self.v))
-                      / ((e**2 + (self.sigma**2)*self.v)**3)
-                     )
-        return np.squeeze(d3link_d3f)
+        d3lik_d3f = ( -(2*(self.v + 1)*(e**3 - e*3*self.v*(self.sigma**2))) /
+                       ((e**2 + (self.sigma**2)*self.v)**3)
+                    )
+        return np.squeeze(d3lik_d3f)
 
-    def link_hess_grad_std(self, y, f, extra_data=None):
+    def link_dstd(self, y, f, extra_data=None):
         """
-        Gradient of the hessian w.r.t sigma parameter (standard deviation)
+        Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
 
-        $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}}
+        Terms relavent to derivatives wrt sigma are:
+        -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+
+        $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
         """
         y = np.squeeze(y)
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        hess_grad_sigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
-                           / ((e**2 + (self.sigma**2)*self.v)**3)
-                          )
-        return np.squeeze(hess_grad_sigma)
+        dlik_dsigma = ( (1/self.sigma) -
+                        ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + (e**2) / ((self.sigma**2)*self.v) ) )
+                      )
+        return np.squeeze(dlik_dsigma)
 
-    def link_grad_std(self, y, f, extra_data=None):
+    def dlik_df_dstd(self, y, f, extra_data=None):
         """
-        Gradient of the likelihood w.r.t sigma parameter (standard deviation)
+        Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
 
-        $$\frac{-2\sigma(v+1)(y-f)}{(v\sigma^{2} + (y-f)^{2})^{2}}$$
+        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{2\sigma v(v + 1)(f-y)}{(f-y)^2 + \sigma^2 v)^2}$$
         """
         y = np.squeeze(y)
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
-                      / ((self.v*(self.sigma**2) + e**2)**2)
-                     )
-        return np.squeeze(grad_sigma)
+        dlik_grad_dsigma = ((2*self.sigma*self.v*(self.v + 1)*e)
+                            / ((self.v*(self.sigma**2) + e**2)**2)
+                           )
+        return np.squeeze(dlik_grad_dsigma)
+
+    def d2lik_d2f_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
+
+        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{(v + 1)((f-y)^2 - \sigma^2 v)}{((f-y)^2 + \sigma^2 v)}$$
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        dlik_hess_dsigma = ( ((v + 1)*(e**2 - (self.sigma**2)*self.v)) /
+                             ((e**2 + (self.sigma**2)*self.v)**2)
+                           )
+        return np.squeeze(dlik_hess_dsigma)
 
     def _gradients(self, y, f, extra_data=None):
-        return [self.link_grad_std(y, f, extra_data=extra_data),
-                self.link_hess_grad_std(y, f, extra_data=extra_data)] # list as we might learn many parameters
+        derivs = ([self.link_dstd(y, f, extra_data=extra_data)],
+                  [self.dlik_df_dstd(y, f, extra_data=extra_data)],
+                  [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+                 ) # lists as we might learn many parameters
+        # ensure we have gradients for every parameter we want to optimize
+        assert len(derivs[0]) == len(self._get_param_names())
+        assert len(derivs[1]) == len(self._get_param_names())
+        assert len(derivs[2]) == len(self._get_param_names())
+        return derivs
 
     def predictive_values(self, mu, var):
         """
@@ -412,7 +436,7 @@ class weibull_survival(likelihood_function):
         objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f)  # FIXME: CHECK THIS WITH BOOK, wheres scale?
         return np.sum(objective)
 
-    def link_grad(self, y, f, extra_data=None):
+    def dlik_df(self, y, f, extra_data=None):
         """
         Gradient of the link function at y, given f w.r.t f
 
@@ -432,7 +456,7 @@ class weibull_survival(likelihood_function):
         grad = v - (y**self.shape)*np.exp(f)
         return np.squeeze(grad)
 
-    def link_hess(self, y, f, extra_data=None):
+    def d2lik_d2f(self, y, f, extra_data=None):
         """
         Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
         i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index da379eb1..0b5a8db6 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -147,7 +147,7 @@ class GP(model):
         if isinstance(self.likelihood, Laplace):
             dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
+            fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
             dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)

From 20227fb2ac2c0d173eed515c7870864147a5d5d5 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 30 May 2013 16:17:37 +0100
Subject: [PATCH 45/71] Made more numerically stable in a hope that it will
 work and I will find a bug...

---
 GPy/examples/laplace_approximations.py  | 10 +++---
 GPy/likelihoods/Laplace.py              | 45 ++++++++++++++++---------
 GPy/likelihoods/likelihood_functions.py |  5 +--
 GPy/models/GP.py                        |  7 ++--
 4 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 27f063dc..203d308d 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -37,9 +37,9 @@ def timing():
 
 def debug_student_t_noise_approx():
     plot = False
-    real_var = 0.1
+    real_var = 0.4
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 2)[:, None]
+    X = np.linspace(0.0, 10.0, 100)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
     X_full = np.linspace(0.0, 10.0, 500)[:, None]
@@ -89,12 +89,12 @@ def debug_student_t_noise_approx():
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     #m.constrain_positive('rbf')
-    m.constrain_fixed('rbf_v', 1.0898)
-    m.constrain_fixed('rbf_l', 1.8651)
+    #m.constrain_fixed('rbf_v', 1.0898)
+    #m.constrain_fixed('rbf_l', 1.8651)
     m.constrain_positive('t_noi')
     #m.constrain_fixed('t_noise_variance', real_sd)
     m.update_likelihood_approximation()
-    m.optimize('scg', messages=True)
+    m.optimize(messages=True)
     print(m)
     return m
     #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index f8ba25f1..85af82f9 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -79,41 +79,54 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
+        #FIXME: Careful of side effects! And make sure W and K are up to date!
         Ki, _, _, _ = pdinv(self.K)
-        Ki_W_i = inv(Ki + self.W) #Do it non numerically stable for now
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
-        dL_dfhat = -0.5*np.dot(np.diag(Ki_W_i), d3lik_d3fhat)
-        KW = np.dot(self.K, self.W)
-        I_KW_i = inv(np.eye(KW.shape[0]) + KW)
-        return dL_dfhat, Ki, I_KW_i
+        #dL_dfhat = -0.5*np.diag(self.Ki_W_i)*d3lik_d3fhat
+        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)*d3lik_d3fhat)[:, None]
+        Wi_K_i = mdot(self.W_12, self.Bi, self.W_12) #same as rasms R
+        I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i)
+        return dL_dfhat, Ki, I_KW_i, Wi_K_i
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
         """
         Gradients with respect to prior kernel parameters
         """
-        dL_dfhat, Ki, I_KW_i = self._shared_gradients_components()
-        K_Wi_i = inv(self.K + inv(self.W))
-        dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
+        dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components()
+        dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)[:, None]
 
         dL_dthetaK = np.zeros(dK_dthetaK.shape)
         for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
             #Explicit
-            dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(np.dot(K_Wi_i, dK_dthetaK_i))
+            dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(Wi_K_i*dK_dthetaK_i)
             #Implicit
-            df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK, dlp)
+            df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp)
             dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
 
-        return dL_dthetaK
+        return np.squeeze(dL_dthetaK)
 
     def _gradients(self, partial):
         """
         Gradients with respect to likelihood parameters
         """
-        dL_dfhat, Ki, I_KW_i = self._shared_gradients_components()
+        dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components()
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
-        dL_dthetaL = np.zeros(dlik_dthetaL.shape)
 
-        return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
+        num_params = len(dlik_dthetaL)
+        #Ki_W_i = np.diag(inv(Ki + self.W))[:, None]
+        dL_dthetaL = np.zeros((1, num_params)) # make space for one derivative for each likelihood parameter
+        for thetaL_i in range(num_params):
+            #Explicit
+            #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
+            #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None])
+            #                                               might be +
+            dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            #Implicit
+            df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+            dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL)
+
+        return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
         """
@@ -232,8 +245,8 @@ class Laplace(likelihood):
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
 
-        Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
-        self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i)
+        self.Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
+        self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i)
 
         b = np.dot(self.W, self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b)))
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index d75e7218..c6186137 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -302,12 +302,13 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        dlik_hess_dsigma = ( ((v + 1)*(e**2 - (self.sigma**2)*self.v)) /
+        dlik_hess_dsigma = ( ((self.v + 1)*(e**2 - (self.sigma**2)*self.v)) /
                              ((e**2 + (self.sigma**2)*self.v)**2)
                            )
-        return np.squeeze(dlik_hess_dsigma)
+        return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):
+        #must be listed in same order as 'get_param_names'
         derivs = ([self.link_dstd(y, f, extra_data=extra_data)],
                   [self.dlik_df_dstd(y, f, extra_data=extra_data)],
                   [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 0b5a8db6..9ce83a5a 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -69,7 +69,6 @@ class GP(model):
         self.likelihood._set_params(p[self.kern.Nparam_transformed():])  # test by Nicolas
 
         if isinstance(self.likelihood, Laplace):
-            print "Updating approx: ", p
             self.likelihood.fit_full(self.kern.K(self.X))
             self.likelihood._set_params(self.likelihood._get_params())
 
@@ -134,7 +133,6 @@ class GP(model):
         matrix K* = K + diag(1./tau_tilde) plus a normalization term.
         """
         l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
-        print "Log likelihood: ", l
         return l
 
     def _log_likelihood_gradients(self):
@@ -145,17 +143,16 @@ class GP(model):
         """
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
         if isinstance(self.likelihood, Laplace):
-            dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
             fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
             dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+            #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+            #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
 

From f9857e08c0b4f130f2ae8ace5264e9ba65d9687c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 31 May 2013 11:55:32 +0100
Subject: [PATCH 46/71] Broken it by getting rid of squeeze, but now working on
 making it faster using proper vector multiplication for diagonals

---
 GPy/examples/laplace_approximations.py  | 12 +++--
 GPy/likelihoods/Laplace.py              | 45 ++++++----------
 GPy/likelihoods/likelihood_functions.py | 69 +++++++++++++------------
 GPy/models/GP.py                        | 13 ++++-
 4 files changed, 69 insertions(+), 70 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 203d308d..5103eefb 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -37,9 +37,10 @@ def timing():
 
 def debug_student_t_noise_approx():
     plot = False
-    real_var = 0.4
+    real_var = 0.1
     #Start a function, any function
     X = np.linspace(0.0, 10.0, 100)[:, None]
+    #X = np.array([0.5])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
     X_full = np.linspace(0.0, 10.0, 500)[:, None]
@@ -52,7 +53,7 @@ def debug_student_t_noise_approx():
     real_sd = np.sqrt(real_var)
     print "Real noise: ", real_sd
 
-    initial_var_guess = 1
+    initial_var_guess = 0.02
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
@@ -91,12 +92,14 @@ def debug_student_t_noise_approx():
     #m.constrain_positive('rbf')
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 1.8651)
-    m.constrain_positive('t_noi')
     #m.constrain_fixed('t_noise_variance', real_sd)
+    m.constrain_positive('rbf')
+    m.constrain_fixed('t_noi', real_sd)
+    m.ensure_default_constraints()
     m.update_likelihood_approximation()
     m.optimize(messages=True)
     print(m)
-    return m
+    #return m
     #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
     if plot:
         plt.suptitle('Student-t likelihood')
@@ -104,6 +107,7 @@ def debug_student_t_noise_approx():
         m.plot()
         plt.plot(X_full, Y_full)
         plt.ylim(-2.5, 2.5)
+    return m
 
     #print "Clean student t, ncg"
     #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 85af82f9..027f014e 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -53,7 +53,7 @@ class Laplace(likelihood):
 
     def predictive_values(self, mu, var, full_cov):
         if full_cov:
-            raise NotImplementedError("Cannot make correlated predictions with an EP likelihood")
+            raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
         return self.likelihood_function.predictive_values(mu, var)
 
     def _get_params(self):
@@ -63,42 +63,28 @@ class Laplace(likelihood):
         return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
-        #print "Setting laplace param with: ", p
         return self.likelihood_function._set_params(p)
 
-    def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
-        """
-        Find the gradients of the marginal likelihood w.r.t both thetaK and thetaL
-
-        dL_dthetaK differs from that of normal likelihoods as it has additional terms coming from
-        changes to y_tilde and changes to Sigma_tilde when the kernel parameters are adjusted
-
-        Similar terms arise when finding the gradients with respect to changes in the liklihood
-        parameters
-        """
-        return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
-
     def _shared_gradients_components(self):
         #FIXME: Careful of side effects! And make sure W and K are up to date!
-        Ki, _, _, _ = pdinv(self.K)
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
-        #dL_dfhat = -0.5*np.diag(self.Ki_W_i)*d3lik_d3fhat
         dL_dfhat = -0.5*(np.diag(self.Ki_W_i)*d3lik_d3fhat)[:, None]
         Wi_K_i = mdot(self.W_12, self.Bi, self.W_12) #same as rasms R
         I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i)
-        return dL_dfhat, Ki, I_KW_i, Wi_K_i
+        return dL_dfhat, I_KW_i, Wi_K_i
 
-    def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
+    def _Kgradients(self, dK_dthetaK):
         """
         Gradients with respect to prior kernel parameters
         """
-        dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components()
+        dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
         dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)[:, None]
 
         dL_dthetaK = np.zeros(dK_dthetaK.shape)
         for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
             #Explicit
-            dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(Wi_K_i*dK_dthetaK_i)
+            f_Ki_dK_dtheta_Ki_f = mdot(self.Ki_f.T, dK_dthetaK_i, self.Ki_f)
+            dL_dthetaK[thetaK_i] = 0.5*f_Ki_dK_dtheta_Ki_f - 0.5*np.trace(Wi_K_i*dK_dthetaK_i)
             #Implicit
             df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp)
             dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
@@ -109,11 +95,12 @@ class Laplace(likelihood):
         """
         Gradients with respect to likelihood parameters
         """
-        dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components()
+        return np.zeros(1)
+        #return np.zeros(0)
+        dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
 
         num_params = len(dlik_dthetaL)
-        #Ki_W_i = np.diag(inv(Ki + self.W))[:, None]
         dL_dthetaL = np.zeros((1, num_params)) # make space for one derivative for each likelihood parameter
         for thetaL_i in range(num_params):
             #Explicit
@@ -123,7 +110,6 @@ class Laplace(likelihood):
             dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
-            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
             dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL)
 
         return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
@@ -230,10 +216,8 @@ class Laplace(likelihood):
         self._compute_likelihood_variables()
 
     def _compute_likelihood_variables(self):
-        #At this point get the hessian matrix
-        #print "Data: ", self.data
-        #print "fhat: ", self.f_hat
-        self.W = -np.diag(self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data))
+        #At this point get the hessian matrix (or vector as W is diagonal)
+        self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
@@ -273,7 +257,8 @@ class Laplace(likelihood):
         """
         #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
+        assert np.all(W_12.T*K*W_12 == np.dot(np.diagflat(W_12), np.dot(K, np.diagflat(W_12)))) # FIXME Take this out when you've done multiinput
+        B = np.eye(K.shape[0]) + W_12.T*K*W_12
         L = jitchol(B)
         return (B, L, W_12)
 
@@ -330,7 +315,7 @@ class Laplace(likelihood):
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             #f_old = f.copy()
-            W = -np.diag(self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data))
+            W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
                 W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@@ -339,7 +324,7 @@ class Laplace(likelihood):
             B, L, W_12 = self._compute_B_statistics(K, W)
 
             W_f = np.dot(W, f)
-            grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)[:, None]
+            grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
             #Find K_i_f
             b = W_f + grad
 
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index c6186137..c3aee835 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -191,8 +191,8 @@ class student_t(likelihood_function):
         :returns: float(likelihood evaluated for this point)
 
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
 
         e = y - f
@@ -207,7 +207,7 @@ class student_t(likelihood_function):
         """
         Gradient of the link function at y, given f w.r.t f
 
-        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{-(v+1)(f_{i}-y_{i})}{(f_{i}-y_{i})^{2} + \sigma^{2}v}$$
+        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
 
         :y: data
         :f: latent variables f
@@ -215,51 +215,52 @@ class student_t(likelihood_function):
         :returns: gradient of likelihood evaluated at points
 
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        grad = -((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
-        return np.squeeze(grad)
+        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
         """
         Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
         i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
 
-        Will return diagonal of hessian, since every where else it is 0
+        Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+        (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
 
-        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((f_{i}-y_{i})^{2} - \sigma^{2}v)}{((f_{i}-y_{i})^{2} + \sigma^{2}v)^{2}}$$
+        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
 
         :y: data
         :f: latent variables f
         :extra_data: extra_data which is not used in student t distribution
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
 
         e = y - f
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
-        return np.squeeze(hess)
+        return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
         """
         Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
 
-        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((f_{i} - y_{i})^3 - 3(f_{i} - y_{i}) \sigma^{2} v))}{((f_{i} - y_{i}) + \sigma^{2} v)^3}$$
+        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        d3lik_d3f = ( -(2*(self.v + 1)*(e**3 - e*3*self.v*(self.sigma**2))) /
+        d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
                        ((e**2 + (self.sigma**2)*self.v)**3)
                     )
-        return np.squeeze(d3lik_d3f)
+        return d3lik_d3f
 
-    def link_dstd(self, y, f, extra_data=None):
+    def lik_dstd(self, y, f, extra_data=None):
         """
         Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
 
@@ -268,48 +269,48 @@ class student_t(likelihood_function):
 
         $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        dlik_dsigma = ( (1/self.sigma) -
-                        ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + (e**2) / ((self.sigma**2)*self.v) ) )
+        dlik_dsigma = ( - (1/self.sigma) +
+                        ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) )
                       )
-        return np.squeeze(dlik_dsigma)
+        return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
         """
         Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
 
-        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{2\sigma v(v + 1)(f-y)}{(f-y)^2 + \sigma^2 v)^2}$$
+        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        dlik_grad_dsigma = ((2*self.sigma*self.v*(self.v + 1)*e)
+        dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e)
                             / ((self.v*(self.sigma**2) + e**2)**2)
                            )
-        return np.squeeze(dlik_grad_dsigma)
+        return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
         """
         Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
 
-        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{(v + 1)((f-y)^2 - \sigma^2 v)}{((f-y)^2 + \sigma^2 v)}$$
+        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        dlik_hess_dsigma = ( ((self.v + 1)*(e**2 - (self.sigma**2)*self.v)) /
-                             ((e**2 + (self.sigma**2)*self.v)**2)
+        dlik_hess_dsigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
+                              ((e**2 + (self.sigma**2)*self.v)**3)
                            )
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):
         #must be listed in same order as 'get_param_names'
-        derivs = ([self.link_dstd(y, f, extra_data=extra_data)],
+        derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
                   [self.dlik_df_dstd(y, f, extra_data=extra_data)],
                   [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
                  ) # lists as we might learn many parameters
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 9ce83a5a..0f3dcb58 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -142,13 +142,22 @@ class GP(model):
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
+        print "dL_dthetaK before: ",dL_dthetaK
         if isinstance(self.likelihood, Laplace):
+            #Reapproximate incase it hasnt been done...
+            if isinstance(self.likelihood, Laplace):
+                self.likelihood.fit_full(self.kern.K(self.X))
+                self.likelihood._set_params(self.likelihood._get_params())
+
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
             fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
+            #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
+            #THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
 
-            dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
-            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
+            dL_dthetaL = 0 # self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            print "dL_dthetaK after: ",dL_dthetaK
             #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From e842f6e68735adaf95b31d0bc3c074dc39d553ea Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 31 May 2013 16:45:22 +0100
Subject: [PATCH 47/71] Made it use the fact that W is diagonal and put
 assertions in to ensure that the results are the same

---
 GPy/likelihoods/Laplace.py | 99 ++++++++++++++++++++++++++++----------
 GPy/models/GP.py           |  2 +-
 2 files changed, 75 insertions(+), 26 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 027f014e..af74755f 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -68,8 +68,11 @@ class Laplace(likelihood):
     def _shared_gradients_components(self):
         #FIXME: Careful of side effects! And make sure W and K are up to date!
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
-        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)*d3lik_d3fhat)[:, None]
-        Wi_K_i = mdot(self.W_12, self.Bi, self.W_12) #same as rasms R
+        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat)
+        Wi_K_i = mdot(np.diagflat(self.W_12), self.Bi, np.diagflat(self.W_12)) #same as rasms R
+        Wi_K_inew = self.W_12*self.Bi*self.W_12.T #same as rasms R
+        assert np.all(Wi_K_i == Wi_K_inew)
+
         I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i)
         return dL_dfhat, I_KW_i, Wi_K_i
 
@@ -78,7 +81,7 @@ class Laplace(likelihood):
         Gradients with respect to prior kernel parameters
         """
         dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
-        dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)[:, None]
+        dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
 
         dL_dthetaK = np.zeros(dK_dthetaK.shape)
         for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
@@ -89,7 +92,7 @@ class Laplace(likelihood):
             df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp)
             dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
 
-        return np.squeeze(dL_dthetaK)
+        return dL_dthetaK
 
     def _gradients(self, partial):
         """
@@ -112,7 +115,7 @@ class Laplace(likelihood):
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
             dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL)
 
-        return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
+        return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
         """
@@ -147,7 +150,9 @@ class Laplace(likelihood):
         #((L.T*w)_i + I)f_hat = y_tilde
         L = jitchol(self.K)
         Li = chol_inv(L)
-        Lt_W = np.dot(L.T, self.W) #FIXME: Can make Faster
+        Lt_W = np.dot(L.T, np.diagflat(self.W)) #FIXME: Can make Faster
+        Lt_Wnew = L.T*self.W.T
+        assert np.all(Lt_Wnew == Lt_W)
 
         ##Check it isn't singular!
         if cond(Lt_W) > epsilon:
@@ -159,12 +164,27 @@ class Laplace(likelihood):
 
         #f.T(Ki + W)f
         f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
-                    + mdot(self.f_hat.T, self.W, self.f_hat)
+                    + mdot(self.f_hat.T, np.diagflat(self.W), self.f_hat)
                     )
+        f_Ki_W_fnew = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
+                    + mdot(self.f_hat.T, self.W*self.f_hat)
+                    )
+        assert np.all(f_Ki_W_f == f_Ki_W_fnew)
 
-        y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
-        y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
-        ln_W_det = det_ln_diag(self.W)
+        y_W_f = mdot((Y_tilde.T, np.diagflat(self.W)), self.f_hat)
+        y_W_fnew = mdot(Y_tilde.T*self.W.T, self.f_hat)
+        assert np.all(y_W_f == y_W_fnew)
+
+
+        y_W_y = mdot((Y_tilde.T, np.diagflat(self.W)), Y_tilde)
+        y_W_ynew = mdot(Y_tilde.T, self.W*Y_tilde)
+        assert np.all(y_W_y == y_W_ynew)
+
+        ln_W_det = det_ln_diag(np.diagflat(self.W))
+        ln_W_detnew = np.log(self.W).sum()
+        assert np.all(ln_W_det == ln_W_detnew)
+
+        #FIXME: Revisit this
         Z_tilde = (- self.NORMAL_CONST
                    + 0.5*self.ln_K_det
                    + 0.5*ln_W_det
@@ -189,14 +209,16 @@ class Laplace(likelihood):
         if cond(self.W) > epsilon:
             print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
 
-        self.Sigma_tilde = inv(self.W)  # Damn
+        self.Sigma_tilde = inv(np.diagflat(self.W))  # Damn
+        Sigma_tildenew = np.diagflat(1.0/self.W)
+        assert np.all(self.Sigma_tilde == Sigma_tildenew)
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
+        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
 
     def fit_full(self, K):
         """
@@ -229,12 +251,24 @@ class Laplace(likelihood):
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
 
-        self.Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
+        self.Ki_W_i = self.K - mdot(self.K, (np.diagflat(self.W_12), self.Bi, np.diagflat(self.W_12)), self.K) # Funky, order matters on stability!
+        Ki_W_inew = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
+        assert np.all(self.Ki_W_i == Ki_W_inew)
+
         self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i)
 
-        b = np.dot(self.W, self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
-        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b)))
-        a = b - mdot(self.W_12, solve_chol)
+        b = np.dot(np.diagflat(self.W), self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
+        bnew = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
+        assert np.all(b == bnew)
+
+        solve_chol = cho_solve((self.B_chol, True), mdot((np.diagflat(self.W_12), self.K), b))
+        solve_cholnew = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
+        assert np.all(solve_chol == solve_cholnew)
+
+        a = b - mdot(np.diagflat(self.W_12), solve_chol)
+        anew = b - self.W_12*solve_chol
+        assert np.all(a == anew)
+
         self.Ki_f = a
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
         self.ln_K_det = pddet(self.K)
@@ -255,10 +289,13 @@ class Laplace(likelihood):
         :W: Negative hessian at a point (diagonal matrix)
         :returns: (B, L)
         """
-        #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
+        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        assert np.all(W_12.T*K*W_12 == np.dot(np.diagflat(W_12), np.dot(K, np.diagflat(W_12)))) # FIXME Take this out when you've done multiinput
-        B = np.eye(K.shape[0]) + W_12.T*K*W_12
+        # FIXME Take this out when you've done multiinput, Weirdly this is
+        # better when its W_12.T*K*W_12 which shouldnt make a difference
+        # because K is symmetrical
+        assert np.allclose(W_12*K*W_12.T, np.dot(np.diagflat(W_12), np.dot(K, np.diagflat(W_12))))
+        B = np.eye(self.N) + W_12*K*W_12.T
         L = jitchol(B)
         return (B, L, W_12)
 
@@ -323,19 +360,31 @@ class Laplace(likelihood):
                                     # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)
 
-            W_f = np.dot(W, f)
+            W_f = np.dot(np.diagflat(W), f)
+            W_fnew = W*f
+            assert np.all(W_f == W_fnew)
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
             #Find K_i_f
             b = W_f + grad
 
             #a should be equal to Ki*f now so should be able to use it
             c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-            solve_L = cho_solve((L, True), np.dot(W_12, c))
-            f = c - np.dot(K, np.dot(W_12, solve_L))
 
-            solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b)))
-            a = b - np.dot(W_12, solve_L)
-            #f = np.dot(K, a)
+            solve_L = cho_solve((L, True), np.dot(np.diagflat(W_12), c))
+            solve_Lnew = cho_solve((L, True), W_12*c)
+            assert np.all(solve_L == solve_Lnew)
+
+            f = c - np.dot(K, np.dot(np.diagflat(W_12), solve_L))
+            fnew = c - np.dot(K, W_12*solve_L)
+            assert np.all(f == fnew)
+
+            solve_L = cho_solve((L, True), np.dot(np.diagflat(W_12), np.dot(K, b)))
+            solve_Lnew = cho_solve((L, True), W_12*np.dot(K, b))
+            assert np.all(solve_L == solve_Lnew)
+
+            a = b - np.dot(np.diagflat(W_12), solve_L)
+            anew = b - W_12*solve_L
+            assert np.all(a == anew)
 
             tmp_old_obj = old_obj
             old_obj = new_obj
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 0f3dcb58..787429de 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -156,7 +156,7 @@ class GP(model):
             #THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
 
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
-            dL_dthetaL = 0 # self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
             print "dL_dthetaK after: ",dL_dthetaK
             #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:

From 6c2975079517364f00b2345f0ef9b3d2f5a14103 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 31 May 2013 16:59:54 +0100
Subject: [PATCH 48/71] Took out all the asserts and using pure broadcasting
 method of diagonal now

---
 GPy/examples/laplace_approximations.py |  4 +-
 GPy/likelihoods/Laplace.py             | 70 ++++++--------------------
 GPy/models/GP.py                       |  3 +-
 3 files changed, 20 insertions(+), 57 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 5103eefb..14ff44a0 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -39,8 +39,8 @@ def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 100)[:, None]
-    #X = np.array([0.5])[:, None]
+    #X = np.linspace(0.0, 10.0, 100)[:, None]
+    X = np.array([0.5])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
     X_full = np.linspace(0.0, 10.0, 500)[:, None]
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index af74755f..74d37d48 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -69,9 +69,7 @@ class Laplace(likelihood):
         #FIXME: Careful of side effects! And make sure W and K are up to date!
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
         dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat)
-        Wi_K_i = mdot(np.diagflat(self.W_12), self.Bi, np.diagflat(self.W_12)) #same as rasms R
-        Wi_K_inew = self.W_12*self.Bi*self.W_12.T #same as rasms R
-        assert np.all(Wi_K_i == Wi_K_inew)
+        Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
 
         I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i)
         return dL_dfhat, I_KW_i, Wi_K_i
@@ -150,9 +148,7 @@ class Laplace(likelihood):
         #((L.T*w)_i + I)f_hat = y_tilde
         L = jitchol(self.K)
         Li = chol_inv(L)
-        Lt_W = np.dot(L.T, np.diagflat(self.W)) #FIXME: Can make Faster
-        Lt_Wnew = L.T*self.W.T
-        assert np.all(Lt_Wnew == Lt_W)
+        Lt_W = L.T*self.W.T
 
         ##Check it isn't singular!
         if cond(Lt_W) > epsilon:
@@ -164,25 +160,15 @@ class Laplace(likelihood):
 
         #f.T(Ki + W)f
         f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
-                    + mdot(self.f_hat.T, np.diagflat(self.W), self.f_hat)
-                    )
-        f_Ki_W_fnew = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
                     + mdot(self.f_hat.T, self.W*self.f_hat)
                     )
-        assert np.all(f_Ki_W_f == f_Ki_W_fnew)
 
-        y_W_f = mdot((Y_tilde.T, np.diagflat(self.W)), self.f_hat)
-        y_W_fnew = mdot(Y_tilde.T*self.W.T, self.f_hat)
-        assert np.all(y_W_f == y_W_fnew)
+        y_W_f = mdot(Y_tilde.T*self.W.T, self.f_hat)
 
 
-        y_W_y = mdot((Y_tilde.T, np.diagflat(self.W)), Y_tilde)
-        y_W_ynew = mdot(Y_tilde.T, self.W*Y_tilde)
-        assert np.all(y_W_y == y_W_ynew)
+        y_W_y = mdot(Y_tilde.T, self.W*Y_tilde)
 
-        ln_W_det = det_ln_diag(np.diagflat(self.W))
-        ln_W_detnew = np.log(self.W).sum()
-        assert np.all(ln_W_det == ln_W_detnew)
+        ln_W_det = np.log(self.W).sum()
 
         #FIXME: Revisit this
         Z_tilde = (- self.NORMAL_CONST
@@ -203,15 +189,13 @@ class Laplace(likelihood):
                    #+ y_W_f
                    #+ self.ln_z_hat
                    #)
-        self.Z_tilde = 0
+        #self.Z_tilde = 0
 
         ##Check it isn't singular!
         if cond(self.W) > epsilon:
             print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
 
-        self.Sigma_tilde = inv(np.diagflat(self.W))  # Damn
-        Sigma_tildenew = np.diagflat(1.0/self.W)
-        assert np.all(self.Sigma_tilde == Sigma_tildenew)
+        self.Sigma_tilde = np.diagflat(1.0/self.W)
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -251,23 +235,15 @@ class Laplace(likelihood):
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
 
-        self.Ki_W_i = self.K - mdot(self.K, (np.diagflat(self.W_12), self.Bi, np.diagflat(self.W_12)), self.K) # Funky, order matters on stability!
-        Ki_W_inew = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
-        assert np.all(self.Ki_W_i == Ki_W_inew)
+        self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
 
         self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i)
 
-        b = np.dot(np.diagflat(self.W), self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
-        bnew = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
-        assert np.all(b == bnew)
+        b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
 
-        solve_chol = cho_solve((self.B_chol, True), mdot((np.diagflat(self.W_12), self.K), b))
-        solve_cholnew = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
-        assert np.all(solve_chol == solve_cholnew)
+        solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
 
-        a = b - mdot(np.diagflat(self.W_12), solve_chol)
-        anew = b - self.W_12*solve_chol
-        assert np.all(a == anew)
+        a = b - self.W_12*solve_chol
 
         self.Ki_f = a
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
@@ -291,10 +267,6 @@ class Laplace(likelihood):
         """
         #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        # FIXME Take this out when you've done multiinput, Weirdly this is
-        # better when its W_12.T*K*W_12 which shouldnt make a difference
-        # because K is symmetrical
-        assert np.allclose(W_12*K*W_12.T, np.dot(np.diagflat(W_12), np.dot(K, np.diagflat(W_12))))
         B = np.eye(self.N) + W_12*K*W_12.T
         L = jitchol(B)
         return (B, L, W_12)
@@ -360,9 +332,7 @@ class Laplace(likelihood):
                                     # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)
 
-            W_f = np.dot(np.diagflat(W), f)
-            W_fnew = W*f
-            assert np.all(W_f == W_fnew)
+            W_f = W*f
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
             #Find K_i_f
             b = W_f + grad
@@ -370,21 +340,13 @@ class Laplace(likelihood):
             #a should be equal to Ki*f now so should be able to use it
             c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
 
-            solve_L = cho_solve((L, True), np.dot(np.diagflat(W_12), c))
-            solve_Lnew = cho_solve((L, True), W_12*c)
-            assert np.all(solve_L == solve_Lnew)
+            solve_L = cho_solve((L, True), W_12*c)
 
-            f = c - np.dot(K, np.dot(np.diagflat(W_12), solve_L))
-            fnew = c - np.dot(K, W_12*solve_L)
-            assert np.all(f == fnew)
+            f = c - np.dot(K, W_12*solve_L)
 
-            solve_L = cho_solve((L, True), np.dot(np.diagflat(W_12), np.dot(K, b)))
-            solve_Lnew = cho_solve((L, True), W_12*np.dot(K, b))
-            assert np.all(solve_L == solve_Lnew)
+            solve_L = cho_solve((L, True), W_12*np.dot(K, b))
 
-            a = b - np.dot(np.diagflat(W_12), solve_L)
-            anew = b - W_12*solve_L
-            assert np.all(a == anew)
+            a = b - W_12*solve_L
 
             tmp_old_obj = old_obj
             old_obj = new_obj
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 787429de..0ba20d7b 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -152,8 +152,9 @@ class GP(model):
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
             fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
             #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
+
+            #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
-            #THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
 
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From f3b8dfb2225c8a25a0b753ec0e2f63b28cdec827 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 3 Jun 2013 14:51:09 +0100
Subject: [PATCH 49/71] about to input new derivations for Z's...

---
 GPy/examples/laplace_approximations.py | 15 +++++++++++---
 GPy/likelihoods/Laplace.py             | 28 ++++++++++++++++----------
 GPy/models/GP.py                       | 17 ++++++++--------
 3 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 14ff44a0..ee71a950 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -143,11 +143,12 @@ def student_t_approx():
     Yc[10] += 100
     Yc[25] += 10
     Yc[23] += 10
+    Yc[26] += 1000
     Yc[24] += 10
     #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 1000000000000
+    deg_free = 10
     real_sd = np.sqrt(real_var)
     print "Real noise: ", real_sd
 
@@ -187,21 +188,25 @@ def student_t_approx():
     plt.subplot(211)
     m.plot()
     plt.plot(X_full, Y_full)
+    plt.title('Gaussian clean')
     print m
 
     #Corrupt
     print "Corrupt Gaussian"
     m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
     m.ensure_default_constraints()
-    m.optimize()
+    #m.optimize()
     plt.subplot(212)
     m.plot()
     plt.plot(X_full, Y_full)
+    plt.title('Gaussian corrupt')
     print m
 
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
-    edited_real_sd = initial_var_guess #real_sd
+    edited_real_sd = real_sd #initial_var_guess
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -215,6 +220,7 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
+    plt.title('Student-t rasm clean')
 
     print "Corrupt student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -228,6 +234,7 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
+    plt.title('Student-t rasm corrupt')
 
     print "Clean student t, ncg"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -241,6 +248,7 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
+    plt.title('Student-t ncg clean')
 
     print "Corrupt student t, ncg"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -254,6 +262,7 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
+    plt.title('Student-t ncg corrupt')
 
 
     ###with a student t distribution, since it has heavy tails it should work well
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 74d37d48..45fddeaa 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -6,7 +6,10 @@ from numpy.linalg import cond
 from likelihood import likelihood
 from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.lapack import dtrtrs
+import random
 #import pylab as plt
+np.random.seed(50)
+random.seed(50)
 
 
 class Laplace(likelihood):
@@ -156,6 +159,7 @@ class Laplace(likelihood):
 
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
         self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
+
         Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
         #f.T(Ki + W)f
@@ -239,15 +243,15 @@ class Laplace(likelihood):
 
         self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i)
 
+        #Do the computation again at f to get Ki_f which is useful
         b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
-
         solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
-
         a = b - self.W_12*solve_chol
-
         self.Ki_f = a
+
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
         self.ln_K_det = pddet(self.K)
+        #_, _, _, self.ln_K_det = pdinv(self.K)
 
         self.ln_z_hat = (- 0.5*self.f_Ki_f
                          - 0.5*self.ln_K_det
@@ -296,7 +300,7 @@ class Laplace(likelihood):
             res = -1 * (--np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
             return np.squeeze(res)
 
-        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
     def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50):
@@ -336,17 +340,19 @@ class Laplace(likelihood):
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
             #Find K_i_f
             b = W_f + grad
+            b = step_size*b
 
-            #a should be equal to Ki*f now so should be able to use it
-            c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-
-            solve_L = cho_solve((L, True), W_12*c)
-
-            f = c - np.dot(K, W_12*solve_L)
+            #Need this to find the f we have a stepsize which we need to move in, rather than a full unit movement
+            #c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
+            #solve_L = cho_solve((L, True), W_12*c)
+            #f = c - np.dot(K, W_12*solve_L)
 
+            #FIXME: Can't we get rid of this? Don't we want to evaluate obj(c,f) and this is our new_obj?
+            #Why did I choose to evaluate the objective function at the new f with the old hessian? I'm sure there was a good reason,
+            #Document it!
             solve_L = cho_solve((L, True), W_12*np.dot(K, b))
-
             a = b - W_12*solve_L
+            f = np.dot(K, a)
 
             tmp_old_obj = old_obj
             old_obj = new_obj
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 0ba20d7b..e4ed52ef 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -142,23 +142,22 @@ class GP(model):
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
-        print "dL_dthetaK before: ",dL_dthetaK
         if isinstance(self.likelihood, Laplace):
             #Reapproximate incase it hasnt been done...
-            if isinstance(self.likelihood, Laplace):
-                self.likelihood.fit_full(self.kern.K(self.X))
-                self.likelihood._set_params(self.likelihood._get_params())
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
+            print self.kern._get_params()
 
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
+            #fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
             #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
 
             #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
-            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
+            #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
-            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
-            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            print "dL_dthetaK after: ",dL_dthetaK
+            #dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
+            dL_dthetaL = 0 #self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            #print "dL_dthetaK after: ",dL_dthetaK
             #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From ac461e1b2aa65afa08359e1ac6d6cb8956e962b4 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 18 Jun 2013 17:55:58 +0100
Subject: [PATCH 50/71] Checkgrads with explicit and implicit components half
 the time

---
 GPy/examples/laplace_approximations.py |  69 +++++++--------
 GPy/likelihoods/Laplace.py             | 114 +++++++++++--------------
 GPy/models/GP.py                       |   7 +-
 GPy/util/linalg.py                     |   2 +-
 4 files changed, 91 insertions(+), 101 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index ee71a950..5120dfb5 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -39,11 +39,11 @@ def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
     #Start a function, any function
-    #X = np.linspace(0.0, 10.0, 100)[:, None]
-    X = np.array([0.5])[:, None]
+    X = np.linspace(0.0, 10.0, 15)[:, None]
+    #X = np.array([0.5])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
-    X_full = np.linspace(0.0, 10.0, 500)[:, None]
+    X_full = np.linspace(0.0, 10.0, 15)[:, None]
     Y_full = np.sin(X_full)
 
     Y = Y/Y.max()
@@ -83,7 +83,8 @@ def debug_student_t_noise_approx():
         #plt.plot(X_full, Y_full)
     #print m
 
-    edited_real_sd = initial_var_guess #real_sd
+    #edited_real_sd = initial_var_guess #real_sd
+    edited_real_sd = real_sd
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -94,7 +95,7 @@ def debug_student_t_noise_approx():
     #m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_variance', real_sd)
     m.constrain_positive('rbf')
-    m.constrain_fixed('t_noi', real_sd)
+    #m.constrain_fixed('t_noi', real_sd)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
     m.optimize(messages=True)
@@ -148,7 +149,7 @@ def student_t_approx():
     #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10
+    deg_free = 8
     real_sd = np.sqrt(real_var)
     print "Real noise: ", real_sd
 
@@ -202,8 +203,6 @@ def student_t_approx():
     plt.title('Gaussian corrupt')
     print m
 
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd #initial_var_guess
@@ -236,33 +235,35 @@ def student_t_approx():
     plt.ylim(-2.5, 2.5)
     plt.title('Student-t rasm corrupt')
 
-    print "Clean student t, ncg"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
-    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    plt.subplot(221)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
-    plt.title('Student-t ncg clean')
+    return m
 
-    print "Corrupt student t, ncg"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=False)
-    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    plt.subplot(223)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
-    plt.title('Student-t ncg corrupt')
+    #print "Clean student t, ncg"
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
+    #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    #m.ensure_default_constraints()
+    #m.update_likelihood_approximation()
+    #m.optimize()
+    #print(m)
+    #plt.subplot(221)
+    #m.plot()
+    #plt.plot(X_full, Y_full)
+    #plt.ylim(-2.5, 2.5)
+    #plt.title('Student-t ncg clean')
+
+    #print "Corrupt student t, ncg"
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=False)
+    #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    #m.ensure_default_constraints()
+    #m.update_likelihood_approximation()
+    #m.optimize()
+    #print(m)
+    #plt.subplot(223)
+    #m.plot()
+    #plt.plot(X_full, Y_full)
+    #plt.ylim(-2.5, 2.5)
+    #plt.title('Student-t ncg corrupt')
 
 
     ###with a student t distribution, since it has heavy tails it should work well
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 45fddeaa..a8347345 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -8,9 +8,6 @@ from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.lapack import dtrtrs
 import random
 #import pylab as plt
-np.random.seed(50)
-random.seed(50)
-
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
@@ -45,7 +42,7 @@ class Laplace(likelihood):
         self.is_heteroscedastic = True
         self.Nparams = 0
 
-        self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))
+        self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
 
         #Initial values for the GP variables
         self.Y = np.zeros((self.N, 1))
@@ -72,26 +69,36 @@ class Laplace(likelihood):
         #FIXME: Careful of side effects! And make sure W and K are up to date!
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
         dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat)
+
         Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
 
         I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i)
         return dL_dfhat, I_KW_i, Wi_K_i
 
-    def _Kgradients(self, dK_dthetaK):
+    def _Kgradients(self, dK_dthetaK, X):
         """
         Gradients with respect to prior kernel parameters
         """
         dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
         dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
 
-        dL_dthetaK = np.zeros(dK_dthetaK.shape)
-        for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
-            #Explicit
-            f_Ki_dK_dtheta_Ki_f = mdot(self.Ki_f.T, dK_dthetaK_i, self.Ki_f)
-            dL_dthetaK[thetaK_i] = 0.5*f_Ki_dK_dtheta_Ki_f - 0.5*np.trace(Wi_K_i*dK_dthetaK_i)
-            #Implicit
-            df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp)
-            dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
+        #Implicit
+        impl = mdot(dlp, dL_dfhat.T, I_KW_i)
+        expl_a = - mdot(self.Ki_f, self.Ki_f.T)
+        expl_b = Wi_K_i
+        expl = 0.5*expl_a - 0.5*expl_b
+        dL_dthetaK_exp = dK_dthetaK(expl, X)
+        dL_dthetaK_imp = dK_dthetaK(impl, X)
+        dL_dthetaK = -(dL_dthetaK_imp + dL_dthetaK_exp)
+
+        #dL_dthetaK = np.zeros(dK_dthetaK.shape)
+        #for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
+            ##Explicit
+            #f_Ki_dK_dtheta_Ki_f = mdot(self.Ki_f.T, dK_dthetaK_i, self.Ki_f)
+            #dL_dthetaK[thetaK_i] = 0.5*f_Ki_dK_dtheta_Ki_f - 0.5*np.trace(Wi_K_i*dK_dthetaK_i)
+            ##Implicit
+            #df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp)
+            #dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
 
         return dL_dthetaK
 
@@ -99,13 +106,12 @@ class Laplace(likelihood):
         """
         Gradients with respect to likelihood parameters
         """
-        return np.zeros(1)
-        #return np.zeros(0)
+        #return np.zeros(1)
         dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
 
         num_params = len(dlik_dthetaL)
-        dL_dthetaL = np.zeros((1, num_params)) # make space for one derivative for each likelihood parameter
+        dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter
         for thetaL_i in range(num_params):
             #Explicit
             #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
@@ -143,8 +149,6 @@ class Laplace(likelihood):
         $$\tilde{\Sigma} = W^{-1}$$
 
         """
-        epsilon = 1e14
-
         #Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
         #dtritri -> L -> L_i
         #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
@@ -153,54 +157,38 @@ class Laplace(likelihood):
         Li = chol_inv(L)
         Lt_W = L.T*self.W.T
 
-        ##Check it isn't singular!
-        if cond(Lt_W) > epsilon:
-            print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
-
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
         self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
 
         Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
-        #f.T(Ki + W)f
-        f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
-                    + mdot(self.f_hat.T, self.W*self.f_hat)
-                    )
+        ln_W_det = det_ln_diag(self.W)
+        yf_W_yf = mdot((Y_tilde - self.f_hat).T, np.diagflat(self.W), (Y_tilde - self.f_hat))
 
-        y_W_f = mdot(Y_tilde.T*self.W.T, self.f_hat)
-
-
-        y_W_y = mdot(Y_tilde.T, self.W*Y_tilde)
-
-        ln_W_det = np.log(self.W).sum()
-
-        #FIXME: Revisit this
-        Z_tilde = (- self.NORMAL_CONST
-                   + 0.5*self.ln_K_det
-                   + 0.5*ln_W_det
-                   + 0.5*self.ln_Ki_W_i_det
-                   + 0.5*f_Ki_W_f
-                   + 0.5*y_W_y
-                   - y_W_f
-                   + self.ln_z_hat
-                   )
-        #Z_tilde = (self.NORMAL_CONST
-                   #- 0.5*self.ln_K_det
-                   #- 0.5*ln_W_det
-                   #- 0.5*self.ln_Ki_W_i_det
-                   #- 0.5*f_Ki_W_f
-                   #- 0.5*y_W_y
-                   #+ y_W_f
+        #Z_tilde = (+ self.NORMAL_CONST
                    #+ self.ln_z_hat
+                   #+ 0.5*self.ln_I_KW_det
+                   #- 0.5*ln_W_det
+                   #+ 0.5*self.f_Ki_f
+                   #+ 0.5*yf_W_yf
                    #)
-        #self.Z_tilde = 0
-
-        ##Check it isn't singular!
-        if cond(self.W) > epsilon:
-            print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
 
         self.Sigma_tilde = np.diagflat(1.0/self.W)
 
+        Ki, _, _, K_det = pdinv(self.K)
+        ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
+        W = np.diagflat(self.W)
+        Wi = self.Sigma_tilde
+        W12i = np.sqrt(Wi)
+        D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
+        fDf = mdot(self.f_hat.T, D, self.f_hat)
+        l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+        Z_tilde = (+ self.NORMAL_CONST
+                   + l
+                   + 0.5*ln_det_K_Wi__Bi
+                   - 0.5*fDf
+                  )
+
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
@@ -239,10 +227,6 @@ class Laplace(likelihood):
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
 
-        self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
-
-        self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i)
-
         #Do the computation again at f to get Ki_f which is useful
         b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
         solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
@@ -250,12 +234,14 @@ class Laplace(likelihood):
         self.Ki_f = a
 
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
-        self.ln_K_det = pddet(self.K)
-        #_, _, _, self.ln_K_det = pdinv(self.K)
+        self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
 
+        #For det, |I + KW| == |I + W_12*K*W_12|
+        self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
+
+        #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
         self.ln_z_hat = (- 0.5*self.f_Ki_f
-                         - 0.5*self.ln_K_det
-                         + 0.5*self.ln_Ki_W_i_det
+                         - self.ln_I_KW_det
                          + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
                          )
 
@@ -289,7 +275,7 @@ class Laplace(likelihood):
         #ONLY WORKS FOR 1D DATA
         def obj(f):
             res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
-                        + self.NORMAL_CONST)
+                        - self.NORMAL_CONST)
             return float(res)
 
         def obj_grad(f):
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index e4ed52ef..d56ee86f 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -141,6 +141,8 @@ class GP(model):
 
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
+        self.likelihood.fit_full(self.kern.K(self.X))
+        self.likelihood._set_params(self.likelihood._get_params())
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
         if isinstance(self.likelihood, Laplace):
             #Reapproximate incase it hasnt been done...
@@ -155,8 +157,9 @@ class GP(model):
             #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
             #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
-            #dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
-            dL_dthetaL = 0 #self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            dK_dthetaK = self.kern.dK_dtheta
+            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
             #print "dL_dthetaK after: ",dL_dthetaK
             #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 08e6fd99..f19acf1a 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -34,7 +34,7 @@ def det_ln_diag(A):
 
 def pddet(A):
     """
-    Determinant of a positive definite matrix
+    Determinant of a positive definite matrix, only symmetric matricies though
     """
     L = jitchol(A)
     logdetA = 2*sum(np.log(np.diag(L)))

From de689fa8e91928b7fc2d02f56d4eca14d82eaafd Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 19 Jun 2013 12:00:00 +0100
Subject: [PATCH 51/71] Now gradchecks every time but student_t fit is bad,
 noise is underestimated by a long way

---
 GPy/examples/laplace_approximations.py  | 18 +++++++++--------
 GPy/likelihoods/Laplace.py              | 27 ++++++++++++++++---------
 GPy/likelihoods/likelihood_functions.py | 16 +--------------
 GPy/models/GP.py                        | 12 -----------
 4 files changed, 29 insertions(+), 44 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 5120dfb5..84527d08 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -39,28 +39,28 @@ def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 15)[:, None]
+    X = np.linspace(0.0, 10.0, 50)[:, None]
     #X = np.array([0.5])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
-    X_full = np.linspace(0.0, 10.0, 15)[:, None]
+    X_full = np.linspace(0.0, 10.0, 50)[:, None]
     Y_full = np.sin(X_full)
 
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10000
+    deg_free = 1000
     real_sd = np.sqrt(real_var)
-    print "Real noise: ", real_sd
+    print "Real noise std: ", real_sd
 
-    initial_var_guess = 0.02
+    initial_var_guess = 0.3
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
 
     plt.close('all')
     # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
@@ -83,22 +83,24 @@ def debug_student_t_noise_approx():
         #plt.plot(X_full, Y_full)
     #print m
 
-    #edited_real_sd = initial_var_guess #real_sd
+    edited_real_sd = initial_var_guess #real_sd
     edited_real_sd = real_sd
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m['white'] = 1e-3
     #m.constrain_positive('rbf')
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_variance', real_sd)
     m.constrain_positive('rbf')
+    m.constrain_positive('t_noise')
     #m.constrain_fixed('t_noi', real_sd)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
-    m.optimize(messages=True)
+    #m.optimize(messages=True)
     print(m)
     #return m
     #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index a8347345..5b1a814a 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -84,12 +84,13 @@ class Laplace(likelihood):
 
         #Implicit
         impl = mdot(dlp, dL_dfhat.T, I_KW_i)
-        expl_a = - mdot(self.Ki_f, self.Ki_f.T)
+        expl_a = mdot(self.Ki_f, self.Ki_f.T)
         expl_b = Wi_K_i
-        expl = 0.5*expl_a - 0.5*expl_b
+        expl = 0.5*expl_a + 0.5*expl_b
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
-        dL_dthetaK = -(dL_dthetaK_imp + dL_dthetaK_exp)
+        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp
 
         #dL_dthetaK = np.zeros(dK_dthetaK.shape)
         #for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
@@ -117,10 +118,12 @@ class Laplace(likelihood):
             #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
             #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None])
             #                                               might be +
-            dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
-            dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL)
+            dL_dthetaL_imp = np.dot(dL_dfhat.T, df_hat_dthetaL)
+            #print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
+            dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp
 
         return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
@@ -180,14 +183,20 @@ class Laplace(likelihood):
         W = np.diagflat(self.W)
         Wi = self.Sigma_tilde
         W12i = np.sqrt(Wi)
-        D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
-        fDf = mdot(self.f_hat.T, D, self.f_hat)
+        #D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
+        #fDf = mdot(self.f_hat.T, D, self.f_hat)
         l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+        #print "fDf:{}   l:{}   detKWiBi:{}   W:{}   Wi:{}   Bi:{}   Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum())
+
+        y_Wi_Ki_i_y = mdot(Y_tilde.T, pdinv(self.K + Wi)[0], Y_tilde)
         Z_tilde = (+ self.NORMAL_CONST
                    + l
                    + 0.5*ln_det_K_Wi__Bi
-                   - 0.5*fDf
+                   #- 0.5*fDf
+                   - 0.5*self.f_Ki_f
+                   + 0.5*y_Wi_Ki_i_y
                   )
+        #print "Ztilde: {}".format(Z_tilde)
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -316,7 +325,7 @@ class Laplace(likelihood):
             #f_old = f.copy()
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-5     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index c3aee835..041b59bd 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -170,7 +170,7 @@ class student_t(likelihood_function):
         return np.asarray(self.sigma)
 
     def _get_param_names(self):
-        return ["t_noise_variance"]
+        return ["t_noise_std"]
 
     def _set_params(self, x):
         self.sigma = float(x)
@@ -191,8 +191,6 @@ class student_t(likelihood_function):
         :returns: float(likelihood evaluated for this point)
 
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
 
         e = y - f
@@ -215,8 +213,6 @@ class student_t(likelihood_function):
         :returns: gradient of likelihood evaluated at points
 
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
@@ -237,8 +233,6 @@ class student_t(likelihood_function):
         :extra_data: extra_data which is not used in student t distribution
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
 
         e = y - f
@@ -251,8 +245,6 @@ class student_t(likelihood_function):
 
         $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
@@ -269,8 +261,6 @@ class student_t(likelihood_function):
 
         $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         dlik_dsigma = ( - (1/self.sigma) +
@@ -284,8 +274,6 @@ class student_t(likelihood_function):
 
         $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e)
@@ -299,8 +287,6 @@ class student_t(likelihood_function):
 
         $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         dlik_hess_dsigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index d56ee86f..636ebba0 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -145,18 +145,6 @@ class GP(model):
         self.likelihood._set_params(self.likelihood._get_params())
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
         if isinstance(self.likelihood, Laplace):
-            #Reapproximate incase it hasnt been done...
-            self.likelihood.fit_full(self.kern.K(self.X))
-            self.likelihood._set_params(self.likelihood._get_params())
-            print self.kern._get_params()
-
-            #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            #fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
-            #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
-
-            #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
-            #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
-
             dK_dthetaK = self.kern.dK_dtheta
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From e900509a7c146a80a866d29a4efaedfb10f1291a Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 19 Jun 2013 16:13:11 +0100
Subject: [PATCH 52/71] Fixed a wrong sign, now gradchecks weirdly only above
 certain points

---
 GPy/examples/laplace_approximations.py  | 61 ++++++++++++++++++++++---
 GPy/likelihoods/Laplace.py              | 47 +++----------------
 GPy/likelihoods/likelihood_functions.py |  7 ++-
 3 files changed, 64 insertions(+), 51 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 84527d08..887e35ae 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -35,6 +35,54 @@ def timing():
     print the_is
     print np.mean(the_is)
 
+def v_fail_test():
+    plt.close('all')
+    real_var = 0.1
+    X = np.linspace(0.0, 10.0, 50)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+    Y = Y/Y.max()
+
+    #Add student t random noise to datapoints
+    deg_free = 10
+    real_sd = np.sqrt(real_var)
+    print "Real noise std: ", real_sd
+
+    kernel1 = GPy.kern.white(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+
+    edited_real_sd = 0.3#real_sd
+    edited_real_sd = real_sd
+
+    print "Clean student t, rasm"
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel1)
+    m.constrain_fixed('white', 1)
+    vs = 15
+    noises = 40
+    checkgrads = np.zeros((vs, noises))
+    vs_noises = np.zeros((vs, noises))
+    for v_ind, v in enumerate(np.linspace(1, 20, vs)):
+        m.likelihood.likelihood_function.v = v
+        print v
+        for noise_ind, noise in enumerate(np.linspace(0.0000001, 1, noises)):
+            m['t_noise'] = noise
+            m.update_likelihood_approximation()
+            checkgrads[v_ind, noise_ind] = m.checkgrad()
+            vs_noises[v_ind, noise_ind] = (float(v)/(float(v) - 2))*(noise**2)
+
+    plt.figure(1)
+    plt.title('Checkgrads')
+    plt.imshow(checkgrads, interpolation='nearest')
+    plt.xlabel('noise')
+    plt.ylabel('v')
+
+    plt.figure(2)
+    plt.title('variance change')
+    plt.imshow(vs_noises, interpolation='nearest')
+    plt.xlabel('noise')
+    plt.ylabel('v')
+    print(m)
+
 def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
@@ -49,7 +97,7 @@ def debug_student_t_noise_approx():
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 1000
+    deg_free = 10
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
 
@@ -60,7 +108,7 @@ def debug_student_t_noise_approx():
 
     plt.close('all')
     # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
@@ -90,12 +138,11 @@ def debug_student_t_noise_approx():
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
-    m['white'] = 1e-3
-    #m.constrain_positive('rbf')
-    #m.constrain_fixed('rbf_v', 1.0898)
-    #m.constrain_fixed('rbf_l', 1.8651)
+    #m['white'] = 1e-3
+    m.constrain_fixed('rbf_v', 1.0898)
+    m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_variance', real_sd)
-    m.constrain_positive('rbf')
+    #m.constrain_positive('rbf')
     m.constrain_positive('t_noise')
     #m.constrain_fixed('t_noi', real_sd)
     m.ensure_default_constraints()
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 5b1a814a..70ec568a 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -70,54 +70,38 @@ class Laplace(likelihood):
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
         dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat)
 
-        Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
-
-        I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i)
-        return dL_dfhat, I_KW_i, Wi_K_i
+        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
+        return dL_dfhat, I_KW_i
 
     def _Kgradients(self, dK_dthetaK, X):
         """
         Gradients with respect to prior kernel parameters
         """
-        dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
         dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
 
         #Implicit
         impl = mdot(dlp, dL_dfhat.T, I_KW_i)
         expl_a = mdot(self.Ki_f, self.Ki_f.T)
-        expl_b = Wi_K_i
+        expl_b = self.Wi_K_i
         expl = 0.5*expl_a + 0.5*expl_b
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
         #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
         dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp
-
-        #dL_dthetaK = np.zeros(dK_dthetaK.shape)
-        #for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
-            ##Explicit
-            #f_Ki_dK_dtheta_Ki_f = mdot(self.Ki_f.T, dK_dthetaK_i, self.Ki_f)
-            #dL_dthetaK[thetaK_i] = 0.5*f_Ki_dK_dtheta_Ki_f - 0.5*np.trace(Wi_K_i*dK_dthetaK_i)
-            ##Implicit
-            #df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp)
-            #dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
-
         return dL_dthetaK
 
     def _gradients(self, partial):
         """
         Gradients with respect to likelihood parameters
         """
-        #return np.zeros(1)
-        dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
 
         num_params = len(dlik_dthetaL)
         dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter
         for thetaL_i in range(num_params):
             #Explicit
-            #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
-            #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None])
-            #                                               might be +
             dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
@@ -165,34 +149,17 @@ class Laplace(likelihood):
 
         Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
-        ln_W_det = det_ln_diag(self.W)
-        yf_W_yf = mdot((Y_tilde - self.f_hat).T, np.diagflat(self.W), (Y_tilde - self.f_hat))
-
-        #Z_tilde = (+ self.NORMAL_CONST
-                   #+ self.ln_z_hat
-                   #+ 0.5*self.ln_I_KW_det
-                   #- 0.5*ln_W_det
-                   #+ 0.5*self.f_Ki_f
-                   #+ 0.5*yf_W_yf
-                   #)
-
         self.Sigma_tilde = np.diagflat(1.0/self.W)
 
-        Ki, _, _, K_det = pdinv(self.K)
+        self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
         ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
-        W = np.diagflat(self.W)
-        Wi = self.Sigma_tilde
-        W12i = np.sqrt(Wi)
-        #D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
-        #fDf = mdot(self.f_hat.T, D, self.f_hat)
         l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
         #print "fDf:{}   l:{}   detKWiBi:{}   W:{}   Wi:{}   Bi:{}   Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum())
 
-        y_Wi_Ki_i_y = mdot(Y_tilde.T, pdinv(self.K + Wi)[0], Y_tilde)
+        y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
         Z_tilde = (+ self.NORMAL_CONST
                    + l
                    + 0.5*ln_det_K_Wi__Bi
-                   #- 0.5*fDf
                    - 0.5*self.f_Ki_f
                    + 0.5*y_Wi_Ki_i_y
                   )
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 041b59bd..d6dbf55f 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -194,10 +194,10 @@ class student_t(likelihood_function):
         assert y.shape == f.shape
 
         e = y - f
-        objective = (gammaln((self.v + 1) * 0.5)
+        objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - np.log(self.sigma * np.sqrt(self.v * np.pi))
-                     - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v))
+                     - (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v))
                     )
         return np.sum(objective)
 
@@ -234,7 +234,6 @@ class student_t(likelihood_function):
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
         assert y.shape == f.shape
-
         e = y - f
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
         return hess
@@ -247,7 +246,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
+        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
                        ((e**2 + (self.sigma**2)*self.v)**3)
                     )
         return d3lik_d3f

From d4bfd99c21c835e5cf7873e20295561c031d5221 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 20 Jun 2013 14:30:25 +0100
Subject: [PATCH 53/71] Starting to fiddle with mode finding code

---
 GPy/examples/laplace_approximations.py  | 18 ++++++++++--------
 GPy/likelihoods/Laplace.py              | 12 ++++++------
 GPy/likelihoods/likelihood_functions.py |  1 -
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 887e35ae..d300806f 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -36,7 +36,7 @@ def timing():
     print np.mean(the_is)
 
 def v_fail_test():
-    plt.close('all')
+    #plt.close('all')
     real_var = 0.1
     X = np.linspace(0.0, 10.0, 50)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
@@ -57,6 +57,7 @@ def v_fail_test():
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel1)
     m.constrain_fixed('white', 1)
+    m.constrain_positive('t_noise')
     vs = 15
     noises = 40
     checkgrads = np.zeros((vs, noises))
@@ -64,23 +65,24 @@ def v_fail_test():
     for v_ind, v in enumerate(np.linspace(1, 20, vs)):
         m.likelihood.likelihood_function.v = v
         print v
-        for noise_ind, noise in enumerate(np.linspace(0.0000001, 1, noises)):
+        for noise_ind, noise in enumerate(np.linspace(0.0001, 1, noises)):
             m['t_noise'] = noise
             m.update_likelihood_approximation()
             checkgrads[v_ind, noise_ind] = m.checkgrad()
             vs_noises[v_ind, noise_ind] = (float(v)/(float(v) - 2))*(noise**2)
 
-    plt.figure(1)
+    plt.figure()
     plt.title('Checkgrads')
     plt.imshow(checkgrads, interpolation='nearest')
     plt.xlabel('noise')
     plt.ylabel('v')
 
-    plt.figure(2)
+    plt.figure()
     plt.title('variance change')
     plt.imshow(vs_noises, interpolation='nearest')
     plt.xlabel('noise')
     plt.ylabel('v')
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print(m)
 
 def debug_student_t_noise_approx():
@@ -139,13 +141,13 @@ def debug_student_t_noise_approx():
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     #m['white'] = 1e-3
-    m.constrain_fixed('rbf_v', 1.0898)
-    m.constrain_fixed('rbf_l', 1.8651)
+    #m.constrain_fixed('rbf_v', 1.0898)
+    #m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_variance', real_sd)
     #m.constrain_positive('rbf')
-    m.constrain_positive('t_noise')
+    #m.constrain_positive('t_noise')
+    m.constrain_positive('')
     #m.constrain_fixed('t_noi', real_sd)
-    m.ensure_default_constraints()
     m.update_likelihood_approximation()
     #m.optimize(messages=True)
     print(m)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 70ec568a..ed3229a9 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -68,8 +68,7 @@ class Laplace(likelihood):
     def _shared_gradients_components(self):
         #FIXME: Careful of side effects! And make sure W and K are up to date!
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
-        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat)
-
+        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T
         I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
         return dL_dfhat, I_KW_i
 
@@ -81,10 +80,10 @@ class Laplace(likelihood):
         dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
 
         #Implicit
-        impl = mdot(dlp, dL_dfhat.T, I_KW_i)
+        impl = mdot(dlp, dL_dfhat, I_KW_i)
         expl_a = mdot(self.Ki_f, self.Ki_f.T)
         expl_b = self.Wi_K_i
-        expl = 0.5*expl_a + 0.5*expl_b
+        expl = 0.5*expl_a - 0.5*expl_b # Might need to be -?
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
         #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
@@ -103,10 +102,11 @@ class Laplace(likelihood):
         for thetaL_i in range(num_params):
             #Explicit
             dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(mdot(self.Bi, self.K, dlik_hess_dthetaL[thetaL_i]))
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
-            dL_dthetaL_imp = np.dot(dL_dfhat.T, df_hat_dthetaL)
-            #print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
+            dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL)
+            print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
             dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp
 
         return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index d6dbf55f..4d298122 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -192,7 +192,6 @@ class student_t(likelihood_function):
 
         """
         assert y.shape == f.shape
-
         e = y - f
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)

From e80fad197ca3250bca4e9d7830a23dadf8ae62e9 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 24 Jun 2013 15:39:38 +0100
Subject: [PATCH 54/71] trying to fix optimisation problem, fixed a few bugs
 but still fails at very low noise

---
 GPy/examples/laplace_approximations.py |  4 +-
 GPy/likelihoods/Laplace.py             | 79 +++++++++++++++-----------
 2 files changed, 49 insertions(+), 34 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index d300806f..7b9f10b1 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -90,7 +90,7 @@ def debug_student_t_noise_approx():
     real_var = 0.1
     #Start a function, any function
     X = np.linspace(0.0, 10.0, 50)[:, None]
-    #X = np.array([0.5])[:, None]
+    #X = np.array([0.5, 1])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
     X_full = np.linspace(0.0, 10.0, 50)[:, None]
@@ -99,7 +99,7 @@ def debug_student_t_noise_approx():
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10
+    deg_free = 100000
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
 
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index ed3229a9..b5362839 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -51,6 +51,8 @@ class Laplace(likelihood):
         self.Z = 0
         self.YYT = None
 
+        self.old_a = None
+
     def predictive_values(self, mu, var, full_cov):
         if full_cov:
             raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
@@ -83,7 +85,7 @@ class Laplace(likelihood):
         impl = mdot(dlp, dL_dfhat, I_KW_i)
         expl_a = mdot(self.Ki_f, self.Ki_f.T)
         expl_b = self.Wi_K_i
-        expl = 0.5*expl_a - 0.5*expl_b # Might need to be -?
+        expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
         #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
@@ -265,7 +267,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50):
+    def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=40):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -275,7 +277,12 @@ class Laplace(likelihood):
         :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
         :returns: f_mode
         """
-        f = np.zeros((self.N, 1))
+        if self.old_a is None:
+            old_a = np.zeros((self.N, 1))
+        else:
+            old_a = self.old_a
+
+        f = np.dot(self.K, old_a)
         new_obj = -np.inf
         old_obj = np.inf
 
@@ -292,7 +299,7 @@ class Laplace(likelihood):
             #f_old = f.copy()
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-5     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-8     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
@@ -300,38 +307,46 @@ class Laplace(likelihood):
 
             W_f = W*f
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
-            #Find K_i_f
+
             b = W_f + grad
-            b = step_size*b
-
-            #Need this to find the f we have a stepsize which we need to move in, rather than a full unit movement
-            #c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-            #solve_L = cho_solve((L, True), W_12*c)
-            #f = c - np.dot(K, W_12*solve_L)
-
-            #FIXME: Can't we get rid of this? Don't we want to evaluate obj(c,f) and this is our new_obj?
-            #Why did I choose to evaluate the objective function at the new f with the old hessian? I'm sure there was a good reason,
-            #Document it!
             solve_L = cho_solve((L, True), W_12*np.dot(K, b))
-            a = b - W_12*solve_L
-            f = np.dot(K, a)
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            full_step_a = b - W_12*solve_L
+            da = full_step_a - old_a
 
-            tmp_old_obj = old_obj
-            old_obj = new_obj
-            new_obj = obj(a, f)
-            difference = new_obj - old_obj
-            if difference < 0:
-                #print "Objective function rose", difference
-                #If the objective function isn't rising, restart optimization
-                step_size *= 0.9
-                #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
-                #objective function isn't increasing, try reducing step size
-                #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
-                old_obj = tmp_old_obj
-                rs += 1
+            update_passed = False
+            while not update_passed:
+                a = old_a + step_size*da
+                f = np.dot(K, a)
 
-            difference = abs(difference)
+                old_obj = new_obj
+                new_obj = np.float(obj(a, f))
+                difference = new_obj - old_obj
+                #print "difference: ",difference
+                if difference < 0:
+                    #print grad
+                    print "Objective function rose", np.float(difference)
+                    #If the objective function isn't rising, restart optimization
+                    step_size *= 0.8
+                    print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    #objective function isn't increasing, try reducing step size
+                    #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
+                    #old_obj = tmp_old_obj
+                    old_obj = new_obj
+                    rs += 1
+                else:
+                    update_passed = True
+
+            #print "Iter difference: ", difference
+            #print "F: ", f
+            #print "A: ", a
+            old_a = a
+            #print "Positive difference obj: ", np.float(difference)
+            difference = np.float(abs(difference))
             i += 1
 
-        self.i = i
+        #print "Positive difference obj: ", np.float(difference)
+        print "Iterations: ",i
+        print "Step size reductions", rs
+        print "Final difference: ", difference
         return f

From 064efd5535818b3ca6ec93baa83fc72ade12eb42 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 25 Jun 2013 18:20:00 +0100
Subject: [PATCH 55/71] Added another optimisation which doesn't use gradients.
 Seems like F is almost always found, but Y can be off, suggesting that
 Wi__Ki_W is wrong, maybe W?

---
 GPy/examples/laplace_approximations.py | 47 +++++++++---------
 GPy/likelihoods/Laplace.py             | 69 ++++++++++++++++----------
 2 files changed, 67 insertions(+), 49 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 7b9f10b1..61291e71 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -25,7 +25,7 @@ def timing():
         kernel1 = GPy.kern.rbf(X.shape[1])
 
         t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-        corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=True)
+        corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
         m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
         m.ensure_default_constraints()
         m.update_likelihood_approximation()
@@ -54,18 +54,17 @@ def v_fail_test():
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel1)
-    m.constrain_fixed('white', 1)
-    m.constrain_positive('t_noise')
-    vs = 15
+    m.constrain_positive('')
+    vs = 25
     noises = 40
     checkgrads = np.zeros((vs, noises))
     vs_noises = np.zeros((vs, noises))
-    for v_ind, v in enumerate(np.linspace(1, 20, vs)):
+    for v_ind, v in enumerate(np.linspace(1, 100, vs)):
         m.likelihood.likelihood_function.v = v
         print v
-        for noise_ind, noise in enumerate(np.linspace(0.0001, 1, noises)):
+        for noise_ind, noise in enumerate(np.linspace(0.0001, 10, noises)):
             m['t_noise'] = noise
             m.update_likelihood_approximation()
             checkgrads[v_ind, noise_ind] = m.checkgrad()
@@ -77,11 +76,11 @@ def v_fail_test():
     plt.xlabel('noise')
     plt.ylabel('v')
 
-    plt.figure()
-    plt.title('variance change')
-    plt.imshow(vs_noises, interpolation='nearest')
-    plt.xlabel('noise')
-    plt.ylabel('v')
+    #plt.figure()
+    #plt.title('variance change')
+    #plt.imshow(vs_noises, interpolation='nearest')
+    #plt.xlabel('noise')
+    #plt.ylabel('v')
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print(m)
 
@@ -93,13 +92,14 @@ def debug_student_t_noise_approx():
     #X = np.array([0.5, 1])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
-    X_full = np.linspace(0.0, 10.0, 50)[:, None]
+    X_full = X
     Y_full = np.sin(X_full)
 
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 100000
+    deg_free = 10
+
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
 
@@ -110,7 +110,7 @@ def debug_student_t_noise_approx():
 
     plt.close('all')
     # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1])# + GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
@@ -134,13 +134,13 @@ def debug_student_t_noise_approx():
     #print m
 
     edited_real_sd = initial_var_guess #real_sd
-    edited_real_sd = real_sd
+    #edited_real_sd = real_sd
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
-    #m['white'] = 1e-3
+    m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_variance', real_sd)
@@ -159,11 +159,12 @@ def debug_student_t_noise_approx():
         m.plot()
         plt.plot(X_full, Y_full)
         plt.ylim(-2.5, 2.5)
+    print "Real noise std: ", real_sd
     return m
 
     #print "Clean student t, ncg"
     #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
+    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
     #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
@@ -260,7 +261,7 @@ def student_t_approx():
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
@@ -274,7 +275,7 @@ def student_t_approx():
 
     print "Corrupt student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=True)
+    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
@@ -290,7 +291,7 @@ def student_t_approx():
 
     #print "Clean student t, ncg"
     #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
+    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
     #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
@@ -304,7 +305,7 @@ def student_t_approx():
 
     #print "Corrupt student t, ncg"
     #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=False)
+    #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
     #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index b5362839..b9d74846 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -12,7 +12,7 @@ import random
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
-    def __init__(self, data, likelihood_function, extra_data=None, rasm=True):
+    def __init__(self, data, likelihood_function, extra_data=None, opt='rasm'):
         """
         Laplace Approximation
 
@@ -29,13 +29,13 @@ class Laplace(likelihood):
         :data: array of data the likelihood function is approximating
         :likelihood_function: likelihood function - subclass of likelihood_function
         :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
-        :rasm: Flag of whether to use rasmussens numerically stable mode finding or simple ncg optimisation
+        :opt: Optimiser to use, rasm numerically stable, ncg or nelder-mead (latter only work with 1d data)
 
         """
         self.data = data
         self.likelihood_function = likelihood_function
         self.extra_data = extra_data
-        self.rasm = rasm
+        self.opt = opt
 
         #Inital values
         self.N, self.D = self.data.shape
@@ -85,11 +85,12 @@ class Laplace(likelihood):
         impl = mdot(dlp, dL_dfhat, I_KW_i)
         expl_a = mdot(self.Ki_f, self.Ki_f.T)
         expl_b = self.Wi_K_i
+        #print "expl_a: {}, expl_b: {}".format(expl_a, expl_b)
         expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
-        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
-        dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp
+        print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        dL_dthetaK = dL_dthetaK_exp +dL_dthetaK_imp
         return dL_dthetaK
 
     def _gradients(self, partial):
@@ -109,7 +110,7 @@ class Laplace(likelihood):
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
             dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL)
             print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
-            dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp
+            dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
 
         return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
@@ -165,7 +166,7 @@ class Laplace(likelihood):
                    - 0.5*self.f_Ki_f
                    + 0.5*y_Wi_Ki_i_y
                   )
-        #print "Ztilde: {}".format(Z_tilde)
+        print "Ztilde: {}".format(Z_tilde)
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -183,10 +184,11 @@ class Laplace(likelihood):
         self.K = K.copy()
 
         #Find mode
-        if self.rasm:
-            self.f_hat = self.rasm_mode(K)
-        else:
-            self.f_hat = self.ncg_mode(K)
+        self.f_hat = {
+            'rasm': self.rasm_mode,
+            'ncg': self.ncg_mode,
+            'nelder': self.nelder_mode
+        }[self.opt](self.K)
 
         #Compute hessian and other variables at mode
         self._compute_likelihood_variables()
@@ -196,20 +198,20 @@ class Laplace(likelihood):
         self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            self.W[self.W < 0] = 1e-5  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                        #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                        #To cause the posterior to become less certain than the prior and likelihood,
                                        #This is a property only held by non-log-concave likelihoods
 
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
-        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
-        self.Bi, _, _, B_det = pdinv(self.B)
+        #self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
+        #self.Bi, _, _, B_det = pdinv(self.B)
 
         #Do the computation again at f to get Ki_f which is useful
-        b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
-        solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
-        a = b - self.W_12*solve_chol
-        self.Ki_f = a
+        #b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
+        #solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
+        #a = b - self.W_12*solve_chol
+        self.Ki_f = self.a
 
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
         self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
@@ -239,6 +241,17 @@ class Laplace(likelihood):
         L = jitchol(B)
         return (B, L, W_12)
 
+    def nelder_mode(self, K):
+        f = np.zeros((self.N, 1))
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
+        def obj(f):
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f)))
+            return float(res)
+
+        res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True})
+        f_new = res.x
+        return f_new[:, None]
+
     def ncg_mode(self, K):
         """
         Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
@@ -261,13 +274,13 @@ class Laplace(likelihood):
             return np.squeeze(res)
 
         def obj_hess(f):
-            res = -1 * (--np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
+            res = -1 * (np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
             return np.squeeze(res)
 
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=40):
+    def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=10):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -287,11 +300,10 @@ class Laplace(likelihood):
         old_obj = np.inf
 
         def obj(a, f):
-            #Careful of shape of data!
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-6
+        epsilon = 1e-9
         step_size = 1
         rs = 0
         i = 0
@@ -299,7 +311,7 @@ class Laplace(likelihood):
             #f_old = f.copy()
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-8     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
@@ -314,6 +326,7 @@ class Laplace(likelihood):
             full_step_a = b - W_12*solve_L
             da = full_step_a - old_a
 
+            f_old = f
             update_passed = False
             while not update_passed:
                 a = old_a + step_size*da
@@ -323,11 +336,11 @@ class Laplace(likelihood):
                 new_obj = np.float(obj(a, f))
                 difference = new_obj - old_obj
                 #print "difference: ",difference
-                if difference < 0:
+                if difference < -epsilon:
                     #print grad
                     print "Objective function rose", np.float(difference)
                     #If the objective function isn't rising, restart optimization
-                    step_size *= 0.8
+                    step_size *= 0.4
                     print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                     #objective function isn't increasing, try reducing step size
                     #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
@@ -337,16 +350,20 @@ class Laplace(likelihood):
                 else:
                     update_passed = True
 
+            difference = np.abs(np.sum(f - f_old)) + abs(difference)
             #print "Iter difference: ", difference
             #print "F: ", f
             #print "A: ", a
             old_a = a
             #print "Positive difference obj: ", np.float(difference)
-            difference = np.float(abs(difference))
+            #difference = np.float(abs(difference))
             i += 1
 
         #print "Positive difference obj: ", np.float(difference)
         print "Iterations: ",i
         print "Step size reductions", rs
         print "Final difference: ", difference
+        self.a = a
+        self.B, self.B_chol, self.W_12 = B, L, W_12
+        self.Bi, _, _, B_det = pdinv(self.B)
         return f

From 617d73ca3271f080ed2e58efd9cbd9a49e301ac0 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 26 Jun 2013 15:44:26 +0100
Subject: [PATCH 56/71] Now checkgrads a lot more of the time, but still fails
 in optimisation; it also seems odd that when a parameter is fixed the kernel
 parameters go to infinity

---
 GPy/examples/laplace_approximations.py | 17 +++++++++++------
 GPy/likelihoods/Laplace.py             | 23 ++++++++---------------
 GPy/models/GP.py                       |  7 +++++--
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 61291e71..0fd3efeb 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -98,7 +98,7 @@ def debug_student_t_noise_approx():
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10
+    deg_free = 100
 
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
@@ -133,20 +133,23 @@ def debug_student_t_noise_approx():
         #plt.plot(X_full, Y_full)
     #print m
 
-    edited_real_sd = initial_var_guess #real_sd
+    real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
+    edited_real_sd = real_stu_t_std#initial_var_guess #real_sd
     #edited_real_sd = real_sd
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
-    m['rbf_len'] = 1.5
+    #m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 1.8651)
-    #m.constrain_fixed('t_noise_variance', real_sd)
+    m.constrain_fixed('t_noise_std', edited_real_sd)
     #m.constrain_positive('rbf')
-    #m.constrain_positive('t_noise')
-    m.constrain_positive('')
+    #m.constrain_positive('t_noise_std')
+    #m.constrain_positive('')
+    m.ensure_default_constraints()
     #m.constrain_fixed('t_noi', real_sd)
     m.update_likelihood_approximation()
     #m.optimize(messages=True)
@@ -264,6 +267,7 @@ def student_t_approx():
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     m.ensure_default_constraints()
+    m.constrain_positive('t_noise')
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
@@ -278,6 +282,7 @@ def student_t_approx():
     corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
+    m.constrain_positive('t_noise')
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index b9d74846..1431a7c6 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -109,7 +109,7 @@ class Laplace(likelihood):
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
             dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL)
-            print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
+            #print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
             dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
 
         return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
@@ -147,10 +147,11 @@ class Laplace(likelihood):
         Li = chol_inv(L)
         Lt_W = L.T*self.W.T
 
-        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
+        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
         self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
 
         Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
+        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         self.Sigma_tilde = np.diagflat(1.0/self.W)
 
@@ -166,7 +167,7 @@ class Laplace(likelihood):
                    - 0.5*self.f_Ki_f
                    + 0.5*y_Wi_Ki_i_y
                   )
-        print "Ztilde: {}".format(Z_tilde)
+        #print "Ztilde: {}".format(Z_tilde)
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -280,7 +281,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=10):
+    def rasm_mode(self, K, MAX_ITER=250, MAX_RESTART=10):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -308,7 +309,6 @@ class Laplace(likelihood):
         rs = 0
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
-            #f_old = f.copy()
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
                 W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
@@ -338,10 +338,10 @@ class Laplace(likelihood):
                 #print "difference: ",difference
                 if difference < -epsilon:
                     #print grad
-                    print "Objective function rose", np.float(difference)
+                    #print "Objective function rose", np.float(difference)
                     #If the objective function isn't rising, restart optimization
                     step_size *= 0.4
-                    print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                     #objective function isn't increasing, try reducing step size
                     #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
                     #old_obj = tmp_old_obj
@@ -351,18 +351,11 @@ class Laplace(likelihood):
                     update_passed = True
 
             difference = np.abs(np.sum(f - f_old)) + abs(difference)
-            #print "Iter difference: ", difference
-            #print "F: ", f
-            #print "A: ", a
             old_a = a
-            #print "Positive difference obj: ", np.float(difference)
-            #difference = np.float(abs(difference))
             i += 1
 
         #print "Positive difference obj: ", np.float(difference)
-        print "Iterations: ",i
-        print "Step size reductions", rs
-        print "Final difference: ", difference
+        print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
         self.a = a
         self.B, self.B_chol, self.W_12 = B, L, W_12
         self.Bi, _, _, B_det = pdinv(self.B)
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 636ebba0..7b6fab27 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -141,10 +141,11 @@ class GP(model):
 
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
-        self.likelihood.fit_full(self.kern.K(self.X))
-        self.likelihood._set_params(self.likelihood._get_params())
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
+        print "dL_dthetaK should be: ", dL_dthetaK
         if isinstance(self.likelihood, Laplace):
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
             dK_dthetaK = self.kern.dK_dtheta
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
@@ -153,6 +154,8 @@ class GP(model):
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
             #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+        print "dL_dthetaK is: ", dL_dthetaK
+
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
 

From c90b1f0c99b84bf7e981113e5bfd83396b825ed1 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 27 Jun 2013 15:04:57 +0100
Subject: [PATCH 57/71] Added minimizer for finding f, doesn't help

---
 GPy/examples/laplace_approximations.py |  8 +--
 GPy/likelihoods/Laplace.py             | 80 ++++++++++++++++----------
 GPy/models/GP.py                       | 11 ++--
 3 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 0fd3efeb..abb5f4ce 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -58,13 +58,13 @@ def v_fail_test():
     m = GPy.models.GP(X, stu_t_likelihood, kernel1)
     m.constrain_positive('')
     vs = 25
-    noises = 40
+    noises = 30
     checkgrads = np.zeros((vs, noises))
     vs_noises = np.zeros((vs, noises))
     for v_ind, v in enumerate(np.linspace(1, 100, vs)):
         m.likelihood.likelihood_function.v = v
         print v
-        for noise_ind, noise in enumerate(np.linspace(0.0001, 10, noises)):
+        for noise_ind, noise in enumerate(np.linspace(0.0001, 100, noises)):
             m['t_noise'] = noise
             m.update_likelihood_approximation()
             checkgrads[v_ind, noise_ind] = m.checkgrad()
@@ -145,9 +145,9 @@ def debug_student_t_noise_approx():
     #m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 1.8651)
-    m.constrain_fixed('t_noise_std', edited_real_sd)
+    #m.constrain_fixed('t_noise_std', edited_real_sd)
     #m.constrain_positive('rbf')
-    #m.constrain_positive('t_noise_std')
+    m.constrain_positive('t_noise_std')
     #m.constrain_positive('')
     m.ensure_default_constraints()
     #m.constrain_fixed('t_noi', real_sd)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 1431a7c6..e096c5f4 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -90,7 +90,7 @@ class Laplace(likelihood):
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
         print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
-        dL_dthetaK = dL_dthetaK_exp +dL_dthetaK_imp
+        dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
         return dL_dthetaK
 
     def _gradients(self, partial):
@@ -126,7 +126,6 @@ class Laplace(likelihood):
         due to the z rescaling.
 
         at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1)
-
         This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1)
         giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f)
 
@@ -143,17 +142,18 @@ class Laplace(likelihood):
         #dtritri -> L -> L_i
         #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
         #((L.T*w)_i + I)f_hat = y_tilde
-        L = jitchol(self.K)
-        Li = chol_inv(L)
-        Lt_W = L.T*self.W.T
+        #L = jitchol(self.K)
+        #Li = chol_inv(L)
+        #Lt_W = L.T*self.W.T
 
-        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
-        self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
+        #Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
+        #self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
+        #Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
-        Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
-        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        Wi = 1.0/self.W
+        self.Sigma_tilde = np.diagflat(Wi)
 
-        self.Sigma_tilde = np.diagflat(1.0/self.W)
+        Y_tilde = Wi*(self.Ki_f + self.W*self.f_hat)
 
         self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
         ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
@@ -281,7 +281,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=250, MAX_RESTART=10):
+    def rasm_mode(self, K, MAX_ITER=40, MAX_RESTART=10):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -297,6 +297,7 @@ class Laplace(likelihood):
             old_a = self.old_a
 
         f = np.dot(self.K, old_a)
+        self.f = f
         new_obj = -np.inf
         old_obj = np.inf
 
@@ -304,7 +305,7 @@ class Laplace(likelihood):
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-9
+        epsilon = 1e-6
         step_size = 1
         rs = 0
         i = 0
@@ -316,6 +317,8 @@ class Laplace(likelihood):
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)
+            #if i > 30:
+                #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
             W_f = W*f
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
@@ -326,37 +329,52 @@ class Laplace(likelihood):
             full_step_a = b - W_12*solve_L
             da = full_step_a - old_a
 
-            f_old = f
-            update_passed = False
-            while not update_passed:
+            f_old = self.f.copy()
+
+            def inner_obj(step_size, old_a, da, K):
                 a = old_a + step_size*da
                 f = np.dot(K, a)
+                self.a = a
+                self.f = f
+                return -obj(a, f)
 
-                old_obj = new_obj
-                new_obj = np.float(obj(a, f))
-                difference = new_obj - old_obj
+            from functools import partial
+            i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K)
+            old_obj = new_obj
+            new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
+
+            #update_passed = False
+            #while not update_passed:
+                #a = old_a + step_size*da
+                #f = np.dot(K, a)
+
+                #old_obj = new_obj
+                #new_obj = obj(a, f)
+                #difference = new_obj - old_obj
                 #print "difference: ",difference
-                if difference < -epsilon:
-                    #print grad
+                #if difference < 0:
+                    ##print grad
                     #print "Objective function rose", np.float(difference)
-                    #If the objective function isn't rising, restart optimization
-                    step_size *= 0.4
+                    ##If the objective function isn't rising, restart optimization
+                    #step_size *= 0.8
                     #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
-                    #objective function isn't increasing, try reducing step size
-                    #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
-                    #old_obj = tmp_old_obj
-                    old_obj = new_obj
-                    rs += 1
-                else:
-                    update_passed = True
+                    ##objective function isn't increasing, try reducing step size
+                    ##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
+                    ##old_obj = tmp_old_obj
+                    #old_obj = new_obj
+                    #rs += 1
+                #else:
+                    #update_passed = True
 
+            f = self.f
+            difference = new_obj - old_obj
             difference = np.abs(np.sum(f - f_old)) + abs(difference)
-            old_a = a
+            old_a = self.a #a
             i += 1
 
         #print "Positive difference obj: ", np.float(difference)
         print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
-        self.a = a
+        #self.a = a
         self.B, self.B_chol, self.W_12 = B, L, W_12
         self.Bi, _, _, B_det = pdinv(self.B)
         return f
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 7b6fab27..1d57ed38 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -142,19 +142,18 @@ class GP(model):
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
-        print "dL_dthetaK should be: ", dL_dthetaK
+        #print "dL_dthetaK should be: ", dL_dthetaK
         if isinstance(self.likelihood, Laplace):
-            self.likelihood.fit_full(self.kern.K(self.X))
-            self.likelihood._set_params(self.likelihood._get_params())
+            #self.likelihood.fit_full(self.kern.K(self.X))
+            #self.likelihood._set_params(self.likelihood._get_params())
             dK_dthetaK = self.kern.dK_dtheta
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            #print "dL_dthetaK after: ",dL_dthetaK
-            #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+        #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         print "dL_dthetaK is: ", dL_dthetaK
+        print "dL_dthetaL is: ", dL_dthetaL
 
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))

From 26b3855af56ee220cfa00928f6f936bd1161acdf Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 1 Jul 2013 10:06:20 +0100
Subject: [PATCH 58/71] Everything seems to be gradchecking again

---
 GPy/examples/laplace_approximations.py  |  7 ++++++-
 GPy/likelihoods/Laplace.py              | 18 +++++++++---------
 GPy/likelihoods/likelihood_functions.py |  2 +-
 GPy/models/GP.py                        |  3 +--
 4 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index abb5f4ce..24f2d88c 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -91,6 +91,8 @@ def debug_student_t_noise_approx():
     X = np.linspace(0.0, 10.0, 50)[:, None]
     #X = np.array([0.5, 1])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+    #ty = np.array([1., 9.97733584, 4.17841363])[:, None]
+    #Y = ty
 
     X_full = X
     Y_full = np.sin(X_full)
@@ -98,7 +100,7 @@ def debug_student_t_noise_approx():
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 100
+    deg_free = 10000
 
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
@@ -151,6 +153,9 @@ def debug_student_t_noise_approx():
     #m.constrain_positive('')
     m.ensure_default_constraints()
     #m.constrain_fixed('t_noi', real_sd)
+    #m['rbf_var'] = 0.20446332
+    #m['rbf_leng'] = 0.85776241
+    #m['t_noise'] = 0.667083294421005
     m.update_likelihood_approximation()
     #m.optimize(messages=True)
     print(m)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index e096c5f4..e4652f27 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -153,7 +153,7 @@ class Laplace(likelihood):
         Wi = 1.0/self.W
         self.Sigma_tilde = np.diagflat(Wi)
 
-        Y_tilde = Wi*(self.Ki_f + self.W*self.f_hat)
+        Y_tilde = Wi*self.Ki_f + self.f_hat
 
         self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
         ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
@@ -199,7 +199,7 @@ class Laplace(likelihood):
         self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-5  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            self.W[self.W < 0] = 1e-8  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                        #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                        #To cause the posterior to become less certain than the prior and likelihood,
                                        #This is a property only held by non-log-concave likelihoods
@@ -312,7 +312,7 @@ class Laplace(likelihood):
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 0#1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
@@ -329,8 +329,9 @@ class Laplace(likelihood):
             full_step_a = b - W_12*solve_L
             da = full_step_a - old_a
 
-            f_old = self.f.copy()
+            f_old = f.copy()
 
+            f_old = self.f.copy()
             def inner_obj(step_size, old_a, da, K):
                 a = old_a + step_size*da
                 f = np.dot(K, a)
@@ -340,7 +341,6 @@ class Laplace(likelihood):
 
             from functools import partial
             i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K)
-            old_obj = new_obj
             new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
 
             #update_passed = False
@@ -354,10 +354,10 @@ class Laplace(likelihood):
                 #print "difference: ",difference
                 #if difference < 0:
                     ##print grad
-                    #print "Objective function rose", np.float(difference)
+                    ##print "Objective function rose", np.float(difference)
                     ##If the objective function isn't rising, restart optimization
                     #step_size *= 0.8
-                    #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                     ##objective function isn't increasing, try reducing step size
                     ##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
                     ##old_obj = tmp_old_obj
@@ -368,12 +368,12 @@ class Laplace(likelihood):
 
             f = self.f
             difference = new_obj - old_obj
-            difference = np.abs(np.sum(f - f_old)) + abs(difference)
+            difference = np.abs(np.sum(f - f_old)) #+ abs(difference)
             old_a = self.a #a
             i += 1
 
         #print "Positive difference obj: ", np.float(difference)
-        print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
+        #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
         #self.a = a
         self.B, self.B_chol, self.W_12 = B, L, W_12
         self.Bi, _, _, B_det = pdinv(self.B)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 4d298122..ebc87f56 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -274,7 +274,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e)
+        dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
                             / ((self.v*(self.sigma**2) + e**2)**2)
                            )
         return dlik_grad_dsigma
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 1d57ed38..20337ef5 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -152,8 +152,7 @@ class GP(model):
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
         #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-        print "dL_dthetaK is: ", dL_dthetaK
-        print "dL_dthetaL is: ", dL_dthetaL
+        print "dL_dthetaK: {}   dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL)
 
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))

From a7169ab1ab771e567e45d6a11ae9e13b13f3c754 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 1 Jul 2013 15:21:47 +0100
Subject: [PATCH 59/71] Fixed bug where B wasn't referring to current f location

---
 GPy/core/model.py                       |  3 +++
 GPy/examples/laplace_approximations.py  |  5 +++--
 GPy/likelihoods/Laplace.py              | 21 ++++++++++-----------
 GPy/likelihoods/likelihood_functions.py |  6 +++++-
 4 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 94202396..83a4a428 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -244,6 +244,9 @@ class model(parameterised):
         LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
         prior_gradients = self._transform_gradients(self._log_prior_gradients())
         obj_grads = -LL_gradients - prior_gradients
+        print self
+        #self.checkgrad(verbose=1)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         return obj_f, obj_grads
 
     def optimize(self, optimizer=None, start=None, **kwargs):
diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 24f2d88c..bb621424 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -100,7 +100,7 @@ def debug_student_t_noise_approx():
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10000
+    deg_free = 1000
 
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
@@ -152,7 +152,7 @@ def debug_student_t_noise_approx():
     m.constrain_positive('t_noise_std')
     #m.constrain_positive('')
     m.ensure_default_constraints()
-    #m.constrain_fixed('t_noi', real_sd)
+    m.constrain_bounded('t_noi', 0.001, 10)
     #m['rbf_var'] = 0.20446332
     #m['rbf_leng'] = 0.85776241
     #m['t_noise'] = 0.667083294421005
@@ -168,6 +168,7 @@ def debug_student_t_noise_approx():
         plt.plot(X_full, Y_full)
         plt.ylim(-2.5, 2.5)
     print "Real noise std: ", real_sd
+    print "or Real noise std: ", real_stu_t_std
     return m
 
     #print "Clean student t, ncg"
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index e4652f27..4c9c67df 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -158,7 +158,6 @@ class Laplace(likelihood):
         self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
         ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
         l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
-        #print "fDf:{}   l:{}   detKWiBi:{}   W:{}   Wi:{}   Bi:{}   Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum())
 
         y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
         Z_tilde = (+ self.NORMAL_CONST
@@ -199,14 +198,14 @@ class Laplace(likelihood):
         self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-8  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                        #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                        #To cause the posterior to become less certain than the prior and likelihood,
                                        #This is a property only held by non-log-concave likelihoods
 
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
-        #self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
-        #self.Bi, _, _, B_det = pdinv(self.B)
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
+        self.Bi, _, _, B_det = pdinv(self.B)
 
         #Do the computation again at f to get Ki_f which is useful
         #b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
@@ -305,14 +304,14 @@ class Laplace(likelihood):
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-6
+        epsilon = 1e-10
         step_size = 1
         rs = 0
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 0#1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
@@ -335,13 +334,13 @@ class Laplace(likelihood):
             def inner_obj(step_size, old_a, da, K):
                 a = old_a + step_size*da
                 f = np.dot(K, a)
-                self.a = a
+                self.a = a # This is nasty, need to set something within an optimization though
                 self.f = f
                 return -obj(a, f)
 
             from functools import partial
             i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K)
-            new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
+            new_obj = sp.optimize.brent(i_o, tol=1e-6, maxiter=10)
 
             #update_passed = False
             #while not update_passed:
@@ -373,8 +372,8 @@ class Laplace(likelihood):
             i += 1
 
         #print "Positive difference obj: ", np.float(difference)
-        #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
+        print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
         #self.a = a
-        self.B, self.B_chol, self.W_12 = B, L, W_12
-        self.Bi, _, _, B_det = pdinv(self.B)
+        #self.B, self.B_chol, self.W_12 = B, L, W_12
+        #self.Bi, _, _, B_det = pdinv(self.B)
         return f
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index ebc87f56..57627198 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -195,8 +195,9 @@ class student_t(likelihood_function):
         e = y - f
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
-                     - np.log(self.sigma * np.sqrt(self.v * np.pi))
+                     - 0.5*np.log((self.sigma**2) * self.v * np.pi)
                      - (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v))
+                     #- (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2)))
                     )
         return np.sum(objective)
 
@@ -264,6 +265,7 @@ class student_t(likelihood_function):
         dlik_dsigma = ( - (1/self.sigma) +
                         ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) )
                       )
+        #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
@@ -290,6 +292,8 @@ class student_t(likelihood_function):
         dlik_hess_dsigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
                               ((e**2 + (self.sigma**2)*self.v)**3)
                            )
+        #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
+                             #/ ((e**2 + (self.sigma**2)*self.v)**3) )
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):

From ab6a3a571e4ef0aec66776f56921326166f09d40 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 2 Jul 2013 11:14:48 +0100
Subject: [PATCH 60/71] Playing trying to find what makes it want to go so low

---
 GPy/core/model.py                       |  2 +-
 GPy/examples/laplace_approximations.py  | 21 ++++++++++++++-------
 GPy/likelihoods/Laplace.py              | 18 +++++++++---------
 GPy/likelihoods/likelihood_functions.py |  4 ++--
 4 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 83a4a428..f97938a4 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -246,7 +246,7 @@ class model(parameterised):
         obj_grads = -LL_gradients - prior_gradients
         print self
         #self.checkgrad(verbose=1)
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         return obj_f, obj_grads
 
     def optimize(self, optimizer=None, start=None, **kwargs):
diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index bb621424..14400a08 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -88,9 +88,12 @@ def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 50)[:, None]
+    #X = np.linspace(0.0, 10.0, 50)[:, None]
+    X = np.random.rand(100)[:, None]
+    #X = np.random.rand(100)[:, None]
     #X = np.array([0.5, 1])[:, None]
-    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var
+    #Y = X + np.random.randn(*X.shape)*real_var
     #ty = np.array([1., 9.97733584, 4.17841363])[:, None]
     #Y = ty
 
@@ -112,7 +115,8 @@ def debug_student_t_noise_approx():
 
     plt.close('all')
     # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1])# + GPy.kern.white(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    #kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
@@ -136,7 +140,7 @@ def debug_student_t_noise_approx():
     #print m
 
     real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
-    edited_real_sd = real_stu_t_std#initial_var_guess #real_sd
+    edited_real_sd = real_stu_t_std + 1#initial_var_guess #real_sd
     #edited_real_sd = real_sd
 
     print "Clean student t, rasm"
@@ -149,13 +153,16 @@ def debug_student_t_noise_approx():
     #m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_std', edited_real_sd)
     #m.constrain_positive('rbf')
-    m.constrain_positive('t_noise_std')
+    #m.constrain_positive('t_noise_std')
     #m.constrain_positive('')
-    m.ensure_default_constraints()
-    m.constrain_bounded('t_noi', 0.001, 10)
+    #m.constrain_bounded('t_noi', 0.001, 10)
+    #m.constrain_fixed('t_noi', real_stu_t_std)
+    m.constrain_fixed('white', 0.01)
+    #m.constrain_fixed('t_no', 0.01)
     #m['rbf_var'] = 0.20446332
     #m['rbf_leng'] = 0.85776241
     #m['t_noise'] = 0.667083294421005
+    m.ensure_default_constraints()
     m.update_likelihood_approximation()
     #m.optimize(messages=True)
     print(m)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 4c9c67df..2ae68613 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -156,15 +156,15 @@ class Laplace(likelihood):
         Y_tilde = Wi*self.Ki_f + self.f_hat
 
         self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
-        ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
-        l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+        self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
+        self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
 
-        y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-        Z_tilde = (+ self.NORMAL_CONST
-                   + l
-                   + 0.5*ln_det_K_Wi__Bi
+        self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+        Z_tilde = (#+ self.NORMAL_CONST
+                   + self.lik
+                   + 0.5*self.ln_det_K_Wi__Bi
                    - 0.5*self.f_Ki_f
-                   + 0.5*y_Wi_Ki_i_y
+                   + 0.5*self.y_Wi_Ki_i_y
                   )
         #print "Ztilde: {}".format(Z_tilde)
 
@@ -198,7 +198,7 @@ class Laplace(likelihood):
         self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            self.W[self.W < 0] = 1e-10  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                        #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                        #To cause the posterior to become less certain than the prior and likelihood,
                                        #This is a property only held by non-log-concave likelihoods
@@ -311,7 +311,7 @@ class Laplace(likelihood):
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-10     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 57627198..fd64dbe6 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -196,8 +196,8 @@ class student_t(likelihood_function):
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - 0.5*np.log((self.sigma**2) * self.v * np.pi)
-                     - (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v))
-                     #- (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2)))
+                     #- (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v))
+                     - (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2)))
                     )
         return np.sum(objective)
 

From 4e5cefb4b5cb14a3c4f94dbd4d18eac8c70a84fd Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 8 Jul 2013 15:48:53 +0100
Subject: [PATCH 61/71] Reparameterised in terms of sigma2

---
 GPy/core/model.py                       |   3 -
 GPy/examples/laplace_approximations.py  |  34 ++--
 GPy/likelihoods/Laplace.py              |  12 +-
 GPy/likelihoods/likelihood_functions.py | 207 +++++++++++++++++++++---
 4 files changed, 207 insertions(+), 49 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index f97938a4..94202396 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -244,9 +244,6 @@ class model(parameterised):
         LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
         prior_gradients = self._transform_gradients(self._log_prior_gradients())
         obj_grads = -LL_gradients - prior_gradients
-        print self
-        #self.checkgrad(verbose=1)
-        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         return obj_f, obj_grads
 
     def optimize(self, optimizer=None, start=None, **kwargs):
diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 14400a08..d6b48ebf 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -24,7 +24,7 @@ def timing():
         edited_real_sd = real_sd
         kernel1 = GPy.kern.rbf(X.shape[1])
 
-        t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+        t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
         corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
         m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
         m.ensure_default_constraints()
@@ -53,7 +53,7 @@ def v_fail_test():
     edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel1)
     m.constrain_positive('')
@@ -92,18 +92,18 @@ def debug_student_t_noise_approx():
     X = np.random.rand(100)[:, None]
     #X = np.random.rand(100)[:, None]
     #X = np.array([0.5, 1])[:, None]
-    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var
+    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + 1
     #Y = X + np.random.randn(*X.shape)*real_var
     #ty = np.array([1., 9.97733584, 4.17841363])[:, None]
     #Y = ty
 
     X_full = X
-    Y_full = np.sin(X_full)
+    Y_full = np.sin(X_full) + 1
 
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 1000
+    deg_free = 100
 
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
@@ -115,7 +115,7 @@ def debug_student_t_noise_approx():
 
     plt.close('all')
     # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
     #kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
@@ -140,24 +140,24 @@ def debug_student_t_noise_approx():
     #print m
 
     real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
-    edited_real_sd = real_stu_t_std + 1#initial_var_guess #real_sd
+    edited_real_sd = real_stu_t_std**2 #initial_var_guess #real_sd
     #edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
 
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     #m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
-    #m.constrain_fixed('rbf_l', 1.8651)
+    #m.constrain_fixed('rbf_l', 0.2651)
     #m.constrain_fixed('t_noise_std', edited_real_sd)
     #m.constrain_positive('rbf')
-    #m.constrain_positive('t_noise_std')
+    m.constrain_positive('t_noise_std')
     #m.constrain_positive('')
     #m.constrain_bounded('t_noi', 0.001, 10)
     #m.constrain_fixed('t_noi', real_stu_t_std)
-    m.constrain_fixed('white', 0.01)
+    #m.constrain_fixed('white', 0.01)
     #m.constrain_fixed('t_no', 0.01)
     #m['rbf_var'] = 0.20446332
     #m['rbf_leng'] = 0.85776241
@@ -179,7 +179,7 @@ def debug_student_t_noise_approx():
     return m
 
     #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
     #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
@@ -276,7 +276,7 @@ def student_t_approx():
     edited_real_sd = real_sd #initial_var_guess
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     m.ensure_default_constraints()
@@ -291,7 +291,7 @@ def student_t_approx():
     plt.title('Student-t rasm clean')
 
     print "Corrupt student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
@@ -308,7 +308,7 @@ def student_t_approx():
     return m
 
     #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
     #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
@@ -322,7 +322,7 @@ def student_t_approx():
     #plt.title('Student-t ncg clean')
 
     #print "Corrupt student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
     #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
     #m.ensure_default_constraints()
@@ -337,7 +337,7 @@ def student_t_approx():
 
 
     ###with a student t distribution, since it has heavy tails it should work well
-    ###likelihood_function = student_t(deg_free, sigma=real_var)
+    ###likelihood_function = student_t(deg_free, sigma2=real_var)
     ###lap = Laplace(Y, likelihood_function)
     ###cov = kernel.K(X)
     ###lap.fit_full(cov)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 2ae68613..984112a5 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -220,10 +220,10 @@ class Laplace(likelihood):
         self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
 
         #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
-        self.ln_z_hat = (- 0.5*self.f_Ki_f
-                         - self.ln_I_KW_det
-                         + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
-                         )
+        #self.ln_z_hat = (- 0.5*self.f_Ki_f
+                         #- self.ln_I_KW_det
+                         #+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+                         #)
 
         return self._compute_GP_variables()
 
@@ -308,6 +308,8 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
+        #if self.likelihood_function.sigma < 0.001:
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
@@ -316,8 +318,6 @@ class Laplace(likelihood):
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)
-            #if i > 30:
-                #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
             W_f = W*f
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index fd64dbe6..bfc759d7 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -158,26 +158,26 @@ class student_t(likelihood_function):
     dln p(yi|fi)_dfi
     d2ln p(yi|fi)_d2fifj
     """
-    def __init__(self, deg_free, sigma=2):
+    def __init__(self, deg_free, sigma2=2):
         #super(student_t, self).__init__()
         self.v = deg_free
-        self.sigma = sigma
+        self.sigma2 = sigma2
         self.log_concave = False
 
-        self._set_params(np.asarray(sigma))
+        self._set_params(np.asarray(sigma2))
 
     def _get_params(self):
-        return np.asarray(self.sigma)
+        return np.asarray(self.sigma2)
 
     def _get_param_names(self):
-        return ["t_noise_std"]
+        return ["t_noise_std2"]
 
     def _set_params(self, x):
-        self.sigma = float(x)
+        self.sigma2 = float(x)
 
     @property
     def variance(self, extra_data=None):
-        return (self.v / float(self.v - 2)) * (self.sigma**2)
+        return (self.v / float(self.v - 2)) * self.sigma2
 
     def link_function(self, y, f, extra_data=None):
         """link_function $\ln p(y|f)$
@@ -193,12 +193,16 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
+        A = gammaln((self.v + 1) * 0.5)
+        B = -gammaln(self.v * 0.5)
+        C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
+        D = (-(self.v + 1)*0.5)*np.log(1 + (e**2)/(self.v*self.sigma2))
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
-                     - 0.5*np.log((self.sigma**2) * self.v * np.pi)
-                     #- (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v))
-                     - (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2)))
+                     - 0.5*np.log(self.sigma2 * self.v * np.pi)
+                     + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
                     )
+        #print "A: {} B: {} C: {} D: {} obj: {}".format(A,B,C,D.sum(),objective.sum())
         return np.sum(objective)
 
     def dlik_df(self, y, f, extra_data=None):
@@ -215,7 +219,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
         return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
@@ -235,7 +239,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
+        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -246,8 +250,8 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
-                       ((e**2 + (self.sigma**2)*self.v)**3)
+        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       ((e**2 + self.sigma2*self.v)**3)
                     )
         return d3lik_d3f
 
@@ -262,10 +266,16 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_dsigma = ( - (1/self.sigma) +
-                        ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) )
-                      )
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_dsigma = ( - (1/sigma) +
+                        #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      #)
+        #dlik_dsigma = ( - 1 +
+                        #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      #)
         #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
+        #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
+        dlik_dsigma = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
@@ -276,9 +286,11 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
-                            / ((self.v*(self.sigma**2) + e**2)**2)
-                           )
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
+                            #/ ((self.v*self.sigma2 + e**2)**2)
+                           #)
+        dlik_grad_dsigma = (-self.v*(self.v+1)*e)/((self.sigma2*self.v + e**2)**2)
         return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
@@ -289,11 +301,15 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_hess_dsigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
-                              ((e**2 + (self.sigma**2)*self.v)**3)
-                           )
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
+                              #((e**2 + self.sigma2*self.v)**3)
+                           #)
         #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
                              #/ ((e**2 + (self.sigma**2)*self.v)**3) )
+        dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
+                              / (self.sigma2*self.v + (e**2))**3
+                           )
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):
@@ -466,3 +482,148 @@ class weibull_survival(likelihood_function):
 
         hess = (y**self.shape)*np.exp(f)
         return np.squeeze(hess)
+
+#class gaussian(likelihood_function):
+    #"""
+    #Gaussian likelihood - this is a test class for approximation schemes
+    #"""
+    #def __init__(self, variance):
+        #self._set_params(np.asarray(variance))
+
+    #def _get_params(self):
+        #return np.asarray(self.sigma2)
+
+    #def _get_param_names(self):
+        #return ["noise_variance"]
+
+    #def _set_params(self, x):
+        #self.variance = float(x)
+
+    #def link_function(self, y, f, extra_data=None):
+        #"""link_function $\ln p(y|f)$
+        #$$\ln p(y_{i}|f_{i}) = \ln $$
+
+        #:y: data
+        #:f: latent variables f
+        #:extra_data: extra_data which is not used in student t distribution
+        #:returns: float(likelihood evaluated for this point)
+
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #objective = -0.5*self.D*
+        #return np.sum(objective)
+
+    #def dlik_df(self, y, f, extra_data=None):
+        #"""
+        #Gradient of the link function at y, given f w.r.t f
+
+        #$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
+
+        #:y: data
+        #:f: latent variables f
+        #:extra_data: extra_data which is not used in student t distribution
+        #:returns: gradient of likelihood evaluated at points
+
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
+        #return grad
+
+    #def d2lik_d2f(self, y, f, extra_data=None):
+        #"""
+        #Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        #i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+
+        #Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+        #(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+
+        #$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
+
+        #:y: data
+        #:f: latent variables f
+        #:extra_data: extra_data which is not used in student t distribution
+        #:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
+        #return hess
+
+    #def d3lik_d3f(self, y, f, extra_data=None):
+        #"""
+        #Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+
+        #$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       #((e**2 + self.sigma2*self.v)**3)
+                    #)
+        #return d3lik_d3f
+
+    #def lik_dstd(self, y, f, extra_data=None):
+        #"""
+        #Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
+
+        #Terms relavent to derivatives wrt sigma are:
+        #-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+
+        #$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #sigma = np.sqrt(self.sigma2)
+        ##dlik_dsigma = ( - (1/sigma) +
+                        ##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      ##)
+        ##dlik_dsigma = ( - 1 +
+                        ##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      ##)
+        ##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
+        #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
+        #return dlik_dsigma
+
+    #def dlik_df_dstd(self, y, f, extra_data=None):
+        #"""
+        #Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
+
+        #$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
+                            #/ ((self.v*self.sigma2 + e**2)**2)
+                           #)
+        #return dlik_grad_dsigma
+
+    #def d2lik_d2f_dstd(self, y, f, extra_data=None):
+        #"""
+        #Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
+
+        #$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
+                              #((e**2 + self.sigma2*self.v)**3)
+                           #)
+        ##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
+                             ##/ ((e**2 + (self.sigma**2)*self.v)**3) )
+        #return dlik_hess_dsigma
+
+    #def _gradients(self, y, f, extra_data=None):
+        ##must be listed in same order as 'get_param_names'
+        #derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
+                  #[self.dlik_df_dstd(y, f, extra_data=extra_data)],
+                  #[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+                 #) # lists as we might learn many parameters
+        ## ensure we have gradients for every parameter we want to optimize
+        #assert len(derivs[0]) == len(self._get_param_names())
+        #assert len(derivs[1]) == len(self._get_param_names())
+        #assert len(derivs[2]) == len(self._get_param_names())
+        #return derivs

From 2a366619b340d25d5dd53836e2e66ffcfb2257d7 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 8 Jul 2013 16:09:20 +0100
Subject: [PATCH 62/71] Changed incorrect naming

---
 GPy/examples/laplace_approximations.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index d6b48ebf..78b4e986 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -84,6 +84,26 @@ def v_fail_test():
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print(m)
 
+def student_t_f_check():
+    real_var = 0.1
+    X = np.random.rand(100)[:, None]
+    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var
+    X_full = X
+    Y_full = np.sin(X_full)
+    Y = Y/Y.max()
+    deg_free = 1000
+    real_sd = np.sqrt(real_var)
+
+    kernel = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+    real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
+
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std**2)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    m = GPy.models.GP(X, stu_t_likelihood, kernel)
+    m.constrain_positive('t_noise_std2')
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+
 def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
@@ -151,9 +171,9 @@ def debug_student_t_noise_approx():
     #m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 0.2651)
-    #m.constrain_fixed('t_noise_std', edited_real_sd)
+    #m.constrain_fixed('t_noise_std2', edited_real_sd)
     #m.constrain_positive('rbf')
-    m.constrain_positive('t_noise_std')
+    m.constrain_positive('t_noise_std2')
     #m.constrain_positive('')
     #m.constrain_bounded('t_noi', 0.001, 10)
     #m.constrain_fixed('t_noi', real_stu_t_std)

From ee980227ac34262b192565cafb5e195cefee46d0 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 9 Jul 2013 11:35:42 +0100
Subject: [PATCH 63/71] Fixed 2*variance plotting instead of 2*std plotting,
 tidied up

---
 GPy/examples/laplace_approximations.py  | 93 ++++++++++++++++++++-----
 GPy/likelihoods/Laplace.py              |  2 +-
 GPy/likelihoods/likelihood_functions.py | 28 +-------
 GPy/models/GP.py                        |  2 +-
 4 files changed, 78 insertions(+), 47 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 78b4e986..b3048f5a 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -85,24 +85,78 @@ def v_fail_test():
     print(m)
 
 def student_t_f_check():
-    real_var = 0.1
+    plt.close('all')
+    real_std = 0.1
     X = np.random.rand(100)[:, None]
-    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
     X_full = X
     Y_full = np.sin(X_full)
-    Y = Y/Y.max()
-    deg_free = 1000
-    real_sd = np.sqrt(real_var)
+    #Y = Y/Y.max()
+    deg_free = 10000
 
-    kernel = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
-    real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
+    #GP
+    kernelgp = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp.randomize()
+    mgp.optimize()
 
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std**2)
+    kernelst = kernelgp.copy()
+    real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
+
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std2)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GP(X, stu_t_likelihood, kernel)
-    m.constrain_positive('t_noise_std2')
-    m.ensure_default_constraints()
+
+    plt.figure(1)
+    plt.suptitle('Student likelihood')
+    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m.constrain_fixed('rbf_var', mgp._get_params()[0])
+    m.constrain_fixed('rbf_len', mgp._get_params()[1])
+
     m.update_likelihood_approximation()
+    print "T std2 {} converted from original data, LL: {}".format(real_stu_t_std2, m.log_likelihood())
+    plt.subplot(221)
+    m.plot()
+    plt.title('Student t original data noise')
+
+    #Fix student t noise variance to same a GP
+    gp_noise = mgp._get_params()[2]
+    m['t_noise_std2'] = gp_noise
+    m.update_likelihood_approximation()
+    print "T std2 {} same as GP noise, LL: {}".format(gp_noise, m.log_likelihood())
+    plt.subplot(222)
+    m.plot()
+    plt.title('Student t GP noise')
+
+    #Fix student t noise to variance converted from the GP
+    real_stu_t_std2gp = (gp_noise)*((deg_free - 2)/float(deg_free))
+    m['t_noise_std2'] = real_stu_t_std2gp
+    m.update_likelihood_approximation()
+    print "T std2 {} converted to student t noise from GP noise, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.log_likelihood())
+    plt.subplot(223)
+    m.plot()
+    plt.title('Student t GP noise converted')
+
+    m.constrain_positive('t_noise_std2')
+    m.randomize()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print "T std2 {} var {} after optimising, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.likelihood.likelihood_function.variance, m.log_likelihood())
+    plt.subplot(224)
+    m.plot()
+    plt.title('Student t optimised')
+
+    plt.figure(2)
+    print "GP noise {} after optimising, LL: {}".format(gp_noise, mgp.log_likelihood())
+    plt.suptitle('Gaussian likelihood optimised')
+    mgp.plot()
+    print "Real std: {}".format(real_std)
+    print "Real variance {}".format(real_std**2)
+
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+    return m
 
 def debug_student_t_noise_approx():
     plot = False
@@ -218,16 +272,16 @@ def student_t_approx():
     """
     Example of regressing with a student t likelihood
     """
-    real_var = 0.2
+    real_std = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 30)[:, None]
-    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+    X = np.linspace(0.0, 10.0, 50)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_std
     Yc = Y.copy()
 
     X_full = np.linspace(0.0, 10.0, 500)[:, None]
     Y_full = np.sin(X_full)
 
-    #Y = Y/Y.max()
+    Y = Y/Y.max()
 
     Yc[10] += 100
     Yc[25] += 10
@@ -238,10 +292,9 @@ def student_t_approx():
 
     #Add student t random noise to datapoints
     deg_free = 8
-    real_sd = np.sqrt(real_var)
-    print "Real noise: ", real_sd
+    print "Real noise: ", real_std
 
-    initial_var_guess = 0.01
+    initial_var_guess = 0.1
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
@@ -293,7 +346,7 @@ def student_t_approx():
 
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
-    edited_real_sd = real_sd #initial_var_guess
+    edited_real_sd = real_std #initial_var_guess
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
@@ -301,6 +354,7 @@ def student_t_approx():
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
+    m.randomize()
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
@@ -316,6 +370,7 @@ def student_t_approx():
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
+    m.randomize()
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 984112a5..c5894ed6 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -89,7 +89,7 @@ class Laplace(likelihood):
         expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
-        print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
         dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
         return dL_dthetaK
 
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index bfc759d7..595fa63c 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -193,16 +193,11 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        A = gammaln((self.v + 1) * 0.5)
-        B = -gammaln(self.v * 0.5)
-        C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
-        D = (-(self.v + 1)*0.5)*np.log(1 + (e**2)/(self.v*self.sigma2))
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - 0.5*np.log(self.sigma2 * self.v * np.pi)
                      + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
                     )
-        #print "A: {} B: {} C: {} D: {} obj: {}".format(A,B,C,D.sum(),objective.sum())
         return np.sum(objective)
 
     def dlik_df(self, y, f, extra_data=None):
@@ -266,15 +261,6 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        #dlik_dsigma = ( - (1/sigma) +
-                        #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      #)
-        #dlik_dsigma = ( - 1 +
-                        #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      #)
-        #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
-        #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
         dlik_dsigma = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
         return dlik_dsigma
 
@@ -286,10 +272,6 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
-                            #/ ((self.v*self.sigma2 + e**2)**2)
-                           #)
         dlik_grad_dsigma = (-self.v*(self.v+1)*e)/((self.sigma2*self.v + e**2)**2)
         return dlik_grad_dsigma
 
@@ -301,12 +283,6 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        #dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
-                              #((e**2 + self.sigma2*self.v)**3)
-                           #)
-        #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
-                             #/ ((e**2 + (self.sigma**2)*self.v)**3) )
         dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
                               / (self.sigma2*self.v + (e**2))**3
                            )
@@ -344,8 +320,8 @@ class student_t(likelihood_function):
         #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now
         #need the 95 and 5 percentiles.
         #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
-        p_025 = mu - 2.*true_var
-        p_975 = mu + 2.*true_var
+        p_025 = mu - 2.*np.sqrt(true_var)
+        p_975 = mu + 2.*np.sqrt(true_var)
 
         return mu, np.nan*mu, p_025, p_975
 
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 20337ef5..cd4b7dac 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -152,7 +152,7 @@ class GP(model):
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
         #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-        print "dL_dthetaK: {}   dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL)
+        #print "dL_dthetaK: {}   dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL)
 
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))

From 57001851c46f34d075aa605ac1aa0ac0eb302c57 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 9 Jul 2013 20:05:03 +0100
Subject: [PATCH 64/71] Trying to debug kernel parameters learning (fails even
 when noise fixed) may be some instability, seems like it can get it if it
 starts close

---
 GPy/examples/laplace_approximations.py | 103 ++++++++++++++++++++++---
 GPy/likelihoods/Laplace.py             |  18 +++--
 GPy/models/GP.py                       |  12 ++-
 3 files changed, 110 insertions(+), 23 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index b3048f5a..279bc597 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -1,6 +1,7 @@
 import GPy
 import numpy as np
 import matplotlib.pyplot as plt
+np.random.seed(1)
 
 def timing():
     real_var = 0.1
@@ -86,17 +87,67 @@ def v_fail_test():
 
 def student_t_f_check():
     plt.close('all')
-    real_std = 0.1
-    X = np.random.rand(100)[:, None]
+    X = np.linspace(0, 1, 50)[:, None]
+    real_std = 0.001
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
+    deg_free = 1000
+
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp.randomize()
+    mgp.optimize()
+    print mgp
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+    kernelst = kernelgp.copy()
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=1e-5)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m['rbf_v'] = mgp._get_params()[0]
+    m['rbf_l'] = mgp._get_params()[1] + 1
+    m.ensure_default_constraints()
+    m.constrain_positive('t_no')
+    print m
+    plt.figure()
+    plt.subplot(511)
+    m.plot()
+    print m
+    plt.subplot(512)
+    m.optimize(max_f_eval=15)
+    m.plot()
+    print m
+    plt.subplot(513)
+    m.optimize(max_f_eval=15)
+    m.plot()
+    print m
+    plt.subplot(514)
+    m.optimize(max_f_eval=15)
+    m.plot()
+    print m
+    plt.subplot(515)
+    m.optimize()
+    m.plot()
+    print "final optimised student t"
+    print m
+    print "real GP"
+    print mgp
+
+def student_t_fix_optimise_check():
+    plt.close('all')
+    real_var = 0.1
+    real_std = np.sqrt(real_var)
+    X = np.random.rand(200)[:, None]
     noise = np.random.randn(*X.shape)*real_std
     Y = np.sin(X*2*np.pi) + noise
     X_full = X
     Y_full = np.sin(X_full)
     #Y = Y/Y.max()
-    deg_free = 10000
+    deg_free = 1000
 
     #GP
-    kernelgp = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
     mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
     mgp.ensure_default_constraints()
     mgp.randomize()
@@ -113,10 +164,12 @@ def student_t_f_check():
     m = GPy.models.GP(X, stu_t_likelihood, kernelst)
     m.constrain_fixed('rbf_var', mgp._get_params()[0])
     m.constrain_fixed('rbf_len', mgp._get_params()[1])
+    m.constrain_positive('t_noise')
+    #m.ensure_default_constraints()
 
     m.update_likelihood_approximation()
     print "T std2 {} converted from original data, LL: {}".format(real_stu_t_std2, m.log_likelihood())
-    plt.subplot(221)
+    plt.subplot(231)
     m.plot()
     plt.title('Student t original data noise')
 
@@ -125,7 +178,7 @@ def student_t_f_check():
     m['t_noise_std2'] = gp_noise
     m.update_likelihood_approximation()
     print "T std2 {} same as GP noise, LL: {}".format(gp_noise, m.log_likelihood())
-    plt.subplot(222)
+    plt.subplot(232)
     m.plot()
     plt.title('Student t GP noise')
 
@@ -134,29 +187,57 @@ def student_t_f_check():
     m['t_noise_std2'] = real_stu_t_std2gp
     m.update_likelihood_approximation()
     print "T std2 {} converted to student t noise from GP noise, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.log_likelihood())
-    plt.subplot(223)
+    plt.subplot(233)
     m.plot()
     plt.title('Student t GP noise converted')
 
     m.constrain_positive('t_noise_std2')
     m.randomize()
     m.update_likelihood_approximation()
+    plt.subplot(234)
+    m.plot()
+    plt.title('Student t fixed rbf')
     m.optimize()
     print "T std2 {} var {} after optimising, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.likelihood.likelihood_function.variance, m.log_likelihood())
-    plt.subplot(224)
+    plt.subplot(235)
     m.plot()
-    plt.title('Student t optimised')
+    plt.title('Student t fixed rbf optimised')
 
     plt.figure(2)
+    mrbf = m.copy()
+    mrbf.unconstrain('')
+    mrbf.constrain_fixed('t_noise', m.likelihood.likelihood_function.sigma2)
+    gp_var = mgp._get_params()[0]
+    gp_len = mgp._get_params()[1]
+    mrbf.constrain_fixed('rbf_var', gp_var)
+    mrbf.constrain_positive('rbf_len')
+    mrbf.randomize()
+    print "Before optimize"
+    print mrbf
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    mrbf.checkgrad(verbose=1)
+    plt.subplot(121)
+    mrbf.plot()
+    plt.title('Student t fixed noise')
+    #mrbf.optimize()
+    print "After optimize"
+    print mrbf
+    plt.subplot(122)
+    mrbf.plot()
+    plt.title('Student t fixed noise optimized')
+    print mrbf
+
+    plt.figure(3)
     print "GP noise {} after optimising, LL: {}".format(gp_noise, mgp.log_likelihood())
     plt.suptitle('Gaussian likelihood optimised')
     mgp.plot()
     print "Real std: {}".format(real_std)
     print "Real variance {}".format(real_std**2)
 
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
-    return m
+    print "Len should be: {}".format(gp_len)
+    return mrbf
 
 def debug_student_t_noise_approx():
     plot = False
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index c5894ed6..5343f5dc 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -89,7 +89,7 @@ class Laplace(likelihood):
         expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
-        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
         dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
         return dL_dthetaK
 
@@ -290,10 +290,12 @@ class Laplace(likelihood):
         :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
         :returns: f_mode
         """
-        if self.old_a is None:
-            old_a = np.zeros((self.N, 1))
-        else:
-            old_a = self.old_a
+        old_a = np.zeros((self.N, 1))
+        #old_a = None
+        #if self.old_a is None:
+            #old_a = np.zeros((self.N, 1))
+        #else:
+            #old_a = self.old_a
 
         f = np.dot(self.K, old_a)
         self.f = f
@@ -308,8 +310,6 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
-        #if self.likelihood_function.sigma < 0.001:
-            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
@@ -371,8 +371,10 @@ class Laplace(likelihood):
             old_a = self.a #a
             i += 1
 
+        self.old_a = old_a
         #print "Positive difference obj: ", np.float(difference)
-        print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
+        #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
+        print "Iterations: {}, Final_difference: {}".format(i, difference)
         #self.a = a
         #self.B, self.B_chol, self.W_12 = B, L, W_12
         #self.Bi, _, _, B_det = pdinv(self.B)
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index cd4b7dac..0f56e21c 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -132,7 +132,11 @@ class GP(model):
         model for a new variable Y* = v_tilde/tau_tilde, with a covariance
         matrix K* = K + diag(1./tau_tilde) plus a normalization term.
         """
+        if isinstance(self.likelihood, Laplace):
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
         l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        print "K_ldet: {} mft: {} Z: {}".format(self.K_logdet, self._model_fit_term(), self.likelihood.Z)
         return l
 
     def _log_likelihood_gradients(self):
@@ -142,12 +146,12 @@ class GP(model):
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
-        #print "dL_dthetaK should be: ", dL_dthetaK
+        print "dL_dthetaK should be: ", dL_dthetaK
         if isinstance(self.likelihood, Laplace):
-            #self.likelihood.fit_full(self.kern.K(self.X))
-            #self.likelihood._set_params(self.likelihood._get_params())
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
             dK_dthetaK = self.kern.dK_dtheta
-            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
+            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy())
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From aa9860859000530ba3297e72236c359f2a36a42b Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 29 Jul 2013 15:29:46 +0100
Subject: [PATCH 65/71] Started adding gaussian likelihood, changed round
 preloading old_a

---
 GPy/core/model.py                       |   6 +
 GPy/examples/laplace_approximations.py  |  72 ++++++-
 GPy/likelihoods/Laplace.py              | 173 ++++++++++------
 GPy/likelihoods/likelihood_functions.py | 251 +++++++++++++-----------
 4 files changed, 321 insertions(+), 181 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 94202396..e3a9bb68 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -244,6 +244,12 @@ class model(parameterised):
         LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
         prior_gradients = self._transform_gradients(self._log_prior_gradients())
         obj_grads = -LL_gradients - prior_gradients
+        print self
+        print self._get_params()
+        print -obj_grads
+        self.plot()
+        if isinstance(self.likelihood, likelihoods.Laplace):
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         return obj_f, obj_grads
 
     def optimize(self, optimizer=None, start=None, **kwargs):
diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 279bc597..2b93122c 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -85,10 +85,60 @@ def v_fail_test():
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print(m)
 
+def student_t_obj_plane():
+    plt.close('all')
+    X = np.linspace(0, 1, 50)[:, None]
+    real_std = 0.002
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
+    deg_free = 1000
+
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp['noise'] = real_std**2
+    print "Gaussian"
+    print mgp
+
+    kernelst = kernelgp.copy()
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2))
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m.ensure_default_constraints()
+    m.constrain_fixed('t_no', real_std**2)
+    vs = 10
+    ls = 10
+    objs_t = np.zeros((vs, ls))
+    objs_g = np.zeros((vs, ls))
+    rbf_vs = np.linspace(1e-6, 8, vs)
+    rbf_ls = np.linspace(1e-2, 8, ls)
+    for v_id, rbf_v in enumerate(rbf_vs):
+        for l_id, rbf_l in enumerate(rbf_ls):
+            m['rbf_v'] = rbf_v
+            m['rbf_l'] = rbf_l
+            mgp['rbf_v'] = rbf_v
+            mgp['rbf_l'] = rbf_l
+            objs_t[v_id, l_id] = m.log_likelihood()
+            objs_g[v_id, l_id] = mgp.log_likelihood()
+    plt.figure()
+    plt.subplot(211)
+    plt.title('Student t')
+    plt.imshow(objs_t, interpolation='none')
+    plt.xlabel('variance')
+    plt.ylabel('lengthscale')
+    plt.subplot(212)
+    plt.title('Gaussian')
+    plt.imshow(objs_g, interpolation='none')
+    plt.xlabel('variance')
+    plt.ylabel('lengthscale')
+    plt.show()
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    return objs_t
+
 def student_t_f_check():
     plt.close('all')
     X = np.linspace(0, 1, 50)[:, None]
-    real_std = 0.001
+    real_std = 0.2
     noise = np.random.randn(*X.shape)*real_std
     Y = np.sin(X*2*np.pi) + noise
     deg_free = 1000
@@ -98,17 +148,26 @@ def student_t_f_check():
     mgp.ensure_default_constraints()
     mgp.randomize()
     mgp.optimize()
+    print "Gaussian"
     print mgp
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     kernelst = kernelgp.copy()
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=1e-5)
+    #kernelst += GPy.kern.bias(X.shape[1])
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernelst)
-    m['rbf_v'] = mgp._get_params()[0]
-    m['rbf_l'] = mgp._get_params()[1] + 1
+    #m['rbf_v'] = mgp._get_params()[0]
+    #m['rbf_l'] = mgp._get_params()[1] + 1
     m.ensure_default_constraints()
+    #m.constrain_fixed('rbf_v', mgp._get_params()[0])
+    #m.constrain_fixed('rbf_l', mgp._get_params()[1])
+    #m.constrain_bounded('t_no', 2*real_std**2, 1e3)
+    #m.constrain_positive('bias')
     m.constrain_positive('t_no')
+    m.randomize()
+    m['t_no'] = 0.3
+    m.likelihood.X = X
     print m
     plt.figure()
     plt.subplot(511)
@@ -143,7 +202,8 @@ def student_t_fix_optimise_check():
     Y = np.sin(X*2*np.pi) + noise
     X_full = X
     Y_full = np.sin(X_full)
-    #Y = Y/Y.max()
+    Y = Y/Y.max()
+    Y_full = Y_full/Y_full.max()
     deg_free = 1000
 
     #GP
@@ -219,7 +279,7 @@ def student_t_fix_optimise_check():
     plt.subplot(121)
     mrbf.plot()
     plt.title('Student t fixed noise')
-    #mrbf.optimize()
+    mrbf.optimize()
     print "After optimize"
     print mrbf
     plt.subplot(122)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 5343f5dc..8b39f222 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -156,17 +156,23 @@ class Laplace(likelihood):
         Y_tilde = Wi*self.Ki_f + self.f_hat
 
         self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
+        #self.Wi_K_i[self.Wi_K_i< 1e-6] = 1e-6
+
         self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
         self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
 
         self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-        Z_tilde = (#+ self.NORMAL_CONST
+        self.aA = 0.5*self.ln_det_K_Wi__Bi
+        self.bB = - 0.5*self.f_Ki_f
+        self.cC = 0.5*self.y_Wi_Ki_i_y
+        Z_tilde = (+ 100*self.NORMAL_CONST
                    + self.lik
                    + 0.5*self.ln_det_K_Wi__Bi
                    - 0.5*self.f_Ki_f
                    + 0.5*self.y_Wi_Ki_i_y
                   )
-        #print "Ztilde: {}".format(Z_tilde)
+        print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC)
+        print self.likelihood_function._get_params()
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -198,7 +204,7 @@ class Laplace(likelihood):
         self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-10  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                        #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                        #To cause the posterior to become less certain than the prior and likelihood,
                                        #This is a property only held by non-log-concave likelihoods
@@ -280,7 +286,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=40, MAX_RESTART=10):
+    def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -290,15 +296,19 @@ class Laplace(likelihood):
         :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
         :returns: f_mode
         """
-        old_a = np.zeros((self.N, 1))
-        #old_a = None
-        #if self.old_a is None:
-            #old_a = np.zeros((self.N, 1))
-        #else:
-            #old_a = self.old_a
+        self.old_before_s = self.likelihood_function._get_params()
+        print "before: ", self.old_before_s
+        #if self.old_before_s < 1e-5:
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+        #old_a = np.zeros((self.N, 1))
+        if self.old_a is None:
+            old_a = np.zeros((self.N, 1))
+            f = np.dot(K, old_a)
+        else:
+            old_a = self.old_a.copy()
+            f = self.f_hat.copy()
 
-        f = np.dot(self.K, old_a)
-        self.f = f
         new_obj = -np.inf
         old_obj = np.inf
 
@@ -306,18 +316,20 @@ class Laplace(likelihood):
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-10
+        epsilon = 1e-4
         step_size = 1
         rs = 0
         i = 0
-        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
+
+        while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
+            #W = np.maximum(W, 0)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-10     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
-            B, L, W_12 = self._compute_B_statistics(K, W)
+            B, L, W_12 = self._compute_B_statistics(K, W.copy())
 
             W_f = W*f
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
@@ -328,54 +340,105 @@ class Laplace(likelihood):
             full_step_a = b - W_12*solve_L
             da = full_step_a - old_a
 
-            f_old = f.copy()
-
-            f_old = self.f.copy()
-            def inner_obj(step_size, old_a, da, K):
-                a = old_a + step_size*da
-                f = np.dot(K, a)
-                self.a = a # This is nasty, need to set something within an optimization though
-                self.f = f
-                return -obj(a, f)
-
-            from functools import partial
-            i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K)
-            new_obj = sp.optimize.brent(i_o, tol=1e-6, maxiter=10)
-
-            #update_passed = False
-            #while not update_passed:
+            #f_old = f.copy()
+            #def inner_obj(step_size, old_a, da, K):
                 #a = old_a + step_size*da
                 #f = np.dot(K, a)
+                #self.a = a.copy() # This is nasty, need to set something within an optimization though
+                #self.f = f.copy()
+                #return -obj(a, f)
 
-                #old_obj = new_obj
-                #new_obj = obj(a, f)
-                #difference = new_obj - old_obj
-                #print "difference: ",difference
-                #if difference < 0:
-                    ##print grad
-                    ##print "Objective function rose", np.float(difference)
-                    ##If the objective function isn't rising, restart optimization
-                    #step_size *= 0.8
-                    ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
-                    ##objective function isn't increasing, try reducing step size
-                    ##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
-                    ##old_obj = tmp_old_obj
-                    #old_obj = new_obj
-                    #rs += 1
-                #else:
-                    #update_passed = True
+            #from functools import partial
+            #i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
+            ##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
+            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
+            #f = self.f.copy()
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
-            f = self.f
-            difference = new_obj - old_obj
-            difference = np.abs(np.sum(f - f_old)) #+ abs(difference)
-            old_a = self.a #a
+            f_old = f.copy()
+            update_passed = False
+            while not update_passed:
+                a = old_a + step_size*da
+                f = np.dot(K, a)
+
+                old_obj = new_obj
+                new_obj = obj(a, f)
+                difference = new_obj - old_obj
+                print "difference: ",difference
+                if difference < 0:
+                    #print "Objective function rose", np.float(difference)
+                    #If the objective function isn't rising, restart optimization
+                    step_size *= 0.8
+                    #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    #objective function isn't increasing, try reducing step size
+                    f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
+                    old_obj = new_obj
+                    rs += 1
+                else:
+                    update_passed = True
+
+            #difference = abs(new_obj - old_obj)
+            #old_obj = new_obj.copy()
+            difference = np.abs(np.sum(f - f_old))
+            #old_a = self.a.copy() #a
+            old_a = a.copy()
             i += 1
+            #print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
 
-        self.old_a = old_a
+        self.old_a = old_a.copy()
         #print "Positive difference obj: ", np.float(difference)
         #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
         print "Iterations: {}, Final_difference: {}".format(i, difference)
-        #self.a = a
+        if difference > 1e-4:
+            print "FAIL FAIL FAIL FAIL FAIL FAIL"
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+            if hasattr(self, 'X'):
+                import pylab as pb
+                pb.figure()
+                pb.subplot(311)
+                pb.title('old f_hat')
+                pb.plot(self.X, self.f_hat)
+                pb.subplot(312)
+                pb.title('old ff')
+                pb.plot(self.X, self.old_ff)
+                pb.subplot(313)
+                pb.title('new f_hat')
+                pb.plot(self.X, f)
+
+                pb.figure()
+                pb.subplot(121)
+                pb.title('old K')
+                pb.imshow(np.diagflat(self.old_K), interpolation='none')
+                pb.colorbar()
+                pb.subplot(122)
+                pb.title('new K')
+                pb.imshow(np.diagflat(K), interpolation='none')
+                pb.colorbar()
+
+                pb.figure()
+                pb.subplot(121)
+                pb.title('old W')
+                pb.imshow(np.diagflat(self.old_W), interpolation='none')
+                pb.colorbar()
+                pb.subplot(122)
+                pb.title('new W')
+                pb.imshow(np.diagflat(W), interpolation='none')
+                pb.colorbar()
+
+                import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+                pb.close('all')
+
+        #FIXME: DELETE THESE
+        self.old_W = W.copy()
+        self.old_grad = grad.copy()
+        self.old_B = B.copy()
+        self.old_W_12 = W_12.copy()
+        self.old_ff = f.copy()
+        self.old_K = self.K.copy()
+        self.old_s = self.likelihood_function._get_params()
+        print "after: ", self.old_s
+        #print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
+        self.a = a
         #self.B, self.B_chol, self.W_12 = B, L, W_12
         #self.Bi, _, _, B_det = pdinv(self.B)
         return f
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 595fa63c..62e09a1a 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -193,11 +193,16 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
+        #A = gammaln((self.v + 1) * 0.5)
+        #B = - gammaln(self.v * 0.5)
+        #C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
+        #D = + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - 0.5*np.log(self.sigma2 * self.v * np.pi)
                      + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
                     )
+        #print "C: {} D: {} obj: {}".format(C, np.sum(D), objective.sum())
         return np.sum(objective)
 
     def dlik_df(self, y, f, extra_data=None):
@@ -459,147 +464,153 @@ class weibull_survival(likelihood_function):
         hess = (y**self.shape)*np.exp(f)
         return np.squeeze(hess)
 
-#class gaussian(likelihood_function):
-    #"""
-    #Gaussian likelihood - this is a test class for approximation schemes
-    #"""
-    #def __init__(self, variance):
-        #self._set_params(np.asarray(variance))
+class gaussian(likelihood_function):
+    """
+    Gaussian likelihood - this is a test class for approximation schemes
+    """
+    def __init__(self, variance):
+        self._set_params(np.asarray(variance))
 
-    #def _get_params(self):
-        #return np.asarray(self.sigma2)
+    def _get_params(self):
+        return np.asarray(self._variance)
 
-    #def _get_param_names(self):
-        #return ["noise_variance"]
+    def _get_param_names(self):
+        return ["noise_variance"]
 
-    #def _set_params(self, x):
-        #self.variance = float(x)
+    def _set_params(self, x):
+        self._variance = float(x)
+        self.covariance_matrix = np.eye(self.N) * self._variance
+        self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG
 
-    #def link_function(self, y, f, extra_data=None):
-        #"""link_function $\ln p(y|f)$
-        #$$\ln p(y_{i}|f_{i}) = \ln $$
+    def link_function(self, y, f, extra_data=None):
+        """link_function $\ln p(y|f)$
+        $$\ln p(y_{i}|f_{i}) = \ln $$
 
-        #:y: data
-        #:f: latent variables f
-        #:extra_data: extra_data which is not used in student t distribution
-        #:returns: float(likelihood evaluated for this point)
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: float(likelihood evaluated for this point)
 
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #objective = -0.5*self.D*
-        #return np.sum(objective)
+        """
+        assert y.shape == f.shape
+        e = y - f
+        eeT = np.dot(e, e.T)
+        objective = (- 0.5*self.D*np.log(2*np.pi)
+                     - 0.5*self.ln_K
+                     - 0.5*np.sum(np.multiply(self.Ki, eeT))
+                     )
+        return np.sum(objective)
 
-    #def dlik_df(self, y, f, extra_data=None):
-        #"""
-        #Gradient of the link function at y, given f w.r.t f
+    def dlik_df(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
 
-        #$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
+        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
 
-        #:y: data
-        #:f: latent variables f
-        #:extra_data: extra_data which is not used in student t distribution
-        #:returns: gradient of likelihood evaluated at points
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: gradient of likelihood evaluated at points
 
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
-        #return grad
+        """
+        assert y.shape == f.shape
+        e = y - f
+        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
+        return grad
 
-    #def d2lik_d2f(self, y, f, extra_data=None):
-        #"""
-        #Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
-        #i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+    def d2lik_d2f(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
 
-        #Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-        #(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+        Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+        (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
 
-        #$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
+        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
 
-        #:y: data
-        #:f: latent variables f
-        #:extra_data: extra_data which is not used in student t distribution
-        #:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
-        #return hess
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        assert y.shape == f.shape
+        e = y - f
+        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
+        return hess
 
-    #def d3lik_d3f(self, y, f, extra_data=None):
-        #"""
-        #Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+    def d3lik_d3f(self, y, f, extra_data=None):
+        """
+        Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
 
-        #$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
-                       #((e**2 + self.sigma2*self.v)**3)
-                    #)
-        #return d3lik_d3f
+        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       ((e**2 + self.sigma2*self.v)**3)
+                    )
+        return d3lik_d3f
 
-    #def lik_dstd(self, y, f, extra_data=None):
-        #"""
-        #Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
+    def lik_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
 
-        #Terms relavent to derivatives wrt sigma are:
-        #-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+        Terms relavent to derivatives wrt sigma are:
+        -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
 
-        #$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        ##dlik_dsigma = ( - (1/sigma) +
-                        ##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      ##)
-        ##dlik_dsigma = ( - 1 +
-                        ##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      ##)
-        ##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
-        #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
-        #return dlik_dsigma
+        $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        sigma = np.sqrt(self.sigma2)
+        #dlik_dsigma = ( - (1/sigma) +
+                        #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      #)
+        #dlik_dsigma = ( - 1 +
+                        #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      #)
+        #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
+        dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
+        return dlik_dsigma
 
-    #def dlik_df_dstd(self, y, f, extra_data=None):
-        #"""
-        #Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
+    def dlik_df_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
 
-        #$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
-                            #/ ((self.v*self.sigma2 + e**2)**2)
-                           #)
-        #return dlik_grad_dsigma
+        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        sigma = np.sqrt(self.sigma2)
+        dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
+                            / ((self.v*self.sigma2 + e**2)**2)
+                           )
+        return dlik_grad_dsigma
 
-    #def d2lik_d2f_dstd(self, y, f, extra_data=None):
-        #"""
-        #Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
+    def d2lik_d2f_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
 
-        #$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        #dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
-                              #((e**2 + self.sigma2*self.v)**3)
-                           #)
-        ##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
-                             ##/ ((e**2 + (self.sigma**2)*self.v)**3) )
-        #return dlik_hess_dsigma
+        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        sigma = np.sqrt(self.sigma2)
+        dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
+                              ((e**2 + self.sigma2*self.v)**3)
+                           )
+        #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
+                             #/ ((e**2 + (self.sigma**2)*self.v)**3) )
+        return dlik_hess_dsigma
 
-    #def _gradients(self, y, f, extra_data=None):
-        ##must be listed in same order as 'get_param_names'
-        #derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
-                  #[self.dlik_df_dstd(y, f, extra_data=extra_data)],
-                  #[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
-                 #) # lists as we might learn many parameters
-        ## ensure we have gradients for every parameter we want to optimize
-        #assert len(derivs[0]) == len(self._get_param_names())
-        #assert len(derivs[1]) == len(self._get_param_names())
-        #assert len(derivs[2]) == len(self._get_param_names())
-        #return derivs
+    def _gradients(self, y, f, extra_data=None):
+        #must be listed in same order as 'get_param_names'
+        derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
+                  [self.dlik_df_dstd(y, f, extra_data=extra_data)],
+                  [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+                 ) # lists as we might learn many parameters
+        # ensure we have gradients for every parameter we want to optimize
+        assert len(derivs[0]) == len(self._get_param_names())
+        assert len(derivs[1]) == len(self._get_param_names())
+        assert len(derivs[2]) == len(self._get_param_names())
+        return derivs

From fdb7b99e0bd8a740dd898317aab5cd506b97e34e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 29 Jul 2013 17:21:52 +0100
Subject: [PATCH 66/71] Removed redundant Laplace refits in GP likelihood/gradients

---
 GPy/likelihoods/Laplace.py |  2 +-
 GPy/models/GP.py           | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 8b39f222..f86c47b6 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -165,7 +165,7 @@ class Laplace(likelihood):
         self.aA = 0.5*self.ln_det_K_Wi__Bi
         self.bB = - 0.5*self.f_Ki_f
         self.cC = 0.5*self.y_Wi_Ki_i_y
-        Z_tilde = (+ 100*self.NORMAL_CONST
+        Z_tilde = (#+ 100*self.NORMAL_CONST
                    + self.lik
                    + 0.5*self.ln_det_K_Wi__Bi
                    - 0.5*self.f_Ki_f
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 0f56e21c..77620488 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -132,9 +132,9 @@ class GP(model):
         model for a new variable Y* = v_tilde/tau_tilde, with a covariance
         matrix K* = K + diag(1./tau_tilde) plus a normalization term.
         """
-        if isinstance(self.likelihood, Laplace):
-            self.likelihood.fit_full(self.kern.K(self.X))
-            self.likelihood._set_params(self.likelihood._get_params())
+        #if isinstance(self.likelihood, Laplace):
+            #self.likelihood.fit_full(self.kern.K(self.X))
+            #self.likelihood._set_params(self.likelihood._get_params())
         l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
         print "K_ldet: {} mft: {} Z: {}".format(self.K_logdet, self._model_fit_term(), self.likelihood.Z)
         return l
@@ -148,8 +148,8 @@ class GP(model):
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
         print "dL_dthetaK should be: ", dL_dthetaK
         if isinstance(self.likelihood, Laplace):
-            self.likelihood.fit_full(self.kern.K(self.X))
-            self.likelihood._set_params(self.likelihood._get_params())
+            #self.likelihood.fit_full(self.kern.K(self.X))
+            #self.likelihood._set_params(self.likelihood._get_params())
             dK_dthetaK = self.kern.dK_dtheta
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy())
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From 9364efc755405fdb3b424f4e3ffc01e68694b31e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 30 Jul 2013 16:11:03 +0100
Subject: [PATCH 67/71] Started adding gaussian sanity checker

---
 GPy/examples/laplace_approximations.py  | 10 ++--
 GPy/likelihoods/Laplace.py              | 80 +++++++++++++------------
 GPy/likelihoods/likelihood_functions.py | 58 +++++-------------
 3 files changed, 60 insertions(+), 88 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 2b93122c..e8b6419f 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -168,23 +168,23 @@ def student_t_f_check():
     m.randomize()
     m['t_no'] = 0.3
     m.likelihood.X = X
-    print m
+    #print m
     plt.figure()
     plt.subplot(511)
     m.plot()
-    print m
+    #print m
     plt.subplot(512)
     m.optimize(max_f_eval=15)
     m.plot()
-    print m
+    #print m
     plt.subplot(513)
     m.optimize(max_f_eval=15)
     m.plot()
-    print m
+    #print m
     plt.subplot(514)
     m.optimize(max_f_eval=15)
     m.plot()
-    print m
+    #print m
     plt.subplot(515)
     m.optimize()
     m.plot()
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index f86c47b6..aeda17da 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -89,7 +89,8 @@ class Laplace(likelihood):
         expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
-        print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        #print "expl_a: {}, {}     expl_b: {}, {}".format(np.mean(expl_a), np.std(expl_a), np.mean(expl_b), np.std(expl_b))
         dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
         return dL_dthetaK
 
@@ -165,8 +166,7 @@ class Laplace(likelihood):
         self.aA = 0.5*self.ln_det_K_Wi__Bi
         self.bB = - 0.5*self.f_Ki_f
         self.cC = 0.5*self.y_Wi_Ki_i_y
-        Z_tilde = (#+ 100*self.NORMAL_CONST
-                   + self.lik
+        Z_tilde = (+ self.lik
                    + 0.5*self.ln_det_K_Wi__Bi
                    - 0.5*self.f_Ki_f
                    + 0.5*self.y_Wi_Ki_i_y
@@ -379,7 +379,8 @@ class Laplace(likelihood):
 
             #difference = abs(new_obj - old_obj)
             #old_obj = new_obj.copy()
-            difference = np.abs(np.sum(f - f_old))
+            #difference = np.abs(np.sum(f - f_old))
+            difference = np.abs(np.sum(a - old_a))
             #old_a = self.a.copy() #a
             old_a = a.copy()
             i += 1
@@ -391,42 +392,43 @@ class Laplace(likelihood):
         print "Iterations: {}, Final_difference: {}".format(i, difference)
         if difference > 1e-4:
             print "FAIL FAIL FAIL FAIL FAIL FAIL"
-            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-            if hasattr(self, 'X'):
-                import pylab as pb
-                pb.figure()
-                pb.subplot(311)
-                pb.title('old f_hat')
-                pb.plot(self.X, self.f_hat)
-                pb.subplot(312)
-                pb.title('old ff')
-                pb.plot(self.X, self.old_ff)
-                pb.subplot(313)
-                pb.title('new f_hat')
-                pb.plot(self.X, f)
-
-                pb.figure()
-                pb.subplot(121)
-                pb.title('old K')
-                pb.imshow(np.diagflat(self.old_K), interpolation='none')
-                pb.colorbar()
-                pb.subplot(122)
-                pb.title('new K')
-                pb.imshow(np.diagflat(K), interpolation='none')
-                pb.colorbar()
-
-                pb.figure()
-                pb.subplot(121)
-                pb.title('old W')
-                pb.imshow(np.diagflat(self.old_W), interpolation='none')
-                pb.colorbar()
-                pb.subplot(122)
-                pb.title('new W')
-                pb.imshow(np.diagflat(W), interpolation='none')
-                pb.colorbar()
-
+            if False:
                 import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-                pb.close('all')
+                if hasattr(self, 'X'):
+                    import pylab as pb
+                    pb.figure()
+                    pb.subplot(311)
+                    pb.title('old f_hat')
+                    pb.plot(self.X, self.f_hat)
+                    pb.subplot(312)
+                    pb.title('old ff')
+                    pb.plot(self.X, self.old_ff)
+                    pb.subplot(313)
+                    pb.title('new f_hat')
+                    pb.plot(self.X, f)
+
+                    pb.figure()
+                    pb.subplot(121)
+                    pb.title('old K')
+                    pb.imshow(np.diagflat(self.old_K), interpolation='none')
+                    pb.colorbar()
+                    pb.subplot(122)
+                    pb.title('new K')
+                    pb.imshow(np.diagflat(K), interpolation='none')
+                    pb.colorbar()
+
+                    pb.figure()
+                    pb.subplot(121)
+                    pb.title('old W')
+                    pb.imshow(np.diagflat(self.old_W), interpolation='none')
+                    pb.colorbar()
+                    pb.subplot(122)
+                    pb.title('new W')
+                    pb.imshow(np.diagflat(W), interpolation='none')
+                    pb.colorbar()
+
+                    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+                    pb.close('all')
 
         #FIXME: DELETE THESE
         self.old_W = W.copy()
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 62e09a1a..42af9c8d 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -239,7 +239,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
+        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -277,7 +277,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_grad_dsigma = (-self.v*(self.v+1)*e)/((self.sigma2*self.v + e**2)**2)
+        dlik_grad_dsigma = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
         return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
@@ -289,7 +289,7 @@ class student_t(likelihood_function):
         assert y.shape == f.shape
         e = y - f
         dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
-                              / (self.sigma2*self.v + (e**2))**3
+                              / ((self.sigma2*self.v + (e**2))**3)
                            )
         return dlik_hess_dsigma
 
@@ -479,7 +479,8 @@ class gaussian(likelihood_function):
 
     def _set_params(self, x):
         self._variance = float(x)
-        self.covariance_matrix = np.eye(self.N) * self._variance
+        self.I = np.eye(self.N)
+        self.covariance_matrix = self.I * self._variance
         self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG
 
     def link_function(self, y, f, extra_data=None):
@@ -505,8 +506,6 @@ class gaussian(likelihood_function):
         """
         Gradient of the link function at y, given f w.r.t f
 
-        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
-
         :y: data
         :f: latent variables f
         :extra_data: extra_data which is not used in student t distribution
@@ -514,8 +513,8 @@ class gaussian(likelihood_function):
 
         """
         assert y.shape == f.shape
-        e = y - f
-        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
+        s2_i = (1.0/self._variance)*self.I
+        grad = np.dot(s2_i, y) - 0.5*np.dot(s2_i, f)
         return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
@@ -526,16 +525,14 @@ class gaussian(likelihood_function):
         Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
         (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
 
-        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
-
         :y: data
         :f: latent variables f
         :extra_data: extra_data which is not used in student t distribution
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
         assert y.shape == f.shape
-        e = y - f
-        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
+        s2_i = (1.0/self._variance)*self.I
+        hess = np.diagonal(-0.5*s2_i)
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -545,46 +542,25 @@ class gaussian(likelihood_function):
         $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
         """
         assert y.shape == f.shape
-        e = y - f
-        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
-                       ((e**2 + self.sigma2*self.v)**3)
-                    )
+        d3lik_d3f = np.diagonal(0*self.I)
         return d3lik_d3f
 
     def lik_dstd(self, y, f, extra_data=None):
         """
         Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
-
-        Terms relavent to derivatives wrt sigma are:
-        -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
-
-        $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
         """
         assert y.shape == f.shape
         e = y - f
-        sigma = np.sqrt(self.sigma2)
-        #dlik_dsigma = ( - (1/sigma) +
-                        #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      #)
-        #dlik_dsigma = ( - 1 +
-                        #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      #)
-        #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
-        dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
+        dlik_dsigma = -0.5*self.N*self._variance - 0.5*np.dot(e.T, e)
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
         """
         Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
-
-        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
         """
         assert y.shape == f.shape
-        e = y - f
-        sigma = np.sqrt(self.sigma2)
-        dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
-                            / ((self.v*self.sigma2 + e**2)**2)
-                           )
+        s_4 = 1.0/(self._variance**2)
+        dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f))
         return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
@@ -594,13 +570,7 @@ class gaussian(likelihood_function):
         $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
         assert y.shape == f.shape
-        e = y - f
-        sigma = np.sqrt(self.sigma2)
-        dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
-                              ((e**2 + self.sigma2*self.v)**3)
-                           )
-        #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
-                             #/ ((e**2 + (self.sigma**2)*self.v)**3) )
+        dlik_hess_dsigma = 1.0/(2*(self._variance**2))
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):

From 1314868ea8cf4c81d0c76f90dd4a8b11a123c427 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 16 Aug 2013 11:16:47 +0100
Subject: [PATCH 68/71] Added Gaussian checker and Gaussian likelihood;
 gradients not yet verified with checkgrad

---
 GPy/examples/laplace_approximations.py  | 65 +++++++++++++++++++------
 GPy/likelihoods/likelihood_functions.py | 38 ++++++++++-----
 2 files changed, 77 insertions(+), 26 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index e8b6419f..02b38a79 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -170,28 +170,18 @@ def student_t_f_check():
     m.likelihood.X = X
     #print m
     plt.figure()
-    plt.subplot(511)
+    plt.subplot(211)
     m.plot()
-    #print m
-    plt.subplot(512)
-    m.optimize(max_f_eval=15)
-    m.plot()
-    #print m
-    plt.subplot(513)
-    m.optimize(max_f_eval=15)
-    m.plot()
-    #print m
-    plt.subplot(514)
-    m.optimize(max_f_eval=15)
-    m.plot()
-    #print m
-    plt.subplot(515)
+    print "OPTIMIZED ONCE"
+    plt.subplot(212)
     m.optimize()
     m.plot()
     print "final optimised student t"
     print m
     print "real GP"
     print mgp
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    return m
 
 def student_t_fix_optimise_check():
     plt.close('all')
@@ -602,3 +592,48 @@ def noisy_laplace_approx():
     print m
 
     #with a student t distribution, since it has heavy tails it should work well
+
+def gaussian_f_check():
+    plt.close('all')
+    X = np.linspace(0, 1, 50)[:, None]
+    real_std = 0.2
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
+
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp.randomize()
+    mgp.optimize()
+    print "Gaussian"
+    print mgp
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+    kernelg = kernelgp.copy()
+    #kernelst += GPy.kern.bias(X.shape[1])
+    N, D = X.shape
+    g_distribution = GPy.likelihoods.likelihood_functions.gaussian(variance=0.1, N=N, D=D)
+    g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm')
+    m = GPy.models.GP(X, g_likelihood, kernelg)
+    #m['rbf_v'] = mgp._get_params()[0]
+    #m['rbf_l'] = mgp._get_params()[1] + 1
+    m.ensure_default_constraints()
+    #m.constrain_fixed('rbf_v', mgp._get_params()[0])
+    #m.constrain_fixed('rbf_l', mgp._get_params()[1])
+    #m.constrain_bounded('t_no', 2*real_std**2, 1e3)
+    #m.constrain_positive('bias')
+    m.constrain_positive('noise_var')
+    m.randomize()
+    m['noise_variance'] = 0.1
+    m.likelihood.X = X
+    plt.figure()
+    plt.subplot(211)
+    m.plot()
+    plt.subplot(212)
+    m.optimize()
+    m.plot()
+    print "final optimised student t"
+    print m
+    print "real GP"
+    print mgp
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 42af9c8d..81d93f6b 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -9,7 +9,7 @@ from ..util.plot import gpplot
 from scipy.special import gammaln, gamma
 from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 
-class likelihood_function:
+class likelihood_function(object):
     """ Likelihood class for doing Expectation propagation
 
     :param Y: observed output (Nx1 numpy.darray)
@@ -159,7 +159,7 @@ class student_t(likelihood_function):
     d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free, sigma2=2):
-        #super(student_t, self).__init__()
+        super(student_t, self).__init__()
         self.v = deg_free
         self.sigma2 = sigma2
         self.log_concave = False
@@ -468,9 +468,16 @@ class gaussian(likelihood_function):
     """
     Gaussian likelihood - this is a test class for approximation schemes
     """
-    def __init__(self, variance):
+    def __init__(self, variance, D, N):
+        super(gaussian, self).__init__()
+        self.D = D
+        self.N = N
         self._set_params(np.asarray(variance))
 
+        #Don't support normalizing yet
+        self._bias = np.zeros((1, self.D))
+        self._scale = np.ones((1, self.D))
+
     def _get_params(self):
         return np.asarray(self._variance)
 
@@ -481,7 +488,8 @@ class gaussian(likelihood_function):
         self._variance = float(x)
         self.I = np.eye(self.N)
         self.covariance_matrix = self.I * self._variance
-        self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG
+        self.Ki = self.I*(1.0 / self._variance)
+        self.ln_K = np.trace(self.covariance_matrix)
 
     def link_function(self, y, f, extra_data=None):
         """link_function $\ln p(y|f)$
@@ -498,7 +506,8 @@ class gaussian(likelihood_function):
         eeT = np.dot(e, e.T)
         objective = (- 0.5*self.D*np.log(2*np.pi)
                      - 0.5*self.ln_K
-                     - 0.5*np.sum(np.multiply(self.Ki, eeT))
+                     #- 0.5*np.sum(np.multiply(self.Ki, eeT))
+                     - 0.5*np.dot(np.dot(e.T, self.Ki), e)
                      )
         return np.sum(objective)
 
@@ -514,7 +523,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         s2_i = (1.0/self._variance)*self.I
-        grad = np.dot(s2_i, y) - 0.5*np.dot(s2_i, f)
+        grad = np.dot(s2_i, y) - np.dot(s2_i, f)
         return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
@@ -532,7 +541,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         s2_i = (1.0/self._variance)*self.I
-        hess = np.diagonal(-0.5*s2_i)
+        hess = np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -542,7 +551,7 @@ class gaussian(likelihood_function):
         $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
         """
         assert y.shape == f.shape
-        d3lik_d3f = np.diagonal(0*self.I)
+        d3lik_d3f = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
         return d3lik_d3f
 
     def lik_dstd(self, y, f, extra_data=None):
@@ -551,7 +560,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_dsigma = -0.5*self.N*self._variance - 0.5*np.dot(e.T, e)
+        dlik_dsigma = -0.5*self.D/self._variance - 0.5*np.trace(np.dot(e.T, np.dot(self.I, e)))
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
@@ -560,7 +569,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         s_4 = 1.0/(self._variance**2)
-        dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f))
+        dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + np.dot(s_4, np.dot(self.I, f))
         return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
@@ -570,7 +579,7 @@ class gaussian(likelihood_function):
         $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
         assert y.shape == f.shape
-        dlik_hess_dsigma = 1.0/(2*(self._variance**2))
+        dlik_hess_dsigma = np.diag(1.0/(self._variance**2)*self.I)[:, None]
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):
@@ -584,3 +593,10 @@ class gaussian(likelihood_function):
         assert len(derivs[1]) == len(self._get_param_names())
         assert len(derivs[2]) == len(self._get_param_names())
         return derivs
+
+    def predictive_values(self, mu, var):
+        mean = mu * self._scale + self._bias
+        true_var = (var + self._variance) * self._scale ** 2
+        _5pc = mean - 2.*np.sqrt(true_var)
+        _95pc = mean + 2.*np.sqrt(true_var)
+        return mean, true_var, _5pc, _95pc

From 000491b25da515a595c25fbc57e3dcbc3ee4e3f4 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 28 Aug 2013 13:26:15 +0100
Subject: [PATCH 69/71] Gaussian likelihood errors, still not working

---
 GPy/likelihoods/likelihood_functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 81d93f6b..25f770b5 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -560,7 +560,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_dsigma = -0.5*self.D/self._variance - 0.5*np.trace(np.dot(e.T, np.dot(self.I, e)))
+        dlik_dsigma = -0.5*self.N/self._variance - 0.5*np.trace(np.dot(e.T, np.dot(self.I, e)))
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
@@ -579,7 +579,7 @@ class gaussian(likelihood_function):
         $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
         assert y.shape == f.shape
-        dlik_hess_dsigma = np.diag(1.0/(self._variance**2)*self.I)[:, None]
+        dlik_hess_dsigma = np.diag((1.0/(self._variance**2))*self.I)[:, None]
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):

From 54954c63f83d566a383bd0d2b14dadaa66ce363e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 29 Aug 2013 13:47:56 +0100
Subject: [PATCH 70/71] A few typos

---
 GPy/examples/laplace_approximations.py | 2 +-
 GPy/likelihoods/Laplace.py             | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 02b38a79..8be08a8f 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -632,7 +632,7 @@ def gaussian_f_check():
     plt.subplot(212)
     m.optimize()
     m.plot()
-    print "final optimised student t"
+    print "final optimised gaussian"
     print m
     print "real GP"
     print mgp
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index aeda17da..58304c23 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -105,8 +105,15 @@ class Laplace(likelihood):
         dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter
         for thetaL_i in range(num_params):
             #Explicit
+            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            #a = 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            #d = dlik_hess_dthetaL[thetaL_i]
+            #e = pdinv(pdinv(self.K)[0] + np.diagflat(self.W))[0]
+            #b = 0.5*np.dot(np.diag(e).T, d)
+            #g = 0.5*(np.diag(self.K) - np.sum(cho_solve((self.B_chol, True), np.dot(np.diagflat(self.W_12),self.K))**2, 1))
+            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - np.dot(g.T, dlik_hess_dthetaL[thetaL_i])
             dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
-            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(mdot(self.Bi, self.K, dlik_hess_dthetaL[thetaL_i]))
+
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
             dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL)

From f943cf9ddb9db80556ff7873108d22ac48113c2d Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 9 Sep 2013 11:54:32 +0100
Subject: [PATCH 71/71] Changed the gradients (perhaps for the worse)

---
 GPy/likelihoods/likelihood_functions.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 25f770b5..72d2ff82 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -523,7 +523,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         s2_i = (1.0/self._variance)*self.I
-        grad = np.dot(s2_i, y) - np.dot(s2_i, f)
+        grad = np.dot(s2_i, y) - 0.5*np.dot(s2_i, f)
         return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
@@ -541,7 +541,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         s2_i = (1.0/self._variance)*self.I
-        hess = np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
+        hess = 0.5*np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -560,7 +560,8 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_dsigma = -0.5*self.N/self._variance - 0.5*np.trace(np.dot(e.T, np.dot(self.I, e)))
+        s_4 = 1.0/(self._variance**2)
+        dlik_dsigma = -0.5*self.N*1/self._variance + 0.5*s_4*np.trace(np.dot(e.T, np.dot(self.I, e)))
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
@@ -569,7 +570,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         s_4 = 1.0/(self._variance**2)
-        dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + np.dot(s_4, np.dot(self.I, f))
+        dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f))
         return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
@@ -579,7 +580,7 @@ class gaussian(likelihood_function):
         $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
         assert y.shape == f.shape
-        dlik_hess_dsigma = np.diag((1.0/(self._variance**2))*self.I)[:, None]
+        dlik_hess_dsigma = 0.5*np.diag((1.0/(self._variance**2))*self.I)[:, None]
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):