From 67248ab7c2b0becf471fe08638d35cf0786ee1a2 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 12 Mar 2013 03:16:33 -0700
Subject: [PATCH 001/252] Initial commit

---
 .gitignore | 35 +++++++++++++++++++++++++++++++++++
 README.md  |  4 ++++
 2 files changed, 39 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..d2d6f360
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,35 @@
+*.py[cod]
+
+# C extensions
+*.so
+
+# Packages
+*.egg
+*.egg-info
+dist
+build
+eggs
+parts
+bin
+var
+sdist
+develop-eggs
+.installed.cfg
+lib
+lib64
+
+# Installer logs
+pip-log.txt
+
+# Unit test / coverage reports
+.coverage
+.tox
+nosetests.xml
+
+# Translations
+*.mo
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..317fa353
--- /dev/null
+++ b/README.md
@@ -0,0 +1,4 @@
+coxGP
+=====
+
+Gaussian Process models of Cox proportional hazard models
\ No newline at end of file

From 68eb83955c585b08cf93cbd659f749cff5b62bb3 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 12 Mar 2013 17:42:00 +0000
Subject: [PATCH 002/252] Initial commit, setting up the laplace approximation
 for a student t

---
 python/examples/laplace_approximations.py | 37 ++++++++++++++++
 python/likelihoods/Laplace.py             | 54 +++++++++++++++++++++++
 python/likelihoods/likelihood_function.py | 51 +++++++++++++++++++++
 python/models/coxGP.py                    | 19 ++++++++
 python/testing/cox_tests.py               | 14 ++++++
 5 files changed, 175 insertions(+)
 create mode 100644 python/examples/laplace_approximations.py
 create mode 100644 python/likelihoods/Laplace.py
 create mode 100644 python/likelihoods/likelihood_function.py
 create mode 100644 python/models/coxGP.py
 create mode 100644 python/testing/cox_tests.py

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
new file mode 100644
index 00000000..2f059831
--- /dev/null
+++ b/python/examples/laplace_approximations.py
@@ -0,0 +1,37 @@
+import GPy
+import numpy as np
+import scipy as sp
+import scipy.stats
+import matplotlib.pyplot as plt
+
+
+def student_t_approx():
+    """
+    Example of regressing with a student t likelihood
+    """
+    #Start a function, any function
+    X = np.sort(np.random.uniform(0, 15, 70))[:, None]
+    Y = np.sin(X)
+
+    #Add some extreme value noise to some of the datapoints
+    percent_corrupted = 0.05
+    corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
+    indices = np.arange(Y.shape[0])
+    np.random.shuffle(indices)
+    corrupted_indices = indices[:corrupted_datums]
+    print corrupted_indices
+    noise = np.random.uniform(-10,10,(len(corrupted_indices), 1))
+    Y[corrupted_indices] += noise
+
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X,Y)
+
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+    # plot
+    m.plot()
+    print m
+
+    #with a student t distribution, since it has heavy tails it should work well
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
new file mode 100644
index 00000000..a0dbc65c
--- /dev/null
+++ b/python/likelihoods/Laplace.py
@@ -0,0 +1,54 @@
+import nump as np
+import GPy
+from GPy.util.linalg import jitchol
+
+class Laplace(GPy.likelihoods.likelihood):
+    """Laplace approximation to a posterior"""
+
+    def __init__(self,data,likelihood_function):
+        """
+        Laplace Approximation
+
+        First find the moments \hat{f} and the hessian at this point (using Newton-Raphson)
+        then find the z^{prime} which allows this to be a normalised gaussian instead of a
+        non-normalized gaussian
+
+        Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle}
+        which makes a gaussian the same as the laplace approximation
+
+        Arguments
+        ---------
+
+        :data: @todo
+        :likelihood_function: @todo
+
+        """
+        GPy.likelihoods.likelihood.__init__(self)
+
+        self.data = data
+        self.likelihood_function = likelihood_function
+
+        #Inital values
+        self.N, self.D = self.data.shape
+
+    def _compute_GP_variables(self):
+        """
+        Generates data Y which would give the normal distribution identical to the laplace approximation
+
+        GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
+        that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
+        """
+        raise NotImplementedError
+
+    def fit_full(self, K):
+        """
+        The laplace approximation algorithm
+        For nomenclature see Rasmussen & Williams 2006
+        :K: Covariance matrix
+        """
+        self.f = np.zeros(self.N)
+
+        #Find \hat(f) using a newton raphson optimizer for example
+
+        #At this point get the hessian matrix
+
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
new file mode 100644
index 00000000..fd19675b
--- /dev/null
+++ b/python/likelihoods/likelihood_function.py
@@ -0,0 +1,51 @@
+import GPy
+from scipy.special import gamma, gammaln
+
+class student_t(GPy.likelihoods.likelihood_function):
+    """Student t likelihood distribution
+    For nomanclature see Bayesian Data Analysis 2003 p576
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fi
+    """
+    def __init__(self, deg_free, sigma=1):
+        self.v = deg_free
+        self.sigma = 1
+
+    def link_function(self, y_i, f_i):
+        """link_function $\ln p(y_i|f_i)$
+
+        :y_i: datum number i
+        :f_i: latent variable f_i
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        e = y_i - f_i
+        return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v)
+
+    def link_grad(self, y_i, f_i):
+        """gradient of the link function at y_i, given f_i w.r.t f_i
+
+        :y_i: datum number i
+        :f_i: latent variable f_i
+        :returns: float(gradient of likelihood evaluated at this point)
+
+        """
+        pass
+
+    def link_hess(self, y_i, f_i, f_j):
+        """hessian at this point (the hessian will be 0 unless i == j)
+        i.e. second derivative w.r.t f_i and f_j
+
+        :y_i: @todo
+        :f_i: @todo
+        :f_j: @todo
+        :returns: @todo
+
+        """
+        if f_i =
+        pass
+
diff --git a/python/models/coxGP.py b/python/models/coxGP.py
new file mode 100644
index 00000000..f61a8f46
--- /dev/null
+++ b/python/models/coxGP.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2013, Alan Saul
+
+from GPy.models import GP
+from .. import likelihoods
+from GPy import kern
+
+
+class cox_GP_regression(GP):
+    """
+    Cox Gaussian Process model for regression
+    """
+
+    def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None):
+        if kernel is None:
+            kernel = kern.rbf(X.shape[1])
+
+        likelihood = likelihoods.cox_piecewise(Y, normalize=normalize_Y)
+
+        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices)
diff --git a/python/testing/cox_tests.py b/python/testing/cox_tests.py
new file mode 100644
index 00000000..526f5c92
--- /dev/null
+++ b/python/testing/cox_tests.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2013, Alan Saul
+
+import unittest
+import numpy as np
+import GPy
+
+class coxGPTests(unittest.TestCase):
+    def test_laplace_approx(self):
+        pass
+
+if __name__ == "__main__":
+    print "Running unit tests, please be (very) patient..."
+    unittest.main()
+

From ad2c266c65120e1fabf0cf1825fc0c661084611b Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 13 Mar 2013 11:54:33 +0000
Subject: [PATCH 003/252] Added some comments

---
 python/likelihoods/likelihood_function.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index fd19675b..5d4e51ce 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -5,6 +5,9 @@ class student_t(GPy.likelihoods.likelihood_function):
     """Student t likelihood distribution
     For nomanclature see Bayesian Data Analysis 2003 p576
 
+    $$\ln(\frac{\Gamma(\frac{(v+1)}{2})}{\Gamma(\sqrt(v \pi \Gamma(\frac{v}{2}))})+ \ln(1+\frac{(y_i-f_i)^2}{\sigma v})^{-\frac{(v+1)}{2}}$$
+    TODO:Double check this
+
     Laplace:
     Needs functions to calculate
     ln p(yi|fi)
@@ -17,6 +20,8 @@ class student_t(GPy.likelihoods.likelihood_function):
 
     def link_function(self, y_i, f_i):
         """link_function $\ln p(y_i|f_i)$
+        $$\ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2}) - \ln \frac{v \pi \sigma}{2} - \frac{v+1}{2}\ln (1 + \frac{(y_{i} - f_{i})^{2}}{v\sigma})$$
+        TODO: Double check this
 
         :y_i: datum number i
         :f_i: latent variable f_i
@@ -24,11 +29,15 @@ class student_t(GPy.likelihoods.likelihood_function):
 
         """
         e = y_i - f_i
-        return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v)
+        return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) #Check the /v!
 
     def link_grad(self, y_i, f_i):
         """gradient of the link function at y_i, given f_i w.r.t f_i
 
+        derivative of log((gamma((v+1)/2)/gamma(sqrt(v*pi*gamma(v/2))))*(1+(t^2)/(a*v))^((-(v+1))/2)) with respect to t
+        $$\frac{(y_i - f_i)(v + 1)}{\sigma v (y_{i} - f_{i})^{2}}$$
+        TODO: Double check this
+
         :y_i: datum number i
         :f_i: latent variable f_i
         :returns: float(gradient of likelihood evaluated at this point)
@@ -40,6 +49,8 @@ class student_t(GPy.likelihoods.likelihood_function):
         """hessian at this point (the hessian will be 0 unless i == j)
         i.e. second derivative w.r.t f_i and f_j
 
+        second derivative of
+
         :y_i: @todo
         :f_i: @todo
         :f_j: @todo

From 3f114aa020fb678b1c52eb441bb079d9a0b8cd00 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 13 Mar 2013 17:55:41 +0000
Subject: [PATCH 004/252] Got most of laplace approximation working

---
 __init__.py                               |  0
 python/__init__.py                        |  0
 python/examples/__init__.py               |  0
 python/examples/laplace_approximations.py | 44 +++++++++++--
 python/likelihoods/Laplace.py             | 45 +++++++++++--
 python/likelihoods/__init__.py            |  0
 python/likelihoods/likelihood_function.py | 80 +++++++++++++----------
 python/models/__init__.py                 |  0
 python/testing/__init__.py                |  0
 9 files changed, 124 insertions(+), 45 deletions(-)
 create mode 100644 __init__.py
 create mode 100644 python/__init__.py
 create mode 100644 python/examples/__init__.py
 create mode 100644 python/likelihoods/__init__.py
 create mode 100644 python/models/__init__.py
 create mode 100644 python/testing/__init__.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/__init__.py b/python/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/examples/__init__.py b/python/examples/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 2f059831..0e1d3305 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -1,8 +1,9 @@
 import GPy
 import numpy as np
-import scipy as sp
-import scipy.stats
 import matplotlib.pyplot as plt
+from scipy.stats import t
+from coxGP.python.likelihoods.Laplace import Laplace
+from coxGP.python.likelihoods.likelihood_function import student_t
 
 
 def student_t_approx():
@@ -13,6 +14,41 @@ def student_t_approx():
     X = np.sort(np.random.uniform(0, 15, 70))[:, None]
     Y = np.sin(X)
 
+    #Add student t random noise to datapoints
+    deg_free = 1
+    noise = t.rvs(deg_free, loc=1.8, scale=1, size=Y.shape)
+    Y += noise
+
+    # Kernel object
+    print X.shape
+    kernel = GPy.kern.rbf(X.shape[1])
+
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y, kernel=kernel)
+
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+    # plot
+    #m.plot()
+    print m
+
+    #with a student t distribution, since it has heavy tails it should work well
+    likelihood_function = student_t(deg_free, sigma=1)
+    lap = Laplace(Y, likelihood_function)
+    cov = kernel.K(X)
+    lap.fit_full(cov)
+
+
+def noisy_laplace_approx():
+    """
+    Example of regressing with a student t likelihood
+    """
+    #Start a function, any function
+    X = np.sort(np.random.uniform(0, 15, 70))[:, None]
+    Y = np.sin(X)
+
     #Add some extreme value noise to some of the datapoints
     percent_corrupted = 0.05
     corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
@@ -20,12 +56,12 @@ def student_t_approx():
     np.random.shuffle(indices)
     corrupted_indices = indices[:corrupted_datums]
     print corrupted_indices
-    noise = np.random.uniform(-10,10,(len(corrupted_indices), 1))
+    noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1))
     Y[corrupted_indices] += noise
 
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
-    m = GPy.models.GP_regression(X,Y)
+    m = GPy.models.GP_regression(X, Y)
 
     # optimize
     m.ensure_default_constraints()
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index a0dbc65c..6efbfa30 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,8 +1,14 @@
-import nump as np
+import numpy as np
+import scipy as sp
 import GPy
 from GPy.util.linalg import jitchol
+from functools import partial
+from GPy.likelihoods.likelihood import likelihood
+from GPy.util.linalg import pdinv,mdot
 
-class Laplace(GPy.likelihoods.likelihood):
+
+
+class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
     def __init__(self,data,likelihood_function):
@@ -23,8 +29,6 @@ class Laplace(GPy.likelihoods.likelihood):
         :likelihood_function: @todo
 
         """
-        GPy.likelihoods.likelihood.__init__(self)
-
         self.data = data
         self.likelihood_function = likelihood_function
 
@@ -38,7 +42,7 @@ class Laplace(GPy.likelihoods.likelihood):
         GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
         that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
         """
-        raise NotImplementedError
+        z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised
 
     def fit_full(self, K):
         """
@@ -46,9 +50,38 @@ class Laplace(GPy.likelihoods.likelihood):
         For nomenclature see Rasmussen & Williams 2006
         :K: Covariance matrix
         """
-        self.f = np.zeros(self.N)
+        f = np.zeros((self.N, 1))
+        print K.shape
+        print f.shape
+        print self.data.shape
+        (Ki, _, _, log_Kdet) = pdinv(K)
+        obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2*np.pi))
 
         #Find \hat(f) using a newton raphson optimizer for example
+        #TODO: Add newton-raphson as subclass of optimizer class
+
+        #FIXME: Can we get rid of this horrible reshaping?
+        def obj(f):
+            f = f[:, None]
+            res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (Ki, f)) + obj_constant)
+            return float(res)
+
+        def obj_grad(f):
+            f = f[:, None]
+            res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(Ki, f))
+            return np.squeeze(res)
+
+        def obj_hess(f):
+            f = f[:, None]
+            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - Ki)
+            return np.squeeze(res)
+
+        self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
 
         #At this point get the hessian matrix
+        self.hess_hat = obj_hess(f_hat)
 
+        #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...)
+        self.height_unnormalised = obj(f_hat) #FIXME: Is it -1?
+
+        return _compute_GP_variables()
diff --git a/python/likelihoods/__init__.py b/python/likelihoods/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 5d4e51ce..78731199 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -1,62 +1,72 @@
-import GPy
-from scipy.special import gamma, gammaln
+from scipy.special import gammaln
+import numpy as np
+from GPy.likelihoods.likelihood_functions import likelihood_function
 
-class student_t(GPy.likelihoods.likelihood_function):
+
+class student_t(likelihood_function):
     """Student t likelihood distribution
     For nomanclature see Bayesian Data Analysis 2003 p576
 
-    $$\ln(\frac{\Gamma(\frac{(v+1)}{2})}{\Gamma(\sqrt(v \pi \Gamma(\frac{v}{2}))})+ \ln(1+\frac{(y_i-f_i)^2}{\sigma v})^{-\frac{(v+1)}{2}}$$
-    TODO:Double check this
+    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
 
     Laplace:
     Needs functions to calculate
     ln p(yi|fi)
     dln p(yi|fi)_dfi
-    d2ln p(yi|fi)_d2fi
+    d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free, sigma=1):
         self.v = deg_free
         self.sigma = 1
 
-    def link_function(self, y_i, f_i):
-        """link_function $\ln p(y_i|f_i)$
-        $$\ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2}) - \ln \frac{v \pi \sigma}{2} - \frac{v+1}{2}\ln (1 + \frac{(y_{i} - f_{i})^{2}}{v\sigma})$$
-        TODO: Double check this
+    def link_function(self, y, f):
+        """link_function $\ln p(y|f)$
+        $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
 
-        :y_i: datum number i
-        :f_i: latent variable f_i
+        :y: datum number i
+        :f: latent variable f
         :returns: float(likelihood evaluated for this point)
 
         """
-        e = y_i - f_i
-        return gammaln((v+1)*0.5) - gammaln(v*0.5) - np.ln(v*np.pi*sigma)*0.5 - (v+1)*0.5*np.ln(1 + ((e/sigma)**2)/v) #Check the /v!
+        e = y - f
+        #print "Link ", y.shape, f.shape, e.shape
+        objective = (gammaln((self.v + 1) * 0.5)
+                - gammaln(self.v * 0.5)
+                + np.log(self.sigma * np.sqrt(self.v * np.pi))
+                - (self.v + 1) * 0.5
+                * np.log(1 + ((e**2 / self.sigma**2) / self.v))
+                )
+        return np.sum(objective)
 
-    def link_grad(self, y_i, f_i):
-        """gradient of the link function at y_i, given f_i w.r.t f_i
+    def link_grad(self, y, f):
+        """
+        Gradient of the link function at y, given f w.r.t f
 
-        derivative of log((gamma((v+1)/2)/gamma(sqrt(v*pi*gamma(v/2))))*(1+(t^2)/(a*v))^((-(v+1))/2)) with respect to t
-        $$\frac{(y_i - f_i)(v + 1)}{\sigma v (y_{i} - f_{i})^{2}}$$
-        TODO: Double check this
+        $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
 
-        :y_i: datum number i
-        :f_i: latent variable f_i
+        :y: datum number i
+        :f: latent variable f
         :returns: float(gradient of likelihood evaluated at this point)
 
         """
-        pass
-
-    def link_hess(self, y_i, f_i, f_j):
-        """hessian at this point (the hessian will be 0 unless i == j)
-        i.e. second derivative w.r.t f_i and f_j
-
-        second derivative of
-
-        :y_i: @todo
-        :f_i: @todo
-        :f_j: @todo
-        :returns: @todo
+        e = y - f
+        #print "Grad ", y.shape, f.shape, e.shape
+        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        return grad
 
+    def link_hess(self, y, f):
         """
-        if f_i =
-        pass
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
 
+        Will return diaganol of hessian, since every where else it is 0
+
+        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+
+        :y: datum number i
+        :f: latent variable f
+        :returns: float(second derivative of likelihood evaluated at this point)
+        """
+        e = y - f
+        hess = ((self.v + 1) * e) / ((((self.sigma**2)*self.v) + e**2)**2)
+        return hess
diff --git a/python/models/__init__.py b/python/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/testing/__init__.py b/python/testing/__init__.py
new file mode 100644
index 00000000..e69de29b

From f9535c858a653e08a32a8633fe37577c87812820 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 14 Mar 2013 15:30:22 +0000
Subject: [PATCH 005/252] Trying to 'debug'

---
 python/examples/laplace_approximations.py | 22 +++++++++++---
 python/likelihoods/Laplace.py             | 25 +++++++++------
 python/likelihoods/likelihood_function.py | 37 ++++++++++++-----------
 3 files changed, 52 insertions(+), 32 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 0e1d3305..5642d8a4 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -1,7 +1,7 @@
 import GPy
 import numpy as np
 import matplotlib.pyplot as plt
-from scipy.stats import t
+from scipy.stats import t, norm
 from coxGP.python.likelihoods.Laplace import Laplace
 from coxGP.python.likelihoods.likelihood_function import student_t
 
@@ -11,12 +11,13 @@ def student_t_approx():
     Example of regressing with a student t likelihood
     """
     #Start a function, any function
-    X = np.sort(np.random.uniform(0, 15, 70))[:, None]
+    X = np.sort(np.random.uniform(0, 15, 100))[:, None]
     Y = np.sin(X)
 
     #Add student t random noise to datapoints
-    deg_free = 1
-    noise = t.rvs(deg_free, loc=1.8, scale=1, size=Y.shape)
+    deg_free = 2.5
+    t_rv = t(deg_free, loc=5, scale=1)
+    noise = t_rv.rvs(size=Y.shape)
     Y += noise
 
     # Kernel object
@@ -39,6 +40,19 @@ def student_t_approx():
     lap = Laplace(Y, likelihood_function)
     cov = kernel.K(X)
     lap.fit_full(cov)
+    #Get one sample (just look at a single Y
+    mode = float(lap.f_hat[0])
+    variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables
+    #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables
+    normalised_approx = norm(loc=mode, scale=variance)
+    print "Normal with mode %f, and variance %f" % (mode, variance)
+    print lap.height_unnormalised
+
+    test_range = np.arange(0, 10, 0.1)
+    print np.diagonal(lap.hess_hat)
+    plt.plot(test_range, t_rv.pdf(test_range))
+    plt.plot(test_range, normalised_approx.pdf(test_range))
+    plt.show()
 
 
 def noisy_laplace_approx():
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 6efbfa30..08ae0e6f 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -5,13 +5,13 @@ from GPy.util.linalg import jitchol
 from functools import partial
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv,mdot
-
+from scipy.stats import norm
 
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
-    def __init__(self,data,likelihood_function):
+    def __init__(self, data, likelihood_function):
         """
         Laplace Approximation
 
@@ -42,7 +42,13 @@ class Laplace(likelihood):
         GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
         that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
         """
-        z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised
+        #z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised
+        normalised_approx = norm(loc=self.f_hat, scale=self.hess_hat)
+        self.Z = normalised_approx.pdf(self.f_hat)/self.height_unnormalised
+        #self.Y =
+        #self.YYT =
+        #self.covariance_matrix =
+        #self.precision =
 
     def fit_full(self, K):
         """
@@ -51,11 +57,9 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         f = np.zeros((self.N, 1))
-        print K.shape
-        print f.shape
-        print self.data.shape
+        #K = np.diag(np.ones(self.N))
         (Ki, _, _, log_Kdet) = pdinv(K)
-        obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2*np.pi))
+        obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi))
 
         #Find \hat(f) using a newton raphson optimizer for example
         #TODO: Add newton-raphson as subclass of optimizer class
@@ -77,11 +81,12 @@ class Laplace(likelihood):
             return np.squeeze(res)
 
         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        print self.f_hat
 
         #At this point get the hessian matrix
-        self.hess_hat = obj_hess(f_hat)
+        self.hess_hat = obj_hess(self.f_hat)
 
         #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...)
-        self.height_unnormalised = obj(f_hat) #FIXME: Is it -1?
+        self.height_unnormalised = obj(self.f_hat) #FIXME: Is it -1?
 
-        return _compute_GP_variables()
+        return self._compute_GP_variables()
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 78731199..46128de7 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -15,27 +15,27 @@ class student_t(likelihood_function):
     dln p(yi|fi)_dfi
     d2ln p(yi|fi)_d2fifj
     """
-    def __init__(self, deg_free, sigma=1):
+    def __init__(self, deg_free, sigma=2):
         self.v = deg_free
-        self.sigma = 1
+        self.sigma = sigma
 
     def link_function(self, y, f):
         """link_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
 
-        :y: datum number i
-        :f: latent variable f
+        :y: data
+        :f: latent variables f
         :returns: float(likelihood evaluated for this point)
 
         """
+        assert y.shape[0] == f.shape[0]
         e = y - f
-        #print "Link ", y.shape, f.shape, e.shape
         objective = (gammaln((self.v + 1) * 0.5)
-                - gammaln(self.v * 0.5)
-                + np.log(self.sigma * np.sqrt(self.v * np.pi))
-                - (self.v + 1) * 0.5
-                * np.log(1 + ((e**2 / self.sigma**2) / self.v))
-                )
+                     - gammaln(self.v * 0.5)
+                     + np.log(self.sigma * np.sqrt(self.v * np.pi))
+                     - (self.v + 1) * 0.5
+                     * np.log(1 + ((e**2 / self.sigma**2) / self.v))
+                     )
         return np.sum(objective)
 
     def link_grad(self, y, f):
@@ -44,13 +44,13 @@ class student_t(likelihood_function):
 
         $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
 
-        :y: datum number i
-        :f: latent variable f
-        :returns: float(gradient of likelihood evaluated at this point)
+        :y: data
+        :f: latent variables f
+        :returns: gradient of likelihood evaluated at points
 
         """
+        assert y.shape[0] == f.shape[0]
         e = y - f
-        #print "Grad ", y.shape, f.shape, e.shape
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
         return grad
 
@@ -63,10 +63,11 @@ class student_t(likelihood_function):
 
         $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
 
-        :y: datum number i
-        :f: latent variable f
-        :returns: float(second derivative of likelihood evaluated at this point)
+        :y: data
+        :f: latent variables f
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
+        assert y.shape[0] == f.shape[0]
         e = y - f
-        hess = ((self.v + 1) * e) / ((((self.sigma**2)*self.v) + e**2)**2)
+        hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
         return hess

From 34ae852eea8d5f6cdc48028d4f21457c7f0b5259 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 15 Mar 2013 17:38:13 +0000
Subject: [PATCH 006/252] got an idea of how to implement! written in docs

---
 python/likelihoods/Laplace.py | 38 ++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 08ae0e6f..568fcef0 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -41,10 +41,26 @@ class Laplace(likelihood):
 
         GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
         that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
+
+        Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal)
+        then we have a rescaled normal distibution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f)
+        due to the z rescaling.
+
+        at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1)
+
+        This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1)
+        giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f)
+
+        $$\tilde{Y} = \tilde{\Sigma} Hf$$
+        where
+        $$\tilde{\Sigma}^{-1} = H - K^{-1}$$
+        i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$
+        since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$
+        and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
+
         """
-        #z_hat = N(f_hat|f_hat, hess_hat) / self.height_unnormalised
-        normalised_approx = norm(loc=self.f_hat, scale=self.hess_hat)
-        self.Z = normalised_approx.pdf(self.f_hat)/self.height_unnormalised
+        self.Sigma_tilde = self.hess_hat -
+        self.Z =
         #self.Y =
         #self.YYT =
         #self.covariance_matrix =
@@ -58,8 +74,8 @@ class Laplace(likelihood):
         """
         f = np.zeros((self.N, 1))
         #K = np.diag(np.ones(self.N))
-        (Ki, _, _, log_Kdet) = pdinv(K)
-        obj_constant = (0.5 * log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi))
+        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
+        obj_constant = (0.5 * self.log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi))
 
         #Find \hat(f) using a newton raphson optimizer for example
         #TODO: Add newton-raphson as subclass of optimizer class
@@ -67,17 +83,17 @@ class Laplace(likelihood):
         #FIXME: Can we get rid of this horrible reshaping?
         def obj(f):
             f = f[:, None]
-            res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (Ki, f)) + obj_constant)
+            res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (self.Ki, f)) + obj_constant)
             return float(res)
 
         def obj_grad(f):
             f = f[:, None]
-            res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(Ki, f))
+            res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(self.Ki, f))
             return np.squeeze(res)
 
         def obj_hess(f):
             f = f[:, None]
-            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - Ki)
+            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - self.Ki)
             return np.squeeze(res)
 
         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
@@ -87,6 +103,10 @@ class Laplace(likelihood):
         self.hess_hat = obj_hess(self.f_hat)
 
         #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...)
-        self.height_unnormalised = obj(self.f_hat) #FIXME: Is it -1?
+        self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1?
+        #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
+        #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
+        #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
+        self.z_hat = np.exp(-0.5*np.log(np.linalg.det(hess_hat)) + self.height_unnormalised)
 
         return self._compute_GP_variables()

From 2bf1cf0eb6596773c2f75a06f152b3a7cfd66081 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 18 Mar 2013 15:59:12 +0000
Subject: [PATCH 007/252] following naming convention better, lots of inverses
 which should be able to get rid of one or two, unsure if it works

---
 python/examples/laplace_approximations.py | 17 +++++----
 python/likelihoods/Laplace.py             | 43 +++++++++++++----------
 python/likelihoods/likelihood_function.py |  9 ++---
 3 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 5642d8a4..aa8cdcb4 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -41,18 +41,21 @@ def student_t_approx():
     cov = kernel.K(X)
     lap.fit_full(cov)
     #Get one sample (just look at a single Y
-    mode = float(lap.f_hat[0])
-    variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables
+    #mode = float(lap.f_hat[0])
+    #variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables
     #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables
-    normalised_approx = norm(loc=mode, scale=variance)
-    print "Normal with mode %f, and variance %f" % (mode, variance)
-    print lap.height_unnormalised
 
     test_range = np.arange(0, 10, 0.1)
-    print np.diagonal(lap.hess_hat)
     plt.plot(test_range, t_rv.pdf(test_range))
-    plt.plot(test_range, normalised_approx.pdf(test_range))
+    for i in xrange(X.shape[0]):
+        mode = lap.f_hat[i]
+        covariance = lap.hess_hat_i[i,i]
+        scaling = np.exp(lap.ln_z_hat)
+        normalised_approx = norm(loc=mode, scale=covariance)
+        print "Normal with mode %f, and variance %f" % (mode, covariance)
+        plt.plot(test_range, normalised_approx.pdf(test_range))
     plt.show()
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
 
 def noisy_laplace_approx():
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 568fcef0..9d622b0d 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,12 +1,10 @@
 import numpy as np
 import scipy as sp
 import GPy
-from GPy.util.linalg import jitchol
+#from GPy.util.linalg import jitchol
 from functools import partial
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv,mdot
-from scipy.stats import norm
-
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
@@ -35,6 +33,8 @@ class Laplace(likelihood):
         #Inital values
         self.N, self.D = self.data.shape
 
+        self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))
+
     def _compute_GP_variables(self):
         """
         Generates data Y which would give the normal distribution identical to the laplace approximation
@@ -59,12 +59,15 @@ class Laplace(likelihood):
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
 
         """
-        self.Sigma_tilde = self.hess_hat -
-        self.Z =
-        #self.Y =
-        #self.YYT =
-        #self.covariance_matrix =
-        #self.precision =
+        self.Sigma_tilde_i = self.hess_hat + self.Ki
+        #Do we really need to inverse Sigma_tilde_i? :(
+        (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i)
+        Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess?
+        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde, (self.Sigma_tilde_i, Y_tilde))))
+        self.Y = Y_tilde
+        self.covariance_matrix = self.Sigma_tilde
+        self.precision = np.diag(self.Sigma_tilde)[:, None]
+        self.YYT = np.dot(self.Y, self.Y)
 
     def fit_full(self, K):
         """
@@ -75,38 +78,40 @@ class Laplace(likelihood):
         f = np.zeros((self.N, 1))
         #K = np.diag(np.ones(self.N))
         (self.Ki, _, _, self.log_Kdet) = pdinv(K)
-        obj_constant = (0.5 * self.log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi))
-
+        LOG_K_CONST = -(0.5 * self.log_Kdet)
+        OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST
         #Find \hat(f) using a newton raphson optimizer for example
         #TODO: Add newton-raphson as subclass of optimizer class
 
         #FIXME: Can we get rid of this horrible reshaping?
         def obj(f):
-            f = f[:, None]
-            res = -1 * (self.likelihood_function.link_function(self.data, f) - 0.5 * mdot(f.T, (self.Ki, f)) + obj_constant)
+            #f = f[:, None]
+            res = -1 * (self.likelihood_function.link_function(self.data[:,0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST)
             return float(res)
 
         def obj_grad(f):
-            f = f[:, None]
-            res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(self.Ki, f))
+            #f = f[:, None]
+            res = -1 * (self.likelihood_function.link_grad(self.data[:,0], f) - mdot(self.Ki, f))
             return np.squeeze(res)
 
         def obj_hess(f):
-            f = f[:, None]
-            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - self.Ki)
+            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki)
             return np.squeeze(res)
 
         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
         print self.f_hat
 
         #At this point get the hessian matrix
-        self.hess_hat = obj_hess(self.f_hat)
+        self.hess_hat = -1*np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) #-1*obj_hess(self.f_hat) + self.Ki
+        #self.hess_hat = -1*obj_hess(self.f_hat) + self.Ki
+        (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat + self.Ki)
 
         #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...)
         self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1?
         #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
         #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
         #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
-        self.z_hat = np.exp(-0.5*np.log(np.linalg.det(hess_hat)) + self.height_unnormalised)
+        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) + self.height_unnormalised - self.NORMAL_CONST #Unsure whether its log_hess or log_hess_i
+
 
         return self._compute_GP_variables()
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 46128de7..8adbf86c 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -28,7 +28,7 @@ class student_t(likelihood_function):
         :returns: float(likelihood evaluated for this point)
 
         """
-        assert y.shape[0] == f.shape[0]
+        assert y.shape == f.shape
         e = y - f
         objective = (gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
@@ -49,7 +49,7 @@ class student_t(likelihood_function):
         :returns: gradient of likelihood evaluated at points
 
         """
-        assert y.shape[0] == f.shape[0]
+        assert y.shape == f.shape
         e = y - f
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
         return grad
@@ -67,7 +67,8 @@ class student_t(likelihood_function):
         :f: latent variables f
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
-        assert y.shape[0] == f.shape[0]
+        assert y.shape == f.shape
         e = y - f
-        hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
+        #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
+        hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2)
         return hess

From 46d59c94b27cabe61056b71aa26d1293779c0697 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 19 Mar 2013 11:47:53 +0000
Subject: [PATCH 008/252] Just breaking some things...

---
 python/examples/laplace_approximations.py | 88 +++++++++++++++--------
 python/likelihoods/Laplace.py             | 52 ++++++++++----
 python/likelihoods/likelihood_function.py | 16 ++++-
 3 files changed, 113 insertions(+), 43 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index aa8cdcb4..73c8f67f 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -16,47 +16,75 @@ def student_t_approx():
 
     #Add student t random noise to datapoints
     deg_free = 2.5
-    t_rv = t(deg_free, loc=5, scale=1)
+    t_rv = t(deg_free, loc=0, scale=1)
     noise = t_rv.rvs(size=Y.shape)
     Y += noise
 
+    #Add some extreme value noise to some of the datapoints
+    #percent_corrupted = 0.05
+    #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
+    #indices = np.arange(Y.shape[0])
+    #np.random.shuffle(indices)
+    #corrupted_indices = indices[:corrupted_datums]
+    #print corrupted_indices
+    #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
+    #Y[corrupted_indices] += noise
+
     # Kernel object
-    print X.shape
-    kernel = GPy.kern.rbf(X.shape[1])
+    #print X.shape
+    #kernel = GPy.kern.rbf(X.shape[1])
 
-    #A GP should completely break down due to the points as they get a lot of weight
-    # create simple GP model
-    m = GPy.models.GP_regression(X, Y, kernel=kernel)
+    ##A GP should completely break down due to the points as they get a lot of weight
+    ## create simple GP model
+    #m = GPy.models.GP_regression(X, Y, kernel=kernel)
 
-    # optimize
-    m.ensure_default_constraints()
-    m.optimize()
-    # plot
-    #m.plot()
-    print m
+    ## optimize
+    #m.ensure_default_constraints()
+    #m.optimize()
+    ## plot
+    ##m.plot()
+    #print m
 
     #with a student t distribution, since it has heavy tails it should work well
-    likelihood_function = student_t(deg_free, sigma=1)
-    lap = Laplace(Y, likelihood_function)
-    cov = kernel.K(X)
-    lap.fit_full(cov)
-    #Get one sample (just look at a single Y
-    #mode = float(lap.f_hat[0])
-    #variance = float((deg_free/(deg_free-2))) #BUG: Not convinced this is giving reasonable variables
-    #variance = float((deg_free/(deg_free-2)) + np.diagonal(lap.hess_hat)[0]) #BUG: Not convinced this is giving reasonable variables
+    #likelihood_function = student_t(deg_free, sigma=1)
+    #lap = Laplace(Y, likelihood_function)
+    #cov = kernel.K(X)
+    #lap.fit_full(cov)
 
-    test_range = np.arange(0, 10, 0.1)
-    plt.plot(test_range, t_rv.pdf(test_range))
-    for i in xrange(X.shape[0]):
-        mode = lap.f_hat[i]
-        covariance = lap.hess_hat_i[i,i]
-        scaling = np.exp(lap.ln_z_hat)
-        normalised_approx = norm(loc=mode, scale=covariance)
-        print "Normal with mode %f, and variance %f" % (mode, covariance)
-        plt.plot(test_range, normalised_approx.pdf(test_range))
-    plt.show()
+    #test_range = np.arange(0, 10, 0.1)
+    #plt.plot(test_range, t_rv.pdf(test_range))
+    #for i in xrange(X.shape[0]):
+        #mode = lap.f_hat[i]
+        #covariance = lap.hess_hat_i[i,i]
+        #scaling = np.exp(lap.ln_z_hat)
+        #normalised_approx = norm(loc=mode, scale=covariance)
+        #print "Normal with mode %f, and variance %f" % (mode, covariance)
+        #plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    #plt.show()
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+    # Likelihood object
+    t_distribution = student_t(deg_free, sigma=1)
+    stu_t_likelihood = Laplace(Y, t_distribution)
+    kernel = GPy.kern.rbf(X.shape[1])
+
+    m = GPy.models.GP(X, stu_t_likelihood, kernel)
+    m.ensure_default_constraints()
+
+    m.update_likelihood_approximation()
+    print "NEW MODEL"
+    print(m)
+
+    # optimize
+    #m.optimize()
+    print(m)
+
+    # plot
+    m.plot()
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
+    return m
+
 
 def noisy_laplace_approx():
     """
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 9d622b0d..23db6abd 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -5,6 +5,7 @@ import GPy
 from functools import partial
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv,mdot
+import numpy.testing.assert_array_equal
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
@@ -35,6 +36,29 @@ class Laplace(likelihood):
 
         self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))
 
+        #Initial values for the GP variables
+        self.Y = np.zeros((self.N,1))
+        self.covariance_matrix = np.eye(self.N)
+        self.precision = np.ones(self.N)[:,None]
+        self.Z = 0
+        self.YYT = None
+
+    def predictive_values(self,mu,var):
+        return self.likelihood_function.predictive_values(mu,var)
+
+    def _get_params(self):
+        return np.zeros(0)
+
+    def _get_param_names(self):
+        return []
+
+    def _set_params(self,p):
+        pass # TODO: Laplace likelihood might want to take some parameters...
+
+    def _gradients(self,partial):
+        raise NotImplementedError
+        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
+
     def _compute_GP_variables(self):
         """
         Generates data Y which would give the normal distribution identical to the laplace approximation
@@ -63,11 +87,14 @@ class Laplace(likelihood):
         #Do we really need to inverse Sigma_tilde_i? :(
         (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i)
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess?
-        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde, (self.Sigma_tilde_i, Y_tilde))))
+        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))))
+
+        self.Z = self.Z_tilde
         self.Y = Y_tilde
         self.covariance_matrix = self.Sigma_tilde
-        self.precision = np.diag(self.Sigma_tilde)[:, None]
-        self.YYT = np.dot(self.Y, self.Y)
+        self.precision = 1/np.diag(self.Sigma_tilde)[:, None]
+        self.YYT = np.dot(self.Y, self.Y.T)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     def fit_full(self, K):
         """
@@ -76,7 +103,6 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         f = np.zeros((self.N, 1))
-        #K = np.diag(np.ones(self.N))
         (self.Ki, _, _, self.log_Kdet) = pdinv(K)
         LOG_K_CONST = -(0.5 * self.log_Kdet)
         OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST
@@ -95,23 +121,25 @@ class Laplace(likelihood):
             return np.squeeze(res)
 
         def obj_hess(f):
-            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki)
+            res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki)
             return np.squeeze(res)
 
         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
-        print self.f_hat
 
         #At this point get the hessian matrix
-        self.hess_hat = -1*np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) #-1*obj_hess(self.f_hat) + self.Ki
-        #self.hess_hat = -1*obj_hess(self.f_hat) + self.Ki
-        (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat + self.Ki)
+        self.hess_hat = np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) + self.Ki
+        (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat)
+        (self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat_i)
+
+        np.testing.assert_array_equal(self.hess_hat, hess_hat_new)
 
         #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...)
-        self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1?
+        #self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1?
         #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
         #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
         #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
-        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) + self.height_unnormalised - self.NORMAL_CONST #Unsure whether its log_hess or log_hess_i
-
+        #Unsure whether its log_hess or log_hess_i
+        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(f.T, (self.Ki, f))
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return self._compute_GP_variables()
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 8adbf86c..e70cdc8d 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -1,7 +1,7 @@
 from scipy.special import gammaln
 import numpy as np
 from GPy.likelihoods.likelihood_functions import likelihood_function
-
+from scipy import stats
 
 class student_t(likelihood_function):
     """Student t likelihood distribution
@@ -72,3 +72,17 @@ class student_t(likelihood_function):
         #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
         hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2)
         return hess
+
+    def predictive_values(self, mu, var):
+        """
+        Compute  mean, and conficence interval (percentiles 5 and 95) of the  prediction
+        """
+        mean = np.exp(mu)
+        p_025 = stats.t.ppf(025,mean)
+        p_975 = stats.t.ppf(975,mean)
+
+        #p_025 = tmp[:,0]
+        #p_975 = tmp[:,1]
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        return mean,p_025,p_975
+

From a9d555597653c24bc67812776514e29066216d66 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 19 Mar 2013 18:21:57 +0000
Subject: [PATCH 009/252] Worked out in terms of W, needs gradients
 implementing

---
 python/examples/laplace_approximations.py | 44 ++++++++++-----------
 python/likelihoods/Laplace.py             | 48 +++++++++++++++--------
 python/likelihoods/likelihood_function.py |  5 ++-
 3 files changed, 57 insertions(+), 40 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 73c8f67f..c8d06ab2 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -15,13 +15,13 @@ def student_t_approx():
     Y = np.sin(X)
 
     #Add student t random noise to datapoints
-    deg_free = 2.5
+    deg_free = 3.5
     t_rv = t(deg_free, loc=0, scale=1)
     noise = t_rv.rvs(size=Y.shape)
     Y += noise
 
     #Add some extreme value noise to some of the datapoints
-    #percent_corrupted = 0.05
+    #percent_corrupted = 0.15
     #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
     #indices = np.arange(Y.shape[0])
     #np.random.shuffle(indices)
@@ -31,11 +31,11 @@ def student_t_approx():
     #Y[corrupted_indices] += noise
 
     # Kernel object
-    #print X.shape
-    #kernel = GPy.kern.rbf(X.shape[1])
+    print X.shape
+    kernel = GPy.kern.rbf(X.shape[1])
 
-    ##A GP should completely break down due to the points as they get a lot of weight
-    ## create simple GP model
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
     #m = GPy.models.GP_regression(X, Y, kernel=kernel)
 
     ## optimize
@@ -46,27 +46,27 @@ def student_t_approx():
     #print m
 
     #with a student t distribution, since it has heavy tails it should work well
-    #likelihood_function = student_t(deg_free, sigma=1)
-    #lap = Laplace(Y, likelihood_function)
-    #cov = kernel.K(X)
-    #lap.fit_full(cov)
+    likelihood_function = student_t(deg_free, sigma=1)
+    lap = Laplace(Y, likelihood_function)
+    cov = kernel.K(X)
+    lap.fit_full(cov)
 
-    #test_range = np.arange(0, 10, 0.1)
-    #plt.plot(test_range, t_rv.pdf(test_range))
-    #for i in xrange(X.shape[0]):
-        #mode = lap.f_hat[i]
-        #covariance = lap.hess_hat_i[i,i]
-        #scaling = np.exp(lap.ln_z_hat)
-        #normalised_approx = norm(loc=mode, scale=covariance)
-        #print "Normal with mode %f, and variance %f" % (mode, covariance)
-        #plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
-    #plt.show()
-    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    test_range = np.arange(0, 10, 0.1)
+    plt.plot(test_range, t_rv.pdf(test_range))
+    for i in xrange(X.shape[0]):
+        mode = lap.f_hat[i]
+        covariance = lap.hess_hat_i[i,i]
+        scaling = np.exp(lap.ln_z_hat)
+        normalised_approx = norm(loc=mode, scale=covariance)
+        print "Normal with mode %f, and variance %f" % (mode, covariance)
+        plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    plt.show()
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     # Likelihood object
     t_distribution = student_t(deg_free, sigma=1)
     stu_t_likelihood = Laplace(Y, t_distribution)
-    kernel = GPy.kern.rbf(X.shape[1])
+    kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1])
 
     m = GPy.models.GP(X, stu_t_likelihood, kernel)
     m.ensure_default_constraints()
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 23db6abd..84128e3a 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,11 +1,11 @@
 import numpy as np
 import scipy as sp
 import GPy
-#from GPy.util.linalg import jitchol
+from scipy.linalg import cholesky, eig, inv
 from functools import partial
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv,mdot
-import numpy.testing.assert_array_equal
+#import numpy.testing.assert_array_equal
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
@@ -56,8 +56,8 @@ class Laplace(likelihood):
         pass # TODO: Laplace likelihood might want to take some parameters...
 
     def _gradients(self,partial):
+        return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
-        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
 
     def _compute_GP_variables(self):
         """
@@ -83,16 +83,23 @@ class Laplace(likelihood):
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
 
         """
-        self.Sigma_tilde_i = self.hess_hat + self.Ki
+        self.Sigma_tilde_i = self.hess_hat_i #self.W #self.hess_hat_i - self.Ki
         #Do we really need to inverse Sigma_tilde_i? :(
-        (self.Sigma_tilde, _, _, self.log_Sig_i_det) = pdinv(self.Sigma_tilde_i)
-        Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat) #f_hat? should be f but we must have optimized for them I guess?
-        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST + (0.5 * mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))))
+        if self.likelihood_function.log_concave:
+            (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
+        else:
+            self.Sigma_tilde = inv(self.Sigma_tilde_i)
+        #f_hat? should be f but we must have optimized for them I guess?
+        Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
+        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST
+                              - 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                              + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                              )
 
         self.Z = self.Z_tilde
         self.Y = Y_tilde
         self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1/np.diag(self.Sigma_tilde)[:, None]
+        self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
         self.YYT = np.dot(self.Y, self.Y.T)
         import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
@@ -112,34 +119,41 @@ class Laplace(likelihood):
         #FIXME: Can we get rid of this horrible reshaping?
         def obj(f):
             #f = f[:, None]
-            res = -1 * (self.likelihood_function.link_function(self.data[:,0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST)
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST)
             return float(res)
 
         def obj_grad(f):
             #f = f[:, None]
-            res = -1 * (self.likelihood_function.link_grad(self.data[:,0], f) - mdot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
             return np.squeeze(res)
 
         def obj_hess(f):
-            res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:,0], f)) - self.Ki)
+            res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
             return np.squeeze(res)
 
         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
 
         #At this point get the hessian matrix
-        self.hess_hat = np.diag(self.likelihood_function.link_hess(self.data[:,0], self.f_hat)) + self.Ki
+        self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat))
+        self.hess_hat = self.Ki + self.W
         (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat)
-        (self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat_i)
 
-        np.testing.assert_array_equal(self.hess_hat, hess_hat_new)
+        #Check hess_hat is positive definite
+        try:
+            cholesky(self.hess_hat)
+        except:
+            raise ValueError("Must be positive definite")
+
+        #Check its eigenvalues are positive
+        eigenvalues = eig(self.hess_hat)
+        if not np.all(eigenvalues > 0):
+            raise ValueError("Eigen values not positive")
 
-        #Need to add the constant as we previously were trying to avoid computing it (seems like a small overhead though...)
-        #self.height_unnormalised = -1*obj(self.f_hat) #FIXME: Is it - obj constant and *-1?
         #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
         #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
         #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
         #Unsure whether its log_hess or log_hess_i
-        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(f.T, (self.Ki, f))
+        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(self.f_hat.T, (self.Ki, self.f_hat))
         import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return self._compute_GP_variables()
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index e70cdc8d..c4823703 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -19,6 +19,9 @@ class student_t(likelihood_function):
         self.v = deg_free
         self.sigma = sigma
 
+        #FIXME: This should be in the superclass
+        self.log_concave = False
+
     def link_function(self, y, f):
         """link_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
@@ -70,7 +73,7 @@ class student_t(likelihood_function):
         assert y.shape == f.shape
         e = y - f
         #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
-        hess = ((self.v + 1) * (e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2) * self.v) + e**2)**2)
+        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
         return hess
 
     def predictive_values(self, mu, var):

From 474d5484b06bdbceefa08fa573d28326bb3f8a92 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 21 Mar 2013 14:00:22 +0000
Subject: [PATCH 010/252] Changing definitions again...

---
 python/examples/laplace_approximations.py | 15 +++++---
 python/likelihoods/Laplace.py             | 44 +++++++++++++++--------
 python/likelihoods/likelihood_function.py | 10 ++----
 3 files changed, 43 insertions(+), 26 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index c8d06ab2..6f2b19aa 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -15,8 +15,9 @@ def student_t_approx():
     Y = np.sin(X)
 
     #Add student t random noise to datapoints
-    deg_free = 3.5
-    t_rv = t(deg_free, loc=0, scale=1)
+    deg_free = 100000.5
+    real_var = 4
+    t_rv = t(deg_free, loc=0, scale=real_var)
     noise = t_rv.rvs(size=Y.shape)
     Y += noise
 
@@ -46,7 +47,7 @@ def student_t_approx():
     #print m
 
     #with a student t distribution, since it has heavy tails it should work well
-    likelihood_function = student_t(deg_free, sigma=1)
+    likelihood_function = student_t(deg_free, sigma=real_var)
     lap = Laplace(Y, likelihood_function)
     cov = kernel.K(X)
     lap.fit_full(cov)
@@ -64,7 +65,7 @@ def student_t_approx():
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     # Likelihood object
-    t_distribution = student_t(deg_free, sigma=1)
+    t_distribution = student_t(deg_free, sigma=real_var)
     stu_t_likelihood = Laplace(Y, t_distribution)
     kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1])
 
@@ -77,12 +78,16 @@ def student_t_approx():
 
     # optimize
     #m.optimize()
-    print(m)
+    #print(m)
 
     # plot
     m.plot()
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
+    m.optimize()
+    print(m)
+
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     return m
 
 
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 84128e3a..b002034d 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,7 +1,7 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv
+from scipy.linalg import cholesky, eig, inv, det
 from functools import partial
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv,mdot
@@ -43,8 +43,10 @@ class Laplace(likelihood):
         self.Z = 0
         self.YYT = None
 
-    def predictive_values(self,mu,var):
-        return self.likelihood_function.predictive_values(mu,var)
+    def predictive_values(self, mu, var, full_cov):
+        if full_cov:
+            raise NotImplementedError("Cannot make correlated predictions with an EP likelihood")
+        return self.likelihood_function.predictive_values(mu, var)
 
     def _get_params(self):
         return np.zeros(0)
@@ -52,10 +54,10 @@ class Laplace(likelihood):
     def _get_param_names(self):
         return []
 
-    def _set_params(self,p):
+    def _set_params(self, p):
         pass # TODO: Laplace likelihood might want to take some parameters...
 
-    def _gradients(self,partial):
+    def _gradients(self, partial):
         return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
 
@@ -83,7 +85,13 @@ class Laplace(likelihood):
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
 
         """
-        self.Sigma_tilde_i = self.hess_hat_i #self.W #self.hess_hat_i - self.Ki
+        self.Sigma_tilde_i = self.W #self.hess_hat_i
+        #Check it isn't singular!
+        epsilon = 1e-2
+        """
+        if np.abs(det(self.Sigma_tilde_i)) < epsilon:
+            raise ValueError("inverse covariance must be non-singular to inverse!")
+        """
         #Do we really need to inverse Sigma_tilde_i? :(
         if self.likelihood_function.log_concave:
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
@@ -91,12 +99,17 @@ class Laplace(likelihood):
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
         #f_hat? should be f but we must have optimized for them I guess?
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
-        self.Z_tilde = np.exp(self.ln_z_hat - self.NORMAL_CONST
-                              - 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
-                              + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                              )
+        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                        #- 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                        #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                   #)
+        Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                   + 0.5*self.log_hess_hat_det
+                   + 0.5*mdot(self.f_hat, self.Ki , self.f_hat)
+                   + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                   )
 
-        self.Z = self.Z_tilde
+        self.Z = Z_tilde
         self.Y = Y_tilde
         self.covariance_matrix = self.Sigma_tilde
         self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
@@ -128,7 +141,7 @@ class Laplace(likelihood):
             return np.squeeze(res)
 
         def obj_hess(f):
-            res = -1 * (-np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
+            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
             return np.squeeze(res)
 
         self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
@@ -153,7 +166,10 @@ class Laplace(likelihood):
         #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
         #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
         #Unsure whether its log_hess or log_hess_i
-        self.ln_z_hat = -0.5*np.log(self.log_hess_hat_det) - 0.5*self.log_Kdet + -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat) - mdot(self.f_hat.T, (self.Ki, self.f_hat))
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        self.ln_z_hat = (-0.5*self.log_hess_hat_det
+                         - 0.5*self.log_Kdet
+                         -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+                         - mdot(self.f_hat.T, (self.Ki, self.f_hat))
+                         )
 
         return self._compute_GP_variables()
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index c4823703..a299fe3a 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -81,11 +81,7 @@ class student_t(likelihood_function):
         Compute  mean, and conficence interval (percentiles 5 and 95) of the  prediction
         """
         mean = np.exp(mu)
-        p_025 = stats.t.ppf(025,mean)
-        p_975 = stats.t.ppf(975,mean)
-
-        #p_025 = tmp[:,0]
-        #p_975 = tmp[:,1]
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-        return mean,p_025,p_975
+        p_025 = stats.t.ppf(.025, mean)
+        p_975 = stats.t.ppf(.975, mean)
 
+        return mean, np.nan*mean, p_025, p_975

From 7b0d0550cb01f0c4eca567e80f950e7f54ecb7b2 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 22 Mar 2013 12:50:47 +0000
Subject: [PATCH 011/252] Seemed to be working, now its not

---
 python/examples/laplace_approximations.py | 118 +++++++++++++---------
 python/likelihoods/Laplace.py             |  37 +++----
 2 files changed, 92 insertions(+), 63 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 6f2b19aa..5fb39e08 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -11,15 +11,22 @@ def student_t_approx():
     Example of regressing with a student t likelihood
     """
     #Start a function, any function
-    X = np.sort(np.random.uniform(0, 15, 100))[:, None]
-    Y = np.sin(X)
+    X = np.linspace(0.0, 10.0, 100)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*0.1
+    Yc = Y.copy()
+
+    Y = Y/Y.max()
+
+    Yc[10] += 5
+    Yc[15] += 20
+    Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 100000.5
-    real_var = 4
-    t_rv = t(deg_free, loc=0, scale=real_var)
-    noise = t_rv.rvs(size=Y.shape)
-    Y += noise
+    deg_free = 1000000 #100000.5
+    real_var = 0.1
+    #t_rv = t(deg_free, loc=0, scale=real_var)
+    #noise = t_rvrvs(size=Y.shape)
+    #Y += noise
 
     #Add some extreme value noise to some of the datapoints
     #percent_corrupted = 0.15
@@ -30,64 +37,83 @@ def student_t_approx():
     #print corrupted_indices
     #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
     #Y[corrupted_indices] += noise
-
+    plt.figure(1)
     # Kernel object
-    print X.shape
-    kernel = GPy.kern.rbf(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()
 
-    #A GP should completely break down due to the points as they get a lot of weight
-    # create simple GP model
-    #m = GPy.models.GP_regression(X, Y, kernel=kernel)
-
-    ## optimize
+    #print "Clean Gaussian"
+    ##A GP should completely break down due to the points as they get a lot of weight
+    ## create simple GP model
+    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    ### optimize
     #m.ensure_default_constraints()
+    ##m.unconstrain('noise')
+    ##m.constrain_fixed('noise', 0.1)
     #m.optimize()
     ## plot
-    ##m.plot()
+    #plt.subplot(221)
+    #m.plot()
     #print m
 
-    #with a student t distribution, since it has heavy tails it should work well
-    likelihood_function = student_t(deg_free, sigma=real_var)
-    lap = Laplace(Y, likelihood_function)
-    cov = kernel.K(X)
-    lap.fit_full(cov)
+    ##Corrupt
+    #print "Corrupt Gaussian"
+    #m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    #m.ensure_default_constraints()
+    ##m.unconstrain('noise')
+    ##m.constrain_fixed('noise', 0.1)
+    #m.optimize()
+    #plt.subplot(222)
+    #m.plot()
+    #print m
 
-    test_range = np.arange(0, 10, 0.1)
-    plt.plot(test_range, t_rv.pdf(test_range))
-    for i in xrange(X.shape[0]):
-        mode = lap.f_hat[i]
-        covariance = lap.hess_hat_i[i,i]
-        scaling = np.exp(lap.ln_z_hat)
-        normalised_approx = norm(loc=mode, scale=covariance)
-        print "Normal with mode %f, and variance %f" % (mode, covariance)
-        plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
-    plt.show()
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    ##with a student t distribution, since it has heavy tails it should work well
+    ##likelihood_function = student_t(deg_free, sigma=real_var)
+    ##lap = Laplace(Y, likelihood_function)
+    ##cov = kernel.K(X)
+    ##lap.fit_full(cov)
+
+    ##test_range = np.arange(0, 10, 0.1)
+    ##plt.plot(test_range, t_rv.pdf(test_range))
+    ##for i in xrange(X.shape[0]):
+        ##mode = lap.f_hat[i]
+        ##covariance = lap.hess_hat_i[i,i]
+        ##scaling = np.exp(lap.ln_z_hat)
+        ##normalised_approx = norm(loc=mode, scale=covariance)
+        ##print "Normal with mode %f, and variance %f" % (mode, covariance)
+        ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    ##plt.show()
 
     # Likelihood object
-    t_distribution = student_t(deg_free, sigma=real_var)
+    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
     stu_t_likelihood = Laplace(Y, t_distribution)
-    kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.bias(X.shape[1])
 
-    m = GPy.models.GP(X, stu_t_likelihood, kernel)
+    print "Clean student t"
+    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     m.ensure_default_constraints()
-
     m.update_likelihood_approximation()
-    print "NEW MODEL"
-    print(m)
-
     # optimize
-    #m.optimize()
-    #print(m)
-
-    # plot
-    m.plot()
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-
     m.optimize()
     print(m)
+    # plot
+    plt.subplot(211)
+    m.plot_f()
+
+    print "Corrupt student t"
+    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
+    corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(212)
+    m.plot_f()
 
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
     return m
 
 
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index b002034d..d86523d8 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -33,13 +33,15 @@ class Laplace(likelihood):
 
         #Inital values
         self.N, self.D = self.data.shape
+        self.is_heteroscedastic = True
+        self.Nparams = 0
 
         self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))
 
         #Initial values for the GP variables
-        self.Y = np.zeros((self.N,1))
+        self.Y = np.zeros((self.N, 1))
         self.covariance_matrix = np.eye(self.N)
-        self.precision = np.ones(self.N)[:,None]
+        self.precision = np.ones(self.N)[:, None]
         self.Z = 0
         self.YYT = None
 
@@ -58,6 +60,7 @@ class Laplace(likelihood):
         pass # TODO: Laplace likelihood might want to take some parameters...
 
     def _gradients(self, partial):
+        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
 
@@ -88,10 +91,8 @@ class Laplace(likelihood):
         self.Sigma_tilde_i = self.W #self.hess_hat_i
         #Check it isn't singular!
         epsilon = 1e-2
-        """
         if np.abs(det(self.Sigma_tilde_i)) < epsilon:
             raise ValueError("inverse covariance must be non-singular to inverse!")
-        """
         #Do we really need to inverse Sigma_tilde_i? :(
         if self.likelihood_function.log_concave:
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
@@ -99,21 +100,17 @@ class Laplace(likelihood):
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
         #f_hat? should be f but we must have optimized for them I guess?
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
-        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                        #- 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
-                        #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                   #)
         Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                   + 0.5*self.log_hess_hat_det
-                   + 0.5*mdot(self.f_hat, self.Ki , self.f_hat)
-                   + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                    + 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                    + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                    - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
                    )
 
         self.Z = Z_tilde
-        self.Y = Y_tilde
+        self.Y = Y_tilde[:, None]
+        self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
         self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
-        self.YYT = np.dot(self.Y, self.Y.T)
         import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     def fit_full(self, K):
@@ -122,6 +119,7 @@ class Laplace(likelihood):
         For nomenclature see Rasmussen & Williams 2006
         :K: Covariance matrix
         """
+        self.K = K.copy()
         f = np.zeros((self.N, 1))
         (self.Ki, _, _, self.log_Kdet) = pdinv(K)
         LOG_K_CONST = -(0.5 * self.log_Kdet)
@@ -148,6 +146,11 @@ class Laplace(likelihood):
 
         #At this point get the hessian matrix
         self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat))
+        if not self.likelihood_function.log_concave:
+            self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                   #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                   #To cause the posterior to become less certain than the prior and likelihood,
+                                   #This is a property only held by non-log-concave likelihoods
         self.hess_hat = self.Ki + self.W
         (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat)
 
@@ -166,10 +169,10 @@ class Laplace(likelihood):
         #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
         #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
         #Unsure whether its log_hess or log_hess_i
-        self.ln_z_hat = (-0.5*self.log_hess_hat_det
-                         - 0.5*self.log_Kdet
-                         -1*self.likelihood_function.link_function(self.data[:,0], self.f_hat)
-                         - mdot(self.f_hat.T, (self.Ki, self.f_hat))
+        self.ln_z_hat = (- 0.5*self.log_hess_hat_det
+                         + 0.5*self.log_Kdet
+                         + self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+                         - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
                          )
 
         return self._compute_GP_variables()

From 15d5c2f22dff65a518a4f6a155e457a6516fca17 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 28 Mar 2013 17:42:42 +0000
Subject: [PATCH 012/252] Working laplace, just needs predictive values

---
 python/examples/laplace_approximations.py | 80 +++++++++++++----------
 python/likelihoods/Laplace.py             | 15 +++--
 python/likelihoods/likelihood_function.py | 72 ++++++++++++++++++--
 3 files changed, 121 insertions(+), 46 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 5fb39e08..37681849 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -10,20 +10,23 @@ def student_t_approx():
     """
     Example of regressing with a student t likelihood
     """
+    real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 100)[:, None]
-    Y = np.sin(X) + np.random.randn(*X.shape)*0.1
+    X = np.linspace(0.0, 10.0, 30)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
     Yc = Y.copy()
 
-    Y = Y/Y.max()
+    #Y = Y/Y.max()
 
-    Yc[10] += 5
-    Yc[15] += 20
-    Yc = Yc/Yc.max()
+    #Yc[10] += 100
+    Yc[25] += 10
+    Yc[23] += 10
+    Yc[24] += 10
+    #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 1000000 #100000.5
-    real_var = 0.1
+    deg_free = 20 #100000.5
+    real_sd = np.sqrt(real_var)
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
@@ -38,36 +41,37 @@ def student_t_approx():
     #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
     #Y[corrupted_indices] += noise
     plt.figure(1)
+    plt.suptitle('Gaussian likelihood')
     # Kernel object
     kernel1 = GPy.kern.rbf(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
 
-    #print "Clean Gaussian"
-    ##A GP should completely break down due to the points as they get a lot of weight
-    ## create simple GP model
-    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
-    ### optimize
-    #m.ensure_default_constraints()
-    ##m.unconstrain('noise')
-    ##m.constrain_fixed('noise', 0.1)
-    #m.optimize()
-    ## plot
-    #plt.subplot(221)
-    #m.plot()
-    #print m
+    print "Clean Gaussian"
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    ## optimize
+    m.ensure_default_constraints()
+    #m.unconstrain('noise')
+    #m.constrain_fixed('noise', 0.1)
+    m.optimize()
+    # plot
+    plt.subplot(211)
+    m.plot()
+    print m
 
     ##Corrupt
-    #print "Corrupt Gaussian"
-    #m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
-    #m.ensure_default_constraints()
-    ##m.unconstrain('noise')
-    ##m.constrain_fixed('noise', 0.1)
-    #m.optimize()
-    #plt.subplot(222)
-    #m.plot()
-    #print m
+    print "Corrupt Gaussian"
+    m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    m.ensure_default_constraints()
+    #m.unconstrain('noise')
+    #m.constrain_fixed('noise', 0.1)
+    m.optimize()
+    plt.subplot(212)
+    m.plot()
+    print m
 
     ##with a student t distribution, since it has heavy tails it should work well
     ##likelihood_function = student_t(deg_free, sigma=real_var)
@@ -86,9 +90,13 @@ def student_t_approx():
         ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
     ##plt.show()
 
+    plt.figure(2)
+    plt.suptitle('Student-t likelihood')
+    edited_real_sd = real_sd
+
     # Likelihood object
-    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
-    stu_t_likelihood = Laplace(Y, t_distribution)
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Yc, t_distribution)
 
     print "Clean student t"
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
@@ -100,9 +108,11 @@ def student_t_approx():
     # plot
     plt.subplot(211)
     m.plot_f()
+    plt.ylim(-2.5,2.5)
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     print "Corrupt student t"
-    t_distribution = student_t(deg_free, sigma=np.sqrt(real_var))
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
     corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
@@ -110,8 +120,8 @@ def student_t_approx():
     m.optimize()
     print(m)
     plt.subplot(212)
-    m.plot_f()
-
+    m.plot()
+    plt.ylim(-2.5,2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     return m
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index d86523d8..1411c22b 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -88,11 +88,12 @@ class Laplace(likelihood):
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
 
         """
-        self.Sigma_tilde_i = self.W #self.hess_hat_i
+        self.Sigma_tilde_i = self.W
         #Check it isn't singular!
-        epsilon = 1e-2
+        epsilon = 1e-6
         if np.abs(det(self.Sigma_tilde_i)) < epsilon:
-            raise ValueError("inverse covariance must be non-singular to inverse!")
+            print "WARNING: Transformed covariance matrix is signular!"
+            #raise ValueError("inverse covariance must be non-singular to invert!")
         #Do we really need to inverse Sigma_tilde_i? :(
         if self.likelihood_function.log_concave:
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
@@ -110,8 +111,12 @@ class Laplace(likelihood):
         self.Y = Y_tilde[:, None]
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1 / np.diag(self.Sigma_tilde)[:, None]
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        #if not self.likelihood_function.log_concave:
+            #self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                   ##If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                   ##To cause the posterior to become less certain than the prior and likelihood,
+                                   ##This is a property only held by non-log-concave likelihoods
+        self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
 
     def fit_full(self, K):
         """
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index a299fe3a..7ac9c661 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -1,4 +1,5 @@
-from scipy.special import gammaln
+from scipy.special import gammaln, gamma
+from scipy import integrate
 import numpy as np
 from GPy.likelihoods.likelihood_functions import likelihood_function
 from scipy import stats
@@ -79,9 +80,68 @@ class student_t(likelihood_function):
     def predictive_values(self, mu, var):
         """
         Compute  mean, and conficence interval (percentiles 5 and 95) of the  prediction
-        """
-        mean = np.exp(mu)
-        p_025 = stats.t.ppf(.025, mean)
-        p_975 = stats.t.ppf(.975, mean)
 
-        return mean, np.nan*mean, p_025, p_975
+        Need to find what the variance is at the latent points for a student t*normal
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+
+(((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        """
+        #p_025 = stats.t.ppf(.025, mu)
+        #p_975 = stats.t.ppf(.975, mu)
+
+        num_test_points = mu.shape[0]
+        #Each mu is the latent point f* at the test point x*,
+        #and the var is the gaussian variance at this point
+        #Take lots of samples from this, so we have lots of possible values
+        #for latent point f* for each test point x* weighted by how likely we were to pick it
+        print "Taking %d samples of f*".format(num_test_points)
+        num_f_samples = 10
+        num_y_samples = 10
+        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
+        print "Student t means shape: ", student_t_means.shape
+
+        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
+        #from a student t centred on this point, sample many points from this distribution
+        #centred on f*
+        #for test_point, f in enumerate(student_t_means):
+            #print test_point
+            #print f.shape
+            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
+                                            #scale=self.sigma,
+                                            #size=(num_f_samples, num_y_samples))
+            #print student_t_samples.shape
+
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:,None],
+                                        scale=self.sigma,
+                                        size=(num_test_points, num_y_samples, num_f_samples))
+        student_t_samples = np.reshape(student_t_samples,
+                                       (num_test_points, num_y_samples*num_f_samples))
+
+        #Now take the 97.5 and 0.25 percentile of these points
+        p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None]
+        p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None]
+
+        p_025 = 1+p_025
+        p_975 = 1+p_975
+
+        ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
+        def t_gaussian(f, mu, var):
+            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
+                        * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
+                    )
+
+        def t_gauss_int(mu, var):
+            print "Mu: ", mu
+            print "var: ", var
+            result = integrate.quad(t_gaussian, -np.inf, 0.975, args=(mu, var))
+            print "Result: ", result
+            return result[0]
+
+        vec_t_gauss_int = np.vectorize(t_gauss_int)
+
+        p_025 = vec_t_gauss_int(mu, var)
+        p_975 = vec_t_gauss_int(mu, var)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+        return mu, np.nan*mu, p_025, p_975

From ffc168c1d20f36b1e72501176c4a7bb88ff41614 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 2 Apr 2013 12:33:01 +0100
Subject: [PATCH 013/252] Added predicted values for student t, works well

---
 python/examples/laplace_approximations.py | 48 +++++++++++------------
 python/likelihoods/likelihood_function.py | 41 ++++++++++++++-----
 2 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 37681849..6374a5fd 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -18,7 +18,7 @@ def student_t_approx():
 
     #Y = Y/Y.max()
 
-    #Yc[10] += 100
+    Yc[10] += 100
     Yc[25] += 10
     Yc[23] += 10
     Yc[24] += 10
@@ -52,51 +52,30 @@ def student_t_approx():
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
     m = GPy.models.GP_regression(X, Y, kernel=kernel1)
-    ## optimize
+    # optimize
     m.ensure_default_constraints()
-    #m.unconstrain('noise')
-    #m.constrain_fixed('noise', 0.1)
     m.optimize()
     # plot
     plt.subplot(211)
     m.plot()
     print m
 
-    ##Corrupt
+    #Corrupt
     print "Corrupt Gaussian"
     m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
     m.ensure_default_constraints()
-    #m.unconstrain('noise')
-    #m.constrain_fixed('noise', 0.1)
     m.optimize()
     plt.subplot(212)
     m.plot()
     print m
 
-    ##with a student t distribution, since it has heavy tails it should work well
-    ##likelihood_function = student_t(deg_free, sigma=real_var)
-    ##lap = Laplace(Y, likelihood_function)
-    ##cov = kernel.K(X)
-    ##lap.fit_full(cov)
-
-    ##test_range = np.arange(0, 10, 0.1)
-    ##plt.plot(test_range, t_rv.pdf(test_range))
-    ##for i in xrange(X.shape[0]):
-        ##mode = lap.f_hat[i]
-        ##covariance = lap.hess_hat_i[i,i]
-        ##scaling = np.exp(lap.ln_z_hat)
-        ##normalised_approx = norm(loc=mode, scale=covariance)
-        ##print "Normal with mode %f, and variance %f" % (mode, covariance)
-        ##plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
-    ##plt.show()
-
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd
 
     # Likelihood object
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Yc, t_distribution)
+    stu_t_likelihood = Laplace(Y, t_distribution)
 
     print "Clean student t"
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
@@ -107,7 +86,7 @@ def student_t_approx():
     print(m)
     # plot
     plt.subplot(211)
-    m.plot_f()
+    m.plot()
     plt.ylim(-2.5,2.5)
     #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
@@ -124,6 +103,23 @@ def student_t_approx():
     plt.ylim(-2.5,2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
+    ###with a student t distribution, since it has heavy tails it should work well
+    ###likelihood_function = student_t(deg_free, sigma=real_var)
+    ###lap = Laplace(Y, likelihood_function)
+    ###cov = kernel.K(X)
+    ###lap.fit_full(cov)
+
+    ###test_range = np.arange(0, 10, 0.1)
+    ###plt.plot(test_range, t_rv.pdf(test_range))
+    ###for i in xrange(X.shape[0]):
+        ###mode = lap.f_hat[i]
+        ###covariance = lap.hess_hat_i[i,i]
+        ###scaling = np.exp(lap.ln_z_hat)
+        ###normalised_approx = norm(loc=mode, scale=covariance)
+        ###print "Normal with mode %f, and variance %f" % (mode, covariance)
+        ###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    ###plt.show()
+
     return m
 
 
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 7ac9c661..61b5c427 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -23,6 +23,10 @@ class student_t(likelihood_function):
         #FIXME: This should be in the superclass
         self.log_concave = False
 
+    @property
+    def variance(self):
+        return (self.v / float(self.v - 2)) * (self.sigma**2)
+
     def link_function(self, y, f):
         """link_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
@@ -79,14 +83,32 @@ class student_t(likelihood_function):
 
     def predictive_values(self, mu, var):
         """
-        Compute  mean, and conficence interval (percentiles 5 and 95) of the  prediction
+        Compute  mean, and conficence interval (percentiles 5 and 95) of the prediction
 
-        Need to find what the variance is at the latent points for a student t*normal
-        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
 
-(((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
-*((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
         """
+
+        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
+        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
+        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
+        #Which was also given to us as (var)
+        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
+        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
+        true_var = var + self.variance
+
+        #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now
+        #need the 95 and 5 percentiles.
+        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
+        p_025 = mu - 2.*true_var
+        p_975 = mu + 2.*true_var
+
+        return mu, np.nan*mu, p_025, p_975
+
+    def sample_predicted_values(self, mu, var):
+        """ Experimental sample approches and numerical integration """
         #p_025 = stats.t.ppf(.025, mu)
         #p_975 = stats.t.ppf(.975, mu)
 
@@ -134,14 +156,13 @@ class student_t(likelihood_function):
         def t_gauss_int(mu, var):
             print "Mu: ", mu
             print "var: ", var
-            result = integrate.quad(t_gaussian, -np.inf, 0.975, args=(mu, var))
+            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
             print "Result: ", result
             return result[0]
 
         vec_t_gauss_int = np.vectorize(t_gauss_int)
 
-        p_025 = vec_t_gauss_int(mu, var)
-        p_975 = vec_t_gauss_int(mu, var)
+        p = vec_t_gauss_int(mu, var)
+        p_025 = mu - p
+        p_975 = mu + p
         import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-
-        return mu, np.nan*mu, p_025, p_975

From afa5b1f9561189b3774a895b765d708186c10f5c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 2 Apr 2013 12:39:57 +0100
Subject: [PATCH 014/252] Tidying up

---
 python/likelihoods/likelihood_function.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 61b5c427..50f9b620 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -88,7 +88,6 @@ class student_t(likelihood_function):
         Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
         (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
         *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
-
         """
 
         #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
@@ -144,9 +143,6 @@ class student_t(likelihood_function):
         p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None]
         p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None]
 
-        p_025 = 1+p_025
-        p_975 = 1+p_975
-
         ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
         def t_gaussian(f, mu, var):
             return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))

From 0312f319ad4eef37f0c173120d80cc373d149519 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 2 Apr 2013 20:00:31 +0100
Subject: [PATCH 015/252] Still working on rasmussen, link function needs
 vectorizing I think

---
 python/examples/laplace_approximations.py |  58 ++++++---
 python/likelihoods/Laplace.py             | 137 ++++++++++++++++------
 python/likelihoods/likelihood_function.py |  13 +-
 3 files changed, 154 insertions(+), 54 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 6374a5fd..a1c71c71 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -16,6 +16,9 @@ def student_t_approx():
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
     Yc = Y.copy()
 
+    X_full = np.linspace(0.0, 10.0, 500)[:, None]
+    Y_full = np.sin(X_full)
+
     #Y = Y/Y.max()
 
     Yc[10] += 100
@@ -25,7 +28,7 @@ def student_t_approx():
     #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 20 #100000.5
+    deg_free = 10
     real_sd = np.sqrt(real_var)
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
@@ -47,6 +50,8 @@ def student_t_approx():
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
+    kernel5 = kernel1.copy()
+    kernel6 = kernel1.copy()
 
     print "Clean Gaussian"
     #A GP should completely break down due to the points as they get a lot of weight
@@ -58,6 +63,7 @@ def student_t_approx():
     # plot
     plt.subplot(211)
     m.plot()
+    plt.plot(X_full, Y_full)
     print m
 
     #Corrupt
@@ -67,40 +73,64 @@ def student_t_approx():
     m.optimize()
     plt.subplot(212)
     m.plot()
+    plt.plot(X_full, Y_full)
     print m
 
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd
 
-    # Likelihood object
+    print "Clean student t, ncg"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Y, t_distribution)
-
-    print "Clean student t"
+    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
-    # optimize
     m.optimize()
     print(m)
-    # plot
-    plt.subplot(211)
+    plt.subplot(221)
     m.plot()
-    plt.ylim(-2.5,2.5)
-    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
 
-    print "Corrupt student t"
+    print "Corrupt student t, ncg"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = Laplace(Yc, t_distribution)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(223)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Clean student t, rasm"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(222)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Corrupt student t, rasm"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
-    plt.subplot(212)
+    plt.subplot(224)
     m.plot()
-    plt.ylim(-2.5,2.5)
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     ###with a student t distribution, since it has heavy tails it should work well
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 1411c22b..8eb69869 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,16 +1,15 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, det
-from functools import partial
+from scipy.linalg import cholesky, eig, inv, det, cho_solve
 from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv,mdot
+from GPy.util.linalg import pdinv, mdot, jitchol
 #import numpy.testing.assert_array_equal
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
-    def __init__(self, data, likelihood_function):
+    def __init__(self, data, likelihood_function, rasm=True):
         """
         Laplace Approximation
 
@@ -30,6 +29,7 @@ class Laplace(likelihood):
         """
         self.data = data
         self.likelihood_function = likelihood_function
+        self.rasm = rasm
 
         #Inital values
         self.N, self.D = self.data.shape
@@ -102,20 +102,16 @@ class Laplace(likelihood):
         #f_hat? should be f but we must have optimized for them I guess?
         Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
         Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                    + 0.5*mdot(self.f_hat, self.hess_hat, self.f_hat)
+                    + 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
                     + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
                     - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
                    )
 
-        self.Z = Z_tilde
-        self.Y = Y_tilde[:, None]
+        #Convert to float as its (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
-        #if not self.likelihood_function.log_concave:
-            #self.covariance_matrix[self.covariance_matrix < 0] = 1e+6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-                                   ##If the likelihood is non-log-concave. We wan't to say that there is a negative variance
-                                   ##To cause the posterior to become less certain than the prior and likelihood,
-                                   ##This is a property only held by non-log-concave likelihoods
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
 
     def fit_full(self, K):
@@ -125,32 +121,15 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
-        f = np.zeros((self.N, 1))
-        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
-        LOG_K_CONST = -(0.5 * self.log_Kdet)
-        OBJ_CONST = self.NORMAL_CONST + LOG_K_CONST
-        #Find \hat(f) using a newton raphson optimizer for example
-        #TODO: Add newton-raphson as subclass of optimizer class
-
-        #FIXME: Can we get rid of this horrible reshaping?
-        def obj(f):
-            #f = f[:, None]
-            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f)) + OBJ_CONST)
-            return float(res)
-
-        def obj_grad(f):
-            #f = f[:, None]
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
-            return np.squeeze(res)
-
-        def obj_hess(f):
-            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
-            return np.squeeze(res)
-
-        self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        self.Ki, _, _, self.log_Kdet = pdinv(K)
+        if self.rasm:
+            self.f_hat = self.rasm_mode(K)
+        else:
+            self.f_hat = self.ncg_mode(K)
 
         #At this point get the hessian matrix
-        self.W = -np.diag(self.likelihood_function.link_hess(self.data[:, 0], self.f_hat))
+        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
+
         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                    #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@@ -176,8 +155,92 @@ class Laplace(likelihood):
         #Unsure whether its log_hess or log_hess_i
         self.ln_z_hat = (- 0.5*self.log_hess_hat_det
                          + 0.5*self.log_Kdet
-                         + self.likelihood_function.link_function(self.data[:,0], self.f_hat)
+                         + self.likelihood_function.link_function(self.data, self.f_hat)
+                         #+ self.likelihood_function.link_function(self.data, self.f_hat)
                          - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
                          )
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return self._compute_GP_variables()
+
+    def ncg_mode(self, K):
+        """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
+        :K: Covariance matrix
+        :returns: f_mode
+        """
+        self.K = K.copy()
+        f = np.zeros((self.N, 1))
+        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
+        LOG_K_CONST = -(0.5 * self.log_Kdet)
+
+        #FIXME: Can we get rid of this horrible reshaping?
+        def obj(f):
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
+                        + self.NORMAL_CONST + LOG_K_CONST)
+            return float(res)
+
+        def obj_grad(f):
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
+            return np.squeeze(res)
+
+        def obj_hess(f):
+            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
+            return np.squeeze(res)
+
+        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        return f_hat[:, None]
+
+    def rasm_mode(self, K):
+        """
+        Rasmussens numerically stable mode finding
+        For nomenclature see Rasmussen & Williams 2006
+
+        :K: Covariance matrix
+        :returns: f_mode
+        """
+        f = np.zeros((self.N, 1))
+        new_obj = -np.inf
+        old_obj = np.inf
+
+        def obj(a, f):
+            #Careful of shape of data!
+            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)
+
+        difference = np.inf
+        epsilon = 1e-16
+        step_size = 1
+        while difference > epsilon:
+            W = -np.diag(self.likelihood_function.link_hess(self.data, f))
+            if not self.likelihood_function.log_concave:
+                #if np.any(W < 0):
+                    #print "NEGATIVE VALUES :("
+                    #pass
+                W[W < 0] = 1e-6     #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                    #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                    #To cause the posterior to become less certain than the prior and likelihood,
+                                    #This is a property only held by non-log-concave likelihoods
+            #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
+            W_12 = np.sqrt(W)
+            B = np.eye(self.N) + mdot(W_12, K, W_12)
+            L = jitchol(B)
+            b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f))
+            #TODO: Check L is lower
+            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
+            a = b - mdot(W_12, solve_L)
+            f = np.dot(K, a)
+            old_obj = new_obj
+            new_obj = obj(a, f)
+            difference = new_obj - old_obj
+            #print "Difference: ", new_obj - old_obj
+            if difference < 0:
+                #If the objective function isn't rising, restart optimization
+                print "Reducing step-size, restarting"
+                #objective function isn't increasing, try reducing step size
+                step_size *= 0.9
+                f = np.zeros((self.N, 1))
+                new_obj = -np.inf
+                old_obj = np.inf
+
+            difference = abs(difference)
+
+        return f
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 50f9b620..15859a81 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -36,7 +36,10 @@ class student_t(likelihood_function):
         :returns: float(likelihood evaluated for this point)
 
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
+
         e = y - f
         objective = (gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
@@ -44,6 +47,7 @@ class student_t(likelihood_function):
                      - (self.v + 1) * 0.5
                      * np.log(1 + ((e**2 / self.sigma**2) / self.v))
                      )
+        print (e**2).shape
         return np.sum(objective)
 
     def link_grad(self, y, f):
@@ -57,10 +61,12 @@ class student_t(likelihood_function):
         :returns: gradient of likelihood evaluated at points
 
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
-        return grad
+        return np.squeeze(grad)
 
     def link_hess(self, y, f):
         """
@@ -75,11 +81,12 @@ class student_t(likelihood_function):
         :f: latent variables f
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        #hess = ((self.v + 1) * e) / ((((self.sigma**2) * self.v) + e**2)**2)
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
-        return hess
+        return np.squeeze(hess)
 
     def predictive_values(self, mu, var):
         """

From 2006a94caa859d195a7c2af1236eb84656b68cfc Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 3 Apr 2013 10:55:58 +0100
Subject: [PATCH 016/252] Fixed broadcasting bug, rasm now appears to work

---
 python/likelihoods/Laplace.py             | 16 ++++++++++------
 python/likelihoods/likelihood_function.py |  1 -
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 8eb69869..e967a743 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -159,7 +159,6 @@ class Laplace(likelihood):
                          #+ self.likelihood_function.link_function(self.data, self.f_hat)
                          - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
                          )
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return self._compute_GP_variables()
 
@@ -190,7 +189,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
         return f_hat[:, None]
 
-    def rasm_mode(self, K):
+    def rasm_mode(self, K, MAX_ITER=5000, MAX_RESTART=30):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -209,7 +208,9 @@ class Laplace(likelihood):
         difference = np.inf
         epsilon = 1e-16
         step_size = 1
-        while difference > epsilon:
+        rs = 0
+        i = 0
+        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
                 #if np.any(W < 0):
@@ -223,7 +224,7 @@ class Laplace(likelihood):
             W_12 = np.sqrt(W)
             B = np.eye(self.N) + mdot(W_12, K, W_12)
             L = jitchol(B)
-            b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f))
+            b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)[:, None])
             #TODO: Check L is lower
             solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
             a = b - mdot(W_12, solve_L)
@@ -234,13 +235,16 @@ class Laplace(likelihood):
             #print "Difference: ", new_obj - old_obj
             if difference < 0:
                 #If the objective function isn't rising, restart optimization
-                print "Reducing step-size, restarting"
-                #objective function isn't increasing, try reducing step size
                 step_size *= 0.9
+                print "Objective function rose"
+                print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                #objective function isn't increasing, try reducing step size
                 f = np.zeros((self.N, 1))
                 new_obj = -np.inf
                 old_obj = np.inf
+                rs += 1
 
             difference = abs(difference)
+            i += 1
 
         return f
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 15859a81..49174ce7 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -47,7 +47,6 @@ class student_t(likelihood_function):
                      - (self.v + 1) * 0.5
                      * np.log(1 + ((e**2 / self.sigma**2) / self.v))
                      )
-        print (e**2).shape
         return np.sum(objective)
 
     def link_grad(self, y, f):

From 4a14a82dfba4bd3c48d4175bb8a861bab24a0d10 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 5 Apr 2013 17:34:11 +0100
Subject: [PATCH 017/252] Got the mode finding without computing Ki

---
 python/examples/laplace_approximations.py |  85 +++++++++-----
 python/likelihoods/Laplace.py             | 130 ++++++++++++++++------
 2 files changed, 152 insertions(+), 63 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index a1c71c71..7ab26406 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -6,6 +6,38 @@ from coxGP.python.likelihoods.Laplace import Laplace
 from coxGP.python.likelihoods.likelihood_function import student_t
 
 
+def timing():
+    real_var = 0.1
+    times = 1000
+    deg_free = 10
+    real_sd = np.sqrt(real_var)
+    the_is = np.zeros(times)
+    X = np.linspace(0.0, 10.0, 30)[:, None]
+    for a in xrange(times):
+        Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+        Yc = Y.copy()
+
+        Yc[10] += 100
+        Yc[25] += 10
+        Yc[23] += 10
+        Yc[24] += 10
+
+        edited_real_sd = real_sd
+        kernel1 = GPy.kern.rbf(X.shape[1])
+
+        t_distribution = student_t(deg_free, sigma=edited_real_sd)
+        corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
+        m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
+        m.ensure_default_constraints()
+        m.update_likelihood_approximation()
+        m.optimize()
+        the_is[a] = m.likelihood.i
+
+    print the_is
+    print np.mean(the_is)
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+
 def student_t_approx():
     """
     Example of regressing with a student t likelihood
@@ -80,32 +112,6 @@ def student_t_approx():
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd
 
-    print "Clean student t, ncg"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
-    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    plt.subplot(221)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
-
-    print "Corrupt student t, ncg"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
-    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    plt.subplot(223)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
-
     print "Clean student t, rasm"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
@@ -133,6 +139,33 @@ def student_t_approx():
     plt.ylim(-2.5, 2.5)
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
+    print "Clean student t, ncg"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(221)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Corrupt student t, ncg"
+    t_distribution = student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(223)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+
     ###with a student t distribution, since it has heavy tails it should work well
     ###likelihood_function = student_t(deg_free, sigma=real_var)
     ###lap = Laplace(Y, likelihood_function)
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index e967a743..396a0bc7 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -100,12 +100,19 @@ class Laplace(likelihood):
         else:
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
         #f_hat? should be f but we must have optimized for them I guess?
-        Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat)
-        Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                    + 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
-                    + 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                    - mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
-                   )
+        #Y_tilde = mdot(self.Sigma_tilde, self.hess_hat_i, self.f_hat)
+        Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
+        #KW = np.dot(self.K, self.W)
+        #KW_i, _, _, _ = pdinv(KW)
+        #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat)
+        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
+                    #+ 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
+                    #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
+                    #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
+                   #)
+        _, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
+        f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
+        Z_tilde = -self.NORMAL_CONST + self.ln_z_hat -0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -121,7 +128,7 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
-        self.Ki, _, _, self.log_Kdet = pdinv(K)
+        self.Ki, _, _, log_Kdet = pdinv(K)
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
@@ -135,33 +142,64 @@ class Laplace(likelihood):
                                    #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                    #To cause the posterior to become less certain than the prior and likelihood,
                                    #This is a property only held by non-log-concave likelihoods
-        self.hess_hat = self.Ki + self.W
-        (self.hess_hat_i, _, _, self.log_hess_hat_det) = pdinv(self.hess_hat)
+        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
+        self.B, L, self.W_12 = self._compute_B_statistics(K, self.W)
+        self.Bi, _, _, B_det = pdinv(self.B)
+        #ln_W_det = np.linalg.det(self.W)
+        #ln_B_det = np.linalg.det(self.B)
+        ln_det = np.linalg.det(np.eye(self.N) - mdot(self.W_12, self.Bi, self.W_12, K))
+        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
+        #TODO: Check L is lower
+        solve_L = cho_solve((L, True), mdot(self.W_12, (K, b)))
+        a = b - mdot(self.W_12, solve_L)
+        self.f_Ki_f = np.dot(self.f_hat.T, a)
 
-        #Check hess_hat is positive definite
-        try:
-            cholesky(self.hess_hat)
-        except:
-            raise ValueError("Must be positive definite")
+        #self.hess_hat = self.Ki + self.W
+        #(self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat)
 
-        #Check its eigenvalues are positive
-        eigenvalues = eig(self.hess_hat)
-        if not np.all(eigenvalues > 0):
-            raise ValueError("Eigen values not positive")
+        ##Check hess_hat is positive definite
+        #try:
+            #cholesky(self.hess_hat)
+        #except:
+            #raise ValueError("Must be positive definite")
+
+        ##Check its eigenvalues are positive
+        #eigenvalues = eig(self.hess_hat)
+        #if not np.all(eigenvalues > 0):
+            #raise ValueError("Eigen values not positive")
 
         #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
         #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
         #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
         #Unsure whether its log_hess or log_hess_i
-        self.ln_z_hat = (- 0.5*self.log_hess_hat_det
-                         + 0.5*self.log_Kdet
-                         + self.likelihood_function.link_function(self.data, self.f_hat)
+        #self.ln_z_hat = (- 0.5*self.log_hess_hat_i_det
+                         #+ 0.5*self.log_Kdet
                          #+ self.likelihood_function.link_function(self.data, self.f_hat)
-                         - 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
+                         ##+ self.likelihood_function.link_function(self.data, self.f_hat)
+                         #- 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
+                         #)
+        self.ln_z_hat = (- 0.5*log_Kdet
+                         - 0.5*self.f_Ki_f
+                         + self.likelihood_function.link_function(self.data, self.f_hat)
+                         + 0.5*ln_det
                          )
 
         return self._compute_GP_variables()
 
+    def _compute_B_statistics(self, K, W):
+        """Rasmussen suggests the use of a numerically stable positive definite matrix B
+        Which has a positive diagonal element and can be easyily inverted
+
+        :K: Covariance matrix
+        :W: Negative hessian at a point (diagonal matrix)
+        :returns: (B, L)
+        """
+        #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
+        W_12 = np.sqrt(W)
+        B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
+        L = jitchol(B)
+        return (B, L, W_12)
+
     def ncg_mode(self, K):
         """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
         :K: Covariance matrix
@@ -189,7 +227,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=5000, MAX_RESTART=30):
+    def rasm_mode(self, K, MAX_ITER=5000000000000000, MAX_RESTART=30):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -206,11 +244,12 @@ class Laplace(likelihood):
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)
 
         difference = np.inf
-        epsilon = 1e-16
+        epsilon = 1e-6
         step_size = 1
         rs = 0
         i = 0
-        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
+        while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART:
+            f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
                 #if np.any(W < 0):
@@ -220,31 +259,48 @@ class Laplace(likelihood):
                                     #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     #To cause the posterior to become less certain than the prior and likelihood,
                                     #This is a property only held by non-log-concave likelihoods
-            #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
-            W_12 = np.sqrt(W)
-            B = np.eye(self.N) + mdot(W_12, K, W_12)
-            L = jitchol(B)
-            b = (np.dot(W, f) + step_size * self.likelihood_function.link_grad(self.data, f)[:, None])
+            B, L, W_12 = self._compute_B_statistics(K, W)
+
+            W_f = np.dot(W, f)
+            grad = self.likelihood_function.link_grad(self.data, f)[:, None]
+            #Find K_i_f
+            b = W_f + grad
+            #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
             #TODO: Check L is lower
             solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
             a = b - mdot(W_12, solve_L)
-            f = np.dot(K, a)
+            #f = np.dot(K, a)
+
+            #a should be equal to Ki*f now so should be able to use it
+            c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
+            solve_L = cho_solve((L, True), mdot(W_12, c))
+            f = c - mdot(K, W_12, solve_L)
+
+            #K_w_f = mdot(K, (W, f))
+            #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
+            #d = f + K_w_f + c
+            #solve_L = cho_solve((L, True), mdot(W_12, d))
+            #f = c - mdot(K, (W_12, solve_L))
+            #a = mdot(self.Ki, f)
+
+            tmp_old_obj = old_obj
             old_obj = new_obj
             new_obj = obj(a, f)
             difference = new_obj - old_obj
-            #print "Difference: ", new_obj - old_obj
+            #print "Difference: ", difference
             if difference < 0:
+                #print "Objective function rose", difference
                 #If the objective function isn't rising, restart optimization
                 step_size *= 0.9
-                print "Objective function rose"
-                print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                 #objective function isn't increasing, try reducing step size
-                f = np.zeros((self.N, 1))
-                new_obj = -np.inf
-                old_obj = np.inf
+                #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
+                old_obj = tmp_old_obj
                 rs += 1
 
             difference = abs(difference)
             i += 1
 
+        self.i = i
+        print "{i} steps".format(i=i)
         return f

From 31d8faecf866307c69dcade761ddb77d628b773e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 5 Apr 2013 17:56:02 +0100
Subject: [PATCH 018/252] Added timing and realised mdot can be faster as its
 almost always a diagonal matrix its multiplying with

---
 python/examples/laplace_approximations.py |  9 +++++---
 python/likelihoods/Laplace.py             | 25 ++++++++++++++---------
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 7ab26406..28a92c61 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -8,11 +8,12 @@ from coxGP.python.likelihoods.likelihood_function import student_t
 
 def timing():
     real_var = 0.1
-    times = 1000
+    times = 1
     deg_free = 10
     real_sd = np.sqrt(real_var)
     the_is = np.zeros(times)
-    X = np.linspace(0.0, 10.0, 30)[:, None]
+    X = np.linspace(0.0, 10.0, 500)[:, None]
+
     for a in xrange(times):
         Y = np.sin(X) + np.random.randn(*X.shape)*real_var
         Yc = Y.copy()
@@ -21,6 +22,8 @@ def timing():
         Yc[25] += 10
         Yc[23] += 10
         Yc[24] += 10
+        Yc[300] += 10
+        Yc[400] += 10000
 
         edited_real_sd = real_sd
         kernel1 = GPy.kern.rbf(X.shape[1])
@@ -33,9 +36,9 @@ def timing():
         m.optimize()
         the_is[a] = m.likelihood.i
 
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print the_is
     print np.mean(the_is)
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
 
 def student_t_approx():
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 396a0bc7..734bf6c8 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -128,7 +128,9 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
-        self.Ki, _, _, log_Kdet = pdinv(K)
+        print "Inverting K"
+        #self.Ki, _, _, log_Kdet = pdinv(K)
+        print "K inverted, optimising"
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
@@ -196,6 +198,7 @@ class Laplace(likelihood):
         """
         #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
         L = jitchol(B)
         return (B, L, W_12)
@@ -205,9 +208,7 @@ class Laplace(likelihood):
         :K: Covariance matrix
         :returns: f_mode
         """
-        self.K = K.copy()
         f = np.zeros((self.N, 1))
-        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
         LOG_K_CONST = -(0.5 * self.log_Kdet)
 
         #FIXME: Can we get rid of this horrible reshaping?
@@ -227,7 +228,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=5000000000000000, MAX_RESTART=30):
+    def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -249,6 +250,7 @@ class Laplace(likelihood):
         rs = 0
         i = 0
         while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART:
+            print "optimising"
             f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
@@ -259,22 +261,25 @@ class Laplace(likelihood):
                                     #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     #To cause the posterior to become less certain than the prior and likelihood,
                                     #This is a property only held by non-log-concave likelihoods
+            print "Decomposing"
             B, L, W_12 = self._compute_B_statistics(K, W)
+            print "Finding f"
 
-            W_f = np.dot(W, f)
+            W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal!
             grad = self.likelihood_function.link_grad(self.data, f)[:, None]
             #Find K_i_f
             b = W_f + grad
             #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
             #TODO: Check L is lower
-            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))
-            a = b - mdot(W_12, solve_L)
+
+            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
+            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
             #f = np.dot(K, a)
 
             #a should be equal to Ki*f now so should be able to use it
             c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-            solve_L = cho_solve((L, True), mdot(W_12, c))
-            f = c - mdot(K, W_12, solve_L)
+            solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal!
+            f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
 
             #K_w_f = mdot(K, (W, f))
             #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
@@ -302,5 +307,5 @@ class Laplace(likelihood):
             i += 1
 
         self.i = i
-        print "{i} steps".format(i=i)
+        #print "{i} steps".format(i=i)
         return f

From 431f93ef231875aeb6adbe6be2c70ea807aafdce Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 8 Apr 2013 18:09:07 +0100
Subject: [PATCH 019/252] Stabalised most of the algorithm (apart from the end
 inversion which is impossible)

---
 python/likelihoods/Laplace.py | 132 ++++++++++++++++++----------------
 1 file changed, 72 insertions(+), 60 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 734bf6c8..77359769 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -3,9 +3,15 @@ import scipy as sp
 import GPy
 from scipy.linalg import cholesky, eig, inv, det, cho_solve
 from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv, mdot, jitchol
+from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
+from scipy.linalg.lapack import dtrtrs
 #import numpy.testing.assert_array_equal
 
+#TODO: Move this to utils
+def det_ln_diag(A):
+    return np.log(np.diagonal(A)).sum()
+
+
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
@@ -60,7 +66,6 @@ class Laplace(likelihood):
         pass # TODO: Laplace likelihood might want to take some parameters...
 
     def _gradients(self, partial):
-        #return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
 
@@ -99,9 +104,26 @@ class Laplace(likelihood):
             (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
         else:
             self.Sigma_tilde = inv(self.Sigma_tilde_i)
-        #f_hat? should be f but we must have optimized for them I guess?
-        #Y_tilde = mdot(self.Sigma_tilde, self.hess_hat_i, self.f_hat)
         Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
+
+        #dtritri -> L -> L_i
+        #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
+        #((L.T*w)_i + I)f_hat = y_tilde
+        L = jitchol(self.K)
+        Li = chol_inv(L)
+        Lt_W = np.dot(L.T, self.W)
+        if np.abs(det(Lt_W)) < epsilon:
+            print "WARNING: Transformed covariance matrix is signular!"
+        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
+        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+        #if np.abs(det(KW)) < epsilon:
+            #print "WARNING: Transformed covariance matrix is signular!"
+        #KW_i = inv(KW)
+        #Y_tilde = mdot(KW_i + np.eye(self.N), self.f_hat)
+
+        #Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
         #KW = np.dot(self.K, self.W)
         #KW_i, _, _, _ = pdinv(KW)
         #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat)
@@ -110,16 +132,38 @@ class Laplace(likelihood):
                     #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
                     #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
                    #)
-        _, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
-        f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
-        Z_tilde = -self.NORMAL_CONST + self.ln_z_hat -0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f
+        #_, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
+        #f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
+        #Z_tilde = -self.NORMAL_CONST + self.ln_z_hat -0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f
+
+        #f_W_f = mdot(self.f_hat.T, self.W, self.f_hat)
+        #f_Y_f = mdot(Y_tilde, self.W, Y_tilde)
+        #Z_tilde = (np.dot(self.W, self.f_hat) - 0.5*y_W_y + self.ln_z_hat
+                   #- 0.5*mdot(self.f_hat, (
+
+        f_Ki_W_f = mdot(self.f_hat.T, (self.Ki + self.W), self.f_hat)
+        y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
+        y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
+        self.ln_W_det = det_ln_diag(self.W)
+        Z_tilde = (self.NORMAL_CONST
+                   - 0.5*self.ln_K_det
+                   - 0.5*self.ln_W_det
+                   - 0.5*self.ln_Ki_W_i_det
+                   - 0.5*f_Ki_W_f
+                   - 0.5*y_W_y
+                   + y_W_f
+                   + self.ln_z_hat
+                   )
+
+        Sigma_tilde = inv(self.W) # Damn
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = self.Sigma_tilde
+        self.covariance_matrix = Sigma_tilde
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     def fit_full(self, K):
         """
@@ -128,9 +172,7 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
-        print "Inverting K"
-        #self.Ki, _, _, log_Kdet = pdinv(K)
-        print "K inverted, optimising"
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
@@ -144,46 +186,24 @@ class Laplace(likelihood):
                                    #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                    #To cause the posterior to become less certain than the prior and likelihood,
                                    #This is a property only held by non-log-concave likelihoods
+
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
-        self.B, L, self.W_12 = self._compute_B_statistics(K, self.W)
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
-        #ln_W_det = np.linalg.det(self.W)
-        #ln_B_det = np.linalg.det(self.B)
-        ln_det = np.linalg.det(np.eye(self.N) - mdot(self.W_12, self.Bi, self.W_12, K))
+
+        Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
+        self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i)
+
         b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
-        #TODO: Check L is lower
-        solve_L = cho_solve((L, True), mdot(self.W_12, (K, b)))
-        a = b - mdot(self.W_12, solve_L)
+        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
+        a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)
 
-        #self.hess_hat = self.Ki + self.W
-        #(self.hess_hat, _, _, self.log_hess_hat_i_det) = pdinv(self.hess_hat)
-
-        ##Check hess_hat is positive definite
-        #try:
-            #cholesky(self.hess_hat)
-        #except:
-            #raise ValueError("Must be positive definite")
-
-        ##Check its eigenvalues are positive
-        #eigenvalues = eig(self.hess_hat)
-        #if not np.all(eigenvalues > 0):
-            #raise ValueError("Eigen values not positive")
-
-        #z_hat is how much we need to scale the normal distribution by to get the area of our approximation close to
-        #the area of p(f)p(y|f) we do this by matching the height of the distributions at the mode
-        #z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat*K^{-1}*f_hat \sum_{n} ln p(y_n|f_n)
-        #Unsure whether its log_hess or log_hess_i
-        #self.ln_z_hat = (- 0.5*self.log_hess_hat_i_det
-                         #+ 0.5*self.log_Kdet
-                         #+ self.likelihood_function.link_function(self.data, self.f_hat)
-                         ##+ self.likelihood_function.link_function(self.data, self.f_hat)
-                         #- 0.5*mdot(self.f_hat.T, (self.Ki, self.f_hat))
-                         #)
-        self.ln_z_hat = (- 0.5*log_Kdet
+        self.ln_z_hat = (  self.NORMAL_CONST
                          - 0.5*self.f_Ki_f
+                         - 0.5*self.ln_K_det
+                         + 0.5*self.ln_Ki_W_i_det
                          + self.likelihood_function.link_function(self.data, self.f_hat)
-                         + 0.5*ln_det
                          )
 
         return self._compute_GP_variables()
@@ -198,7 +218,7 @@ class Laplace(likelihood):
         """
         #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
         L = jitchol(B)
         return (B, L, W_12)
@@ -209,12 +229,12 @@ class Laplace(likelihood):
         :returns: f_mode
         """
         f = np.zeros((self.N, 1))
-        LOG_K_CONST = -(0.5 * self.log_Kdet)
 
         #FIXME: Can we get rid of this horrible reshaping?
+        #ONLY WORKS FOR 1D DATA
         def obj(f):
             res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
-                        + self.NORMAL_CONST + LOG_K_CONST)
+                        + self.NORMAL_CONST)
             return float(res)
 
         def obj_grad(f):
@@ -249,21 +269,15 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
-        while difference > epsilon:# and i < MAX_ITER and rs < MAX_RESTART:
-            print "optimising"
+        while difference > epsilon:  # and i < MAX_ITER and rs < MAX_RESTART:
             f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
-                #if np.any(W < 0):
-                    #print "NEGATIVE VALUES :("
-                    #pass
                 W[W < 0] = 1e-6     #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     #To cause the posterior to become less certain than the prior and likelihood,
                                     #This is a property only held by non-log-concave likelihoods
-            print "Decomposing"
             B, L, W_12 = self._compute_B_statistics(K, W)
-            print "Finding f"
 
             W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal!
             grad = self.likelihood_function.link_grad(self.data, f)[:, None]
@@ -272,15 +286,15 @@ class Laplace(likelihood):
             #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
             #TODO: Check L is lower
 
-            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
-            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
-            #f = np.dot(K, a)
-
             #a should be equal to Ki*f now so should be able to use it
             c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
             solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal!
             f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
 
+            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
+            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
+            #f = np.dot(K, a)
+
             #K_w_f = mdot(K, (W, f))
             #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
             #d = f + K_w_f + c
@@ -292,7 +306,6 @@ class Laplace(likelihood):
             old_obj = new_obj
             new_obj = obj(a, f)
             difference = new_obj - old_obj
-            #print "Difference: ", difference
             if difference < 0:
                 #print "Objective function rose", difference
                 #If the objective function isn't rising, restart optimization
@@ -307,5 +320,4 @@ class Laplace(likelihood):
             i += 1
 
         self.i = i
-        #print "{i} steps".format(i=i)
         return f

From e0c1e4a4df600d24f075cc13a359a4bc77dfcff3 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 8 Apr 2013 19:58:54 +0100
Subject: [PATCH 020/252] Fixed laplace approximation and made more numerically
 stable with cholesky decompositions, and commented

---
 python/examples/laplace_approximations.py |   1 -
 python/likelihoods/Laplace.py             | 142 ++++++++++------------
 2 files changed, 65 insertions(+), 78 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 28a92c61..0500ba02 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -140,7 +140,6 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     print "Clean student t, ncg"
     t_distribution = student_t(deg_free, sigma=edited_real_sd)
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 77359769..27ab7613 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,17 +1,32 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, det, cho_solve
+from scipy.linalg import cholesky, eig, inv, cho_solve
+from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
 from scipy.linalg.lapack import dtrtrs
-#import numpy.testing.assert_array_equal
 
 #TODO: Move this to utils
+
+
 def det_ln_diag(A):
+    """
+    log determinant of a diagonal matrix
+    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
+    """
     return np.log(np.diagonal(A)).sum()
 
 
+def pddet(A):
+    """
+    Determinant of a positive definite matrix
+    """
+    L = cholesky(A)
+    logdetA = 2*sum(np.log(np.diag(L)))
+    return logdetA
+
+
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
@@ -30,7 +45,8 @@ class Laplace(likelihood):
         ---------
 
         :data: @todo
-        :likelihood_function: @todo
+        :likelihood_function: likelihood function - subclass of likelihood_function
+        :rasm: Flag of whether to use rasmussens numerically stable mode finding or simple ncg optimisation
 
         """
         self.data = data
@@ -63,10 +79,10 @@ class Laplace(likelihood):
         return []
 
     def _set_params(self, p):
-        pass # TODO: Laplace likelihood might want to take some parameters...
+        pass  # TODO: Laplace likelihood might want to take some parameters...
 
     def _gradients(self, partial):
-        return np.zeros(0) # TODO: Laplace likelihood might want to take some parameters...
+        return np.zeros(0)  # TODO: Laplace likelihood might want to take some parameters...
         raise NotImplementedError
 
     def _compute_GP_variables(self):
@@ -91,20 +107,10 @@ class Laplace(likelihood):
         i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$
         since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$
         and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
+        $$\tilde{\Sigma} = W^{-1}$$
 
         """
-        self.Sigma_tilde_i = self.W
-        #Check it isn't singular!
         epsilon = 1e-6
-        if np.abs(det(self.Sigma_tilde_i)) < epsilon:
-            print "WARNING: Transformed covariance matrix is signular!"
-            #raise ValueError("inverse covariance must be non-singular to invert!")
-        #Do we really need to inverse Sigma_tilde_i? :(
-        if self.likelihood_function.log_concave:
-            (self.Sigma_tilde, _, _, _) = pdinv(self.Sigma_tilde_i)
-        else:
-            self.Sigma_tilde = inv(self.Sigma_tilde_i)
-        Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
 
         #dtritri -> L -> L_i
         #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
@@ -112,42 +118,25 @@ class Laplace(likelihood):
         L = jitchol(self.K)
         Li = chol_inv(L)
         Lt_W = np.dot(L.T, self.W)
-        if np.abs(det(Lt_W)) < epsilon:
-            print "WARNING: Transformed covariance matrix is signular!"
+
+        ##Check it isn't singular!
+        if cond(Lt_W) > 1e14:
+            print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
+
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
         Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
-        #if np.abs(det(KW)) < epsilon:
-            #print "WARNING: Transformed covariance matrix is signular!"
-        #KW_i = inv(KW)
-        #Y_tilde = mdot(KW_i + np.eye(self.N), self.f_hat)
+        #f.T(Ki + W)f
+        f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
+                    + mdot(self.f_hat.T, self.W, self.f_hat)
+                    )
 
-        #Y_tilde = mdot(self.Sigma_tilde, (self.Ki + self.W), self.f_hat)
-        #KW = np.dot(self.K, self.W)
-        #KW_i, _, _, _ = pdinv(KW)
-        #Y_tilde = mdot((KW_i + np.eye(self.N)), self.f_hat)
-        #Z_tilde = (self.ln_z_hat - self.NORMAL_CONST
-                    #+ 0.5*mdot(self.f_hat.T, (self.hess_hat, self.f_hat))
-                    #+ 0.5*mdot(Y_tilde.T, (self.Sigma_tilde_i, Y_tilde))
-                    #- mdot(Y_tilde.T, (self.Sigma_tilde_i, self.f_hat))
-                   #)
-        #_, _, _, ln_W12_Bi_W12_i = pdinv(mdot(self.W_12, self.Bi, self.W_12))
-        #f_Si_f = mdot(self.f_hat.T, self.Sigma_tilde_i, self.f_hat)
-        #Z_tilde = -self.NORMAL_CONST + self.ln_z_hat -0.5*ln_W12_Bi_W12_i - 0.5*self.f_Ki_f - 0.5*f_Si_f
-
-        #f_W_f = mdot(self.f_hat.T, self.W, self.f_hat)
-        #f_Y_f = mdot(Y_tilde, self.W, Y_tilde)
-        #Z_tilde = (np.dot(self.W, self.f_hat) - 0.5*y_W_y + self.ln_z_hat
-                   #- 0.5*mdot(self.f_hat, (
-
-        f_Ki_W_f = mdot(self.f_hat.T, (self.Ki + self.W), self.f_hat)
         y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
         y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
-        self.ln_W_det = det_ln_diag(self.W)
+        ln_W_det = det_ln_diag(self.W)
         Z_tilde = (self.NORMAL_CONST
                    - 0.5*self.ln_K_det
-                   - 0.5*self.ln_W_det
+                   - 0.5*ln_W_det
                    - 0.5*self.ln_Ki_W_i_det
                    - 0.5*f_Ki_W_f
                    - 0.5*y_W_y
@@ -155,7 +144,11 @@ class Laplace(likelihood):
                    + self.ln_z_hat
                    )
 
-        Sigma_tilde = inv(self.W) # Damn
+        ##Check it isn't singular!
+        if cond(self.W) > 1e14:
+            print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
+
+        Sigma_tilde = inv(self.W)  # Damn
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -163,16 +156,14 @@ class Laplace(likelihood):
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = Sigma_tilde
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     def fit_full(self, K):
         """
         The laplace approximation algorithm
-        For nomenclature see Rasmussen & Williams 2006
+        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
         :K: Covariance matrix
         """
         self.K = K.copy()
-        self.Ki, _, _, self.ln_K_det = pdinv(K)
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
@@ -182,10 +173,10 @@ class Laplace(likelihood):
         self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-6 #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-                                   #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
-                                   #To cause the posterior to become less certain than the prior and likelihood,
-                                   #This is a property only held by non-log-concave likelihoods
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                       #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                       #To cause the posterior to become less certain than the prior and likelihood,
+                                       #This is a property only held by non-log-concave likelihoods
 
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
@@ -198,8 +189,9 @@ class Laplace(likelihood):
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
         a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)
+        self.ln_K_det = pddet(self.K)
 
-        self.ln_z_hat = (  self.NORMAL_CONST
+        self.ln_z_hat = (self.NORMAL_CONST
                          - 0.5*self.f_Ki_f
                          - 0.5*self.ln_K_det
                          + 0.5*self.ln_Ki_W_i_det
@@ -219,26 +211,29 @@ class Laplace(likelihood):
         #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
         #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-        B = np.eye(K.shape[0]) + mdot(W_12, K, W_12)
+        B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
         L = jitchol(B)
         return (B, L, W_12)
 
     def ncg_mode(self, K):
-        """Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
+        """
+        Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
         :K: Covariance matrix
         :returns: f_mode
         """
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
+
         f = np.zeros((self.N, 1))
 
         #FIXME: Can we get rid of this horrible reshaping?
         #ONLY WORKS FOR 1D DATA
         def obj(f):
-            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * mdot(f.T, (self.Ki, f))
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
                         + self.NORMAL_CONST)
             return float(res)
 
         def obj_grad(f):
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - mdot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - np.dot(self.Ki, f))
             return np.squeeze(res)
 
         def obj_hess(f):
@@ -254,6 +249,8 @@ class Laplace(likelihood):
         For nomenclature see Rasmussen & Williams 2006
 
         :K: Covariance matrix
+        :MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
+        :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
         :returns: f_mode
         """
         f = np.zeros((self.N, 1))
@@ -269,39 +266,30 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
-        while difference > epsilon:  # and i < MAX_ITER and rs < MAX_RESTART:
+        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f))
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6     #FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
-                                    #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
-                                    #To cause the posterior to become less certain than the prior and likelihood,
-                                    #This is a property only held by non-log-concave likelihoods
+                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                    # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                    # To cause the posterior to become less certain than the prior and likelihood,
+                                    # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)
 
-            W_f = np.dot(W, f)#FIXME: Make this fast as W_12 is diagonal!
+            W_f = np.dot(W, f)
             grad = self.likelihood_function.link_grad(self.data, f)[:, None]
             #Find K_i_f
             b = W_f + grad
-            #b = np.dot(W, f) + np.dot(self.Ki, f)*(1-step_size) + step_size*self.likelihood_function.link_grad(self.data, f)[:, None]
-            #TODO: Check L is lower
 
             #a should be equal to Ki*f now so should be able to use it
-            c = mdot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-            solve_L = cho_solve((L, True), mdot(W_12, c))#FIXME: Make this fast as W_12 is diagonal!
-            f = c - mdot(K, W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
+            c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
+            solve_L = cho_solve((L, True), np.dot(W_12, c))
+            f = c - np.dot(K, np.dot(W_12, solve_L))
 
-            solve_L = cho_solve((L, True), mdot(W_12, (K, b)))#FIXME: Make this fast as W_12 is diagonal!
-            a = b - mdot(W_12, solve_L)#FIXME: Make this fast as W_12 is diagonal!
+            solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b)))
+            a = b - np.dot(W_12, solve_L)
             #f = np.dot(K, a)
 
-            #K_w_f = mdot(K, (W, f))
-            #c = step_size*mdot(K, self.likelihood_function.link_grad(self.data, f)[:, None]) - step_size*f
-            #d = f + K_w_f + c
-            #solve_L = cho_solve((L, True), mdot(W_12, d))
-            #f = c - mdot(K, (W_12, solve_L))
-            #a = mdot(self.Ki, f)
-
             tmp_old_obj = old_obj
             old_obj = new_obj
             new_obj = obj(a, f)

From 65481d7a73b8fe965a99b82126431ae2668958db Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 10 Apr 2013 13:43:13 +0100
Subject: [PATCH 021/252] Fixed the z scalings

---
 python/examples/laplace_approximations.py |  8 +++----
 python/likelihoods/Laplace.py             | 28 +++++++++++++++--------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/python/examples/laplace_approximations.py b/python/examples/laplace_approximations.py
index 0500ba02..5b1331b6 100644
--- a/python/examples/laplace_approximations.py
+++ b/python/examples/laplace_approximations.py
@@ -12,7 +12,7 @@ def timing():
     deg_free = 10
     real_sd = np.sqrt(real_var)
     the_is = np.zeros(times)
-    X = np.linspace(0.0, 10.0, 500)[:, None]
+    X = np.linspace(0.0, 10.0, 300)[:, None]
 
     for a in xrange(times):
         Y = np.sin(X) + np.random.randn(*X.shape)*real_var
@@ -22,8 +22,8 @@ def timing():
         Yc[25] += 10
         Yc[23] += 10
         Yc[24] += 10
-        Yc[300] += 10
-        Yc[400] += 10000
+        Yc[250] += 10
+        #Yc[4] += 10000
 
         edited_real_sd = real_sd
         kernel1 = GPy.kern.rbf(X.shape[1])
@@ -36,7 +36,7 @@ def timing():
         m.optimize()
         the_is[a] = m.likelihood.i
 
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print the_is
     print np.mean(the_is)
 
diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 27ab7613..8ef8fb62 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -1,7 +1,7 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, cho_solve
+from scipy.linalg import cholesky, eig, inv, cho_solve, det
 from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
@@ -134,15 +134,24 @@ class Laplace(likelihood):
         y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
         y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
         ln_W_det = det_ln_diag(self.W)
-        Z_tilde = (self.NORMAL_CONST
-                   - 0.5*self.ln_K_det
-                   - 0.5*ln_W_det
-                   - 0.5*self.ln_Ki_W_i_det
-                   - 0.5*f_Ki_W_f
-                   - 0.5*y_W_y
-                   + y_W_f
+        Z_tilde = (- self.NORMAL_CONST
+                   + 0.5*self.ln_K_det
+                   + 0.5*ln_W_det
+                   + 0.5*self.ln_Ki_W_i_det
+                   + 0.5*f_Ki_W_f
+                   + 0.5*y_W_y
+                   - y_W_f
                    + self.ln_z_hat
                    )
+        #Z_tilde = (self.NORMAL_CONST
+                   #- 0.5*self.ln_K_det
+                   #- 0.5*ln_W_det
+                   #- 0.5*self.ln_Ki_W_i_det
+                   #- 0.5*f_Ki_W_f
+                   #- 0.5*y_W_y
+                   #+ y_W_f
+                   #+ self.ln_z_hat
+                   #)
 
         ##Check it isn't singular!
         if cond(self.W) > 1e14:
@@ -191,8 +200,7 @@ class Laplace(likelihood):
         self.f_Ki_f = np.dot(self.f_hat.T, a)
         self.ln_K_det = pddet(self.K)
 
-        self.ln_z_hat = (self.NORMAL_CONST
-                         - 0.5*self.f_Ki_f
+        self.ln_z_hat = (- 0.5*self.f_Ki_f
                          - 0.5*self.ln_K_det
                          + 0.5*self.ln_Ki_W_i_det
                          + self.likelihood_function.link_function(self.data, self.f_hat)

From 9bbb11b825f7c395a040e2385d6a2c88aa1c143e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 10 Apr 2013 15:43:31 +0100
Subject: [PATCH 022/252] Adding weibull likelihood, requires 'extra_data' to
 be passed to likelihood, i.e. the censoring information

---
 python/likelihoods/Laplace.py             | 24 +++---
 python/likelihoods/likelihood_function.py | 99 +++++++++++++++++++++--
 2 files changed, 104 insertions(+), 19 deletions(-)

diff --git a/python/likelihoods/Laplace.py b/python/likelihoods/Laplace.py
index 8ef8fb62..4d94ba0f 100644
--- a/python/likelihoods/Laplace.py
+++ b/python/likelihoods/Laplace.py
@@ -30,7 +30,7 @@ def pddet(A):
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
-    def __init__(self, data, likelihood_function, rasm=True):
+    def __init__(self, data, likelihood_function, extra_data=None, rasm=True):
         """
         Laplace Approximation
 
@@ -44,13 +44,15 @@ class Laplace(likelihood):
         Arguments
         ---------
 
-        :data: @todo
+        :data: array of data the likelihood function is approximating
         :likelihood_function: likelihood function - subclass of likelihood_function
+        :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
         :rasm: Flag of whether to use rasmussens numerically stable mode finding or simple ncg optimisation
 
         """
         self.data = data
         self.likelihood_function = likelihood_function
+        self.extra_data = extra_data
         self.rasm = rasm
 
         #Inital values
@@ -179,7 +181,7 @@ class Laplace(likelihood):
             self.f_hat = self.ncg_mode(K)
 
         #At this point get the hessian matrix
-        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat))
+        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))
 
         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
@@ -194,7 +196,7 @@ class Laplace(likelihood):
         Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
         self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i)
 
-        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat)[:, None]
+        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
         a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)
@@ -203,7 +205,7 @@ class Laplace(likelihood):
         self.ln_z_hat = (- 0.5*self.f_Ki_f
                          - 0.5*self.ln_K_det
                          + 0.5*self.ln_Ki_W_i_det
-                         + self.likelihood_function.link_function(self.data, self.f_hat)
+                         + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
                          )
 
         return self._compute_GP_variables()
@@ -236,16 +238,16 @@ class Laplace(likelihood):
         #FIXME: Can we get rid of this horrible reshaping?
         #ONLY WORKS FOR 1D DATA
         def obj(f):
-            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
                         + self.NORMAL_CONST)
             return float(res)
 
         def obj_grad(f):
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f) - np.dot(self.Ki, f))
+            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
             return np.squeeze(res)
 
         def obj_hess(f):
-            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f)) - self.Ki)
+            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
             return np.squeeze(res)
 
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
@@ -267,7 +269,7 @@ class Laplace(likelihood):
 
         def obj(a, f):
             #Careful of shape of data!
-            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f)
+            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
         epsilon = 1e-6
@@ -276,7 +278,7 @@ class Laplace(likelihood):
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             f_old = f.copy()
-            W = -np.diag(self.likelihood_function.link_hess(self.data, f))
+            W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
             if not self.likelihood_function.log_concave:
                 W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@@ -285,7 +287,7 @@ class Laplace(likelihood):
             B, L, W_12 = self._compute_B_statistics(K, W)
 
             W_f = np.dot(W, f)
-            grad = self.likelihood_function.link_grad(self.data, f)[:, None]
+            grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None]
             #Find K_i_f
             b = W_f + grad
 
diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 49174ce7..0d421882 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -4,6 +4,7 @@ import numpy as np
 from GPy.likelihoods.likelihood_functions import likelihood_function
 from scipy import stats
 
+
 class student_t(likelihood_function):
     """Student t likelihood distribution
     For nomanclature see Bayesian Data Analysis 2003 p576
@@ -24,15 +25,16 @@ class student_t(likelihood_function):
         self.log_concave = False
 
     @property
-    def variance(self):
+    def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * (self.sigma**2)
 
-    def link_function(self, y, f):
+    def link_function(self, y, f, extra_data=None):
         """link_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
 
         :y: data
         :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
         :returns: float(likelihood evaluated for this point)
 
         """
@@ -49,7 +51,7 @@ class student_t(likelihood_function):
                      )
         return np.sum(objective)
 
-    def link_grad(self, y, f):
+    def link_grad(self, y, f, extra_data=None):
         """
         Gradient of the link function at y, given f w.r.t f
 
@@ -57,6 +59,7 @@ class student_t(likelihood_function):
 
         :y: data
         :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
         :returns: gradient of likelihood evaluated at points
 
         """
@@ -67,17 +70,18 @@ class student_t(likelihood_function):
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
         return np.squeeze(grad)
 
-    def link_hess(self, y, f):
+    def link_hess(self, y, f, extra_data=None):
         """
         Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
         i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
 
-        Will return diaganol of hessian, since every where else it is 0
+        Will return diagonal of hessian, since every where else it is 0
 
         $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
 
         :y: data
         :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
         y = np.squeeze(y)
@@ -139,7 +143,7 @@ class student_t(likelihood_function):
                                             #size=(num_f_samples, num_y_samples))
             #print student_t_samples.shape
 
-        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:,None],
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
                                         scale=self.sigma,
                                         size=(num_test_points, num_y_samples, num_f_samples))
         student_t_samples = np.reshape(student_t_samples,
@@ -152,7 +156,7 @@ class student_t(likelihood_function):
         ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
         def t_gaussian(f, mu, var):
             return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
-                        * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
+                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
                     )
 
         def t_gauss_int(mu, var):
@@ -167,4 +171,83 @@ class student_t(likelihood_function):
         p = vec_t_gauss_int(mu, var)
         p_025 = mu - p
         p_975 = mu + p
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        return mu, np.nan*mu, p_025, p_975
+
+
+class weibull_survival(likelihood_function):
+    """Weibull t likelihood distribution for survival analysis with censoring
+        For nomanclature see Bayesian Survival Analysis
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, shape, scale):
+        self.shape = shape
+        self.scale = scale
+
+        #FIXME: This should be in the superclass
+        self.log_concave = True
+
+    def link_function(self, y, f, extra_data=None):
+        """
+        link_function $\ln p(y|f)$, i.e. log likelihood
+
+        $$\ln p(y|f) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: time of event data
+        :f: latent variables f
+        :extra_data: the censoring indicator, 1 for censored, 0 for not
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f)  # FIXME: CHECK THIS WITH BOOK, wheres scale?
+        return np.sum(objective)
+
+    def link_grad(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{d}{df} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}\exp(f_{i})
+
+        :y: data
+        :f: latent variables f
+        :extra_data: the censoring indicator, 1 for censored, 0 for not
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        grad = v - (y**self.shape)*np.exp(f)
+        return np.squeeze(grad)
+
+    def link_hess(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+
+        Will return diagonal of hessian, since every where else it is 0
+
+        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used hessian
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        hess = (y**self.shape)*np.exp(f)
+        return np.squeeze(hess)

From 296c093611f46c8632a7235f7d414581f5969294 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 15 Apr 2013 12:08:22 +0100
Subject: [PATCH 023/252] Tidy up comments

---
 python/likelihoods/likelihood_function.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/likelihoods/likelihood_function.py b/python/likelihoods/likelihood_function.py
index 0d421882..f14faf33 100644
--- a/python/likelihoods/likelihood_function.py
+++ b/python/likelihoods/likelihood_function.py
@@ -9,7 +9,7 @@ class student_t(likelihood_function):
     """Student t likelihood distribution
     For nomanclature see Bayesian Data Analysis 2003 p576
 
-    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
+    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2)$$
 
     Laplace:
     Needs functions to calculate

From 1e707f125c7e9313b4444b23811425ddc555dba3 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 15 Apr 2013 12:10:42 +0100
Subject: [PATCH 024/252] Make directory structure match that of GPy

---
 {python => GPy}/__init__.py                        | 0
 {python => GPy}/examples/__init__.py               | 0
 {python => GPy}/examples/laplace_approximations.py | 0
 {python => GPy}/likelihoods/Laplace.py             | 0
 {python => GPy}/likelihoods/__init__.py            | 0
 {python => GPy}/likelihoods/likelihood_function.py | 0
 {python => GPy}/models/__init__.py                 | 0
 {python => GPy}/models/coxGP.py                    | 0
 {python => GPy}/testing/__init__.py                | 0
 {python => GPy}/testing/cox_tests.py               | 0
 10 files changed, 0 insertions(+), 0 deletions(-)
 rename {python => GPy}/__init__.py (100%)
 rename {python => GPy}/examples/__init__.py (100%)
 rename {python => GPy}/examples/laplace_approximations.py (100%)
 rename {python => GPy}/likelihoods/Laplace.py (100%)
 rename {python => GPy}/likelihoods/__init__.py (100%)
 rename {python => GPy}/likelihoods/likelihood_function.py (100%)
 rename {python => GPy}/models/__init__.py (100%)
 rename {python => GPy}/models/coxGP.py (100%)
 rename {python => GPy}/testing/__init__.py (100%)
 rename {python => GPy}/testing/cox_tests.py (100%)

diff --git a/python/__init__.py b/GPy/__init__.py
similarity index 100%
rename from python/__init__.py
rename to GPy/__init__.py
diff --git a/python/examples/__init__.py b/GPy/examples/__init__.py
similarity index 100%
rename from python/examples/__init__.py
rename to GPy/examples/__init__.py
diff --git a/python/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
similarity index 100%
rename from python/examples/laplace_approximations.py
rename to GPy/examples/laplace_approximations.py
diff --git a/python/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
similarity index 100%
rename from python/likelihoods/Laplace.py
rename to GPy/likelihoods/Laplace.py
diff --git a/python/likelihoods/__init__.py b/GPy/likelihoods/__init__.py
similarity index 100%
rename from python/likelihoods/__init__.py
rename to GPy/likelihoods/__init__.py
diff --git a/python/likelihoods/likelihood_function.py b/GPy/likelihoods/likelihood_function.py
similarity index 100%
rename from python/likelihoods/likelihood_function.py
rename to GPy/likelihoods/likelihood_function.py
diff --git a/python/models/__init__.py b/GPy/models/__init__.py
similarity index 100%
rename from python/models/__init__.py
rename to GPy/models/__init__.py
diff --git a/python/models/coxGP.py b/GPy/models/coxGP.py
similarity index 100%
rename from python/models/coxGP.py
rename to GPy/models/coxGP.py
diff --git a/python/testing/__init__.py b/GPy/testing/__init__.py
similarity index 100%
rename from python/testing/__init__.py
rename to GPy/testing/__init__.py
diff --git a/python/testing/cox_tests.py b/GPy/testing/cox_tests.py
similarity index 100%
rename from python/testing/cox_tests.py
rename to GPy/testing/cox_tests.py

From 589aeda88cc938a537ecb5a5df34dd276bae5a37 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 15 Apr 2013 15:44:29 +0100
Subject: [PATCH 025/252] Should be working now, needed to change relative path
 names

---
 GPy/examples/classification.py         |  3 +--
 GPy/examples/laplace_approximations.py | 29 +++++++++++---------------
 GPy/likelihoods/__init__.py            |  2 +-
 3 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index 5df019e4..4899e75e 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -17,8 +17,7 @@ def crescent_data(seed=default_seed): #FIXME
     :param seed : seed value for data generation.
     :type seed: int
     :param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
-    :type inducing: int
-    """
+    :type inducing: int """
 
     data = GPy.util.datasets.crescent_data(seed=seed)
 
diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 5b1331b6..07801150 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -1,10 +1,6 @@
 import GPy
 import numpy as np
 import matplotlib.pyplot as plt
-from scipy.stats import t, norm
-from coxGP.python.likelihoods.Laplace import Laplace
-from coxGP.python.likelihoods.likelihood_function import student_t
-
 
 def timing():
     real_var = 0.1
@@ -28,15 +24,14 @@ def timing():
         edited_real_sd = real_sd
         kernel1 = GPy.kern.rbf(X.shape[1])
 
-        t_distribution = student_t(deg_free, sigma=edited_real_sd)
-        corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
+        t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+        corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=True)
         m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
         m.ensure_default_constraints()
         m.update_likelihood_approximation()
         m.optimize()
         the_is[a] = m.likelihood.i
 
-    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print the_is
     print np.mean(the_is)
 
@@ -116,8 +111,8 @@ def student_t_approx():
     edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Y.copy(), t_distribution, rasm=True)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
@@ -129,8 +124,8 @@ def student_t_approx():
     plt.ylim(-2.5, 2.5)
 
     print "Corrupt student t, rasm"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=True)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
@@ -142,8 +137,8 @@ def student_t_approx():
     plt.ylim(-2.5, 2.5)
 
     print "Clean student t, ncg"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = Laplace(Y, t_distribution, rasm=False)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
     m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
@@ -155,8 +150,8 @@ def student_t_approx():
     plt.ylim(-2.5, 2.5)
 
     print "Corrupt student t, ncg"
-    t_distribution = student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = Laplace(Yc.copy(), t_distribution, rasm=False)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=False)
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
@@ -169,8 +164,8 @@ def student_t_approx():
 
 
     ###with a student t distribution, since it has heavy tails it should work well
-    ###likelihood_function = student_t(deg_free, sigma=real_var)
-    ###lap = Laplace(Y, likelihood_function)
+    ###likelihood_functions = student_t(deg_free, sigma=real_var)
+    ###lap = Laplace(Y, likelihood_functions)
     ###cov = kernel.K(X)
     ###lap.fit_full(cov)
 
diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py
index 83413255..9becb1b1 100644
--- a/GPy/likelihoods/__init__.py
+++ b/GPy/likelihoods/__init__.py
@@ -1,4 +1,4 @@
 from EP import EP
 from Gaussian import Gaussian
-# TODO: from Laplace import Laplace
+from Laplace import Laplace
 import likelihood_functions as functions

From 01671b6c570b7c40a2b1a326ab2c68606834c674 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 16 Apr 2013 16:34:26 +0100
Subject: [PATCH 026/252] Merged likelihood functions

---
 GPy/examples/laplace_approximations.py  |   4 +-
 GPy/likelihoods/likelihood_function.py  | 253 -----------------------
 GPy/likelihoods/likelihood_functions.py | 254 +++++++++++++++++++++++-
 3 files changed, 254 insertions(+), 257 deletions(-)
 delete mode 100644 GPy/likelihoods/likelihood_function.py

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 07801150..5d1c1224 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -164,8 +164,8 @@ def student_t_approx():
 
 
     ###with a student t distribution, since it has heavy tails it should work well
-    ###likelihood_functions = student_t(deg_free, sigma=real_var)
-    ###lap = Laplace(Y, likelihood_functions)
+    ###likelihood_function = student_t(deg_free, sigma=real_var)
+    ###lap = Laplace(Y, likelihood_function)
     ###cov = kernel.K(X)
     ###lap.fit_full(cov)
 
diff --git a/GPy/likelihoods/likelihood_function.py b/GPy/likelihoods/likelihood_function.py
deleted file mode 100644
index f14faf33..00000000
--- a/GPy/likelihoods/likelihood_function.py
+++ /dev/null
@@ -1,253 +0,0 @@
-from scipy.special import gammaln, gamma
-from scipy import integrate
-import numpy as np
-from GPy.likelihoods.likelihood_functions import likelihood_function
-from scipy import stats
-
-
-class student_t(likelihood_function):
-    """Student t likelihood distribution
-    For nomanclature see Bayesian Data Analysis 2003 p576
-
-    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2)$$
-
-    Laplace:
-    Needs functions to calculate
-    ln p(yi|fi)
-    dln p(yi|fi)_dfi
-    d2ln p(yi|fi)_d2fifj
-    """
-    def __init__(self, deg_free, sigma=2):
-        self.v = deg_free
-        self.sigma = sigma
-
-        #FIXME: This should be in the superclass
-        self.log_concave = False
-
-    @property
-    def variance(self, extra_data=None):
-        return (self.v / float(self.v - 2)) * (self.sigma**2)
-
-    def link_function(self, y, f, extra_data=None):
-        """link_function $\ln p(y|f)$
-        $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: float(likelihood evaluated for this point)
-
-        """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
-        assert y.shape == f.shape
-
-        e = y - f
-        objective = (gammaln((self.v + 1) * 0.5)
-                     - gammaln(self.v * 0.5)
-                     + np.log(self.sigma * np.sqrt(self.v * np.pi))
-                     - (self.v + 1) * 0.5
-                     * np.log(1 + ((e**2 / self.sigma**2) / self.v))
-                     )
-        return np.sum(objective)
-
-    def link_grad(self, y, f, extra_data=None):
-        """
-        Gradient of the link function at y, given f w.r.t f
-
-        $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: gradient of likelihood evaluated at points
-
-        """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
-        assert y.shape == f.shape
-        e = y - f
-        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
-        return np.squeeze(grad)
-
-    def link_hess(self, y, f, extra_data=None):
-        """
-        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
-        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
-
-        Will return diagonal of hessian, since every where else it is 0
-
-        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
-        """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
-        assert y.shape == f.shape
-        e = y - f
-        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
-        return np.squeeze(hess)
-
-    def predictive_values(self, mu, var):
-        """
-        Compute  mean, and conficence interval (percentiles 5 and 95) of the prediction
-
-        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
-        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
-        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
-        """
-
-        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
-        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
-        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
-        #Which was also given to us as (var)
-        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
-        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
-        true_var = var + self.variance
-
-        #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now
-        #need the 95 and 5 percentiles.
-        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
-        p_025 = mu - 2.*true_var
-        p_975 = mu + 2.*true_var
-
-        return mu, np.nan*mu, p_025, p_975
-
-    def sample_predicted_values(self, mu, var):
-        """ Experimental sample approches and numerical integration """
-        #p_025 = stats.t.ppf(.025, mu)
-        #p_975 = stats.t.ppf(.975, mu)
-
-        num_test_points = mu.shape[0]
-        #Each mu is the latent point f* at the test point x*,
-        #and the var is the gaussian variance at this point
-        #Take lots of samples from this, so we have lots of possible values
-        #for latent point f* for each test point x* weighted by how likely we were to pick it
-        print "Taking %d samples of f*".format(num_test_points)
-        num_f_samples = 10
-        num_y_samples = 10
-        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
-        print "Student t means shape: ", student_t_means.shape
-
-        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
-        #from a student t centred on this point, sample many points from this distribution
-        #centred on f*
-        #for test_point, f in enumerate(student_t_means):
-            #print test_point
-            #print f.shape
-            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
-                                            #scale=self.sigma,
-                                            #size=(num_f_samples, num_y_samples))
-            #print student_t_samples.shape
-
-        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
-                                        scale=self.sigma,
-                                        size=(num_test_points, num_y_samples, num_f_samples))
-        student_t_samples = np.reshape(student_t_samples,
-                                       (num_test_points, num_y_samples*num_f_samples))
-
-        #Now take the 97.5 and 0.25 percentile of these points
-        p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None]
-        p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None]
-
-        ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
-        def t_gaussian(f, mu, var):
-            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
-                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
-                    )
-
-        def t_gauss_int(mu, var):
-            print "Mu: ", mu
-            print "var: ", var
-            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
-            print "Result: ", result
-            return result[0]
-
-        vec_t_gauss_int = np.vectorize(t_gauss_int)
-
-        p = vec_t_gauss_int(mu, var)
-        p_025 = mu - p
-        p_975 = mu + p
-        return mu, np.nan*mu, p_025, p_975
-
-
-class weibull_survival(likelihood_function):
-    """Weibull t likelihood distribution for survival analysis with censoring
-        For nomanclature see Bayesian Survival Analysis
-
-    Laplace:
-    Needs functions to calculate
-    ln p(yi|fi)
-    dln p(yi|fi)_dfi
-    d2ln p(yi|fi)_d2fifj
-    """
-    def __init__(self, shape, scale):
-        self.shape = shape
-        self.scale = scale
-
-        #FIXME: This should be in the superclass
-        self.log_concave = True
-
-    def link_function(self, y, f, extra_data=None):
-        """
-        link_function $\ln p(y|f)$, i.e. log likelihood
-
-        $$\ln p(y|f) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$
-
-        :y: time of event data
-        :f: latent variables f
-        :extra_data: the censoring indicator, 1 for censored, 0 for not
-        :returns: float(likelihood evaluated for this point)
-
-        """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
-        assert y.shape == f.shape
-
-        v = extra_data
-        objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f)  # FIXME: CHECK THIS WITH BOOK, wheres scale?
-        return np.sum(objective)
-
-    def link_grad(self, y, f, extra_data=None):
-        """
-        Gradient of the link function at y, given f w.r.t f
-
-        $$\frac{d}{df} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}\exp(f_{i})
-
-        :y: data
-        :f: latent variables f
-        :extra_data: the censoring indicator, 1 for censored, 0 for not
-        :returns: gradient of likelihood evaluated at points
-
-        """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
-        assert y.shape == f.shape
-
-        v = extra_data
-        grad = v - (y**self.shape)*np.exp(f)
-        return np.squeeze(grad)
-
-    def link_hess(self, y, f, extra_data=None):
-        """
-        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
-        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
-
-        Will return diagonal of hessian, since every where else it is 0
-
-        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used hessian
-        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
-        """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
-        assert y.shape == f.shape
-
-        hess = (y**self.shape)*np.exp(f)
-        return np.squeeze(hess)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 4b8e7013..c759e15f 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -1,12 +1,14 @@
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-
 import numpy as np
-from scipy import stats
+from scipy import stats, integrate
 import scipy as sp
 import pylab as pb
 from ..util.plot import gpplot
+from scipy.special import gammaln, gamma
+#from GPy.likelihoods.likelihood_functions import likelihood_function
+
 
 class likelihood_function:
     """
@@ -132,3 +134,251 @@ class Poisson(likelihood_function):
         p_025 = tmp[:,0]
         p_975 = tmp[:,1]
         return mean,np.nan*mean,p_025,p_975 # better variance here TODO
+
+
+class student_t(likelihood_function):
+    """Student t likelihood distribution
+    For nomanclature see Bayesian Data Analysis 2003 p576
+
+    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2)$$
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, deg_free, sigma=2):
+        self.v = deg_free
+        self.sigma = sigma
+
+        #FIXME: This should be in the superclass
+        self.log_concave = False
+
+    @property
+    def variance(self, extra_data=None):
+        return (self.v / float(self.v - 2)) * (self.sigma**2)
+
+    def link_function(self, y, f, extra_data=None):
+        """link_function $\ln p(y|f)$
+        $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        e = y - f
+        objective = (gammaln((self.v + 1) * 0.5)
+                     - gammaln(self.v * 0.5)
+                     + np.log(self.sigma * np.sqrt(self.v * np.pi))
+                     - (self.v + 1) * 0.5
+                     * np.log(1 + ((e**2 / self.sigma**2) / self.v))
+                     )
+        return np.sum(objective)
+
+    def link_grad(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        return np.squeeze(grad)
+
+    def link_hess(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+
+        Will return diagonal of hessian, since every where else it is 0
+
+        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
+        return np.squeeze(hess)
+
+    def predictive_values(self, mu, var):
+        """
+        Compute  mean, and conficence interval (percentiles 5 and 95) of the prediction
+
+        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        """
+
+        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
+        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
+        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
+        #Which was also given to us as (var)
+        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
+        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
+        true_var = var + self.variance
+
+        #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now
+        #need the 95 and 5 percentiles.
+        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
+        p_025 = mu - 2.*true_var
+        p_975 = mu + 2.*true_var
+
+        return mu, np.nan*mu, p_025, p_975
+
+    def sample_predicted_values(self, mu, var):
+        """ Experimental sample approches and numerical integration """
+        #p_025 = stats.t.ppf(.025, mu)
+        #p_975 = stats.t.ppf(.975, mu)
+
+        num_test_points = mu.shape[0]
+        #Each mu is the latent point f* at the test point x*,
+        #and the var is the gaussian variance at this point
+        #Take lots of samples from this, so we have lots of possible values
+        #for latent point f* for each test point x* weighted by how likely we were to pick it
+        print "Taking %d samples of f*".format(num_test_points)
+        num_f_samples = 10
+        num_y_samples = 10
+        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
+        print "Student t means shape: ", student_t_means.shape
+
+        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
+        #from a student t centred on this point, sample many points from this distribution
+        #centred on f*
+        #for test_point, f in enumerate(student_t_means):
+            #print test_point
+            #print f.shape
+            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
+                                            #scale=self.sigma,
+                                            #size=(num_f_samples, num_y_samples))
+            #print student_t_samples.shape
+
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
+                                        scale=self.sigma,
+                                        size=(num_test_points, num_y_samples, num_f_samples))
+        student_t_samples = np.reshape(student_t_samples,
+                                       (num_test_points, num_y_samples*num_f_samples))
+
+        #Now take the 97.5 and 0.25 percentile of these points
+        p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None]
+        p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None]
+
+        ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
+        def t_gaussian(f, mu, var):
+            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
+                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
+                    )
+
+        def t_gauss_int(mu, var):
+            print "Mu: ", mu
+            print "var: ", var
+            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
+            print "Result: ", result
+            return result[0]
+
+        vec_t_gauss_int = np.vectorize(t_gauss_int)
+
+        p = vec_t_gauss_int(mu, var)
+        p_025 = mu - p
+        p_975 = mu + p
+        return mu, np.nan*mu, p_025, p_975
+
+
+class weibull_survival(likelihood_function):
+    """Weibull t likelihood distribution for survival analysis with censoring
+        For nomanclature see Bayesian Survival Analysis
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, shape, scale):
+        self.shape = shape
+        self.scale = scale
+
+        #FIXME: This should be in the superclass
+        self.log_concave = True
+
+    def link_function(self, y, f, extra_data=None):
+        """
+        link_function $\ln p(y|f)$, i.e. log likelihood
+
+        $$\ln p(y|f) = v_{i}(\ln \alpha + (\alpha - 1)\ln y_{i} + f_{i}) - y_{i}^{\alpha}\exp(f_{i})$$
+
+        :y: time of event data
+        :f: latent variables f
+        :extra_data: the censoring indicator, 1 for censored, 0 for not
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f)  # FIXME: CHECK THIS WITH BOOK, wheres scale?
+        return np.sum(objective)
+
+    def link_grad(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{d}{df} \ln p(y_{i}|f_{i}) = v_{i} - y_{i}\exp(f_{i})
+
+        :y: data
+        :f: latent variables f
+        :extra_data: the censoring indicator, 1 for censored, 0 for not
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        v = extra_data
+        grad = v - (y**self.shape)*np.exp(f)
+        return np.squeeze(grad)
+
+    def link_hess(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+
+        Will return diagonal of hessian, since every where else it is 0
+
+        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used hessian
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+
+        hess = (y**self.shape)*np.exp(f)
+        return np.squeeze(hess)

From 1420aa532c5df8eaf4e6db5b89e77f4b375ebf1c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 19 Apr 2013 12:23:00 +0100
Subject: [PATCH 027/252] Attempted to introduce gradient methods, won't work
 yet I doubt

---
 GPy/examples/__init__.py                |   1 +
 GPy/likelihoods/Laplace.py              | 120 ++++++++++++++++++------
 GPy/likelihoods/likelihood_functions.py |  58 +++++++++++-
 GPy/models/GP.py                        |  16 +++-
 GPy/util/linalg.py                      |  19 +++-
 5 files changed, 177 insertions(+), 37 deletions(-)

diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py
index 551bff54..68832e77 100644
--- a/GPy/examples/__init__.py
+++ b/GPy/examples/__init__.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
+import laplace_approximations
 import classification
 import regression
 import dimensionality_reduction
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 4d94ba0f..b1b41957 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -4,28 +4,9 @@ import GPy
 from scipy.linalg import cholesky, eig, inv, cho_solve, det
 from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv
+from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.lapack import dtrtrs
 
-#TODO: Move this to utils
-
-
-def det_ln_diag(A):
-    """
-    log determinant of a diagonal matrix
-    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
-    """
-    return np.log(np.diagonal(A)).sum()
-
-
-def pddet(A):
-    """
-    Determinant of a positive definite matrix
-    """
-    L = cholesky(A)
-    logdetA = 2*sum(np.log(np.diag(L)))
-    return logdetA
-
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
@@ -75,17 +56,92 @@ class Laplace(likelihood):
         return self.likelihood_function.predictive_values(mu, var)
 
     def _get_params(self):
-        return np.zeros(0)
+        return np.asarray(self.likelihood_function._get_params())
 
     def _get_param_names(self):
-        return []
+        return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
-        pass  # TODO: Laplace likelihood might want to take some parameters...
+        return self.likelihood_function._set_params()
+
+    def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
+        """
+        Find the gradients of the marginal likelihood w.r.t both thetaK and thetaL
+
+        dL_dthetaK differs from that of normal likelihoods as it has additional terms coming from
+        changes to y_tilde and changes to Sigma_tilde when the kernel parameters are adjusted
+
+        Similar terms arise when finding the gradients with respect to changes in the liklihood
+        parameters
+        """
+        return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
+
+    def _shared_gradients_components(self):
+        dL_dytil = -np.dot((self.K+self.Sigma_tilde), self.Y)
+        dytil_dfhat = np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        return dL_dytil, dytil_dfhat
+
+    def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
+        """
+                           #explicit                #implicit                     #implicit
+        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
+        :param dL_d_K_Sigma: Derivative of marginal with respect to K_prior+Sigma_tilde (posterior covariance)
+        :param dK_dthetaK: explcit derivative of kernel with respect to its hyper paramers
+        :returns: dL_dthetaK - gradients of marginal likelihood w.r.t changes in K hyperparameters
+        """
+        dL_dytil, dytil_dfhat = self._shared_gradients_components()
+
+        I_KW_i, _, _, _ = pdinv(np.eye(self.N) + np.dot(self.K, self.W))
+        #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
+        dfhat_dthetaK = I_KW_i*dK_dthetaK*self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
+
+        dytil_dthetaK = dytil_dfhat*dfhat_dthetaK
+
+        #FIXME: Careful dL_dK = dL_d_K_Sigma
+        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
+        dL_dSigma = dL_d_K_Sigma
+        d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+                     #explicit           #implicit
+        dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! SHOULD SUM OVER FHAT NOT THETAS
+        dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde)
+
+        dL_dthetaK_implicit = dL_dytil*dytil_dthetaK + dL_dSigma*dSigma_dthetaK
+        return dL_dthetaK_implicit
 
     def _gradients(self, partial):
-        return np.zeros(0)  # TODO: Laplace likelihood might want to take some parameters...
-        raise NotImplementedError
+        """
+        Gradients with respect to likelihood parameters
+
+        Complicated, it differs for parameters of the kernel \theta_{K}, and
+        parameters of the likelihood, \theta_{L}
+
+        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
+        dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL)
+        dL_dK*dK_dthetaL = 0
+
+        dytil_dthetaX = dytil_dfhat * dfhat_dthetaX
+        dytil_dfhat = Sigma*Ki + I
+
+        fhat = K*log_p(y|fhat)                                          from rasm p125
+        dfhat_dthetaK = (I + KW)i * dK_dthetaK * log_p(y|fhat)          from rasm p125
+
+        dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi
+        dW_dthetaX = d_dthetaX[d2phi_d2fhat]
+        d2phi_d2fhat = Hessian function of likelihood
+
+        partial = dL_dK
+        """
+        dL_dytil, dytil_dfhat = self._shared_gradients_components()
+        dfhat_dthetaL = self.likelihood_function.df_dtheta()
+
+        dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
+        dL_dSigma = partial # partial is dL_dK but K here is K+Sigma_tilde.... which is fine in this case
+
+        dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
+        dL_dthetaL = 0 + dL_dytil*dytil_dthetaL + dL_dSigma*dSigma_dthetaL
+        return dL_dthetaL
+        #return np.zeros(0)  # TODO: Laplace likelihood might want to take some parameters...
 
     def _compute_GP_variables(self):
         """
@@ -112,8 +168,9 @@ class Laplace(likelihood):
         $$\tilde{\Sigma} = W^{-1}$$
 
         """
-        epsilon = 1e-6
+        epsilon = 1e14
 
+        #Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
         #dtritri -> L -> L_i
         #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
         #((L.T*w)_i + I)f_hat = y_tilde
@@ -122,11 +179,12 @@ class Laplace(likelihood):
         Lt_W = np.dot(L.T, self.W)
 
         ##Check it isn't singular!
-        if cond(Lt_W) > 1e14:
+        if cond(Lt_W) > epsilon:
             print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
 
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
-        Y_tilde = np.dot(Lt_W_i_Li + np.eye(self.N), self.f_hat)
+        self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
+        Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
         #f.T(Ki + W)f
         f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
@@ -156,16 +214,16 @@ class Laplace(likelihood):
                    #)
 
         ##Check it isn't singular!
-        if cond(self.W) > 1e14:
+        if cond(self.W) > epsilon:
             print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
 
-        Sigma_tilde = inv(self.W)  # Damn
+        self.Sigma_tilde = inv(self.W)  # Damn
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
-        self.covariance_matrix = Sigma_tilde
+        self.covariance_matrix = self.Sigma_tilde
         self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
 
     def fit_full(self, K):
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index c759e15f..6e72b029 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -20,6 +20,16 @@ class likelihood_function:
     def __init__(self,location=0,scale=1):
         self.location = location
         self.scale = scale
+        self.log_concave = True
+
+    def _get_params(self):
+        return np.zeros(0)
+
+    def _get_param_names(self):
+        return []
+
+    def _set_params(self, p):
+        pass
 
 class probit(likelihood_function):
     """
@@ -149,12 +159,22 @@ class student_t(likelihood_function):
     d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free, sigma=2):
+        super(student_t, self).__init__()
         self.v = deg_free
         self.sigma = sigma
-
-        #FIXME: This should be in the superclass
         self.log_concave = False
 
+    def _get_params(self):
+        return np.asarray(self.sigma)
+
+    def _get_param_names(self):
+        return ["t_noise_variance"]
+
+    def _set_params(self, x):
+        self.sigma = float(x)
+        #self.covariance_matrix = np.eye(self.N)*self._variance
+        #self.precision = 1./self._variance
+
     @property
     def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * (self.sigma**2)
@@ -222,6 +242,40 @@ class student_t(likelihood_function):
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
         return np.squeeze(hess)
 
+    def d3link(self, y, f, extra_data=None):
+        """
+        Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+
+        $$\frac{-2(v+1)((f-y)^{3} - 3\sigma^{2}v(f-y))}{((f-y)^{2} + \sigma^{2}v)^{3}}$$
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        #NB f-y not y-f
+        e = f - y
+        d3link_d3f = (  (-2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
+                      / ((e**2 + (self.sigma**2)*self.v)**3)
+                     )
+        return d3link_d3f
+
+    def link_hess_grad_sigma(self, y, f, extra_data=None):
+        """
+        Gradient of the hessian w.r.t sigma parameter
+
+        $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}}
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        hess_grad_sigma = (  (2*self.sigma*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
+                           / ((e**2 + (self.sigma**2)*self.v)**3)
+                          )
+        return hess_grad_sigma
+
+    def _gradients(self, y, f, extra_data=None):
+        return [self.link_hess_grad_sigma] # list as we might learn many parameters
+
     def predictive_values(self, mu, var):
         """
         Compute  mean, and conficence interval (percentiles 5 and 95) of the prediction
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index cfda0cfe..1024b5ef 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -8,7 +8,7 @@ from .. import kern
 from ..core import model
 from ..util.linalg import pdinv,mdot
 from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango
-from ..likelihoods import EP
+from ..likelihoods import EP, Laplace
 
 class GP(model):
     """
@@ -128,7 +128,19 @@ class GP(model):
 
         For the likelihood parameters, pass in alpha = K^-1 y
         """
-        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK,X=self.X,slices1=self.Xslices,slices2=self.Xslices), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+        if isinstance(self.likelihood, Laplace):
+            dL_dthetaK_explicit = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+            #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
+            fake_dL_dKs = np.ones(self.dL_dK.shape)
+            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+
+            dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
+            dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        else:
+            dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        return np.hstack((dL_dthetaK, dL_dthetaL))
 
     def _raw_predict(self,_Xnew,slices=None, full_cov=False):
         """
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index f88099a4..cb899397 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -14,6 +14,21 @@ import types
 #import scipy.lib.lapack.flapack
 import scipy as sp
 
+def det_ln_diag(A):
+    """
+    log determinant of a diagonal matrix
+    $$\ln |A| = \ln \prod{A_{ii}} = \sum{\ln A_{ii}}$$
+    """
+    return np.log(np.diagonal(A)).sum()
+
+def pddet(A):
+    """
+    Determinant of a positive definite matrix
+    """
+    L = cholesky(A)
+    logdetA = 2*sum(np.log(np.diag(L)))
+    return logdetA
+
 def trace_dot(a,b):
     """
     efficiently compute the trace of the matrix product of a and b
@@ -166,8 +181,8 @@ def PCA(Y, Q):
     """
     if not np.allclose(Y.mean(axis=0), 0.0):
         print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"
-        
-        #Y -= Y.mean(axis=0) 
+
+        #Y -= Y.mean(axis=0)
 
     Z = linalg.svd(Y-Y.mean(axis=0), full_matrices = False)
     [X, W] = [Z[0][:,0:Q], np.dot(np.diag(Z[1]), Z[2]).T[:,0:Q]]

From 267a8e427c147aa5ac98e3f42c58d90492e53b4c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 19 Apr 2013 17:41:01 +0100
Subject: [PATCH 028/252] Adding gradients, shapes starting to make sense

---
 GPy/likelihoods/Laplace.py              | 53 ++++++++++++++++---------
 GPy/likelihoods/likelihood_functions.py | 28 +++++++++----
 GPy/models/GP.py                        |  6 +--
 GPy/util/linalg.py                      |  2 +-
 4 files changed, 60 insertions(+), 29 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index b1b41957..b5c0bdfe 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -1,11 +1,12 @@
 import numpy as np
 import scipy as sp
 import GPy
-from scipy.linalg import cholesky, eig, inv, cho_solve, det
+from scipy.linalg import inv, cho_solve, det
 from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.lapack import dtrtrs
+import pylab as plt
 
 
 class Laplace(likelihood):
@@ -62,7 +63,7 @@ class Laplace(likelihood):
         return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
-        return self.likelihood_function._set_params()
+        return self.likelihood_function._set_params(p)
 
     def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
         """
@@ -77,8 +78,8 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        dL_dytil = -np.dot((self.K+self.Sigma_tilde), self.Y)
-        dytil_dfhat = np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde))
+        dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -91,12 +92,18 @@ class Laplace(likelihood):
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
-        I_KW_i, _, _, _ = pdinv(np.eye(self.N) + np.dot(self.K, self.W))
+        A = np.eye(self.N) + np.dot(self.K, self.W)
+        plt.imshow(A)
+        plt.show()
+        I_KW_i, _, _, _ = pdinv(A)
+
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
-        dfhat_dthetaK = I_KW_i*dK_dthetaK*self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-
-        dytil_dthetaK = dytil_dfhat*dfhat_dthetaK
+        #Derivative for each f dimension, for each of K's hyper parameters
+        dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
+        for ind_j, thetaj in enumerate(dK_dthetaK):
+            dfhat_dthetaK[:, ind_j] = mdot(I_KW_i, thetaj, self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data))
 
+        dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
         #FIXME: Careful dL_dK = dL_d_K_Sigma
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
         dL_dSigma = dL_d_K_Sigma
@@ -105,8 +112,9 @@ class Laplace(likelihood):
         dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! SHOULD SUM OVER FHAT NOT THETAS
         dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde)
 
-        dL_dthetaK_implicit = dL_dytil*dytil_dthetaK + dL_dSigma*dSigma_dthetaK
-        return dL_dthetaK_implicit
+        dL_dthetaK_implicit = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)# + np.dot(dL_dSigma, dSigma_dthetaK)
+        #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
+        return np.squeeze(dL_dthetaK_implicit)
 
     def _gradients(self, partial):
         """
@@ -132,16 +140,25 @@ class Laplace(likelihood):
         partial = dL_dK
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
-        dfhat_dthetaL = self.likelihood_function.df_dtheta()
+        dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
 
-        dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-        dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
+        #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        #Derivative for each f dimension, for each of K's hyper parameters
+        dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
+        for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
+            dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
+                                             dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
+                                             self.Sigma_tilde
+                                             )
+
+        #TODO: This is Wi*A*Wi, can be more numerically stable with a trick
+        #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
         dL_dSigma = partial # partial is dL_dK but K here is K+Sigma_tilde.... which is fine in this case
 
-        dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
-        dL_dthetaL = 0 + dL_dytil*dytil_dthetaL + dL_dSigma*dSigma_dthetaL
-        return dL_dthetaL
-        #return np.zeros(0)  # TODO: Laplace likelihood might want to take some parameters...
+        #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
+        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
+        dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
+        return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
         """
@@ -335,7 +352,7 @@ class Laplace(likelihood):
         rs = 0
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
-            f_old = f.copy()
+            #f_old = f.copy()
             W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
             if not self.likelihood_function.log_concave:
                 W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 6e72b029..64791047 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -159,10 +159,10 @@ class student_t(likelihood_function):
     d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free, sigma=2):
-        super(student_t, self).__init__()
         self.v = deg_free
         self.sigma = sigma
         self.log_concave = False
+        #super(student_t, self).__init__()
 
     def _get_params(self):
         return np.asarray(self.sigma)
@@ -258,9 +258,9 @@ class student_t(likelihood_function):
                      )
         return d3link_d3f
 
-    def link_hess_grad_sigma(self, y, f, extra_data=None):
+    def link_hess_grad_std(self, y, f, extra_data=None):
         """
-        Gradient of the hessian w.r.t sigma parameter
+        Gradient of the hessian w.r.t sigma parameter (standard deviation)
 
         $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}}
         """
@@ -273,8 +273,24 @@ class student_t(likelihood_function):
                           )
         return hess_grad_sigma
 
+    def link_grad_std(self, y, f, extra_data=None):
+        """
+        Gradient of the likelihood w.r.t sigma parameter (standard deviation)
+
+        $$\frac{-2\sigma(v+1)(y-f)}{(v\sigma^{2} + (y-f)^{2})^{2}}$$
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
+                      / ((self.v*(self.sigma**2) + e**2)**2)
+                     )
+        return grad_sigma
+
     def _gradients(self, y, f, extra_data=None):
-        return [self.link_hess_grad_sigma] # list as we might learn many parameters
+        return [self.link_grad_std(y, f, extra_data=extra_data)[:, None],
+                self.link_hess_grad_std(y, f, extra_data=extra_data)[:, None]] # list as we might learn many parameters
 
     def predictive_values(self, mu, var):
         """
@@ -372,9 +388,7 @@ class weibull_survival(likelihood_function):
     def __init__(self, shape, scale):
         self.shape = shape
         self.scale = scale
-
-        #FIXME: This should be in the superclass
-        self.log_concave = True
+        self.log_concave = True # Or false?
 
     def link_function(self, y, f, extra_data=None):
         """
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 1024b5ef..24037afe 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -128,17 +128,17 @@ class GP(model):
 
         For the likelihood parameters, pass in alpha = K^-1 y
         """
+        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
         if isinstance(self.likelihood, Laplace):
-            dL_dthetaK_explicit = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+            dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
             fake_dL_dKs = np.ones(self.dL_dK.shape)
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
 
             dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
             dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
-            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            dL_dthetaL = self.likelihood._gradients(partial=self.dL_dK)
         else:
-            dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
         return np.hstack((dL_dthetaK, dL_dthetaL))
 
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index cb899397..20293ed8 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -25,7 +25,7 @@ def pddet(A):
     """
     Determinant of a positive definite matrix
     """
-    L = cholesky(A)
+    L = jitchol(A)
     logdetA = 2*sum(np.log(np.diag(L)))
     return logdetA
 

From 9de0b23f65470dfa3ec2fad756f2ab901f29ef0c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 29 Apr 2013 18:08:46 +0100
Subject: [PATCH 029/252] Plotting problematic kernel

---
 GPy/likelihoods/Laplace.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index b5c0bdfe..9cacb0e1 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -92,9 +92,12 @@ class Laplace(likelihood):
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
-        A = np.eye(self.N) + np.dot(self.K, self.W)
-        plt.imshow(A)
-        plt.show()
+        print "Computing K gradients"
+        I = np.eye(self.N)
+        C = np.dot(self.K, self.W)
+        A = I + C
+        #plt.imshow(A)
+        #plt.show()
         I_KW_i, _, _, _ = pdinv(A)
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
@@ -250,6 +253,8 @@ class Laplace(likelihood):
         :K: Covariance matrix
         """
         self.K = K.copy()
+        #assert np.all(self.K.T == self.K)
+        #self.K_safe = K.copy()
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:

From f95666a8f9cb07209d80226ed1c5b0352b9eed75 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 6 May 2013 10:15:39 +0100
Subject: [PATCH 030/252] Merging

---
 GPy/likelihoods/Laplace.py |  1 +
 GPy/models/GP.py           | 15 +++++----------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 9cacb0e1..5e28212e 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -98,6 +98,7 @@ class Laplace(likelihood):
         A = I + C
         #plt.imshow(A)
         #plt.show()
+        ki, _, _, _ = pdinv(self.K)
         I_KW_i, _, _, _ = pdinv(A)
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index d353e5dd..96ec6582 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -6,15 +6,9 @@ import numpy as np
 import pylab as pb
 from .. import kern
 from ..core import model
-<<<<<<< HEAD
-from ..util.linalg import pdinv,mdot
-from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango
-from ..likelihoods import EP, Laplace
-=======
 from ..util.linalg import pdinv, mdot
 from ..util.plot import gpplot, x_frame1D, x_frame2D, Tango
-from ..likelihoods import EP
->>>>>>> upstream/devel
+from ..likelihoods import EP, Laplace
 
 class GP(model):
     """
@@ -34,6 +28,7 @@ class GP(model):
 
     """
     def __init__(self, X, likelihood, kernel, normalize_X=False):
+        self.has_uncertain_inputs=False
 
         # parse arguments
         self.X = X
@@ -128,12 +123,12 @@ class GP(model):
 
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
-        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
         if isinstance(self.likelihood, Laplace):
             dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
             fake_dL_dKs = np.ones(self.dL_dK.shape)
-            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X, slices1=self.Xslices, slices2=self.Xslices)
+            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
             dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
             dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
@@ -251,7 +246,7 @@ class GP(model):
         else:
             raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
 
-    def plot(self, samples=0, plot_limits=None, which_data='all', which_functions='all', resolution=None, levels=20):
+    def plot(self, samples=0, plot_limits=None, which_data='all', which_functions='all', which_parts='all', resolution=None, levels=20):
         """
         TODO: Docstrings!
         :param levels: for 2D plotting, the number of contour levels to use

From a52c20f47008233495e20d96b4ab50be8eb7d4a3 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 7 May 2013 13:35:47 +0100
Subject: [PATCH 031/252] Added a debug examples

---
 GPy/examples/laplace_approximations.py | 84 +++++++++++++++++++++++++-
 GPy/likelihoods/Laplace.py             | 23 +++++--
 GPy/models/GP.py                       |  6 +-
 3 files changed, 104 insertions(+), 9 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 5d1c1224..7e5c55bf 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -35,12 +35,86 @@ def timing():
     print the_is
     print np.mean(the_is)
 
+def debug_student_t_noise_approx():
+    real_var = 0.2
+    #Start a function, any function
+    X = np.linspace(0.0, 10.0, 30)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+
+    X_full = np.linspace(0.0, 10.0, 500)[:, None]
+    Y_full = np.sin(X_full)
+
+    #Y = Y/Y.max()
+
+    #Add student t random noise to datapoints
+    deg_free = 10000
+    real_sd = np.sqrt(real_var)
+    print "Real noise: ", real_sd
+
+    initial_var_guess = 0.01
+    #t_rv = t(deg_free, loc=0, scale=real_var)
+    #noise = t_rvrvs(size=Y.shape)
+    #Y += noise
+
+    plt.figure(1)
+    plt.suptitle('Gaussian likelihood')
+    # Kernel object
+    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()
+    kernel5 = kernel1.copy()
+    kernel6 = kernel1.copy()
+
+    print "Clean Gaussian"
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+    # plot
+    plt.subplot(131)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    print m
+
+    plt.suptitle('Student-t likelihood')
+    edited_real_sd = initial_var_guess #real_sd
+
+    print "Clean student t, rasm"
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(132)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    print "Clean student t, ncg"
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(133)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+
+    plt.show()
 
 def student_t_approx():
     """
     Example of regressing with a student t likelihood
     """
-    real_var = 0.1
+    real_var = 0.2
     #Start a function, any function
     X = np.linspace(0.0, 10.0, 30)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
@@ -58,8 +132,11 @@ def student_t_approx():
     #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10
+    deg_free = 1000000000000
     real_sd = np.sqrt(real_var)
+    print "Real noise: ", real_sd
+
+    initial_var_guess = 0.01
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
@@ -73,6 +150,7 @@ def student_t_approx():
     #print corrupted_indices
     #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
     #Y[corrupted_indices] += noise
+
     plt.figure(1)
     plt.suptitle('Gaussian likelihood')
     # Kernel object
@@ -108,7 +186,7 @@ def student_t_approx():
 
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
-    edited_real_sd = real_sd
+    edited_real_sd = initial_var_guess #real_sd
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 5e28212e..02f2c93f 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -5,7 +5,7 @@ from scipy.linalg import inv, cho_solve, det
 from numpy.linalg import cond
 from GPy.likelihoods.likelihood import likelihood
 from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
-from scipy.linalg.lapack import dtrtrs
+from scipy.linalg.flapack import dtrtrs
 import pylab as plt
 
 
@@ -63,6 +63,7 @@ class Laplace(likelihood):
         return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
+        print "Setting noise sd: ", p
         return self.likelihood_function._set_params(p)
 
     def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -79,7 +80,9 @@ class Laplace(likelihood):
 
     def _shared_gradients_components(self):
         dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde))
-        dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        #dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        Ki = inv(self.K)
+        dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -93,19 +96,26 @@ class Laplace(likelihood):
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
         print "Computing K gradients"
+        print "dytil_dfhat: ", np.mean(dytil_dfhat)
         I = np.eye(self.N)
         C = np.dot(self.K, self.W)
         A = I + C
         #plt.imshow(A)
         #plt.show()
-        ki, _, _, _ = pdinv(self.K)
-        I_KW_i, _, _, _ = pdinv(A)
+
+        #FIXME: K ISNT SYMMETRIC SO NEITHER IS A AND IT MAKES IT NON-PD!
+        #ki, _, _, _ = pdinv(self.K)
+        #I_KW_i, _, _, _ = pdinv(A)
+
+        I_KW_i = inv(A)
+
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
         #Derivative for each f dimension, for each of K's hyper parameters
         dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
+        grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
         for ind_j, thetaj in enumerate(dK_dthetaK):
-            dfhat_dthetaK[:, ind_j] = mdot(I_KW_i, thetaj, self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data))
+            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad))
 
         dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
         #FIXME: Careful dL_dK = dL_d_K_Sigma
@@ -116,8 +126,11 @@ class Laplace(likelihood):
         dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! SHOULD SUM OVER FHAT NOT THETAS
         dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde)
 
+        print "dL_dytil: ", np.mean(dL_dytil)
+        print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
         dL_dthetaK_implicit = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)# + np.dot(dL_dSigma, dSigma_dthetaK)
         #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
+        import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
         return np.squeeze(dL_dthetaK_implicit)
 
     def _gradients(self, partial):
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 96ec6582..07c7a708 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -116,7 +116,6 @@ class GP(model):
         """
         return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
 
-
     def _log_likelihood_gradients(self):
         """
         The gradient of all parameters.
@@ -132,9 +131,14 @@ class GP(model):
 
             dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
             dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
+
+            print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
+
             dL_dthetaL = self.likelihood._gradients(partial=self.dL_dK)
         else:
+            print "dL_dthetaK: ", dL_dthetaK
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        print "dL_dthetaL: ", dL_dthetaL
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
 

From 84f12c1079a10db7dfe0737c5de1ca5b74d3b2d0 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 8 May 2013 12:36:31 +0100
Subject: [PATCH 032/252] Scale and switch KW+I

---
 GPy/examples/laplace_approximations.py |  5 ++--
 GPy/likelihoods/Laplace.py             | 37 +++++++++++++++-----------
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 7e5c55bf..704297ef 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -36,7 +36,7 @@ def timing():
     print np.mean(the_is)
 
 def debug_student_t_noise_approx():
-    real_var = 0.2
+    real_var = 0.1
     #Start a function, any function
     X = np.linspace(0.0, 10.0, 30)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
@@ -44,7 +44,7 @@ def debug_student_t_noise_approx():
     X_full = np.linspace(0.0, 10.0, 500)[:, None]
     Y_full = np.sin(X_full)
 
-    #Y = Y/Y.max()
+    Y = Y/Y.max()
 
     #Add student t random noise to datapoints
     deg_free = 10000
@@ -56,6 +56,7 @@ def debug_student_t_noise_approx():
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
 
+    plt.close('all')
     plt.figure(1)
     plt.suptitle('Gaussian likelihood')
     # Kernel object
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 02f2c93f..934b2a90 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -3,8 +3,8 @@ import scipy as sp
 import GPy
 from scipy.linalg import inv, cho_solve, det
 from numpy.linalg import cond
-from GPy.likelihoods.likelihood import likelihood
-from GPy.util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
+from likelihood import likelihood
+from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.flapack import dtrtrs
 import pylab as plt
 
@@ -79,10 +79,10 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde))
-        #dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
-        Ki = inv(self.K)
-        dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5?
+        dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        #Ki = inv(self.K)
+        #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -95,6 +95,10 @@ class Laplace(likelihood):
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
+        d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+
+        dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
+
         print "Computing K gradients"
         print "dytil_dfhat: ", np.mean(dytil_dfhat)
         I = np.eye(self.N)
@@ -103,12 +107,7 @@ class Laplace(likelihood):
         #plt.imshow(A)
         #plt.show()
 
-        #FIXME: K ISNT SYMMETRIC SO NEITHER IS A AND IT MAKES IT NON-PD!
-        #ki, _, _, _ = pdinv(self.K)
-        #I_KW_i, _, _, _ = pdinv(A)
-
-        I_KW_i = inv(A)
-
+        I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
         #Derivative for each f dimension, for each of K's hyper parameters
@@ -121,14 +120,20 @@ class Laplace(likelihood):
         #FIXME: Careful dL_dK = dL_d_K_Sigma
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
         dL_dSigma = dL_d_K_Sigma
-        d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+        #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
                      #explicit           #implicit
-        dSigmai_dthetaK = 0 #+ np.sum(d3phi_d3fhat*dfhat_dthetaK) #FIXME: CAREFUL OF THIS SUM! SHOULD SUM OVER FHAT NOT THETAS
-        dSigma_dthetaK = -mdot(self.Sigma_tilde, dSigmai_dthetaK, self.Sigma_tilde)
+        dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
+        dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
+        for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
+            dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
 
         print "dL_dytil: ", np.mean(dL_dytil)
         print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
-        dL_dthetaK_implicit = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)# + np.dot(dL_dSigma, dSigma_dthetaK)
+
+        #FIXME: Won't handle multi dimensional data
+        dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
+        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=(0,1))
+        dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
         #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
         import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
         return np.squeeze(dL_dthetaK_implicit)

From 6c4866662c9f20dbc3a9a5d08aab85bf95e1e84d Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 8 May 2013 16:05:01 +0100
Subject: [PATCH 033/252] Seem to have gradients much closer now

---
 GPy/examples/laplace_approximations.py  | 34 +++++----
 GPy/likelihoods/Laplace.py              | 99 ++++++++++++++++++-------
 GPy/likelihoods/likelihood_functions.py | 19 +++--
 GPy/models/GP.py                        | 18 +++--
 4 files changed, 110 insertions(+), 60 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 704297ef..57ae9be7 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -36,6 +36,7 @@ def timing():
     print np.mean(the_is)
 
 def debug_student_t_noise_approx():
+    plot = False
     real_var = 0.1
     #Start a function, any function
     X = np.linspace(0.0, 10.0, 30)[:, None]
@@ -57,8 +58,6 @@ def debug_student_t_noise_approx():
     #Y += noise
 
     plt.close('all')
-    plt.figure(1)
-    plt.suptitle('Gaussian likelihood')
     # Kernel object
     kernel1 = GPy.kern.rbf(X.shape[1])
     kernel2 = kernel1.copy()
@@ -75,12 +74,14 @@ def debug_student_t_noise_approx():
     m.ensure_default_constraints()
     m.optimize()
     # plot
-    plt.subplot(131)
-    m.plot()
-    plt.plot(X_full, Y_full)
+    if plot:
+        plt.figure(1)
+        plt.suptitle('Gaussian likelihood')
+        plt.subplot(131)
+        m.plot()
+        plt.plot(X_full, Y_full)
     print m
 
-    plt.suptitle('Student-t likelihood')
     edited_real_sd = initial_var_guess #real_sd
 
     print "Clean student t, rasm"
@@ -91,10 +92,12 @@ def debug_student_t_noise_approx():
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
-    plt.subplot(132)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
+    if plot:
+        plt.suptitle('Student-t likelihood')
+        plt.subplot(132)
+        m.plot()
+        plt.plot(X_full, Y_full)
+        plt.ylim(-2.5, 2.5)
 
     print "Clean student t, ncg"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -104,12 +107,13 @@ def debug_student_t_noise_approx():
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
-    plt.subplot(133)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
+    if plot:
+        plt.subplot(133)
+        m.plot()
+        plt.plot(X_full, Y_full)
+        plt.ylim(-2.5, 2.5)
 
-    plt.show()
+    #plt.show()
 
 def student_t_approx():
     """
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 934b2a90..566e4e25 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -5,8 +5,8 @@ from scipy.linalg import inv, cho_solve, det
 from numpy.linalg import cond
 from likelihood import likelihood
 from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
-from scipy.linalg.flapack import dtrtrs
-import pylab as plt
+from scipy.linalg.lapack import dtrtrs
+#import pylab as plt
 
 
 class Laplace(likelihood):
@@ -79,9 +79,9 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5?
+        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
         dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
-        #Ki = inv(self.K)
+        #Ki, _, _, _ = pdinv(self.K)
         #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
         return dL_dytil, dytil_dfhat
 
@@ -95,9 +95,8 @@ class Laplace(likelihood):
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
-        d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
 
-        dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
+        #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
 
         print "Computing K gradients"
         print "dytil_dfhat: ", np.mean(dytil_dfhat)
@@ -107,7 +106,8 @@ class Laplace(likelihood):
         #plt.imshow(A)
         #plt.show()
 
-        I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
+        #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
+        I_KW_i = self.Bi # could use self.B_chol??
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
         #Derivative for each f dimension, for each of K's hyper parameters
@@ -117,25 +117,44 @@ class Laplace(likelihood):
             dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad))
 
         dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
-        #FIXME: Careful dL_dK = dL_d_K_Sigma
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
         dL_dSigma = dL_d_K_Sigma
         #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
                      #explicit           #implicit
-        dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
-        dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
-        for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
-            dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
-
-        print "dL_dytil: ", np.mean(dL_dytil)
-        print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
+        #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
+        #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
+        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+        Wi = np.diagonal(self.Sigma_tilde) #Convenience
+        dSigma_dthetaK_explicit = 0
+        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
+        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
+        #dSigma_dthetaK_implicit = -np.sum(np.dot(dWi_dfhat, dfhat_dthetaK), axis=0)
+        dSigma_dthetaK_implicit = np.dot(dWi_dfhat, dfhat_dthetaK)
+        dSigma_dthetaK = dSigma_dthetaK_explicit + dSigma_dthetaK_implicit
+        #dSigma_dthetaK = 0 + np.dot(, dfhat_dthetaK)
+        #for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
+            #dSigma_dthetaK_explicit = 0
+            #dSigma_dthetaK_implicit = -np.dot(Wi, dW_dfhat
+            #dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
 
         #FIXME: Won't handle multi dimensional data
         dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
-        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=(0,1))
+        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
         dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
         #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
-        import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+
+        #print "\n"
+        #print "dL_dytil: ", np.mean(dL_dytil)
+        #print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
+        #print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
+        #print "\n"
+        #print "dL_dSigma: ", np.mean(dL_dSigma)
+        #print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK)
+        #print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
+        #print "\n"
+        #print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
+        #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+
         return np.squeeze(dL_dthetaK_implicit)
 
     def _gradients(self, partial):
@@ -159,27 +178,51 @@ class Laplace(likelihood):
         dW_dthetaX = d_dthetaX[d2phi_d2fhat]
         d2phi_d2fhat = Hessian function of likelihood
 
-        partial = dL_dK
+        partial = dL_d_K_Sigma
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
-        dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+
+        dlikelihood_dthetaL_explicit, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        dlikelihood_dfhat = self.likelihood_function.link_hess(self.data, self.f_hat, self.extra_data)
+        dfhat_dthetaL_cyclic = 0 #what is this? how can dfhat_dthetaL be used in the value of itself?
+        dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
+        dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
+        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
+
+        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
+        dL_dSigma = partial #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
+
+        Wi = np.diagonal(self.Sigma_tilde) #Convenience
+        #-1 as we are looking at W which is -1*d2log p(y|f)
+        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
+        dSigma_dthetaL_explicit = np.diagflat(-(Wi*(-1*d2likelihood_dthetaL)*Wi))
+
+        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
+        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL_cyclic)
+        dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
 
         #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
         #Derivative for each f dimension, for each of K's hyper parameters
-        dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
-        for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
-            dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
-                                             dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
-                                             self.Sigma_tilde
-                                             )
+        #dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
+        #for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
+            #dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
+                                             #dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
+                                             #self.Sigma_tilde
+                                             #)
 
         #TODO: This is Wi*A*Wi, can be more numerically stable with a trick
         #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
-        dL_dSigma = partial # partial is dL_dK but K here is K+Sigma_tilde.... which is fine in this case
 
         #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
-        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
-        dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
+        #dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
+        #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
+
+        dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
+        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
+        dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
+
         return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index cd6467d7..2176aac0 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -248,17 +248,16 @@ class student_t(likelihood_function):
         """
         Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
 
-        $$\frac{-2(v+1)((f-y)^{3} - 3\sigma^{2}v(f-y))}{((f-y)^{2} + \sigma^{2}v)^{3}}$$
+        $$\frac{2(v+1)((y-f)^{3} - 3\sigma^{2}v(y-f))}{((y-f)^{2} + \sigma^{2}v)^{3}}$$
         """
         y = np.squeeze(y)
         f = np.squeeze(f)
         assert y.shape == f.shape
-        #NB f-y not y-f
-        e = f - y
-        d3link_d3f = (  (-2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
+        e = y - f
+        d3link_d3f = (  (2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
                       / ((e**2 + (self.sigma**2)*self.v)**3)
                      )
-        return d3link_d3f
+        return np.squeeze(d3link_d3f)
 
     def link_hess_grad_std(self, y, f, extra_data=None):
         """
@@ -270,10 +269,10 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        hess_grad_sigma = (  (2*self.sigma*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
+        hess_grad_sigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
                            / ((e**2 + (self.sigma**2)*self.v)**3)
                           )
-        return hess_grad_sigma
+        return np.squeeze(hess_grad_sigma)
 
     def link_grad_std(self, y, f, extra_data=None):
         """
@@ -288,11 +287,11 @@ class student_t(likelihood_function):
         grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
                       / ((self.v*(self.sigma**2) + e**2)**2)
                      )
-        return grad_sigma
+        return np.squeeze(grad_sigma)
 
     def _gradients(self, y, f, extra_data=None):
-        return [self.link_grad_std(y, f, extra_data=extra_data)[:, None],
-                self.link_hess_grad_std(y, f, extra_data=extra_data)[:, None]] # list as we might learn many parameters
+        return [self.link_grad_std(y, f, extra_data=extra_data),
+                self.link_hess_grad_std(y, f, extra_data=extra_data)] # list as we might learn many parameters
 
     def predictive_values(self, mu, var):
         """
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index a346b47b..1682ee6c 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -125,19 +125,23 @@ class GP(model):
         if isinstance(self.likelihood, Laplace):
             dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            fake_dL_dKs = np.ones(self.dL_dK.shape)
+            fake_dL_dKs = np.eye(self.dL_dK.shape[0])
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
-            dL_dthetaK_implicit = self.likelihood._Kgradients(self.dL_dK, dK_dthetaK)
+            #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
+            dL_dthetaK_implicit = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
             dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
 
-            print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
+            #print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
 
-            dL_dthetaL = self.likelihood._gradients(partial=self.dL_dK)
-        else:
-            print "dL_dthetaK: ", dL_dthetaK
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-        print "dL_dthetaL: ", dL_dthetaL
+            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        else:
+            #print "dL_dthetaK: ", dL_dthetaK
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+        #print "dL_dthetaL: ", dL_dthetaL
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
 

From 9500b12b532e2f9abd68621a0ce8662e4553cb2c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 8 May 2013 20:53:23 +0100
Subject: [PATCH 034/252] Working on putting callback to update laplace in
 callback

---
 GPy/inference/optimization.py           | 13 ++++++++++++-
 GPy/likelihoods/Laplace.py              |  1 -
 GPy/likelihoods/likelihood_functions.py |  4 ++++
 GPy/models/GP.py                        | 10 ++++++++++
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/GPy/inference/optimization.py b/GPy/inference/optimization.py
index 75cd94ba..1445eed0 100644
--- a/GPy/inference/optimization.py
+++ b/GPy/inference/optimization.py
@@ -29,7 +29,7 @@ class Optimizer():
     :rtype: optimizer object.
 
     """
-    def __init__(self, x_init, messages=False, model = None, max_f_eval=1e4, max_iters = 1e3, ftol=None, gtol=None, xtol=None):
+    def __init__(self, x_init, messages=False, model = None, max_f_eval=1e4, max_iters = 1e3, ftol=None, gtol=None, xtol=None, callback=None):
         self.opt_name = None
         self.x_init = x_init
         self.messages = messages
@@ -45,6 +45,7 @@ class Optimizer():
         self.gtol = gtol
         self.ftol = ftol
         self.model = model
+        self.callback = callback
 
     def run(self, **kwargs):
         start = dt.datetime.now()
@@ -94,6 +95,8 @@ class opt_tnc(Optimizer):
             opt_dict['ftol'] = self.ftol
         if self.gtol is not None:
             opt_dict['pgtol'] = self.gtol
+        if self.callback is not None:
+            opt_dict['callback'] = self.callback
 
         opt_result = optimize.fmin_tnc(f_fp, self.x_init, messages = self.messages,
                        maxfun = self.max_f_eval, **opt_dict)
@@ -128,6 +131,8 @@ class opt_lbfgsb(Optimizer):
             print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it"
         if self.gtol is not None:
             opt_dict['pgtol'] = self.gtol
+        if self.callback is not None:
+            opt_dict['callback'] = self.callback
 
         opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint = iprint,
                                             maxfun = self.max_f_eval, **opt_dict)
@@ -155,6 +160,8 @@ class opt_simplex(Optimizer):
             opt_dict['ftol'] = self.ftol
         if self.gtol is not None:
             print "WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it"
+        if self.callback is not None:
+            opt_dict['callback'] = self.callback
 
         opt_result = optimize.fmin(f, self.x_init, (), disp = self.messages,
                    maxfun = self.max_f_eval, full_output=True, **opt_dict)
@@ -187,6 +194,8 @@ class opt_rasm(Optimizer):
             print "WARNING: minimize doesn't have an ftol arg, so I'm going to ignore it"
         if self.gtol is not None:
             print "WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it"
+        if self.callback is not None:
+            print "WARNING: minimize doesn't have a callback arg, so I'm going to ignore it"
 
         opt_result = rasm.minimize(self.x_init, f_fp, (), messages = self.messages,
                                    maxnumfuneval = self.max_f_eval)
@@ -205,6 +214,8 @@ class opt_SCG(Optimizer):
     def opt(self, f_fp = None, f = None, fp = None):
         assert not f is None
         assert not fp is None
+        if self.callback is not None:
+            print "WARNING: SCG doesn't have a callback arg, so I'm going to ignore it"
         opt_result = SCG(f,fp,self.x_init, display=self.messages, maxiters=self.max_iters, max_f_eval=self.max_f_eval, xtol=self.xtol, ftol=self.ftol)
         self.x_opt = opt_result[0]
         self.trace = opt_result[1]
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 566e4e25..208b1102 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -63,7 +63,6 @@ class Laplace(likelihood):
         return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
-        print "Setting noise sd: ", p
         return self.likelihood_function._set_params(p)
 
     def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 2176aac0..61c79385 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -166,6 +166,8 @@ class student_t(likelihood_function):
         self.log_concave = False
         #super(student_t, self).__init__()
 
+        self._set_params(np.asarray(sigma))
+
     def _get_params(self):
         return np.asarray(self.sigma)
 
@@ -174,6 +176,8 @@ class student_t(likelihood_function):
 
     def _set_params(self, x):
         self.sigma = float(x)
+        print "Setting student t sigma: ", x
+        print x
         #self.covariance_matrix = np.eye(self.N)*self._variance
         #self.precision = 1./self._variance
 
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 1682ee6c..79284b59 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -86,6 +86,16 @@ class GP(model):
     def _get_param_names(self):
         return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
 
+    def _update_params_callback(self, p):
+        #FIXME:Check the transforming
+        #Set the new parameters of the kernel and likelihood within the optimization
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
+        self.likelihood._set_params(p[self.kern.Nparam_transformed():])
+        #update the likelihood approximation within the optimisation with the current parameters
+        self.update_likelihood_approximation()
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
     def update_likelihood_approximation(self):
         """
         Approximates a non-gaussian likelihood using Expectation Propagation

From 5472c5c6ba445c49fcdb98ccef4635f17a801b28 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 13 May 2013 18:36:02 +0100
Subject: [PATCH 035/252] Almost have likelihood gradients working but kernels
 still way off

---
 GPy/examples/laplace_approximations.py  | 39 ++++++-----
 GPy/likelihoods/Laplace.py              | 88 ++++++++++++++++---------
 GPy/likelihoods/likelihood_functions.py |  4 +-
 GPy/models/GP.py                        | 20 +++---
 4 files changed, 91 insertions(+), 60 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 57ae9be7..2054881c 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -52,7 +52,7 @@ def debug_student_t_noise_approx():
     real_sd = np.sqrt(real_var)
     print "Real noise: ", real_sd
 
-    initial_var_guess = 0.01
+    initial_var_guess = 1
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
@@ -84,14 +84,21 @@ def debug_student_t_noise_approx():
 
     edited_real_sd = initial_var_guess #real_sd
 
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
-    m.ensure_default_constraints()
+    #m.constrain_positive('rbf')
+    m.constrain_fixed('rbf_v', 1.0898)
+    m.constrain_fixed('rbf_l', 1.8651)
+    m.constrain_positive('t_noi')
+    #m.constrain_fixed('t_noise_variance', real_sd)
     m.update_likelihood_approximation()
-    m.optimize()
+    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
+    m.optimize('scg', messages=True)
     print(m)
+    return m
     if plot:
         plt.suptitle('Student-t likelihood')
         plt.subplot(132)
@@ -99,19 +106,19 @@ def debug_student_t_noise_approx():
         plt.plot(X_full, Y_full)
         plt.ylim(-2.5, 2.5)
 
-    print "Clean student t, ncg"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
-    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    if plot:
-        plt.subplot(133)
-        m.plot()
-        plt.plot(X_full, Y_full)
-        plt.ylim(-2.5, 2.5)
+    #print "Clean student t, ncg"
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
+    #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    #m.ensure_default_constraints()
+    #m.update_likelihood_approximation()
+    #m.optimize()
+    #print(m)
+    #if plot:
+        #plt.subplot(133)
+        #m.plot()
+        #plt.plot(X_full, Y_full)
+        #plt.ylim(-2.5, 2.5)
 
     #plt.show()
 
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 208b1102..5b3e8f43 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -63,6 +63,7 @@ class Laplace(likelihood):
         return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
+        #print "Setting laplace param with: ", p
         return self.likelihood_function._set_params(p)
 
     def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -78,10 +79,24 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        dL_dytil = -np.dot(self.Y.T, (self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
-        dytil_dfhat = self.Wi__Ki_W # np.dot(self.Sigma_tilde, self.Ki) + np.eye(self.N) # or self.Wi__Ki_W?
-        #Ki, _, _, _ = pdinv(self.K)
-        #dytil_dfhat = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W?
+        dL_dytil = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
+
+        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
+        Wi = np.diagonal(self.Sigma_tilde) #Convenience
+        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
+        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
+
+        Ki, _, _, _ = pdinv(self.K)
+        #dytil_dfhat_implicit = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
+        #dytil_dfhat = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
+
+        #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
+        #dytil_dfhat_explicit = self.Wi__Ki_W
+        #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
+        #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
+
+        a = mdot(dWi_dfhat, Ki, self.f_hat)
+        dytil_dfhat = mdot(dWi_dfhat, Ki, self.f_hat) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -94,18 +109,18 @@ class Laplace(likelihood):
         """
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
 
-
         #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
 
-        print "Computing K gradients"
-        print "dytil_dfhat: ", np.mean(dytil_dfhat)
-        I = np.eye(self.N)
-        C = np.dot(self.K, self.W)
-        A = I + C
+        #print "Computing K gradients"
+        #print "dytil_dfhat: ", np.mean(dytil_dfhat)
+        #I = np.eye(self.N)
+        #C = np.dot(self.K, self.W)
+        #A = I + C
         #plt.imshow(A)
         #plt.show()
 
         #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
+        #B = I + w12*K*w12
         I_KW_i = self.Bi # could use self.B_chol??
 
         #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
@@ -113,15 +128,22 @@ class Laplace(likelihood):
         dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
         grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
         for ind_j, thetaj in enumerate(dK_dthetaK):
-            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, np.dot(thetaj, grad))
+            #dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad)))
+            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad)
 
+        print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat)
+        print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK)
         dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
+        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
+        print "\n"
+
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
         dL_dSigma = dL_d_K_Sigma
         #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
                      #explicit           #implicit
         #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
         #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
+
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         Wi = np.diagonal(self.Sigma_tilde) #Convenience
         dSigma_dthetaK_explicit = 0
@@ -140,19 +162,16 @@ class Laplace(likelihood):
         dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
         dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
         dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
-        #dL_dthetaK_implicit = np.dot(dL_dytil.T, dytil_dthetaK.T)
 
-        #print "\n"
-        #print "dL_dytil: ", np.mean(dL_dytil)
-        #print "dytil_dthetaK: ", np.mean(dytil_dthetaK)
-        #print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
-        #print "\n"
-        #print "dL_dSigma: ", np.mean(dL_dSigma)
-        #print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK)
-        #print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
-        #print "\n"
-        #print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
-        #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+        print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil)
+        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
+        print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
+        print "\n"
+        print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma)
+        print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK)
+        print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
+        print "\n"
+        print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
 
         return np.squeeze(dL_dthetaK_implicit)
 
@@ -182,11 +201,15 @@ class Laplace(likelihood):
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
         #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
 
-        dlikelihood_dthetaL_explicit, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-        dlikelihood_dfhat = self.likelihood_function.link_hess(self.data, self.f_hat, self.extra_data)
-        dfhat_dthetaL_cyclic = 0 #what is this? how can dfhat_dthetaL be used in the value of itself?
-        dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
-        dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
+        dlikelihood_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
+        #dfhat_dthetaL_cyclic = 0 #FIXME: what is this? how can dfhat_dthetaL be used in the value of itself?
+        #dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
+        #dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
+        #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
+        KW_I_i = self.Bi # could use self.B_chol??
+        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihood_dfhat))
+
         dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
 
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
@@ -199,7 +222,7 @@ class Laplace(likelihood):
 
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL_cyclic)
+        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL)
         dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
 
         #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
@@ -219,8 +242,10 @@ class Laplace(likelihood):
         #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
 
         dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
-        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
+        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma[:, None].T, dSigma_dthetaL), axis=0)
         dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
+        dL_dthetaL_via_Sigma_old = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
@@ -257,7 +282,7 @@ class Laplace(likelihood):
         #((L.T*w)_i + I)f_hat = y_tilde
         L = jitchol(self.K)
         Li = chol_inv(L)
-        Lt_W = np.dot(L.T, self.W)
+        Lt_W = np.dot(L.T, self.W) #FIXME: Can make Faster
 
         ##Check it isn't singular!
         if cond(Lt_W) > epsilon:
@@ -361,7 +386,6 @@ class Laplace(likelihood):
         """
         #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
         L = jitchol(B)
         return (B, L, W_12)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 61c79385..6eef9f33 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -176,8 +176,6 @@ class student_t(likelihood_function):
 
     def _set_params(self, x):
         self.sigma = float(x)
-        print "Setting student t sigma: ", x
-        print x
         #self.covariance_matrix = np.eye(self.N)*self._variance
         #self.precision = 1./self._variance
 
@@ -288,7 +286,7 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
+        grad_sigma = (  (2*self.sigma*self.v*(self.v + 1)*e)
                       / ((self.v*(self.sigma**2) + e**2)**2)
                      )
         return np.squeeze(grad_sigma)
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 79284b59..ff852766 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -66,6 +66,10 @@ class GP(model):
         # self.likelihood._set_params(p[self.kern.Nparam:])               # test by Nicolas
         self.likelihood._set_params(p[self.kern.Nparam_transformed():])  # test by Nicolas
 
+        if isinstance(self.likelihood, Laplace):
+            print "Updating approx: ", p
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
 
         self.K = self.kern.K(self.X)
         self.K += self.likelihood.covariance_matrix
@@ -87,14 +91,12 @@ class GP(model):
         return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
 
     def _update_params_callback(self, p):
-        #FIXME:Check the transforming
-        #Set the new parameters of the kernel and likelihood within the optimization
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        #parameters will be in transformed space
         self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
+        #set_params_transformed for likelihood doesn't exist?
         self.likelihood._set_params(p[self.kern.Nparam_transformed():])
         #update the likelihood approximation within the optimisation with the current parameters
         self.update_likelihood_approximation()
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     def update_likelihood_approximation(self):
         """
@@ -123,7 +125,9 @@ class GP(model):
         model for a new variable Y* = v_tilde/tau_tilde, with a covariance
         matrix K* = K + diag(1./tau_tilde) plus a normalization term.
         """
-        return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        print "Log likelihood: ", l
+        return l
 
     def _log_likelihood_gradients(self):
         """
@@ -135,7 +139,7 @@ class GP(model):
         if isinstance(self.likelihood, Laplace):
             dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            fake_dL_dKs = np.eye(self.dL_dK.shape[0])
+            fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
             #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
@@ -145,13 +149,11 @@ class GP(model):
             #print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
 
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            #print "dL_dthetaL: ", dL_dthetaL
             print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         else:
-            #print "dL_dthetaK: ", dL_dthetaK
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
             print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-        #print "dL_dthetaL: ", dL_dthetaL
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
 

From 787a038401ee959fbbd8bfe354c84c1d4cbd56fa Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 14 May 2013 16:23:18 +0100
Subject: [PATCH 036/252] Still getting closer to grads for likelihood

---
 GPy/examples/laplace_approximations.py  |  4 ++--
 GPy/likelihoods/Laplace.py              | 16 ++++++----------
 GPy/likelihoods/likelihood_functions.py |  4 ++--
 3 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 2054881c..eb725b53 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -95,10 +95,10 @@ def debug_student_t_noise_approx():
     m.constrain_positive('t_noi')
     #m.constrain_fixed('t_noise_variance', real_sd)
     m.update_likelihood_approximation()
-    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
-    m.optimize('scg', messages=True)
     print(m)
     return m
+    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
+    m.optimize('scg', messages=True)
     if plot:
         plt.suptitle('Student-t likelihood')
         plt.subplot(132)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 5b3e8f43..2af51f2b 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -201,24 +201,22 @@ class Laplace(likelihood):
         dL_dytil, dytil_dfhat = self._shared_gradients_components()
         #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
 
-        dlikelihood_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
+        dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
         dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-        #dfhat_dthetaL_cyclic = 0 #FIXME: what is this? how can dfhat_dthetaL be used in the value of itself?
-        #dlikelihood_dthetaL_implicit = np.dot(dlikelihood_dfhat, dfhat_dthetaL_cyclic) # may need a sum over f
-        #dfhat_dthetaL = np.dot(self.K, (dlikelihood_dthetaL_explicit + dlikelihood_dthetaL_implicit)[:, None])
         #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
         KW_I_i = self.Bi # could use self.B_chol??
-        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihood_dfhat))
+        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL))
+        #dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
 
         dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
 
         #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
-        dL_dSigma = partial #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
+        dL_dSigma = np.diagflat(partial) #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
 
         Wi = np.diagonal(self.Sigma_tilde) #Convenience
         #-1 as we are looking at W which is -1*d2log p(y|f)
         #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
-        dSigma_dthetaL_explicit = np.diagflat(-(Wi*(-1*d2likelihood_dthetaL)*Wi))
+        dSigma_dthetaL_explicit = np.diagflat(-1*(Wi*(-1*d2likelihood_dthetaL)*Wi))
 
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
@@ -242,10 +240,8 @@ class Laplace(likelihood):
         #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
 
         dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
-        dL_dthetaL_via_Sigma = np.sum(np.dot(dL_dSigma[:, None].T, dSigma_dthetaL), axis=0)
+        dL_dthetaL_via_Sigma = np.sum(np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0))
         dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
-        dL_dthetaL_via_Sigma_old = np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0)
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 6eef9f33..1a9dac75 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -256,7 +256,7 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        d3link_d3f = (  (2*(self.v + 1)*(e**3 - 3*(self.sigma**2)*self.v*e))
+        d3link_d3f = (  (2*(self.v + 1)*(-1*e)*(e**2 - 3*(self.sigma**2)*self.v))
                       / ((e**2 + (self.sigma**2)*self.v)**3)
                      )
         return np.squeeze(d3link_d3f)
@@ -286,7 +286,7 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        grad_sigma = (  (2*self.sigma*self.v*(self.v + 1)*e)
+        grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
                       / ((self.v*(self.sigma**2) + e**2)**2)
                      )
         return np.squeeze(grad_sigma)

From 569311b5107c6ec6cb2cc41587701f5526fb70dd Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 15 May 2013 19:25:55 +0100
Subject: [PATCH 037/252] Gradients almost there for dytil_dfhat, diagonal
 terms are right

---
 GPy/likelihoods/Laplace.py              |  21 ++--
 GPy/likelihoods/likelihood_functions.py |   4 +-
 GPy/testing/laplace_approx.tests.py     | 123 ++++++++++++++++++++++++
 3 files changed, 140 insertions(+), 8 deletions(-)
 create mode 100644 GPy/testing/laplace_approx.tests.py

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 2af51f2b..ce3f870f 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -96,7 +96,10 @@ class Laplace(likelihood):
         #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
 
         a = mdot(dWi_dfhat, Ki, self.f_hat)
-        dytil_dfhat = mdot(dWi_dfhat, Ki, self.f_hat) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
+        b = np.dot(self.Sigma_tilde, Ki)
+        dytil_dfhat = - np.dot(dWi_dfhat, np.dot(Ki, self.f_hat)) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
+        #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N)
+        self.dytil_dfhat = dytil_dfhat
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
@@ -330,19 +333,25 @@ class Laplace(likelihood):
 
     def fit_full(self, K):
         """
-        The laplace approximation algorithm
+        The laplace approximation algorithm, find K and expand hessian
         For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
         :K: Covariance matrix
         """
         self.K = K.copy()
-        #assert np.all(self.K.T == self.K)
-        #self.K_safe = K.copy()
+
+        #Find mode
         if self.rasm:
             self.f_hat = self.rasm_mode(K)
         else:
             self.f_hat = self.ncg_mode(K)
 
+        #Compute hessian and other variables at mode
+        self._compute_likelihood_variables()
+
+    def _compute_likelihood_variables(self):
         #At this point get the hessian matrix
+        #print "Data: ", self.data
+        #print "fhat: ", self.f_hat
         self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))
 
         if not self.likelihood_function.log_concave:
@@ -352,14 +361,14 @@ class Laplace(likelihood):
                                        #This is a property only held by non-log-concave likelihoods
 
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
-        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(K, self.W)
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
 
         Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
         self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i)
 
         b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
-        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (K, b)))
+        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b)))
         a = b - mdot(self.W_12, solve_chol)
         self.f_Ki_f = np.dot(self.f_hat.T, a)
         self.ln_K_det = pddet(self.K)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 0d194c01..646293d2 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -10,8 +10,7 @@ from scipy.special import gammaln, gamma
 from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 
 class likelihood_function:
-    """
-    Likelihood class for doing Expectation propagation
+    """ Likelihood class for doing Expectation propagation
 
     :param Y: observed output (Nx1 numpy.darray)
     ..Note:: Y values allowed depend on the likelihood_function used
@@ -241,6 +240,7 @@ class student_t(likelihood_function):
         y = np.squeeze(y)
         f = np.squeeze(f)
         assert y.shape == f.shape
+
         e = y - f
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
         return np.squeeze(hess)
diff --git a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx.tests.py
new file mode 100644
index 00000000..394950d5
--- /dev/null
+++ b/GPy/testing/laplace_approx.tests.py
@@ -0,0 +1,123 @@
+import unittest
+import numpy as np
+
+import GPy
+from GPy.models import GP
+from GPy.util.linalg import pdinv, tdot
+from scipy import linalg
+
+class LikelihoodGradParam(GP):
+    def __init__(self, X, likelihood_function, kernel, param_name=None, function=None, **kwargs):
+        super(LikelihoodGradParam, self).__init__(X, likelihood_function, kernel)
+        self.param_name = param_name
+        self.func = function
+        #self.func_params = kwargs
+        #self.parameter = self.likelihood.__getattribute__(self.param_name)
+
+    def _get_param_names(self):
+        f_hats = ["f_{}".format(i) for i in range(len(self.likelihood.f_hat))]
+        return f_hats
+
+    def _get_params(self):
+        return np.hstack([np.squeeze(self.likelihood.f_hat)])
+        #return np.hstack([self.likelihood.__getattribute__(self.param_name)])
+
+    def hack_dL_dK(self):
+        self.K = self.kern.K(self.X)
+        self.K += self.likelihood.covariance_matrix
+
+        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
+
+        # the gradient of the likelihood wrt the covariance matrix
+        if self.likelihood.YYT is None:
+            alpha, _ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y, lower=1)
+            self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki)
+        else:
+            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
+            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
+            self.dL_dK = 0.5 * (tmp - self.D * self.Ki)
+
+    def _set_params(self, x):
+        self.likelihood.f_hat = x.reshape(self.N, 1)
+        self.likelihood._compute_likelihood_variables()
+        self.hack_dL_dK()
+
+    def log_likelihood(self):
+        return self.func(self.likelihood)[0, 0]
+
+    def _log_likelihood_gradients(self):
+        #gradient = self.likelihood.__getattribute__(self.param_name)
+        self.likelihood._compute_likelihood_variables()
+        self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        gradient = getattr(self.likelihood, self.param_name)
+        #Need to sum over fhats? For dytil_dfhat...
+        #gradient = np.flatten(gradient, axis=0)
+        #return gradient[:, 0]
+        return gradient[0, :]
+
+
+class LaplaceTests(unittest.TestCase):
+    def setUp(self):
+        real_var = 0.1
+        #Start a function, any function
+        #self.X = np.linspace(0.0, 10.0, 30)[:, None]
+        self.X = np.random.randn(2,1)
+        #self.X = np.ones((10,1))
+        Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var
+        self.Y = Y/Y.max()
+        self.kernel = GPy.kern.rbf(self.X.shape[1])
+
+        deg_free = 10000
+        real_sd = np.sqrt(real_var)
+        initial_sd_guess = 1
+
+        t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=initial_sd_guess)
+        self.stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
+        self.stu_t_likelihood.fit_full(self.kernel.K(self.X))
+        self.m = LikelihoodGradParam(self.X, self.stu_t_likelihood, self.kernel, None, None)
+        self.m.constrain_fixed('rbf_v', 1.0898)
+        self.m.constrain_fixed('rbf_l', 1.8651)
+
+    def tearDown(self):
+        self.m = None
+
+    def test_dy_dfhat(self):
+        def ytil(likelihood):
+            Sigma_tilde = likelihood.Sigma_tilde
+            K = likelihood.K
+            Ki, _, _, _ = pdinv(K)
+            f_hat = likelihood.f_hat
+            Sigma, _, _, _ = pdinv(Sigma_tilde)
+            return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat)
+
+        self.m.func = ytil
+        self.m.param_name = 'dytil_dfhat'
+        self.m.randomize()
+        #try:
+        self.m.checkgrad(verbose=1)
+        assert self.m.checkgrad()
+        #except:
+            #import ipdb;ipdb.set_trace()
+
+
+    #def test_dL_dytil(self):
+        #def L(likelihood):
+            ##-0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+            #Sigma_tilde = likelihood.Sigma_tilde
+            #Ki = likelihood.K
+            #f_hat = likelihood.f_hat
+            #Sigma, _, _, _ = pdinv(Sigma_tilde)
+            #return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat)
+
+        #self.m.func = L
+        #self.m.param_name = 'dL_dytil'
+        #m.randomize()
+        ##try:
+        #m.checkgrad(verbose=1)
+        #assert m.checkgrad()
+        #except:
+            #import ipdb;ipdb.set_trace()
+
+if __name__ == "__main__":
+    unittest.main()
+

From 21ae81de29c36ad94d8d7fc412db869c7926719a Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 16 May 2013 12:00:15 +0100
Subject: [PATCH 038/252] Workong on doing explicit gradients

---
 GPy/likelihoods/Laplace.py          | 13 +++++++++++++
 GPy/testing/laplace_approx.tests.py |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index ce3f870f..f2197e55 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -97,6 +97,19 @@ class Laplace(likelihood):
 
         a = mdot(dWi_dfhat, Ki, self.f_hat)
         b = np.dot(self.Sigma_tilde, Ki)
+        #dytil_dfhat = np.zeros(self.K.shape)
+        #for col in range(self.N):
+            #for row in range(self.N):
+                #t1 = 0
+                #for l in range(self.N):
+                    #t1 += dWi_dfhat[col, col]*Ki[col,l]*self.f_hat[l, 0]
+                ##t2 = np.zeros((1, self.N))
+                #t2 = np.dot(self.Sigma_tilde, Ki[:, col])
+                ##for k in range(self.N):
+                    ##t2[:] += self.Sigma_tilde[k, k]*Ki[k, col]
+                #dytil_dfhat[row, col] = (t1 + t2)[row]
+        #dytil_dfhat += np.eye(self.N)
+
         dytil_dfhat = - np.dot(dWi_dfhat, np.dot(Ki, self.f_hat)) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
         #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N)
         self.dytil_dfhat = dytil_dfhat
diff --git a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx.tests.py
index 394950d5..73dfbfd6 100644
--- a/GPy/testing/laplace_approx.tests.py
+++ b/GPy/testing/laplace_approx.tests.py
@@ -61,7 +61,7 @@ class LaplaceTests(unittest.TestCase):
         real_var = 0.1
         #Start a function, any function
         #self.X = np.linspace(0.0, 10.0, 30)[:, None]
-        self.X = np.random.randn(2,1)
+        self.X = np.random.randn(9,1)
         #self.X = np.ones((10,1))
         Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var
         self.Y = Y/Y.max()

From e5d7ee972848e5eb5ec1186c3150d9720328076f Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 16 May 2013 12:06:09 +0100
Subject: [PATCH 039/252] FIXED DYTIL_DFHAT

---
 GPy/likelihoods/Laplace.py          | 6 +++---
 GPy/testing/laplace_approx.tests.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index f2197e55..42897f80 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -105,12 +105,12 @@ class Laplace(likelihood):
                     #t1 += dWi_dfhat[col, col]*Ki[col,l]*self.f_hat[l, 0]
                 ##t2 = np.zeros((1, self.N))
                 #t2 = np.dot(self.Sigma_tilde, Ki[:, col])
-                ##for k in range(self.N):
-                    ##t2[:] += self.Sigma_tilde[k, k]*Ki[k, col]
+                ###for k in range(self.N):
+                    ###t2[:] += self.Sigma_tilde[k, k]*Ki[k, col]
                 #dytil_dfhat[row, col] = (t1 + t2)[row]
         #dytil_dfhat += np.eye(self.N)
 
-        dytil_dfhat = - np.dot(dWi_dfhat, np.dot(Ki, self.f_hat)) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
+        dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
         #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N)
         self.dytil_dfhat = dytil_dfhat
         return dL_dytil, dytil_dfhat
diff --git a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx.tests.py
index 73dfbfd6..2b3af2ad 100644
--- a/GPy/testing/laplace_approx.tests.py
+++ b/GPy/testing/laplace_approx.tests.py
@@ -60,8 +60,8 @@ class LaplaceTests(unittest.TestCase):
     def setUp(self):
         real_var = 0.1
         #Start a function, any function
-        #self.X = np.linspace(0.0, 10.0, 30)[:, None]
-        self.X = np.random.randn(9,1)
+        self.X = np.linspace(0.0, 10.0, 30)[:, None]
+        #self.X = np.random.randn(,1)
         #self.X = np.ones((10,1))
         Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var
         self.Y = Y/Y.max()

From 48d693791eabf51e64b28706910a9a9444457825 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 16 May 2013 12:22:37 +0100
Subject: [PATCH 040/252] changed name

---
 GPy/examples/laplace_approximations.py        |  2 +-
 GPy/likelihoods/Laplace.py                    | 25 ++++---------------
 ...pprox.tests.py => laplace_approx_tests.py} |  0
 3 files changed, 6 insertions(+), 21 deletions(-)
 rename GPy/testing/{laplace_approx.tests.py => laplace_approx_tests.py} (100%)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index eb725b53..4d8e96b8 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -39,7 +39,7 @@ def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 30)[:, None]
+    X = np.linspace(0.0, 10.0, 2)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
     X_full = np.linspace(0.0, 10.0, 500)[:, None]
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 42897f80..b0dde03f 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -95,23 +95,7 @@ class Laplace(likelihood):
         #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
         #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
 
-        a = mdot(dWi_dfhat, Ki, self.f_hat)
-        b = np.dot(self.Sigma_tilde, Ki)
-        #dytil_dfhat = np.zeros(self.K.shape)
-        #for col in range(self.N):
-            #for row in range(self.N):
-                #t1 = 0
-                #for l in range(self.N):
-                    #t1 += dWi_dfhat[col, col]*Ki[col,l]*self.f_hat[l, 0]
-                ##t2 = np.zeros((1, self.N))
-                #t2 = np.dot(self.Sigma_tilde, Ki[:, col])
-                ###for k in range(self.N):
-                    ###t2[:] += self.Sigma_tilde[k, k]*Ki[k, col]
-                #dytil_dfhat[row, col] = (t1 + t2)[row]
-        #dytil_dfhat += np.eye(self.N)
-
         dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
-        #dytil_dfhat = - (np.dot(dWi_dfhat, Ki)*self.f_hat[:, None] + np.dot(self.Sigma_tilde, Ki)).sum(-1) + np.eye(self.N)
         self.dytil_dfhat = dytil_dfhat
         return dL_dytil, dytil_dfhat
 
@@ -219,10 +203,10 @@ class Laplace(likelihood):
 
         dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
         dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-        #KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
-        KW_I_i = self.Bi # could use self.B_chol??
+        KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
+        #KW_I_i = self.Bi # could use self.B_chol??
         dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL))
-        #dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
+        dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
 
         dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
 
@@ -383,7 +367,8 @@ class Laplace(likelihood):
         b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b)))
         a = b - mdot(self.W_12, solve_chol)
-        self.f_Ki_f = np.dot(self.f_hat.T, a)
+        self.Ki_f = a
+        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
         self.ln_K_det = pddet(self.K)
 
         self.ln_z_hat = (- 0.5*self.f_Ki_f
diff --git a/GPy/testing/laplace_approx.tests.py b/GPy/testing/laplace_approx_tests.py
similarity index 100%
rename from GPy/testing/laplace_approx.tests.py
rename to GPy/testing/laplace_approx_tests.py

From 146d7e2458cbfc69f8303b0b413e50cebf7fd7f7 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 17 May 2013 17:42:00 +0100
Subject: [PATCH 041/252] Trying to fix dL_dytil gradient

---
 GPy/likelihoods/Laplace.py          |  23 +++++-
 GPy/testing/laplace_approx_tests.py | 109 +++++++++++++++++-----------
 2 files changed, 84 insertions(+), 48 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index b0dde03f..af20d36a 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -79,16 +79,29 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        dL_dytil = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde)) #or *0.5? Shouldn't this be -y*R
+        Ki, _, _, _ = pdinv(self.K)
+
+        #Y__KS_i = np.dot(self.Y.T, inv(self.K+self.Sigma_tilde))
+        #dL_dytil = -0.5*Y__KS_i #or *0.5? Shouldn't this be -y*R
+        #dL_dytil = -0.5*np.trace(np.dot(inv(self.K+self.Sigma_tilde), (np.dot(self.Y, self.Y.T) + self.Y.T)))
+        #dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde),
+        #dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y)
+        c = inv(self.K+self.Sigma_tilde)
+        dL_dytil_simple_term =  -0.5*np.diag(np.dot(c, self.Y) + np.dot(self.Y.T, c))
+
+        P = np.diagflat(1/np.dot(Ki, self.f_hat))
+        K_Wi_i = inv(self.K+self.Sigma_tilde)
+
+        dL_dytil_difficult_term = np.diag(( -0.5*(np.dot(self.K + self.Sigma_tilde, P))
+                                            +0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P)
+                                           ) * np.eye(self.N))
+        dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term
 
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         Wi = np.diagonal(self.Sigma_tilde) #Convenience
         #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
         dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
 
-        Ki, _, _, _ = pdinv(self.K)
-        #dytil_dfhat_implicit = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
-        #dytil_dfhat = np.dot(dWi_dfhat, Ki) + np.eye(self.N)
 
         #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
         #dytil_dfhat_explicit = self.Wi__Ki_W
@@ -97,6 +110,8 @@ class Laplace(likelihood):
 
         dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
         self.dytil_dfhat = dytil_dfhat
+        #dytil_dfhat = np.eye(dytil_dfhat.shape[0])
+        self.dL_dfhat = np.dot(dL_dytil, dytil_dfhat) #FIXME: Purely for checkgradding....
         return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
diff --git a/GPy/testing/laplace_approx_tests.py b/GPy/testing/laplace_approx_tests.py
index 2b3af2ad..acb1c822 100644
--- a/GPy/testing/laplace_approx_tests.py
+++ b/GPy/testing/laplace_approx_tests.py
@@ -1,26 +1,29 @@
 import unittest
 import numpy as np
+np.random.seed(82)
 
 import GPy
 from GPy.models import GP
 from GPy.util.linalg import pdinv, tdot
 from scipy import linalg
 
-class LikelihoodGradParam(GP):
-    def __init__(self, X, likelihood_function, kernel, param_name=None, function=None, **kwargs):
-        super(LikelihoodGradParam, self).__init__(X, likelihood_function, kernel)
+class LikelihoodParamGrad(GP):
+    def __init__(self, X=None, likelihood_function=None, kernel=None, param_name=None, function=None, dparam_name=None, **kwargs):
         self.param_name = param_name
+        self.dparam_name = dparam_name
         self.func = function
+        super(LikelihoodParamGrad, self).__init__(X, likelihood_function, kernel)
         #self.func_params = kwargs
         #self.parameter = self.likelihood.__getattribute__(self.param_name)
 
     def _get_param_names(self):
-        f_hats = ["f_{}".format(i) for i in range(len(self.likelihood.f_hat))]
-        return f_hats
+        params = getattr(self.likelihood, self.dparam_name)
+        params_names = ["{}_{}".format(self.dparam_name, i) for i in range(len(params))]
+        return params_names
 
     def _get_params(self):
-        return np.hstack([np.squeeze(self.likelihood.f_hat)])
-        #return np.hstack([self.likelihood.__getattribute__(self.param_name)])
+        params = getattr(self.likelihood, self.dparam_name)
+        return np.hstack([params])
 
     def hack_dL_dK(self):
         self.K = self.kern.K(self.X)
@@ -38,29 +41,56 @@ class LikelihoodGradParam(GP):
             self.dL_dK = 0.5 * (tmp - self.D * self.Ki)
 
     def _set_params(self, x):
-        self.likelihood.f_hat = x.reshape(self.N, 1)
+        raise NotImplementedError
+
+    def log_likelihood(self):
+        raise NotImplementedError
+
+    def _log_likelihood_gradients(self):
+        raise NotImplementedError
+
+
+class Likelihood_F_Grad(LikelihoodParamGrad):
+    def __init__(self, **kwargs):
+        super(Likelihood_F_Grad, self).__init__(**kwargs)
+
+    def _set_params(self, x):
+        params = getattr(self.likelihood, self.dparam_name)
+        setattr(self.likelihood, self.dparam_name, x.reshape(*params.shape))
         self.likelihood._compute_likelihood_variables()
         self.hack_dL_dK()
 
     def log_likelihood(self):
-        return self.func(self.likelihood)[0, 0]
+        ll = self.func(self)
+        if self.param_name == "dL_dfhat_":
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        if len(ll.shape) == 0 or len(ll.shape) == 1:
+            return ll.sum()
+        elif len(ll.shape) == 2:
+            #print "Only checking first likelihood"
+            return ll[0, 0]
+        else:
+            raise ValueError('Not implemented for larger matricies yet')
+        return ll
 
     def _log_likelihood_gradients(self):
-        #gradient = self.likelihood.__getattribute__(self.param_name)
         self.likelihood._compute_likelihood_variables()
         self.likelihood._gradients(partial=np.diag(self.dL_dK))
         gradient = getattr(self.likelihood, self.param_name)
-        #Need to sum over fhats? For dytil_dfhat...
-        #gradient = np.flatten(gradient, axis=0)
-        #return gradient[:, 0]
-        return gradient[0, :]
+        if len(gradient.shape) == 1:
+            return gradient
+        elif len(gradient.shape) == 2:
+            #print "Only checking first gradients"
+            return gradient[0,: ]
+        else:
+            raise ValueError('Not implemented for larger matricies yet')
 
 
 class LaplaceTests(unittest.TestCase):
     def setUp(self):
         real_var = 0.1
         #Start a function, any function
-        self.X = np.linspace(0.0, 10.0, 30)[:, None]
+        self.X = np.linspace(0.0, 10.0, 4)[:, None]
         #self.X = np.random.randn(,1)
         #self.X = np.ones((10,1))
         Y = np.sin(self.X) + np.random.randn(*self.X.shape)*real_var
@@ -74,49 +104,40 @@ class LaplaceTests(unittest.TestCase):
         t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=initial_sd_guess)
         self.stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
         self.stu_t_likelihood.fit_full(self.kernel.K(self.X))
-        self.m = LikelihoodGradParam(self.X, self.stu_t_likelihood, self.kernel, None, None)
-        self.m.constrain_fixed('rbf_v', 1.0898)
-        self.m.constrain_fixed('rbf_l', 1.8651)
 
     def tearDown(self):
         self.m = None
 
     def test_dy_dfhat(self):
-        def ytil(likelihood):
-            Sigma_tilde = likelihood.Sigma_tilde
-            K = likelihood.K
+        def ytil(self):
+            Sigma_tilde = self.likelihood.Sigma_tilde
+            K = self.likelihood.K
             Ki, _, _, _ = pdinv(K)
-            f_hat = likelihood.f_hat
+            f_hat = self.likelihood.f_hat
             Sigma, _, _, _ = pdinv(Sigma_tilde)
             return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat)
 
-        self.m.func = ytil
-        self.m.param_name = 'dytil_dfhat'
+        self.m = Likelihood_F_Grad(X=self.X, likelihood_function=self.stu_t_likelihood,
+                                   kernel=self.kernel, param_name='dytil_dfhat',
+                                   function=ytil, dparam_name='f_hat')
+        #self.m.constrain_fixed('rbf_v', 1.0898)
+        #self.m.constrain_fixed('rbf_l', 1.8651)
         self.m.randomize()
-        #try:
         self.m.checkgrad(verbose=1)
         assert self.m.checkgrad()
-        #except:
-            #import ipdb;ipdb.set_trace()
 
+    def test_dL_dfhat(self):
+        def L(self):
+            return np.array(-0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z)
 
-    #def test_dL_dytil(self):
-        #def L(likelihood):
-            ##-0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
-            #Sigma_tilde = likelihood.Sigma_tilde
-            #Ki = likelihood.K
-            #f_hat = likelihood.f_hat
-            #Sigma, _, _, _ = pdinv(Sigma_tilde)
-            #return np.dot(np.dot(Sigma_tilde, (Ki + Sigma)), f_hat)
-
-        #self.m.func = L
-        #self.m.param_name = 'dL_dytil'
-        #m.randomize()
-        ##try:
-        #m.checkgrad(verbose=1)
-        #assert m.checkgrad()
-        #except:
-            #import ipdb;ipdb.set_trace()
+        self.m = Likelihood_F_Grad(X=self.X, likelihood_function=self.stu_t_likelihood,
+                                    kernel=self.kernel, param_name='dL_dfhat',
+                                    function=L, dparam_name='f_hat')
+        self.m.constrain_fixed('rbf_v', 1.0898)
+        self.m.constrain_fixed('rbf_l', 1.8651)
+        self.m.randomize()
+        self.m.checkgrad(verbose=1)
+        assert self.m.checkgrad()
 
 if __name__ == "__main__":
     unittest.main()

From d63d370641846642bdc02f0295177f7f37b5f5fb Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 29 May 2013 13:46:55 +0100
Subject: [PATCH 042/252] About to rip out old chain rule method of learning
 gradients

---
 GPy/likelihoods/Laplace.py          | 4 +++-
 GPy/testing/laplace_approx_tests.py | 3 +--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index af20d36a..666fa227 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -87,7 +87,7 @@ class Laplace(likelihood):
         #dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde),
         #dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y)
         c = inv(self.K+self.Sigma_tilde)
-        dL_dytil_simple_term =  -0.5*np.diag(np.dot(c, self.Y) + np.dot(self.Y.T, c))
+        dL_dytil_simple_term = -0.5*np.diag(2*np.dot(c, self.Y))
 
         P = np.diagflat(1/np.dot(Ki, self.f_hat))
         K_Wi_i = inv(self.K+self.Sigma_tilde)
@@ -96,6 +96,7 @@ class Laplace(likelihood):
                                             +0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P)
                                            ) * np.eye(self.N))
         dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term
+        dL_dytil = dL_dytil.reshape(1, self.N)
 
         d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
         Wi = np.diagonal(self.Sigma_tilde) #Convenience
@@ -329,6 +330,7 @@ class Laplace(likelihood):
                    #+ y_W_f
                    #+ self.ln_z_hat
                    #)
+        self.Z_tilde = 0
 
         ##Check it isn't singular!
         if cond(self.W) > epsilon:
diff --git a/GPy/testing/laplace_approx_tests.py b/GPy/testing/laplace_approx_tests.py
index acb1c822..15d84c9c 100644
--- a/GPy/testing/laplace_approx_tests.py
+++ b/GPy/testing/laplace_approx_tests.py
@@ -62,8 +62,6 @@ class Likelihood_F_Grad(LikelihoodParamGrad):
 
     def log_likelihood(self):
         ll = self.func(self)
-        if self.param_name == "dL_dfhat_":
-            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         if len(ll.shape) == 0 or len(ll.shape) == 1:
             return ll.sum()
         elif len(ll.shape) == 2:
@@ -128,6 +126,7 @@ class LaplaceTests(unittest.TestCase):
 
     def test_dL_dfhat(self):
         def L(self):
+            #return np.array(-0.5 * self.D * self.K_logdet + self._model_fit_term()) #Ignore Z for now
             return np.array(-0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z)
 
         self.m = Likelihood_F_Grad(X=self.X, likelihood_function=self.stu_t_likelihood,

From 117c377d13efe81b2df567936ff48e85f918efcd Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 29 May 2013 14:02:03 +0100
Subject: [PATCH 043/252] Ripped out all things Laplace parameter estimation,
 starting again with new tactic

---
 GPy/likelihoods/Laplace.py | 175 +------------------------------------
 GPy/models/GP.py           |   8 +-
 2 files changed, 4 insertions(+), 179 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 666fa227..69c0876b 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -79,187 +79,18 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
-        Ki, _, _, _ = pdinv(self.K)
-
-        #Y__KS_i = np.dot(self.Y.T, inv(self.K+self.Sigma_tilde))
-        #dL_dytil = -0.5*Y__KS_i #or *0.5? Shouldn't this be -y*R
-        #dL_dytil = -0.5*np.trace(np.dot(inv(self.K+self.Sigma_tilde), (np.dot(self.Y, self.Y.T) + self.Y.T)))
-        #dL_dytil_simple_term = -0.5*np.dot(inv(self.K+self.Sigma_tilde),
-        #dL_dytil_simple_term = -np.dot(self.Y.T, inv(self.K+self.Sigma_tilde), self.Y)
-        c = inv(self.K+self.Sigma_tilde)
-        dL_dytil_simple_term = -0.5*np.diag(2*np.dot(c, self.Y))
-
-        P = np.diagflat(1/np.dot(Ki, self.f_hat))
-        K_Wi_i = inv(self.K+self.Sigma_tilde)
-
-        dL_dytil_difficult_term = np.diag(( -0.5*(np.dot(self.K + self.Sigma_tilde, P))
-                                            +0.5*mdot(K_Wi_i, self.Y, self.Y.T, K_Wi_i, P)
-                                           ) * np.eye(self.N))
-        dL_dytil = dL_dytil_simple_term + dL_dytil_difficult_term
-        dL_dytil = dL_dytil.reshape(1, self.N)
-
-        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
-        Wi = np.diagonal(self.Sigma_tilde) #Convenience
-        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
-        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-
-
-        #Wi(Ki + W) = Wi__Ki_W using the last K prior given to fit_full
-        #dytil_dfhat_explicit = self.Wi__Ki_W
-        #dytil_dfhat = dytil_dfhat_explicit + dytil_dfhat_implicit
-        #dytil_dfhat1 = np.dot(self.Sigma_tilde, Ki) + np.eye(self.N) # or self.Wi__Ki_W? Theyre the same basically
-
-        dytil_dfhat = - np.diagflat(np.dot(dWi_dfhat, np.dot(Ki, self.f_hat))) + np.dot(self.Sigma_tilde, Ki) + np.eye(self.N)
-        self.dytil_dfhat = dytil_dfhat
-        #dytil_dfhat = np.eye(dytil_dfhat.shape[0])
-        self.dL_dfhat = np.dot(dL_dytil, dytil_dfhat) #FIXME: Purely for checkgradding....
-        return dL_dytil, dytil_dfhat
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
         """
-                           #explicit                #implicit                     #implicit
-        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
-        :param dL_d_K_Sigma: Derivative of marginal with respect to K_prior+Sigma_tilde (posterior covariance)
-        :param dK_dthetaK: explcit derivative of kernel with respect to its hyper paramers
-        :returns: dL_dthetaK - gradients of marginal likelihood w.r.t changes in K hyperparameters
+        Gradients with respect to prior kernel parameters
         """
-        dL_dytil, dytil_dfhat = self._shared_gradients_components()
-
-        #dSigma_dfhat = -np.dot(self.Sigma_tilde, np.dot(d3phi_d3fhat, self.Sigma_tilde))
-
-        #print "Computing K gradients"
-        #print "dytil_dfhat: ", np.mean(dytil_dfhat)
-        #I = np.eye(self.N)
-        #C = np.dot(self.K, self.W)
-        #A = I + C
-        #plt.imshow(A)
-        #plt.show()
-
-        #I_KW_i, _, _, _ = pdinv(A) #FIXME: WHY SO MUCH JITTER?!
-        #B = I + w12*K*w12
-        I_KW_i = self.Bi # could use self.B_chol??
-
-        #FIXME: Careful dK_dthetaK is not the derivative with respect to the marginal just prior K!
-        #Derivative for each f dimension, for each of K's hyper parameters
-        dfhat_dthetaK = np.zeros((self.f_hat.shape[0], dK_dthetaK.shape[0]))
-        grad = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-        for ind_j, thetaj in enumerate(dK_dthetaK):
-            #dfhat_dthetaK[:, ind_j] = np.dot(thetaj, grad) - np.dot(self.K, np.dot(I_KW_i, np.dot(thetaj, grad)))
-            dfhat_dthetaK[:, ind_j] = np.dot(I_KW_i, thetaj*grad)
-
-        print "dytil_dfhat: ", np.mean(dytil_dfhat), np.std(dytil_dfhat)
-        print "dfhat_dthetaK: ", np.mean(dfhat_dthetaK), np.std(dfhat_dthetaK)
-        dytil_dthetaK = np.dot(dytil_dfhat, dfhat_dthetaK) # should be (D,thetaK)
-        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
-        print "\n"
-
-        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
-        dL_dSigma = dL_d_K_Sigma
-        #d3phi_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
-                     #explicit           #implicit
-        #dSigmai_dthetaK = 0 + np.dot(d3phi_d3fhat, dfhat_dthetaK)
-        #dSigma_dthetaK = np.zeros((self.f_hat.shape[0], self.f_hat.shape[0], dK_dthetaK.shape[0]))
-
-        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
-        Wi = np.diagonal(self.Sigma_tilde) #Convenience
-        dSigma_dthetaK_explicit = 0
-        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
-        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-        #dSigma_dthetaK_implicit = -np.sum(np.dot(dWi_dfhat, dfhat_dthetaK), axis=0)
-        dSigma_dthetaK_implicit = np.dot(dWi_dfhat, dfhat_dthetaK)
-        dSigma_dthetaK = dSigma_dthetaK_explicit + dSigma_dthetaK_implicit
-        #dSigma_dthetaK = 0 + np.dot(, dfhat_dthetaK)
-        #for ind_j, dSigmai_dthetaj in enumerate(dSigmai_dthetaK):
-            #dSigma_dthetaK_explicit = 0
-            #dSigma_dthetaK_implicit = -np.dot(Wi, dW_dfhat
-            #dSigma_dthetaK[:, :, ind_j] = -np.dot(self.Sigma_tilde, dSigmai_dthetaj*self.Sigma_tilde)
-
-        #FIXME: Won't handle multi dimensional data
-        dL_dthetaK_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaK), axis=0)
-        dL_dthetaK_via_Sigma = np.sum(np.dot(dL_dSigma, dSigma_dthetaK), axis=0)
-        dL_dthetaK_implicit = dL_dthetaK_via_ytil + dL_dthetaK_via_Sigma
-
-        print "dL_dytil: ", np.mean(dL_dytil), np.std(dL_dytil)
-        print "dytil_dthetaK: ", np.mean(dytil_dthetaK), np.std(dytil_dthetaK)
-        print "dL_dthetaK_via_ytil: ", dL_dthetaK_via_ytil
-        print "\n"
-        print "dL_dSigma: ", np.mean(dL_dSigma), np.std(dL_dSigma)
-        print "dSigma_dthetaK: ", np.mean(dSigma_dthetaK), np.std(dSigma_dthetaK)
-        print "dL_dthetaK_via_Sigma: ", dL_dthetaK_via_Sigma
-        print "\n"
-        print "dL_dthetaK_implicit: ", dL_dthetaK_implicit
-
-        return np.squeeze(dL_dthetaK_implicit)
+        return dL_dthetaK
 
     def _gradients(self, partial):
         """
         Gradients with respect to likelihood parameters
-
-        Complicated, it differs for parameters of the kernel \theta_{K}, and
-        parameters of the likelihood, \theta_{L}
-
-        dL_dtheta_K = (dL_dK * dK_dthetaK) + (dL_dytil * dytil_dthetaK) + (dL_dSigma * dSigma_dthetaK)
-        dL_dtheta_L = (dL_dK * dK_dthetaL) + (dL_dytil * dytil_dthetaL) + (dL_dSigma * dSigma_dthetaL)
-        dL_dK*dK_dthetaL = 0
-
-        dytil_dthetaX = dytil_dfhat * dfhat_dthetaX
-        dytil_dfhat = Sigma*Ki + I
-
-        fhat = K*log_p(y|fhat)                                          from rasm p125
-        dfhat_dthetaK = (I + KW)i * dK_dthetaK * log_p(y|fhat)          from rasm p125
-
-        dSigma_dthetaX = dWi_dthetaX = -Wi * dW_dthetaX * Wi
-        dW_dthetaX = d_dthetaX[d2phi_d2fhat]
-        d2phi_d2fhat = Hessian function of likelihood
-
-        partial = dL_d_K_Sigma
         """
-        dL_dytil, dytil_dfhat = self._shared_gradients_components()
-        #dfhat_dthetaL, dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-
-        dlikelihoodgrad_dthetaL, d2likelihood_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-        dlikelihood_dfhat = self.likelihood_function.link_grad(self.data, self.f_hat, self.extra_data)
-        KW_I_i, _, _, _ = pdinv(np.dot(self.K, self.W) + np.eye(self.N))
-        #KW_I_i = self.Bi # could use self.B_chol??
-        dfhat_dthetaL = mdot(KW_I_i, (self.K, dlikelihoodgrad_dthetaL))
-        dfhat_dthetaL = np.zeros(dfhat_dthetaL.shape)[:, None]
-
-        dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
-
-        #FIXME: Careful the -D*0.5 in dL_d_K_sigma might need to be -0.5?
-        dL_dSigma = np.diagflat(partial) #Is actually but can't rename it because of naming convention... dL_d_K_Sigma
-
-        Wi = np.diagonal(self.Sigma_tilde) #Convenience
-        #-1 as we are looking at W which is -1*d2log p(y|f)
-        #Can just hadamard product as diagonal matricies multiplied are just multiplying elements
-        dSigma_dthetaL_explicit = np.diagflat(-1*(Wi*(-1*d2likelihood_dthetaL)*Wi))
-
-        d3likelihood_d3fhat = self.likelihood_function.d3link(self.data, self.f_hat, self.extra_data)
-        dWi_dfhat = np.diagflat(-1*Wi*(-1*d3likelihood_d3fhat)*Wi)
-        dSigma_dthetaL_implicit = np.dot(dWi_dfhat, dfhat_dthetaL)
-        dSigma_dthetaL = dSigma_dthetaL_explicit + dSigma_dthetaL_implicit
-
-        #dSigmai_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat, self.extra_data) #FIXME: Shouldn't this have a implicit component aswell?
-        #Derivative for each f dimension, for each of K's hyper parameters
-        #dSigma_dthetaL = np.empty((self.N, len(self.likelihood_function._get_param_names())))
-        #for ind_l, dSigmai_dtheta_l in enumerate(dSigmai_dthetaL.T):
-            #dSigma_dthetaL[:, ind_l] = -mdot(self.Sigma_tilde,
-                                             #dSigmai_dtheta_l, # Careful, shouldn't this be (N, 1)?
-                                             #self.Sigma_tilde
-                                             #)
-
-        #TODO: This is Wi*A*Wi, can be more numerically stable with a trick
-        #dSigma_dthetaL = -mdot(self.Sigma_tilde, dSigmai_dthetaL, self.Sigma_tilde)
-
-        #dytil_dthetaL = dytil_dfhat*dfhat_dthetaL
-        #dytil_dthetaL = np.dot(dytil_dfhat, dfhat_dthetaL)
-        #dL_dthetaL = 0 + np.dot(dL_dytil, dytil_dthetaL)# + np.dot(dL_dSigma, dSigma_dthetaL)
-
-        dL_dthetaL_via_ytil = np.sum(np.dot(dL_dytil, dytil_dthetaL), axis=0)
-        dL_dthetaL_via_Sigma = np.sum(np.sum(np.dot(dL_dSigma, dSigma_dthetaL), axis=0))
-        dL_dthetaL = dL_dthetaL_via_ytil + dL_dthetaL_via_Sigma
-
-        return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
+        return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
         """
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 17e2a1b1..da379eb1 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -150,14 +150,8 @@ class GP(model):
             fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
-            #We need the dL_dK where K is equal to the prior K, not K+Sigma as is the case now
-            dL_dthetaK_implicit = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
-            dL_dthetaK = dL_dthetaK_explicit + dL_dthetaK_implicit
-
-            #print "dL_dthetaK_explicit: {dldkx}     dL_dthetaK_implicit: {dldki}        dL_dthetaK: {dldk}".format(dldkx=dL_dthetaK_explicit, dldki=dL_dthetaK_implicit, dldk=dL_dthetaK)
-
+            dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            #print "dL_dthetaL: ", dL_dthetaL
             print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From 23ed2a2d15c28fe5d868639ad1358024808a328f Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 29 May 2013 17:33:06 +0100
Subject: [PATCH 044/252] Lots of name changing and went through all likelihood
 gradients again

---
 GPy/examples/laplace_approximations.py  | 27 ++++---
 GPy/likelihoods/Laplace.py              | 35 +++++++--
 GPy/likelihoods/likelihood_functions.py | 96 +++++++++++++++----------
 GPy/models/GP.py                        |  2 +-
 4 files changed, 103 insertions(+), 57 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 4d8e96b8..27f063dc 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -69,22 +69,21 @@ def debug_student_t_noise_approx():
     print "Clean Gaussian"
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
-    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
-    # optimize
-    m.ensure_default_constraints()
-    m.optimize()
-    # plot
-    if plot:
-        plt.figure(1)
-        plt.suptitle('Gaussian likelihood')
-        plt.subplot(131)
-        m.plot()
-        plt.plot(X_full, Y_full)
-    print m
+    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    ## optimize
+    #m.ensure_default_constraints()
+    #m.optimize()
+    ## plot
+    #if plot:
+        #plt.figure(1)
+        #plt.suptitle('Gaussian likelihood')
+        #plt.subplot(131)
+        #m.plot()
+        #plt.plot(X_full, Y_full)
+    #print m
 
     edited_real_sd = initial_var_guess #real_sd
 
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
@@ -95,10 +94,10 @@ def debug_student_t_noise_approx():
     m.constrain_positive('t_noi')
     #m.constrain_fixed('t_noise_variance', real_sd)
     m.update_likelihood_approximation()
+    m.optimize('scg', messages=True)
     print(m)
     return m
     #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
-    m.optimize('scg', messages=True)
     if plot:
         plt.suptitle('Student-t likelihood')
         plt.subplot(132)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 69c0876b..f8ba25f1 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -79,17 +79,40 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
+        Ki, _, _, _ = pdinv(self.K)
+        Ki_W_i = inv(Ki + self.W) #Do it non numerically stable for now
+        d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
+        dL_dfhat = -0.5*np.dot(np.diag(Ki_W_i), d3lik_d3fhat)
+        KW = np.dot(self.K, self.W)
+        I_KW_i = inv(np.eye(KW.shape[0]) + KW)
+        return dL_dfhat, Ki, I_KW_i
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
         """
         Gradients with respect to prior kernel parameters
         """
+        dL_dfhat, Ki, I_KW_i = self._shared_gradients_components()
+        K_Wi_i = inv(self.K + inv(self.W))
+        dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
+
+        dL_dthetaK = np.zeros(dK_dthetaK.shape)
+        for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
+            #Explicit
+            dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(np.dot(K_Wi_i, dK_dthetaK_i))
+            #Implicit
+            df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK, dlp)
+            dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
+
         return dL_dthetaK
 
     def _gradients(self, partial):
         """
         Gradients with respect to likelihood parameters
         """
+        dL_dfhat, Ki, I_KW_i = self._shared_gradients_components()
+        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
+        dL_dthetaL = np.zeros(dlik_dthetaL.shape)
+
         return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
@@ -197,7 +220,7 @@ class Laplace(likelihood):
         #At this point get the hessian matrix
         #print "Data: ", self.data
         #print "fhat: ", self.f_hat
-        self.W = -np.diag(self.likelihood_function.link_hess(self.data, self.f_hat, extra_data=self.extra_data))
+        self.W = -np.diag(self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data))
 
         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
@@ -212,7 +235,7 @@ class Laplace(likelihood):
         Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
         self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i)
 
-        b = np.dot(self.W, self.f_hat) + self.likelihood_function.link_grad(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
+        b = np.dot(self.W, self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b)))
         a = b - mdot(self.W_12, solve_chol)
         self.Ki_f = a
@@ -259,11 +282,11 @@ class Laplace(likelihood):
             return float(res)
 
         def obj_grad(f):
-            res = -1 * (self.likelihood_function.link_grad(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
+            res = -1 * (self.likelihood_function.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
             return np.squeeze(res)
 
         def obj_hess(f):
-            res = -1 * (--np.diag(self.likelihood_function.link_hess(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
+            res = -1 * (--np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
             return np.squeeze(res)
 
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
@@ -294,7 +317,7 @@ class Laplace(likelihood):
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             #f_old = f.copy()
-            W = -np.diag(self.likelihood_function.link_hess(self.data, f, extra_data=self.extra_data))
+            W = -np.diag(self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data))
             if not self.likelihood_function.log_concave:
                 W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@@ -303,7 +326,7 @@ class Laplace(likelihood):
             B, L, W_12 = self._compute_B_statistics(K, W)
 
             W_f = np.dot(W, f)
-            grad = self.likelihood_function.link_grad(self.data, f, extra_data=self.extra_data)[:, None]
+            grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)[:, None]
             #Find K_i_f
             b = W_f + grad
 
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 646293d2..d75e7218 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -159,10 +159,10 @@ class student_t(likelihood_function):
     d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free, sigma=2):
+        #super(student_t, self).__init__()
         self.v = deg_free
         self.sigma = sigma
         self.log_concave = False
-        #super(student_t, self).__init__()
 
         self._set_params(np.asarray(sigma))
 
@@ -174,8 +174,6 @@ class student_t(likelihood_function):
 
     def _set_params(self, x):
         self.sigma = float(x)
-        #self.covariance_matrix = np.eye(self.N)*self._variance
-        #self.precision = 1./self._variance
 
     @property
     def variance(self, extra_data=None):
@@ -185,6 +183,8 @@ class student_t(likelihood_function):
         """link_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
 
+        For wolfram alpha import parts for derivative of sigma are -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+
         :y: data
         :f: latent variables f
         :extra_data: extra_data which is not used in student t distribution
@@ -198,17 +198,16 @@ class student_t(likelihood_function):
         e = y - f
         objective = (gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
-                     + np.log(self.sigma * np.sqrt(self.v * np.pi))
-                     - (self.v + 1) * 0.5
-                     * np.log(1 + ((e**2 / self.sigma**2) / self.v))
-                     )
+                     - np.log(self.sigma * np.sqrt(self.v * np.pi))
+                     - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v))
+                    )
         return np.sum(objective)
 
-    def link_grad(self, y, f, extra_data=None):
+    def dlik_df(self, y, f, extra_data=None):
         """
         Gradient of the link function at y, given f w.r.t f
 
-        $$\frac{d}{df}p(y_{i}|f_{i}) = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{-(v+1)(f_{i}-y_{i})}{(f_{i}-y_{i})^{2} + \sigma^{2}v}$$
 
         :y: data
         :f: latent variables f
@@ -220,17 +219,17 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        grad = -((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
         return np.squeeze(grad)
 
-    def link_hess(self, y, f, extra_data=None):
+    def d2lik_d2f(self, y, f, extra_data=None):
         """
         Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
         i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
 
         Will return diagonal of hessian, since every where else it is 0
 
-        $$\frac{d^{2}p(y_{i}|f_{i})}{df^{2}} = \frac{(v + 1)(y - f)}{v \sigma^{2} + (y_{i} - f_{i})^{2}}$$
+        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((f_{i}-y_{i})^{2} - \sigma^{2}v)}{((f_{i}-y_{i})^{2} + \sigma^{2}v)^{2}}$$
 
         :y: data
         :f: latent variables f
@@ -245,54 +244,79 @@ class student_t(likelihood_function):
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
         return np.squeeze(hess)
 
-    def d3link(self, y, f, extra_data=None):
+    def d3lik_d3f(self, y, f, extra_data=None):
         """
         Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
 
-        $$\frac{2(v+1)((y-f)^{3} - 3\sigma^{2}v(y-f))}{((y-f)^{2} + \sigma^{2}v)^{3}}$$
+        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((f_{i} - y_{i})^3 - 3(f_{i} - y_{i}) \sigma^{2} v))}{((f_{i} - y_{i}) + \sigma^{2} v)^3}$$
         """
         y = np.squeeze(y)
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        d3link_d3f = (  (2*(self.v + 1)*(-1*e)*(e**2 - 3*(self.sigma**2)*self.v))
-                      / ((e**2 + (self.sigma**2)*self.v)**3)
-                     )
-        return np.squeeze(d3link_d3f)
+        d3lik_d3f = ( -(2*(self.v + 1)*(e**3 - e*3*self.v*(self.sigma**2))) /
+                       ((e**2 + (self.sigma**2)*self.v)**3)
+                    )
+        return np.squeeze(d3lik_d3f)
 
-    def link_hess_grad_std(self, y, f, extra_data=None):
+    def link_dstd(self, y, f, extra_data=None):
         """
-        Gradient of the hessian w.r.t sigma parameter (standard deviation)
+        Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
 
-        $$\frac{2\sigma v(v+1)(\sigma^{2}v - 3(f-y)^2)}{((f-y)^{2} + \sigma^{2}v)^{3}}
+        Terms relavent to derivatives wrt sigma are:
+        -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+
+        $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
         """
         y = np.squeeze(y)
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        hess_grad_sigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2)))
-                           / ((e**2 + (self.sigma**2)*self.v)**3)
-                          )
-        return np.squeeze(hess_grad_sigma)
+        dlik_dsigma = ( (1/self.sigma) -
+                        ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + (e**2) / ((self.sigma**2)*self.v) ) )
+                      )
+        return np.squeeze(dlik_dsigma)
 
-    def link_grad_std(self, y, f, extra_data=None):
+    def dlik_df_dstd(self, y, f, extra_data=None):
         """
-        Gradient of the likelihood w.r.t sigma parameter (standard deviation)
+        Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
 
-        $$\frac{-2\sigma(v+1)(y-f)}{(v\sigma^{2} + (y-f)^{2})^{2}}$$
+        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{2\sigma v(v + 1)(f-y)}{(f-y)^2 + \sigma^2 v)^2}$$
         """
         y = np.squeeze(y)
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        grad_sigma = (  (-2*self.sigma*self.v*(self.v + 1)*e)
-                      / ((self.v*(self.sigma**2) + e**2)**2)
-                     )
-        return np.squeeze(grad_sigma)
+        dlik_grad_dsigma = ((2*self.sigma*self.v*(self.v + 1)*e)
+                            / ((self.v*(self.sigma**2) + e**2)**2)
+                           )
+        return np.squeeze(dlik_grad_dsigma)
+
+    def d2lik_d2f_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
+
+        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{(v + 1)((f-y)^2 - \sigma^2 v)}{((f-y)^2 + \sigma^2 v)}$$
+        """
+        y = np.squeeze(y)
+        f = np.squeeze(f)
+        assert y.shape == f.shape
+        e = y - f
+        dlik_hess_dsigma = ( ((v + 1)*(e**2 - (self.sigma**2)*self.v)) /
+                             ((e**2 + (self.sigma**2)*self.v)**2)
+                           )
+        return np.squeeze(dlik_hess_dsigma)
 
     def _gradients(self, y, f, extra_data=None):
-        return [self.link_grad_std(y, f, extra_data=extra_data),
-                self.link_hess_grad_std(y, f, extra_data=extra_data)] # list as we might learn many parameters
+        derivs = ([self.link_dstd(y, f, extra_data=extra_data)],
+                  [self.dlik_df_dstd(y, f, extra_data=extra_data)],
+                  [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+                 ) # lists as we might learn many parameters
+        # ensure we have gradients for every parameter we want to optimize
+        assert len(derivs[0]) == len(self._get_param_names())
+        assert len(derivs[1]) == len(self._get_param_names())
+        assert len(derivs[2]) == len(self._get_param_names())
+        return derivs
 
     def predictive_values(self, mu, var):
         """
@@ -412,7 +436,7 @@ class weibull_survival(likelihood_function):
         objective = v*(np.log(self.shape) + (self.shape - 1)*np.log(y) + f) - (y**self.shape)*np.exp(f)  # FIXME: CHECK THIS WITH BOOK, wheres scale?
         return np.sum(objective)
 
-    def link_grad(self, y, f, extra_data=None):
+    def dlik_df(self, y, f, extra_data=None):
         """
         Gradient of the link function at y, given f w.r.t f
 
@@ -432,7 +456,7 @@ class weibull_survival(likelihood_function):
         grad = v - (y**self.shape)*np.exp(f)
         return np.squeeze(grad)
 
-    def link_hess(self, y, f, extra_data=None):
+    def d2lik_d2f(self, y, f, extra_data=None):
         """
         Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
         i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index da379eb1..0b5a8db6 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -147,7 +147,7 @@ class GP(model):
         if isinstance(self.likelihood, Laplace):
             dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
+            fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
             dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)

From 20227fb2ac2c0d173eed515c7870864147a5d5d5 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 30 May 2013 16:17:37 +0100
Subject: [PATCH 045/252] Made more numerically stable in a hope that it will
 work and I will find a bug...

---
 GPy/examples/laplace_approximations.py  | 10 +++---
 GPy/likelihoods/Laplace.py              | 45 ++++++++++++++++---------
 GPy/likelihoods/likelihood_functions.py |  5 +--
 GPy/models/GP.py                        |  7 ++--
 4 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 27f063dc..203d308d 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -37,9 +37,9 @@ def timing():
 
 def debug_student_t_noise_approx():
     plot = False
-    real_var = 0.1
+    real_var = 0.4
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 2)[:, None]
+    X = np.linspace(0.0, 10.0, 100)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
     X_full = np.linspace(0.0, 10.0, 500)[:, None]
@@ -89,12 +89,12 @@ def debug_student_t_noise_approx():
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     #m.constrain_positive('rbf')
-    m.constrain_fixed('rbf_v', 1.0898)
-    m.constrain_fixed('rbf_l', 1.8651)
+    #m.constrain_fixed('rbf_v', 1.0898)
+    #m.constrain_fixed('rbf_l', 1.8651)
     m.constrain_positive('t_noi')
     #m.constrain_fixed('t_noise_variance', real_sd)
     m.update_likelihood_approximation()
-    m.optimize('scg', messages=True)
+    m.optimize(messages=True)
     print(m)
     return m
     #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index f8ba25f1..85af82f9 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -79,41 +79,54 @@ class Laplace(likelihood):
         return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
 
     def _shared_gradients_components(self):
+        #FIXME: Careful of side effects! And make sure W and K are up to date!
         Ki, _, _, _ = pdinv(self.K)
-        Ki_W_i = inv(Ki + self.W) #Do it non numerically stable for now
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
-        dL_dfhat = -0.5*np.dot(np.diag(Ki_W_i), d3lik_d3fhat)
-        KW = np.dot(self.K, self.W)
-        I_KW_i = inv(np.eye(KW.shape[0]) + KW)
-        return dL_dfhat, Ki, I_KW_i
+        #dL_dfhat = -0.5*np.diag(self.Ki_W_i)*d3lik_d3fhat
+        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)*d3lik_d3fhat)[:, None]
+        Wi_K_i = mdot(self.W_12, self.Bi, self.W_12) #same as rasms R
+        I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i)
+        return dL_dfhat, Ki, I_KW_i, Wi_K_i
 
     def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
         """
         Gradients with respect to prior kernel parameters
         """
-        dL_dfhat, Ki, I_KW_i = self._shared_gradients_components()
-        K_Wi_i = inv(self.K + inv(self.W))
-        dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
+        dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components()
+        dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)[:, None]
 
         dL_dthetaK = np.zeros(dK_dthetaK.shape)
         for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
             #Explicit
-            dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(np.dot(K_Wi_i, dK_dthetaK_i))
+            dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(Wi_K_i*dK_dthetaK_i)
             #Implicit
-            df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK, dlp)
+            df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp)
             dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
 
-        return dL_dthetaK
+        return np.squeeze(dL_dthetaK)
 
     def _gradients(self, partial):
         """
         Gradients with respect to likelihood parameters
         """
-        dL_dfhat, Ki, I_KW_i = self._shared_gradients_components()
+        dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components()
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
-        dL_dthetaL = np.zeros(dlik_dthetaL.shape)
 
-        return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
+        num_params = len(dlik_dthetaL)
+        #Ki_W_i = np.diag(inv(Ki + self.W))[:, None]
+        dL_dthetaL = np.zeros((1, num_params)) # make space for one derivative for each likelihood parameter
+        for thetaL_i in range(num_params):
+            #Explicit
+            #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
+            #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None])
+            #                                               might be +
+            dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            #Implicit
+            df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+            dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL)
+
+        return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
         """
@@ -232,8 +245,8 @@ class Laplace(likelihood):
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
 
-        Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
-        self.ln_Ki_W_i_det = np.linalg.det(Ki_W_i)
+        self.Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
+        self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i)
 
         b = np.dot(self.W, self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
         solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b)))
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index d75e7218..c6186137 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -302,12 +302,13 @@ class student_t(likelihood_function):
         f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        dlik_hess_dsigma = ( ((v + 1)*(e**2 - (self.sigma**2)*self.v)) /
+        dlik_hess_dsigma = ( ((self.v + 1)*(e**2 - (self.sigma**2)*self.v)) /
                              ((e**2 + (self.sigma**2)*self.v)**2)
                            )
-        return np.squeeze(dlik_hess_dsigma)
+        return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):
+        #must be listed in same order as 'get_param_names'
         derivs = ([self.link_dstd(y, f, extra_data=extra_data)],
                   [self.dlik_df_dstd(y, f, extra_data=extra_data)],
                   [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 0b5a8db6..9ce83a5a 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -69,7 +69,6 @@ class GP(model):
         self.likelihood._set_params(p[self.kern.Nparam_transformed():])  # test by Nicolas
 
         if isinstance(self.likelihood, Laplace):
-            print "Updating approx: ", p
             self.likelihood.fit_full(self.kern.K(self.X))
             self.likelihood._set_params(self.likelihood._get_params())
 
@@ -134,7 +133,6 @@ class GP(model):
         matrix K* = K + diag(1./tau_tilde) plus a normalization term.
         """
         l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
-        print "Log likelihood: ", l
         return l
 
     def _log_likelihood_gradients(self):
@@ -145,17 +143,16 @@ class GP(model):
         """
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
         if isinstance(self.likelihood, Laplace):
-            dL_dthetaK_explicit = dL_dthetaK
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
             fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
             dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+            #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+            #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
 

From f9857e08c0b4f130f2ae8ace5264e9ba65d9687c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 31 May 2013 11:55:32 +0100
Subject: [PATCH 046/252] Broken it by getting rid of squeeze, but now working
 on making it faster using proper vector multiplciation for diagonals

---
 GPy/examples/laplace_approximations.py  | 12 +++--
 GPy/likelihoods/Laplace.py              | 45 ++++++----------
 GPy/likelihoods/likelihood_functions.py | 69 +++++++++++++------------
 GPy/models/GP.py                        | 13 ++++-
 4 files changed, 69 insertions(+), 70 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 203d308d..5103eefb 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -37,9 +37,10 @@ def timing():
 
 def debug_student_t_noise_approx():
     plot = False
-    real_var = 0.4
+    real_var = 0.1
     #Start a function, any function
     X = np.linspace(0.0, 10.0, 100)[:, None]
+    #X = np.array([0.5])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
     X_full = np.linspace(0.0, 10.0, 500)[:, None]
@@ -52,7 +53,7 @@ def debug_student_t_noise_approx():
     real_sd = np.sqrt(real_var)
     print "Real noise: ", real_sd
 
-    initial_var_guess = 1
+    initial_var_guess = 0.02
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
@@ -91,12 +92,14 @@ def debug_student_t_noise_approx():
     #m.constrain_positive('rbf')
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 1.8651)
-    m.constrain_positive('t_noi')
     #m.constrain_fixed('t_noise_variance', real_sd)
+    m.constrain_positive('rbf')
+    m.constrain_fixed('t_noi', real_sd)
+    m.ensure_default_constraints()
     m.update_likelihood_approximation()
     m.optimize(messages=True)
     print(m)
-    return m
+    #return m
     #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
     if plot:
         plt.suptitle('Student-t likelihood')
@@ -104,6 +107,7 @@ def debug_student_t_noise_approx():
         m.plot()
         plt.plot(X_full, Y_full)
         plt.ylim(-2.5, 2.5)
+    return m
 
     #print "Clean student t, ncg"
     #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 85af82f9..027f014e 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -53,7 +53,7 @@ class Laplace(likelihood):
 
     def predictive_values(self, mu, var, full_cov):
         if full_cov:
-            raise NotImplementedError("Cannot make correlated predictions with an EP likelihood")
+            raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
         return self.likelihood_function.predictive_values(mu, var)
 
     def _get_params(self):
@@ -63,42 +63,28 @@ class Laplace(likelihood):
         return self.likelihood_function._get_param_names()
 
     def _set_params(self, p):
-        #print "Setting laplace param with: ", p
         return self.likelihood_function._set_params(p)
 
-    def both_gradients(self, dL_d_K_Sigma, dK_dthetaK):
-        """
-        Find the gradients of the marginal likelihood w.r.t both thetaK and thetaL
-
-        dL_dthetaK differs from that of normal likelihoods as it has additional terms coming from
-        changes to y_tilde and changes to Sigma_tilde when the kernel parameters are adjusted
-
-        Similar terms arise when finding the gradients with respect to changes in the liklihood
-        parameters
-        """
-        return (self._Kgradients(dL_d_K_Sigma, dK_dthetaK), self._gradients(dL_d_K_Sigma))
-
     def _shared_gradients_components(self):
         #FIXME: Careful of side effects! And make sure W and K are up to date!
-        Ki, _, _, _ = pdinv(self.K)
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
-        #dL_dfhat = -0.5*np.diag(self.Ki_W_i)*d3lik_d3fhat
         dL_dfhat = -0.5*(np.diag(self.Ki_W_i)*d3lik_d3fhat)[:, None]
         Wi_K_i = mdot(self.W_12, self.Bi, self.W_12) #same as rasms R
         I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i)
-        return dL_dfhat, Ki, I_KW_i, Wi_K_i
+        return dL_dfhat, I_KW_i, Wi_K_i
 
-    def _Kgradients(self, dL_d_K_Sigma, dK_dthetaK):
+    def _Kgradients(self, dK_dthetaK):
         """
         Gradients with respect to prior kernel parameters
         """
-        dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components()
+        dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
         dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)[:, None]
 
         dL_dthetaK = np.zeros(dK_dthetaK.shape)
         for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
             #Explicit
-            dL_dthetaK[thetaK_i] = 0.5*mdot(self.f_hat.T, Ki, dK_dthetaK_i, Ki, self.f_hat) - 0.5*np.trace(Wi_K_i*dK_dthetaK_i)
+            f_Ki_dK_dtheta_Ki_f = mdot(self.Ki_f.T, dK_dthetaK_i, self.Ki_f)
+            dL_dthetaK[thetaK_i] = 0.5*f_Ki_dK_dtheta_Ki_f - 0.5*np.trace(Wi_K_i*dK_dthetaK_i)
             #Implicit
             df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp)
             dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
@@ -109,11 +95,12 @@ class Laplace(likelihood):
         """
         Gradients with respect to likelihood parameters
         """
-        dL_dfhat, Ki, I_KW_i, Wi_K_i = self._shared_gradients_components()
+        return np.zeros(1)
+        #return np.zeros(0)
+        dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
 
         num_params = len(dlik_dthetaL)
-        #Ki_W_i = np.diag(inv(Ki + self.W))[:, None]
         dL_dthetaL = np.zeros((1, num_params)) # make space for one derivative for each likelihood parameter
         for thetaL_i in range(num_params):
             #Explicit
@@ -123,7 +110,6 @@ class Laplace(likelihood):
             dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
-            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
             dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL)
 
         return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
@@ -230,10 +216,8 @@ class Laplace(likelihood):
         self._compute_likelihood_variables()
 
     def _compute_likelihood_variables(self):
-        #At this point get the hessian matrix
-        #print "Data: ", self.data
-        #print "fhat: ", self.f_hat
-        self.W = -np.diag(self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data))
+        #At this point get the hessian matrix (or vector as W is diagonal)
+        self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
             self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
@@ -273,7 +257,8 @@ class Laplace(likelihood):
         """
         #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        B = np.eye(K.shape[0]) + np.dot(W_12, np.dot(K, W_12))
+        assert np.all(W_12.T*K*W_12 == np.dot(np.diagflat(W_12), np.dot(K, np.diagflat(W_12)))) # FIXME Take this out when you've done multiinput
+        B = np.eye(K.shape[0]) + W_12.T*K*W_12
         L = jitchol(B)
         return (B, L, W_12)
 
@@ -330,7 +315,7 @@ class Laplace(likelihood):
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             #f_old = f.copy()
-            W = -np.diag(self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data))
+            W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
                 W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@@ -339,7 +324,7 @@ class Laplace(likelihood):
             B, L, W_12 = self._compute_B_statistics(K, W)
 
             W_f = np.dot(W, f)
-            grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)[:, None]
+            grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
             #Find K_i_f
             b = W_f + grad
 
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index c6186137..c3aee835 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -191,8 +191,8 @@ class student_t(likelihood_function):
         :returns: float(likelihood evaluated for this point)
 
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
 
         e = y - f
@@ -207,7 +207,7 @@ class student_t(likelihood_function):
         """
         Gradient of the link function at y, given f w.r.t f
 
-        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{-(v+1)(f_{i}-y_{i})}{(f_{i}-y_{i})^{2} + \sigma^{2}v}$$
+        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
 
         :y: data
         :f: latent variables f
@@ -215,51 +215,52 @@ class student_t(likelihood_function):
         :returns: gradient of likelihood evaluated at points
 
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        grad = -((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
-        return np.squeeze(grad)
+        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
         """
         Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
         i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
 
-        Will return diagonal of hessian, since every where else it is 0
+        Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+        (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
 
-        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((f_{i}-y_{i})^{2} - \sigma^{2}v)}{((f_{i}-y_{i})^{2} + \sigma^{2}v)^{2}}$$
+        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
 
         :y: data
         :f: latent variables f
         :extra_data: extra_data which is not used in student t distribution
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
 
         e = y - f
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
-        return np.squeeze(hess)
+        return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
         """
         Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
 
-        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((f_{i} - y_{i})^3 - 3(f_{i} - y_{i}) \sigma^{2} v))}{((f_{i} - y_{i}) + \sigma^{2} v)^3}$$
+        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        d3lik_d3f = ( -(2*(self.v + 1)*(e**3 - e*3*self.v*(self.sigma**2))) /
+        d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
                        ((e**2 + (self.sigma**2)*self.v)**3)
                     )
-        return np.squeeze(d3lik_d3f)
+        return d3lik_d3f
 
-    def link_dstd(self, y, f, extra_data=None):
+    def lik_dstd(self, y, f, extra_data=None):
         """
         Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
 
@@ -268,48 +269,48 @@ class student_t(likelihood_function):
 
         $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        dlik_dsigma = ( (1/self.sigma) -
-                        ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + (e**2) / ((self.sigma**2)*self.v) ) )
+        dlik_dsigma = ( - (1/self.sigma) +
+                        ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) )
                       )
-        return np.squeeze(dlik_dsigma)
+        return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
         """
         Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
 
-        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{2\sigma v(v + 1)(f-y)}{(f-y)^2 + \sigma^2 v)^2}$$
+        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        dlik_grad_dsigma = ((2*self.sigma*self.v*(self.v + 1)*e)
+        dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e)
                             / ((self.v*(self.sigma**2) + e**2)**2)
                            )
-        return np.squeeze(dlik_grad_dsigma)
+        return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
         """
         Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
 
-        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{(v + 1)((f-y)^2 - \sigma^2 v)}{((f-y)^2 + \sigma^2 v)}$$
+        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
-        y = np.squeeze(y)
-        f = np.squeeze(f)
+        #y = np.squeeze(y)
+        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
-        dlik_hess_dsigma = ( ((self.v + 1)*(e**2 - (self.sigma**2)*self.v)) /
-                             ((e**2 + (self.sigma**2)*self.v)**2)
+        dlik_hess_dsigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
+                              ((e**2 + (self.sigma**2)*self.v)**3)
                            )
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):
         #must be listed in same order as 'get_param_names'
-        derivs = ([self.link_dstd(y, f, extra_data=extra_data)],
+        derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
                   [self.dlik_df_dstd(y, f, extra_data=extra_data)],
                   [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
                  ) # lists as we might learn many parameters
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 9ce83a5a..0f3dcb58 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -142,13 +142,22 @@ class GP(model):
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
+        print "dL_dthetaK before: ",dL_dthetaK
         if isinstance(self.likelihood, Laplace):
+            #Reapproximate incase it hasnt been done...
+            if isinstance(self.likelihood, Laplace):
+                self.likelihood.fit_full(self.kern.K(self.X))
+                self.likelihood._set_params(self.likelihood._get_params())
+
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
             fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
+            #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
+            #THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
 
-            dL_dthetaK = self.likelihood._Kgradients(dL_d_K_Sigma=self.dL_dK, dK_dthetaK=dK_dthetaK)
-            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
+            dL_dthetaL = 0 # self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            print "dL_dthetaK after: ",dL_dthetaK
             #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From e842f6e68735adaf95b31d0bc3c074dc39d553ea Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 31 May 2013 16:45:22 +0100
Subject: [PATCH 047/252] Made it use the fact that W is diagonal and put
 assertions in to ensure that the results are the same

---
 GPy/likelihoods/Laplace.py | 99 ++++++++++++++++++++++++++++----------
 GPy/models/GP.py           |  2 +-
 2 files changed, 75 insertions(+), 26 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 027f014e..af74755f 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -68,8 +68,11 @@ class Laplace(likelihood):
     def _shared_gradients_components(self):
         #FIXME: Careful of side effects! And make sure W and K are up to date!
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
-        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)*d3lik_d3fhat)[:, None]
-        Wi_K_i = mdot(self.W_12, self.Bi, self.W_12) #same as rasms R
+        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat)
+        Wi_K_i = mdot(np.diagflat(self.W_12), self.Bi, np.diagflat(self.W_12)) #same as rasms R
+        Wi_K_inew = self.W_12*self.Bi*self.W_12.T #same as rasms R
+        assert np.all(Wi_K_i == Wi_K_inew)
+
         I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i)
         return dL_dfhat, I_KW_i, Wi_K_i
 
@@ -78,7 +81,7 @@ class Laplace(likelihood):
         Gradients with respect to prior kernel parameters
         """
         dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
-        dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)[:, None]
+        dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
 
         dL_dthetaK = np.zeros(dK_dthetaK.shape)
         for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
@@ -89,7 +92,7 @@ class Laplace(likelihood):
             df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp)
             dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
 
-        return np.squeeze(dL_dthetaK)
+        return dL_dthetaK
 
     def _gradients(self, partial):
         """
@@ -112,7 +115,7 @@ class Laplace(likelihood):
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
             dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL)
 
-        return np.squeeze(dL_dthetaL) #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
+        return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
     def _compute_GP_variables(self):
         """
@@ -147,7 +150,9 @@ class Laplace(likelihood):
         #((L.T*w)_i + I)f_hat = y_tilde
         L = jitchol(self.K)
         Li = chol_inv(L)
-        Lt_W = np.dot(L.T, self.W) #FIXME: Can make Faster
+        Lt_W = np.dot(L.T, np.diagflat(self.W)) #FIXME: Can make Faster
+        Lt_Wnew = L.T*self.W.T
+        assert np.all(Lt_Wnew == Lt_W)
 
         ##Check it isn't singular!
         if cond(Lt_W) > epsilon:
@@ -159,12 +164,27 @@ class Laplace(likelihood):
 
         #f.T(Ki + W)f
         f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
-                    + mdot(self.f_hat.T, self.W, self.f_hat)
+                    + mdot(self.f_hat.T, np.diagflat(self.W), self.f_hat)
                     )
+        f_Ki_W_fnew = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
+                    + mdot(self.f_hat.T, self.W*self.f_hat)
+                    )
+        assert np.all(f_Ki_W_f == f_Ki_W_fnew)
 
-        y_W_f = mdot(Y_tilde.T, self.W, self.f_hat)
-        y_W_y = mdot(Y_tilde.T, self.W, Y_tilde)
-        ln_W_det = det_ln_diag(self.W)
+        y_W_f = mdot((Y_tilde.T, np.diagflat(self.W)), self.f_hat)
+        y_W_fnew = mdot(Y_tilde.T*self.W.T, self.f_hat)
+        assert np.all(y_W_f == y_W_fnew)
+
+
+        y_W_y = mdot((Y_tilde.T, np.diagflat(self.W)), Y_tilde)
+        y_W_ynew = mdot(Y_tilde.T, self.W*Y_tilde)
+        assert np.all(y_W_y == y_W_ynew)
+
+        ln_W_det = det_ln_diag(np.diagflat(self.W))
+        ln_W_detnew = np.log(self.W).sum()
+        assert np.all(ln_W_det == ln_W_detnew)
+
+        #FIXME: Revisit this
         Z_tilde = (- self.NORMAL_CONST
                    + 0.5*self.ln_K_det
                    + 0.5*ln_W_det
@@ -189,14 +209,16 @@ class Laplace(likelihood):
         if cond(self.W) > epsilon:
             print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
 
-        self.Sigma_tilde = inv(self.W)  # Damn
+        self.Sigma_tilde = inv(np.diagflat(self.W))  # Damn
+        Sigma_tildenew = np.diagflat(1.0/self.W)
+        assert np.all(self.Sigma_tilde == Sigma_tildenew)
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
         self.YYT = np.dot(self.Y, self.Y.T)
         self.covariance_matrix = self.Sigma_tilde
-        self.precision = 1 / np.diag(self.covariance_matrix)[:, None]
+        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
 
     def fit_full(self, K):
         """
@@ -229,12 +251,24 @@ class Laplace(likelihood):
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
 
-        self.Ki_W_i = self.K - mdot(self.K, self.W_12, self.Bi, self.W_12, self.K)
+        self.Ki_W_i = self.K - mdot(self.K, (np.diagflat(self.W_12), self.Bi, np.diagflat(self.W_12)), self.K) # Funky, order matters on stability!
+        Ki_W_inew = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
+        assert np.all(self.Ki_W_i == Ki_W_inew)
+
         self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i)
 
-        b = np.dot(self.W, self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)[:, None]
-        solve_chol = cho_solve((self.B_chol, True), mdot(self.W_12, (self.K, b)))
-        a = b - mdot(self.W_12, solve_chol)
+        b = np.dot(np.diagflat(self.W), self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
+        bnew = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
+        assert np.all(b == bnew)
+
+        solve_chol = cho_solve((self.B_chol, True), mdot((np.diagflat(self.W_12), self.K), b))
+        solve_cholnew = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
+        assert np.all(solve_chol == solve_cholnew)
+
+        a = b - mdot(np.diagflat(self.W_12), solve_chol)
+        anew = b - self.W_12*solve_chol
+        assert np.all(a == anew)
+
         self.Ki_f = a
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
         self.ln_K_det = pddet(self.K)
@@ -255,10 +289,13 @@ class Laplace(likelihood):
         :W: Negative hessian at a point (diagonal matrix)
         :returns: (B, L)
         """
-        #W is diagnoal so its sqrt is just the sqrt of the diagonal elements
+        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        assert np.all(W_12.T*K*W_12 == np.dot(np.diagflat(W_12), np.dot(K, np.diagflat(W_12)))) # FIXME Take this out when you've done multiinput
-        B = np.eye(K.shape[0]) + W_12.T*K*W_12
+        # FIXME Take this out when you've done multiinput, Weirdly this is
+        # better when its W_12.T*K*W_12 which shouldnt make a difference
+        # because K is symmetrical
+        assert np.allclose(W_12*K*W_12.T, np.dot(np.diagflat(W_12), np.dot(K, np.diagflat(W_12))))
+        B = np.eye(self.N) + W_12*K*W_12.T
         L = jitchol(B)
         return (B, L, W_12)
 
@@ -323,19 +360,31 @@ class Laplace(likelihood):
                                     # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)
 
-            W_f = np.dot(W, f)
+            W_f = np.dot(np.diagflat(W), f)
+            W_fnew = W*f
+            assert np.all(W_f == W_fnew)
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
             #Find K_i_f
             b = W_f + grad
 
             #a should be equal to Ki*f now so should be able to use it
             c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-            solve_L = cho_solve((L, True), np.dot(W_12, c))
-            f = c - np.dot(K, np.dot(W_12, solve_L))
 
-            solve_L = cho_solve((L, True), np.dot(W_12, np.dot(K, b)))
-            a = b - np.dot(W_12, solve_L)
-            #f = np.dot(K, a)
+            solve_L = cho_solve((L, True), np.dot(np.diagflat(W_12), c))
+            solve_Lnew = cho_solve((L, True), W_12*c)
+            assert np.all(solve_L == solve_Lnew)
+
+            f = c - np.dot(K, np.dot(np.diagflat(W_12), solve_L))
+            fnew = c - np.dot(K, W_12*solve_L)
+            assert np.all(f == fnew)
+
+            solve_L = cho_solve((L, True), np.dot(np.diagflat(W_12), np.dot(K, b)))
+            solve_Lnew = cho_solve((L, True), W_12*np.dot(K, b))
+            assert np.all(solve_L == solve_Lnew)
+
+            a = b - np.dot(np.diagflat(W_12), solve_L)
+            anew = b - W_12*solve_L
+            assert np.all(a == anew)
 
             tmp_old_obj = old_obj
             old_obj = new_obj
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 0f3dcb58..787429de 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -156,7 +156,7 @@ class GP(model):
             #THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
 
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
-            dL_dthetaL = 0 # self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
             print "dL_dthetaK after: ",dL_dthetaK
             #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:

From 6c2975079517364f00b2345f0ef9b3d2f5a14103 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 31 May 2013 16:59:54 +0100
Subject: [PATCH 048/252] Took out all the asserts and using pure broadcasting
 method of diagonal now

---
 GPy/examples/laplace_approximations.py |  4 +-
 GPy/likelihoods/Laplace.py             | 70 ++++++--------------------
 GPy/models/GP.py                       |  3 +-
 3 files changed, 20 insertions(+), 57 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 5103eefb..14ff44a0 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -39,8 +39,8 @@ def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 100)[:, None]
-    #X = np.array([0.5])[:, None]
+    #X = np.linspace(0.0, 10.0, 100)[:, None]
+    X = np.array([0.5])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
     X_full = np.linspace(0.0, 10.0, 500)[:, None]
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index af74755f..74d37d48 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -69,9 +69,7 @@ class Laplace(likelihood):
         #FIXME: Careful of side effects! And make sure W and K are up to date!
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
         dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat)
-        Wi_K_i = mdot(np.diagflat(self.W_12), self.Bi, np.diagflat(self.W_12)) #same as rasms R
-        Wi_K_inew = self.W_12*self.Bi*self.W_12.T #same as rasms R
-        assert np.all(Wi_K_i == Wi_K_inew)
+        Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
 
         I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i)
         return dL_dfhat, I_KW_i, Wi_K_i
@@ -150,9 +148,7 @@ class Laplace(likelihood):
         #((L.T*w)_i + I)f_hat = y_tilde
         L = jitchol(self.K)
         Li = chol_inv(L)
-        Lt_W = np.dot(L.T, np.diagflat(self.W)) #FIXME: Can make Faster
-        Lt_Wnew = L.T*self.W.T
-        assert np.all(Lt_Wnew == Lt_W)
+        Lt_W = L.T*self.W.T
 
         ##Check it isn't singular!
         if cond(Lt_W) > epsilon:
@@ -164,25 +160,15 @@ class Laplace(likelihood):
 
         #f.T(Ki + W)f
         f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
-                    + mdot(self.f_hat.T, np.diagflat(self.W), self.f_hat)
-                    )
-        f_Ki_W_fnew = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
                     + mdot(self.f_hat.T, self.W*self.f_hat)
                     )
-        assert np.all(f_Ki_W_f == f_Ki_W_fnew)
 
-        y_W_f = mdot((Y_tilde.T, np.diagflat(self.W)), self.f_hat)
-        y_W_fnew = mdot(Y_tilde.T*self.W.T, self.f_hat)
-        assert np.all(y_W_f == y_W_fnew)
+        y_W_f = mdot(Y_tilde.T*self.W.T, self.f_hat)
 
 
-        y_W_y = mdot((Y_tilde.T, np.diagflat(self.W)), Y_tilde)
-        y_W_ynew = mdot(Y_tilde.T, self.W*Y_tilde)
-        assert np.all(y_W_y == y_W_ynew)
+        y_W_y = mdot(Y_tilde.T, self.W*Y_tilde)
 
-        ln_W_det = det_ln_diag(np.diagflat(self.W))
-        ln_W_detnew = np.log(self.W).sum()
-        assert np.all(ln_W_det == ln_W_detnew)
+        ln_W_det = np.log(self.W).sum()
 
         #FIXME: Revisit this
         Z_tilde = (- self.NORMAL_CONST
@@ -203,15 +189,13 @@ class Laplace(likelihood):
                    #+ y_W_f
                    #+ self.ln_z_hat
                    #)
-        self.Z_tilde = 0
+        #self.Z_tilde = 0
 
         ##Check it isn't singular!
         if cond(self.W) > epsilon:
             print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
 
-        self.Sigma_tilde = inv(np.diagflat(self.W))  # Damn
-        Sigma_tildenew = np.diagflat(1.0/self.W)
-        assert np.all(self.Sigma_tilde == Sigma_tildenew)
+        self.Sigma_tilde = np.diagflat(1.0/self.W)
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -251,23 +235,15 @@ class Laplace(likelihood):
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
 
-        self.Ki_W_i = self.K - mdot(self.K, (np.diagflat(self.W_12), self.Bi, np.diagflat(self.W_12)), self.K) # Funky, order matters on stability!
-        Ki_W_inew = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
-        assert np.all(self.Ki_W_i == Ki_W_inew)
+        self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
 
         self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i)
 
-        b = np.dot(np.diagflat(self.W), self.f_hat) + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
-        bnew = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
-        assert np.all(b == bnew)
+        b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
 
-        solve_chol = cho_solve((self.B_chol, True), mdot((np.diagflat(self.W_12), self.K), b))
-        solve_cholnew = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
-        assert np.all(solve_chol == solve_cholnew)
+        solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
 
-        a = b - mdot(np.diagflat(self.W_12), solve_chol)
-        anew = b - self.W_12*solve_chol
-        assert np.all(a == anew)
+        a = b - self.W_12*solve_chol
 
         self.Ki_f = a
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
@@ -291,10 +267,6 @@ class Laplace(likelihood):
         """
         #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
-        # FIXME Take this out when you've done multiinput, Weirdly this is
-        # better when its W_12.T*K*W_12 which shouldnt make a difference
-        # because K is symmetrical
-        assert np.allclose(W_12*K*W_12.T, np.dot(np.diagflat(W_12), np.dot(K, np.diagflat(W_12))))
         B = np.eye(self.N) + W_12*K*W_12.T
         L = jitchol(B)
         return (B, L, W_12)
@@ -360,9 +332,7 @@ class Laplace(likelihood):
                                     # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)
 
-            W_f = np.dot(np.diagflat(W), f)
-            W_fnew = W*f
-            assert np.all(W_f == W_fnew)
+            W_f = W*f
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
             #Find K_i_f
             b = W_f + grad
@@ -370,21 +340,13 @@ class Laplace(likelihood):
             #a should be equal to Ki*f now so should be able to use it
             c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
 
-            solve_L = cho_solve((L, True), np.dot(np.diagflat(W_12), c))
-            solve_Lnew = cho_solve((L, True), W_12*c)
-            assert np.all(solve_L == solve_Lnew)
+            solve_L = cho_solve((L, True), W_12*c)
 
-            f = c - np.dot(K, np.dot(np.diagflat(W_12), solve_L))
-            fnew = c - np.dot(K, W_12*solve_L)
-            assert np.all(f == fnew)
+            f = c - np.dot(K, W_12*solve_L)
 
-            solve_L = cho_solve((L, True), np.dot(np.diagflat(W_12), np.dot(K, b)))
-            solve_Lnew = cho_solve((L, True), W_12*np.dot(K, b))
-            assert np.all(solve_L == solve_Lnew)
+            solve_L = cho_solve((L, True), W_12*np.dot(K, b))
 
-            a = b - np.dot(np.diagflat(W_12), solve_L)
-            anew = b - W_12*solve_L
-            assert np.all(a == anew)
+            a = b - W_12*solve_L
 
             tmp_old_obj = old_obj
             old_obj = new_obj
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 787429de..0ba20d7b 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -152,8 +152,9 @@ class GP(model):
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
             fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
             #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
+
+            #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
             dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
-            #THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
 
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From f3b8dfb2225c8a25a0b753ec0e2f63b28cdec827 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 3 Jun 2013 14:51:09 +0100
Subject: [PATCH 049/252] about to input new derivations for Z's...

---
 GPy/examples/laplace_approximations.py | 15 +++++++++++---
 GPy/likelihoods/Laplace.py             | 28 ++++++++++++++++----------
 GPy/models/GP.py                       | 17 ++++++++--------
 3 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 14ff44a0..ee71a950 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -143,11 +143,12 @@ def student_t_approx():
     Yc[10] += 100
     Yc[25] += 10
     Yc[23] += 10
+    Yc[26] += 1000
     Yc[24] += 10
     #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 1000000000000
+    deg_free = 10
     real_sd = np.sqrt(real_var)
     print "Real noise: ", real_sd
 
@@ -187,21 +188,25 @@ def student_t_approx():
     plt.subplot(211)
     m.plot()
     plt.plot(X_full, Y_full)
+    plt.title('Gaussian clean')
     print m
 
     #Corrupt
     print "Corrupt Gaussian"
     m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
     m.ensure_default_constraints()
-    m.optimize()
+    #m.optimize()
     plt.subplot(212)
     m.plot()
     plt.plot(X_full, Y_full)
+    plt.title('Gaussian corrupt')
     print m
 
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
-    edited_real_sd = initial_var_guess #real_sd
+    edited_real_sd = real_sd #initial_var_guess
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -215,6 +220,7 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
+    plt.title('Student-t rasm clean')
 
     print "Corrupt student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -228,6 +234,7 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
+    plt.title('Student-t rasm corrupt')
 
     print "Clean student t, ncg"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -241,6 +248,7 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
+    plt.title('Student-t ncg clean')
 
     print "Corrupt student t, ncg"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -254,6 +262,7 @@ def student_t_approx():
     m.plot()
     plt.plot(X_full, Y_full)
     plt.ylim(-2.5, 2.5)
+    plt.title('Student-t ncg corrupt')
 
 
     ###with a student t distribution, since it has heavy tails it should work well
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 74d37d48..45fddeaa 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -6,7 +6,10 @@ from numpy.linalg import cond
 from likelihood import likelihood
 from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.lapack import dtrtrs
+import random
 #import pylab as plt
+np.random.seed(50)
+random.seed(50)
 
 
 class Laplace(likelihood):
@@ -156,6 +159,7 @@ class Laplace(likelihood):
 
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
         self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
+
         Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
         #f.T(Ki + W)f
@@ -239,15 +243,15 @@ class Laplace(likelihood):
 
         self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i)
 
+        #Do the computation again at f to get Ki_f which is useful
         b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
-
         solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
-
         a = b - self.W_12*solve_chol
-
         self.Ki_f = a
+
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
         self.ln_K_det = pddet(self.K)
+        #_, _, _, self.ln_K_det = pdinv(self.K)
 
         self.ln_z_hat = (- 0.5*self.f_Ki_f
                          - 0.5*self.ln_K_det
@@ -296,7 +300,7 @@ class Laplace(likelihood):
             res = -1 * (--np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
             return np.squeeze(res)
 
-        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
+        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
     def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50):
@@ -336,17 +340,19 @@ class Laplace(likelihood):
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
             #Find K_i_f
             b = W_f + grad
+            b = step_size*b
 
-            #a should be equal to Ki*f now so should be able to use it
-            c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-
-            solve_L = cho_solve((L, True), W_12*c)
-
-            f = c - np.dot(K, W_12*solve_L)
+            #Need this to find the f we have a stepsize which we need to move in, rather than a full unit movement
+            #c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
+            #solve_L = cho_solve((L, True), W_12*c)
+            #f = c - np.dot(K, W_12*solve_L)
 
+            #FIXME: Can't we get rid of this? Don't we want to evaluate obj(c,f) and this is our new_obj?
+            #Why did I choose to evaluate the objective function at the new f with the old hessian? I'm sure there was a good reason,
+            #Document it!
             solve_L = cho_solve((L, True), W_12*np.dot(K, b))
-
             a = b - W_12*solve_L
+            f = np.dot(K, a)
 
             tmp_old_obj = old_obj
             old_obj = new_obj
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 0ba20d7b..e4ed52ef 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -142,23 +142,22 @@ class GP(model):
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
-        print "dL_dthetaK before: ",dL_dthetaK
         if isinstance(self.likelihood, Laplace):
             #Reapproximate incase it hasnt been done...
-            if isinstance(self.likelihood, Laplace):
-                self.likelihood.fit_full(self.kern.K(self.X))
-                self.likelihood._set_params(self.likelihood._get_params())
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
+            print self.kern._get_params()
 
             #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
+            #fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
             #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
 
             #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
-            dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
+            #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
-            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
-            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            print "dL_dthetaK after: ",dL_dthetaK
+            #dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
+            dL_dthetaL = 0 #self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            #print "dL_dthetaK after: ",dL_dthetaK
             #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From ac461e1b2aa65afa08359e1ac6d6cb8956e962b4 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 18 Jun 2013 17:55:58 +0100
Subject: [PATCH 050/252] Checkgrads with explicit and implicit components half
 the time

---
 GPy/examples/laplace_approximations.py |  69 +++++++--------
 GPy/likelihoods/Laplace.py             | 114 +++++++++++--------------
 GPy/models/GP.py                       |   7 +-
 GPy/util/linalg.py                     |   2 +-
 4 files changed, 91 insertions(+), 101 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index ee71a950..5120dfb5 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -39,11 +39,11 @@ def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
     #Start a function, any function
-    #X = np.linspace(0.0, 10.0, 100)[:, None]
-    X = np.array([0.5])[:, None]
+    X = np.linspace(0.0, 10.0, 15)[:, None]
+    #X = np.array([0.5])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
-    X_full = np.linspace(0.0, 10.0, 500)[:, None]
+    X_full = np.linspace(0.0, 10.0, 15)[:, None]
     Y_full = np.sin(X_full)
 
     Y = Y/Y.max()
@@ -83,7 +83,8 @@ def debug_student_t_noise_approx():
         #plt.plot(X_full, Y_full)
     #print m
 
-    edited_real_sd = initial_var_guess #real_sd
+    #edited_real_sd = initial_var_guess #real_sd
+    edited_real_sd = real_sd
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
@@ -94,7 +95,7 @@ def debug_student_t_noise_approx():
     #m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_variance', real_sd)
     m.constrain_positive('rbf')
-    m.constrain_fixed('t_noi', real_sd)
+    #m.constrain_fixed('t_noi', real_sd)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
     m.optimize(messages=True)
@@ -148,7 +149,7 @@ def student_t_approx():
     #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10
+    deg_free = 8
     real_sd = np.sqrt(real_var)
     print "Real noise: ", real_sd
 
@@ -202,8 +203,6 @@ def student_t_approx():
     plt.title('Gaussian corrupt')
     print m
 
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
     edited_real_sd = real_sd #initial_var_guess
@@ -236,33 +235,35 @@ def student_t_approx():
     plt.ylim(-2.5, 2.5)
     plt.title('Student-t rasm corrupt')
 
-    print "Clean student t, ncg"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
-    m = GPy.models.GP(X, stu_t_likelihood, kernel3)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    plt.subplot(221)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
-    plt.title('Student-t ncg clean')
+    return m
 
-    print "Corrupt student t, ncg"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=False)
-    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    plt.subplot(223)
-    m.plot()
-    plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
-    plt.title('Student-t ncg corrupt')
+    #print "Clean student t, ncg"
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
+    #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    #m.ensure_default_constraints()
+    #m.update_likelihood_approximation()
+    #m.optimize()
+    #print(m)
+    #plt.subplot(221)
+    #m.plot()
+    #plt.plot(X_full, Y_full)
+    #plt.ylim(-2.5, 2.5)
+    #plt.title('Student-t ncg clean')
+
+    #print "Corrupt student t, ncg"
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=False)
+    #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    #m.ensure_default_constraints()
+    #m.update_likelihood_approximation()
+    #m.optimize()
+    #print(m)
+    #plt.subplot(223)
+    #m.plot()
+    #plt.plot(X_full, Y_full)
+    #plt.ylim(-2.5, 2.5)
+    #plt.title('Student-t ncg corrupt')
 
 
     ###with a student t distribution, since it has heavy tails it should work well
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 45fddeaa..a8347345 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -8,9 +8,6 @@ from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
 from scipy.linalg.lapack import dtrtrs
 import random
 #import pylab as plt
-np.random.seed(50)
-random.seed(50)
-
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
@@ -45,7 +42,7 @@ class Laplace(likelihood):
         self.is_heteroscedastic = True
         self.Nparams = 0
 
-        self.NORMAL_CONST = -((0.5 * self.N) * np.log(2 * np.pi))
+        self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
 
         #Initial values for the GP variables
         self.Y = np.zeros((self.N, 1))
@@ -72,26 +69,36 @@ class Laplace(likelihood):
         #FIXME: Careful of side effects! And make sure W and K are up to date!
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
         dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat)
+
         Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
 
         I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i)
         return dL_dfhat, I_KW_i, Wi_K_i
 
-    def _Kgradients(self, dK_dthetaK):
+    def _Kgradients(self, dK_dthetaK, X):
         """
         Gradients with respect to prior kernel parameters
         """
         dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
         dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
 
-        dL_dthetaK = np.zeros(dK_dthetaK.shape)
-        for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
-            #Explicit
-            f_Ki_dK_dtheta_Ki_f = mdot(self.Ki_f.T, dK_dthetaK_i, self.Ki_f)
-            dL_dthetaK[thetaK_i] = 0.5*f_Ki_dK_dtheta_Ki_f - 0.5*np.trace(Wi_K_i*dK_dthetaK_i)
-            #Implicit
-            df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp)
-            dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
+        #Implicit
+        impl = mdot(dlp, dL_dfhat.T, I_KW_i)
+        expl_a = - mdot(self.Ki_f, self.Ki_f.T)
+        expl_b = Wi_K_i
+        expl = 0.5*expl_a - 0.5*expl_b
+        dL_dthetaK_exp = dK_dthetaK(expl, X)
+        dL_dthetaK_imp = dK_dthetaK(impl, X)
+        dL_dthetaK = -(dL_dthetaK_imp + dL_dthetaK_exp)
+
+        #dL_dthetaK = np.zeros(dK_dthetaK.shape)
+        #for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
+            ##Explicit
+            #f_Ki_dK_dtheta_Ki_f = mdot(self.Ki_f.T, dK_dthetaK_i, self.Ki_f)
+            #dL_dthetaK[thetaK_i] = 0.5*f_Ki_dK_dtheta_Ki_f - 0.5*np.trace(Wi_K_i*dK_dthetaK_i)
+            ##Implicit
+            #df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp)
+            #dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
 
         return dL_dthetaK
 
@@ -99,13 +106,12 @@ class Laplace(likelihood):
         """
         Gradients with respect to likelihood parameters
         """
-        return np.zeros(1)
-        #return np.zeros(0)
+        #return np.zeros(1)
         dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
 
         num_params = len(dlik_dthetaL)
-        dL_dthetaL = np.zeros((1, num_params)) # make space for one derivative for each likelihood parameter
+        dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter
         for thetaL_i in range(num_params):
             #Explicit
             #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
@@ -143,8 +149,6 @@ class Laplace(likelihood):
         $$\tilde{\Sigma} = W^{-1}$$
 
         """
-        epsilon = 1e14
-
         #Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
         #dtritri -> L -> L_i
         #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
@@ -153,54 +157,38 @@ class Laplace(likelihood):
         Li = chol_inv(L)
         Lt_W = L.T*self.W.T
 
-        ##Check it isn't singular!
-        if cond(Lt_W) > epsilon:
-            print "WARNING: L_inv.T * W matrix is singular,\nnumerical stability may be a problem"
-
         Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
         self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
 
         Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
-        #f.T(Ki + W)f
-        f_Ki_W_f = (np.dot(self.f_hat.T, cho_solve((L, True), self.f_hat))
-                    + mdot(self.f_hat.T, self.W*self.f_hat)
-                    )
+        ln_W_det = det_ln_diag(self.W)
+        yf_W_yf = mdot((Y_tilde - self.f_hat).T, np.diagflat(self.W), (Y_tilde - self.f_hat))
 
-        y_W_f = mdot(Y_tilde.T*self.W.T, self.f_hat)
-
-
-        y_W_y = mdot(Y_tilde.T, self.W*Y_tilde)
-
-        ln_W_det = np.log(self.W).sum()
-
-        #FIXME: Revisit this
-        Z_tilde = (- self.NORMAL_CONST
-                   + 0.5*self.ln_K_det
-                   + 0.5*ln_W_det
-                   + 0.5*self.ln_Ki_W_i_det
-                   + 0.5*f_Ki_W_f
-                   + 0.5*y_W_y
-                   - y_W_f
-                   + self.ln_z_hat
-                   )
-        #Z_tilde = (self.NORMAL_CONST
-                   #- 0.5*self.ln_K_det
-                   #- 0.5*ln_W_det
-                   #- 0.5*self.ln_Ki_W_i_det
-                   #- 0.5*f_Ki_W_f
-                   #- 0.5*y_W_y
-                   #+ y_W_f
+        #Z_tilde = (+ self.NORMAL_CONST
                    #+ self.ln_z_hat
+                   #+ 0.5*self.ln_I_KW_det
+                   #- 0.5*ln_W_det
+                   #+ 0.5*self.f_Ki_f
+                   #+ 0.5*yf_W_yf
                    #)
-        #self.Z_tilde = 0
-
-        ##Check it isn't singular!
-        if cond(self.W) > epsilon:
-            print "WARNING: Transformed covariance matrix is singular,\nnumerical stability may be a problem"
 
         self.Sigma_tilde = np.diagflat(1.0/self.W)
 
+        Ki, _, _, K_det = pdinv(self.K)
+        ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
+        W = np.diagflat(self.W)
+        Wi = self.Sigma_tilde
+        W12i = np.sqrt(Wi)
+        D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
+        fDf = mdot(self.f_hat.T, D, self.f_hat)
+        l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+        Z_tilde = (+ self.NORMAL_CONST
+                   + l
+                   + 0.5*ln_det_K_Wi__Bi
+                   - 0.5*fDf
+                  )
+
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
@@ -239,10 +227,6 @@ class Laplace(likelihood):
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
         self.Bi, _, _, B_det = pdinv(self.B)
 
-        self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
-
-        self.ln_Ki_W_i_det = np.linalg.det(self.Ki_W_i)
-
         #Do the computation again at f to get Ki_f which is useful
         b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
         solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
@@ -250,12 +234,14 @@ class Laplace(likelihood):
         self.Ki_f = a
 
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
-        self.ln_K_det = pddet(self.K)
-        #_, _, _, self.ln_K_det = pdinv(self.K)
+        self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
 
+        #For det, |I + KW| == |I + W_12*K*W_12|
+        self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
+
+        #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
         self.ln_z_hat = (- 0.5*self.f_Ki_f
-                         - 0.5*self.ln_K_det
-                         + 0.5*self.ln_Ki_W_i_det
+                         - self.ln_I_KW_det
                          + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
                          )
 
@@ -289,7 +275,7 @@ class Laplace(likelihood):
         #ONLY WORKS FOR 1D DATA
         def obj(f):
             res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
-                        + self.NORMAL_CONST)
+                        - self.NORMAL_CONST)
             return float(res)
 
         def obj_grad(f):
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index e4ed52ef..d56ee86f 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -141,6 +141,8 @@ class GP(model):
 
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
+        self.likelihood.fit_full(self.kern.K(self.X))
+        self.likelihood._set_params(self.likelihood._get_params())
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
         if isinstance(self.likelihood, Laplace):
             #Reapproximate incase it hasnt been done...
@@ -155,8 +157,9 @@ class GP(model):
             #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
             #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
 
-            #dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK=dK_dthetaK)
-            dL_dthetaL = 0 #self.likelihood._gradients(partial=np.diag(self.dL_dK))
+            dK_dthetaK = self.kern.dK_dtheta
+            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
             #print "dL_dthetaK after: ",dL_dthetaK
             #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 08e6fd99..f19acf1a 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -34,7 +34,7 @@ def det_ln_diag(A):
 
 def pddet(A):
     """
-    Determinant of a positive definite matrix
+    Determinant of a positive definite matrix, only symmetric matricies though
     """
     L = jitchol(A)
     logdetA = 2*sum(np.log(np.diag(L)))

From de689fa8e91928b7fc2d02f56d4eca14d82eaafd Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 19 Jun 2013 12:00:00 +0100
Subject: [PATCH 051/252] Now gradchecks everytime but student_t fit is bad,
 noise is underestimated by a long way

---
 GPy/examples/laplace_approximations.py  | 18 +++++++++--------
 GPy/likelihoods/Laplace.py              | 27 ++++++++++++++++---------
 GPy/likelihoods/likelihood_functions.py | 16 +--------------
 GPy/models/GP.py                        | 12 -----------
 4 files changed, 29 insertions(+), 44 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 5120dfb5..84527d08 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -39,28 +39,28 @@ def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 15)[:, None]
+    X = np.linspace(0.0, 10.0, 50)[:, None]
     #X = np.array([0.5])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
-    X_full = np.linspace(0.0, 10.0, 15)[:, None]
+    X_full = np.linspace(0.0, 10.0, 50)[:, None]
     Y_full = np.sin(X_full)
 
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10000
+    deg_free = 1000
     real_sd = np.sqrt(real_var)
-    print "Real noise: ", real_sd
+    print "Real noise std: ", real_sd
 
-    initial_var_guess = 0.02
+    initial_var_guess = 0.3
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
 
     plt.close('all')
     # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
@@ -83,22 +83,24 @@ def debug_student_t_noise_approx():
         #plt.plot(X_full, Y_full)
     #print m
 
-    #edited_real_sd = initial_var_guess #real_sd
+    edited_real_sd = initial_var_guess #real_sd
     edited_real_sd = real_sd
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m['white'] = 1e-3
     #m.constrain_positive('rbf')
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_variance', real_sd)
     m.constrain_positive('rbf')
+    m.constrain_positive('t_noise')
     #m.constrain_fixed('t_noi', real_sd)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
-    m.optimize(messages=True)
+    #m.optimize(messages=True)
     print(m)
     #return m
     #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index a8347345..5b1a814a 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -84,12 +84,13 @@ class Laplace(likelihood):
 
         #Implicit
         impl = mdot(dlp, dL_dfhat.T, I_KW_i)
-        expl_a = - mdot(self.Ki_f, self.Ki_f.T)
+        expl_a = mdot(self.Ki_f, self.Ki_f.T)
         expl_b = Wi_K_i
-        expl = 0.5*expl_a - 0.5*expl_b
+        expl = 0.5*expl_a + 0.5*expl_b
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
-        dL_dthetaK = -(dL_dthetaK_imp + dL_dthetaK_exp)
+        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp
 
         #dL_dthetaK = np.zeros(dK_dthetaK.shape)
         #for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
@@ -117,10 +118,12 @@ class Laplace(likelihood):
             #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
             #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None])
             #                                               might be +
-            dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
-            dL_dthetaL[thetaL_i] += np.dot(dL_dfhat.T, df_hat_dthetaL)
+            dL_dthetaL_imp = np.dot(dL_dfhat.T, df_hat_dthetaL)
+            #print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
+            dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp
 
         return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
@@ -180,14 +183,20 @@ class Laplace(likelihood):
         W = np.diagflat(self.W)
         Wi = self.Sigma_tilde
         W12i = np.sqrt(Wi)
-        D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
-        fDf = mdot(self.f_hat.T, D, self.f_hat)
+        #D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
+        #fDf = mdot(self.f_hat.T, D, self.f_hat)
         l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+        #print "fDf:{}   l:{}   detKWiBi:{}   W:{}   Wi:{}   Bi:{}   Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum())
+
+        y_Wi_Ki_i_y = mdot(Y_tilde.T, pdinv(self.K + Wi)[0], Y_tilde)
         Z_tilde = (+ self.NORMAL_CONST
                    + l
                    + 0.5*ln_det_K_Wi__Bi
-                   - 0.5*fDf
+                   #- 0.5*fDf
+                   - 0.5*self.f_Ki_f
+                   + 0.5*y_Wi_Ki_i_y
                   )
+        #print "Ztilde: {}".format(Z_tilde)
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -316,7 +325,7 @@ class Laplace(likelihood):
             #f_old = f.copy()
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-5     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index c3aee835..041b59bd 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -170,7 +170,7 @@ class student_t(likelihood_function):
         return np.asarray(self.sigma)
 
     def _get_param_names(self):
-        return ["t_noise_variance"]
+        return ["t_noise_std"]
 
     def _set_params(self, x):
         self.sigma = float(x)
@@ -191,8 +191,6 @@ class student_t(likelihood_function):
         :returns: float(likelihood evaluated for this point)
 
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
 
         e = y - f
@@ -215,8 +213,6 @@ class student_t(likelihood_function):
         :returns: gradient of likelihood evaluated at points
 
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
@@ -237,8 +233,6 @@ class student_t(likelihood_function):
         :extra_data: extra_data which is not used in student t distribution
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
 
         e = y - f
@@ -251,8 +245,6 @@ class student_t(likelihood_function):
 
         $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
@@ -269,8 +261,6 @@ class student_t(likelihood_function):
 
         $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         dlik_dsigma = ( - (1/self.sigma) +
@@ -284,8 +274,6 @@ class student_t(likelihood_function):
 
         $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e)
@@ -299,8 +287,6 @@ class student_t(likelihood_function):
 
         $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
-        #y = np.squeeze(y)
-        #f = np.squeeze(f)
         assert y.shape == f.shape
         e = y - f
         dlik_hess_dsigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index d56ee86f..636ebba0 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -145,18 +145,6 @@ class GP(model):
         self.likelihood._set_params(self.likelihood._get_params())
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
         if isinstance(self.likelihood, Laplace):
-            #Reapproximate incase it hasnt been done...
-            self.likelihood.fit_full(self.kern.K(self.X))
-            self.likelihood._set_params(self.likelihood._get_params())
-            print self.kern._get_params()
-
-            #Need to pass in a matrix of ones to get access to raw dK_dthetaK values without being chained
-            #fake_dL_dKs = np.ones(self.dL_dK.shape) #FIXME: Check this is right...
-            #fake_dL_dKs = np.eye(self.dL_dK.shape[0]) #FIXME: Check this is right...
-
-            #BUG: THIS SHOULD NOT BE (1,num_k_params) matrix it should be (N,N,num_k_params)
-            #dK_dthetaK = self.kern.dK_dtheta(dL_dK=fake_dL_dKs, X=self.X)
-
             dK_dthetaK = self.kern.dK_dtheta
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From e900509a7c146a80a866d29a4efaedfb10f1291a Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 19 Jun 2013 16:13:11 +0100
Subject: [PATCH 052/252] Fixed a sign wrong, now gradchecks weirdly only above
 certain points

---
 GPy/examples/laplace_approximations.py  | 61 ++++++++++++++++++++++---
 GPy/likelihoods/Laplace.py              | 47 +++----------------
 GPy/likelihoods/likelihood_functions.py |  7 ++-
 3 files changed, 64 insertions(+), 51 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 84527d08..887e35ae 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -35,6 +35,54 @@ def timing():
     print the_is
     print np.mean(the_is)
 
+def v_fail_test():
+    plt.close('all')
+    real_var = 0.1
+    X = np.linspace(0.0, 10.0, 50)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+    Y = Y/Y.max()
+
+    #Add student t random noise to datapoints
+    deg_free = 10
+    real_sd = np.sqrt(real_var)
+    print "Real noise std: ", real_sd
+
+    kernel1 = GPy.kern.white(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+
+    edited_real_sd = 0.3#real_sd
+    edited_real_sd = real_sd
+
+    print "Clean student t, rasm"
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
+    m = GPy.models.GP(X, stu_t_likelihood, kernel1)
+    m.constrain_fixed('white', 1)
+    vs = 15
+    noises = 40
+    checkgrads = np.zeros((vs, noises))
+    vs_noises = np.zeros((vs, noises))
+    for v_ind, v in enumerate(np.linspace(1, 20, vs)):
+        m.likelihood.likelihood_function.v = v
+        print v
+        for noise_ind, noise in enumerate(np.linspace(0.0000001, 1, noises)):
+            m['t_noise'] = noise
+            m.update_likelihood_approximation()
+            checkgrads[v_ind, noise_ind] = m.checkgrad()
+            vs_noises[v_ind, noise_ind] = (float(v)/(float(v) - 2))*(noise**2)
+
+    plt.figure(1)
+    plt.title('Checkgrads')
+    plt.imshow(checkgrads, interpolation='nearest')
+    plt.xlabel('noise')
+    plt.ylabel('v')
+
+    plt.figure(2)
+    plt.title('variance change')
+    plt.imshow(vs_noises, interpolation='nearest')
+    plt.xlabel('noise')
+    plt.ylabel('v')
+    print(m)
+
 def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
@@ -49,7 +97,7 @@ def debug_student_t_noise_approx():
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 1000
+    deg_free = 10
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
 
@@ -60,7 +108,7 @@ def debug_student_t_noise_approx():
 
     plt.close('all')
     # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
@@ -90,12 +138,11 @@ def debug_student_t_noise_approx():
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
-    m['white'] = 1e-3
-    #m.constrain_positive('rbf')
-    #m.constrain_fixed('rbf_v', 1.0898)
-    #m.constrain_fixed('rbf_l', 1.8651)
+    #m['white'] = 1e-3
+    m.constrain_fixed('rbf_v', 1.0898)
+    m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_variance', real_sd)
-    m.constrain_positive('rbf')
+    #m.constrain_positive('rbf')
     m.constrain_positive('t_noise')
     #m.constrain_fixed('t_noi', real_sd)
     m.ensure_default_constraints()
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 5b1a814a..70ec568a 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -70,54 +70,38 @@ class Laplace(likelihood):
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
         dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat)
 
-        Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
-
-        I_KW_i = np.eye(self.N) - np.dot(self.K, Wi_K_i)
-        return dL_dfhat, I_KW_i, Wi_K_i
+        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
+        return dL_dfhat, I_KW_i
 
     def _Kgradients(self, dK_dthetaK, X):
         """
         Gradients with respect to prior kernel parameters
         """
-        dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
         dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
 
         #Implicit
         impl = mdot(dlp, dL_dfhat.T, I_KW_i)
         expl_a = mdot(self.Ki_f, self.Ki_f.T)
-        expl_b = Wi_K_i
+        expl_b = self.Wi_K_i
         expl = 0.5*expl_a + 0.5*expl_b
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
         #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
         dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp
-
-        #dL_dthetaK = np.zeros(dK_dthetaK.shape)
-        #for thetaK_i, dK_dthetaK_i in enumerate(dK_dthetaK):
-            ##Explicit
-            #f_Ki_dK_dtheta_Ki_f = mdot(self.Ki_f.T, dK_dthetaK_i, self.Ki_f)
-            #dL_dthetaK[thetaK_i] = 0.5*f_Ki_dK_dtheta_Ki_f - 0.5*np.trace(Wi_K_i*dK_dthetaK_i)
-            ##Implicit
-            #df_hat_dthetaK = mdot(I_KW_i, dK_dthetaK_i, dlp)
-            #dL_dthetaK[thetaK_i] += np.dot(dL_dfhat.T, df_hat_dthetaK)
-
         return dL_dthetaK
 
     def _gradients(self, partial):
         """
         Gradients with respect to likelihood parameters
         """
-        #return np.zeros(1)
-        dL_dfhat, I_KW_i, Wi_K_i = self._shared_gradients_components()
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
 
         num_params = len(dlik_dthetaL)
         dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter
         for thetaL_i in range(num_params):
             #Explicit
-            #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(np.dot(Ki_W_i.T, np.diagflat(dlik_hess_dthetaL[thetaL_i])))
-            #dL_dthetaL[thetaL_i] = np.sum(dlik_dthetaL[thetaL_i]) + 0.5*np.dot(Ki_W_i.T, dlik_hess_dthetaL[thetaL_i][:, None])
-            #                                               might be +
             dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
@@ -165,34 +149,17 @@ class Laplace(likelihood):
 
         Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
-        ln_W_det = det_ln_diag(self.W)
-        yf_W_yf = mdot((Y_tilde - self.f_hat).T, np.diagflat(self.W), (Y_tilde - self.f_hat))
-
-        #Z_tilde = (+ self.NORMAL_CONST
-                   #+ self.ln_z_hat
-                   #+ 0.5*self.ln_I_KW_det
-                   #- 0.5*ln_W_det
-                   #+ 0.5*self.f_Ki_f
-                   #+ 0.5*yf_W_yf
-                   #)
-
         self.Sigma_tilde = np.diagflat(1.0/self.W)
 
-        Ki, _, _, K_det = pdinv(self.K)
+        self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
         ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
-        W = np.diagflat(self.W)
-        Wi = self.Sigma_tilde
-        W12i = np.sqrt(Wi)
-        #D = Ki - mdot((Ki + W), W12i, self.Bi, W12i, (Ki + W))
-        #fDf = mdot(self.f_hat.T, D, self.f_hat)
         l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
         #print "fDf:{}   l:{}   detKWiBi:{}   W:{}   Wi:{}   Bi:{}   Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum())
 
-        y_Wi_Ki_i_y = mdot(Y_tilde.T, pdinv(self.K + Wi)[0], Y_tilde)
+        y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
         Z_tilde = (+ self.NORMAL_CONST
                    + l
                    + 0.5*ln_det_K_Wi__Bi
-                   #- 0.5*fDf
                    - 0.5*self.f_Ki_f
                    + 0.5*y_Wi_Ki_i_y
                   )
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 041b59bd..d6dbf55f 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -194,10 +194,10 @@ class student_t(likelihood_function):
         assert y.shape == f.shape
 
         e = y - f
-        objective = (gammaln((self.v + 1) * 0.5)
+        objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - np.log(self.sigma * np.sqrt(self.v * np.pi))
-                     - (self.v + 1) * 0.5 * np.log(1 + ((e**2 / self.sigma**2) / self.v))
+                     - (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v))
                     )
         return np.sum(objective)
 
@@ -234,7 +234,6 @@ class student_t(likelihood_function):
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
         assert y.shape == f.shape
-
         e = y - f
         hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
         return hess
@@ -247,7 +246,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        d3lik_d3f = ( (2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
+        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
                        ((e**2 + (self.sigma**2)*self.v)**3)
                     )
         return d3lik_d3f

From d4bfd99c21c835e5cf7873e20295561c031d5221 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 20 Jun 2013 14:30:25 +0100
Subject: [PATCH 053/252] Starting to fiddle with mode finding code

---
 GPy/examples/laplace_approximations.py  | 18 ++++++++++--------
 GPy/likelihoods/Laplace.py              | 12 ++++++------
 GPy/likelihoods/likelihood_functions.py |  1 -
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 887e35ae..d300806f 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -36,7 +36,7 @@ def timing():
     print np.mean(the_is)
 
 def v_fail_test():
-    plt.close('all')
+    #plt.close('all')
     real_var = 0.1
     X = np.linspace(0.0, 10.0, 50)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
@@ -57,6 +57,7 @@ def v_fail_test():
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel1)
     m.constrain_fixed('white', 1)
+    m.constrain_positive('t_noise')
     vs = 15
     noises = 40
     checkgrads = np.zeros((vs, noises))
@@ -64,23 +65,24 @@ def v_fail_test():
     for v_ind, v in enumerate(np.linspace(1, 20, vs)):
         m.likelihood.likelihood_function.v = v
         print v
-        for noise_ind, noise in enumerate(np.linspace(0.0000001, 1, noises)):
+        for noise_ind, noise in enumerate(np.linspace(0.0001, 1, noises)):
             m['t_noise'] = noise
             m.update_likelihood_approximation()
             checkgrads[v_ind, noise_ind] = m.checkgrad()
             vs_noises[v_ind, noise_ind] = (float(v)/(float(v) - 2))*(noise**2)
 
-    plt.figure(1)
+    plt.figure()
     plt.title('Checkgrads')
     plt.imshow(checkgrads, interpolation='nearest')
     plt.xlabel('noise')
     plt.ylabel('v')
 
-    plt.figure(2)
+    plt.figure()
     plt.title('variance change')
     plt.imshow(vs_noises, interpolation='nearest')
     plt.xlabel('noise')
     plt.ylabel('v')
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print(m)
 
 def debug_student_t_noise_approx():
@@ -139,13 +141,13 @@ def debug_student_t_noise_approx():
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     #m['white'] = 1e-3
-    m.constrain_fixed('rbf_v', 1.0898)
-    m.constrain_fixed('rbf_l', 1.8651)
+    #m.constrain_fixed('rbf_v', 1.0898)
+    #m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_variance', real_sd)
     #m.constrain_positive('rbf')
-    m.constrain_positive('t_noise')
+    #m.constrain_positive('t_noise')
+    m.constrain_positive('')
     #m.constrain_fixed('t_noi', real_sd)
-    m.ensure_default_constraints()
     m.update_likelihood_approximation()
     #m.optimize(messages=True)
     print(m)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 70ec568a..ed3229a9 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -68,8 +68,7 @@ class Laplace(likelihood):
     def _shared_gradients_components(self):
         #FIXME: Careful of side effects! And make sure W and K are up to date!
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
-        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat)
-
+        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T
         I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
         return dL_dfhat, I_KW_i
 
@@ -81,10 +80,10 @@ class Laplace(likelihood):
         dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
 
         #Implicit
-        impl = mdot(dlp, dL_dfhat.T, I_KW_i)
+        impl = mdot(dlp, dL_dfhat, I_KW_i)
         expl_a = mdot(self.Ki_f, self.Ki_f.T)
         expl_b = self.Wi_K_i
-        expl = 0.5*expl_a + 0.5*expl_b
+        expl = 0.5*expl_a - 0.5*expl_b # Might need to be -?
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
         #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
@@ -103,10 +102,11 @@ class Laplace(likelihood):
         for thetaL_i in range(num_params):
             #Explicit
             dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(mdot(self.Bi, self.K, dlik_hess_dthetaL[thetaL_i]))
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
-            dL_dthetaL_imp = np.dot(dL_dfhat.T, df_hat_dthetaL)
-            #print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
+            dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL)
+            print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
             dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp
 
         return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index d6dbf55f..4d298122 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -192,7 +192,6 @@ class student_t(likelihood_function):
 
         """
         assert y.shape == f.shape
-
         e = y - f
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)

From e80fad197ca3250bca4e9d7830a23dadf8ae62e9 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 24 Jun 2013 15:39:38 +0100
Subject: [PATCH 054/252] trying to fix optimisation problem, fixed a few bugs
 but still fails at very low noise

---
 GPy/examples/laplace_approximations.py |  4 +-
 GPy/likelihoods/Laplace.py             | 79 +++++++++++++++-----------
 2 files changed, 49 insertions(+), 34 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index d300806f..7b9f10b1 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -90,7 +90,7 @@ def debug_student_t_noise_approx():
     real_var = 0.1
     #Start a function, any function
     X = np.linspace(0.0, 10.0, 50)[:, None]
-    #X = np.array([0.5])[:, None]
+    #X = np.array([0.5, 1])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
     X_full = np.linspace(0.0, 10.0, 50)[:, None]
@@ -99,7 +99,7 @@ def debug_student_t_noise_approx():
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10
+    deg_free = 100000
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
 
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index ed3229a9..b5362839 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -51,6 +51,8 @@ class Laplace(likelihood):
         self.Z = 0
         self.YYT = None
 
+        self.old_a = None
+
     def predictive_values(self, mu, var, full_cov):
         if full_cov:
             raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
@@ -83,7 +85,7 @@ class Laplace(likelihood):
         impl = mdot(dlp, dL_dfhat, I_KW_i)
         expl_a = mdot(self.Ki_f, self.Ki_f.T)
         expl_b = self.Wi_K_i
-        expl = 0.5*expl_a - 0.5*expl_b # Might need to be -?
+        expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
         #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
@@ -265,7 +267,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=500000, MAX_RESTART=50):
+    def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=40):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -275,7 +277,12 @@ class Laplace(likelihood):
         :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
         :returns: f_mode
         """
-        f = np.zeros((self.N, 1))
+        if self.old_a is None:
+            old_a = np.zeros((self.N, 1))
+        else:
+            old_a = self.old_a
+
+        f = np.dot(self.K, old_a)
         new_obj = -np.inf
         old_obj = np.inf
 
@@ -292,7 +299,7 @@ class Laplace(likelihood):
             #f_old = f.copy()
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-5     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-8     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
@@ -300,38 +307,46 @@ class Laplace(likelihood):
 
             W_f = W*f
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
-            #Find K_i_f
+
             b = W_f + grad
-            b = step_size*b
-
-            #Need this to find the f we have a stepsize which we need to move in, rather than a full unit movement
-            #c = np.dot(K, W_f) + f*(1-step_size) + step_size*np.dot(K, grad)
-            #solve_L = cho_solve((L, True), W_12*c)
-            #f = c - np.dot(K, W_12*solve_L)
-
-            #FIXME: Can't we get rid of this? Don't we want to evaluate obj(c,f) and this is our new_obj?
-            #Why did I choose to evaluate the objective function at the new f with the old hessian? I'm sure there was a good reason,
-            #Document it!
             solve_L = cho_solve((L, True), W_12*np.dot(K, b))
-            a = b - W_12*solve_L
-            f = np.dot(K, a)
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            full_step_a = b - W_12*solve_L
+            da = full_step_a - old_a
 
-            tmp_old_obj = old_obj
-            old_obj = new_obj
-            new_obj = obj(a, f)
-            difference = new_obj - old_obj
-            if difference < 0:
-                #print "Objective function rose", difference
-                #If the objective function isn't rising, restart optimization
-                step_size *= 0.9
-                #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
-                #objective function isn't increasing, try reducing step size
-                #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
-                old_obj = tmp_old_obj
-                rs += 1
+            update_passed = False
+            while not update_passed:
+                a = old_a + step_size*da
+                f = np.dot(K, a)
 
-            difference = abs(difference)
+                old_obj = new_obj
+                new_obj = np.float(obj(a, f))
+                difference = new_obj - old_obj
+                #print "difference: ",difference
+                if difference < 0:
+                    #print grad
+                    print "Objective function rose", np.float(difference)
+                    #If the objective function isn't rising, restart optimization
+                    step_size *= 0.8
+                    print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    #objective function isn't increasing, try reducing step size
+                    #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
+                    #old_obj = tmp_old_obj
+                    old_obj = new_obj
+                    rs += 1
+                else:
+                    update_passed = True
+
+            #print "Iter difference: ", difference
+            #print "F: ", f
+            #print "A: ", a
+            old_a = a
+            #print "Positive difference obj: ", np.float(difference)
+            difference = np.float(abs(difference))
             i += 1
 
-        self.i = i
+        #print "Positive difference obj: ", np.float(difference)
+        print "Iterations: ",i
+        print "Step size reductions", rs
+        print "Final difference: ", difference
         return f

From 064efd5535818b3ca6ec93baa83fc72ade12eb42 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 25 Jun 2013 18:20:00 +0100
Subject: [PATCH 055/252] Added another optimisation which doesn't use
 gradients. Seems like F is almost always found, but Y can be off, suggesting
 that Wi__Ki_W is wrong, maybe W?

---
 GPy/examples/laplace_approximations.py | 47 +++++++++---------
 GPy/likelihoods/Laplace.py             | 69 ++++++++++++++++----------
 2 files changed, 67 insertions(+), 49 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 7b9f10b1..61291e71 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -25,7 +25,7 @@ def timing():
         kernel1 = GPy.kern.rbf(X.shape[1])
 
         t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-        corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=True)
+        corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
         m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
         m.ensure_default_constraints()
         m.update_likelihood_approximation()
@@ -54,18 +54,17 @@ def v_fail_test():
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel1)
-    m.constrain_fixed('white', 1)
-    m.constrain_positive('t_noise')
-    vs = 15
+    m.constrain_positive('')
+    vs = 25
     noises = 40
     checkgrads = np.zeros((vs, noises))
     vs_noises = np.zeros((vs, noises))
-    for v_ind, v in enumerate(np.linspace(1, 20, vs)):
+    for v_ind, v in enumerate(np.linspace(1, 100, vs)):
         m.likelihood.likelihood_function.v = v
         print v
-        for noise_ind, noise in enumerate(np.linspace(0.0001, 1, noises)):
+        for noise_ind, noise in enumerate(np.linspace(0.0001, 10, noises)):
             m['t_noise'] = noise
             m.update_likelihood_approximation()
             checkgrads[v_ind, noise_ind] = m.checkgrad()
@@ -77,11 +76,11 @@ def v_fail_test():
     plt.xlabel('noise')
     plt.ylabel('v')
 
-    plt.figure()
-    plt.title('variance change')
-    plt.imshow(vs_noises, interpolation='nearest')
-    plt.xlabel('noise')
-    plt.ylabel('v')
+    #plt.figure()
+    #plt.title('variance change')
+    #plt.imshow(vs_noises, interpolation='nearest')
+    #plt.xlabel('noise')
+    #plt.ylabel('v')
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print(m)
 
@@ -93,13 +92,14 @@ def debug_student_t_noise_approx():
     #X = np.array([0.5, 1])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
 
-    X_full = np.linspace(0.0, 10.0, 50)[:, None]
+    X_full = X
     Y_full = np.sin(X_full)
 
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 100000
+    deg_free = 10
+
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
 
@@ -110,7 +110,7 @@ def debug_student_t_noise_approx():
 
     plt.close('all')
     # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1])# + GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
@@ -134,13 +134,13 @@ def debug_student_t_noise_approx():
     #print m
 
     edited_real_sd = initial_var_guess #real_sd
-    edited_real_sd = real_sd
+    #edited_real_sd = real_sd
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
-    #m['white'] = 1e-3
+    m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_variance', real_sd)
@@ -159,11 +159,12 @@ def debug_student_t_noise_approx():
         m.plot()
         plt.plot(X_full, Y_full)
         plt.ylim(-2.5, 2.5)
+    print "Real noise std: ", real_sd
     return m
 
     #print "Clean student t, ncg"
     #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
+    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
     #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
@@ -260,7 +261,7 @@ def student_t_approx():
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, rasm=True)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
@@ -274,7 +275,7 @@ def student_t_approx():
 
     print "Corrupt student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=True)
+    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
     m.update_likelihood_approximation()
@@ -290,7 +291,7 @@ def student_t_approx():
 
     #print "Clean student t, ncg"
     #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, rasm=False)
+    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
     #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
@@ -304,7 +305,7 @@ def student_t_approx():
 
     #print "Corrupt student t, ncg"
     #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
-    #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, rasm=False)
+    #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
     #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index b5362839..b9d74846 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -12,7 +12,7 @@ import random
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
-    def __init__(self, data, likelihood_function, extra_data=None, rasm=True):
+    def __init__(self, data, likelihood_function, extra_data=None, opt='rasm'):
         """
         Laplace Approximation
 
@@ -29,13 +29,13 @@ class Laplace(likelihood):
         :data: array of data the likelihood function is approximating
         :likelihood_function: likelihood function - subclass of likelihood_function
         :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
-        :rasm: Flag of whether to use rasmussens numerically stable mode finding or simple ncg optimisation
+        :opt: Optimiser to use, rasm numerically stable, ncg or nelder-mead (latter only work with 1d data)
 
         """
         self.data = data
         self.likelihood_function = likelihood_function
         self.extra_data = extra_data
-        self.rasm = rasm
+        self.opt = opt
 
         #Inital values
         self.N, self.D = self.data.shape
@@ -85,11 +85,12 @@ class Laplace(likelihood):
         impl = mdot(dlp, dL_dfhat, I_KW_i)
         expl_a = mdot(self.Ki_f, self.Ki_f.T)
         expl_b = self.Wi_K_i
+        #print "expl_a: {}, expl_b: {}".format(expl_a, expl_b)
         expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
-        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
-        dL_dthetaK = dL_dthetaK_imp + dL_dthetaK_exp
+        print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        dL_dthetaK = dL_dthetaK_exp +dL_dthetaK_imp
         return dL_dthetaK
 
     def _gradients(self, partial):
@@ -109,7 +110,7 @@ class Laplace(likelihood):
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
             dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL)
             print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
-            dL_dthetaL[thetaL_i] = dL_dthetaL_imp + dL_dthetaL_exp
+            dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
 
         return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
 
@@ -165,7 +166,7 @@ class Laplace(likelihood):
                    - 0.5*self.f_Ki_f
                    + 0.5*y_Wi_Ki_i_y
                   )
-        #print "Ztilde: {}".format(Z_tilde)
+        print "Ztilde: {}".format(Z_tilde)
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -183,10 +184,11 @@ class Laplace(likelihood):
         self.K = K.copy()
 
         #Find mode
-        if self.rasm:
-            self.f_hat = self.rasm_mode(K)
-        else:
-            self.f_hat = self.ncg_mode(K)
+        self.f_hat = {
+            'rasm': self.rasm_mode,
+            'ncg': self.ncg_mode,
+            'nelder': self.nelder_mode
+        }[self.opt](self.K)
 
         #Compute hessian and other variables at mode
         self._compute_likelihood_variables()
@@ -196,20 +198,20 @@ class Laplace(likelihood):
         self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            self.W[self.W < 0] = 1e-5  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                        #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                        #To cause the posterior to become less certain than the prior and likelihood,
                                        #This is a property only held by non-log-concave likelihoods
 
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
-        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
-        self.Bi, _, _, B_det = pdinv(self.B)
+        #self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
+        #self.Bi, _, _, B_det = pdinv(self.B)
 
         #Do the computation again at f to get Ki_f which is useful
-        b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
-        solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
-        a = b - self.W_12*solve_chol
-        self.Ki_f = a
+        #b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
+        #solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
+        #a = b - self.W_12*solve_chol
+        self.Ki_f = self.a
 
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
         self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
@@ -239,6 +241,17 @@ class Laplace(likelihood):
         L = jitchol(B)
         return (B, L, W_12)
 
+    def nelder_mode(self, K):
+        f = np.zeros((self.N, 1))
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
+        def obj(f):
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f)))
+            return float(res)
+
+        res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True})
+        f_new = res.x
+        return f_new[:, None]
+
     def ncg_mode(self, K):
         """
         Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
@@ -261,13 +274,13 @@ class Laplace(likelihood):
             return np.squeeze(res)
 
         def obj_hess(f):
-            res = -1 * (--np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
+            res = -1 * (np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
             return np.squeeze(res)
 
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=40):
+    def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=10):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -287,11 +300,10 @@ class Laplace(likelihood):
         old_obj = np.inf
 
         def obj(a, f):
-            #Careful of shape of data!
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-6
+        epsilon = 1e-9
         step_size = 1
         rs = 0
         i = 0
@@ -299,7 +311,7 @@ class Laplace(likelihood):
             #f_old = f.copy()
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-8     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
@@ -314,6 +326,7 @@ class Laplace(likelihood):
             full_step_a = b - W_12*solve_L
             da = full_step_a - old_a
 
+            f_old = f
             update_passed = False
             while not update_passed:
                 a = old_a + step_size*da
@@ -323,11 +336,11 @@ class Laplace(likelihood):
                 new_obj = np.float(obj(a, f))
                 difference = new_obj - old_obj
                 #print "difference: ",difference
-                if difference < 0:
+                if difference < -epsilon:
                     #print grad
                     print "Objective function rose", np.float(difference)
                     #If the objective function isn't rising, restart optimization
-                    step_size *= 0.8
+                    step_size *= 0.4
                     print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                     #objective function isn't increasing, try reducing step size
                     #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
@@ -337,16 +350,20 @@ class Laplace(likelihood):
                 else:
                     update_passed = True
 
+            difference = np.abs(np.sum(f - f_old)) + abs(difference)
             #print "Iter difference: ", difference
             #print "F: ", f
             #print "A: ", a
             old_a = a
             #print "Positive difference obj: ", np.float(difference)
-            difference = np.float(abs(difference))
+            #difference = np.float(abs(difference))
             i += 1
 
         #print "Positive difference obj: ", np.float(difference)
         print "Iterations: ",i
         print "Step size reductions", rs
         print "Final difference: ", difference
+        self.a = a
+        self.B, self.B_chol, self.W_12 = B, L, W_12
+        self.Bi, _, _, B_det = pdinv(self.B)
         return f

From 617d73ca3271f080ed2e58efd9cbd9a49e301ac0 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 26 Jun 2013 15:44:26 +0100
Subject: [PATCH 056/252] Now checkgrads a lot more of the time, but still
 fails in optimisation, seems also odd that when parameter is fixed kernel
 parameters go to infinity

---
 GPy/examples/laplace_approximations.py | 17 +++++++++++------
 GPy/likelihoods/Laplace.py             | 23 ++++++++---------------
 GPy/models/GP.py                       |  7 +++++--
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 61291e71..0fd3efeb 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -98,7 +98,7 @@ def debug_student_t_noise_approx():
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10
+    deg_free = 100
 
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
@@ -133,20 +133,23 @@ def debug_student_t_noise_approx():
         #plt.plot(X_full, Y_full)
     #print m
 
-    edited_real_sd = initial_var_guess #real_sd
+    real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
+    edited_real_sd = real_stu_t_std#initial_var_guess #real_sd
     #edited_real_sd = real_sd
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
-    m['rbf_len'] = 1.5
+    #m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 1.8651)
-    #m.constrain_fixed('t_noise_variance', real_sd)
+    m.constrain_fixed('t_noise_std', edited_real_sd)
     #m.constrain_positive('rbf')
-    #m.constrain_positive('t_noise')
-    m.constrain_positive('')
+    #m.constrain_positive('t_noise_std')
+    #m.constrain_positive('')
+    m.ensure_default_constraints()
     #m.constrain_fixed('t_noi', real_sd)
     m.update_likelihood_approximation()
     #m.optimize(messages=True)
@@ -264,6 +267,7 @@ def student_t_approx():
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     m.ensure_default_constraints()
+    m.constrain_positive('t_noise')
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
@@ -278,6 +282,7 @@ def student_t_approx():
     corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
+    m.constrain_positive('t_noise')
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index b9d74846..1431a7c6 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -109,7 +109,7 @@ class Laplace(likelihood):
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
             dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL)
-            print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
+            #print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
             dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
 
         return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
@@ -147,10 +147,11 @@ class Laplace(likelihood):
         Li = chol_inv(L)
         Lt_W = L.T*self.W.T
 
-        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=False)[0]
+        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
         self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
 
         Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
+        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         self.Sigma_tilde = np.diagflat(1.0/self.W)
 
@@ -166,7 +167,7 @@ class Laplace(likelihood):
                    - 0.5*self.f_Ki_f
                    + 0.5*y_Wi_Ki_i_y
                   )
-        print "Ztilde: {}".format(Z_tilde)
+        #print "Ztilde: {}".format(Z_tilde)
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -280,7 +281,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=500, MAX_RESTART=10):
+    def rasm_mode(self, K, MAX_ITER=250, MAX_RESTART=10):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -308,7 +309,6 @@ class Laplace(likelihood):
         rs = 0
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
-            #f_old = f.copy()
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
                 W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
@@ -338,10 +338,10 @@ class Laplace(likelihood):
                 #print "difference: ",difference
                 if difference < -epsilon:
                     #print grad
-                    print "Objective function rose", np.float(difference)
+                    #print "Objective function rose", np.float(difference)
                     #If the objective function isn't rising, restart optimization
                     step_size *= 0.4
-                    print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                     #objective function isn't increasing, try reducing step size
                     #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
                     #old_obj = tmp_old_obj
@@ -351,18 +351,11 @@ class Laplace(likelihood):
                     update_passed = True
 
             difference = np.abs(np.sum(f - f_old)) + abs(difference)
-            #print "Iter difference: ", difference
-            #print "F: ", f
-            #print "A: ", a
             old_a = a
-            #print "Positive difference obj: ", np.float(difference)
-            #difference = np.float(abs(difference))
             i += 1
 
         #print "Positive difference obj: ", np.float(difference)
-        print "Iterations: ",i
-        print "Step size reductions", rs
-        print "Final difference: ", difference
+        print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
         self.a = a
         self.B, self.B_chol, self.W_12 = B, L, W_12
         self.Bi, _, _, B_det = pdinv(self.B)
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 636ebba0..7b6fab27 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -141,10 +141,11 @@ class GP(model):
 
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
-        self.likelihood.fit_full(self.kern.K(self.X))
-        self.likelihood._set_params(self.likelihood._get_params())
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
+        print "dL_dthetaK should be: ", dL_dthetaK
         if isinstance(self.likelihood, Laplace):
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
             dK_dthetaK = self.kern.dK_dtheta
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
@@ -153,6 +154,8 @@ class GP(model):
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
             #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+        print "dL_dthetaK is: ", dL_dthetaK
+
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
 

From c90b1f0c99b84bf7e981113e5bfd83396b825ed1 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 27 Jun 2013 15:04:57 +0100
Subject: [PATCH 057/252] Added minimizer for finding f, doesn't help

---
 GPy/examples/laplace_approximations.py |  8 +--
 GPy/likelihoods/Laplace.py             | 80 ++++++++++++++++----------
 GPy/models/GP.py                       | 11 ++--
 3 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 0fd3efeb..abb5f4ce 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -58,13 +58,13 @@ def v_fail_test():
     m = GPy.models.GP(X, stu_t_likelihood, kernel1)
     m.constrain_positive('')
     vs = 25
-    noises = 40
+    noises = 30
     checkgrads = np.zeros((vs, noises))
     vs_noises = np.zeros((vs, noises))
     for v_ind, v in enumerate(np.linspace(1, 100, vs)):
         m.likelihood.likelihood_function.v = v
         print v
-        for noise_ind, noise in enumerate(np.linspace(0.0001, 10, noises)):
+        for noise_ind, noise in enumerate(np.linspace(0.0001, 100, noises)):
             m['t_noise'] = noise
             m.update_likelihood_approximation()
             checkgrads[v_ind, noise_ind] = m.checkgrad()
@@ -145,9 +145,9 @@ def debug_student_t_noise_approx():
     #m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 1.8651)
-    m.constrain_fixed('t_noise_std', edited_real_sd)
+    #m.constrain_fixed('t_noise_std', edited_real_sd)
     #m.constrain_positive('rbf')
-    #m.constrain_positive('t_noise_std')
+    m.constrain_positive('t_noise_std')
     #m.constrain_positive('')
     m.ensure_default_constraints()
     #m.constrain_fixed('t_noi', real_sd)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 1431a7c6..e096c5f4 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -90,7 +90,7 @@ class Laplace(likelihood):
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
         print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
-        dL_dthetaK = dL_dthetaK_exp +dL_dthetaK_imp
+        dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
         return dL_dthetaK
 
     def _gradients(self, partial):
@@ -126,7 +126,6 @@ class Laplace(likelihood):
         due to the z rescaling.
 
         at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1)
-
         This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1)
         giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f)
 
@@ -143,17 +142,18 @@ class Laplace(likelihood):
         #dtritri -> L -> L_i
         #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
         #((L.T*w)_i + I)f_hat = y_tilde
-        L = jitchol(self.K)
-        Li = chol_inv(L)
-        Lt_W = L.T*self.W.T
+        #L = jitchol(self.K)
+        #Li = chol_inv(L)
+        #Lt_W = L.T*self.W.T
 
-        Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
-        self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
+        #Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
+        #self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
+        #Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
 
-        Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
-        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        Wi = 1.0/self.W
+        self.Sigma_tilde = np.diagflat(Wi)
 
-        self.Sigma_tilde = np.diagflat(1.0/self.W)
+        Y_tilde = Wi*(self.Ki_f + self.W*self.f_hat)
 
         self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
         ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
@@ -281,7 +281,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=250, MAX_RESTART=10):
+    def rasm_mode(self, K, MAX_ITER=40, MAX_RESTART=10):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -297,6 +297,7 @@ class Laplace(likelihood):
             old_a = self.old_a
 
         f = np.dot(self.K, old_a)
+        self.f = f
         new_obj = -np.inf
         old_obj = np.inf
 
@@ -304,7 +305,7 @@ class Laplace(likelihood):
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-9
+        epsilon = 1e-6
         step_size = 1
         rs = 0
         i = 0
@@ -316,6 +317,8 @@ class Laplace(likelihood):
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)
+            #if i > 30:
+                #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
             W_f = W*f
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
@@ -326,37 +329,52 @@ class Laplace(likelihood):
             full_step_a = b - W_12*solve_L
             da = full_step_a - old_a
 
-            f_old = f
-            update_passed = False
-            while not update_passed:
+            f_old = self.f.copy()
+
+            def inner_obj(step_size, old_a, da, K):
                 a = old_a + step_size*da
                 f = np.dot(K, a)
+                self.a = a
+                self.f = f
+                return -obj(a, f)
 
-                old_obj = new_obj
-                new_obj = np.float(obj(a, f))
-                difference = new_obj - old_obj
+            from functools import partial
+            i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K)
+            old_obj = new_obj
+            new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
+
+            #update_passed = False
+            #while not update_passed:
+                #a = old_a + step_size*da
+                #f = np.dot(K, a)
+
+                #old_obj = new_obj
+                #new_obj = obj(a, f)
+                #difference = new_obj - old_obj
                 #print "difference: ",difference
-                if difference < -epsilon:
-                    #print grad
+                #if difference < 0:
+                    ##print grad
                     #print "Objective function rose", np.float(difference)
-                    #If the objective function isn't rising, restart optimization
-                    step_size *= 0.4
+                    ##If the objective function isn't rising, restart optimization
+                    #step_size *= 0.8
                     #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
-                    #objective function isn't increasing, try reducing step size
-                    #f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
-                    #old_obj = tmp_old_obj
-                    old_obj = new_obj
-                    rs += 1
-                else:
-                    update_passed = True
+                    ##objective function isn't increasing, try reducing step size
+                    ##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
+                    ##old_obj = tmp_old_obj
+                    #old_obj = new_obj
+                    #rs += 1
+                #else:
+                    #update_passed = True
 
+            f = self.f
+            difference = new_obj - old_obj
             difference = np.abs(np.sum(f - f_old)) + abs(difference)
-            old_a = a
+            old_a = self.a #a
             i += 1
 
         #print "Positive difference obj: ", np.float(difference)
         print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
-        self.a = a
+        #self.a = a
         self.B, self.B_chol, self.W_12 = B, L, W_12
         self.Bi, _, _, B_det = pdinv(self.B)
         return f
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 7b6fab27..1d57ed38 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -142,19 +142,18 @@ class GP(model):
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
-        print "dL_dthetaK should be: ", dL_dthetaK
+        #print "dL_dthetaK should be: ", dL_dthetaK
         if isinstance(self.likelihood, Laplace):
-            self.likelihood.fit_full(self.kern.K(self.X))
-            self.likelihood._set_params(self.likelihood._get_params())
+            #self.likelihood.fit_full(self.kern.K(self.X))
+            #self.likelihood._set_params(self.likelihood._get_params())
             dK_dthetaK = self.kern.dK_dtheta
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            #print "dL_dthetaK after: ",dL_dthetaK
-            #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-            #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+        #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
         print "dL_dthetaK is: ", dL_dthetaK
+        print "dL_dthetaL is: ", dL_dthetaL
 
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))

From 26b3855af56ee220cfa00928f6f936bd1161acdf Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 1 Jul 2013 10:06:20 +0100
Subject: [PATCH 058/252] Everything seems to be gradchecking again

---
 GPy/examples/laplace_approximations.py  |  7 ++++++-
 GPy/likelihoods/Laplace.py              | 18 +++++++++---------
 GPy/likelihoods/likelihood_functions.py |  2 +-
 GPy/models/GP.py                        |  3 +--
 4 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index abb5f4ce..24f2d88c 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -91,6 +91,8 @@ def debug_student_t_noise_approx():
     X = np.linspace(0.0, 10.0, 50)[:, None]
     #X = np.array([0.5, 1])[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+    #ty = np.array([1., 9.97733584, 4.17841363])[:, None]
+    #Y = ty
 
     X_full = X
     Y_full = np.sin(X_full)
@@ -98,7 +100,7 @@ def debug_student_t_noise_approx():
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 100
+    deg_free = 10000
 
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
@@ -151,6 +153,9 @@ def debug_student_t_noise_approx():
     #m.constrain_positive('')
     m.ensure_default_constraints()
     #m.constrain_fixed('t_noi', real_sd)
+    #m['rbf_var'] = 0.20446332
+    #m['rbf_leng'] = 0.85776241
+    #m['t_noise'] = 0.667083294421005
     m.update_likelihood_approximation()
     #m.optimize(messages=True)
     print(m)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index e096c5f4..e4652f27 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -153,7 +153,7 @@ class Laplace(likelihood):
         Wi = 1.0/self.W
         self.Sigma_tilde = np.diagflat(Wi)
 
-        Y_tilde = Wi*(self.Ki_f + self.W*self.f_hat)
+        Y_tilde = Wi*self.Ki_f + self.f_hat
 
         self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
         ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
@@ -199,7 +199,7 @@ class Laplace(likelihood):
         self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-5  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            self.W[self.W < 0] = 1e-8  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                        #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                        #To cause the posterior to become less certain than the prior and likelihood,
                                        #This is a property only held by non-log-concave likelihoods
@@ -312,7 +312,7 @@ class Laplace(likelihood):
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 0#1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
@@ -329,8 +329,9 @@ class Laplace(likelihood):
             full_step_a = b - W_12*solve_L
             da = full_step_a - old_a
 
-            f_old = self.f.copy()
+            f_old = f.copy()
 
+            f_old = self.f.copy()
             def inner_obj(step_size, old_a, da, K):
                 a = old_a + step_size*da
                 f = np.dot(K, a)
@@ -340,7 +341,6 @@ class Laplace(likelihood):
 
             from functools import partial
             i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K)
-            old_obj = new_obj
             new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
 
             #update_passed = False
@@ -354,10 +354,10 @@ class Laplace(likelihood):
                 #print "difference: ",difference
                 #if difference < 0:
                     ##print grad
-                    #print "Objective function rose", np.float(difference)
+                    ##print "Objective function rose", np.float(difference)
                     ##If the objective function isn't rising, restart optimization
                     #step_size *= 0.8
-                    #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
                     ##objective function isn't increasing, try reducing step size
                     ##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
                     ##old_obj = tmp_old_obj
@@ -368,12 +368,12 @@ class Laplace(likelihood):
 
             f = self.f
             difference = new_obj - old_obj
-            difference = np.abs(np.sum(f - f_old)) + abs(difference)
+            difference = np.abs(np.sum(f - f_old)) #+ abs(difference)
             old_a = self.a #a
             i += 1
 
         #print "Positive difference obj: ", np.float(difference)
-        print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
+        #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
         #self.a = a
         self.B, self.B_chol, self.W_12 = B, L, W_12
         self.Bi, _, _, B_det = pdinv(self.B)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 4d298122..ebc87f56 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -274,7 +274,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e)
+        dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
                             / ((self.v*(self.sigma**2) + e**2)**2)
                            )
         return dlik_grad_dsigma
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 1d57ed38..20337ef5 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -152,8 +152,7 @@ class GP(model):
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
         #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-        print "dL_dthetaK is: ", dL_dthetaK
-        print "dL_dthetaL is: ", dL_dthetaL
+        print "dL_dthetaK: {}   dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL)
 
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))

From a7169ab1ab771e567e45d6a11ae9e13b13f3c754 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 1 Jul 2013 15:21:47 +0100
Subject: [PATCH 059/252] Fixed bug where B wasn't refering to current f
 location

---
 GPy/core/model.py                       |  3 +++
 GPy/examples/laplace_approximations.py  |  5 +++--
 GPy/likelihoods/Laplace.py              | 21 ++++++++++-----------
 GPy/likelihoods/likelihood_functions.py |  6 +++++-
 4 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 94202396..83a4a428 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -244,6 +244,9 @@ class model(parameterised):
         LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
         prior_gradients = self._transform_gradients(self._log_prior_gradients())
         obj_grads = -LL_gradients - prior_gradients
+        print self
+        #self.checkgrad(verbose=1)
+        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         return obj_f, obj_grads
 
     def optimize(self, optimizer=None, start=None, **kwargs):
diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 24f2d88c..bb621424 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -100,7 +100,7 @@ def debug_student_t_noise_approx():
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 10000
+    deg_free = 1000
 
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
@@ -152,7 +152,7 @@ def debug_student_t_noise_approx():
     m.constrain_positive('t_noise_std')
     #m.constrain_positive('')
     m.ensure_default_constraints()
-    #m.constrain_fixed('t_noi', real_sd)
+    m.constrain_bounded('t_noi', 0.001, 10)
     #m['rbf_var'] = 0.20446332
     #m['rbf_leng'] = 0.85776241
     #m['t_noise'] = 0.667083294421005
@@ -168,6 +168,7 @@ def debug_student_t_noise_approx():
         plt.plot(X_full, Y_full)
         plt.ylim(-2.5, 2.5)
     print "Real noise std: ", real_sd
+    print "or Real noise std: ", real_stu_t_std
     return m
 
     #print "Clean student t, ncg"
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index e4652f27..4c9c67df 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -158,7 +158,6 @@ class Laplace(likelihood):
         self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
         ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
         l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
-        #print "fDf:{}   l:{}   detKWiBi:{}   W:{}   Wi:{}   Bi:{}   Ki:{}".format(fDf, l, ln_det_K_Wi__Bi, W.sum(), Wi.sum(), self.Bi.sum(), Ki.sum())
 
         y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
         Z_tilde = (+ self.NORMAL_CONST
@@ -199,14 +198,14 @@ class Laplace(likelihood):
         self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-8  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                        #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                        #To cause the posterior to become less certain than the prior and likelihood,
                                        #This is a property only held by non-log-concave likelihoods
 
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
-        #self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
-        #self.Bi, _, _, B_det = pdinv(self.B)
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
+        self.Bi, _, _, B_det = pdinv(self.B)
 
         #Do the computation again at f to get Ki_f which is useful
         #b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
@@ -305,14 +304,14 @@ class Laplace(likelihood):
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-6
+        epsilon = 1e-10
         step_size = 1
         rs = 0
         i = 0
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 0#1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
@@ -335,13 +334,13 @@ class Laplace(likelihood):
             def inner_obj(step_size, old_a, da, K):
                 a = old_a + step_size*da
                 f = np.dot(K, a)
-                self.a = a
+                self.a = a # This is nasty, need to set something within an optimization though
                 self.f = f
                 return -obj(a, f)
 
             from functools import partial
             i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K)
-            new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
+            new_obj = sp.optimize.brent(i_o, tol=1e-6, maxiter=10)
 
             #update_passed = False
             #while not update_passed:
@@ -373,8 +372,8 @@ class Laplace(likelihood):
             i += 1
 
         #print "Positive difference obj: ", np.float(difference)
-        #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
+        print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
         #self.a = a
-        self.B, self.B_chol, self.W_12 = B, L, W_12
-        self.Bi, _, _, B_det = pdinv(self.B)
+        #self.B, self.B_chol, self.W_12 = B, L, W_12
+        #self.Bi, _, _, B_det = pdinv(self.B)
         return f
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index ebc87f56..57627198 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -195,8 +195,9 @@ class student_t(likelihood_function):
         e = y - f
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
-                     - np.log(self.sigma * np.sqrt(self.v * np.pi))
+                     - 0.5*np.log((self.sigma**2) * self.v * np.pi)
                      - (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v))
+                     #- (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2)))
                     )
         return np.sum(objective)
 
@@ -264,6 +265,7 @@ class student_t(likelihood_function):
         dlik_dsigma = ( - (1/self.sigma) +
                         ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) )
                       )
+        #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
@@ -290,6 +292,8 @@ class student_t(likelihood_function):
         dlik_hess_dsigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
                               ((e**2 + (self.sigma**2)*self.v)**3)
                            )
+        #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
+                             #/ ((e**2 + (self.sigma**2)*self.v)**3) )
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):

From ab6a3a571e4ef0aec66776f56921326166f09d40 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 2 Jul 2013 11:14:48 +0100
Subject: [PATCH 060/252] Playing trying to find what makes it want to go so
 low

---
 GPy/core/model.py                       |  2 +-
 GPy/examples/laplace_approximations.py  | 21 ++++++++++++++-------
 GPy/likelihoods/Laplace.py              | 18 +++++++++---------
 GPy/likelihoods/likelihood_functions.py |  4 ++--
 4 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 83a4a428..f97938a4 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -246,7 +246,7 @@ class model(parameterised):
         obj_grads = -LL_gradients - prior_gradients
         print self
         #self.checkgrad(verbose=1)
-        import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         return obj_f, obj_grads
 
     def optimize(self, optimizer=None, start=None, **kwargs):
diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index bb621424..14400a08 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -88,9 +88,12 @@ def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 50)[:, None]
+    #X = np.linspace(0.0, 10.0, 50)[:, None]
+    X = np.random.rand(100)[:, None]
+    #X = np.random.rand(100)[:, None]
     #X = np.array([0.5, 1])[:, None]
-    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var
+    #Y = X + np.random.randn(*X.shape)*real_var
     #ty = np.array([1., 9.97733584, 4.17841363])[:, None]
     #Y = ty
 
@@ -112,7 +115,8 @@ def debug_student_t_noise_approx():
 
     plt.close('all')
     # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1])# + GPy.kern.white(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    #kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
@@ -136,7 +140,7 @@ def debug_student_t_noise_approx():
     #print m
 
     real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
-    edited_real_sd = real_stu_t_std#initial_var_guess #real_sd
+    edited_real_sd = real_stu_t_std + 1#initial_var_guess #real_sd
     #edited_real_sd = real_sd
 
     print "Clean student t, rasm"
@@ -149,13 +153,16 @@ def debug_student_t_noise_approx():
     #m.constrain_fixed('rbf_l', 1.8651)
     #m.constrain_fixed('t_noise_std', edited_real_sd)
     #m.constrain_positive('rbf')
-    m.constrain_positive('t_noise_std')
+    #m.constrain_positive('t_noise_std')
     #m.constrain_positive('')
-    m.ensure_default_constraints()
-    m.constrain_bounded('t_noi', 0.001, 10)
+    #m.constrain_bounded('t_noi', 0.001, 10)
+    #m.constrain_fixed('t_noi', real_stu_t_std)
+    m.constrain_fixed('white', 0.01)
+    #m.constrain_fixed('t_no', 0.01)
     #m['rbf_var'] = 0.20446332
     #m['rbf_leng'] = 0.85776241
     #m['t_noise'] = 0.667083294421005
+    m.ensure_default_constraints()
     m.update_likelihood_approximation()
     #m.optimize(messages=True)
     print(m)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 4c9c67df..2ae68613 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -156,15 +156,15 @@ class Laplace(likelihood):
         Y_tilde = Wi*self.Ki_f + self.f_hat
 
         self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
-        ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
-        l = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+        self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
+        self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
 
-        y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-        Z_tilde = (+ self.NORMAL_CONST
-                   + l
-                   + 0.5*ln_det_K_Wi__Bi
+        self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+        Z_tilde = (#+ self.NORMAL_CONST
+                   + self.lik
+                   + 0.5*self.ln_det_K_Wi__Bi
                    - 0.5*self.f_Ki_f
-                   + 0.5*y_Wi_Ki_i_y
+                   + 0.5*self.y_Wi_Ki_i_y
                   )
         #print "Ztilde: {}".format(Z_tilde)
 
@@ -198,7 +198,7 @@ class Laplace(likelihood):
         self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            self.W[self.W < 0] = 1e-10  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                        #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                        #To cause the posterior to become less certain than the prior and likelihood,
                                        #This is a property only held by non-log-concave likelihoods
@@ -311,7 +311,7 @@ class Laplace(likelihood):
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-10     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 57627198..fd64dbe6 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -196,8 +196,8 @@ class student_t(likelihood_function):
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - 0.5*np.log((self.sigma**2) * self.v * np.pi)
-                     - (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v))
-                     #- (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2)))
+                     #- (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v))
+                     - (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2)))
                     )
         return np.sum(objective)
 

From 4e5cefb4b5cb14a3c4f94dbd4d18eac8c70a84fd Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 8 Jul 2013 15:48:53 +0100
Subject: [PATCH 061/252] Reparameratised in terms of sigma2

---
 GPy/core/model.py                       |   3 -
 GPy/examples/laplace_approximations.py  |  34 ++--
 GPy/likelihoods/Laplace.py              |  12 +-
 GPy/likelihoods/likelihood_functions.py | 207 +++++++++++++++++++++---
 4 files changed, 207 insertions(+), 49 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index f97938a4..94202396 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -244,9 +244,6 @@ class model(parameterised):
         LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
         prior_gradients = self._transform_gradients(self._log_prior_gradients())
         obj_grads = -LL_gradients - prior_gradients
-        print self
-        #self.checkgrad(verbose=1)
-        #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         return obj_f, obj_grads
 
     def optimize(self, optimizer=None, start=None, **kwargs):
diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 14400a08..d6b48ebf 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -24,7 +24,7 @@ def timing():
         edited_real_sd = real_sd
         kernel1 = GPy.kern.rbf(X.shape[1])
 
-        t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+        t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
         corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
         m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
         m.ensure_default_constraints()
@@ -53,7 +53,7 @@ def v_fail_test():
     edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel1)
     m.constrain_positive('')
@@ -92,18 +92,18 @@ def debug_student_t_noise_approx():
     X = np.random.rand(100)[:, None]
     #X = np.random.rand(100)[:, None]
     #X = np.array([0.5, 1])[:, None]
-    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var
+    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + 1
     #Y = X + np.random.randn(*X.shape)*real_var
     #ty = np.array([1., 9.97733584, 4.17841363])[:, None]
     #Y = ty
 
     X_full = X
-    Y_full = np.sin(X_full)
+    Y_full = np.sin(X_full) + 1
 
     Y = Y/Y.max()
 
     #Add student t random noise to datapoints
-    deg_free = 1000
+    deg_free = 100
 
     real_sd = np.sqrt(real_var)
     print "Real noise std: ", real_sd
@@ -115,7 +115,7 @@ def debug_student_t_noise_approx():
 
     plt.close('all')
     # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
     #kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
@@ -140,24 +140,24 @@ def debug_student_t_noise_approx():
     #print m
 
     real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
-    edited_real_sd = real_stu_t_std + 1#initial_var_guess #real_sd
+    edited_real_sd = real_stu_t_std**2 #initial_var_guess #real_sd
     #edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
 
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     #m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
-    #m.constrain_fixed('rbf_l', 1.8651)
+    #m.constrain_fixed('rbf_l', 0.2651)
     #m.constrain_fixed('t_noise_std', edited_real_sd)
     #m.constrain_positive('rbf')
-    #m.constrain_positive('t_noise_std')
+    m.constrain_positive('t_noise_std')
     #m.constrain_positive('')
     #m.constrain_bounded('t_noi', 0.001, 10)
     #m.constrain_fixed('t_noi', real_stu_t_std)
-    m.constrain_fixed('white', 0.01)
+    #m.constrain_fixed('white', 0.01)
     #m.constrain_fixed('t_no', 0.01)
     #m['rbf_var'] = 0.20446332
     #m['rbf_leng'] = 0.85776241
@@ -179,7 +179,7 @@ def debug_student_t_noise_approx():
     return m
 
     #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
     #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
@@ -276,7 +276,7 @@ def student_t_approx():
     edited_real_sd = real_sd #initial_var_guess
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     m.ensure_default_constraints()
@@ -291,7 +291,7 @@ def student_t_approx():
     plt.title('Student-t rasm clean')
 
     print "Corrupt student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
@@ -308,7 +308,7 @@ def student_t_approx():
     return m
 
     #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
     #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
@@ -322,7 +322,7 @@ def student_t_approx():
     #plt.title('Student-t ncg clean')
 
     #print "Corrupt student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma=edited_real_sd)
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
     #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
     #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
     #m.ensure_default_constraints()
@@ -337,7 +337,7 @@ def student_t_approx():
 
 
     ###with a student t distribution, since it has heavy tails it should work well
-    ###likelihood_function = student_t(deg_free, sigma=real_var)
+    ###likelihood_function = student_t(deg_free, sigma2=real_var)
     ###lap = Laplace(Y, likelihood_function)
     ###cov = kernel.K(X)
     ###lap.fit_full(cov)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 2ae68613..984112a5 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -220,10 +220,10 @@ class Laplace(likelihood):
         self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
 
         #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
-        self.ln_z_hat = (- 0.5*self.f_Ki_f
-                         - self.ln_I_KW_det
-                         + self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
-                         )
+        #self.ln_z_hat = (- 0.5*self.f_Ki_f
+                         #- self.ln_I_KW_det
+                         #+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+                         #)
 
         return self._compute_GP_variables()
 
@@ -308,6 +308,8 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
+        #if self.likelihood_function.sigma < 0.001:
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
@@ -316,8 +318,6 @@ class Laplace(likelihood):
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
             B, L, W_12 = self._compute_B_statistics(K, W)
-            #if i > 30:
-                #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
             W_f = W*f
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index fd64dbe6..bfc759d7 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -158,26 +158,26 @@ class student_t(likelihood_function):
     dln p(yi|fi)_dfi
     d2ln p(yi|fi)_d2fifj
     """
-    def __init__(self, deg_free, sigma=2):
+    def __init__(self, deg_free, sigma2=2):
         #super(student_t, self).__init__()
         self.v = deg_free
-        self.sigma = sigma
+        self.sigma2 = sigma2
         self.log_concave = False
 
-        self._set_params(np.asarray(sigma))
+        self._set_params(np.asarray(sigma2))
 
     def _get_params(self):
-        return np.asarray(self.sigma)
+        return np.asarray(self.sigma2)
 
     def _get_param_names(self):
-        return ["t_noise_std"]
+        return ["t_noise_std2"]
 
     def _set_params(self, x):
-        self.sigma = float(x)
+        self.sigma2 = float(x)
 
     @property
     def variance(self, extra_data=None):
-        return (self.v / float(self.v - 2)) * (self.sigma**2)
+        return (self.v / float(self.v - 2)) * self.sigma2
 
     def link_function(self, y, f, extra_data=None):
         """link_function $\ln p(y|f)$
@@ -193,12 +193,16 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
+        A = gammaln((self.v + 1) * 0.5)
+        B = -gammaln(self.v * 0.5)
+        C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
+        D = (-(self.v + 1)*0.5)*np.log(1 + (e**2)/(self.v*self.sigma2))
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
-                     - 0.5*np.log((self.sigma**2) * self.v * np.pi)
-                     #- (self.v + 1) * 0.5 * np.log(1 + (((e / self.sigma)**2) / self.v))
-                     - (self.v + 1) * 0.5 * np.log(1 + (e**2)/(self.v*(self.sigma**2)))
+                     - 0.5*np.log(self.sigma2 * self.v * np.pi)
+                     + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
                     )
+        #print "A: {} B: {} C: {} D: {} obj: {}".format(A,B,C,D.sum(),objective.sum())
         return np.sum(objective)
 
     def dlik_df(self, y, f, extra_data=None):
@@ -215,7 +219,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        grad = ((self.v + 1) * e) / (self.v * (self.sigma**2) + (e**2))
+        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
         return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
@@ -235,7 +239,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        hess = ((self.v + 1)*(e**2 - self.v*(self.sigma**2))) / ((((self.sigma**2)*self.v) + e**2)**2)
+        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -246,8 +250,8 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*(self.sigma**2))) /
-                       ((e**2 + (self.sigma**2)*self.v)**3)
+        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       ((e**2 + self.sigma2*self.v)**3)
                     )
         return d3lik_d3f
 
@@ -262,10 +266,16 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_dsigma = ( - (1/self.sigma) +
-                        ((1+self.v)*(e**2))/((self.sigma**3)*self.v*(1 + ((e**2) / ((self.sigma**2)*self.v)) ) )
-                      )
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_dsigma = ( - (1/sigma) +
+                        #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      #)
+        #dlik_dsigma = ( - 1 +
+                        #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      #)
         #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
+        #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
+        dlik_dsigma = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
@@ -276,9 +286,11 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_grad_dsigma = ((-2*self.sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
-                            / ((self.v*(self.sigma**2) + e**2)**2)
-                           )
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
+                            #/ ((self.v*self.sigma2 + e**2)**2)
+                           #)
+        dlik_grad_dsigma = (-self.v*(self.v+1)*e)/((self.sigma2*self.v + e**2)**2)
         return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
@@ -289,11 +301,15 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_hess_dsigma = (  (2*self.sigma*self.v*(self.v + 1)*((self.sigma**2)*self.v - 3*(e**2))) /
-                              ((e**2 + (self.sigma**2)*self.v)**3)
-                           )
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
+                              #((e**2 + self.sigma2*self.v)**3)
+                           #)
         #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
                              #/ ((e**2 + (self.sigma**2)*self.v)**3) )
+        dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
+                              / (self.sigma2*self.v + (e**2))**3
+                           )
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):
@@ -466,3 +482,148 @@ class weibull_survival(likelihood_function):
 
         hess = (y**self.shape)*np.exp(f)
         return np.squeeze(hess)
+
+#class gaussian(likelihood_function):
+    #"""
+    #Gaussian likelihood - this is a test class for approximation schemes
+    #"""
+    #def __init__(self, variance):
+        #self._set_params(np.asarray(variance))
+
+    #def _get_params(self):
+        #return np.asarray(self.sigma2)
+
+    #def _get_param_names(self):
+        #return ["noise_variance"]
+
+    #def _set_params(self, x):
+        #self.variance = float(x)
+
+    #def link_function(self, y, f, extra_data=None):
+        #"""link_function $\ln p(y|f)$
+        #$$\ln p(y_{i}|f_{i}) = \ln $$
+
+        #:y: data
+        #:f: latent variables f
+        #:extra_data: extra_data which is not used in student t distribution
+        #:returns: float(likelihood evaluated for this point)
+
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #objective = -0.5*self.D*
+        #return np.sum(objective)
+
+    #def dlik_df(self, y, f, extra_data=None):
+        #"""
+        #Gradient of the link function at y, given f w.r.t f
+
+        #$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
+
+        #:y: data
+        #:f: latent variables f
+        #:extra_data: extra_data which is not used in student t distribution
+        #:returns: gradient of likelihood evaluated at points
+
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
+        #return grad
+
+    #def d2lik_d2f(self, y, f, extra_data=None):
+        #"""
+        #Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        #i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+
+        #Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+        #(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+
+        #$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
+
+        #:y: data
+        #:f: latent variables f
+        #:extra_data: extra_data which is not used in student t distribution
+        #:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
+        #return hess
+
+    #def d3lik_d3f(self, y, f, extra_data=None):
+        #"""
+        #Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+
+        #$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       #((e**2 + self.sigma2*self.v)**3)
+                    #)
+        #return d3lik_d3f
+
+    #def lik_dstd(self, y, f, extra_data=None):
+        #"""
+        #Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
+
+        #Terms relavent to derivatives wrt sigma are:
+        #-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+
+        #$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #sigma = np.sqrt(self.sigma2)
+        ##dlik_dsigma = ( - (1/sigma) +
+                        ##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      ##)
+        ##dlik_dsigma = ( - 1 +
+                        ##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      ##)
+        ##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
+        #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
+        #return dlik_dsigma
+
+    #def dlik_df_dstd(self, y, f, extra_data=None):
+        #"""
+        #Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
+
+        #$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
+                            #/ ((self.v*self.sigma2 + e**2)**2)
+                           #)
+        #return dlik_grad_dsigma
+
+    #def d2lik_d2f_dstd(self, y, f, extra_data=None):
+        #"""
+        #Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
+
+        #$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
+        #"""
+        #assert y.shape == f.shape
+        #e = y - f
+        #sigma = np.sqrt(self.sigma2)
+        #dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
+                              #((e**2 + self.sigma2*self.v)**3)
+                           #)
+        ##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
+                             ##/ ((e**2 + (self.sigma**2)*self.v)**3) )
+        #return dlik_hess_dsigma
+
+    #def _gradients(self, y, f, extra_data=None):
+        ##must be listed in same order as 'get_param_names'
+        #derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
+                  #[self.dlik_df_dstd(y, f, extra_data=extra_data)],
+                  #[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+                 #) # lists as we might learn many parameters
+        ## ensure we have gradients for every parameter we want to optimize
+        #assert len(derivs[0]) == len(self._get_param_names())
+        #assert len(derivs[1]) == len(self._get_param_names())
+        #assert len(derivs[2]) == len(self._get_param_names())
+        #return derivs

From 2a366619b340d25d5dd53836e2e66ffcfb2257d7 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 8 Jul 2013 16:09:20 +0100
Subject: [PATCH 062/252] Changed incorrect naming

---
 GPy/examples/laplace_approximations.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index d6b48ebf..78b4e986 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -84,6 +84,26 @@ def v_fail_test():
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print(m)
 
+def student_t_f_check():
+    real_var = 0.1
+    X = np.random.rand(100)[:, None]
+    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var
+    X_full = X
+    Y_full = np.sin(X_full)
+    Y = Y/Y.max()
+    deg_free = 1000
+    real_sd = np.sqrt(real_var)
+
+    kernel = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+    real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
+
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std**2)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    m = GPy.models.GP(X, stu_t_likelihood, kernel)
+    m.constrain_positive('t_noise_std2')
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+
 def debug_student_t_noise_approx():
     plot = False
     real_var = 0.1
@@ -151,9 +171,9 @@ def debug_student_t_noise_approx():
     #m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 0.2651)
-    #m.constrain_fixed('t_noise_std', edited_real_sd)
+    #m.constrain_fixed('t_noise_std2', edited_real_sd)
     #m.constrain_positive('rbf')
-    m.constrain_positive('t_noise_std')
+    m.constrain_positive('t_noise_std2')
     #m.constrain_positive('')
     #m.constrain_bounded('t_noi', 0.001, 10)
     #m.constrain_fixed('t_noi', real_stu_t_std)

From ee980227ac34262b192565cafb5e195cefee46d0 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 9 Jul 2013 11:35:42 +0100
Subject: [PATCH 063/252] Fixed 2*variance plotting instead of 2*std plotting,
 tidied up

---
 GPy/examples/laplace_approximations.py  | 93 ++++++++++++++++++++-----
 GPy/likelihoods/Laplace.py              |  2 +-
 GPy/likelihoods/likelihood_functions.py | 28 +-------
 GPy/models/GP.py                        |  2 +-
 4 files changed, 78 insertions(+), 47 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 78b4e986..b3048f5a 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -85,24 +85,78 @@ def v_fail_test():
     print(m)
 
 def student_t_f_check():
-    real_var = 0.1
+    plt.close('all')
+    real_std = 0.1
     X = np.random.rand(100)[:, None]
-    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
     X_full = X
     Y_full = np.sin(X_full)
-    Y = Y/Y.max()
-    deg_free = 1000
-    real_sd = np.sqrt(real_var)
+    #Y = Y/Y.max()
+    deg_free = 10000
 
-    kernel = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
-    real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
+    #GP
+    kernelgp = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp.randomize()
+    mgp.optimize()
 
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std**2)
+    kernelst = kernelgp.copy()
+    real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
+
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std2)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GP(X, stu_t_likelihood, kernel)
-    m.constrain_positive('t_noise_std2')
-    m.ensure_default_constraints()
+
+    plt.figure(1)
+    plt.suptitle('Student likelihood')
+    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m.constrain_fixed('rbf_var', mgp._get_params()[0])
+    m.constrain_fixed('rbf_len', mgp._get_params()[1])
+
     m.update_likelihood_approximation()
+    print "T std2 {} converted from original data, LL: {}".format(real_stu_t_std2, m.log_likelihood())
+    plt.subplot(221)
+    m.plot()
+    plt.title('Student t original data noise')
+
+    #Fix student t noise variance to same a GP
+    gp_noise = mgp._get_params()[2]
+    m['t_noise_std2'] = gp_noise
+    m.update_likelihood_approximation()
+    print "T std2 {} same as GP noise, LL: {}".format(gp_noise, m.log_likelihood())
+    plt.subplot(222)
+    m.plot()
+    plt.title('Student t GP noise')
+
+    #Fix student t noise to variance converted from the GP
+    real_stu_t_std2gp = (gp_noise)*((deg_free - 2)/float(deg_free))
+    m['t_noise_std2'] = real_stu_t_std2gp
+    m.update_likelihood_approximation()
+    print "T std2 {} converted to student t noise from GP noise, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.log_likelihood())
+    plt.subplot(223)
+    m.plot()
+    plt.title('Student t GP noise converted')
+
+    m.constrain_positive('t_noise_std2')
+    m.randomize()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print "T std2 {} var {} after optimising, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.likelihood.likelihood_function.variance, m.log_likelihood())
+    plt.subplot(224)
+    m.plot()
+    plt.title('Student t optimised')
+
+    plt.figure(2)
+    print "GP noise {} after optimising, LL: {}".format(gp_noise, mgp.log_likelihood())
+    plt.suptitle('Gaussian likelihood optimised')
+    mgp.plot()
+    print "Real std: {}".format(real_std)
+    print "Real variance {}".format(real_std**2)
+
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+    return m
 
 def debug_student_t_noise_approx():
     plot = False
@@ -218,16 +272,16 @@ def student_t_approx():
     """
     Example of regressing with a student t likelihood
     """
-    real_var = 0.2
+    real_std = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 30)[:, None]
-    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+    X = np.linspace(0.0, 10.0, 50)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_std
     Yc = Y.copy()
 
     X_full = np.linspace(0.0, 10.0, 500)[:, None]
     Y_full = np.sin(X_full)
 
-    #Y = Y/Y.max()
+    Y = Y/Y.max()
 
     Yc[10] += 100
     Yc[25] += 10
@@ -238,10 +292,9 @@ def student_t_approx():
 
     #Add student t random noise to datapoints
     deg_free = 8
-    real_sd = np.sqrt(real_var)
-    print "Real noise: ", real_sd
+    print "Real noise: ", real_std
 
-    initial_var_guess = 0.01
+    initial_var_guess = 0.1
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
@@ -293,7 +346,7 @@ def student_t_approx():
 
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
-    edited_real_sd = real_sd #initial_var_guess
+    edited_real_sd = real_std #initial_var_guess
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
@@ -301,6 +354,7 @@ def student_t_approx():
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
+    m.randomize()
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
@@ -316,6 +370,7 @@ def student_t_approx():
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
+    m.randomize()
     m.update_likelihood_approximation()
     m.optimize()
     print(m)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 984112a5..c5894ed6 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -89,7 +89,7 @@ class Laplace(likelihood):
         expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
-        print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
         dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
         return dL_dthetaK
 
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index bfc759d7..595fa63c 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -193,16 +193,11 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        A = gammaln((self.v + 1) * 0.5)
-        B = -gammaln(self.v * 0.5)
-        C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
-        D = (-(self.v + 1)*0.5)*np.log(1 + (e**2)/(self.v*self.sigma2))
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - 0.5*np.log(self.sigma2 * self.v * np.pi)
                      + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
                     )
-        #print "A: {} B: {} C: {} D: {} obj: {}".format(A,B,C,D.sum(),objective.sum())
         return np.sum(objective)
 
     def dlik_df(self, y, f, extra_data=None):
@@ -266,15 +261,6 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        #dlik_dsigma = ( - (1/sigma) +
-                        #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      #)
-        #dlik_dsigma = ( - 1 +
-                        #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      #)
-        #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
-        #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
         dlik_dsigma = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
         return dlik_dsigma
 
@@ -286,10 +272,6 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
-                            #/ ((self.v*self.sigma2 + e**2)**2)
-                           #)
         dlik_grad_dsigma = (-self.v*(self.v+1)*e)/((self.sigma2*self.v + e**2)**2)
         return dlik_grad_dsigma
 
@@ -301,12 +283,6 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        #dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
-                              #((e**2 + self.sigma2*self.v)**3)
-                           #)
-        #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
-                             #/ ((e**2 + (self.sigma**2)*self.v)**3) )
         dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
                               / (self.sigma2*self.v + (e**2))**3
                            )
@@ -344,8 +320,8 @@ class student_t(likelihood_function):
         #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now
         #need the 95 and 5 percentiles.
         #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
-        p_025 = mu - 2.*true_var
-        p_975 = mu + 2.*true_var
+        p_025 = mu - 2.*np.sqrt(true_var)
+        p_975 = mu + 2.*np.sqrt(true_var)
 
         return mu, np.nan*mu, p_025, p_975
 
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 20337ef5..cd4b7dac 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -152,7 +152,7 @@ class GP(model):
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
         #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-        print "dL_dthetaK: {}   dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL)
+        #print "dL_dthetaK: {}   dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL)
 
         return np.hstack((dL_dthetaK, dL_dthetaL))
         #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))

From 57001851c46f34d075aa605ac1aa0ac0eb302c57 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 9 Jul 2013 20:05:03 +0100
Subject: [PATCH 064/252] Trying to debug kernel parameters learning (fails
 even when noise fixed) may be some instablility, seems like it can get it if
 it starts close

---
 GPy/examples/laplace_approximations.py | 103 ++++++++++++++++++++++---
 GPy/likelihoods/Laplace.py             |  18 +++--
 GPy/models/GP.py                       |  12 ++-
 3 files changed, 110 insertions(+), 23 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index b3048f5a..279bc597 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -1,6 +1,7 @@
 import GPy
 import numpy as np
 import matplotlib.pyplot as plt
+np.random.seed(1)
 
 def timing():
     real_var = 0.1
@@ -86,17 +87,67 @@ def v_fail_test():
 
 def student_t_f_check():
     plt.close('all')
-    real_std = 0.1
-    X = np.random.rand(100)[:, None]
+    X = np.linspace(0, 1, 50)[:, None]
+    real_std = 0.001
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
+    deg_free = 1000
+
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp.randomize()
+    mgp.optimize()
+    print mgp
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+    kernelst = kernelgp.copy()
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=1e-5)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m['rbf_v'] = mgp._get_params()[0]
+    m['rbf_l'] = mgp._get_params()[1] + 1
+    m.ensure_default_constraints()
+    m.constrain_positive('t_no')
+    print m
+    plt.figure()
+    plt.subplot(511)
+    m.plot()
+    print m
+    plt.subplot(512)
+    m.optimize(max_f_eval=15)
+    m.plot()
+    print m
+    plt.subplot(513)
+    m.optimize(max_f_eval=15)
+    m.plot()
+    print m
+    plt.subplot(514)
+    m.optimize(max_f_eval=15)
+    m.plot()
+    print m
+    plt.subplot(515)
+    m.optimize()
+    m.plot()
+    print "final optimised student t"
+    print m
+    print "real GP"
+    print mgp
+
+def student_t_fix_optimise_check():
+    plt.close('all')
+    real_var = 0.1
+    real_std = np.sqrt(real_var)
+    X = np.random.rand(200)[:, None]
     noise = np.random.randn(*X.shape)*real_std
     Y = np.sin(X*2*np.pi) + noise
     X_full = X
     Y_full = np.sin(X_full)
     #Y = Y/Y.max()
-    deg_free = 10000
+    deg_free = 1000
 
     #GP
-    kernelgp = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
     mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
     mgp.ensure_default_constraints()
     mgp.randomize()
@@ -113,10 +164,12 @@ def student_t_f_check():
     m = GPy.models.GP(X, stu_t_likelihood, kernelst)
     m.constrain_fixed('rbf_var', mgp._get_params()[0])
     m.constrain_fixed('rbf_len', mgp._get_params()[1])
+    m.constrain_positive('t_noise')
+    #m.ensure_default_constraints()
 
     m.update_likelihood_approximation()
     print "T std2 {} converted from original data, LL: {}".format(real_stu_t_std2, m.log_likelihood())
-    plt.subplot(221)
+    plt.subplot(231)
     m.plot()
     plt.title('Student t original data noise')
 
@@ -125,7 +178,7 @@ def student_t_f_check():
     m['t_noise_std2'] = gp_noise
     m.update_likelihood_approximation()
     print "T std2 {} same as GP noise, LL: {}".format(gp_noise, m.log_likelihood())
-    plt.subplot(222)
+    plt.subplot(232)
     m.plot()
     plt.title('Student t GP noise')
 
@@ -134,29 +187,57 @@ def student_t_f_check():
     m['t_noise_std2'] = real_stu_t_std2gp
     m.update_likelihood_approximation()
     print "T std2 {} converted to student t noise from GP noise, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.log_likelihood())
-    plt.subplot(223)
+    plt.subplot(233)
     m.plot()
     plt.title('Student t GP noise converted')
 
     m.constrain_positive('t_noise_std2')
     m.randomize()
     m.update_likelihood_approximation()
+    plt.subplot(234)
+    m.plot()
+    plt.title('Student t fixed rbf')
     m.optimize()
     print "T std2 {} var {} after optimising, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.likelihood.likelihood_function.variance, m.log_likelihood())
-    plt.subplot(224)
+    plt.subplot(235)
     m.plot()
-    plt.title('Student t optimised')
+    plt.title('Student t fixed rbf optimised')
 
     plt.figure(2)
+    mrbf = m.copy()
+    mrbf.unconstrain('')
+    mrbf.constrain_fixed('t_noise', m.likelihood.likelihood_function.sigma2)
+    gp_var = mgp._get_params()[0]
+    gp_len = mgp._get_params()[1]
+    mrbf.constrain_fixed('rbf_var', gp_var)
+    mrbf.constrain_positive('rbf_len')
+    mrbf.randomize()
+    print "Before optimize"
+    print mrbf
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    mrbf.checkgrad(verbose=1)
+    plt.subplot(121)
+    mrbf.plot()
+    plt.title('Student t fixed noise')
+    #mrbf.optimize()
+    print "After optimize"
+    print mrbf
+    plt.subplot(122)
+    mrbf.plot()
+    plt.title('Student t fixed noise optimized')
+    print mrbf
+
+    plt.figure(3)
     print "GP noise {} after optimising, LL: {}".format(gp_noise, mgp.log_likelihood())
     plt.suptitle('Gaussian likelihood optimised')
     mgp.plot()
     print "Real std: {}".format(real_std)
     print "Real variance {}".format(real_std**2)
 
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
-    return m
+    print "Len should be: {}".format(gp_len)
+    return mrbf
 
 def debug_student_t_noise_approx():
     plot = False
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index c5894ed6..5343f5dc 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -89,7 +89,7 @@ class Laplace(likelihood):
         expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
-        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
         dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
         return dL_dthetaK
 
@@ -290,10 +290,12 @@ class Laplace(likelihood):
         :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
         :returns: f_mode
         """
-        if self.old_a is None:
-            old_a = np.zeros((self.N, 1))
-        else:
-            old_a = self.old_a
+        old_a = np.zeros((self.N, 1))
+        #old_a = None
+        #if self.old_a is None:
+            #old_a = np.zeros((self.N, 1))
+        #else:
+            #old_a = self.old_a
 
         f = np.dot(self.K, old_a)
         self.f = f
@@ -308,8 +310,6 @@ class Laplace(likelihood):
         step_size = 1
         rs = 0
         i = 0
-        #if self.likelihood_function.sigma < 0.001:
-            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             if not self.likelihood_function.log_concave:
@@ -371,8 +371,10 @@ class Laplace(likelihood):
             old_a = self.a #a
             i += 1
 
+        self.old_a = old_a
         #print "Positive difference obj: ", np.float(difference)
-        print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
+        #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
+        print "Iterations: {}, Final_difference: {}".format(i, difference)
         #self.a = a
         #self.B, self.B_chol, self.W_12 = B, L, W_12
         #self.Bi, _, _, B_det = pdinv(self.B)
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index cd4b7dac..0f56e21c 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -132,7 +132,11 @@ class GP(model):
         model for a new variable Y* = v_tilde/tau_tilde, with a covariance
         matrix K* = K + diag(1./tau_tilde) plus a normalization term.
         """
+        if isinstance(self.likelihood, Laplace):
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
         l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        print "K_ldet: {} mft: {} Z: {}".format(self.K_logdet, self._model_fit_term(), self.likelihood.Z)
         return l
 
     def _log_likelihood_gradients(self):
@@ -142,12 +146,12 @@ class GP(model):
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
-        #print "dL_dthetaK should be: ", dL_dthetaK
+        print "dL_dthetaK should be: ", dL_dthetaK
         if isinstance(self.likelihood, Laplace):
-            #self.likelihood.fit_full(self.kern.K(self.X))
-            #self.likelihood._set_params(self.likelihood._get_params())
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
             dK_dthetaK = self.kern.dK_dtheta
-            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X)
+            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy())
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
         else:
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From aa9860859000530ba3297e72236c359f2a36a42b Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 29 Jul 2013 15:29:46 +0100
Subject: [PATCH 065/252] Started adding gaussian likelihood, changed round
 preloading old_a

---
 GPy/core/model.py                       |   6 +
 GPy/examples/laplace_approximations.py  |  72 ++++++-
 GPy/likelihoods/Laplace.py              | 173 ++++++++++------
 GPy/likelihoods/likelihood_functions.py | 251 +++++++++++++-----------
 4 files changed, 321 insertions(+), 181 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 94202396..e3a9bb68 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -244,6 +244,12 @@ class model(parameterised):
         LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
         prior_gradients = self._transform_gradients(self._log_prior_gradients())
         obj_grads = -LL_gradients - prior_gradients
+        print self
+        print self._get_params()
+        print -obj_grads
+        self.plot()
+        if isinstance(self.likelihood, likelihoods.Laplace):
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
         return obj_f, obj_grads
 
     def optimize(self, optimizer=None, start=None, **kwargs):
diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 279bc597..2b93122c 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -85,10 +85,60 @@ def v_fail_test():
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
     print(m)
 
+def student_t_obj_plane():
+    plt.close('all')
+    X = np.linspace(0, 1, 50)[:, None]
+    real_std = 0.002
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
+    deg_free = 1000
+
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp['noise'] = real_std**2
+    print "Gaussian"
+    print mgp
+
+    kernelst = kernelgp.copy()
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2))
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m.ensure_default_constraints()
+    m.constrain_fixed('t_no', real_std**2)
+    vs = 10
+    ls = 10
+    objs_t = np.zeros((vs, ls))
+    objs_g = np.zeros((vs, ls))
+    rbf_vs = np.linspace(1e-6, 8, vs)
+    rbf_ls = np.linspace(1e-2, 8, ls)
+    for v_id, rbf_v in enumerate(rbf_vs):
+        for l_id, rbf_l in enumerate(rbf_ls):
+            m['rbf_v'] = rbf_v
+            m['rbf_l'] = rbf_l
+            mgp['rbf_v'] = rbf_v
+            mgp['rbf_l'] = rbf_l
+            objs_t[v_id, l_id] = m.log_likelihood()
+            objs_g[v_id, l_id] = mgp.log_likelihood()
+    plt.figure()
+    plt.subplot(211)
+    plt.title('Student t')
+    plt.imshow(objs_t, interpolation='none')
+    plt.xlabel('variance')
+    plt.ylabel('lengthscale')
+    plt.subplot(212)
+    plt.title('Gaussian')
+    plt.imshow(objs_g, interpolation='none')
+    plt.xlabel('variance')
+    plt.ylabel('lengthscale')
+    plt.show()
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    return objs_t
+
 def student_t_f_check():
     plt.close('all')
     X = np.linspace(0, 1, 50)[:, None]
-    real_std = 0.001
+    real_std = 0.2
     noise = np.random.randn(*X.shape)*real_std
     Y = np.sin(X*2*np.pi) + noise
     deg_free = 1000
@@ -98,17 +148,26 @@ def student_t_f_check():
     mgp.ensure_default_constraints()
     mgp.randomize()
     mgp.optimize()
+    print "Gaussian"
     print mgp
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     kernelst = kernelgp.copy()
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=1e-5)
+    #kernelst += GPy.kern.bias(X.shape[1])
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernelst)
-    m['rbf_v'] = mgp._get_params()[0]
-    m['rbf_l'] = mgp._get_params()[1] + 1
+    #m['rbf_v'] = mgp._get_params()[0]
+    #m['rbf_l'] = mgp._get_params()[1] + 1
     m.ensure_default_constraints()
+    #m.constrain_fixed('rbf_v', mgp._get_params()[0])
+    #m.constrain_fixed('rbf_l', mgp._get_params()[1])
+    #m.constrain_bounded('t_no', 2*real_std**2, 1e3)
+    #m.constrain_positive('bias')
     m.constrain_positive('t_no')
+    m.randomize()
+    m['t_no'] = 0.3
+    m.likelihood.X = X
     print m
     plt.figure()
     plt.subplot(511)
@@ -143,7 +202,8 @@ def student_t_fix_optimise_check():
     Y = np.sin(X*2*np.pi) + noise
     X_full = X
     Y_full = np.sin(X_full)
-    #Y = Y/Y.max()
+    Y = Y/Y.max()
+    Y_full = Y_full/Y_full.max()
     deg_free = 1000
 
     #GP
@@ -219,7 +279,7 @@ def student_t_fix_optimise_check():
     plt.subplot(121)
     mrbf.plot()
     plt.title('Student t fixed noise')
-    #mrbf.optimize()
+    mrbf.optimize()
     print "After optimize"
     print mrbf
     plt.subplot(122)
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 5343f5dc..8b39f222 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -156,17 +156,23 @@ class Laplace(likelihood):
         Y_tilde = Wi*self.Ki_f + self.f_hat
 
         self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
+        #self.Wi_K_i[self.Wi_K_i< 1e-6] = 1e-6
+
         self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
         self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
 
         self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-        Z_tilde = (#+ self.NORMAL_CONST
+        self.aA = 0.5*self.ln_det_K_Wi__Bi
+        self.bB = - 0.5*self.f_Ki_f
+        self.cC = 0.5*self.y_Wi_Ki_i_y
+        Z_tilde = (+ 100*self.NORMAL_CONST
                    + self.lik
                    + 0.5*self.ln_det_K_Wi__Bi
                    - 0.5*self.f_Ki_f
                    + 0.5*self.y_Wi_Ki_i_y
                   )
-        #print "Ztilde: {}".format(Z_tilde)
+        print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC)
+        print self.likelihood_function._get_params()
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -198,7 +204,7 @@ class Laplace(likelihood):
         self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-10  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                        #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                        #To cause the posterior to become less certain than the prior and likelihood,
                                        #This is a property only held by non-log-concave likelihoods
@@ -280,7 +286,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=40, MAX_RESTART=10):
+    def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
         """
         Rasmussens numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -290,15 +296,19 @@ class Laplace(likelihood):
         :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
         :returns: f_mode
         """
-        old_a = np.zeros((self.N, 1))
-        #old_a = None
-        #if self.old_a is None:
-            #old_a = np.zeros((self.N, 1))
-        #else:
-            #old_a = self.old_a
+        self.old_before_s = self.likelihood_function._get_params()
+        print "before: ", self.old_before_s
+        #if self.old_before_s < 1e-5:
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+        #old_a = np.zeros((self.N, 1))
+        if self.old_a is None:
+            old_a = np.zeros((self.N, 1))
+            f = np.dot(K, old_a)
+        else:
+            old_a = self.old_a.copy()
+            f = self.f_hat.copy()
 
-        f = np.dot(self.K, old_a)
-        self.f = f
         new_obj = -np.inf
         old_obj = np.inf
 
@@ -306,18 +316,20 @@ class Laplace(likelihood):
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-10
+        epsilon = 1e-4
         step_size = 1
         rs = 0
         i = 0
-        while difference > epsilon and i < MAX_ITER and rs < MAX_RESTART:
+
+        while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
+            #W = np.maximum(W, 0)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-10     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
-            B, L, W_12 = self._compute_B_statistics(K, W)
+            B, L, W_12 = self._compute_B_statistics(K, W.copy())
 
             W_f = W*f
             grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
@@ -328,54 +340,105 @@ class Laplace(likelihood):
             full_step_a = b - W_12*solve_L
             da = full_step_a - old_a
 
-            f_old = f.copy()
-
-            f_old = self.f.copy()
-            def inner_obj(step_size, old_a, da, K):
-                a = old_a + step_size*da
-                f = np.dot(K, a)
-                self.a = a # This is nasty, need to set something within an optimization though
-                self.f = f
-                return -obj(a, f)
-
-            from functools import partial
-            i_o = partial(inner_obj, old_a=old_a, da=da, K=self.K)
-            new_obj = sp.optimize.brent(i_o, tol=1e-6, maxiter=10)
-
-            #update_passed = False
-            #while not update_passed:
+            #f_old = f.copy()
+            #def inner_obj(step_size, old_a, da, K):
                 #a = old_a + step_size*da
                 #f = np.dot(K, a)
+                #self.a = a.copy() # This is nasty, need to set something within an optimization though
+                #self.f = f.copy()
+                #return -obj(a, f)
 
-                #old_obj = new_obj
-                #new_obj = obj(a, f)
-                #difference = new_obj - old_obj
-                #print "difference: ",difference
-                #if difference < 0:
-                    ##print grad
-                    ##print "Objective function rose", np.float(difference)
-                    ##If the objective function isn't rising, restart optimization
-                    #step_size *= 0.8
-                    ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
-                    ##objective function isn't increasing, try reducing step size
-                    ##f = f_old #it's actually faster not to go back to old location and just zigzag across the mode
-                    ##old_obj = tmp_old_obj
-                    #old_obj = new_obj
-                    #rs += 1
-                #else:
-                    #update_passed = True
+            #from functools import partial
+            #i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
+            ##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
+            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
+            #f = self.f.copy()
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
-            f = self.f
-            difference = new_obj - old_obj
-            difference = np.abs(np.sum(f - f_old)) #+ abs(difference)
-            old_a = self.a #a
+            f_old = f.copy()
+            update_passed = False
+            while not update_passed:
+                a = old_a + step_size*da
+                f = np.dot(K, a)
+
+                old_obj = new_obj
+                new_obj = obj(a, f)
+                difference = new_obj - old_obj
+                print "difference: ",difference
+                if difference < 0:
+                    #print "Objective function rose", np.float(difference)
+                    #If the objective function isn't rising, restart optimization
+                    step_size *= 0.8
+                    #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    #objective function isn't increasing, try reducing step size
+                    f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
+                    old_obj = new_obj
+                    rs += 1
+                else:
+                    update_passed = True
+
+            #difference = abs(new_obj - old_obj)
+            #old_obj = new_obj.copy()
+            difference = np.abs(np.sum(f - f_old))
+            #old_a = self.a.copy() #a
+            old_a = a.copy()
             i += 1
+            #print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
 
-        self.old_a = old_a
+        self.old_a = old_a.copy()
         #print "Positive difference obj: ", np.float(difference)
         #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
         print "Iterations: {}, Final_difference: {}".format(i, difference)
-        #self.a = a
+        if difference > 1e-4:
+            print "FAIL FAIL FAIL FAIL FAIL FAIL"
+            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+            if hasattr(self, 'X'):
+                import pylab as pb
+                pb.figure()
+                pb.subplot(311)
+                pb.title('old f_hat')
+                pb.plot(self.X, self.f_hat)
+                pb.subplot(312)
+                pb.title('old ff')
+                pb.plot(self.X, self.old_ff)
+                pb.subplot(313)
+                pb.title('new f_hat')
+                pb.plot(self.X, f)
+
+                pb.figure()
+                pb.subplot(121)
+                pb.title('old K')
+                pb.imshow(np.diagflat(self.old_K), interpolation='none')
+                pb.colorbar()
+                pb.subplot(122)
+                pb.title('new K')
+                pb.imshow(np.diagflat(K), interpolation='none')
+                pb.colorbar()
+
+                pb.figure()
+                pb.subplot(121)
+                pb.title('old W')
+                pb.imshow(np.diagflat(self.old_W), interpolation='none')
+                pb.colorbar()
+                pb.subplot(122)
+                pb.title('new W')
+                pb.imshow(np.diagflat(W), interpolation='none')
+                pb.colorbar()
+
+                import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+                pb.close('all')
+
+        #FIXME: DELETE THESE
+        self.old_W = W.copy()
+        self.old_grad = grad.copy()
+        self.old_B = B.copy()
+        self.old_W_12 = W_12.copy()
+        self.old_ff = f.copy()
+        self.old_K = self.K.copy()
+        self.old_s = self.likelihood_function._get_params()
+        print "after: ", self.old_s
+        #print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
+        self.a = a
         #self.B, self.B_chol, self.W_12 = B, L, W_12
         #self.Bi, _, _, B_det = pdinv(self.B)
         return f
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 595fa63c..62e09a1a 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -193,11 +193,16 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
+        #A = gammaln((self.v + 1) * 0.5)
+        #B = - gammaln(self.v * 0.5)
+        #C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
+        #D = + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - 0.5*np.log(self.sigma2 * self.v * np.pi)
                      + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
                     )
+        #print "C: {} D: {} obj: {}".format(C, np.sum(D), objective.sum())
         return np.sum(objective)
 
     def dlik_df(self, y, f, extra_data=None):
@@ -459,147 +464,153 @@ class weibull_survival(likelihood_function):
         hess = (y**self.shape)*np.exp(f)
         return np.squeeze(hess)
 
-#class gaussian(likelihood_function):
-    #"""
-    #Gaussian likelihood - this is a test class for approximation schemes
-    #"""
-    #def __init__(self, variance):
-        #self._set_params(np.asarray(variance))
+class gaussian(likelihood_function):
+    """
+    Gaussian likelihood - this is a test class for approximation schemes
+    """
+    def __init__(self, variance):
+        self._set_params(np.asarray(variance))
 
-    #def _get_params(self):
-        #return np.asarray(self.sigma2)
+    def _get_params(self):
+        return np.asarray(self._variance)
 
-    #def _get_param_names(self):
-        #return ["noise_variance"]
+    def _get_param_names(self):
+        return ["noise_variance"]
 
-    #def _set_params(self, x):
-        #self.variance = float(x)
+    def _set_params(self, x):
+        self._variance = float(x)
+        self.covariance_matrix = np.eye(self.N) * self._variance
+        self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG
 
-    #def link_function(self, y, f, extra_data=None):
-        #"""link_function $\ln p(y|f)$
-        #$$\ln p(y_{i}|f_{i}) = \ln $$
+    def link_function(self, y, f, extra_data=None):
+        """link_function $\ln p(y|f)$
+        $$\ln p(y_{i}|f_{i}) = \ln $$
 
-        #:y: data
-        #:f: latent variables f
-        #:extra_data: extra_data which is not used in student t distribution
-        #:returns: float(likelihood evaluated for this point)
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: float(likelihood evaluated for this point)
 
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #objective = -0.5*self.D*
-        #return np.sum(objective)
+        """
+        assert y.shape == f.shape
+        e = y - f
+        eeT = np.dot(e, e.T)
+        objective = (- 0.5*self.D*np.log(2*np.pi)
+                     - 0.5*self.ln_K
+                     - 0.5*np.sum(np.multiply(self.Ki, eeT))
+                     )
+        return np.sum(objective)
 
-    #def dlik_df(self, y, f, extra_data=None):
-        #"""
-        #Gradient of the link function at y, given f w.r.t f
+    def dlik_df(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
 
-        #$$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
+        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
 
-        #:y: data
-        #:f: latent variables f
-        #:extra_data: extra_data which is not used in student t distribution
-        #:returns: gradient of likelihood evaluated at points
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: gradient of likelihood evaluated at points
 
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
-        #return grad
+        """
+        assert y.shape == f.shape
+        e = y - f
+        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
+        return grad
 
-    #def d2lik_d2f(self, y, f, extra_data=None):
-        #"""
-        #Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
-        #i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+    def d2lik_d2f(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
 
-        #Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-        #(the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+        Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+        (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
 
-        #$$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
+        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
 
-        #:y: data
-        #:f: latent variables f
-        #:extra_data: extra_data which is not used in student t distribution
-        #:returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
-        #return hess
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        assert y.shape == f.shape
+        e = y - f
+        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
+        return hess
 
-    #def d3lik_d3f(self, y, f, extra_data=None):
-        #"""
-        #Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+    def d3lik_d3f(self, y, f, extra_data=None):
+        """
+        Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
 
-        #$$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
-                       #((e**2 + self.sigma2*self.v)**3)
-                    #)
-        #return d3lik_d3f
+        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       ((e**2 + self.sigma2*self.v)**3)
+                    )
+        return d3lik_d3f
 
-    #def lik_dstd(self, y, f, extra_data=None):
-        #"""
-        #Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
+    def lik_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
 
-        #Terms relavent to derivatives wrt sigma are:
-        #-log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+        Terms relavent to derivatives wrt sigma are:
+        -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
 
-        #$$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        ##dlik_dsigma = ( - (1/sigma) +
-                        ##((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      ##)
-        ##dlik_dsigma = ( - 1 +
-                        ##((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      ##)
-        ##dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
-        #dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
-        #return dlik_dsigma
+        $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        sigma = np.sqrt(self.sigma2)
+        #dlik_dsigma = ( - (1/sigma) +
+                        #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      #)
+        #dlik_dsigma = ( - 1 +
+                        #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
+                      #)
+        #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
+        dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
+        return dlik_dsigma
 
-    #def dlik_df_dstd(self, y, f, extra_data=None):
-        #"""
-        #Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
+    def dlik_df_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
 
-        #$$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        #dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
-                            #/ ((self.v*self.sigma2 + e**2)**2)
-                           #)
-        #return dlik_grad_dsigma
+        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        sigma = np.sqrt(self.sigma2)
+        dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
+                            / ((self.v*self.sigma2 + e**2)**2)
+                           )
+        return dlik_grad_dsigma
 
-    #def d2lik_d2f_dstd(self, y, f, extra_data=None):
-        #"""
-        #Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
+    def d2lik_d2f_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
 
-        #$$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
-        #"""
-        #assert y.shape == f.shape
-        #e = y - f
-        #sigma = np.sqrt(self.sigma2)
-        #dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
-                              #((e**2 + self.sigma2*self.v)**3)
-                           #)
-        ##dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
-                             ##/ ((e**2 + (self.sigma**2)*self.v)**3) )
-        #return dlik_hess_dsigma
+        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        sigma = np.sqrt(self.sigma2)
+        dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
+                              ((e**2 + self.sigma2*self.v)**3)
+                           )
+        #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
+                             #/ ((e**2 + (self.sigma**2)*self.v)**3) )
+        return dlik_hess_dsigma
 
-    #def _gradients(self, y, f, extra_data=None):
-        ##must be listed in same order as 'get_param_names'
-        #derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
-                  #[self.dlik_df_dstd(y, f, extra_data=extra_data)],
-                  #[self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
-                 #) # lists as we might learn many parameters
-        ## ensure we have gradients for every parameter we want to optimize
-        #assert len(derivs[0]) == len(self._get_param_names())
-        #assert len(derivs[1]) == len(self._get_param_names())
-        #assert len(derivs[2]) == len(self._get_param_names())
-        #return derivs
+    def _gradients(self, y, f, extra_data=None):
+        #must be listed in same order as 'get_param_names'
+        derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
+                  [self.dlik_df_dstd(y, f, extra_data=extra_data)],
+                  [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+                 ) # lists as we might learn many parameters
+        # ensure we have gradients for every parameter we want to optimize
+        assert len(derivs[0]) == len(self._get_param_names())
+        assert len(derivs[1]) == len(self._get_param_names())
+        assert len(derivs[2]) == len(self._get_param_names())
+        return derivs

From fdb7b99e0bd8a740dd898317aab5cd506b97e34e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 29 Jul 2013 17:21:52 +0100
Subject: [PATCH 066/252] Got rid of some overdoing the approximation

---
 GPy/likelihoods/Laplace.py |  2 +-
 GPy/models/GP.py           | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 8b39f222..f86c47b6 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -165,7 +165,7 @@ class Laplace(likelihood):
         self.aA = 0.5*self.ln_det_K_Wi__Bi
         self.bB = - 0.5*self.f_Ki_f
         self.cC = 0.5*self.y_Wi_Ki_i_y
-        Z_tilde = (+ 100*self.NORMAL_CONST
+        Z_tilde = (#+ 100*self.NORMAL_CONST
                    + self.lik
                    + 0.5*self.ln_det_K_Wi__Bi
                    - 0.5*self.f_Ki_f
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 0f56e21c..77620488 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -132,9 +132,9 @@ class GP(model):
         model for a new variable Y* = v_tilde/tau_tilde, with a covariance
         matrix K* = K + diag(1./tau_tilde) plus a normalization term.
         """
-        if isinstance(self.likelihood, Laplace):
-            self.likelihood.fit_full(self.kern.K(self.X))
-            self.likelihood._set_params(self.likelihood._get_params())
+        #if isinstance(self.likelihood, Laplace):
+            #self.likelihood.fit_full(self.kern.K(self.X))
+            #self.likelihood._set_params(self.likelihood._get_params())
         l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
         print "K_ldet: {} mft: {} Z: {}".format(self.K_logdet, self._model_fit_term(), self.likelihood.Z)
         return l
@@ -148,8 +148,8 @@ class GP(model):
         dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
         print "dL_dthetaK should be: ", dL_dthetaK
         if isinstance(self.likelihood, Laplace):
-            self.likelihood.fit_full(self.kern.K(self.X))
-            self.likelihood._set_params(self.likelihood._get_params())
+            #self.likelihood.fit_full(self.kern.K(self.X))
+            #self.likelihood._set_params(self.likelihood._get_params())
             dK_dthetaK = self.kern.dK_dtheta
             dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy())
             dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))

From 9364efc755405fdb3b424f4e3ffc01e68694b31e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 30 Jul 2013 16:11:03 +0100
Subject: [PATCH 067/252] Started adding gaussian sanity checker

---
 GPy/examples/laplace_approximations.py  | 10 ++--
 GPy/likelihoods/Laplace.py              | 80 +++++++++++++------------
 GPy/likelihoods/likelihood_functions.py | 58 +++++-------------
 3 files changed, 60 insertions(+), 88 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 2b93122c..e8b6419f 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -168,23 +168,23 @@ def student_t_f_check():
     m.randomize()
     m['t_no'] = 0.3
     m.likelihood.X = X
-    print m
+    #print m
     plt.figure()
     plt.subplot(511)
     m.plot()
-    print m
+    #print m
     plt.subplot(512)
     m.optimize(max_f_eval=15)
     m.plot()
-    print m
+    #print m
     plt.subplot(513)
     m.optimize(max_f_eval=15)
     m.plot()
-    print m
+    #print m
     plt.subplot(514)
     m.optimize(max_f_eval=15)
     m.plot()
-    print m
+    #print m
     plt.subplot(515)
     m.optimize()
     m.plot()
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index f86c47b6..aeda17da 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -89,7 +89,8 @@ class Laplace(likelihood):
         expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
         dL_dthetaK_exp = dK_dthetaK(expl, X)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
-        print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        #print "expl_a: {}, {}     expl_b: {}, {}".format(np.mean(expl_a), np.std(expl_a), np.mean(expl_b), np.std(expl_b))
         dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
         return dL_dthetaK
 
@@ -165,8 +166,7 @@ class Laplace(likelihood):
         self.aA = 0.5*self.ln_det_K_Wi__Bi
         self.bB = - 0.5*self.f_Ki_f
         self.cC = 0.5*self.y_Wi_Ki_i_y
-        Z_tilde = (#+ 100*self.NORMAL_CONST
-                   + self.lik
+        Z_tilde = (+ self.lik
                    + 0.5*self.ln_det_K_Wi__Bi
                    - 0.5*self.f_Ki_f
                    + 0.5*self.y_Wi_Ki_i_y
@@ -379,7 +379,8 @@ class Laplace(likelihood):
 
             #difference = abs(new_obj - old_obj)
             #old_obj = new_obj.copy()
-            difference = np.abs(np.sum(f - f_old))
+            #difference = np.abs(np.sum(f - f_old))
+            difference = np.abs(np.sum(a - old_a))
             #old_a = self.a.copy() #a
             old_a = a.copy()
             i += 1
@@ -391,42 +392,43 @@ class Laplace(likelihood):
         print "Iterations: {}, Final_difference: {}".format(i, difference)
         if difference > 1e-4:
             print "FAIL FAIL FAIL FAIL FAIL FAIL"
-            import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-            if hasattr(self, 'X'):
-                import pylab as pb
-                pb.figure()
-                pb.subplot(311)
-                pb.title('old f_hat')
-                pb.plot(self.X, self.f_hat)
-                pb.subplot(312)
-                pb.title('old ff')
-                pb.plot(self.X, self.old_ff)
-                pb.subplot(313)
-                pb.title('new f_hat')
-                pb.plot(self.X, f)
-
-                pb.figure()
-                pb.subplot(121)
-                pb.title('old K')
-                pb.imshow(np.diagflat(self.old_K), interpolation='none')
-                pb.colorbar()
-                pb.subplot(122)
-                pb.title('new K')
-                pb.imshow(np.diagflat(K), interpolation='none')
-                pb.colorbar()
-
-                pb.figure()
-                pb.subplot(121)
-                pb.title('old W')
-                pb.imshow(np.diagflat(self.old_W), interpolation='none')
-                pb.colorbar()
-                pb.subplot(122)
-                pb.title('new W')
-                pb.imshow(np.diagflat(W), interpolation='none')
-                pb.colorbar()
-
+            if False:
                 import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-                pb.close('all')
+                if hasattr(self, 'X'):
+                    import pylab as pb
+                    pb.figure()
+                    pb.subplot(311)
+                    pb.title('old f_hat')
+                    pb.plot(self.X, self.f_hat)
+                    pb.subplot(312)
+                    pb.title('old ff')
+                    pb.plot(self.X, self.old_ff)
+                    pb.subplot(313)
+                    pb.title('new f_hat')
+                    pb.plot(self.X, f)
+
+                    pb.figure()
+                    pb.subplot(121)
+                    pb.title('old K')
+                    pb.imshow(np.diagflat(self.old_K), interpolation='none')
+                    pb.colorbar()
+                    pb.subplot(122)
+                    pb.title('new K')
+                    pb.imshow(np.diagflat(K), interpolation='none')
+                    pb.colorbar()
+
+                    pb.figure()
+                    pb.subplot(121)
+                    pb.title('old W')
+                    pb.imshow(np.diagflat(self.old_W), interpolation='none')
+                    pb.colorbar()
+                    pb.subplot(122)
+                    pb.title('new W')
+                    pb.imshow(np.diagflat(W), interpolation='none')
+                    pb.colorbar()
+
+                    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+                    pb.close('all')
 
         #FIXME: DELETE THESE
         self.old_W = W.copy()
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 62e09a1a..42af9c8d 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -239,7 +239,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
+        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -277,7 +277,7 @@ class student_t(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_grad_dsigma = (-self.v*(self.v+1)*e)/((self.sigma2*self.v + e**2)**2)
+        dlik_grad_dsigma = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
         return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
@@ -289,7 +289,7 @@ class student_t(likelihood_function):
         assert y.shape == f.shape
         e = y - f
         dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
-                              / (self.sigma2*self.v + (e**2))**3
+                              / ((self.sigma2*self.v + (e**2))**3)
                            )
         return dlik_hess_dsigma
 
@@ -479,7 +479,8 @@ class gaussian(likelihood_function):
 
     def _set_params(self, x):
         self._variance = float(x)
-        self.covariance_matrix = np.eye(self.N) * self._variance
+        self.I = np.eye(self.N)
+        self.covariance_matrix = self.I * self._variance
         self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG
 
     def link_function(self, y, f, extra_data=None):
@@ -505,8 +506,6 @@ class gaussian(likelihood_function):
         """
         Gradient of the link function at y, given f w.r.t f
 
-        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
-
         :y: data
         :f: latent variables f
         :extra_data: extra_data which is not used in student t distribution
@@ -514,8 +513,8 @@ class gaussian(likelihood_function):
 
         """
         assert y.shape == f.shape
-        e = y - f
-        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
+        s2_i = (1.0/self._variance)*self.I
+        grad = np.dot(s2_i, y) - 0.5*np.dot(s2_i, f)
         return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
@@ -526,16 +525,14 @@ class gaussian(likelihood_function):
         Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
         (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
 
-        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
-
         :y: data
         :f: latent variables f
         :extra_data: extra_data which is not used in student t distribution
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
         assert y.shape == f.shape
-        e = y - f
-        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / (((self.sigma2*self.v) + e**2)**2)
+        s2_i = (1.0/self._variance)*self.I
+        hess = np.diagonal(-0.5*s2_i)
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -545,46 +542,25 @@ class gaussian(likelihood_function):
         $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
         """
         assert y.shape == f.shape
-        e = y - f
-        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
-                       ((e**2 + self.sigma2*self.v)**3)
-                    )
+        d3lik_d3f = np.diagonal(0*self.I)
         return d3lik_d3f
 
     def lik_dstd(self, y, f, extra_data=None):
         """
         Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
-
-        Terms relavent to derivatives wrt sigma are:
-        -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
-
-        $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
         """
         assert y.shape == f.shape
         e = y - f
-        sigma = np.sqrt(self.sigma2)
-        #dlik_dsigma = ( - (1/sigma) +
-                        #((1+self.v)*(e**2))/((sigma*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      #)
-        #dlik_dsigma = ( - 1 +
-                        #((1+self.v)*(e**2))/((self.sigma2*self.sigma2)*self.v*(1 + ((e**2) / (self.sigma2*self.v)) ) )
-                      #)
-        #dlik_dsigma = (((self.v + 1)*(e**2))/((e**2) + self.v*(self.sigma**2))) - 1
-        dlik_dsigma = (self.v*((e**2)-self.sigma2))/(sigma*((e**2)+self.sigma2*self.v))
+        dlik_dsigma = -0.5*self.N*self._variance - 0.5*np.dot(e.T, e)
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
         """
         Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
-
-        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
         """
         assert y.shape == f.shape
-        e = y - f
-        sigma = np.sqrt(self.sigma2)
-        dlik_grad_dsigma = ((-2*sigma*self.v*(self.v + 1)*e) #2 might not want to be here?
-                            / ((self.v*self.sigma2 + e**2)**2)
-                           )
+        s_4 = 1.0/(self._variance**2)
+        dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f))
         return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
@@ -594,13 +570,7 @@ class gaussian(likelihood_function):
         $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
         assert y.shape == f.shape
-        e = y - f
-        sigma = np.sqrt(self.sigma2)
-        dlik_hess_dsigma = (  (2*sigma*self.v*(self.v + 1)*(self.sigma2*self.v - 3*(e**2))) /
-                              ((e**2 + self.sigma2*self.v)**3)
-                           )
-        #dlik_hess_dsigma = ( 2*(self.v + 1)*self.v*(self.sigma**2)*((e**2) + (self.v*(self.sigma**2)) - 4*(e**2))
-                             #/ ((e**2 + (self.sigma**2)*self.v)**3) )
+        dlik_hess_dsigma = 1.0/(2*(self._variance**2))
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):

From 1314868ea8cf4c81d0c76f90dd4a8b11a123c427 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 16 Aug 2013 11:16:47 +0100
Subject: [PATCH 068/252] Added gaussian checker and gaussian likelihood, not
 checkgrading yet

---
 GPy/examples/laplace_approximations.py  | 65 +++++++++++++++++++------
 GPy/likelihoods/likelihood_functions.py | 38 ++++++++++-----
 2 files changed, 77 insertions(+), 26 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index e8b6419f..02b38a79 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -170,28 +170,18 @@ def student_t_f_check():
     m.likelihood.X = X
     #print m
     plt.figure()
-    plt.subplot(511)
+    plt.subplot(211)
     m.plot()
-    #print m
-    plt.subplot(512)
-    m.optimize(max_f_eval=15)
-    m.plot()
-    #print m
-    plt.subplot(513)
-    m.optimize(max_f_eval=15)
-    m.plot()
-    #print m
-    plt.subplot(514)
-    m.optimize(max_f_eval=15)
-    m.plot()
-    #print m
-    plt.subplot(515)
+    print "OPTIMIZED ONCE"
+    plt.subplot(212)
     m.optimize()
     m.plot()
     print "final optimised student t"
     print m
     print "real GP"
     print mgp
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    return m
 
 def student_t_fix_optimise_check():
     plt.close('all')
@@ -602,3 +592,48 @@ def noisy_laplace_approx():
     print m
 
     #with a student t distribution, since it has heavy tails it should work well
+
+def gaussian_f_check():
+    plt.close('all')
+    X = np.linspace(0, 1, 50)[:, None]
+    real_std = 0.2
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
+
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp.randomize()
+    mgp.optimize()
+    print "Gaussian"
+    print mgp
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+    kernelg = kernelgp.copy()
+    #kernelst += GPy.kern.bias(X.shape[1])
+    N, D = X.shape
+    g_distribution = GPy.likelihoods.likelihood_functions.gaussian(variance=0.1, N=N, D=D)
+    g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm')
+    m = GPy.models.GP(X, g_likelihood, kernelg)
+    #m['rbf_v'] = mgp._get_params()[0]
+    #m['rbf_l'] = mgp._get_params()[1] + 1
+    m.ensure_default_constraints()
+    #m.constrain_fixed('rbf_v', mgp._get_params()[0])
+    #m.constrain_fixed('rbf_l', mgp._get_params()[1])
+    #m.constrain_bounded('t_no', 2*real_std**2, 1e3)
+    #m.constrain_positive('bias')
+    m.constrain_positive('noise_var')
+    m.randomize()
+    m['noise_variance'] = 0.1
+    m.likelihood.X = X
+    plt.figure()
+    plt.subplot(211)
+    m.plot()
+    plt.subplot(212)
+    m.optimize()
+    m.plot()
+    print "final optimised student t"
+    print m
+    print "real GP"
+    print mgp
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 42af9c8d..81d93f6b 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -9,7 +9,7 @@ from ..util.plot import gpplot
 from scipy.special import gammaln, gamma
 from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 
-class likelihood_function:
+class likelihood_function(object):
     """ Likelihood class for doing Expectation propagation
 
     :param Y: observed output (Nx1 numpy.darray)
@@ -159,7 +159,7 @@ class student_t(likelihood_function):
     d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free, sigma2=2):
-        #super(student_t, self).__init__()
+        super(student_t, self).__init__()
         self.v = deg_free
         self.sigma2 = sigma2
         self.log_concave = False
@@ -468,9 +468,16 @@ class gaussian(likelihood_function):
     """
     Gaussian likelihood - this is a test class for approximation schemes
     """
-    def __init__(self, variance):
+    def __init__(self, variance, D, N):
+        super(gaussian, self).__init__()
+        self.D = D
+        self.N = N
         self._set_params(np.asarray(variance))
 
+        #Don't support normalizing yet
+        self._bias = np.zeros((1, self.D))
+        self._scale = np.ones((1, self.D))
+
     def _get_params(self):
         return np.asarray(self._variance)
 
@@ -481,7 +488,8 @@ class gaussian(likelihood_function):
         self._variance = float(x)
         self.I = np.eye(self.N)
         self.covariance_matrix = self.I * self._variance
-        self.Ki, _, _, self.ln_K = pdinv(self.covariance_matrix) # THIS MAY BE WRONG
+        self.Ki = self.I*(1.0 / self._variance)
+        self.ln_K = np.trace(self.covariance_matrix)
 
     def link_function(self, y, f, extra_data=None):
         """link_function $\ln p(y|f)$
@@ -498,7 +506,8 @@ class gaussian(likelihood_function):
         eeT = np.dot(e, e.T)
         objective = (- 0.5*self.D*np.log(2*np.pi)
                      - 0.5*self.ln_K
-                     - 0.5*np.sum(np.multiply(self.Ki, eeT))
+                     #- 0.5*np.sum(np.multiply(self.Ki, eeT))
+                     - 0.5*np.dot(np.dot(e.T, self.Ki), e)
                      )
         return np.sum(objective)
 
@@ -514,7 +523,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         s2_i = (1.0/self._variance)*self.I
-        grad = np.dot(s2_i, y) - 0.5*np.dot(s2_i, f)
+        grad = np.dot(s2_i, y) - np.dot(s2_i, f)
         return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
@@ -532,7 +541,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         s2_i = (1.0/self._variance)*self.I
-        hess = np.diagonal(-0.5*s2_i)
+        hess = np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -542,7 +551,7 @@ class gaussian(likelihood_function):
         $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
         """
         assert y.shape == f.shape
-        d3lik_d3f = np.diagonal(0*self.I)
+        d3lik_d3f = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
         return d3lik_d3f
 
     def lik_dstd(self, y, f, extra_data=None):
@@ -551,7 +560,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_dsigma = -0.5*self.N*self._variance - 0.5*np.dot(e.T, e)
+        dlik_dsigma = -0.5*self.D/self._variance - 0.5*np.trace(np.dot(e.T, np.dot(self.I, e)))
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
@@ -560,7 +569,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         s_4 = 1.0/(self._variance**2)
-        dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f))
+        dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + np.dot(s_4, np.dot(self.I, f))
         return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
@@ -570,7 +579,7 @@ class gaussian(likelihood_function):
         $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
         assert y.shape == f.shape
-        dlik_hess_dsigma = 1.0/(2*(self._variance**2))
+        dlik_hess_dsigma = np.diag(1.0/(self._variance**2)*self.I)[:, None]
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):
@@ -584,3 +593,10 @@ class gaussian(likelihood_function):
         assert len(derivs[1]) == len(self._get_param_names())
         assert len(derivs[2]) == len(self._get_param_names())
         return derivs
+
+    def predictive_values(self, mu, var):
+        mean = mu * self._scale + self._bias
+        true_var = (var + self._variance) * self._scale ** 2
+        _5pc = mean - 2.*np.sqrt(true_var)
+        _95pc = mean + 2.*np.sqrt(true_var)
+        return mean, true_var, _5pc, _95pc

From 000491b25da515a595c25fbc57e3dcbc3ee4e3f4 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 28 Aug 2013 13:26:15 +0100
Subject: [PATCH 069/252] Gaussian likelihood errors, still not working

---
 GPy/likelihoods/likelihood_functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 81d93f6b..25f770b5 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -560,7 +560,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_dsigma = -0.5*self.D/self._variance - 0.5*np.trace(np.dot(e.T, np.dot(self.I, e)))
+        dlik_dsigma = -0.5*self.N/self._variance - 0.5*np.trace(np.dot(e.T, np.dot(self.I, e)))
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
@@ -579,7 +579,7 @@ class gaussian(likelihood_function):
         $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
         assert y.shape == f.shape
-        dlik_hess_dsigma = np.diag(1.0/(self._variance**2)*self.I)[:, None]
+        dlik_hess_dsigma = np.diag((1.0/(self._variance**2))*self.I)[:, None]
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):

From 54954c63f83d566a383bd0d2b14dadaa66ce363e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 29 Aug 2013 13:47:56 +0100
Subject: [PATCH 070/252] A few typos

---
 GPy/examples/laplace_approximations.py | 2 +-
 GPy/likelihoods/Laplace.py             | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 02b38a79..8be08a8f 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -632,7 +632,7 @@ def gaussian_f_check():
     plt.subplot(212)
     m.optimize()
     m.plot()
-    print "final optimised student t"
+    print "final optimised gaussian"
     print m
     print "real GP"
     print mgp
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index aeda17da..58304c23 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -105,8 +105,15 @@ class Laplace(likelihood):
         dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter
         for thetaL_i in range(num_params):
             #Explicit
+            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            #a = 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            #d = dlik_hess_dthetaL[thetaL_i]
+            #e = pdinv(pdinv(self.K)[0] + np.diagflat(self.W))[0]
+            #b = 0.5*np.dot(np.diag(e).T, d)
+            #g = 0.5*(np.diag(self.K) - np.sum(cho_solve((self.B_chol, True), np.dot(np.diagflat(self.W_12),self.K))**2, 1))
+            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - np.dot(g.T, dlik_hess_dthetaL[thetaL_i])
             dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
-            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.trace(mdot(self.Bi, self.K, dlik_hess_dthetaL[thetaL_i]))
+
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
             dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL)

From f943cf9ddb9db80556ff7873108d22ac48113c2d Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 9 Sep 2013 11:54:32 +0100
Subject: [PATCH 071/252] Changed the gradients (perhaps for the worse)

---
 GPy/likelihoods/likelihood_functions.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 25f770b5..72d2ff82 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -523,7 +523,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         s2_i = (1.0/self._variance)*self.I
-        grad = np.dot(s2_i, y) - np.dot(s2_i, f)
+        grad = np.dot(s2_i, y) - 0.5*np.dot(s2_i, f)
         return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
@@ -541,7 +541,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         s2_i = (1.0/self._variance)*self.I
-        hess = np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
+        hess = 0.5*np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -560,7 +560,8 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_dsigma = -0.5*self.N/self._variance - 0.5*np.trace(np.dot(e.T, np.dot(self.I, e)))
+        s_4 = 1.0/(self._variance**2)
+        dlik_dsigma = -0.5*self.N*1/self._variance + 0.5*s_4*np.trace(np.dot(e.T, np.dot(self.I, e)))
         return dlik_dsigma
 
     def dlik_df_dstd(self, y, f, extra_data=None):
@@ -569,7 +570,7 @@ class gaussian(likelihood_function):
         """
         assert y.shape == f.shape
         s_4 = 1.0/(self._variance**2)
-        dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + np.dot(s_4, np.dot(self.I, f))
+        dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f))
         return dlik_grad_dsigma
 
     def d2lik_d2f_dstd(self, y, f, extra_data=None):
@@ -579,7 +580,7 @@ class gaussian(likelihood_function):
         $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
         assert y.shape == f.shape
-        dlik_hess_dsigma = np.diag((1.0/(self._variance**2))*self.I)[:, None]
+        dlik_hess_dsigma = 0.5*np.diag((1.0/(self._variance**2))*self.I)[:, None]
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):

From 1985cdcdbba57b49214e536684890f42e32b4bce Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 9 Sep 2013 13:29:53 +0100
Subject: [PATCH 072/252] Empty branch

---
 .gitignore  | 41 +++++++++++++++++++++++++++++++++++++++++
 .travis.yml | 21 +++++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .travis.yml

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..60866848
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,41 @@
+*.py[cod]
+
+# C extensions
+*.so
+
+# Packages
+*.egg
+*.egg-info
+dist
+build
+eggs
+parts
+bin
+var
+sdist
+develop-eggs
+.installed.cfg
+lib
+lib64
+
+# Installer logs
+pip-log.txt
+
+# Unit test / coverage reports
+.coverage
+.tox
+nosetests.xml
+
+# Translations
+*.mo
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
+
+#vim
+*.swp
+
+#bfgs optimiser leaves this lying around
+iterate.dat
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 00000000..6d188401
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,21 @@
+language: python
+python:
+  - "2.7"
+
+#Set virtual env with system-site-packages to true
+virtualenv:
+  system_site_packages: true
+
+# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
+before_install: 
+  - sudo apt-get install -qq python-scipy python-pip
+  - sudo apt-get install -qq python-matplotlib
+
+install:
+  - pip install --upgrade numpy==1.7.1 
+  - pip install sphinx 
+  - pip install nose
+  - pip install . --use-mirrors
+# command to run tests, e.g. python setup.py test
+script: 
+  - nosetests GPy/testing

From f641ab54a8b6d32445e7d08cb18902958afcf3e5 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 9 Sep 2013 13:41:58 +0100
Subject: [PATCH 073/252] Checked out relavent files

---
 GPy/examples/laplace_approximations.py | 639 +++++++++++++++++++++++++
 GPy/likelihoods/Laplace.py             | 453 ++++++++++++++++++
 GPy/models/GP.py                       | 319 ++++++++++++
 3 files changed, 1411 insertions(+)
 create mode 100644 GPy/examples/laplace_approximations.py
 create mode 100644 GPy/likelihoods/Laplace.py
 create mode 100644 GPy/models/GP.py

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
new file mode 100644
index 00000000..8be08a8f
--- /dev/null
+++ b/GPy/examples/laplace_approximations.py
@@ -0,0 +1,639 @@
+import GPy
+import numpy as np
+import matplotlib.pyplot as plt
+np.random.seed(1)
+
+def timing():
+    real_var = 0.1
+    times = 1
+    deg_free = 10
+    real_sd = np.sqrt(real_var)
+    the_is = np.zeros(times)
+    X = np.linspace(0.0, 10.0, 300)[:, None]
+
+    for a in xrange(times):
+        Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+        Yc = Y.copy()
+
+        Yc[10] += 100
+        Yc[25] += 10
+        Yc[23] += 10
+        Yc[24] += 10
+        Yc[250] += 10
+        #Yc[4] += 10000
+
+        edited_real_sd = real_sd
+        kernel1 = GPy.kern.rbf(X.shape[1])
+
+        t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+        corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
+        m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
+        m.ensure_default_constraints()
+        m.update_likelihood_approximation()
+        m.optimize()
+        the_is[a] = m.likelihood.i
+
+    print the_is
+    print np.mean(the_is)
+
+def v_fail_test():
+    #plt.close('all')
+    real_var = 0.1
+    X = np.linspace(0.0, 10.0, 50)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
+    Y = Y/Y.max()
+
+    #Add student t random noise to datapoints
+    deg_free = 10
+    real_sd = np.sqrt(real_var)
+    print "Real noise std: ", real_sd
+
+    kernel1 = GPy.kern.white(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+
+    edited_real_sd = 0.3#real_sd
+    edited_real_sd = real_sd
+
+    print "Clean student t, rasm"
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    m = GPy.models.GP(X, stu_t_likelihood, kernel1)
+    m.constrain_positive('')
+    vs = 25
+    noises = 30
+    checkgrads = np.zeros((vs, noises))
+    vs_noises = np.zeros((vs, noises))
+    for v_ind, v in enumerate(np.linspace(1, 100, vs)):
+        m.likelihood.likelihood_function.v = v
+        print v
+        for noise_ind, noise in enumerate(np.linspace(0.0001, 100, noises)):
+            m['t_noise'] = noise
+            m.update_likelihood_approximation()
+            checkgrads[v_ind, noise_ind] = m.checkgrad()
+            vs_noises[v_ind, noise_ind] = (float(v)/(float(v) - 2))*(noise**2)
+
+    plt.figure()
+    plt.title('Checkgrads')
+    plt.imshow(checkgrads, interpolation='nearest')
+    plt.xlabel('noise')
+    plt.ylabel('v')
+
+    #plt.figure()
+    #plt.title('variance change')
+    #plt.imshow(vs_noises, interpolation='nearest')
+    #plt.xlabel('noise')
+    #plt.ylabel('v')
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    print(m)
+
+def student_t_obj_plane():
+    plt.close('all')
+    X = np.linspace(0, 1, 50)[:, None]
+    real_std = 0.002
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
+    deg_free = 1000
+
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp['noise'] = real_std**2
+    print "Gaussian"
+    print mgp
+
+    kernelst = kernelgp.copy()
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2))
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m.ensure_default_constraints()
+    m.constrain_fixed('t_no', real_std**2)
+    vs = 10
+    ls = 10
+    objs_t = np.zeros((vs, ls))
+    objs_g = np.zeros((vs, ls))
+    rbf_vs = np.linspace(1e-6, 8, vs)
+    rbf_ls = np.linspace(1e-2, 8, ls)
+    for v_id, rbf_v in enumerate(rbf_vs):
+        for l_id, rbf_l in enumerate(rbf_ls):
+            m['rbf_v'] = rbf_v
+            m['rbf_l'] = rbf_l
+            mgp['rbf_v'] = rbf_v
+            mgp['rbf_l'] = rbf_l
+            objs_t[v_id, l_id] = m.log_likelihood()
+            objs_g[v_id, l_id] = mgp.log_likelihood()
+    plt.figure()
+    plt.subplot(211)
+    plt.title('Student t')
+    plt.imshow(objs_t, interpolation='none')
+    plt.xlabel('variance')
+    plt.ylabel('lengthscale')
+    plt.subplot(212)
+    plt.title('Gaussian')
+    plt.imshow(objs_g, interpolation='none')
+    plt.xlabel('variance')
+    plt.ylabel('lengthscale')
+    plt.show()
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    return objs_t
+
+def student_t_f_check():
+    plt.close('all')
+    X = np.linspace(0, 1, 50)[:, None]
+    real_std = 0.2
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
+    deg_free = 1000
+
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp.randomize()
+    mgp.optimize()
+    print "Gaussian"
+    print mgp
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+    kernelst = kernelgp.copy()
+    #kernelst += GPy.kern.bias(X.shape[1])
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    #m['rbf_v'] = mgp._get_params()[0]
+    #m['rbf_l'] = mgp._get_params()[1] + 1
+    m.ensure_default_constraints()
+    #m.constrain_fixed('rbf_v', mgp._get_params()[0])
+    #m.constrain_fixed('rbf_l', mgp._get_params()[1])
+    #m.constrain_bounded('t_no', 2*real_std**2, 1e3)
+    #m.constrain_positive('bias')
+    m.constrain_positive('t_no')
+    m.randomize()
+    m['t_no'] = 0.3
+    m.likelihood.X = X
+    #print m
+    plt.figure()
+    plt.subplot(211)
+    m.plot()
+    print "OPTIMIZED ONCE"
+    plt.subplot(212)
+    m.optimize()
+    m.plot()
+    print "final optimised student t"
+    print m
+    print "real GP"
+    print mgp
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    return m
+
+def student_t_fix_optimise_check():
+    plt.close('all')
+    real_var = 0.1
+    real_std = np.sqrt(real_var)
+    X = np.random.rand(200)[:, None]
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
+    X_full = X
+    Y_full = np.sin(X_full)
+    Y = Y/Y.max()
+    Y_full = Y_full/Y_full.max()
+    deg_free = 1000
+
+    #GP
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp.randomize()
+    mgp.optimize()
+
+    kernelst = kernelgp.copy()
+    real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
+
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std2)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+
+    plt.figure(1)
+    plt.suptitle('Student likelihood')
+    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m.constrain_fixed('rbf_var', mgp._get_params()[0])
+    m.constrain_fixed('rbf_len', mgp._get_params()[1])
+    m.constrain_positive('t_noise')
+    #m.ensure_default_constraints()
+
+    m.update_likelihood_approximation()
+    print "T std2 {} converted from original data, LL: {}".format(real_stu_t_std2, m.log_likelihood())
+    plt.subplot(231)
+    m.plot()
+    plt.title('Student t original data noise')
+
+    #Fix student t noise variance to same a GP
+    gp_noise = mgp._get_params()[2]
+    m['t_noise_std2'] = gp_noise
+    m.update_likelihood_approximation()
+    print "T std2 {} same as GP noise, LL: {}".format(gp_noise, m.log_likelihood())
+    plt.subplot(232)
+    m.plot()
+    plt.title('Student t GP noise')
+
+    #Fix student t noise to variance converted from the GP
+    real_stu_t_std2gp = (gp_noise)*((deg_free - 2)/float(deg_free))
+    m['t_noise_std2'] = real_stu_t_std2gp
+    m.update_likelihood_approximation()
+    print "T std2 {} converted to student t noise from GP noise, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.log_likelihood())
+    plt.subplot(233)
+    m.plot()
+    plt.title('Student t GP noise converted')
+
+    m.constrain_positive('t_noise_std2')
+    m.randomize()
+    m.update_likelihood_approximation()
+    plt.subplot(234)
+    m.plot()
+    plt.title('Student t fixed rbf')
+    m.optimize()
+    print "T std2 {} var {} after optimising, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.likelihood.likelihood_function.variance, m.log_likelihood())
+    plt.subplot(235)
+    m.plot()
+    plt.title('Student t fixed rbf optimised')
+
+    plt.figure(2)
+    mrbf = m.copy()
+    mrbf.unconstrain('')
+    mrbf.constrain_fixed('t_noise', m.likelihood.likelihood_function.sigma2)
+    gp_var = mgp._get_params()[0]
+    gp_len = mgp._get_params()[1]
+    mrbf.constrain_fixed('rbf_var', gp_var)
+    mrbf.constrain_positive('rbf_len')
+    mrbf.randomize()
+    print "Before optimize"
+    print mrbf
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+    mrbf.checkgrad(verbose=1)
+    plt.subplot(121)
+    mrbf.plot()
+    plt.title('Student t fixed noise')
+    mrbf.optimize()
+    print "After optimize"
+    print mrbf
+    plt.subplot(122)
+    mrbf.plot()
+    plt.title('Student t fixed noise optimized')
+    print mrbf
+
+    plt.figure(3)
+    print "GP noise {} after optimising, LL: {}".format(gp_noise, mgp.log_likelihood())
+    plt.suptitle('Gaussian likelihood optimised')
+    mgp.plot()
+    print "Real std: {}".format(real_std)
+    print "Real variance {}".format(real_std**2)
+
+    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+    print "Len should be: {}".format(gp_len)
+    return mrbf
+
+def debug_student_t_noise_approx():
+    plot = False
+    real_var = 0.1
+    #Start a function, any function
+    #X = np.linspace(0.0, 10.0, 50)[:, None]
+    X = np.random.rand(100)[:, None]
+    #X = np.random.rand(100)[:, None]
+    #X = np.array([0.5, 1])[:, None]
+    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + 1
+    #Y = X + np.random.randn(*X.shape)*real_var
+    #ty = np.array([1., 9.97733584, 4.17841363])[:, None]
+    #Y = ty
+
+    X_full = X
+    Y_full = np.sin(X_full) + 1
+
+    Y = Y/Y.max()
+
+    #Add student t random noise to datapoints
+    deg_free = 100
+
+    real_sd = np.sqrt(real_var)
+    print "Real noise std: ", real_sd
+
+    initial_var_guess = 0.3
+    #t_rv = t(deg_free, loc=0, scale=real_var)
+    #noise = t_rvrvs(size=Y.shape)
+    #Y += noise
+
+    plt.close('all')
+    # Kernel object
+    kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+    #kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()
+    kernel5 = kernel1.copy()
+    kernel6 = kernel1.copy()
+
+    print "Clean Gaussian"
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    ## optimize
+    #m.ensure_default_constraints()
+    #m.optimize()
+    ## plot
+    #if plot:
+        #plt.figure(1)
+        #plt.suptitle('Gaussian likelihood')
+        #plt.subplot(131)
+        #m.plot()
+        #plt.plot(X_full, Y_full)
+    #print m
+
+    real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
+    edited_real_sd = real_stu_t_std**2 #initial_var_guess #real_sd
+    #edited_real_sd = real_sd
+
+    print "Clean student t, rasm"
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+
+    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    #m['rbf_len'] = 1.5
+    #m.constrain_fixed('rbf_v', 1.0898)
+    #m.constrain_fixed('rbf_l', 0.2651)
+    #m.constrain_fixed('t_noise_std2', edited_real_sd)
+    #m.constrain_positive('rbf')
+    m.constrain_positive('t_noise_std2')
+    #m.constrain_positive('')
+    #m.constrain_bounded('t_noi', 0.001, 10)
+    #m.constrain_fixed('t_noi', real_stu_t_std)
+    #m.constrain_fixed('white', 0.01)
+    #m.constrain_fixed('t_no', 0.01)
+    #m['rbf_var'] = 0.20446332
+    #m['rbf_leng'] = 0.85776241
+    #m['t_noise'] = 0.667083294421005
+    m.ensure_default_constraints()
+    m.update_likelihood_approximation()
+    #m.optimize(messages=True)
+    print(m)
+    #return m
+    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
+    if plot:
+        plt.suptitle('Student-t likelihood')
+        plt.subplot(132)
+        m.plot()
+        plt.plot(X_full, Y_full)
+        plt.ylim(-2.5, 2.5)
+    print "Real noise std: ", real_sd
+    print "or Real noise std: ", real_stu_t_std
+    return m
+
+    #print "Clean student t, ncg"
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
+    #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    #m.ensure_default_constraints()
+    #m.update_likelihood_approximation()
+    #m.optimize()
+    #print(m)
+    #if plot:
+        #plt.subplot(133)
+        #m.plot()
+        #plt.plot(X_full, Y_full)
+        #plt.ylim(-2.5, 2.5)
+
+    #plt.show()
+
+def student_t_approx():
+    """
+    Example of regressing with a student t likelihood
+    """
+    real_std = 0.1
+    #Start a function, any function
+    X = np.linspace(0.0, 10.0, 50)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_std
+    Yc = Y.copy()
+
+    X_full = np.linspace(0.0, 10.0, 500)[:, None]
+    Y_full = np.sin(X_full)
+
+    Y = Y/Y.max()
+
+    Yc[10] += 100
+    Yc[25] += 10
+    Yc[23] += 10
+    Yc[26] += 1000
+    Yc[24] += 10
+    #Yc = Yc/Yc.max()
+
+    #Add student t random noise to datapoints
+    deg_free = 8
+    print "Real noise: ", real_std
+
+    initial_var_guess = 0.1
+    #t_rv = t(deg_free, loc=0, scale=real_var)
+    #noise = t_rvrvs(size=Y.shape)
+    #Y += noise
+
+    #Add some extreme value noise to some of the datapoints
+    #percent_corrupted = 0.15
+    #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
+    #indices = np.arange(Y.shape[0])
+    #np.random.shuffle(indices)
+    #corrupted_indices = indices[:corrupted_datums]
+    #print corrupted_indices
+    #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
+    #Y[corrupted_indices] += noise
+
+    plt.figure(1)
+    plt.suptitle('Gaussian likelihood')
+    # Kernel object
+    kernel1 = GPy.kern.rbf(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()
+    kernel5 = kernel1.copy()
+    kernel6 = kernel1.copy()
+
+    print "Clean Gaussian"
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+    # plot
+    plt.subplot(211)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.title('Gaussian clean')
+    print m
+
+    #Corrupt
+    print "Corrupt Gaussian"
+    m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    m.ensure_default_constraints()
+    #m.optimize()
+    plt.subplot(212)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.title('Gaussian corrupt')
+    print m
+
+    plt.figure(2)
+    plt.suptitle('Student-t likelihood')
+    edited_real_sd = real_std #initial_var_guess
+
+    print "Clean student t, rasm"
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m.ensure_default_constraints()
+    m.constrain_positive('t_noise')
+    m.randomize()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(222)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+    plt.title('Student-t rasm clean')
+
+    print "Corrupt student t, rasm"
+    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
+    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
+    m.ensure_default_constraints()
+    m.constrain_positive('t_noise')
+    m.randomize()
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    plt.subplot(224)
+    m.plot()
+    plt.plot(X_full, Y_full)
+    plt.ylim(-2.5, 2.5)
+    plt.title('Student-t rasm corrupt')
+
+    return m
+
+    #print "Clean student t, ncg"
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
+    #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    #m.ensure_default_constraints()
+    #m.update_likelihood_approximation()
+    #m.optimize()
+    #print(m)
+    #plt.subplot(221)
+    #m.plot()
+    #plt.plot(X_full, Y_full)
+    #plt.ylim(-2.5, 2.5)
+    #plt.title('Student-t ncg clean')
+
+    #print "Corrupt student t, ncg"
+    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
+    #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    #m.ensure_default_constraints()
+    #m.update_likelihood_approximation()
+    #m.optimize()
+    #print(m)
+    #plt.subplot(223)
+    #m.plot()
+    #plt.plot(X_full, Y_full)
+    #plt.ylim(-2.5, 2.5)
+    #plt.title('Student-t ncg corrupt')
+
+
+    ###with a student t distribution, since it has heavy tails it should work well
+    ###likelihood_function = student_t(deg_free, sigma2=real_var)
+    ###lap = Laplace(Y, likelihood_function)
+    ###cov = kernel.K(X)
+    ###lap.fit_full(cov)
+
+    ###test_range = np.arange(0, 10, 0.1)
+    ###plt.plot(test_range, t_rv.pdf(test_range))
+    ###for i in xrange(X.shape[0]):
+        ###mode = lap.f_hat[i]
+        ###covariance = lap.hess_hat_i[i,i]
+        ###scaling = np.exp(lap.ln_z_hat)
+        ###normalised_approx = norm(loc=mode, scale=covariance)
+        ###print "Normal with mode %f, and variance %f" % (mode, covariance)
+        ###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    ###plt.show()
+
+    return m
+
+
+def noisy_laplace_approx():
+    """
+    Example of regressing with a student t likelihood
+    """
+    #Start a function, any function
+    X = np.sort(np.random.uniform(0, 15, 70))[:, None]
+    Y = np.sin(X)
+
+    #Add some extreme value noise to some of the datapoints
+    percent_corrupted = 0.05
+    corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
+    indices = np.arange(Y.shape[0])
+    np.random.shuffle(indices)
+    corrupted_indices = indices[:corrupted_datums]
+    print corrupted_indices
+    noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1))
+    Y[corrupted_indices] += noise
+
+    #A GP should completely break down due to the points as they get a lot of weight
+    # create simple GP model
+    m = GPy.models.GP_regression(X, Y)
+
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize()
+    # plot
+    m.plot()
+    print m
+
+    #with a student t distribution, since it has heavy tails it should work well
+
+def gaussian_f_check():
+    plt.close('all')
+    X = np.linspace(0, 1, 50)[:, None]
+    real_std = 0.2
+    noise = np.random.randn(*X.shape)*real_std
+    Y = np.sin(X*2*np.pi) + noise
+
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp.randomize()
+    mgp.optimize()
+    print "Gaussian"
+    print mgp
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+    kernelg = kernelgp.copy()
+    #kernelst += GPy.kern.bias(X.shape[1])
+    N, D = X.shape
+    g_distribution = GPy.likelihoods.likelihood_functions.gaussian(variance=0.1, N=N, D=D)
+    g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm')
+    m = GPy.models.GP(X, g_likelihood, kernelg)
+    #m['rbf_v'] = mgp._get_params()[0]
+    #m['rbf_l'] = mgp._get_params()[1] + 1
+    m.ensure_default_constraints()
+    #m.constrain_fixed('rbf_v', mgp._get_params()[0])
+    #m.constrain_fixed('rbf_l', mgp._get_params()[1])
+    #m.constrain_bounded('t_no', 2*real_std**2, 1e3)
+    #m.constrain_positive('bias')
+    m.constrain_positive('noise_var')
+    m.randomize()
+    m['noise_variance'] = 0.1
+    m.likelihood.X = X
+    plt.figure()
+    plt.subplot(211)
+    m.plot()
+    plt.subplot(212)
+    m.optimize()
+    m.plot()
+    print "final optimised gaussian"
+    print m
+    print "real GP"
+    print mgp
+    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
new file mode 100644
index 00000000..58304c23
--- /dev/null
+++ b/GPy/likelihoods/Laplace.py
@@ -0,0 +1,453 @@
+import numpy as np
+import scipy as sp
+import GPy
+from scipy.linalg import inv, cho_solve, det
+from numpy.linalg import cond
+from likelihood import likelihood
+from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
+from scipy.linalg.lapack import dtrtrs
+import random
+#import pylab as plt
+
+class Laplace(likelihood):
+    """Laplace approximation to a posterior"""
+
+    def __init__(self, data, likelihood_function, extra_data=None, opt='rasm'):
+        """
+        Laplace Approximation
+
+        First find the moments \hat{f} and the hessian at this point (using Newton-Raphson)
+        then find the z^{prime} which allows this to be a normalised gaussian instead of a
+        non-normalized gaussian
+
+        Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle}
+        which makes a gaussian the same as the laplace approximation
+
+        Arguments
+        ---------
+
+        :data: array of data the likelihood function is approximating
+        :likelihood_function: likelihood function - subclass of likelihood_function
+        :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
+        :opt: Optimiser to use, rasm numerically stable, ncg or nelder-mead (latter only work with 1d data)
+
+        """
+        self.data = data
+        self.likelihood_function = likelihood_function
+        self.extra_data = extra_data
+        self.opt = opt
+
+        #Inital values
+        self.N, self.D = self.data.shape
+        self.is_heteroscedastic = True
+        self.Nparams = 0
+
+        self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
+
+        #Initial values for the GP variables
+        self.Y = np.zeros((self.N, 1))
+        self.covariance_matrix = np.eye(self.N)
+        self.precision = np.ones(self.N)[:, None]
+        self.Z = 0
+        self.YYT = None
+
+        self.old_a = None
+
+    def predictive_values(self, mu, var, full_cov):
+        if full_cov:
+            raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
+        return self.likelihood_function.predictive_values(mu, var)
+
+    def _get_params(self):
+        return np.asarray(self.likelihood_function._get_params())
+
+    def _get_param_names(self):
+        return self.likelihood_function._get_param_names()
+
+    def _set_params(self, p):
+        return self.likelihood_function._set_params(p)
+
+    def _shared_gradients_components(self):
+        #FIXME: Careful of side effects! And make sure W and K are up to date!
+        d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
+        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T
+        I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
+        return dL_dfhat, I_KW_i
+
+    def _Kgradients(self, dK_dthetaK, X):
+        """
+        Gradients with respect to prior kernel parameters
+        """
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
+        dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
+
+        #Implicit
+        impl = mdot(dlp, dL_dfhat, I_KW_i)
+        expl_a = mdot(self.Ki_f, self.Ki_f.T)
+        expl_b = self.Wi_K_i
+        #print "expl_a: {}, expl_b: {}".format(expl_a, expl_b)
+        expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
+        dL_dthetaK_exp = dK_dthetaK(expl, X)
+        dL_dthetaK_imp = dK_dthetaK(impl, X)
+        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
+        #print "expl_a: {}, {}     expl_b: {}, {}".format(np.mean(expl_a), np.std(expl_a), np.mean(expl_b), np.std(expl_b))
+        dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
+        return dL_dthetaK
+
+    def _gradients(self, partial):
+        """
+        Gradients with respect to likelihood parameters
+        """
+        dL_dfhat, I_KW_i = self._shared_gradients_components()
+        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
+
+        num_params = len(dlik_dthetaL)
+        dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter
+        for thetaL_i in range(num_params):
+            #Explicit
+            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            #a = 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            #d = dlik_hess_dthetaL[thetaL_i]
+            #e = pdinv(pdinv(self.K)[0] + np.diagflat(self.W))[0]
+            #b = 0.5*np.dot(np.diag(e).T, d)
+            #g = 0.5*(np.diag(self.K) - np.sum(cho_solve((self.B_chol, True), np.dot(np.diagflat(self.W_12),self.K))**2, 1))
+            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - np.dot(g.T, dlik_hess_dthetaL[thetaL_i])
+            dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+
+            #Implicit
+            df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
+            dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL)
+            #print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
+            dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
+
+        return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
+
+    def _compute_GP_variables(self):
+        """
+        Generates data Y which would give the normal distribution identical to the laplace approximation
+
+        GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
+        that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
+
+        Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal)
+        then we have a rescaled normal distibution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f)
+        due to the z rescaling.
+
+        at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1)
+        This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1)
+        giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f)
+
+        $$\tilde{Y} = \tilde{\Sigma} Hf$$
+        where
+        $$\tilde{\Sigma}^{-1} = H - K^{-1}$$
+        i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$
+        since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$
+        and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
+        $$\tilde{\Sigma} = W^{-1}$$
+
+        """
+        #Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
+        #dtritri -> L -> L_i
+        #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
+        #((L.T*w)_i + I)f_hat = y_tilde
+        #L = jitchol(self.K)
+        #Li = chol_inv(L)
+        #Lt_W = L.T*self.W.T
+
+        #Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
+        #self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
+        #Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
+
+        Wi = 1.0/self.W
+        self.Sigma_tilde = np.diagflat(Wi)
+
+        Y_tilde = Wi*self.Ki_f + self.f_hat
+
+        self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
+        #self.Wi_K_i[self.Wi_K_i< 1e-6] = 1e-6
+
+        self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
+        self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+
+        self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+        self.aA = 0.5*self.ln_det_K_Wi__Bi
+        self.bB = - 0.5*self.f_Ki_f
+        self.cC = 0.5*self.y_Wi_Ki_i_y
+        Z_tilde = (+ self.lik
+                   + 0.5*self.ln_det_K_Wi__Bi
+                   - 0.5*self.f_Ki_f
+                   + 0.5*self.y_Wi_Ki_i_y
+                  )
+        print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC)
+        print self.likelihood_function._get_params()
+
+        #Convert to float as its (1, 1) and Z must be a scalar
+        self.Z = np.float64(Z_tilde)
+        self.Y = Y_tilde
+        self.YYT = np.dot(self.Y, self.Y.T)
+        self.covariance_matrix = self.Sigma_tilde
+        self.precision = 1.0 / np.diag(self.covariance_matrix)[:, None]
+
+    def fit_full(self, K):
+        """
+        The laplace approximation algorithm, find K and expand hessian
+        For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
+        :K: Covariance matrix
+        """
+        self.K = K.copy()
+
+        #Find mode
+        self.f_hat = {
+            'rasm': self.rasm_mode,
+            'ncg': self.ncg_mode,
+            'nelder': self.nelder_mode
+        }[self.opt](self.K)
+
+        #Compute hessian and other variables at mode
+        self._compute_likelihood_variables()
+
+    def _compute_likelihood_variables(self):
+        #At this point get the hessian matrix (or vector as W is diagonal)
+        self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
+
+        if not self.likelihood_function.log_concave:
+            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                       #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                       #To cause the posterior to become less certain than the prior and likelihood,
+                                       #This is a property only held by non-log-concave likelihoods
+
+        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
+        self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
+        self.Bi, _, _, B_det = pdinv(self.B)
+
+        #Do the computation again at f to get Ki_f which is useful
+        #b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
+        #solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
+        #a = b - self.W_12*solve_chol
+        self.Ki_f = self.a
+
+        self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
+        self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
+
+        #For det, |I + KW| == |I + W_12*K*W_12|
+        self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
+
+        #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
+        #self.ln_z_hat = (- 0.5*self.f_Ki_f
+                         #- self.ln_I_KW_det
+                         #+ self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+                         #)
+
+        return self._compute_GP_variables()
+
+    def _compute_B_statistics(self, K, W):
+        """Rasmussen suggests the use of a numerically stable positive definite matrix B
+        Which has a positive diagonal element and can be easyily inverted
+
+        :K: Covariance matrix
+        :W: Negative hessian at a point (diagonal matrix)
+        :returns: (B, L)
+        """
+        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
+        W_12 = np.sqrt(W)
+        B = np.eye(self.N) + W_12*K*W_12.T
+        L = jitchol(B)
+        return (B, L, W_12)
+
+    def nelder_mode(self, K):
+        f = np.zeros((self.N, 1))
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
+        def obj(f):
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f)))
+            return float(res)
+
+        res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True})
+        f_new = res.x
+        return f_new[:, None]
+
+    def ncg_mode(self, K):
+        """
+        Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
+        :K: Covariance matrix
+        :returns: f_mode
+        """
+        self.Ki, _, _, self.ln_K_det = pdinv(K)
+
+        f = np.zeros((self.N, 1))
+
+        #FIXME: Can we get rid of this horrible reshaping?
+        #ONLY WORKS FOR 1D DATA
+        def obj(f):
+            res = -1 * (self.likelihood_function.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
+                        - self.NORMAL_CONST)
+            return float(res)
+
+        def obj_grad(f):
+            res = -1 * (self.likelihood_function.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
+            return np.squeeze(res)
+
+        def obj_hess(f):
+            res = -1 * (np.diag(self.likelihood_function.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
+            return np.squeeze(res)
+
+        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
+        return f_hat[:, None]
+
+    def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
+        """
+        Rasmussens numerically stable mode finding
+        For nomenclature see Rasmussen & Williams 2006
+
+        :K: Covariance matrix
+        :MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
+        :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
+        :returns: f_mode
+        """
+        self.old_before_s = self.likelihood_function._get_params()
+        print "before: ", self.old_before_s
+        #if self.old_before_s < 1e-5:
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+        #old_a = np.zeros((self.N, 1))
+        if self.old_a is None:
+            old_a = np.zeros((self.N, 1))
+            f = np.dot(K, old_a)
+        else:
+            old_a = self.old_a.copy()
+            f = self.f_hat.copy()
+
+        new_obj = -np.inf
+        old_obj = np.inf
+
+        def obj(a, f):
+            return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
+
+        difference = np.inf
+        epsilon = 1e-4
+        step_size = 1
+        rs = 0
+        i = 0
+
+        while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
+            W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
+            #W = np.maximum(W, 0)
+            if not self.likelihood_function.log_concave:
+                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                                    # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
+                                    # To cause the posterior to become less certain than the prior and likelihood,
+                                    # This is a property only held by non-log-concave likelihoods
+            B, L, W_12 = self._compute_B_statistics(K, W.copy())
+
+            W_f = W*f
+            grad = self.likelihood_function.dlik_df(self.data, f, extra_data=self.extra_data)
+
+            b = W_f + grad
+            solve_L = cho_solve((L, True), W_12*np.dot(K, b))
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            full_step_a = b - W_12*solve_L
+            da = full_step_a - old_a
+
+            #f_old = f.copy()
+            #def inner_obj(step_size, old_a, da, K):
+                #a = old_a + step_size*da
+                #f = np.dot(K, a)
+                #self.a = a.copy() # This is nasty, need to set something within an optimization though
+                #self.f = f.copy()
+                #return -obj(a, f)
+
+            #from functools import partial
+            #i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
+            ##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
+            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
+            #f = self.f.copy()
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+            f_old = f.copy()
+            update_passed = False
+            while not update_passed:
+                a = old_a + step_size*da
+                f = np.dot(K, a)
+
+                old_obj = new_obj
+                new_obj = obj(a, f)
+                difference = new_obj - old_obj
+                print "difference: ",difference
+                if difference < 0:
+                    #print "Objective function rose", np.float(difference)
+                    #If the objective function isn't rising, restart optimization
+                    step_size *= 0.8
+                    #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    #objective function isn't increasing, try reducing step size
+                    f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
+                    old_obj = new_obj
+                    rs += 1
+                else:
+                    update_passed = True
+
+            #difference = abs(new_obj - old_obj)
+            #old_obj = new_obj.copy()
+            #difference = np.abs(np.sum(f - f_old))
+            difference = np.abs(np.sum(a - old_a))
+            #old_a = self.a.copy() #a
+            old_a = a.copy()
+            i += 1
+            #print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
+
+        self.old_a = old_a.copy()
+        #print "Positive difference obj: ", np.float(difference)
+        #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
+        print "Iterations: {}, Final_difference: {}".format(i, difference)
+        if difference > 1e-4:
+            print "FAIL FAIL FAIL FAIL FAIL FAIL"
+            if False:
+                import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+                if hasattr(self, 'X'):
+                    import pylab as pb
+                    pb.figure()
+                    pb.subplot(311)
+                    pb.title('old f_hat')
+                    pb.plot(self.X, self.f_hat)
+                    pb.subplot(312)
+                    pb.title('old ff')
+                    pb.plot(self.X, self.old_ff)
+                    pb.subplot(313)
+                    pb.title('new f_hat')
+                    pb.plot(self.X, f)
+
+                    pb.figure()
+                    pb.subplot(121)
+                    pb.title('old K')
+                    pb.imshow(np.diagflat(self.old_K), interpolation='none')
+                    pb.colorbar()
+                    pb.subplot(122)
+                    pb.title('new K')
+                    pb.imshow(np.diagflat(K), interpolation='none')
+                    pb.colorbar()
+
+                    pb.figure()
+                    pb.subplot(121)
+                    pb.title('old W')
+                    pb.imshow(np.diagflat(self.old_W), interpolation='none')
+                    pb.colorbar()
+                    pb.subplot(122)
+                    pb.title('new W')
+                    pb.imshow(np.diagflat(W), interpolation='none')
+                    pb.colorbar()
+
+                    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+                    pb.close('all')
+
+        #FIXME: DELETE THESE
+        self.old_W = W.copy()
+        self.old_grad = grad.copy()
+        self.old_B = B.copy()
+        self.old_W_12 = W_12.copy()
+        self.old_ff = f.copy()
+        self.old_K = self.K.copy()
+        self.old_s = self.likelihood_function._get_params()
+        print "after: ", self.old_s
+        #print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
+        self.a = a
+        #self.B, self.B_chol, self.W_12 = B, L, W_12
+        #self.Bi, _, _, B_det = pdinv(self.B)
+        return f
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
new file mode 100644
index 00000000..77620488
--- /dev/null
+++ b/GPy/models/GP.py
@@ -0,0 +1,319 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from scipy import linalg
+import pylab as pb
+from .. import kern
+from ..core import model
+from ..util.linalg import pdinv, mdot, tdot
+from ..util.plot import gpplot, x_frame1D, x_frame2D, Tango
+from ..likelihoods import EP, Laplace
+
+class GP(model):
+    """
+    Gaussian Process model for regression and EP
+
+    :param X: input observations
+    :param kernel: a GPy kernel, defaults to rbf+white
+    :parm likelihood: a GPy likelihood
+    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
+    :type normalize_X: False|True
+    :rtype: model object
+    :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
+    :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
+    :type powerep: list
+
+    .. Note:: Multiple independent outputs are allowed using columns of Y
+
+    """
+    def __init__(self, X, likelihood, kernel, normalize_X=False):
+        self.has_uncertain_inputs=False
+
+        # parse arguments
+        self.X = X
+        assert len(self.X.shape) == 2
+        self.N, self.Q = self.X.shape
+        assert isinstance(kernel, kern.kern)
+        self.kern = kernel
+        self.likelihood = likelihood
+        assert self.X.shape[0] == self.likelihood.data.shape[0]
+        self.N, self.D = self.likelihood.data.shape
+
+        # here's some simple normalization for the inputs
+        if normalize_X:
+            self._Xmean = X.mean(0)[None, :]
+            self._Xstd = X.std(0)[None, :]
+            self.X = (X.copy() - self._Xmean) / self._Xstd
+            if hasattr(self, 'Z'):
+                self.Z = (self.Z - self._Xmean) / self._Xstd
+        else:
+            self._Xmean = np.zeros((1, self.X.shape[1]))
+            self._Xstd = np.ones((1, self.X.shape[1]))
+
+        if not hasattr(self,'has_uncertain_inputs'):
+            self.has_uncertain_inputs = False
+        model.__init__(self)
+
+    def dL_dZ(self):
+        """
+        TODO: one day we might like to learn Z by gradient methods?
+        """
+        #FIXME: this doesn;t live here.
+        return np.zeros_like(self.Z)
+
+    def _set_params(self, p):
+        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
+        # self.likelihood._set_params(p[self.kern.Nparam:])               # test by Nicolas
+        self.likelihood._set_params(p[self.kern.Nparam_transformed():])  # test by Nicolas
+
+        if isinstance(self.likelihood, Laplace):
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
+
+        self.K = self.kern.K(self.X)
+        self.K += self.likelihood.covariance_matrix
+
+        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
+
+        # the gradient of the likelihood wrt the covariance matrix
+        if self.likelihood.YYT is None:
+            #alpha = np.dot(self.Ki, self.likelihood.Y)
+            alpha,_ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y,lower=1)
+
+            self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki)
+        else:
+            #tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
+            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
+            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
+            self.dL_dK = 0.5 * (tmp - self.D * self.Ki)
+
+    def _get_params(self):
+        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
+
+    def _get_param_names(self):
+        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
+
+    def _update_params_callback(self, p):
+        #parameters will be in transformed space
+        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
+        #set_params_transformed for likelihood doesn't exist?
+        self.likelihood._set_params(p[self.kern.Nparam_transformed():])
+        #update the likelihood approximation within the optimisation with the current parameters
+        self.update_likelihood_approximation()
+
+    def update_likelihood_approximation(self):
+        """
+        Approximates a non-gaussian likelihood using Expectation Propagation
+
+        For a Gaussian likelihood, no iteration is required:
+        this function does nothing
+        """
+        self.likelihood.fit_full(self.kern.K(self.X))
+        self._set_params(self._get_params())  # update the GP
+
+    def _model_fit_term(self):
+        """
+        Computes the model fit using YYT if it's available
+        """
+        if self.likelihood.YYT is None:
+            tmp, _ = linalg.lapack.flapack.dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
+            return -0.5 * np.sum(np.square(tmp))
+            #return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
+        else:
+            return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
+
+    def log_likelihood(self):
+        """
+        The log marginal likelihood of the GP.
+
+        For an EP model,  can be written as the log likelihood of a regression
+        model for a new variable Y* = v_tilde/tau_tilde, with a covariance
+        matrix K* = K + diag(1./tau_tilde) plus a normalization term.
+        """
+        #if isinstance(self.likelihood, Laplace):
+            #self.likelihood.fit_full(self.kern.K(self.X))
+            #self.likelihood._set_params(self.likelihood._get_params())
+        l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        print "K_ldet: {} mft: {} Z: {}".format(self.K_logdet, self._model_fit_term(), self.likelihood.Z)
+        return l
+
+    def _log_likelihood_gradients(self):
+        """
+        The gradient of all parameters.
+
+        Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
+        """
+        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
+        print "dL_dthetaK should be: ", dL_dthetaK
+        if isinstance(self.likelihood, Laplace):
+            #self.likelihood.fit_full(self.kern.K(self.X))
+            #self.likelihood._set_params(self.likelihood._get_params())
+            dK_dthetaK = self.kern.dK_dtheta
+            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy())
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        else:
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
+        #print "dL_dthetaK: {}   dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL)
+
+        return np.hstack((dL_dthetaK, dL_dthetaL))
+        #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+
+    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False,stop=False):
+        """
+        Internal helper function for making predictions, does not account
+        for normalization or likelihood
+        """
+        Kx = self.kern.K(_Xnew,self.X,which_parts=which_parts).T
+        #KiKx = np.dot(self.Ki, Kx)
+        KiKx, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(Kx), lower=1)
+        mu = np.dot(KiKx.T, self.likelihood.Y)
+        if full_cov:
+            Kxx = self.kern.K(_Xnew, which_parts=which_parts)
+            var = Kxx - np.dot(KiKx.T, Kx)
+        else:
+            Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
+            var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
+            var = var[:, None]
+        if stop:
+            debug_this
+        return mu, var
+
+
+    def predict(self, Xnew, which_parts='all', full_cov=False):
+        """
+        Predict the function(s) at the new point(s) Xnew.
+
+        Arguments
+        ---------
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.Q
+        :param which_parts:  specifies which outputs kernel(s) to use in prediction
+        :type which_parts: ('all', list of bools)
+        :param full_cov: whether to return the folll covariance matrix, or just the diagonal
+        :type full_cov: bool
+        :rtype: posterior mean,  a Numpy array, Nnew x self.D
+        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
+        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.D
+
+
+           If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
+           This is to allow for different normalizations of the output dimensions.
+
+        """
+        # normalize X values
+        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
+        mu, var = self._raw_predict(Xnew, which_parts, full_cov)
+
+        # now push through likelihood
+        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
+
+        return mean, var, _025pm, _975pm
+
+
+    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False):
+        """
+        Plot the GP's view of the world, where the data is normalized and the
+        likelihood is Gaussian.
+
+        :param samples: the number of a posteriori samples to plot
+        :param which_data: which if the training data to plot (default all)
+        :type which_data: 'all' or a slice object to slice self.X, self.Y
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
+        :param which_parts: which of the kernel functions to plot (additively)
+        :type which_parts: 'all', or list of bools
+        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+
+        Plot the posterior of the GP.
+          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
+          - In two dimsensions, a contour-plot shows the mean predicted function
+          - In higher dimensions, we've no implemented this yet !TODO!
+
+        Can plot only part of the data and part of the posterior functions
+        using which_data and which_functions
+        """
+        if which_data == 'all':
+            which_data = slice(None)
+
+        if self.X.shape[1] == 1:
+            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
+            if samples == 0:
+                m, v = self._raw_predict(Xnew, which_parts=which_parts)
+                gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v))
+                pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
+            else:
+                m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
+                Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
+                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None])
+                for i in range(samples):
+                    pb.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
+            pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
+            pb.xlim(xmin, xmax)
+            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
+            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+            pb.ylim(ymin, ymax)
+            if hasattr(self, 'Z'):
+                pb.plot(self.Z, self.Z * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
+
+        elif self.X.shape[1] == 2:
+            resolution = resolution or 50
+            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
+            m, v = self._raw_predict(Xnew, which_parts=which_parts)
+            m = m.reshape(resolution, resolution).T
+            pb.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
+            pb.scatter(Xorig[:, 0], Xorig[:, 1], 40, Yorig, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max())
+            pb.xlim(xmin[0], xmax[0])
+            pb.ylim(xmin[1], xmax[1])
+        else:
+            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+
+    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20):
+        """
+        TODO: Docstrings!
+        :param levels: for 2D plotting, the number of contour levels to use
+
+        """
+        # TODO include samples
+        if which_data == 'all':
+            which_data = slice(None)
+
+        if self.X.shape[1] == 1:
+
+            Xu = self.X * self._Xstd + self._Xmean  # NOTE self.X are the normalized values now
+
+            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
+            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
+            gpplot(Xnew, m, lower, upper)
+            pb.plot(Xu[which_data], self.likelihood.data[which_data], 'kx', mew=1.5)
+            if self.has_uncertain_inputs:
+                pb.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
+                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
+                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
+
+            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
+            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+            pb.xlim(xmin, xmax)
+            pb.ylim(ymin, ymax)
+            if hasattr(self, 'Z'):
+                Zu = self.Z * self._Xstd + self._Xmean
+                pb.plot(Zu, Zu * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
+                    # pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_variance.flatten()))
+
+        elif self.X.shape[1] == 2:  # FIXME
+            resolution = resolution or 50
+            Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
+            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
+            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
+            m = m.reshape(resolution, resolution).T
+            pb.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
+            Yf = self.likelihood.Y.flatten()
+            pb.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
+            pb.xlim(xmin[0], xmax[0])
+            pb.ylim(xmin[1], xmax[1])
+            if hasattr(self, 'Z'):
+                pb.plot(self.Z[:, 0], self.Z[:, 1], 'wo')
+
+        else:
+            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

From b9a7a407954ff3b92039761936c073c439a93a69 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 9 Sep 2013 17:34:08 +0100
Subject: [PATCH 074/252] Dragged likelihood_function changes in

---
 GPy/likelihoods/likelihood_functions.py | 384 +++++++++++++++++++++++-
 1 file changed, 383 insertions(+), 1 deletion(-)

diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 7b9b8982..5d270b2b 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -3,12 +3,13 @@
 
 
 import numpy as np
-from scipy import stats
+from scipy import stats, integrate
 import scipy as sp
 import pylab as pb
 from ..util.plot import gpplot
 from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import link_functions
+from scipy.special import gammaln, gamma
 
 class LikelihoodFunction(object):
     """
@@ -24,6 +25,7 @@ class LikelihoodFunction(object):
             assert isinstance(link,link_functions.LinkFunction)
             self.link = link
             self.moments_match = self._moments_match_numerical
+        self.log_concave = True
 
     def _preprocess_values(self,Y):
         return Y
@@ -164,3 +166,383 @@ class Poisson(LikelihoodFunction):
         p_025 = tmp[:,0]
         p_975 = tmp[:,1]
         return mean,np.nan*mean,p_025,p_975 # better variance here TODO
+
+class Student_t(LikelihoodFunction):
+    """Student t likelihood distribution
+    For nomanclature see Bayesian Data Analysis 2003 p576
+
+    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2)$$
+
+    Laplace:
+    Needs functions to calculate
+    ln p(yi|fi)
+    dln p(yi|fi)_dfi
+    d2ln p(yi|fi)_d2fifj
+    """
+    def __init__(self, deg_free=5, sigma2=2, link=None):
+        super(Student_t, self).__init__(link)
+        self.v = deg_free
+        self.sigma2 = sigma2
+
+        self._set_params(np.asarray(sigma2))
+        self.log_concave = False
+
+    def _get_params(self):
+        return np.asarray(self.sigma2)
+
+    def _get_param_names(self):
+        return ["t_noise_std2"]
+
+    def _set_params(self, x):
+        self.sigma2 = float(x)
+
+    @property
+    def variance(self, extra_data=None):
+        return (self.v / float(self.v - 2)) * self.sigma2
+
+    def link_function(self, y, f, extra_data=None):
+        """link_function $\ln p(y|f)$
+        $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
+
+        For wolfram alpha import parts for derivative of sigma are -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        assert y.shape == f.shape
+        e = y - f
+        #A = gammaln((self.v + 1) * 0.5)
+        #B = - gammaln(self.v * 0.5)
+        #C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
+        #D = + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
+        objective = (+ gammaln((self.v + 1) * 0.5)
+                     - gammaln(self.v * 0.5)
+                     - 0.5*np.log(self.sigma2 * self.v * np.pi)
+                     + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
+                    )
+        #print "C: {} D: {} obj: {}".format(C, np.sum(D), objective.sum())
+        return np.sum(objective)
+
+    def dlik_df(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        assert y.shape == f.shape
+        e = y - f
+        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
+        return grad
+
+    def d2lik_d2f(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+
+        Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+        (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+
+        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        assert y.shape == f.shape
+        e = y - f
+        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
+        return hess
+
+    def d3lik_d3f(self, y, f, extra_data=None):
+        """
+        Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+
+        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+                       ((e**2 + self.sigma2*self.v)**3)
+                    )
+        return d3lik_d3f
+
+    def lik_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
+
+        Terms relavent to derivatives wrt sigma are:
+        -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+
+        $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        dlik_dsigma = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
+        return dlik_dsigma
+
+    def dlik_df_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
+
+        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        dlik_grad_dsigma = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
+        return dlik_grad_dsigma
+
+    def d2lik_d2f_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
+
+        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
+        """
+        assert y.shape == f.shape
+        e = y - f
+        dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
+                              / ((self.sigma2*self.v + (e**2))**3)
+                           )
+        return dlik_hess_dsigma
+
+    def _gradients(self, y, f, extra_data=None):
+        #must be listed in same order as 'get_param_names'
+        derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
+                  [self.dlik_df_dstd(y, f, extra_data=extra_data)],
+                  [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+                 ) # lists as we might learn many parameters
+        # ensure we have gradients for every parameter we want to optimize
+        assert len(derivs[0]) == len(self._get_param_names())
+        assert len(derivs[1]) == len(self._get_param_names())
+        assert len(derivs[2]) == len(self._get_param_names())
+        return derivs
+
+    def predictive_values(self, mu, var):
+        """
+        Compute  mean, and conficence interval (percentiles 5 and 95) of the prediction
+
+        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
+        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
+        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
+        """
+
+        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
+        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
+        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
+        #Which was also given to us as (var)
+        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
+        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
+        true_var = var + self.variance
+
+        #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now
+        #need the 95 and 5 percentiles.
+        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
+        p_025 = mu - 2.*np.sqrt(true_var)
+        p_975 = mu + 2.*np.sqrt(true_var)
+
+        return mu, np.nan*mu, p_025, p_975
+
+    def sample_predicted_values(self, mu, var):
+        """ Experimental sample approches and numerical integration """
+        #p_025 = stats.t.ppf(.025, mu)
+        #p_975 = stats.t.ppf(.975, mu)
+
+        num_test_points = mu.shape[0]
+        #Each mu is the latent point f* at the test point x*,
+        #and the var is the gaussian variance at this point
+        #Take lots of samples from this, so we have lots of possible values
+        #for latent point f* for each test point x* weighted by how likely we were to pick it
+        print "Taking %d samples of f*".format(num_test_points)
+        num_f_samples = 10
+        num_y_samples = 10
+        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
+        print "Student t means shape: ", student_t_means.shape
+
+        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
+        #from a student t centred on this point, sample many points from this distribution
+        #centred on f*
+        #for test_point, f in enumerate(student_t_means):
+            #print test_point
+            #print f.shape
+            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
+                                            #scale=self.sigma,
+                                            #size=(num_f_samples, num_y_samples))
+            #print student_t_samples.shape
+
+        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
+                                        scale=self.sigma,
+                                        size=(num_test_points, num_y_samples, num_f_samples))
+        student_t_samples = np.reshape(student_t_samples,
+                                       (num_test_points, num_y_samples*num_f_samples))
+
+        #Now take the 97.5 and 0.25 percentile of these points
+        p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None]
+        p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None]
+
+        ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
+        def t_gaussian(f, mu, var):
+            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
+                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
+                    )
+
+        def t_gauss_int(mu, var):
+            print "Mu: ", mu
+            print "var: ", var
+            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
+            print "Result: ", result
+            return result[0]
+
+        vec_t_gauss_int = np.vectorize(t_gauss_int)
+
+        p = vec_t_gauss_int(mu, var)
+        p_025 = mu - p
+        p_975 = mu + p
+        return mu, np.nan*mu, p_025, p_975
+
+class Gaussian(LikelihoodFunction):
+    """
+    Gaussian likelihood - this is a test class for approximation schemes
+    """
+    def __init__(self, variance, D, N, link=None):
+        super(Gaussian, self).__init__(link)
+        self.D = D
+        self.N = N
+        self._variance = float(variance)
+        self._set_params(np.asarray(variance))
+
+        #Don't support normalizing yet
+        self._bias = np.zeros((1, self.D))
+        self._scale = np.ones((1, self.D))
+
+    def _get_params(self):
+        return np.asarray(self._variance)
+
+    def _get_param_names(self):
+        return ["noise_variance"]
+
+    def _set_params(self, x):
+        self._variance = float(x)
+        self.I = np.eye(self.N)
+        self.covariance_matrix = self.I * self._variance
+        self.Ki = self.I*(1.0 / self._variance)
+        self.ln_K = np.trace(self.covariance_matrix)
+
+    def link_function(self, y, f, extra_data=None):
+        """link_function $\ln p(y|f)$
+        $$\ln p(y_{i}|f_{i}) = \ln $$
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: float(likelihood evaluated for this point)
+
+        """
+        assert y.shape == f.shape
+        e = y - f
+        eeT = np.dot(e, e.T)
+        objective = (- 0.5*self.D*np.log(2*np.pi)
+                     - 0.5*self.ln_K
+                     #- 0.5*np.sum(np.multiply(self.Ki, eeT))
+                     - 0.5*np.dot(np.dot(e.T, self.Ki), e)
+                     )
+        return np.sum(objective)
+
+    def dlik_df(self, y, f, extra_data=None):
+        """
+        Gradient of the link function at y, given f w.r.t f
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: gradient of likelihood evaluated at points
+
+        """
+        assert y.shape == f.shape
+        s2_i = (1.0/self._variance)*self.I
+        grad = np.dot(s2_i, y) - 0.5*np.dot(s2_i, f)
+        return grad
+
+    def d2lik_d2f(self, y, f, extra_data=None):
+        """
+        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
+        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+
+        Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+        (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+
+        :y: data
+        :f: latent variables f
+        :extra_data: extra_data which is not used in student t distribution
+        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        """
+        assert y.shape == f.shape
+        s2_i = (1.0/self._variance)*self.I
+        hess = 0.5*np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
+        return hess
+
+    def d3lik_d3f(self, y, f, extra_data=None):
+        """
+        Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+
+        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
+        """
+        assert y.shape == f.shape
+        d3lik_d3f = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
+        return d3lik_d3f
+
+    def lik_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
+        """
+        assert y.shape == f.shape
+        e = y - f
+        s_4 = 1.0/(self._variance**2)
+        dlik_dsigma = -0.5*self.N*1/self._variance + 0.5*s_4*np.trace(np.dot(e.T, np.dot(self.I, e)))
+        return dlik_dsigma
+
+    def dlik_df_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
+        """
+        assert y.shape == f.shape
+        s_4 = 1.0/(self._variance**2)
+        dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f))
+        return dlik_grad_dsigma
+
+    def d2lik_d2f_dstd(self, y, f, extra_data=None):
+        """
+        Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
+
+        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
+        """
+        assert y.shape == f.shape
+        dlik_hess_dsigma = 0.5*np.diag((1.0/(self._variance**2))*self.I)[:, None]
+        return dlik_hess_dsigma
+
+    def _gradients(self, y, f, extra_data=None):
+        #must be listed in same order as 'get_param_names'
+        derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
+                  [self.dlik_df_dstd(y, f, extra_data=extra_data)],
+                  [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+                 ) # lists as we might learn many parameters
+        # ensure we have gradients for every parameter we want to optimize
+        assert len(derivs[0]) == len(self._get_param_names())
+        assert len(derivs[1]) == len(self._get_param_names())
+        assert len(derivs[2]) == len(self._get_param_names())
+        return derivs
+
+    def predictive_values(self, mu, var):
+        mean = mu * self._scale + self._bias
+        true_var = (var + self._variance) * self._scale ** 2
+        _5pc = mean - 2.*np.sqrt(true_var)
+        _95pc = mean + 2.*np.sqrt(true_var)
+        return mean, true_var, _5pc, _95pc

From c46a1aaa40d45512468ca7c3c004656ad2f94afb Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 9 Sep 2013 17:39:40 +0100
Subject: [PATCH 075/252] Merged GP models

---
 GPy/core/gp.py   |  20 ++-
 GPy/models/GP.py | 319 -----------------------------------------------
 2 files changed, 18 insertions(+), 321 deletions(-)
 delete mode 100644 GPy/models/GP.py

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 278ddc74..e1426f03 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -6,7 +6,7 @@ import numpy as np
 import pylab as pb
 from .. import kern
 from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs
-from ..likelihoods import EP
+from ..likelihoods import EP, Laplace
 from gp_base import GPBase
 
 class GP(GPBase):
@@ -41,6 +41,11 @@ class GP(GPBase):
         self.kern._set_params_transformed(p[:self.kern.num_params_transformed()])
         self.likelihood._set_params(p[self.kern.num_params_transformed():])
 
+        #TODO: Need to get rid of this check and think of a nicer OO way
+        if isinstance(self.likelihood, Laplace):
+            self.likelihood.fit_full(self.kern.K(self.X))
+            self.likelihood._set_params(self.likelihood._get_params())
+
         self.K = self.kern.K(self.X)
         self.K += self.likelihood.covariance_matrix
 
@@ -105,7 +110,18 @@ class GP(GPBase):
 
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
-        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
+        #Think of OO way of doing this also
+        if isinstance(self.likelihood, Laplace):
+            #self.likelihood.fit_full(self.kern.K(self.X))
+            #self.likelihood._set_params(self.likelihood._get_params())
+            dK_dthetaK = self.kern.dK_dtheta
+            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy())
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+        else:
+            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
+
+        return np.hstack((dL_dthetaK, dL_dthetaL))
 
     def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
         """
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
deleted file mode 100644
index 77620488..00000000
--- a/GPy/models/GP.py
+++ /dev/null
@@ -1,319 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import linalg
-import pylab as pb
-from .. import kern
-from ..core import model
-from ..util.linalg import pdinv, mdot, tdot
-from ..util.plot import gpplot, x_frame1D, x_frame2D, Tango
-from ..likelihoods import EP, Laplace
-
-class GP(model):
-    """
-    Gaussian Process model for regression and EP
-
-    :param X: input observations
-    :param kernel: a GPy kernel, defaults to rbf+white
-    :parm likelihood: a GPy likelihood
-    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_X: False|True
-    :rtype: model object
-    :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
-    :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
-    :type powerep: list
-
-    .. Note:: Multiple independent outputs are allowed using columns of Y
-
-    """
-    def __init__(self, X, likelihood, kernel, normalize_X=False):
-        self.has_uncertain_inputs=False
-
-        # parse arguments
-        self.X = X
-        assert len(self.X.shape) == 2
-        self.N, self.Q = self.X.shape
-        assert isinstance(kernel, kern.kern)
-        self.kern = kernel
-        self.likelihood = likelihood
-        assert self.X.shape[0] == self.likelihood.data.shape[0]
-        self.N, self.D = self.likelihood.data.shape
-
-        # here's some simple normalization for the inputs
-        if normalize_X:
-            self._Xmean = X.mean(0)[None, :]
-            self._Xstd = X.std(0)[None, :]
-            self.X = (X.copy() - self._Xmean) / self._Xstd
-            if hasattr(self, 'Z'):
-                self.Z = (self.Z - self._Xmean) / self._Xstd
-        else:
-            self._Xmean = np.zeros((1, self.X.shape[1]))
-            self._Xstd = np.ones((1, self.X.shape[1]))
-
-        if not hasattr(self,'has_uncertain_inputs'):
-            self.has_uncertain_inputs = False
-        model.__init__(self)
-
-    def dL_dZ(self):
-        """
-        TODO: one day we might like to learn Z by gradient methods?
-        """
-        #FIXME: this doesn;t live here.
-        return np.zeros_like(self.Z)
-
-    def _set_params(self, p):
-        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
-        # self.likelihood._set_params(p[self.kern.Nparam:])               # test by Nicolas
-        self.likelihood._set_params(p[self.kern.Nparam_transformed():])  # test by Nicolas
-
-        if isinstance(self.likelihood, Laplace):
-            self.likelihood.fit_full(self.kern.K(self.X))
-            self.likelihood._set_params(self.likelihood._get_params())
-
-        self.K = self.kern.K(self.X)
-        self.K += self.likelihood.covariance_matrix
-
-        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
-
-        # the gradient of the likelihood wrt the covariance matrix
-        if self.likelihood.YYT is None:
-            #alpha = np.dot(self.Ki, self.likelihood.Y)
-            alpha,_ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y,lower=1)
-
-            self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki)
-        else:
-            #tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
-            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
-            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
-            self.dL_dK = 0.5 * (tmp - self.D * self.Ki)
-
-    def _get_params(self):
-        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
-
-    def _get_param_names(self):
-        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
-
-    def _update_params_callback(self, p):
-        #parameters will be in transformed space
-        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
-        #set_params_transformed for likelihood doesn't exist?
-        self.likelihood._set_params(p[self.kern.Nparam_transformed():])
-        #update the likelihood approximation within the optimisation with the current parameters
-        self.update_likelihood_approximation()
-
-    def update_likelihood_approximation(self):
-        """
-        Approximates a non-gaussian likelihood using Expectation Propagation
-
-        For a Gaussian likelihood, no iteration is required:
-        this function does nothing
-        """
-        self.likelihood.fit_full(self.kern.K(self.X))
-        self._set_params(self._get_params())  # update the GP
-
-    def _model_fit_term(self):
-        """
-        Computes the model fit using YYT if it's available
-        """
-        if self.likelihood.YYT is None:
-            tmp, _ = linalg.lapack.flapack.dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
-            return -0.5 * np.sum(np.square(tmp))
-            #return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
-        else:
-            return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
-
-    def log_likelihood(self):
-        """
-        The log marginal likelihood of the GP.
-
-        For an EP model,  can be written as the log likelihood of a regression
-        model for a new variable Y* = v_tilde/tau_tilde, with a covariance
-        matrix K* = K + diag(1./tau_tilde) plus a normalization term.
-        """
-        #if isinstance(self.likelihood, Laplace):
-            #self.likelihood.fit_full(self.kern.K(self.X))
-            #self.likelihood._set_params(self.likelihood._get_params())
-        l = -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
-        print "K_ldet: {} mft: {} Z: {}".format(self.K_logdet, self._model_fit_term(), self.likelihood.Z)
-        return l
-
-    def _log_likelihood_gradients(self):
-        """
-        The gradient of all parameters.
-
-        Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
-        """
-        dL_dthetaK = self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X)
-        print "dL_dthetaK should be: ", dL_dthetaK
-        if isinstance(self.likelihood, Laplace):
-            #self.likelihood.fit_full(self.kern.K(self.X))
-            #self.likelihood._set_params(self.likelihood._get_params())
-            dK_dthetaK = self.kern.dK_dtheta
-            dL_dthetaK = self.likelihood._Kgradients(dK_dthetaK, self.X.copy())
-            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-        else:
-            dL_dthetaL = self.likelihood._gradients(partial=np.diag(self.dL_dK))
-        #print "Stacked dL_dthetaK, dL_dthetaL: ", np.hstack((dL_dthetaK, dL_dthetaL))
-        #print "dL_dthetaK: {}   dL_dthetaL: {}".format(dL_dthetaK, dL_dthetaL)
-
-        return np.hstack((dL_dthetaK, dL_dthetaL))
-        #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
-
-    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False,stop=False):
-        """
-        Internal helper function for making predictions, does not account
-        for normalization or likelihood
-        """
-        Kx = self.kern.K(_Xnew,self.X,which_parts=which_parts).T
-        #KiKx = np.dot(self.Ki, Kx)
-        KiKx, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(Kx), lower=1)
-        mu = np.dot(KiKx.T, self.likelihood.Y)
-        if full_cov:
-            Kxx = self.kern.K(_Xnew, which_parts=which_parts)
-            var = Kxx - np.dot(KiKx.T, Kx)
-        else:
-            Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
-            var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
-            var = var[:, None]
-        if stop:
-            debug_this
-        return mu, var
-
-
-    def predict(self, Xnew, which_parts='all', full_cov=False):
-        """
-        Predict the function(s) at the new point(s) Xnew.
-
-        Arguments
-        ---------
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.Q
-        :param which_parts:  specifies which outputs kernel(s) to use in prediction
-        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the folll covariance matrix, or just the diagonal
-        :type full_cov: bool
-        :rtype: posterior mean,  a Numpy array, Nnew x self.D
-        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.D
-
-
-           If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
-           This is to allow for different normalizations of the output dimensions.
-
-        """
-        # normalize X values
-        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
-        mu, var = self._raw_predict(Xnew, which_parts, full_cov)
-
-        # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
-
-        return mean, var, _025pm, _975pm
-
-
-    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False):
-        """
-        Plot the GP's view of the world, where the data is normalized and the
-        likelihood is Gaussian.
-
-        :param samples: the number of a posteriori samples to plot
-        :param which_data: which if the training data to plot (default all)
-        :type which_data: 'all' or a slice object to slice self.X, self.Y
-        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
-        :param which_parts: which of the kernel functions to plot (additively)
-        :type which_parts: 'all', or list of bools
-        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
-
-        Plot the posterior of the GP.
-          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
-          - In two dimsensions, a contour-plot shows the mean predicted function
-          - In higher dimensions, we've no implemented this yet !TODO!
-
-        Can plot only part of the data and part of the posterior functions
-        using which_data and which_functions
-        """
-        if which_data == 'all':
-            which_data = slice(None)
-
-        if self.X.shape[1] == 1:
-            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
-            if samples == 0:
-                m, v = self._raw_predict(Xnew, which_parts=which_parts)
-                gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v))
-                pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
-            else:
-                m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
-                Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
-                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None])
-                for i in range(samples):
-                    pb.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
-            pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
-            pb.xlim(xmin, xmax)
-            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            pb.ylim(ymin, ymax)
-            if hasattr(self, 'Z'):
-                pb.plot(self.Z, self.Z * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
-
-        elif self.X.shape[1] == 2:
-            resolution = resolution or 50
-            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
-            m, v = self._raw_predict(Xnew, which_parts=which_parts)
-            m = m.reshape(resolution, resolution).T
-            pb.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
-            pb.scatter(Xorig[:, 0], Xorig[:, 1], 40, Yorig, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max())
-            pb.xlim(xmin[0], xmax[0])
-            pb.ylim(xmin[1], xmax[1])
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
-
-    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20):
-        """
-        TODO: Docstrings!
-        :param levels: for 2D plotting, the number of contour levels to use
-
-        """
-        # TODO include samples
-        if which_data == 'all':
-            which_data = slice(None)
-
-        if self.X.shape[1] == 1:
-
-            Xu = self.X * self._Xstd + self._Xmean  # NOTE self.X are the normalized values now
-
-            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
-            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
-            gpplot(Xnew, m, lower, upper)
-            pb.plot(Xu[which_data], self.likelihood.data[which_data], 'kx', mew=1.5)
-            if self.has_uncertain_inputs:
-                pb.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
-                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
-                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
-
-            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            pb.xlim(xmin, xmax)
-            pb.ylim(ymin, ymax)
-            if hasattr(self, 'Z'):
-                Zu = self.Z * self._Xstd + self._Xmean
-                pb.plot(Zu, Zu * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
-                    # pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_variance.flatten()))
-
-        elif self.X.shape[1] == 2:  # FIXME
-            resolution = resolution or 50
-            Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
-            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
-            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
-            m = m.reshape(resolution, resolution).T
-            pb.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
-            Yf = self.likelihood.Y.flatten()
-            pb.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
-            pb.xlim(xmin[0], xmax[0])
-            pb.ylim(xmin[1], xmax[1])
-            if hasattr(self, 'Z'):
-                pb.plot(self.Z[:, 0], self.Z[:, 1], 'wo')
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

From 5b25273d2b92a7c513f3705f58e9d5e2d2295b7f Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 9 Sep 2013 17:44:08 +0100
Subject: [PATCH 076/252] Removed unneeded dependency

---
 GPy/examples/laplace_approximations.py | 24 ++++++++++++------------
 GPy/likelihoods/Laplace.py             |  2 +-
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 8be08a8f..b6443664 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -25,7 +25,7 @@ def timing():
         edited_real_sd = real_sd
         kernel1 = GPy.kern.rbf(X.shape[1])
 
-        t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+        t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
         corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
         m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
         m.ensure_default_constraints()
@@ -54,7 +54,7 @@ def v_fail_test():
     edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel1)
     m.constrain_positive('')
@@ -101,7 +101,7 @@ def student_t_obj_plane():
     print mgp
 
     kernelst = kernelgp.copy()
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=(real_std**2))
+    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=(real_std**2))
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernelst)
     m.ensure_default_constraints()
@@ -154,7 +154,7 @@ def student_t_f_check():
 
     kernelst = kernelgp.copy()
     #kernelst += GPy.kern.bias(X.shape[1])
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=0.05)
+    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=0.05)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernelst)
     #m['rbf_v'] = mgp._get_params()[0]
@@ -206,7 +206,7 @@ def student_t_fix_optimise_check():
     kernelst = kernelgp.copy()
     real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
 
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=real_stu_t_std2)
+    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=real_stu_t_std2)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
 
     plt.figure(1)
@@ -349,7 +349,7 @@ def debug_student_t_noise_approx():
     #edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
 
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
@@ -384,7 +384,7 @@ def debug_student_t_noise_approx():
     return m
 
     #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    #t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
     #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
@@ -480,7 +480,7 @@ def student_t_approx():
     edited_real_sd = real_std #initial_var_guess
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, stu_t_likelihood, kernel6)
     m.ensure_default_constraints()
@@ -496,7 +496,7 @@ def student_t_approx():
     plt.title('Student-t rasm clean')
 
     print "Corrupt student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
     corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
     m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
     m.ensure_default_constraints()
@@ -514,7 +514,7 @@ def student_t_approx():
     return m
 
     #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    #t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
     #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
@@ -528,7 +528,7 @@ def student_t_approx():
     #plt.title('Student-t ncg clean')
 
     #print "Corrupt student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.student_t(deg_free, sigma2=edited_real_sd)
+    #t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
     #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
     #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
     #m.ensure_default_constraints()
@@ -612,7 +612,7 @@ def gaussian_f_check():
     kernelg = kernelgp.copy()
     #kernelst += GPy.kern.bias(X.shape[1])
     N, D = X.shape
-    g_distribution = GPy.likelihoods.likelihood_functions.gaussian(variance=0.1, N=N, D=D)
+    g_distribution = GPy.likelihoods.likelihood_functions.Gaussian(variance=0.1, N=N, D=D)
     g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm')
     m = GPy.models.GP(X, g_likelihood, kernelg)
     #m['rbf_v'] = mgp._get_params()[0]
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/Laplace.py
index 58304c23..b5b16521 100644
--- a/GPy/likelihoods/Laplace.py
+++ b/GPy/likelihoods/Laplace.py
@@ -4,7 +4,7 @@ import GPy
 from scipy.linalg import inv, cho_solve, det
 from numpy.linalg import cond
 from likelihood import likelihood
-from ..util.linalg import pdinv, mdot, jitchol, chol_inv, det_ln_diag, pddet
+from ..util.linalg import pdinv, mdot, jitchol, chol_inv, pddet
 from scipy.linalg.lapack import dtrtrs
 import random
 #import pylab as plt

From 1dd83291fef489e2c44d6ccb0d4a1ba8a6776bc6 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 11 Sep 2013 11:54:15 +0100
Subject: [PATCH 077/252] Renamed some things, made some small (incorrect)
 gradient changes, generalised the gp regression for any likelihood, and added
 a place holder link function waiting for Richardos changes

---
 GPy/examples/laplace_approximations.py     | 75 +++++++++++-----------
 GPy/likelihoods/__init__.py                |  1 +
 GPy/likelihoods/{Laplace.py => laplace.py} |  0
 GPy/likelihoods/likelihood_functions.py    | 32 +++++----
 GPy/likelihoods/link_functions.py          | 13 ++++
 GPy/models/gp_regression.py                |  7 +-
 GPy/util/linalg.py                         |  8 +++
 7 files changed, 83 insertions(+), 53 deletions(-)
 rename GPy/likelihoods/{Laplace.py => laplace.py} (100%)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index b6443664..c0bc3aef 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -25,9 +25,9 @@ def timing():
         edited_real_sd = real_sd
         kernel1 = GPy.kern.rbf(X.shape[1])
 
-        t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+        t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
         corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
-        m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
+        m = GPy.models.GPRegression(X, corrupt_stu_t_likelihood, kernel1)
         m.ensure_default_constraints()
         m.update_likelihood_approximation()
         m.optimize()
@@ -54,9 +54,9 @@ def v_fail_test():
     edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GP(X, stu_t_likelihood, kernel1)
+    m = GPy.models.GPRegression(X, stu_t_likelihood, kernel1)
     m.constrain_positive('')
     vs = 25
     noises = 30
@@ -94,16 +94,16 @@ def student_t_obj_plane():
     deg_free = 1000
 
     kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
     mgp.ensure_default_constraints()
     mgp['noise'] = real_std**2
     print "Gaussian"
     print mgp
 
     kernelst = kernelgp.copy()
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=(real_std**2))
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=(real_std**2))
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m = GPy.models.GPRegression(X, stu_t_likelihood, kernelst)
     m.ensure_default_constraints()
     m.constrain_fixed('t_no', real_std**2)
     vs = 10
@@ -144,7 +144,7 @@ def student_t_f_check():
     deg_free = 1000
 
     kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
     mgp.ensure_default_constraints()
     mgp.randomize()
     mgp.optimize()
@@ -154,9 +154,9 @@ def student_t_f_check():
 
     kernelst = kernelgp.copy()
     #kernelst += GPy.kern.bias(X.shape[1])
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=0.05)
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=0.05)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m = GPy.models.GPRegression(X, stu_t_likelihood, kernelst)
     #m['rbf_v'] = mgp._get_params()[0]
     #m['rbf_l'] = mgp._get_params()[1] + 1
     m.ensure_default_constraints()
@@ -198,7 +198,7 @@ def student_t_fix_optimise_check():
 
     #GP
     kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
     mgp.ensure_default_constraints()
     mgp.randomize()
     mgp.optimize()
@@ -206,12 +206,12 @@ def student_t_fix_optimise_check():
     kernelst = kernelgp.copy()
     real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
 
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=real_stu_t_std2)
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=real_stu_t_std2)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
 
     plt.figure(1)
     plt.suptitle('Student likelihood')
-    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m = GPy.models.GPRegression(X, stu_t_likelihood, kernelst)
     m.constrain_fixed('rbf_var', mgp._get_params()[0])
     m.constrain_fixed('rbf_len', mgp._get_params()[1])
     m.constrain_positive('t_noise')
@@ -331,7 +331,7 @@ def debug_student_t_noise_approx():
     print "Clean Gaussian"
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
-    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    #m = GPy.models.GPRegression(X, Y, kernel=kernel1)
     ## optimize
     #m.ensure_default_constraints()
     #m.optimize()
@@ -349,10 +349,10 @@ def debug_student_t_noise_approx():
     #edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
 
-    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m = GPy.models.GPRegression(X, stu_t_likelihood, kernel6)
     #m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 0.2651)
@@ -384,9 +384,9 @@ def debug_student_t_noise_approx():
     return m
 
     #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    #t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
-    #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    #m = GPy.models.GPRegression(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
     #m.optimize()
@@ -453,7 +453,7 @@ def student_t_approx():
     print "Clean Gaussian"
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
-    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    m = GPy.models.GPRegression(X, Y, kernel=kernel1)
     # optimize
     m.ensure_default_constraints()
     m.optimize()
@@ -466,7 +466,7 @@ def student_t_approx():
 
     #Corrupt
     print "Corrupt Gaussian"
-    m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    m = GPy.models.GPRegression(X, Yc, kernel=kernel2)
     m.ensure_default_constraints()
     #m.optimize()
     plt.subplot(212)
@@ -480,9 +480,9 @@ def student_t_approx():
     edited_real_sd = real_std #initial_var_guess
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m = GPy.models.GPRegression(X, Y.copy(), kernel6, stu_t_likelihood)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
     m.randomize()
@@ -496,9 +496,9 @@ def student_t_approx():
     plt.title('Student-t rasm clean')
 
     print "Corrupt student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
+    m = GPy.models.GPRegression(X, Yc.copy(), kernel4, corrupt_stu_t_likelihood)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
     m.randomize()
@@ -514,9 +514,9 @@ def student_t_approx():
     return m
 
     #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    #t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
-    #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    #m = GPy.models.GPRegression(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
     #m.optimize()
@@ -528,9 +528,9 @@ def student_t_approx():
     #plt.title('Student-t ncg clean')
 
     #print "Corrupt student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    #t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
-    #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    #m = GPy.models.GPRegression(X, corrupt_stu_t_likelihood, kernel5)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
     #m.optimize()
@@ -582,7 +582,7 @@ def noisy_laplace_approx():
 
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
-    m = GPy.models.GP_regression(X, Y)
+    m = GPy.models.GPRegression(X, Y)
 
     # optimize
     m.ensure_default_constraints()
@@ -601,7 +601,7 @@ def gaussian_f_check():
     Y = np.sin(X*2*np.pi) + noise
 
     kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
     mgp.ensure_default_constraints()
     mgp.randomize()
     mgp.optimize()
@@ -612,9 +612,9 @@ def gaussian_f_check():
     kernelg = kernelgp.copy()
     #kernelst += GPy.kern.bias(X.shape[1])
     N, D = X.shape
-    g_distribution = GPy.likelihoods.likelihood_functions.Gaussian(variance=0.1, N=N, D=D)
+    g_distribution = GPy.likelihoods.functions.Gaussian(variance=0.1, N=N, D=D)
     g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm')
-    m = GPy.models.GP(X, g_likelihood, kernelg)
+    m = GPy.models.GPRegression(X, Y, kernelg, likelihood=g_likelihood)
     #m['rbf_v'] = mgp._get_params()[0]
     #m['rbf_l'] = mgp._get_params()[1] + 1
     m.ensure_default_constraints()
@@ -624,14 +624,15 @@ def gaussian_f_check():
     #m.constrain_positive('bias')
     m.constrain_positive('noise_var')
     m.randomize()
+    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
     m['noise_variance'] = 0.1
-    m.likelihood.X = X
+    #m.likelihood.X = X
     plt.figure()
-    plt.subplot(211)
-    m.plot()
-    plt.subplot(212)
+    ax = plt.subplot(211)
+    m.plot(ax=ax)
+    ax = plt.subplot(212)
     m.optimize()
-    m.plot()
+    m.plot(ax=ax)
     print "final optimised gaussian"
     print m
     print "real GP"
diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py
index 99e88b6d..5d4e31f7 100644
--- a/GPy/likelihoods/__init__.py
+++ b/GPy/likelihoods/__init__.py
@@ -1,4 +1,5 @@
 from ep import EP
+from laplace import Laplace
 from gaussian import Gaussian
 # TODO: from Laplace import Laplace
 import likelihood_functions as functions
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/laplace.py
similarity index 100%
rename from GPy/likelihoods/Laplace.py
rename to GPy/likelihoods/laplace.py
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 5d270b2b..06735a9c 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -167,7 +167,7 @@ class Poisson(LikelihoodFunction):
         p_975 = tmp[:,1]
         return mean,np.nan*mean,p_025,p_975 # better variance here TODO
 
-class Student_t(LikelihoodFunction):
+class StudentT(LikelihoodFunction):
     """Student t likelihood distribution
     For nomanclature see Bayesian Data Analysis 2003 p576
 
@@ -180,7 +180,11 @@ class Student_t(LikelihoodFunction):
     d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free=5, sigma2=2, link=None):
-        super(Student_t, self).__init__(link)
+        self._analytical = None
+        if not link:
+            link = link_functions.Nothing()
+
+        super(StudentT, self).__init__(link)
         self.v = deg_free
         self.sigma2 = sigma2
 
@@ -413,6 +417,10 @@ class Gaussian(LikelihoodFunction):
     Gaussian likelihood - this is a test class for approximation schemes
     """
     def __init__(self, variance, D, N, link=None):
+        self._analytical = None
+        if not link:
+            link = link_functions.Nothing()
+
         super(Gaussian, self).__init__(link)
         self.D = D
         self.N = N
@@ -454,7 +462,7 @@ class Gaussian(LikelihoodFunction):
                      #- 0.5*np.sum(np.multiply(self.Ki, eeT))
                      - 0.5*np.dot(np.dot(e.T, self.Ki), e)
                      )
-        return np.sum(objective)
+        return np.sum(objective) # FIXME: put this back!
 
     def dlik_df(self, y, f, extra_data=None):
         """
@@ -468,7 +476,7 @@ class Gaussian(LikelihoodFunction):
         """
         assert y.shape == f.shape
         s2_i = (1.0/self._variance)*self.I
-        grad = np.dot(s2_i, y) - 0.5*np.dot(s2_i, f)
+        grad = np.dot(s2_i, y) - np.dot(s2_i, f)
         return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
@@ -486,7 +494,7 @@ class Gaussian(LikelihoodFunction):
         """
         assert y.shape == f.shape
         s2_i = (1.0/self._variance)*self.I
-        hess = 0.5*np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
+        hess = np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -499,17 +507,17 @@ class Gaussian(LikelihoodFunction):
         d3lik_d3f = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
         return d3lik_d3f
 
-    def lik_dstd(self, y, f, extra_data=None):
+    def lik_dvar(self, y, f, extra_data=None):
         """
         Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
         """
         assert y.shape == f.shape
         e = y - f
         s_4 = 1.0/(self._variance**2)
-        dlik_dsigma = -0.5*self.N*1/self._variance + 0.5*s_4*np.trace(np.dot(e.T, np.dot(self.I, e)))
+        dlik_dsigma = -0.5*self.N/self._variance + 0.5*s_4*np.trace(np.dot(e.T, np.dot(self.I, e)))
         return dlik_dsigma
 
-    def dlik_df_dstd(self, y, f, extra_data=None):
+    def dlik_df_dvar(self, y, f, extra_data=None):
         """
         Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
         """
@@ -518,7 +526,7 @@ class Gaussian(LikelihoodFunction):
         dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f))
         return dlik_grad_dsigma
 
-    def d2lik_d2f_dstd(self, y, f, extra_data=None):
+    def d2lik_d2f_dvar(self, y, f, extra_data=None):
         """
         Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
 
@@ -530,9 +538,9 @@ class Gaussian(LikelihoodFunction):
 
     def _gradients(self, y, f, extra_data=None):
         #must be listed in same order as 'get_param_names'
-        derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
-                  [self.dlik_df_dstd(y, f, extra_data=extra_data)],
-                  [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+        derivs = ([self.lik_dvar(y, f, extra_data=extra_data)],
+                  [self.dlik_df_dvar(y, f, extra_data=extra_data)],
+                  [self.d2lik_d2f_dvar(y, f, extra_data=extra_data)]
                  ) # lists as we might learn many parameters
         # ensure we have gradients for every parameter we want to optimize
         assert len(derivs[0]) == len(self._get_param_names())
diff --git a/GPy/likelihoods/link_functions.py b/GPy/likelihoods/link_functions.py
index 3b9a55b2..826983a9 100644
--- a/GPy/likelihoods/link_functions.py
+++ b/GPy/likelihoods/link_functions.py
@@ -31,3 +31,16 @@ class Probit(LinkFunction):
 
     def log_inv_transf(self,f):
         pass
+
+class Nothing(LinkFunction):
+    """
+    Probit link function: Squashes a likelihood between 0 and 1
+    """
+    def transf(self,mu):
+        return mu
+
+    def inv_transf(self,f):
+        return f
+
+    def log_inv_transf(self,f):
+        return np.log(f)
diff --git a/GPy/models/gp_regression.py b/GPy/models/gp_regression.py
index 86e1f7de..633fc1c8 100644
--- a/GPy/models/gp_regression.py
+++ b/GPy/models/gp_regression.py
@@ -25,11 +25,12 @@ class GPRegression(GP):
 
     """
 
-    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False):
+    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, likelihood=None):
         if kernel is None:
             kernel = kern.rbf(X.shape[1])
 
-        likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
+        if likelihood is None:
+            likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
 
         GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
         self.ensure_default_constraints()
@@ -39,5 +40,3 @@ class GPRegression(GP):
 
     def setstate(self, state):
         return GP.setstate(self, state)
-
-    pass
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 19cf6545..8331933d 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -55,6 +55,14 @@ def dpotri(A, lower=0):
     """
     return lapack.dpotri(A, lower=lower)
 
+def pddet(A):
+    """
+    Determinant of a positive definite matrix, only symmetric matricies though
+    """
+    L = jitchol(A)
+    logdetA = 2*sum(np.log(np.diag(L)))
+    return logdetA
+
 def trace_dot(a, b):
     """
     efficiently compute the trace of the matrix product of a and b

From 64e65b846d8b7eafc1abe66d735a4dbf2dfa540c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 11 Sep 2013 11:54:47 +0100
Subject: [PATCH 078/252] Modified gradient_checker to allow for variable 'f'

---
 GPy/models/gradient_checker.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/GPy/models/gradient_checker.py b/GPy/models/gradient_checker.py
index 5afcd7c4..face9589 100644
--- a/GPy/models/gradient_checker.py
+++ b/GPy/models/gradient_checker.py
@@ -26,40 +26,40 @@ class GradientChecker(Model):
         """
         :param f: Function to check gradient for
         :param df: Gradient of function to check
-        :param x0: 
+        :param x0:
             Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
-            Can be a list of arrays, if takes a list of arrays. This list will be passed 
+            Can be a list of arrays, if takes a list of arrays. This list will be passed
             to f and df in the same order as given here.
             If only one argument, make sure not to pass a list!!!
-            
+
         :type x0: [array-like] | array-like | float | int
         :param names:
             Names to print, when performing gradcheck. If a list was passed to x0
             a list of names with the same length is expected.
         :param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
-        
+
         Examples:
         ---------
             from GPy.models import GradientChecker
             N, M, Q = 10, 5, 3
-        
+
             Sinusoid:
-            
+
                 X = numpy.random.rand(N, Q)
                 grad = GradientChecker(numpy.sin,numpy.cos,X,'x')
                 grad.checkgrad(verbose=1)
-    
+
             Using GPy:
-            
+
                 X, Z = numpy.random.randn(N,Q), numpy.random.randn(M,Q)
                 kern = GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True)
-                grad = GradientChecker(kern.K, 
+                grad = GradientChecker(kern.K,
                                        lambda x: 2*kern.dK_dX(numpy.ones((1,1)), x),
                                        x0 = X.copy(),
-                                       names='X')  
+                                       names='X')
                 grad.checkgrad(verbose=1)
                 grad.randomize()
-                grad.checkgrad(verbose=1)      
+                grad.checkgrad(verbose=1)
         """
         Model.__init__(self)
         if isinstance(x0, (list, tuple)) and names is None:
@@ -81,8 +81,8 @@ class GradientChecker(Model):
 #             self._param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
         self.args = args
         self.kwargs = kwargs
-        self.f = f
-        self.df = df
+        self._f = f
+        self._df = df
 
     def _get_x(self):
         if len(self.names) > 1:
@@ -90,10 +90,10 @@ class GradientChecker(Model):
         return [self.__getattribute__(self.names[0])] + list(self.args)
 
     def log_likelihood(self):
-        return float(numpy.sum(self.f(*self._get_x(), **self.kwargs)))
+        return float(numpy.sum(self._f(*self._get_x(), **self.kwargs)))
 
     def _log_likelihood_gradients(self):
-        return numpy.atleast_1d(self.df(*self._get_x(), **self.kwargs)).flatten()
+        return numpy.atleast_1d(self._df(*self._get_x(), **self.kwargs)).flatten()
 
 
     def _get_params(self):

From cf9ea23aef6f9f620530a482f912df371bb3ac1b Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 11 Sep 2013 12:06:36 +0100
Subject: [PATCH 079/252] Added tests and fixed some naming

---
 GPy/likelihoods/likelihood_functions.py |  4 +-
 GPy/testing/laplace_tests.py            | 84 +++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 2 deletions(-)
 create mode 100644 GPy/testing/laplace_tests.py

diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 06735a9c..9d4dc041 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -507,7 +507,7 @@ class Gaussian(LikelihoodFunction):
         d3lik_d3f = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
         return d3lik_d3f
 
-    def lik_dvar(self, y, f, extra_data=None):
+    def dlik_dvar(self, y, f, extra_data=None):
         """
         Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
         """
@@ -538,7 +538,7 @@ class Gaussian(LikelihoodFunction):
 
     def _gradients(self, y, f, extra_data=None):
         #must be listed in same order as 'get_param_names'
-        derivs = ([self.lik_dvar(y, f, extra_data=extra_data)],
+        derivs = ([self.dlik_dvar(y, f, extra_data=extra_data)],
                   [self.dlik_df_dvar(y, f, extra_data=extra_data)],
                   [self.d2lik_d2f_dvar(y, f, extra_data=extra_data)]
                  ) # lists as we might learn many parameters
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
new file mode 100644
index 00000000..351cfcbb
--- /dev/null
+++ b/GPy/testing/laplace_tests.py
@@ -0,0 +1,84 @@
+import numpy as np
+import unittest
+import GPy
+from GPy.models import GradientChecker
+import functools
+
+class LaplaceTests(unittest.TestCase):
+    def setUp(self):
+        self.N = 5
+        self.D = 1
+        self.X = np.linspace(0, 1, self.N)[:, None]
+
+        self.real_std = 0.2
+        noise = np.random.randn(*self.X.shape)*self.real_std
+        self.Y = np.sin(self.X*2*np.pi) + noise
+
+        self.f = np.random.rand(self.N, 1)
+
+    def test_gaussian_dlik_df(self):
+        var = 0.1
+        gauss = GPy.likelihoods.functions.Gaussian(var, self.D, self.N)
+        link = functools.partial(gauss.link_function, self.Y)
+        dlik_df = functools.partial(gauss.dlik_df, self.Y)
+        grad = GradientChecker(link, dlik_df, self.f.copy(), 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+
+    def test_gaussian_d2lik_d2f(self):
+        var = 0.1
+        gauss = GPy.likelihoods.functions.Gaussian(var, self.D, self.N)
+        dlik_df = functools.partial(gauss.dlik_df, self.Y)
+        d2lik_d2f = functools.partial(gauss.d2lik_d2f, self.Y)
+        grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+
+    def test_gaussian_d3lik_d3f(self):
+        var = 0.1
+        gauss = GPy.likelihoods.functions.Gaussian(var, self.D, self.N)
+        d2lik_d2f = functools.partial(gauss.d2lik_d2f, self.Y)
+        d3lik_d3f = functools.partial(gauss.d3lik_d3f, self.Y)
+        grad = GradientChecker(d2lik_d2f, d3lik_d3f, self.f.copy(), 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+
+    def test_gaussian_dlik_dvar(self):
+        var = 0.1
+        gauss = GPy.likelihoods.functions.Gaussian(var, self.D, self.N)
+        #Since the function we are checking does not directly accept the variable we wish to tweak
+        #We make function which makes the change (set params) then calls the function
+        def p_link_var(var, likelihood, f, Y):
+            likelihood._set_params(var)
+            return likelihood.link_function(f, Y)
+
+        def p_dlik_dvar(var, likelihood, f, Y):
+            likelihood._set_params(var)
+            return likelihood.dlik_dvar(f, Y)
+
+        link = functools.partial(p_link_var, likelihood=gauss, f=self.f, Y=self.Y)
+        dlik_dvar = functools.partial(p_dlik_dvar, likelihood=gauss, f=self.f, Y=self.Y)
+        grad = GradientChecker(link, dlik_dvar, var, 'v')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+
+    def test_gaussian_dlik_df_dvar(self):
+        var = 0.1
+        gauss = GPy.likelihoods.functions.Gaussian(var, self.D, self.N)
+        def p_dlik_df(var, likelihood, f, Y):
+            likelihood._set_params(var)
+            return likelihood.dlik_df(f, Y)
+
+        def p_dlik_df_dstd(var, likelihood, f, Y):
+            likelihood._set_params(var)
+            return likelihood.dlik_df_dvar(f, Y)
+
+        dlik_df = functools.partial(p_dlik_df, likelihood=gauss, f=self.f, Y=self.Y)
+        dlik_df_dstd = functools.partial(p_dlik_df_dstd, likelihood=gauss, f=self.f, Y=self.Y)
+        grad = GradientChecker(dlik_df, dlik_df_dstd, var, 'v')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+
+if __name__ == "__main__":
+    print "Running unit tests"
+    unittest.main()

From 42f8180c4e52d62dc1013bfc4834e0c5faf43ee8 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 11 Sep 2013 15:27:14 +0100
Subject: [PATCH 080/252] Tidied up grad checking

---
 GPy/examples/laplace_approximations.py  | 20 ++++----
 GPy/likelihoods/laplace.py              |  6 ++-
 GPy/likelihoods/likelihood_functions.py | 24 +++++-----
 GPy/testing/laplace_tests.py            | 63 ++++++++++++++++---------
 4 files changed, 69 insertions(+), 44 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index c0bc3aef..50e1858b 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -27,7 +27,7 @@ def timing():
 
         t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
         corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
-        m = GPy.models.GPRegression(X, corrupt_stu_t_likelihood, kernel1)
+        m = GPy.models.GPRegression(X, Yc.copy(), kernel1, likelihood=corrupt_stu_t_likelihood)
         m.ensure_default_constraints()
         m.update_likelihood_approximation()
         m.optimize()
@@ -56,7 +56,7 @@ def v_fail_test():
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GPRegression(X, stu_t_likelihood, kernel1)
+    m = GPy.models.GPRegression(X, Y.copy(), kernel1, likelihood=stu_t_likelihood)
     m.constrain_positive('')
     vs = 25
     noises = 30
@@ -103,7 +103,7 @@ def student_t_obj_plane():
     kernelst = kernelgp.copy()
     t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=(real_std**2))
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GPRegression(X, stu_t_likelihood, kernelst)
+    m = GPy.models.GPRegression(X, Y, kernelst, likelihood=stu_t_likelihood)
     m.ensure_default_constraints()
     m.constrain_fixed('t_no', real_std**2)
     vs = 10
@@ -156,7 +156,7 @@ def student_t_f_check():
     #kernelst += GPy.kern.bias(X.shape[1])
     t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=0.05)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GPRegression(X, stu_t_likelihood, kernelst)
+    m = GPy.models.GPRegression(X, Y.copy(), kernelst, likelihood=stu_t_likelihood)
     #m['rbf_v'] = mgp._get_params()[0]
     #m['rbf_l'] = mgp._get_params()[1] + 1
     m.ensure_default_constraints()
@@ -211,7 +211,7 @@ def student_t_fix_optimise_check():
 
     plt.figure(1)
     plt.suptitle('Student likelihood')
-    m = GPy.models.GPRegression(X, stu_t_likelihood, kernelst)
+    m = GPy.models.GPRegression(X, Y, kernelst, likelihood=stu_t_likelihood)
     m.constrain_fixed('rbf_var', mgp._get_params()[0])
     m.constrain_fixed('rbf_len', mgp._get_params()[1])
     m.constrain_positive('t_noise')
@@ -352,7 +352,7 @@ def debug_student_t_noise_approx():
     t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
 
-    m = GPy.models.GPRegression(X, stu_t_likelihood, kernel6)
+    m = GPy.models.GPRegression(X, Y, kernel6, likelihood=stu_t_likelihood)
     #m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 0.2651)
@@ -482,7 +482,7 @@ def student_t_approx():
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GPRegression(X, Y.copy(), kernel6, stu_t_likelihood)
+    m = GPy.models.GPRegression(X, Y.copy(), kernel6, likelihood=stu_t_likelihood)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
     m.randomize()
@@ -498,7 +498,7 @@ def student_t_approx():
     print "Corrupt student t, rasm"
     t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GPRegression(X, Yc.copy(), kernel4, corrupt_stu_t_likelihood)
+    m = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
     m.randomize()
@@ -516,7 +516,7 @@ def student_t_approx():
     #print "Clean student t, ncg"
     #t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
-    #m = GPy.models.GPRegression(X, stu_t_likelihood, kernel3)
+    #m = GPy.models.GPRegression(X, Y, kernel3, likelihood=stu_t_likelihood)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
     #m.optimize()
@@ -530,7 +530,7 @@ def student_t_approx():
     #print "Corrupt student t, ncg"
     #t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
-    #m = GPy.models.GPRegression(X, corrupt_stu_t_likelihood, kernel5)
+    #m = GPy.models.GPRegression(X, Y, kernel5, likelihood=corrupt_stu_t_likelihood)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
     #m.optimize()
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index b5b16521..2f98b2ff 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -41,9 +41,12 @@ class Laplace(likelihood):
         self.N, self.D = self.data.shape
         self.is_heteroscedastic = True
         self.Nparams = 0
-
         self.NORMAL_CONST = ((0.5 * self.N) * np.log(2 * np.pi))
 
+        self.restart()
+
+
+    def restart(self):
         #Initial values for the GP variables
         self.Y = np.zeros((self.N, 1))
         self.covariance_matrix = np.eye(self.N)
@@ -53,6 +56,7 @@ class Laplace(likelihood):
 
         self.old_a = None
 
+
     def predictive_values(self, mu, var, full_cov):
         if full_cov:
             raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 9d4dc041..330116de 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -280,7 +280,7 @@ class StudentT(LikelihoodFunction):
                     )
         return d3lik_d3f
 
-    def lik_dstd(self, y, f, extra_data=None):
+    def dlik_dvar(self, y, f, extra_data=None):
         """
         Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
 
@@ -291,10 +291,10 @@ class StudentT(LikelihoodFunction):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_dsigma = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
-        return dlik_dsigma
+        dlik_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
+        return dlik_dvar
 
-    def dlik_df_dstd(self, y, f, extra_data=None):
+    def dlik_df_dvar(self, y, f, extra_data=None):
         """
         Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
 
@@ -302,10 +302,10 @@ class StudentT(LikelihoodFunction):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_grad_dsigma = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
-        return dlik_grad_dsigma
+        dlik_grad_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
+        return dlik_grad_dvar
 
-    def d2lik_d2f_dstd(self, y, f, extra_data=None):
+    def d2lik_d2f_dvar(self, y, f, extra_data=None):
         """
         Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
 
@@ -313,16 +313,16 @@ class StudentT(LikelihoodFunction):
         """
         assert y.shape == f.shape
         e = y - f
-        dlik_hess_dsigma = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
+        dlik_hess_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
                               / ((self.sigma2*self.v + (e**2))**3)
                            )
-        return dlik_hess_dsigma
+        return dlik_hess_dvar
 
     def _gradients(self, y, f, extra_data=None):
         #must be listed in same order as 'get_param_names'
-        derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
-                  [self.dlik_df_dstd(y, f, extra_data=extra_data)],
-                  [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+        derivs = ([self.dlik_dvar(y, f, extra_data=extra_data)],
+                  [self.dlik_df_dvar(y, f, extra_data=extra_data)],
+                  [self.d2lik_d2f_dvar(y, f, extra_data=extra_data)]
                  ) # lists as we might learn many parameters
         # ensure we have gradients for every parameter we want to optimize
         assert len(derivs[0]) == len(self._get_param_names())
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index 351cfcbb..8aabe50a 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -4,6 +4,24 @@ import GPy
 from GPy.models import GradientChecker
 import functools
 
+def dparam_partial(inst_func, *args):
+    """
+    If we have a instance method that needs to be called but that doesn't
+    take the parameter we wish to change to checkgrad, then this function
+    will change the variable using set params.
+
+    inst_func: should be a instance function of an object that we would like
+                to change
+    param: the param that will be given to set_params
+    args: anything else that needs to be given to the function (for example
+          the f or Y that are being used in the function whilst we tweak the
+          param
+    """
+    def param_func(param, inst_func, args):
+        inst_func.im_self._set_params(param)
+        return inst_func(*args)
+    return functools.partial(param_func, inst_func=inst_func, args=args)
+
 class LaplaceTests(unittest.TestCase):
     def setUp(self):
         self.N = 5
@@ -24,6 +42,7 @@ class LaplaceTests(unittest.TestCase):
         grad = GradientChecker(link, dlik_df, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
 
     def test_gaussian_d2lik_d2f(self):
         var = 0.1
@@ -33,6 +52,7 @@ class LaplaceTests(unittest.TestCase):
         grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
 
     def test_gaussian_d3lik_d3f(self):
         var = 0.1
@@ -42,42 +62,43 @@ class LaplaceTests(unittest.TestCase):
         grad = GradientChecker(d2lik_d2f, d3lik_d3f, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
 
     def test_gaussian_dlik_dvar(self):
         var = 0.1
         gauss = GPy.likelihoods.functions.Gaussian(var, self.D, self.N)
-        #Since the function we are checking does not directly accept the variable we wish to tweak
-        #We make function which makes the change (set params) then calls the function
-        def p_link_var(var, likelihood, f, Y):
-            likelihood._set_params(var)
-            return likelihood.link_function(f, Y)
 
-        def p_dlik_dvar(var, likelihood, f, Y):
-            likelihood._set_params(var)
-            return likelihood.dlik_dvar(f, Y)
-
-        link = functools.partial(p_link_var, likelihood=gauss, f=self.f, Y=self.Y)
-        dlik_dvar = functools.partial(p_dlik_dvar, likelihood=gauss, f=self.f, Y=self.Y)
+        link = dparam_partial(gauss.link_function, self.Y, self.f)
+        dlik_dvar = dparam_partial(gauss.dlik_dvar, self.Y, self.f)
         grad = GradientChecker(link, dlik_dvar, var, 'v')
+        grad.constrain_positive('v')
         grad.randomize()
         grad.checkgrad(verbose=1)
+        #self.assertTrue(grad.checkgrad())
 
     def test_gaussian_dlik_df_dvar(self):
         var = 0.1
         gauss = GPy.likelihoods.functions.Gaussian(var, self.D, self.N)
-        def p_dlik_df(var, likelihood, f, Y):
-            likelihood._set_params(var)
-            return likelihood.dlik_df(f, Y)
 
-        def p_dlik_df_dstd(var, likelihood, f, Y):
-            likelihood._set_params(var)
-            return likelihood.dlik_df_dvar(f, Y)
-
-        dlik_df = functools.partial(p_dlik_df, likelihood=gauss, f=self.f, Y=self.Y)
-        dlik_df_dstd = functools.partial(p_dlik_df_dstd, likelihood=gauss, f=self.f, Y=self.Y)
-        grad = GradientChecker(dlik_df, dlik_df_dstd, var, 'v')
+        dlik_df = dparam_partial(gauss.dlik_df, self.Y, self.f)
+        dlik_df_dvar = dparam_partial(gauss.dlik_df_dvar, self.Y, self.f)
+        grad = GradientChecker(dlik_df, dlik_df_dvar, var, 'v')
+        grad.constrain_positive('v')
         grad.randomize()
         grad.checkgrad(verbose=1)
+        #self.assertTrue(grad.checkgrad())
+
+    def test_studentt_dlik_dvar(self):
+        var = 0.1
+        stu_t = GPy.likelihoods.functions.StudentT(deg_free=5, sigma2=var)
+
+        link = dparam_partial(stu_t.link_function, self.Y, self.f)
+        dlik_dvar = dparam_partial(stu_t.dlik_dvar, self.Y, self.f)
+        grad = GradientChecker(link, dlik_dvar, var, 'v')
+        grad.constrain_positive('v')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        #self.assertTrue(grad.checkgrad())
 
 if __name__ == "__main__":
     print "Running unit tests"

From 888a1ff0f779ad1e459bfb4aa309542addfc6409 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 12 Sep 2013 10:23:51 +0100
Subject: [PATCH 081/252] Refactored tests

---
 GPy/testing/laplace_tests.py | 156 ++++++++++++++++++++++++++---------
 1 file changed, 119 insertions(+), 37 deletions(-)

diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index 8aabe50a..2db83c25 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -22,6 +22,45 @@ def dparam_partial(inst_func, *args):
         return inst_func(*args)
     return functools.partial(param_func, inst_func=inst_func, args=args)
 
+def grad_checker_wrt_params(func, dfunc, params, args, randomize=False, verbose=False):
+    """
+    checkgrad expects a f: R^N -> R^1 and df: R^N -> R^N
+    However if we are holding other parameters fixed and moving something else
+    We need to check the gradient of each of the fixed parameters (f and y for example) seperately
+    Whilst moving another parameter. otherwise f: gives back R^N and df: gives back R^NxM where M is
+    The number of parameters and N is the number of data
+    Need to take a slice out from f and a slice out of df
+    """
+    print "{} likelihood: {} vs {}".format(func.im_self.__class__.__name__,
+                                           func.__name__, dfunc.__name__)
+    partial_f = dparam_partial(func, *args)
+    partial_df = dparam_partial(dfunc, *args)
+    gradchecked = False
+    for param in params:
+        fnum = np.atleast_1d(partial_f(param)).shape[0]
+        dfnum = np.atleast_1d(partial_df(param)).shape[0]
+        for fixed_val in range(dfnum):
+            f_ind = min(fnum, fixed_val+1) - 1 #dlik and dlik_dvar gives back 1 value for each
+            grad = GradientChecker(lambda x: np.atleast_1d(partial_f(x))[f_ind],
+                                   lambda x : np.atleast_1d(partial_df(x))[fixed_val],
+                                   param, 'p')
+            grad.constrain_positive('p')
+            if randomize:
+                grad.randomize()
+            if verbose:
+                grad.checkgrad(verbose=1)
+            cg = grad.checkgrad()
+            print cg
+            if cg:
+                print "True"
+                gradchecked = True
+            else:
+                print "False"
+                return False
+    print str(gradchecked)
+    return gradchecked
+
+
 class LaplaceTests(unittest.TestCase):
     def setUp(self):
         self.N = 5
@@ -34,72 +73,115 @@ class LaplaceTests(unittest.TestCase):
 
         self.f = np.random.rand(self.N, 1)
 
+        self.var = 0.1
+        self.stu_t = GPy.likelihoods.functions.StudentT(deg_free=5, sigma2=self.var)
+        self.gauss = GPy.likelihoods.functions.Gaussian(self.var, self.D, self.N)
+
+    def tearDown(self):
+        self.stu_t = None
+        self.gauss = None
+
     def test_gaussian_dlik_df(self):
-        var = 0.1
-        gauss = GPy.likelihoods.functions.Gaussian(var, self.D, self.N)
-        link = functools.partial(gauss.link_function, self.Y)
-        dlik_df = functools.partial(gauss.dlik_df, self.Y)
+        link = functools.partial(self.gauss.link_function, self.Y)
+        dlik_df = functools.partial(self.gauss.dlik_df, self.Y)
         grad = GradientChecker(link, dlik_df, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
     def test_gaussian_d2lik_d2f(self):
-        var = 0.1
-        gauss = GPy.likelihoods.functions.Gaussian(var, self.D, self.N)
-        dlik_df = functools.partial(gauss.dlik_df, self.Y)
-        d2lik_d2f = functools.partial(gauss.d2lik_d2f, self.Y)
+        dlik_df = functools.partial(self.gauss.dlik_df, self.Y)
+        d2lik_d2f = functools.partial(self.gauss.d2lik_d2f, self.Y)
         grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
     def test_gaussian_d3lik_d3f(self):
-        var = 0.1
-        gauss = GPy.likelihoods.functions.Gaussian(var, self.D, self.N)
-        d2lik_d2f = functools.partial(gauss.d2lik_d2f, self.Y)
-        d3lik_d3f = functools.partial(gauss.d3lik_d3f, self.Y)
+        d2lik_d2f = functools.partial(self.gauss.d2lik_d2f, self.Y)
+        d3lik_d3f = functools.partial(self.gauss.d3lik_d3f, self.Y)
         grad = GradientChecker(d2lik_d2f, d3lik_d3f, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
     def test_gaussian_dlik_dvar(self):
-        var = 0.1
-        gauss = GPy.likelihoods.functions.Gaussian(var, self.D, self.N)
-
-        link = dparam_partial(gauss.link_function, self.Y, self.f)
-        dlik_dvar = dparam_partial(gauss.dlik_dvar, self.Y, self.f)
-        grad = GradientChecker(link, dlik_dvar, var, 'v')
-        grad.constrain_positive('v')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
+        #link = dparam_partial(self.gauss.link_function, self.Y, self.f)
+        #dlik_dvar = dparam_partial(self.gauss.dlik_dvar, self.Y, self.f)
+        #grad = GradientChecker(link, dlik_dvar, self.var, 'v')
+        #grad.constrain_positive('v')
+        #grad.randomize()
+        #grad.checkgrad(verbose=1)
         #self.assertTrue(grad.checkgrad())
+        self.assertTrue(grad_checker_wrt_params(self.gauss.link_function, self.gauss.dlik_dvar,
+            [self.var], args=(self.Y, self.f), randomize=True, verbose=True))
 
     def test_gaussian_dlik_df_dvar(self):
-        var = 0.1
-        gauss = GPy.likelihoods.functions.Gaussian(var, self.D, self.N)
+        #dlik_df = dparam_partial(self.gauss.dlik_df, self.Y, self.f)
+        #dlik_df_dvar = dparam_partial(self.gauss.dlik_df_dvar, self.Y, self.f)
+        #grad = GradientChecker(dlik_df, dlik_df_dvar, self.var, 'v')
+        #grad.constrain_positive('v')
+        #grad.randomize()
+        #grad.checkgrad(verbose=1)
+        #self.assertTrue(grad.checkgrad())
+        self.assertTrue(grad_checker_wrt_params(self.gauss.dlik_df, self.gauss.dlik_df_dvar,
+            [self.var], args=(self.Y, self.f), randomize=True, verbose=True))
 
-        dlik_df = dparam_partial(gauss.dlik_df, self.Y, self.f)
-        dlik_df_dvar = dparam_partial(gauss.dlik_df_dvar, self.Y, self.f)
-        grad = GradientChecker(dlik_df, dlik_df_dvar, var, 'v')
-        grad.constrain_positive('v')
+    def test_studentt_dlik_df(self):
+        link = functools.partial(self.stu_t.link_function, self.Y)
+        dlik_df = functools.partial(self.stu_t.dlik_df, self.Y)
+        grad = GradientChecker(link, dlik_df, self.f.copy(), 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+
+    def test_studentt_d2lik_d2f(self):
+        dlik_df = functools.partial(self.stu_t.dlik_df, self.Y)
+        d2lik_d2f = functools.partial(self.stu_t.d2lik_d2f, self.Y)
+        grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+
+    def test_studentt_d3lik_d3f(self):
+        d2lik_d2f = functools.partial(self.stu_t.d2lik_d2f, self.Y)
+        d3lik_d3f = functools.partial(self.stu_t.d3lik_d3f, self.Y)
+        grad = GradientChecker(d2lik_d2f, d3lik_d3f, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
-        #self.assertTrue(grad.checkgrad())
 
     def test_studentt_dlik_dvar(self):
-        var = 0.1
-        stu_t = GPy.likelihoods.functions.StudentT(deg_free=5, sigma2=var)
-
-        link = dparam_partial(stu_t.link_function, self.Y, self.f)
-        dlik_dvar = dparam_partial(stu_t.dlik_dvar, self.Y, self.f)
-        grad = GradientChecker(link, dlik_dvar, var, 'v')
-        grad.constrain_positive('v')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
+        #link = dparam_partial(self.stu_t.link_function, self.Y, self.f)
+        #dlik_dvar = dparam_partial(self.stu_t.dlik_dvar, self.Y, self.f)
+        #grad = GradientChecker(link, dlik_dvar, self.var, 'v')
+        #grad.constrain_positive('v')
+        #grad.randomize()
+        #grad.checkgrad(verbose=1)
         #self.assertTrue(grad.checkgrad())
+        self.assertTrue(grad_checker_wrt_params(self.stu_t.link_function, self.stu_t.dlik_dvar,
+            [self.var], args=(self.Y.copy(), self.f.copy()), randomize=True, verbose=True))
+
+    def test_studentt_dlik_df_dvar(self):
+        #dlik_df = dparam_partial(self.stu_t.dlik_df, self.Y, self.f)
+        #dlik_df_dvar = dparam_partial(self.stu_t.dlik_df_dvar, self.Y, self.f)
+        #grad = GradientChecker(dlik_df, dlik_df_dvar, self.var, 'v')
+        #grad.constrain_positive('v')
+        #grad.randomize()
+        #grad.checkgrad(verbose=1)
+        #self.assertTrue(grad.checkgrad())
+        self.assertTrue(grad_checker_wrt_params(self.stu_t.dlik_df, self.stu_t.dlik_df_dvar,
+            [self.var], args=(self.Y.copy(), self.f.copy()), randomize=True, verbose=True))
 
 if __name__ == "__main__":
+    #N = 5
+    #D = 1
+    #X = np.linspace(0, 1, N)[:, None]
+    #real_std = 0.2
+    #noise = np.random.randn(*X.shape)*real_std
+    #Y = np.sin(X*2*np.pi) + noise
+    #f = np.random.rand(N, 1)
+    #var = 0.1
+    #stu_t = GPy.likelihoods.functions.StudentT(deg_free=5, sigma2=var)
+
+    #print grad_checker_wrt_params(stu_t.dlik_df, stu_t.dlik_df_dvar, [var], args=(Y, f), randomize=True, verbose=False)
+
     print "Running unit tests"
     unittest.main()

From e36ffcba6e332b96bd400d53b811325469489aef Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 12 Sep 2013 15:08:02 +0100
Subject: [PATCH 082/252] All gradients now gradcheck

---
 GPy/likelihoods/likelihood_functions.py |  18 +--
 GPy/testing/laplace_tests.py            | 141 ++++++++++++------------
 2 files changed, 82 insertions(+), 77 deletions(-)

diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 330116de..39367734 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -291,6 +291,7 @@ class StudentT(LikelihoodFunction):
         """
         assert y.shape == f.shape
         e = y - f
+        #FIXME: OUT BY SOME FUNCTION OF N
         dlik_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
         return dlik_dvar
 
@@ -442,7 +443,7 @@ class Gaussian(LikelihoodFunction):
         self.I = np.eye(self.N)
         self.covariance_matrix = self.I * self._variance
         self.Ki = self.I*(1.0 / self._variance)
-        self.ln_K = np.trace(self.covariance_matrix)
+        self.ln_det_K = np.sum(np.log(np.diag(self.covariance_matrix)))
 
     def link_function(self, y, f, extra_data=None):
         """link_function $\ln p(y|f)$
@@ -458,11 +459,11 @@ class Gaussian(LikelihoodFunction):
         e = y - f
         eeT = np.dot(e, e.T)
         objective = (- 0.5*self.D*np.log(2*np.pi)
-                     - 0.5*self.ln_K
-                     #- 0.5*np.sum(np.multiply(self.Ki, eeT))
-                     - 0.5*np.dot(np.dot(e.T, self.Ki), e)
+                     - 0.5*self.ln_det_K
+                     #- 0.5*np.dot(np.dot(e.T, self.Ki), e)
+                     - (0.5/self._variance)*np.dot(e.T, e) # As long as K is diagonal
                      )
-        return np.sum(objective) # FIXME: put this back!
+        return np.sum(objective)
 
     def dlik_df(self, y, f, extra_data=None):
         """
@@ -514,7 +515,8 @@ class Gaussian(LikelihoodFunction):
         assert y.shape == f.shape
         e = y - f
         s_4 = 1.0/(self._variance**2)
-        dlik_dsigma = -0.5*self.N/self._variance + 0.5*s_4*np.trace(np.dot(e.T, np.dot(self.I, e)))
+        dlik_dsigma = -0.5*self.N/self._variance + 0.5*s_4*np.dot(e.T, e)
+        #dlik_dsigma = -0.5*self.N + 0.5*s_4*np.dot(e.T, e)
         return dlik_dsigma
 
     def dlik_df_dvar(self, y, f, extra_data=None):
@@ -523,7 +525,7 @@ class Gaussian(LikelihoodFunction):
         """
         assert y.shape == f.shape
         s_4 = 1.0/(self._variance**2)
-        dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f))
+        dlik_grad_dsigma = -np.dot(s_4*self.I, y) + np.dot(s_4*self.I, f)
         return dlik_grad_dsigma
 
     def d2lik_d2f_dvar(self, y, f, extra_data=None):
@@ -533,7 +535,7 @@ class Gaussian(LikelihoodFunction):
         $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
         """
         assert y.shape == f.shape
-        dlik_hess_dsigma = 0.5*np.diag((1.0/(self._variance**2))*self.I)[:, None]
+        dlik_hess_dsigma = np.diag((1.0/(self._variance**2))*self.I)[:, None]
         return dlik_hess_dsigma
 
     def _gradients(self, y, f, extra_data=None):
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index 2db83c25..7fc6f2f4 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -3,6 +3,7 @@ import unittest
 import GPy
 from GPy.models import GradientChecker
 import functools
+import inspect
 
 def dparam_partial(inst_func, *args):
     """
@@ -22,66 +23,71 @@ def dparam_partial(inst_func, *args):
         return inst_func(*args)
     return functools.partial(param_func, inst_func=inst_func, args=args)
 
-def grad_checker_wrt_params(func, dfunc, params, args, randomize=False, verbose=False):
+def dparam_checkgrad(func, dfunc, params, args, constrain_positive=True, randomize=False, verbose=False):
     """
     checkgrad expects a f: R^N -> R^1 and df: R^N -> R^N
     However if we are holding other parameters fixed and moving something else
-    We need to check the gradient of each of the fixed parameters (f and y for example) seperately
-    Whilst moving another parameter. otherwise f: gives back R^N and df: gives back R^NxM where M is
+    We need to check the gradient of each of the fixed parameters
+    (f and y for example) seperately.
+    Whilst moving another parameter. otherwise f: gives back R^N and
+    df: gives back R^NxM where M is
     The number of parameters and N is the number of data
     Need to take a slice out from f and a slice out of df
     """
-    print "{} likelihood: {} vs {}".format(func.im_self.__class__.__name__,
-                                           func.__name__, dfunc.__name__)
+    #print "\n{} likelihood: {} vs {}".format(func.im_self.__class__.__name__,
+                                           #func.__name__, dfunc.__name__)
     partial_f = dparam_partial(func, *args)
     partial_df = dparam_partial(dfunc, *args)
-    gradchecked = False
+    gradchecking = True
     for param in params:
         fnum = np.atleast_1d(partial_f(param)).shape[0]
         dfnum = np.atleast_1d(partial_df(param)).shape[0]
         for fixed_val in range(dfnum):
-            f_ind = min(fnum, fixed_val+1) - 1 #dlik and dlik_dvar gives back 1 value for each
+            #dlik and dlik_dvar gives back 1 value for each
+            f_ind = min(fnum, fixed_val+1) - 1
             grad = GradientChecker(lambda x: np.atleast_1d(partial_f(x))[f_ind],
                                    lambda x : np.atleast_1d(partial_df(x))[fixed_val],
                                    param, 'p')
-            grad.constrain_positive('p')
+            if constrain_positive:
+                grad.constrain_positive('p')
             if randomize:
                 grad.randomize()
+            print grad
             if verbose:
                 grad.checkgrad(verbose=1)
-            cg = grad.checkgrad()
-            print cg
-            if cg:
-                print "True"
-                gradchecked = True
-            else:
-                print "False"
-                return False
-    print str(gradchecked)
-    return gradchecked
+            if not grad.checkgrad():
+                gradchecking = False
+
+    return gradchecking
 
 
 class LaplaceTests(unittest.TestCase):
     def setUp(self):
-        self.N = 5
-        self.D = 1
+        self.N = 1
+        self.D = 5
         self.X = np.linspace(0, 1, self.N)[:, None]
 
         self.real_std = 0.2
         noise = np.random.randn(*self.X.shape)*self.real_std
         self.Y = np.sin(self.X*2*np.pi) + noise
+        #self.Y = np.array([[1.0]])#np.sin(self.X*2*np.pi) + noise
 
         self.f = np.random.rand(self.N, 1)
+        #self.f = np.array([[3.0]])#np.sin(self.X*2*np.pi) + noise
 
-        self.var = 0.1
+        self.var = np.random.rand(1)
         self.stu_t = GPy.likelihoods.functions.StudentT(deg_free=5, sigma2=self.var)
         self.gauss = GPy.likelihoods.functions.Gaussian(self.var, self.D, self.N)
 
     def tearDown(self):
         self.stu_t = None
         self.gauss = None
+        self.Y = None
+        self.f = None
+        self.X = None
 
     def test_gaussian_dlik_df(self):
+        print "\n{}".format(inspect.stack()[0][3])
         link = functools.partial(self.gauss.link_function, self.Y)
         dlik_df = functools.partial(self.gauss.dlik_df, self.Y)
         grad = GradientChecker(link, dlik_df, self.f.copy(), 'f')
@@ -90,6 +96,7 @@ class LaplaceTests(unittest.TestCase):
         self.assertTrue(grad.checkgrad())
 
     def test_gaussian_d2lik_d2f(self):
+        print "\n{}".format(inspect.stack()[0][3])
         dlik_df = functools.partial(self.gauss.dlik_df, self.Y)
         d2lik_d2f = functools.partial(self.gauss.d2lik_d2f, self.Y)
         grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
@@ -98,6 +105,7 @@ class LaplaceTests(unittest.TestCase):
         self.assertTrue(grad.checkgrad())
 
     def test_gaussian_d3lik_d3f(self):
+        print "\n{}".format(inspect.stack()[0][3])
         d2lik_d2f = functools.partial(self.gauss.d2lik_d2f, self.Y)
         d3lik_d3f = functools.partial(self.gauss.d3lik_d3f, self.Y)
         grad = GradientChecker(d2lik_d2f, d3lik_d3f, self.f.copy(), 'f')
@@ -106,28 +114,31 @@ class LaplaceTests(unittest.TestCase):
         self.assertTrue(grad.checkgrad())
 
     def test_gaussian_dlik_dvar(self):
-        #link = dparam_partial(self.gauss.link_function, self.Y, self.f)
-        #dlik_dvar = dparam_partial(self.gauss.dlik_dvar, self.Y, self.f)
-        #grad = GradientChecker(link, dlik_dvar, self.var, 'v')
-        #grad.constrain_positive('v')
-        #grad.randomize()
-        #grad.checkgrad(verbose=1)
-        #self.assertTrue(grad.checkgrad())
-        self.assertTrue(grad_checker_wrt_params(self.gauss.link_function, self.gauss.dlik_dvar,
-            [self.var], args=(self.Y, self.f), randomize=True, verbose=True))
+        print "\n{}".format(inspect.stack()[0][3])
+        self.assertTrue(
+                dparam_checkgrad(self.gauss.link_function, self.gauss.dlik_dvar,
+                    [self.var], args=(self.Y, self.f), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
 
     def test_gaussian_dlik_df_dvar(self):
-        #dlik_df = dparam_partial(self.gauss.dlik_df, self.Y, self.f)
-        #dlik_df_dvar = dparam_partial(self.gauss.dlik_df_dvar, self.Y, self.f)
-        #grad = GradientChecker(dlik_df, dlik_df_dvar, self.var, 'v')
-        #grad.constrain_positive('v')
-        #grad.randomize()
-        #grad.checkgrad(verbose=1)
-        #self.assertTrue(grad.checkgrad())
-        self.assertTrue(grad_checker_wrt_params(self.gauss.dlik_df, self.gauss.dlik_df_dvar,
-            [self.var], args=(self.Y, self.f), randomize=True, verbose=True))
+        print "\n{}".format(inspect.stack()[0][3])
+        self.assertTrue(
+                dparam_checkgrad(self.gauss.dlik_df, self.gauss.dlik_df_dvar,
+                    [self.var], args=(self.Y.copy(), self.f.copy()), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
+
+    def test_gaussian_d2lik_d2f_dvar(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.assertTrue(
+                dparam_checkgrad(self.gauss.d2lik_d2f, self.gauss.d2lik_d2f_dvar,
+                    [self.var], args=(self.Y, self.f), constrain_positive=True,
+                    randomize=True, verbose=True)
+                )
 
     def test_studentt_dlik_df(self):
+        print "\n{}".format(inspect.stack()[0][3])
         link = functools.partial(self.stu_t.link_function, self.Y)
         dlik_df = functools.partial(self.stu_t.dlik_df, self.Y)
         grad = GradientChecker(link, dlik_df, self.f.copy(), 'f')
@@ -135,6 +146,7 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
 
     def test_studentt_d2lik_d2f(self):
+        print "\n{}".format(inspect.stack()[0][3])
         dlik_df = functools.partial(self.stu_t.dlik_df, self.Y)
         d2lik_d2f = functools.partial(self.stu_t.d2lik_d2f, self.Y)
         grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
@@ -142,6 +154,7 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
 
     def test_studentt_d3lik_d3f(self):
+        print "\n{}".format(inspect.stack()[0][3])
         d2lik_d2f = functools.partial(self.stu_t.d2lik_d2f, self.Y)
         d3lik_d3f = functools.partial(self.stu_t.d3lik_d3f, self.Y)
         grad = GradientChecker(d2lik_d2f, d3lik_d3f, self.f.copy(), 'f')
@@ -149,39 +162,29 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
 
     def test_studentt_dlik_dvar(self):
-        #link = dparam_partial(self.stu_t.link_function, self.Y, self.f)
-        #dlik_dvar = dparam_partial(self.stu_t.dlik_dvar, self.Y, self.f)
-        #grad = GradientChecker(link, dlik_dvar, self.var, 'v')
-        #grad.constrain_positive('v')
-        #grad.randomize()
-        #grad.checkgrad(verbose=1)
-        #self.assertTrue(grad.checkgrad())
-        self.assertTrue(grad_checker_wrt_params(self.stu_t.link_function, self.stu_t.dlik_dvar,
-            [self.var], args=(self.Y.copy(), self.f.copy()), randomize=True, verbose=True))
+        print "\n{}".format(inspect.stack()[0][3])
+        self.assertTrue(
+                dparam_checkgrad(self.stu_t.link_function, self.stu_t.dlik_dvar,
+                    [self.var], args=(self.Y.copy(), self.f.copy()),
+                    constrain_positive=True, randomize=True, verbose=True)
+                )
 
     def test_studentt_dlik_df_dvar(self):
-        #dlik_df = dparam_partial(self.stu_t.dlik_df, self.Y, self.f)
-        #dlik_df_dvar = dparam_partial(self.stu_t.dlik_df_dvar, self.Y, self.f)
-        #grad = GradientChecker(dlik_df, dlik_df_dvar, self.var, 'v')
-        #grad.constrain_positive('v')
-        #grad.randomize()
-        #grad.checkgrad(verbose=1)
-        #self.assertTrue(grad.checkgrad())
-        self.assertTrue(grad_checker_wrt_params(self.stu_t.dlik_df, self.stu_t.dlik_df_dvar,
-            [self.var], args=(self.Y.copy(), self.f.copy()), randomize=True, verbose=True))
+        print "\n{}".format(inspect.stack()[0][3])
+        self.assertTrue(
+                dparam_checkgrad(self.stu_t.dlik_df, self.stu_t.dlik_df_dvar,
+                    [self.var], args=(self.Y.copy(), self.f.copy()),
+                    constrain_positive=True, randomize=True, verbose=True)
+                )
+
+    def test_studentt_d2lik_d2f_dvar(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.assertTrue(
+                dparam_checkgrad(self.stu_t.d2lik_d2f, self.stu_t.d2lik_d2f_dvar,
+                    [self.var], args=(self.Y.copy(), self.f.copy()),
+                    constrain_positive=True, randomize=True, verbose=True)
+                )
 
 if __name__ == "__main__":
-    #N = 5
-    #D = 1
-    #X = np.linspace(0, 1, N)[:, None]
-    #real_std = 0.2
-    #noise = np.random.randn(*X.shape)*real_std
-    #Y = np.sin(X*2*np.pi) + noise
-    #f = np.random.rand(N, 1)
-    #var = 0.1
-    #stu_t = GPy.likelihoods.functions.StudentT(deg_free=5, sigma2=var)
-
-    #print grad_checker_wrt_params(stu_t.dlik_df, stu_t.dlik_df_dvar, [var], args=(Y, f), randomize=True, verbose=False)
-
     print "Running unit tests"
     unittest.main()

From b663fff622fe325b320c6cb4655ec315cd97dbba Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 13 Sep 2013 14:34:28 +0100
Subject: [PATCH 083/252] Now checkgrads for gaussian, and ALMOST for student t

---
 GPy/examples/laplace_approximations.py |  67 ++++++++++----
 GPy/likelihoods/laplace.py             | 123 +++++++++++++++----------
 2 files changed, 119 insertions(+), 71 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 50e1858b..e8af74eb 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -1,6 +1,7 @@
 import GPy
 import numpy as np
 import matplotlib.pyplot as plt
+from GPy.util import datasets
 np.random.seed(1)
 
 def timing():
@@ -405,7 +406,7 @@ def student_t_approx():
     """
     real_std = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 50)[:, None]
+    X = np.linspace(0.0, 10.0, 100)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_std
     Yc = Y.copy()
 
@@ -422,7 +423,7 @@ def student_t_approx():
     #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
-    deg_free = 8
+    deg_free = 5
     print "Real noise: ", real_std
 
     initial_var_guess = 0.1
@@ -456,11 +457,13 @@ def student_t_approx():
     m = GPy.models.GPRegression(X, Y, kernel=kernel1)
     # optimize
     m.ensure_default_constraints()
+    m.randomize()
     m.optimize()
     # plot
-    plt.subplot(211)
-    m.plot()
+    ax = plt.subplot(211)
+    m.plot(ax=ax)
     plt.plot(X_full, Y_full)
+    plt.ylim(-1.5, 1.5)
     plt.title('Gaussian clean')
     print m
 
@@ -468,16 +471,18 @@ def student_t_approx():
     print "Corrupt Gaussian"
     m = GPy.models.GPRegression(X, Yc, kernel=kernel2)
     m.ensure_default_constraints()
-    #m.optimize()
-    plt.subplot(212)
-    m.plot()
+    m.randomize()
+    m.optimize()
+    ax = plt.subplot(212)
+    m.plot(ax=ax)
     plt.plot(X_full, Y_full)
+    plt.ylim(-1.5, 1.5)
     plt.title('Gaussian corrupt')
     print m
 
     plt.figure(2)
     plt.suptitle('Student-t likelihood')
-    edited_real_sd = real_std #initial_var_guess
+    edited_real_sd = initial_var_guess
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
@@ -486,13 +491,14 @@ def student_t_approx():
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
     m.randomize()
-    m.update_likelihood_approximation()
+    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+    #m.update_likelihood_approximation()
     m.optimize()
     print(m)
-    plt.subplot(222)
-    m.plot()
+    ax = plt.subplot(211)
+    m.plot(ax=ax)
     plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
+    plt.ylim(-1.5, 1.5)
     plt.title('Student-t rasm clean')
 
     print "Corrupt student t, rasm"
@@ -502,15 +508,17 @@ def student_t_approx():
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
     m.randomize()
-    m.update_likelihood_approximation()
+    #m.update_likelihood_approximation()
+    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
     m.optimize()
     print(m)
-    plt.subplot(224)
-    m.plot()
+    ax = plt.subplot(212)
+    m.plot(ax=ax)
     plt.plot(X_full, Y_full)
-    plt.ylim(-2.5, 2.5)
+    plt.ylim(-1.5, 1.5)
     plt.title('Student-t rasm corrupt')
 
+    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
     return m
 
     #print "Clean student t, ncg"
@@ -607,7 +615,6 @@ def gaussian_f_check():
     mgp.optimize()
     print "Gaussian"
     print mgp
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
     kernelg = kernelgp.copy()
     #kernelst += GPy.kern.bias(X.shape[1])
@@ -615,6 +622,7 @@ def gaussian_f_check():
     g_distribution = GPy.likelihoods.functions.Gaussian(variance=0.1, N=N, D=D)
     g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm')
     m = GPy.models.GPRegression(X, Y, kernelg, likelihood=g_likelihood)
+    m.likelihood.X = X
     #m['rbf_v'] = mgp._get_params()[0]
     #m['rbf_l'] = mgp._get_params()[1] + 1
     m.ensure_default_constraints()
@@ -623,18 +631,37 @@ def gaussian_f_check():
     #m.constrain_bounded('t_no', 2*real_std**2, 1e3)
     #m.constrain_positive('bias')
     m.constrain_positive('noise_var')
+    #m['noise_variance'] = 0.1
+    #m.likelihood.X = X
     m.randomize()
     import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
-    m['noise_variance'] = 0.1
-    #m.likelihood.X = X
     plt.figure()
     ax = plt.subplot(211)
     m.plot(ax=ax)
-    ax = plt.subplot(212)
+
     m.optimize()
+    ax = plt.subplot(212)
     m.plot(ax=ax)
+
     print "final optimised gaussian"
     print m
     print "real GP"
     print mgp
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+def boston_example():
+    data = datasets.boston_housing()
+    X = data['X'].copy()
+    Y = data['Y'].copy()
+    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
+    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
+    mgp.ensure_default_constraints()
+    mgp.randomize()
+    mgp.optimize()
+    mgp.plot()
+    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+
+def plot_f_approx(model):
+    plt.figure()
+    model.plot(ax=plt.gca())
+    plt.plot(model.X, model.likelihood.f_hat, c='g')
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 2f98b2ff..2897e1de 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -7,6 +7,7 @@ from likelihood import likelihood
 from ..util.linalg import pdinv, mdot, jitchol, chol_inv, pddet
 from scipy.linalg.lapack import dtrtrs
 import random
+from functools import partial
 #import pylab as plt
 
 class Laplace(likelihood):
@@ -87,11 +88,15 @@ class Laplace(likelihood):
 
         #Implicit
         impl = mdot(dlp, dL_dfhat, I_KW_i)
-        expl_a = mdot(self.Ki_f, self.Ki_f.T)
+        #expl_a = mdot(self.Ki_f, self.Ki_f.T)
+        expl_a = np.dot(self.Ki_f, self.Ki_f.T)
         expl_b = self.Wi_K_i
         #print "expl_a: {}, expl_b: {}".format(expl_a, expl_b)
-        expl = 0.5*expl_a + 0.5*expl_b # Might need to be -?
-        dL_dthetaK_exp = dK_dthetaK(expl, X)
+        #expl = 0.5*expl_a - 0.5*expl_b # Might need to be -?
+        #dL_dthetaK_exp = dK_dthetaK(expl, X)
+        dL_dthetaK_exp_a = dK_dthetaK(expl_a, X)
+        dL_dthetaK_exp_b = dK_dthetaK(expl_b, X)
+        dL_dthetaK_exp = 0.5*dL_dthetaK_exp_a - 0.5*dL_dthetaK_exp_b
         dL_dthetaK_imp = dK_dthetaK(impl, X)
         #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
         #print "expl_a: {}, {}     expl_b: {}, {}".format(np.mean(expl_a), np.std(expl_a), np.mean(expl_b), np.std(expl_b))
@@ -116,7 +121,13 @@ class Laplace(likelihood):
             #b = 0.5*np.dot(np.diag(e).T, d)
             #g = 0.5*(np.diag(self.K) - np.sum(cho_solve((self.B_chol, True), np.dot(np.diagflat(self.W_12),self.K))**2, 1))
             #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - np.dot(g.T, dlik_hess_dthetaL[thetaL_i])
-            dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+
+            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
+            dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
+                             #- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
+                             + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[thetaL_i])
+                             )
+            import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
 
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
@@ -168,22 +179,31 @@ class Laplace(likelihood):
         Y_tilde = Wi*self.Ki_f + self.f_hat
 
         self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
+        #self.Wi_K_i, _, _, self.ln_det_Wi_K = pdinv(self.Sigma_tilde + self.K) # TODO: Check if Wi_K_i == R above and same with det below
+        self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
+
         #self.Wi_K_i[self.Wi_K_i< 1e-6] = 1e-6
 
-        self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
+        #self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
         self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
 
         self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-        self.aA = 0.5*self.ln_det_K_Wi__Bi
-        self.bB = - 0.5*self.f_Ki_f
-        self.cC = 0.5*self.y_Wi_Ki_i_y
+        #self.aA = 0.5*self.ln_det_K_Wi__Bi
+        #self.bB = - 0.5*self.f_Ki_f
+        #self.cC = 0.5*self.y_Wi_Ki_i_y
         Z_tilde = (+ self.lik
-                   + 0.5*self.ln_det_K_Wi__Bi
+                    #+ 0.5*self.ln_det_K_Wi__Bi
+                   - 0.5*self.ln_B_det
+                   + 0.5*self.ln_det_Wi_K
                    - 0.5*self.f_Ki_f
                    + 0.5*self.y_Wi_Ki_i_y
                   )
-        print "Ztilde: {} lik: {} a: {} b: {} c: {}".format(Z_tilde, self.lik, self.aA, self.bB, self.cC)
-        print self.likelihood_function._get_params()
+        #self.aA = 0.5*self.ln_det_Wi_K
+        #self.bB = - 0.5*self.f_Ki_f
+        #self.cC = 0.5*self.y_Wi_Ki_i_y
+        #self.dD = -0.5*self.ln_B_det
+        #print "Ztilde: {} lik: {} a: {} b: {} c: {} d:".format(Z_tilde, self.lik, self.aA, self.bB, self.cC, self.dD)
+        print "param value: {}".format(self.likelihood_function._get_params())
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -222,7 +242,7 @@ class Laplace(likelihood):
 
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
         self.B, self.B_chol, self.W_12 = self._compute_B_statistics(self.K, self.W)
-        self.Bi, _, _, B_det = pdinv(self.B)
+        self.Bi, _, _, self.ln_B_det = pdinv(self.B)
 
         #Do the computation again at f to get Ki_f which is useful
         #b = self.W*self.f_hat + self.likelihood_function.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
@@ -234,7 +254,7 @@ class Laplace(likelihood):
         self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
 
         #For det, |I + KW| == |I + W_12*K*W_12|
-        self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
+        #self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
 
         #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
         #self.ln_z_hat = (- 0.5*self.f_Ki_f
@@ -299,7 +319,7 @@ class Laplace(likelihood):
 
     def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
         """
-        Rasmussens numerically stable mode finding
+        Rasmussen's numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
 
         :K: Covariance matrix
@@ -308,7 +328,7 @@ class Laplace(likelihood):
         :returns: f_mode
         """
         self.old_before_s = self.likelihood_function._get_params()
-        print "before: ", self.old_before_s
+        #print "before: ", self.old_before_s
         #if self.old_before_s < 1e-5:
             #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
@@ -351,42 +371,42 @@ class Laplace(likelihood):
             full_step_a = b - W_12*solve_L
             da = full_step_a - old_a
 
-            #f_old = f.copy()
-            #def inner_obj(step_size, old_a, da, K):
-                #a = old_a + step_size*da
-                #f = np.dot(K, a)
-                #self.a = a.copy() # This is nasty, need to set something within an optimization though
-                #self.f = f.copy()
-                #return -obj(a, f)
-
-            #from functools import partial
-            #i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
-            ##new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
-            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
-            #f = self.f.copy()
-            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-
             f_old = f.copy()
-            update_passed = False
-            while not update_passed:
+            def inner_obj(step_size, old_a, da, K):
                 a = old_a + step_size*da
                 f = np.dot(K, a)
+                self.a = a.copy() # This is nasty, need to set something within an optimization though
+                self.f = f.copy()
+                return -obj(a, f)
 
-                old_obj = new_obj
-                new_obj = obj(a, f)
-                difference = new_obj - old_obj
-                print "difference: ",difference
-                if difference < 0:
-                    #print "Objective function rose", np.float(difference)
-                    #If the objective function isn't rising, restart optimization
-                    step_size *= 0.8
-                    #print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
-                    #objective function isn't increasing, try reducing step size
-                    f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
-                    old_obj = new_obj
-                    rs += 1
-                else:
-                    update_passed = True
+            i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
+            #new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
+            new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
+            f = self.f.copy()
+            a = self.a.copy()
+            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
+
+            #f_old = f.copy()
+            #update_passed = False
+            #while not update_passed:
+                #a = old_a + step_size*da
+                #f = np.dot(K, a)
+
+                #old_obj = new_obj
+                #new_obj = obj(a, f)
+                #difference = new_obj - old_obj
+                ##print "difference: ",difference
+                #if difference < 0:
+                    ##print "Objective function rose", np.float(difference)
+                    ##If the objective function isn't rising, restart optimization
+                    #step_size *= 0.8
+                    ##print "Reducing step-size to {ss:.3} and restarting optimization".format(ss=step_size)
+                    ##objective function isn't increasing, try reducing step size
+                    #f = f_old.copy() #it's actually faster not to go back to old location and just zigzag across the mode
+                    #old_obj = new_obj
+                    #rs += 1
+                #else:
+                    #update_passed = True
 
             #difference = abs(new_obj - old_obj)
             #old_obj = new_obj.copy()
@@ -400,10 +420,11 @@ class Laplace(likelihood):
         self.old_a = old_a.copy()
         #print "Positive difference obj: ", np.float(difference)
         #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
-        print "Iterations: {}, Final_difference: {}".format(i, difference)
+        #print "Iterations: {}, Final_difference: {}".format(i, difference)
         if difference > 1e-4:
-            print "FAIL FAIL FAIL FAIL FAIL FAIL"
-            if False:
+        #if True:
+            #print "Not perfect f_hat fit difference: {}".format(difference)
+            if True:
                 import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
                 if hasattr(self, 'X'):
                     import pylab as pb
@@ -449,7 +470,7 @@ class Laplace(likelihood):
         self.old_ff = f.copy()
         self.old_K = self.K.copy()
         self.old_s = self.likelihood_function._get_params()
-        print "after: ", self.old_s
+        #print "after: ", self.old_s
         #print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
         self.a = a
         #self.B, self.B_chol, self.W_12 = B, L, W_12

From 5e88a885b127163a83336b3773894a2f76a924e9 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 13 Sep 2013 18:01:41 +0100
Subject: [PATCH 084/252] Student t likelihood function checkgrads (summed
 gradients wrt to sigma2), maybe some numerical instability in laplace

---
 GPy/likelihoods/laplace.py              |  5 +----
 GPy/likelihoods/likelihood_functions.py | 18 +++++++---------
 GPy/testing/laplace_tests.py            | 28 ++++++++++++++++++++++---
 3 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 2897e1de..7cc4834a 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -127,7 +127,6 @@ class Laplace(likelihood):
                              #- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
                              + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[thetaL_i])
                              )
-            import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
 
             #Implicit
             df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
@@ -203,7 +202,7 @@ class Laplace(likelihood):
         #self.cC = 0.5*self.y_Wi_Ki_i_y
         #self.dD = -0.5*self.ln_B_det
         #print "Ztilde: {} lik: {} a: {} b: {} c: {} d:".format(Z_tilde, self.lik, self.aA, self.bB, self.cC, self.dD)
-        print "param value: {}".format(self.likelihood_function._get_params())
+        #print "param value: {}".format(self.likelihood_function._get_params())
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -330,7 +329,6 @@ class Laplace(likelihood):
         self.old_before_s = self.likelihood_function._get_params()
         #print "before: ", self.old_before_s
         #if self.old_before_s < 1e-5:
-            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
         #old_a = np.zeros((self.N, 1))
         if self.old_a is None:
@@ -384,7 +382,6 @@ class Laplace(likelihood):
             new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
             f = self.f.copy()
             a = self.a.copy()
-            #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
             #f_old = f.copy()
             #update_passed = False
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 39367734..b2f9ded7 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -218,16 +218,11 @@ class StudentT(LikelihoodFunction):
         """
         assert y.shape == f.shape
         e = y - f
-        #A = gammaln((self.v + 1) * 0.5)
-        #B = - gammaln(self.v * 0.5)
-        #C = - 0.5*np.log(self.sigma2 * self.v * np.pi)
-        #D = + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - 0.5*np.log(self.sigma2 * self.v * np.pi)
-                     + (-(self.v + 1)*0.5)*np.log(1 + ((e**2)/self.sigma2)/np.float(self.v))
+                     - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
                     )
-        #print "C: {} D: {} obj: {}".format(C, np.sum(D), objective.sum())
         return np.sum(objective)
 
     def dlik_df(self, y, f, extra_data=None):
@@ -291,9 +286,13 @@ class StudentT(LikelihoodFunction):
         """
         assert y.shape == f.shape
         e = y - f
-        #FIXME: OUT BY SOME FUNCTION OF N
+        #FIXME: OUT BY SOME FUNCTION OF N, or the fact that we are summing over several things in the objective?
         dlik_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
-        return dlik_dvar
+        #dlik_dvar = ( 0.5*(1/float(self.sigma2))
+                     #-0.5*(self.v + 1)*(-(1/float(self.v))*(e**2)/(1/(float(self.sigma2**2))))
+                     #/ (1 + (1/float(self.v))*((e**2)/float(self.sigma2)))
+                     #)
+        return np.sum(dlik_dvar) #May not want to sum over all dimensions if using many D?
 
     def dlik_df_dvar(self, y, f, extra_data=None):
         """
@@ -516,8 +515,7 @@ class Gaussian(LikelihoodFunction):
         e = y - f
         s_4 = 1.0/(self._variance**2)
         dlik_dsigma = -0.5*self.N/self._variance + 0.5*s_4*np.dot(e.T, e)
-        #dlik_dsigma = -0.5*self.N + 0.5*s_4*np.dot(e.T, e)
-        return dlik_dsigma
+        return np.sum(dlik_dsigma) # Sure about this sum?
 
     def dlik_df_dvar(self, y, f, extra_data=None):
         """
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index 7fc6f2f4..a52cc3cd 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -45,6 +45,7 @@ def dparam_checkgrad(func, dfunc, params, args, constrain_positive=True, randomi
         for fixed_val in range(dfnum):
             #dlik and dlik_dvar gives back 1 value for each
             f_ind = min(fnum, fixed_val+1) - 1
+            print "fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val)
             grad = GradientChecker(lambda x: np.atleast_1d(partial_f(x))[f_ind],
                                    lambda x : np.atleast_1d(partial_df(x))[fixed_val],
                                    param, 'p')
@@ -63,9 +64,9 @@ def dparam_checkgrad(func, dfunc, params, args, constrain_positive=True, randomi
 
 class LaplaceTests(unittest.TestCase):
     def setUp(self):
-        self.N = 1
-        self.D = 5
-        self.X = np.linspace(0, 1, self.N)[:, None]
+        self.N = 5
+        self.D = 1
+        self.X = np.linspace(0, self.D, self.N)[:, None]
 
         self.real_std = 0.2
         noise = np.random.randn(*self.X.shape)*self.real_std
@@ -104,6 +105,27 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
+    def test_gaussian_d2lik_d2f_2(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.Y = None
+        self.gauss = None
+
+        self.N = 2
+        self.D = 1
+        self.X = np.linspace(0, self.D, self.N)[:, None]
+        self.real_std = 0.2
+        noise = np.random.randn(*self.X.shape)*self.real_std
+        self.Y = np.sin(self.X*2*np.pi) + noise
+        self.f = np.random.rand(self.N, 1)
+        self.gauss = GPy.likelihoods.functions.Gaussian(self.var, self.D, self.N)
+
+        dlik_df = functools.partial(self.gauss.dlik_df, self.Y)
+        d2lik_d2f = functools.partial(self.gauss.d2lik_d2f, self.Y)
+        grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
+
     def test_gaussian_d3lik_d3f(self):
         print "\n{}".format(inspect.stack()[0][3])
         d2lik_d2f = functools.partial(self.gauss.d2lik_d2f, self.Y)

From 5a8033b0164e421c70e4c1c5b461968e14b54f74 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 16 Sep 2013 13:01:13 +0100
Subject: [PATCH 085/252] Tidying up

---
 GPy/likelihoods/laplace.py              | 2 +-
 GPy/likelihoods/likelihood_functions.py | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 7cc4834a..1d282b8d 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -76,6 +76,7 @@ class Laplace(likelihood):
         #FIXME: Careful of side effects! And make sure W and K are up to date!
         d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
         dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T
+        import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
         I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
         return dL_dfhat, I_KW_i
 
@@ -88,7 +89,6 @@ class Laplace(likelihood):
 
         #Implicit
         impl = mdot(dlp, dL_dfhat, I_KW_i)
-        #expl_a = mdot(self.Ki_f, self.Ki_f.T)
         expl_a = np.dot(self.Ki_f, self.Ki_f.T)
         expl_b = self.Wi_K_i
         #print "expl_a: {}, expl_b: {}".format(expl_a, expl_b)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index b2f9ded7..dbdd3fa6 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -286,12 +286,7 @@ class StudentT(LikelihoodFunction):
         """
         assert y.shape == f.shape
         e = y - f
-        #FIXME: OUT BY SOME FUNCTION OF N, or the fact that we are summing over several things in the objective?
         dlik_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
-        #dlik_dvar = ( 0.5*(1/float(self.sigma2))
-                     #-0.5*(self.v + 1)*(-(1/float(self.v))*(e**2)/(1/(float(self.sigma2**2))))
-                     #/ (1 + (1/float(self.v))*((e**2)/float(self.sigma2)))
-                     #)
         return np.sum(dlik_dvar) #May not want to sum over all dimensions if using many D?
 
     def dlik_df_dvar(self, y, f, extra_data=None):

From ebfff6c832b9dcf230ba870c3cc5a5594fef73c9 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 18 Sep 2013 13:18:28 +0100
Subject: [PATCH 086/252] Added some stability and tidied up

---
 GPy/likelihoods/laplace.py   | 85 +++++++++++++-----------------------
 GPy/testing/laplace_tests.py | 56 +++++++++++++++++++++++-
 2 files changed, 84 insertions(+), 57 deletions(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 1d282b8d..f8569c52 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -4,7 +4,7 @@ import GPy
 from scipy.linalg import inv, cho_solve, det
 from numpy.linalg import cond
 from likelihood import likelihood
-from ..util.linalg import pdinv, mdot, jitchol, chol_inv, pddet
+from ..util.linalg import pdinv, mdot, jitchol, chol_inv, pddet, dtrtrs
 from scipy.linalg.lapack import dtrtrs
 import random
 from functools import partial
@@ -46,7 +46,6 @@ class Laplace(likelihood):
 
         self.restart()
 
-
     def restart(self):
         #Initial values for the GP variables
         self.Y = np.zeros((self.N, 1))
@@ -57,7 +56,6 @@ class Laplace(likelihood):
 
         self.old_a = None
 
-
     def predictive_values(self, mu, var, full_cov):
         if full_cov:
             raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
@@ -73,10 +71,8 @@ class Laplace(likelihood):
         return self.likelihood_function._set_params(p)
 
     def _shared_gradients_components(self):
-        #FIXME: Careful of side effects! And make sure W and K are up to date!
-        d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat)
-        dL_dfhat = -0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T
-        import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+        d3lik_d3fhat = self.likelihood_function.d3lik_d3f(self.data, self.f_hat, extra_data=self.extra_data)
+        dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
         I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
         return dL_dfhat, I_KW_i
 
@@ -87,19 +83,16 @@ class Laplace(likelihood):
         dL_dfhat, I_KW_i = self._shared_gradients_components()
         dlp = self.likelihood_function.dlik_df(self.data, self.f_hat)
 
-        #Implicit
-        impl = mdot(dlp, dL_dfhat, I_KW_i)
+        #Explicit
         expl_a = np.dot(self.Ki_f, self.Ki_f.T)
         expl_b = self.Wi_K_i
-        #print "expl_a: {}, expl_b: {}".format(expl_a, expl_b)
-        #expl = 0.5*expl_a - 0.5*expl_b # Might need to be -?
-        #dL_dthetaK_exp = dK_dthetaK(expl, X)
-        dL_dthetaK_exp_a = dK_dthetaK(expl_a, X)
-        dL_dthetaK_exp_b = dK_dthetaK(expl_b, X)
-        dL_dthetaK_exp = 0.5*dL_dthetaK_exp_a - 0.5*dL_dthetaK_exp_b
+        expl = 0.5*expl_a - 0.5*expl_b
+        dL_dthetaK_exp = dK_dthetaK(expl, X)
+
+        #Implicit
+        impl = mdot(dlp, dL_dfhat, I_KW_i)
         dL_dthetaK_imp = dK_dthetaK(impl, X)
-        #print "dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
-        #print "expl_a: {}, {}     expl_b: {}, {}".format(np.mean(expl_a), np.std(expl_a), np.mean(expl_b), np.std(expl_b))
+        #print "K: dL_dthetaK_exp: {}     dL_dthetaK_implicit: {}".format(dL_dthetaK_exp, dL_dthetaK_imp)
         dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
         return dL_dthetaK
 
@@ -111,27 +104,19 @@ class Laplace(likelihood):
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.likelihood_function._gradients(self.data, self.f_hat)
 
         num_params = len(dlik_dthetaL)
-        dL_dthetaL = np.zeros(num_params) # make space for one derivative for each likelihood parameter
+        # make space for one derivative for each likelihood parameter
+        dL_dthetaL = np.zeros(num_params)
         for thetaL_i in range(num_params):
             #Explicit
-            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
-            #a = 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
-            #d = dlik_hess_dthetaL[thetaL_i]
-            #e = pdinv(pdinv(self.K)[0] + np.diagflat(self.W))[0]
-            #b = 0.5*np.dot(np.diag(e).T, d)
-            #g = 0.5*(np.diag(self.K) - np.sum(cho_solve((self.B_chol, True), np.dot(np.diagflat(self.W_12),self.K))**2, 1))
-            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - np.dot(g.T, dlik_hess_dthetaL[thetaL_i])
-
-            #dL_dthetaL_exp = np.sum(dlik_dthetaL[thetaL_i]) - 0.5*np.dot(np.diag(self.Ki_W_i), dlik_hess_dthetaL[thetaL_i])
             dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
                              #- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
                              + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[thetaL_i])
                              )
 
             #Implicit
-            df_hat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
-            dL_dthetaL_imp = np.dot(dL_dfhat, df_hat_dthetaL)
-            #print "dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
+            dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
+            dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
+            #print "LIK: dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
             dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
 
         return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
@@ -177,32 +162,21 @@ class Laplace(likelihood):
 
         Y_tilde = Wi*self.Ki_f + self.f_hat
 
-        self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
+        #self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
+        self.Wi_K_i = self.W_12*cho_solve((self.B_chol, True), np.diagflat(self.W_12))
         #self.Wi_K_i, _, _, self.ln_det_Wi_K = pdinv(self.Sigma_tilde + self.K) # TODO: Check if Wi_K_i == R above and same with det below
+
         self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
 
-        #self.Wi_K_i[self.Wi_K_i< 1e-6] = 1e-6
-
-        #self.ln_det_K_Wi__Bi = self.ln_I_KW_det + pddet(self.Sigma_tilde + self.K)
         self.lik = self.likelihood_function.link_function(self.data, self.f_hat, extra_data=self.extra_data)
 
         self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
-        #self.aA = 0.5*self.ln_det_K_Wi__Bi
-        #self.bB = - 0.5*self.f_Ki_f
-        #self.cC = 0.5*self.y_Wi_Ki_i_y
         Z_tilde = (+ self.lik
-                    #+ 0.5*self.ln_det_K_Wi__Bi
                    - 0.5*self.ln_B_det
                    + 0.5*self.ln_det_Wi_K
                    - 0.5*self.f_Ki_f
                    + 0.5*self.y_Wi_Ki_i_y
                   )
-        #self.aA = 0.5*self.ln_det_Wi_K
-        #self.bB = - 0.5*self.f_Ki_f
-        #self.cC = 0.5*self.y_Wi_Ki_i_y
-        #self.dD = -0.5*self.ln_B_det
-        #print "Ztilde: {} lik: {} a: {} b: {} c: {} d:".format(Z_tilde, self.lik, self.aA, self.bB, self.cC, self.dD)
-        #print "param value: {}".format(self.likelihood_function._get_params())
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -234,7 +208,8 @@ class Laplace(likelihood):
         self.W = -self.likelihood_function.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         if not self.likelihood_function.log_concave:
-            self.W[self.W < 0] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+            #print "Under 1e-6: {}".format(np.sum(self.W < 1e-6))
+            self.W[self.W < 1e-6] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                        #If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                        #To cause the posterior to become less certain than the prior and likelihood,
                                        #This is a property only held by non-log-concave likelihoods
@@ -250,7 +225,7 @@ class Laplace(likelihood):
         self.Ki_f = self.a
 
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
-        self.Ki_W_i = self.K - mdot(self.K, self.W_12*self.Bi*self.W_12.T, self.K)
+        self.Ki_W_i = self.K - mdot(self.K, self.W_12*cho_solve((self.B_chol, True), np.diagflat(self.W_12)), self.K)
 
         #For det, |I + KW| == |I + W_12*K*W_12|
         #self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
@@ -316,7 +291,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
+    def rasm_mode(self, K, MAX_ITER=200, MAX_RESTART=10):
         """
         Rasmussen's numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -326,7 +301,7 @@ class Laplace(likelihood):
         :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
         :returns: f_mode
         """
-        self.old_before_s = self.likelihood_function._get_params()
+        #self.old_before_s = self.likelihood_function._get_params()
         #print "before: ", self.old_before_s
         #if self.old_before_s < 1e-5:
 
@@ -345,7 +320,7 @@ class Laplace(likelihood):
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-4
+        epsilon = 1e-10
         step_size = 1
         rs = 0
         i = 0
@@ -354,7 +329,8 @@ class Laplace(likelihood):
             W = -self.likelihood_function.d2lik_d2f(self.data, f, extra_data=self.extra_data)
             #W = np.maximum(W, 0)
             if not self.likelihood_function.log_concave:
-                W[W < 0] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                #print "Under 1e-10: {}".format(np.sum(W < 1e-10))
+                W[W < 1e-10] = 1e-10     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
@@ -379,7 +355,7 @@ class Laplace(likelihood):
 
             i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
             #new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
-            new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20, 'disp':True}).fun
+            new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-6, options={'maxiter':20, 'disp':True}).fun
             f = self.f.copy()
             a = self.a.copy()
 
@@ -418,10 +394,9 @@ class Laplace(likelihood):
         #print "Positive difference obj: ", np.float(difference)
         #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
         #print "Iterations: {}, Final_difference: {}".format(i, difference)
-        if difference > 1e-4:
-        #if True:
-            #print "Not perfect f_hat fit difference: {}".format(difference)
-            if True:
+        if difference > epsilon:
+            print "Not perfect f_hat fit difference: {}".format(difference)
+            if False:
                 import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
                 if hasattr(self, 'X'):
                     import pylab as pb
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index a52cc3cd..1e5d3d32 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -68,12 +68,13 @@ class LaplaceTests(unittest.TestCase):
         self.D = 1
         self.X = np.linspace(0, self.D, self.N)[:, None]
 
-        self.real_std = 0.2
+        self.real_std = 0.1
         noise = np.random.randn(*self.X.shape)*self.real_std
         self.Y = np.sin(self.X*2*np.pi) + noise
         #self.Y = np.array([[1.0]])#np.sin(self.X*2*np.pi) + noise
+        self.var = 0.3
 
-        self.f = np.random.rand(self.N, 1)
+        self.f = np.random.rand(self.N, self.D)
         #self.f = np.array([[3.0]])#np.sin(self.X*2*np.pi) + noise
 
         self.var = np.random.rand(1)
@@ -207,6 +208,57 @@ class LaplaceTests(unittest.TestCase):
                     constrain_positive=True, randomize=True, verbose=True)
                 )
 
+    def test_gauss_rbf(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.Y = self.Y/self.Y.max()
+        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
+        gauss_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.gauss, opt='rasm')
+        m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=gauss_laplace)
+        m.ensure_default_constraints()
+        m.randomize()
+        m.checkgrad(verbose=1)
+        self.assertTrue(m.checkgrad())
+
+    def test_studentt_approx_gauss_rbf(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.Y = self.Y/self.Y.max()
+        self.stu_t = GPy.likelihoods.functions.StudentT(deg_free=1000, sigma2=self.var)
+        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
+        stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm')
+        m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
+        m.ensure_default_constraints()
+        m.constrain_positive('t_noise')
+        m.randomize()
+        m.checkgrad(verbose=1)
+        print m
+        self.assertTrue(m.checkgrad())
+
+    def test_studentt_rbf(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.Y = self.Y/self.Y.max()
+        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1], variance=2.0)
+        stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm')
+        m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
+        m.ensure_default_constraints()
+        m.constrain_positive('t_noise')
+        m.randomize()
+        m.checkgrad(verbose=1)
+        print m
+        self.assertTrue(m.checkgrad())
+
+    def test_studentt_rbf_smallvar(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.Y = self.Y/self.Y.max()
+        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1], variance=2.0)
+        stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm')
+        m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
+        m.ensure_default_constraints()
+        m.constrain_positive('t_noise')
+        m['t_noise'] = 0.01
+        m.checkgrad(verbose=1)
+        print m
+        self.assertTrue(m.checkgrad())
+
 if __name__ == "__main__":
     print "Running unit tests"
     unittest.main()

From ca09051a56d3d7e1e3c601a8b26aa17f199e349e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 18 Sep 2013 16:51:28 +0100
Subject: [PATCH 087/252] Changed the examples (started boston data) and
 increased tolerance of finding fhat

---
 GPy/examples/laplace_approximations.py | 98 +++++++++++++++++++++-----
 GPy/likelihoods/laplace.py             |  8 +--
 2 files changed, 85 insertions(+), 21 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index e8af74eb..3e24c89f 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -199,7 +199,7 @@ def student_t_fix_optimise_check():
 
     #GP
     kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
+    mgp = GPy.models.GPRegression(X, Y.copy(), kernel=kernelgp)
     mgp.ensure_default_constraints()
     mgp.randomize()
     mgp.optimize()
@@ -212,7 +212,7 @@ def student_t_fix_optimise_check():
 
     plt.figure(1)
     plt.suptitle('Student likelihood')
-    m = GPy.models.GPRegression(X, Y, kernelst, likelihood=stu_t_likelihood)
+    m = GPy.models.GPRegression(X, Y.copy(), kernelst, likelihood=stu_t_likelihood)
     m.constrain_fixed('rbf_var', mgp._get_params()[0])
     m.constrain_fixed('rbf_len', mgp._get_params()[1])
     m.constrain_positive('t_noise')
@@ -406,27 +406,29 @@ def student_t_approx():
     """
     real_std = 0.1
     #Start a function, any function
-    X = np.linspace(0.0, 10.0, 100)[:, None]
+    X = np.linspace(0.0, np.pi*2, 100)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_std
     Yc = Y.copy()
 
-    X_full = np.linspace(0.0, 10.0, 500)[:, None]
+    X_full = np.linspace(0.0, np.pi*2, 500)[:, None]
     Y_full = np.sin(X_full)
 
     Y = Y/Y.max()
 
-    Yc[10] += 100
-    Yc[25] += 10
-    Yc[23] += 10
-    Yc[26] += 1000
-    Yc[24] += 10
+    Yc[75:80] += 1
+
+    #Yc[10] += 100
+    #Yc[25] += 10
+    #Yc[23] += 10
+    #Yc[26] += 1000
+    #Yc[24] += 10
     #Yc = Yc/Yc.max()
 
     #Add student t random noise to datapoints
     deg_free = 5
     print "Real noise: ", real_std
 
-    initial_var_guess = 0.1
+    initial_var_guess = 0.5
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
@@ -650,16 +652,78 @@ def gaussian_f_check():
     import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
 
 def boston_example():
+    import sklearn
+    from sklearn.cross_validation import KFold
     data = datasets.boston_housing()
     X = data['X'].copy()
     Y = data['Y'].copy()
-    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
-    mgp.ensure_default_constraints()
-    mgp.randomize()
-    mgp.optimize()
-    mgp.plot()
-    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+    Y = Y-Y.mean()
+    Y = Y/Y.std()
+    num_folds = 2
+    kf = KFold(len(Y), n_folds=num_folds, indices=True)
+    score_folds = np.zeros((3, num_folds))
+    def rmse(Y, Ystar):
+        return np.sqrt(np.mean((Y-Ystar)**2))
+    #for train, test in kf:
+    for n, (train, test) in enumerate(kf):
+        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
+        print "Fold {}".format(n)
+
+        noise = np.exp(-2)
+
+        #Gaussian GP
+        print "Gauss GP"
+        kernelgp = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp)
+        mgp.ensure_default_constraints()
+        mgp['noise'] = noise
+        mgp.optimize(messages=1)
+        Y_test_pred = mgp.predict(X_test)
+        score_folds[0, n] = rmse(Y_test, Y_test_pred[0])
+        plt.figure()
+        plt.scatter(X_test[:, 0], Y_test_pred[0])
+        plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
+        print score_folds
+        plt.title('GP gauss')
+
+        print "Gaussian Laplace GP"
+        sigma2_start = 1
+        kernelstu = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1], variance=0.01)
+        N, D = Y_train.shape
+        g_distribution = GPy.likelihoods.functions.Gaussian(variance=noise, N=N, D=D)
+        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution, opt='rasm')
+        mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=g_likelihood)
+        mg.ensure_default_constraints()
+        mg.constrain_positive('noise_variance')
+        mg.optimize(messages=1)
+        Y_test_pred = mg.predict(X_test)
+        score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
+        print score_folds
+        plt.figure()
+        plt.scatter(X_test[:, 0], Y_test_pred[0])
+        plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
+        plt.title('Lap gauss')
+
+        #Student t likelihood
+        print "Student-T GP"
+        deg_free = 5
+        kernelstu = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1], variance=0.01)
+        t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=noise)
+        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
+        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
+        mstu_t.ensure_default_constraints()
+        #mstu_t.constrain_positive('t_noise')
+        mstu_t.constrain_bounded('t_noise', 0.01, 1000)
+        mstu_t.optimize(messages=1)
+        Y_test_pred = mstu_t.predict(X_test)
+        score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
+        print score_folds
+        plt.figure()
+        plt.scatter(X_test[:, 0], Y_test_pred[0])
+        plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
+        plt.title('Stu t')
+        import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+
 
 def plot_f_approx(model):
     plt.figure()
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index f8569c52..5c9362ab 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -291,7 +291,7 @@ class Laplace(likelihood):
         f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
         return f_hat[:, None]
 
-    def rasm_mode(self, K, MAX_ITER=200, MAX_RESTART=10):
+    def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
         """
         Rasmussen's numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -320,7 +320,7 @@ class Laplace(likelihood):
             return -0.5*np.dot(a.T, f) + self.likelihood_function.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-10
+        epsilon = 1e-6
         step_size = 1
         rs = 0
         i = 0
@@ -330,7 +330,7 @@ class Laplace(likelihood):
             #W = np.maximum(W, 0)
             if not self.likelihood_function.log_concave:
                 #print "Under 1e-10: {}".format(np.sum(W < 1e-10))
-                W[W < 1e-10] = 1e-10     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+                W[W < 1e-6] = 1e-6     # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                     # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                     # To cause the posterior to become less certain than the prior and likelihood,
                                     # This is a property only held by non-log-concave likelihoods
@@ -355,7 +355,7 @@ class Laplace(likelihood):
 
             i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
             #new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
-            new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-6, options={'maxiter':20, 'disp':True}).fun
+            new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':20}).fun
             f = self.f.copy()
             a = self.a.copy()
 

From 9d7b670160684d760136737b18237ae5405c5c97 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 19 Sep 2013 15:56:18 +0100
Subject: [PATCH 088/252] Tests setup but not fitting properly yet

---
 GPy/examples/laplace_approximations.py | 87 +++++++++++++++++++-------
 1 file changed, 65 insertions(+), 22 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 3e24c89f..1ad4eb38 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -659,9 +659,10 @@ def boston_example():
     Y = data['Y'].copy()
     Y = Y-Y.mean()
     Y = Y/Y.std()
-    num_folds = 2
+    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+    num_folds = 10
     kf = KFold(len(Y), n_folds=num_folds, indices=True)
-    score_folds = np.zeros((3, num_folds))
+    score_folds = np.zeros((4, num_folds))
     def rmse(Y, Ystar):
         return np.sqrt(np.mean((Y-Ystar)**2))
     #for train, test in kf:
@@ -673,56 +674,98 @@ def boston_example():
 
         #Gaussian GP
         print "Gauss GP"
-        kernelgp = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
+        kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1], variance=0.01)
         mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp)
         mgp.ensure_default_constraints()
         mgp['noise'] = noise
+        mgp.constrain_fixed('white', 0.01)
+        print mgp
         mgp.optimize(messages=1)
         Y_test_pred = mgp.predict(X_test)
         score_folds[0, n] = rmse(Y_test, Y_test_pred[0])
-        plt.figure()
-        plt.scatter(X_test[:, 0], Y_test_pred[0])
-        plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
+        print mgp
         print score_folds
-        plt.title('GP gauss')
+        #plt.figure()
+        #plt.scatter(X_test[:, 0], Y_test_pred[0])
+        #plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
+        #plt.title('GP gauss')
 
         print "Gaussian Laplace GP"
         sigma2_start = 1
-        kernelstu = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1], variance=0.01)
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1], variance=0.1)
         N, D = Y_train.shape
         g_distribution = GPy.likelihoods.functions.Gaussian(variance=noise, N=N, D=D)
         g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution, opt='rasm')
         mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=g_likelihood)
         mg.ensure_default_constraints()
         mg.constrain_positive('noise_variance')
-        mg.optimize(messages=1)
+        mg.constrain_fixed('white', 0.01)
+        mg['noise'] = noise
+        print mg
+        try:
+            mg.optimize(messages=1)
+        except Exception:
+            print "Blew up"
         Y_test_pred = mg.predict(X_test)
         score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
         print score_folds
-        plt.figure()
-        plt.scatter(X_test[:, 0], Y_test_pred[0])
-        plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
-        plt.title('Lap gauss')
+        print mg
+        #plt.figure()
+        #plt.scatter(X_test[:, 0], Y_test_pred[0])
+        #plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
+        #plt.title('Lap gauss')
 
         #Student t likelihood
-        print "Student-T GP"
         deg_free = 5
-        kernelstu = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1], variance=0.01)
+        print "Student-T GP {}df".format(deg_free)
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1], variance=0.1)
         t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=noise)
         stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
         mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
         mstu_t.ensure_default_constraints()
+        mstu_t.constrain_fixed('white', 0.01)
         #mstu_t.constrain_positive('t_noise')
-        mstu_t.constrain_bounded('t_noise', 0.01, 1000)
-        mstu_t.optimize(messages=1)
+        mstu_t.constrain_bounded('t_noise', 0.001, 1000)
+        mstu_t['t_noise'] = noise
+        print mstu_t
+        try:
+            mstu_t.optimize(messages=1)
+        except Exception:
+            print "Blew up"
         Y_test_pred = mstu_t.predict(X_test)
         score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
         print score_folds
-        plt.figure()
-        plt.scatter(X_test[:, 0], Y_test_pred[0])
-        plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
-        plt.title('Stu t')
-        import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+        print mstu_t
+        #plt.figure()
+        #plt.scatter(X_test[:, 0], Y_test_pred[0])
+        #plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
+        #plt.title('Stu t {}df'.format(deg_free))
+
+        deg_free = 3
+        print "Student-T GP {}df".format(deg_free)
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1], variance=0.1)
+        t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=noise)
+        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
+        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
+        mstu_t.ensure_default_constraints()
+        mstu_t.constrain_fixed('white', 0.01)
+        #mstu_t.constrain_positive('t_noise')
+        mstu_t.constrain_bounded('t_noise', 0.001, 1000)
+        mstu_t['t_noise'] = noise
+        print mstu_t
+        try:
+            mstu_t.optimize(messages=1)
+        except Exception:
+            print "Blew up"
+        mstu_t.optimize(messages=1)
+        Y_test_pred = mstu_t.predict(X_test)
+        score_folds[3, n] = rmse(Y_test, Y_test_pred[0])
+        print score_folds
+        print mstu_t
+        #plt.figure()
+        #plt.scatter(X_test[:, 0], Y_test_pred[0])
+        #plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
+        #plt.title('Stu t {}df'.format(deg_free))
 
 
 def plot_f_approx(model):

From 2c419d2f484962991318010a56a760eb2cfc50f8 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 19 Sep 2013 18:17:39 +0100
Subject: [PATCH 089/252] Boston housing works (apart from variance of student
 t is not valid below 2)

---
 GPy/examples/laplace_approximations.py | 281 ++++++++++++++++---------
 1 file changed, 184 insertions(+), 97 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 1ad4eb38..9a1a1399 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -657,6 +657,190 @@ def boston_example():
     data = datasets.boston_housing()
     X = data['X'].copy()
     Y = data['Y'].copy()
+    X = X-X.mean(axis=0)
+    X = X/X.std(axis=0)
+    Y = Y-Y.mean()
+    Y = Y/Y.std()
+    num_folds = 10
+    kf = KFold(len(Y), n_folds=num_folds, indices=True)
+    score_folds = np.zeros((6, num_folds))
+    def rmse(Y, Ystar):
+        return np.sqrt(np.mean((Y-Ystar)**2))
+    for n, (train, test) in enumerate(kf):
+        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
+        print "Fold {}".format(n)
+
+        noise = 1e-1 #np.exp(-2)
+        rbf_len = 0.5
+        data_axis_plot = 4
+        plot = True
+
+        #Gaussian GP
+        print "Gauss GP"
+        kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp)
+        mgp.ensure_default_constraints()
+        mgp.constrain_fixed('white', 1e-5)
+        mgp['rbf_len'] = rbf_len
+        mgp['noise'] = noise
+        print mgp
+        mgp.optimize(messages=1)
+        Y_test_pred = mgp.predict(X_test)
+        score_folds[0, n] = rmse(Y_test, Y_test_pred[0])
+        print mgp
+        print score_folds
+        if plot:
+            plt.figure()
+            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+            plt.title('GP gauss')
+
+        print "Gaussian Laplace GP"
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        N, D = Y_train.shape
+        g_distribution = GPy.likelihoods.functions.Gaussian(variance=noise, N=N, D=D)
+        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution, opt='rasm')
+        mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=g_likelihood)
+        mg.ensure_default_constraints()
+        mg.constrain_positive('noise_variance')
+        mg.constrain_fixed('white', 1e-5)
+        mg['rbf_len'] = rbf_len
+        mg['noise'] = noise
+        print mg
+        try:
+            mg.optimize(messages=1)
+        except Exception:
+            print "Blew up"
+        Y_test_pred = mg.predict(X_test)
+        score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
+        print score_folds
+        print mg
+        if plot:
+            plt.figure()
+            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+            plt.title('Lap gauss')
+
+        #Student T
+        deg_free = 1
+        print "Student-T GP {}df".format(deg_free)
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=noise)
+        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
+        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
+        mstu_t.ensure_default_constraints()
+        mstu_t.constrain_fixed('white', 1e-5)
+        mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
+        mstu_t['rbf_len'] = rbf_len
+        mstu_t['t_noise'] = noise
+        print mstu_t
+        try:
+            mstu_t.optimize(messages=1)
+        except Exception:
+            print "Blew up"
+        Y_test_pred = mstu_t.predict(X_test)
+        score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
+        print score_folds
+        print mstu_t
+        if plot:
+            plt.figure()
+            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+            plt.title('Stu t {}df'.format(deg_free))
+
+        deg_free = 2
+        print "Student-T GP {}df".format(deg_free)
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=noise)
+        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
+        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
+        mstu_t.ensure_default_constraints()
+        mstu_t.constrain_fixed('white', 1e-5)
+        mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
+        mstu_t['rbf_len'] = rbf_len
+        mstu_t['t_noise'] = noise
+        print mstu_t
+        try:
+            mstu_t.optimize(messages=1)
+        except Exception:
+            print "Blew up"
+        Y_test_pred = mstu_t.predict(X_test)
+        score_folds[3, n] = rmse(Y_test, Y_test_pred[0])
+        print score_folds
+        print mstu_t
+        if plot:
+            plt.figure()
+            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+            plt.title('Stu t {}df'.format(deg_free))
+
+        #Student t likelihood
+        deg_free = 3
+        print "Student-T GP {}df".format(deg_free)
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=noise)
+        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
+        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
+        mstu_t.ensure_default_constraints()
+        mstu_t.constrain_fixed('white', 1e-5)
+        mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
+        mstu_t['rbf_len'] = rbf_len
+        mstu_t['t_noise'] = noise
+        print mstu_t
+        try:
+            mstu_t.optimize(messages=1)
+        except Exception:
+            print "Blew up"
+        Y_test_pred = mstu_t.predict(X_test)
+        score_folds[4, n] = rmse(Y_test, Y_test_pred[0])
+        print score_folds
+        print mstu_t
+        if plot:
+            plt.figure()
+            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+            plt.title('Stu t {}df'.format(deg_free))
+
+        deg_free = 5
+        print "Student-T GP {}df".format(deg_free)
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=noise)
+        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
+        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
+        mstu_t.ensure_default_constraints()
+        mstu_t.constrain_fixed('white', 1e-5)
+        mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
+        mstu_t['rbf_len'] = rbf_len
+        mstu_t['t_noise'] = noise
+        print mstu_t
+        try:
+            mstu_t.optimize(messages=1)
+        except Exception:
+            print "Blew up"
+        Y_test_pred = mstu_t.predict(X_test)
+        score_folds[5, n] = rmse(Y_test, Y_test_pred[0])
+        print score_folds
+        print mstu_t
+        if plot:
+            plt.figure()
+            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+            plt.title('Stu t {}df'.format(deg_free))
+
+
+
+
+    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+    return score_folds
+
+def precipitation_example():
+    import sklearn
+    from sklearn.cross_validation import KFold
+    data = datasets.boston_housing()
+    X = data['X'].copy()
+    Y = data['Y'].copy()
+    X = X-X.mean(axis=0)
+    X = X/X.std(axis=0)
     Y = Y-Y.mean()
     Y = Y/Y.std()
     import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
@@ -670,103 +854,6 @@ def boston_example():
         X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
         print "Fold {}".format(n)
 
-        noise = np.exp(-2)
-
-        #Gaussian GP
-        print "Gauss GP"
-        kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1], variance=0.01)
-        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp)
-        mgp.ensure_default_constraints()
-        mgp['noise'] = noise
-        mgp.constrain_fixed('white', 0.01)
-        print mgp
-        mgp.optimize(messages=1)
-        Y_test_pred = mgp.predict(X_test)
-        score_folds[0, n] = rmse(Y_test, Y_test_pred[0])
-        print mgp
-        print score_folds
-        #plt.figure()
-        #plt.scatter(X_test[:, 0], Y_test_pred[0])
-        #plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
-        #plt.title('GP gauss')
-
-        print "Gaussian Laplace GP"
-        sigma2_start = 1
-        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1], variance=0.1)
-        N, D = Y_train.shape
-        g_distribution = GPy.likelihoods.functions.Gaussian(variance=noise, N=N, D=D)
-        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution, opt='rasm')
-        mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=g_likelihood)
-        mg.ensure_default_constraints()
-        mg.constrain_positive('noise_variance')
-        mg.constrain_fixed('white', 0.01)
-        mg['noise'] = noise
-        print mg
-        try:
-            mg.optimize(messages=1)
-        except Exception:
-            print "Blew up"
-        Y_test_pred = mg.predict(X_test)
-        score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
-        print score_folds
-        print mg
-        #plt.figure()
-        #plt.scatter(X_test[:, 0], Y_test_pred[0])
-        #plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
-        #plt.title('Lap gauss')
-
-        #Student t likelihood
-        deg_free = 5
-        print "Student-T GP {}df".format(deg_free)
-        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1], variance=0.1)
-        t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=noise)
-        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
-        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
-        mstu_t.ensure_default_constraints()
-        mstu_t.constrain_fixed('white', 0.01)
-        #mstu_t.constrain_positive('t_noise')
-        mstu_t.constrain_bounded('t_noise', 0.001, 1000)
-        mstu_t['t_noise'] = noise
-        print mstu_t
-        try:
-            mstu_t.optimize(messages=1)
-        except Exception:
-            print "Blew up"
-        Y_test_pred = mstu_t.predict(X_test)
-        score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
-        print score_folds
-        print mstu_t
-        #plt.figure()
-        #plt.scatter(X_test[:, 0], Y_test_pred[0])
-        #plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
-        #plt.title('Stu t {}df'.format(deg_free))
-
-        deg_free = 3
-        print "Student-T GP {}df".format(deg_free)
-        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1], variance=0.1)
-        t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=noise)
-        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
-        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
-        mstu_t.ensure_default_constraints()
-        mstu_t.constrain_fixed('white', 0.01)
-        #mstu_t.constrain_positive('t_noise')
-        mstu_t.constrain_bounded('t_noise', 0.001, 1000)
-        mstu_t['t_noise'] = noise
-        print mstu_t
-        try:
-            mstu_t.optimize(messages=1)
-        except Exception:
-            print "Blew up"
-        mstu_t.optimize(messages=1)
-        Y_test_pred = mstu_t.predict(X_test)
-        score_folds[3, n] = rmse(Y_test, Y_test_pred[0])
-        print score_folds
-        print mstu_t
-        #plt.figure()
-        #plt.scatter(X_test[:, 0], Y_test_pred[0])
-        #plt.scatter(X_test[:, 0], Y_test, c='r', marker='x')
-        #plt.title('Stu t {}df'.format(deg_free))
-
 
 def plot_f_approx(model):
     plt.figure()

From b1d7fc4745bf10b752df6f7dc2f9ee3bfa1e5927 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Tue, 1 Oct 2013 08:57:00 +0100
Subject: [PATCH 090/252] more samples for higher sampling accuracy

---
 GPy/testing/psi_stat_expectation_tests.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/testing/psi_stat_expectation_tests.py b/GPy/testing/psi_stat_expectation_tests.py
index 30ca14d6..bcdbd2af 100644
--- a/GPy/testing/psi_stat_expectation_tests.py
+++ b/GPy/testing/psi_stat_expectation_tests.py
@@ -105,7 +105,7 @@ class Test(unittest.TestCase):
 
     def test_psi2(self):
         for kern in self.kerns:
-            Nsamples = self.Nsamples/300.
+            Nsamples = self.Nsamples/10.
             psi2 = kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
             K_ = np.zeros((self.num_inducing, self.num_inducing))
             diffs = []
@@ -135,7 +135,7 @@ class Test(unittest.TestCase):
 if __name__ == "__main__":
     sys.argv = ['',
          #'Test.test_psi0',
-         'Test.test_psi1',
+         #'Test.test_psi1',
          'Test.test_psi2',
          ]
     unittest.main()

From c4715b2f5b25ba1009d229e4881d6c22f397e95d Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 2 Oct 2013 13:37:48 +0100
Subject: [PATCH 091/252] Fixed white variance

---
 GPy/testing/laplace_tests.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index 1e5d3d32..4a5bf4e2 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -236,11 +236,13 @@ class LaplaceTests(unittest.TestCase):
     def test_studentt_rbf(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.Y = self.Y/self.Y.max()
-        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1], variance=2.0)
+        white_var = 3.0
+        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
         stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm')
         m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
         m.ensure_default_constraints()
         m.constrain_positive('t_noise')
+        m.constrain_fixed('white', white_var)
         m.randomize()
         m.checkgrad(verbose=1)
         print m
@@ -249,11 +251,13 @@ class LaplaceTests(unittest.TestCase):
     def test_studentt_rbf_smallvar(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.Y = self.Y/self.Y.max()
-        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1], variance=2.0)
+        white_var = 3.0
+        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
         stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm')
         m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
         m.ensure_default_constraints()
         m.constrain_positive('t_noise')
+        m.constrain_fixed('white', white_var)
         m['t_noise'] = 0.01
         m.checkgrad(verbose=1)
         print m

From da67e39e5000c881a30f93bd3081a97b828e93dc Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 3 Oct 2013 19:04:00 +0100
Subject: [PATCH 092/252] Tidied up laplace

---
 GPy/examples/laplace_approximations.py        |  87 ++---
 GPy/likelihoods/laplace.py                    | 344 +++++++-----------
 .../noise_models/student_t_noise.py           |   3 +-
 GPy/testing/laplace_tests.py                  |   8 +-
 4 files changed, 159 insertions(+), 283 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 712312c7..eb78c47a 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -27,7 +27,7 @@ def timing():
         kernel1 = GPy.kern.rbf(X.shape[1])
 
         t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-        corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
+        corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
         m = GPy.models.GPRegression(X, Yc.copy(), kernel1, likelihood=corrupt_stu_t_likelihood)
         m.ensure_default_constraints()
         m.update_likelihood_approximation()
@@ -56,7 +56,7 @@ def v_fail_test():
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
     m = GPy.models.GPRegression(X, Y.copy(), kernel1, likelihood=stu_t_likelihood)
     m.constrain_positive('')
     vs = 25
@@ -103,7 +103,7 @@ def student_t_obj_plane():
 
     kernelst = kernelgp.copy()
     t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=(real_std**2))
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
     m = GPy.models.GPRegression(X, Y, kernelst, likelihood=stu_t_likelihood)
     m.ensure_default_constraints()
     m.constrain_fixed('t_no', real_std**2)
@@ -156,7 +156,7 @@ def student_t_f_check():
     kernelst = kernelgp.copy()
     #kernelst += GPy.kern.bias(X.shape[1])
     t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=0.05)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
     m = GPy.models.GPRegression(X, Y.copy(), kernelst, likelihood=stu_t_likelihood)
     #m['rbf_v'] = mgp._get_params()[0]
     #m['rbf_l'] = mgp._get_params()[1] + 1
@@ -208,7 +208,7 @@ def student_t_fix_optimise_check():
     real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
 
     t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=real_stu_t_std2)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
 
     plt.figure(1)
     plt.suptitle('Student likelihood')
@@ -351,7 +351,7 @@ def debug_student_t_noise_approx():
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
 
     m = GPy.models.GPRegression(X, Y, kernel6, likelihood=stu_t_likelihood)
     #m['rbf_len'] = 1.5
@@ -488,7 +488,7 @@ def student_t_approx():
 
     print "Clean student t, rasm"
     t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
     m = GPy.models.GPRegression(X, Y.copy(), kernel6, likelihood=stu_t_likelihood)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
@@ -504,7 +504,7 @@ def student_t_approx():
 
     print "Corrupt student t, rasm"
     t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
+    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
     m = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
@@ -526,51 +526,22 @@ def student_t_approx():
     import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
     return m
 
-    #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
-    #m = GPy.models.GPRegression(X, Y, kernel3, likelihood=stu_t_likelihood)
-    #m.ensure_default_constraints()
-    #m.update_likelihood_approximation()
-    #m.optimize()
-    #print(m)
-    #plt.subplot(221)
-    #m.plot()
-    #plt.plot(X_full, Y_full)
-    #plt.ylim(-2.5, 2.5)
-    #plt.title('Student-t ncg clean')
+    #with a student t distribution, since it has heavy tails it should work well
+    #likelihood_function = student_t(deg_free=deg_free, sigma2=real_var)
+    #lap = Laplace(Y, likelihood_function)
+    #cov = kernel.K(X)
+    #lap.fit_full(cov)
 
-    #print "Corrupt student t, ncg"
-    #t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
-    #m = GPy.models.GPRegression(X, Y, kernel5, likelihood=corrupt_stu_t_likelihood)
-    #m.ensure_default_constraints()
-    #m.update_likelihood_approximation()
-    #m.optimize()
-    #print(m)
-    #plt.subplot(223)
-    #m.plot()
-    #plt.plot(X_full, Y_full)
-    #plt.ylim(-2.5, 2.5)
-    #plt.title('Student-t ncg corrupt')
-
-
-    ###with a student t distribution, since it has heavy tails it should work well
-    ###likelihood_function = student_t(deg_free=deg_free, sigma2=real_var)
-    ###lap = Laplace(Y, likelihood_function)
-    ###cov = kernel.K(X)
-    ###lap.fit_full(cov)
-
-    ###test_range = np.arange(0, 10, 0.1)
-    ###plt.plot(test_range, t_rv.pdf(test_range))
-    ###for i in xrange(X.shape[0]):
-        ###mode = lap.f_hat[i]
-        ###covariance = lap.hess_hat_i[i,i]
-        ###scaling = np.exp(lap.ln_z_hat)
-        ###normalised_approx = norm(loc=mode, scale=covariance)
-        ###print "Normal with mode %f, and variance %f" % (mode, covariance)
-        ###plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
-    ###plt.show()
+    #test_range = np.arange(0, 10, 0.1)
+    #plt.plot(test_range, t_rv.pdf(test_range))
+    #for i in xrange(X.shape[0]):
+        #mode = lap.f_hat[i]
+        #covariance = lap.hess_hat_i[i,i]
+        #scaling = np.exp(lap.ln_z_hat)
+        #normalised_approx = norm(loc=mode, scale=covariance)
+        #print "Normal with mode %f, and variance %f" % (mode, covariance)
+        #plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
+    #plt.show()
 
     return m
 
@@ -625,7 +596,7 @@ def gaussian_f_check():
     #kernelst += GPy.kern.bias(X.shape[1])
     N, D = X.shape
     g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=0.1, N=N, D=D)
-    g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm')
+    g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution)
     m = GPy.models.GPRegression(X, Y, kernelg, likelihood=g_likelihood)
     m.likelihood.X = X
     #m['rbf_v'] = mgp._get_params()[0]
@@ -702,7 +673,7 @@ def boston_example():
         kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         N, D = Y_train.shape
         g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
-        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution, opt='rasm')
+        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
         mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=g_likelihood)
         mg.ensure_default_constraints()
         mg.constrain_positive('noise_variance')
@@ -729,7 +700,7 @@ def boston_example():
         print "Student-T GP {}df".format(deg_free)
         kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
-        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
+        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
         mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
         mstu_t.ensure_default_constraints()
         mstu_t.constrain_fixed('white', 1e-5)
@@ -755,7 +726,7 @@ def boston_example():
         print "Student-T GP {}df".format(deg_free)
         kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
-        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
+        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
         mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
         mstu_t.ensure_default_constraints()
         mstu_t.constrain_fixed('white', 1e-5)
@@ -782,7 +753,7 @@ def boston_example():
         print "Student-T GP {}df".format(deg_free)
         kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
-        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
+        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
         mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
         mstu_t.ensure_default_constraints()
         mstu_t.constrain_fixed('white', 1e-5)
@@ -808,7 +779,7 @@ def boston_example():
         print "Student-T GP {}df".format(deg_free)
         kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
-        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution, opt='rasm')
+        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
         mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
         mstu_t.ensure_default_constraints()
         mstu_t.constrain_fixed('white', 1e-5)
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 7fe2d64a..46203506 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -1,42 +1,42 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
 import numpy as np
 import scipy as sp
-import GPy
-from scipy.linalg import inv, cho_solve, det
-from numpy.linalg import cond
+from scipy.linalg import cho_solve
 from likelihood import likelihood
-from ..util.linalg import pdinv, mdot, jitchol, chol_inv, pddet, dtrtrs
+from ..util.linalg import mdot, jitchol, pddet
 from scipy.linalg.lapack import dtrtrs
-import random
-from functools import partial
-#import pylab as plt
+from functools import partial as partial_func
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
 
-    def __init__(self, data, noise_model, extra_data=None, opt='rasm'):
+    def __init__(self, data, noise_model, extra_data=None):
         """
         Laplace Approximation
 
-        First find the moments \hat{f} and the hessian at this point (using Newton-Raphson)
-        then find the z^{prime} which allows this to be a normalised gaussian instead of a
-        non-normalized gaussian
+        Find the moments \hat{f} and the hessian at this point
+        (using Newton-Raphson) of the unnormalised posterior
 
-        Finally we must compute the GP variables (i.e. generate some Y^{squiggle} and z^{squiggle}
-        which makes a gaussian the same as the laplace approximation
+        Compute the GP variables (i.e. generate some Y^{squiggle} and
+        z^{squiggle} which makes a gaussian the same as the laplace
+        approximation to the posterior, but normalised
 
         Arguments
         ---------
 
-        :data: array of data the likelihood function is approximating
-        :noise_model: likelihood function - subclass of noise_model
-        :extra_data: additional data used by some likelihood functions, for example survival likelihoods need censoring data
-        :opt: Optimiser to use, rasm numerically stable, ncg or nelder-mead (latter only work with 1d data)
-
+        :param data: array of data the likelihood function is approximating
+        :type data: NxD
+        :param noise_model: likelihood function - subclass of noise_model
+        :type noise_model: noise_model
+        :param extra_data: additional data used by some likelihood functions,
+                           for example survival likelihoods need censoring data
         """
         self.data = data
         self.noise_model = noise_model
         self.extra_data = extra_data
-        self.opt = opt
 
         #Inital values
         self.N, self.D = self.data.shape
@@ -48,6 +48,9 @@ class Laplace(likelihood):
         likelihood.__init__(self)
 
     def restart(self):
+        """
+        Reset likelihood variables to their defaults
+        """
         #Initial values for the GP variables
         self.Y = np.zeros((self.N, 1))
         self.covariance_matrix = np.eye(self.N)
@@ -55,11 +58,12 @@ class Laplace(likelihood):
         self.Z = 0
         self.YYT = None
 
-        self.old_a = None
+        self.old_Ki_f = None
 
     def predictive_values(self, mu, var, full_cov):
         if full_cov:
-            raise NotImplementedError("Cannot make correlated predictions with an Laplace likelihood")
+            raise NotImplementedError("Cannot make correlated predictions\
+                    with an Laplace likelihood")
         return self.noise_model.predictive_values(mu, var)
 
     def _get_params(self):
@@ -79,7 +83,10 @@ class Laplace(likelihood):
 
     def _Kgradients(self):
         """
-        Gradients with respect to prior kernel parameters
+        Gradients with respect to prior kernel parameters dL_dK to be chained
+        with dK_dthetaK to give dL_dthetaK
+        :returns: dL_dK matrix
+        :rtype: Matrix (1 x num_kernel_params)
         """
         dL_dfhat, I_KW_i = self._shared_gradients_components()
         dlp = self.noise_model.dlik_df(self.data, self.f_hat)
@@ -93,19 +100,25 @@ class Laplace(likelihood):
         #Implicit
         impl = mdot(dlp, dL_dfhat, I_KW_i)
 
-        #No longer required as we are computing these in the gp already otherwise we would take them away and add them back
+        #No longer required as we are computing these in the gp already
+        #otherwise we would take them away and add them back
         #dL_dthetaK_imp = dK_dthetaK(impl, X)
         #dL_dthetaK = dL_dthetaK_exp + dL_dthetaK_imp
         #dL_dK = expl + impl
 
-        #No need to compute explicit as we are computing dZ_dK to account for the difference
-        #Between the K gradients of a normal GP, and the K gradients including the implicit part
+        #No need to compute explicit as we are computing dZ_dK to account
+        #for the difference between the K gradients of a normal GP,
+        #and the K gradients including the implicit part
         dL_dK = impl
         return dL_dK
 
     def _gradients(self, partial):
         """
-        Gradients with respect to likelihood parameters
+        Gradients with respect to likelihood parameters (dL_dthetaL)
+
+        :param partial: Not needed by this likelihood
+        :type partial: lambda function
+        :rtype: array of derivatives (1 x num_likelihood_params)
         """
         dL_dfhat, I_KW_i = self._shared_gradients_components()
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.data, self.f_hat)
@@ -123,62 +136,51 @@ class Laplace(likelihood):
             #Implicit
             dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
             dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
-            #print "LIK: dL_dthetaL_exp: {}     dL_dthetaL_implicit: {}".format(dL_dthetaL_exp, dL_dthetaL_imp)
             dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
 
-        return dL_dthetaL #should be array of length *params-being optimized*, for student t just optimising 1 parameter, this is (1,)
+        return dL_dthetaL
 
     def _compute_GP_variables(self):
         """
-        Generates data Y which would give the normal distribution identical to the laplace approximation
+        Generate data Y which would give the normal distribution identical
+        to the laplace approximation to the posterior, but normalised
 
-        GPy expects a likelihood to be gaussian, so need to caluclate the points Y^{squiggle} and Z^{squiggle}
-        that makes the posterior match that found by a laplace approximation to a non-gaussian likelihood
+        GPy expects a likelihood to be gaussian, so need to caluclate
+        the data Y^{\tilde} that makes the posterior match that found
+        by a laplace approximation to a non-gaussian likelihood but with
+        a gaussian likelihood
 
-        Given we are approximating $p(y|f)p(f)$ with a normal distribution (given $p(y|f)$ is not normal)
-        then we have a rescaled normal distibution z*N(f|f_hat,hess_hat^-1) with the same area as p(y|f)p(f)
-        due to the z rescaling.
+        Firstly,
+        The hessian of the unormalised posterior distribution is (K^{-1} + W)^{-1},
+        i.e. z*N(f|f^{\hat}, (K^{-1} + W)^{-1}) but this assumes a non-gaussian likelihood,
+        we wish to find the hessian \Sigma^{\tilde}
+        that has the same curvature but using our new simulated data Y^{\tilde}
+        i.e. we do N(Y^{\tilde}|f^{\hat}, \Sigma^{\tilde})N(f|0, K) = z*N(f|f^{\hat}, (K^{-1} + W)^{-1})
+        and we wish to find what Y^{\tilde} and \Sigma^{\tilde}
+        We find that Y^{\tilde} = W^{-1}(K^{-1} + W)f^{\hat} and \Sigma^{tilde} = W^{-1}
 
-        at the moment the data Y correspond to the normal approximation z*N(f|f_hat,hess_hat^1)
-        This function finds the data D=(Y_tilde,X) that would produce z*N(f|f_hat,hess_hat^1)
-        giving a normal approximation of z_tilde*p(Y_tilde|f,X)p(f)
-
-        $$\tilde{Y} = \tilde{\Sigma} Hf$$
-        where
-        $$\tilde{\Sigma}^{-1} = H - K^{-1}$$
-        i.e. $$\tilde{\Sigma}^{-1} = diag(\nabla\nabla \log(y|f))$$
-        since $diag(\nabla\nabla \log(y|f)) = H - K^{-1}$
-        and $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}\tilde{\Sigma}^{-1}\tilde{Y}$$
-        $$\tilde{\Sigma} = W^{-1}$$
+        Secondly,
+        GPy optimizes the log marginal log p(y) = -0.5*ln|K+\Sigma^{\tilde}| - 0.5*Y^{\tilde}^{T}(K^{-1} + \Sigma^{tilde})^{-1}Y + lik.Z
+        So we can suck up any differences between that and our log marginal likelihood approximation
+        p^{\squiggle}(y) = -0.5*f^{\hat}K^{-1}f^{\hat} + log p(y|f^{\hat}) - 0.5*log |K||K^{-1} + W|
+        which we want to optimize instead, by equating them and rearranging, the difference is added onto
+        the log p(y) that GPy optimizes by default
 
+        Thirdly,
+        Since we have gradients that depend on how we move f^{\hat}, we have implicit components
+        aswell as the explicit dL_dK, we hold these differences in dZ_dK and add them to dL_dK in the
+        gp.py code
         """
-        #Wi(Ki + W) = WiKi + I = KW_i + I = L_Lt_W_i + I = Wi_Lit_Li + I = Lt_W_i_Li + I
-        #dtritri -> L -> L_i
-        #dtrtrs -> L.T*W, L_i -> (L.T*W)_i*L_i
-        #((L.T*w)_i + I)f_hat = y_tilde
-        #L = jitchol(self.K)
-        #Li = chol_inv(L)
-        #Lt_W = L.T*self.W.T
-
-        #Lt_W_i_Li = dtrtrs(Lt_W, Li, lower=True)[0]
-        #self.Wi__Ki_W = Lt_W_i_Li + np.eye(self.N)
-        #Y_tilde = np.dot(self.Wi__Ki_W, self.f_hat)
-
         Wi = 1.0/self.W
         self.Sigma_tilde = np.diagflat(Wi)
 
         Y_tilde = Wi*self.Ki_f + self.f_hat
 
-        #self.Wi_K_i = self.W_12*self.Bi*self.W_12.T #same as rasms R
-        #self.Wi_K_i = self.W_12*cho_solve((self.B_chol, True), np.diagflat(self.W_12))
         self.Wi_K_i = self.W12BiW12
-        #self.Wi_K_i, _, _, self.ln_det_Wi_K = pdinv(self.Sigma_tilde + self.K) # TODO: Check if Wi_K_i == R above and same with det below
-
         self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-
         self.lik = self.noise_model.link_function(self.data, self.f_hat, extra_data=self.extra_data)
-
         self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+
         Z_tilde = (+ self.lik
                    - 0.5*self.ln_B_det
                    + 0.5*self.ln_det_Wi_K
@@ -201,54 +203,46 @@ class Laplace(likelihood):
         """
         The laplace approximation algorithm, find K and expand hessian
         For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
-        :K: Covariance matrix
+        :param K: Covariance matrix evaluated at locations X
+        :type K: NxD matrix
         """
         self.K = K.copy()
 
         #Find mode
-        self.f_hat = {
-            'rasm': self.rasm_mode,
-            'ncg': self.ncg_mode,
-            'nelder': self.nelder_mode
-        }[self.opt](self.K)
+        self.f_hat = self.rasm_mode(self.K)
 
         #Compute hessian and other variables at mode
         self._compute_likelihood_variables()
 
+        #Compute fake variables replicating laplace approximation to posterior
+        self._compute_GP_variables()
+
     def _compute_likelihood_variables(self):
+        """
+        Compute the variables required to compute gaussian Y variables
+        """
         #At this point get the hessian matrix (or vector as W is diagonal)
         self.W = -self.noise_model.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
 
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
         self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))
 
-        #Do the computation again at f to get Ki_f which is useful
-        #b = self.W*self.f_hat + self.noise_model.dlik_df(self.data, self.f_hat, extra_data=self.extra_data)
-        #solve_chol = cho_solve((self.B_chol, True), np.dot(self.W_12*self.K, b))
-        #a = b - self.W_12*solve_chol
-        self.Ki_f = self.a
-
+        self.Ki_f = self.Ki_f
         self.f_Ki_f = np.dot(self.f_hat.T, self.Ki_f)
         self.Ki_W_i = self.K - mdot(self.K, self.W12BiW12, self.K)
 
-        #For det, |I + KW| == |I + W_12*K*W_12|
-        #self.ln_I_KW_det = pddet(np.eye(self.N) + self.W_12*self.K*self.W_12.T)
-
-        #self.ln_I_KW_det = pddet(np.eye(self.N) + np.dot(self.K, self.W))
-        #self.ln_z_hat = (- 0.5*self.f_Ki_f
-                         #- self.ln_I_KW_det
-                         #+ self.noise_model.link_function(self.data, self.f_hat, extra_data=self.extra_data)
-                         #)
-
-        return self._compute_GP_variables()
-
     def _compute_B_statistics(self, K, W, a):
-        """Rasmussen suggests the use of a numerically stable positive definite matrix B
+        """
+        Rasmussen suggests the use of a numerically stable positive definite matrix B
         Which has a positive diagonal element and can be easyily inverted
 
-        :K: Covariance matrix
-        :W: Negative hessian at a point (diagonal matrix)
-        :returns: (B, L)
+        :param K: Covariance matrix evaluated at locations X
+        :type K: NxD matrix
+        :param W: Negative hessian at a point (diagonal matrix)
+        :type W: Vector of diagonal values of hessian (1xN)
+        :param a: Matrix to calculate W12BiW12a
+        :type a: Matrix NxN
+        :returns: (W12BiW12, ln_B_det)
         """
         if not self.noise_model.log_concave:
             #print "Under 1e-10: {}".format(np.sum(W < 1e-10))
@@ -265,74 +259,37 @@ class Laplace(likelihood):
 
         W12BiW12= W_12*cho_solve((L, True), W_12*a)
         ln_B_det = 2*np.sum(np.log(np.diag(L)))
-        return (W12BiW12, ln_B_det)
+        return W12BiW12, ln_B_det
 
-    def nelder_mode(self, K):
-        f = np.zeros((self.N, 1))
-        self.Ki, _, _, self.ln_K_det = pdinv(K)
-        def obj(f):
-            res = -1 * (self.noise_model.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5*np.dot(f.T, np.dot(self.Ki, f)))
-            return float(res)
-
-        res = sp.optimize.minimize(obj, f, method='nelder-mead', options={'xtol': 1e-7, 'maxiter': 25000, 'disp': True})
-        f_new = res.x
-        return f_new[:, None]
-
-    def ncg_mode(self, K):
-        """
-        Find the mode using a normal ncg optimizer and inversion of K (numerically unstable but intuative)
-        :K: Covariance matrix
-        :returns: f_mode
-        """
-        self.Ki, _, _, self.ln_K_det = pdinv(K)
-
-        f = np.zeros((self.N, 1))
-
-        #FIXME: Can we get rid of this horrible reshaping?
-        #ONLY WORKS FOR 1D DATA
-        def obj(f):
-            res = -1 * (self.noise_model.link_function(self.data[:, 0], f, extra_data=self.extra_data) - 0.5 * np.dot(f.T, np.dot(self.Ki, f))
-                        - self.NORMAL_CONST)
-            return float(res)
-
-        def obj_grad(f):
-            res = -1 * (self.noise_model.dlik_df(self.data[:, 0], f, extra_data=self.extra_data) - np.dot(self.Ki, f))
-            return np.squeeze(res)
-
-        def obj_hess(f):
-            res = -1 * (np.diag(self.noise_model.d2lik_d2f(self.data[:, 0], f, extra_data=self.extra_data)) - self.Ki)
-            return np.squeeze(res)
-
-        f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess, disp=False)
-        return f_hat[:, None]
-
-    def rasm_mode(self, K, MAX_ITER=100, MAX_RESTART=10):
+    def rasm_mode(self, K, MAX_ITER=100):
         """
         Rasmussen's numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
+        Influenced by GPML (BSD) code, all errors are our own
 
-        :K: Covariance matrix
-        :MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
-        :MAX_RESTART: Maximum number of restarts (reducing step_size) before forcing finish of optimisation
-        :returns: f_mode
+        :param K: Covariance matrix evaluated at locations X
+        :type K: NxD matrix
+        :param MAX_ITER: Maximum number of iterations of newton-raphson before forcing finish of optimisation
+        :type MAX_ITER: scalar
+        :returns: f_hat, mode on which to make laplace approxmiation
+        :rtype: NxD matrix
         """
-        #self.old_before_s = self.noise_model._get_params()
-        #print "before: ", self.old_before_s
-        #if self.old_before_s < 1e-5:
+        #old_Ki_f = np.zeros((self.N, 1))
 
-        #old_a = np.zeros((self.N, 1))
-        if self.old_a is None:
-            old_a = np.zeros((self.N, 1))
-            f = np.dot(K, old_a)
+        #Start f's at zero originally
+        if self.old_Ki_f is None:
+            old_Ki_f = np.zeros((self.N, 1))
+            f = np.dot(K, old_Ki_f)
         else:
-            old_a = self.old_a.copy()
+            #Start at the old best point
+            old_Ki_f = self.old_Ki_f.copy()
             f = self.f_hat.copy()
 
         new_obj = -np.inf
         old_obj = np.inf
 
-        def obj(a, f):
-            return -0.5*np.dot(a.T, f) + self.noise_model.link_function(self.data, f, extra_data=self.extra_data)
+        def obj(Ki_f, f):
+            return -0.5*np.dot(Ki_f.T, f) + self.noise_model.link_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
         epsilon = 1e-6
@@ -340,42 +297,43 @@ class Laplace(likelihood):
         rs = 0
         i = 0
 
-        while difference > epsilon and i < MAX_ITER:# and rs < MAX_RESTART:
+        while difference > epsilon and i < MAX_ITER:
             W = -self.noise_model.d2lik_d2f(self.data, f, extra_data=self.extra_data)
 
             W_f = W*f
             grad = self.noise_model.dlik_df(self.data, f, extra_data=self.extra_data)
 
             b = W_f + grad
-            #TODO!!!
             W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
-            #solve_L = cho_solve((L, True), W_12*np.dot(K, b))
+
             #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
-            full_step_a = b - W12BiW12Kb
-            da = full_step_a - old_a
+            full_step_Ki_f = b - W12BiW12Kb
+            dKi_f = full_step_Ki_f - old_Ki_f
 
             f_old = f.copy()
-            def inner_obj(step_size, old_a, da, K):
-                a = old_a + step_size*da
-                f = np.dot(K, a)
-                self.a = a.copy() # This is nasty, need to set something within an optimization though
+            def inner_obj(step_size, old_Ki_f, dKi_f, K):
+                Ki_f = old_Ki_f + step_size*dKi_f
+                f = np.dot(K, Ki_f)
+                # This is nasty, need to set something within an optimization though
+                self.Ki_f = Ki_f.copy()
                 self.f = f.copy()
-                return -obj(a, f)
+                return -obj(Ki_f, f)
 
-            i_o = partial(inner_obj, old_a=old_a, da=da, K=K)
-            #new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=20)
+            i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
+            #Find the stepsize that minimizes the objective function using a brent line search
             new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':30}).fun
             f = self.f.copy()
-            a = self.a.copy()
+            Ki_f = self.Ki_f.copy()
 
+            #Optimize without linesearch
             #f_old = f.copy()
             #update_passed = False
             #while not update_passed:
-                #a = old_a + step_size*da
-                #f = np.dot(K, a)
+                #Ki_f = old_Ki_f + step_size*dKi_f
+                #f = np.dot(K, Ki_f)
 
                 #old_obj = new_obj
-                #new_obj = obj(a, f)
+                #new_obj = obj(Ki_f, f)
                 #difference = new_obj - old_obj
                 ##print "difference: ",difference
                 #if difference < 0:
@@ -390,70 +348,18 @@ class Laplace(likelihood):
                 #else:
                     #update_passed = True
 
+            #old_Ki_f = self.Ki_f.copy()
+
             #difference = abs(new_obj - old_obj)
             #old_obj = new_obj.copy()
             #difference = np.abs(np.sum(f - f_old))
-            difference = np.abs(np.sum(a - old_a))
-            #old_a = self.a.copy() #a
-            old_a = a.copy()
+            difference = np.abs(np.sum(Ki_f - old_Ki_f))
+            old_Ki_f = Ki_f.copy()
             i += 1
-            #print "a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
 
-        self.old_a = old_a.copy()
-        #print "Positive difference obj: ", np.float(difference)
-        #print "Iterations: {}, Step size reductions: {}, Final_difference: {}, step_size: {}".format(i, rs, difference, step_size)
-        #print "Iterations: {}, Final_difference: {}".format(i, difference)
+        self.old_Ki_f = old_Ki_f.copy()
         if difference > epsilon:
             print "Not perfect f_hat fit difference: {}".format(difference)
-            if False:
-                import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-                if hasattr(self, 'X'):
-                    import pylab as pb
-                    pb.figure()
-                    pb.subplot(311)
-                    pb.title('old f_hat')
-                    pb.plot(self.X, self.f_hat)
-                    pb.subplot(312)
-                    pb.title('old ff')
-                    pb.plot(self.X, self.old_ff)
-                    pb.subplot(313)
-                    pb.title('new f_hat')
-                    pb.plot(self.X, f)
 
-                    pb.figure()
-                    pb.subplot(121)
-                    pb.title('old K')
-                    pb.imshow(np.diagflat(self.old_K), interpolation='none')
-                    pb.colorbar()
-                    pb.subplot(122)
-                    pb.title('new K')
-                    pb.imshow(np.diagflat(K), interpolation='none')
-                    pb.colorbar()
-
-                    pb.figure()
-                    pb.subplot(121)
-                    pb.title('old W')
-                    pb.imshow(np.diagflat(self.old_W), interpolation='none')
-                    pb.colorbar()
-                    pb.subplot(122)
-                    pb.title('new W')
-                    pb.imshow(np.diagflat(W), interpolation='none')
-                    pb.colorbar()
-
-                    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-                    pb.close('all')
-
-        #FIXME: DELETE THESE
-        #self.old_W = W.copy()
-        #self.old_grad = grad.copy()
-        #self.old_B = B.copy()
-        #self.old_W_12 = W_12.copy()
-        #self.old_ff = f.copy()
-        #self.old_K = self.K.copy()
-        #self.old_s = self.noise_model._get_params()
-        #print "after: ", self.old_s
-        #print "FINAL a max: {} a min: {} a var: {}".format(np.max(self.a), np.min(self.a), np.var(self.a))
-        self.a = a
-        #self.B, self.B_chol, self.W_12 = B, L, W_12
-        #self.Bi, _, _, B_det = pdinv(self.B)
+        self.Ki_f = Ki_f
         return f
diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index 6b609016..89620987 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-from scipy import stats,special
+from scipy import stats, special
 import scipy as sp
 import gp_transformations
 from noise_distributions import NoiseDistribution
@@ -180,7 +180,6 @@ class StudentT(NoiseDistribution):
         #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
         true_var = sigma**2 + self.variance
 
-        print "True var: {}".format(true_var)
         return true_var
 
     def _predictive_mean_analytical(self, mu, var):
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index 0537e104..6d720f87 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -218,7 +218,7 @@ class LaplaceTests(unittest.TestCase):
         print "\n{}".format(inspect.stack()[0][3])
         self.Y = self.Y/self.Y.max()
         kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
-        gauss_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.gauss, opt='rasm')
+        gauss_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.gauss)
         m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=gauss_laplace)
         m.ensure_default_constraints()
         m.randomize()
@@ -230,7 +230,7 @@ class LaplaceTests(unittest.TestCase):
         self.Y = self.Y/self.Y.max()
         self.stu_t = GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var)
         kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
-        stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm')
+        stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
         m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
         m.ensure_default_constraints()
         m.constrain_positive('t_noise')
@@ -244,7 +244,7 @@ class LaplaceTests(unittest.TestCase):
         self.Y = self.Y/self.Y.max()
         white_var = 1
         kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
-        stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm')
+        stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
         m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
         m.ensure_default_constraints()
         m.constrain_positive('t_noise')
@@ -259,7 +259,7 @@ class LaplaceTests(unittest.TestCase):
         self.Y = self.Y/self.Y.max()
         white_var = 1
         kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
-        stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t, opt='rasm')
+        stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
         m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
         m.ensure_default_constraints()
         m.constrain_positive('t_noise')

From 2acf93148222936a706cdc59f8ebca0ff99a48b4 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 4 Oct 2013 14:44:50 +0100
Subject: [PATCH 093/252] Tidying up a lot, works for 1D, need to check for
 more dimensions

---
 GPy/examples/laplace_approximations.py        | 447 +-----------------
 GPy/likelihoods/laplace.py                    |   4 +-
 .../noise_models/gaussian_noise.py            |  20 +-
 .../noise_models/student_t_noise.py           | 105 ++--
 GPy/testing/laplace_tests.py                  |  26 +-
 doc/GPy.examples.rst                          |   8 +
 doc/GPy.kern.parts.rst                        |  16 +
 doc/GPy.likelihoods.noise_models.rst          |   8 +
 doc/GPy.likelihoods.rst                       |  16 +
 doc/GPy.testing.rst                           |  16 +
 doc/GPy.util.rst                              |  24 +
 11 files changed, 192 insertions(+), 498 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index eb78c47a..ea3a9f8e 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -4,402 +4,6 @@ import matplotlib.pyplot as plt
 from GPy.util import datasets
 np.random.seed(1)
 
-def timing():
-    real_var = 0.1
-    times = 1
-    deg_free = 10
-    real_sd = np.sqrt(real_var)
-    the_is = np.zeros(times)
-    X = np.linspace(0.0, 10.0, 300)[:, None]
-
-    for a in xrange(times):
-        Y = np.sin(X) + np.random.randn(*X.shape)*real_var
-        Yc = Y.copy()
-
-        Yc[10] += 100
-        Yc[25] += 10
-        Yc[23] += 10
-        Yc[24] += 10
-        Yc[250] += 10
-        #Yc[4] += 10000
-
-        edited_real_sd = real_sd
-        kernel1 = GPy.kern.rbf(X.shape[1])
-
-        t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-        corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
-        m = GPy.models.GPRegression(X, Yc.copy(), kernel1, likelihood=corrupt_stu_t_likelihood)
-        m.ensure_default_constraints()
-        m.update_likelihood_approximation()
-        m.optimize()
-        the_is[a] = m.likelihood.i
-
-    print the_is
-    print np.mean(the_is)
-
-def v_fail_test():
-    #plt.close('all')
-    real_var = 0.1
-    X = np.linspace(0.0, 10.0, 50)[:, None]
-    Y = np.sin(X) + np.random.randn(*X.shape)*real_var
-    Y = Y/Y.max()
-
-    #Add student t random noise to datapoints
-    deg_free = 10
-    real_sd = np.sqrt(real_var)
-    print "Real noise std: ", real_sd
-
-    kernel1 = GPy.kern.white(X.shape[1]) #+ GPy.kern.white(X.shape[1])
-
-    edited_real_sd = 0.3#real_sd
-    edited_real_sd = real_sd
-
-    print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
-    m = GPy.models.GPRegression(X, Y.copy(), kernel1, likelihood=stu_t_likelihood)
-    m.constrain_positive('')
-    vs = 25
-    noises = 30
-    checkgrads = np.zeros((vs, noises))
-    vs_noises = np.zeros((vs, noises))
-    for v_ind, v in enumerate(np.linspace(1, 100, vs)):
-        m.likelihood.likelihood_function.v = v
-        print v
-        for noise_ind, noise in enumerate(np.linspace(0.0001, 100, noises)):
-            m['t_noise'] = noise
-            m.update_likelihood_approximation()
-            checkgrads[v_ind, noise_ind] = m.checkgrad()
-            vs_noises[v_ind, noise_ind] = (float(v)/(float(v) - 2))*(noise**2)
-
-    plt.figure()
-    plt.title('Checkgrads')
-    plt.imshow(checkgrads, interpolation='nearest')
-    plt.xlabel('noise')
-    plt.ylabel('v')
-
-    #plt.figure()
-    #plt.title('variance change')
-    #plt.imshow(vs_noises, interpolation='nearest')
-    #plt.xlabel('noise')
-    #plt.ylabel('v')
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-    print(m)
-
-def student_t_obj_plane():
-    plt.close('all')
-    X = np.linspace(0, 1, 50)[:, None]
-    real_std = 0.002
-    noise = np.random.randn(*X.shape)*real_std
-    Y = np.sin(X*2*np.pi) + noise
-    deg_free = 1000
-
-    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
-    mgp.ensure_default_constraints()
-    mgp['noise'] = real_std**2
-    print "Gaussian"
-    print mgp
-
-    kernelst = kernelgp.copy()
-    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=(real_std**2))
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
-    m = GPy.models.GPRegression(X, Y, kernelst, likelihood=stu_t_likelihood)
-    m.ensure_default_constraints()
-    m.constrain_fixed('t_no', real_std**2)
-    vs = 10
-    ls = 10
-    objs_t = np.zeros((vs, ls))
-    objs_g = np.zeros((vs, ls))
-    rbf_vs = np.linspace(1e-6, 8, vs)
-    rbf_ls = np.linspace(1e-2, 8, ls)
-    for v_id, rbf_v in enumerate(rbf_vs):
-        for l_id, rbf_l in enumerate(rbf_ls):
-            m['rbf_v'] = rbf_v
-            m['rbf_l'] = rbf_l
-            mgp['rbf_v'] = rbf_v
-            mgp['rbf_l'] = rbf_l
-            objs_t[v_id, l_id] = m.log_likelihood()
-            objs_g[v_id, l_id] = mgp.log_likelihood()
-    plt.figure()
-    plt.subplot(211)
-    plt.title('Student t')
-    plt.imshow(objs_t, interpolation='none')
-    plt.xlabel('variance')
-    plt.ylabel('lengthscale')
-    plt.subplot(212)
-    plt.title('Gaussian')
-    plt.imshow(objs_g, interpolation='none')
-    plt.xlabel('variance')
-    plt.ylabel('lengthscale')
-    plt.show()
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-    return objs_t
-
-def student_t_f_check():
-    plt.close('all')
-    X = np.linspace(0, 1, 50)[:, None]
-    real_std = 0.2
-    noise = np.random.randn(*X.shape)*real_std
-    Y = np.sin(X*2*np.pi) + noise
-    deg_free = 1000
-
-    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
-    mgp.ensure_default_constraints()
-    mgp.randomize()
-    mgp.optimize()
-    print "Gaussian"
-    print mgp
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-
-    kernelst = kernelgp.copy()
-    #kernelst += GPy.kern.bias(X.shape[1])
-    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=0.05)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
-    m = GPy.models.GPRegression(X, Y.copy(), kernelst, likelihood=stu_t_likelihood)
-    #m['rbf_v'] = mgp._get_params()[0]
-    #m['rbf_l'] = mgp._get_params()[1] + 1
-    m.ensure_default_constraints()
-    #m.constrain_fixed('rbf_v', mgp._get_params()[0])
-    #m.constrain_fixed('rbf_l', mgp._get_params()[1])
-    #m.constrain_bounded('t_no', 2*real_std**2, 1e3)
-    #m.constrain_positive('bias')
-    m.constrain_positive('t_no')
-    m.randomize()
-    m['t_no'] = 0.3
-    m.likelihood.X = X
-    #print m
-    plt.figure()
-    plt.subplot(211)
-    m.plot()
-    print "OPTIMIZED ONCE"
-    plt.subplot(212)
-    m.optimize()
-    m.plot()
-    print "final optimised student t"
-    print m
-    print "real GP"
-    print mgp
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-    return m
-
-def student_t_fix_optimise_check():
-    plt.close('all')
-    real_var = 0.1
-    real_std = np.sqrt(real_var)
-    X = np.random.rand(200)[:, None]
-    noise = np.random.randn(*X.shape)*real_std
-    Y = np.sin(X*2*np.pi) + noise
-    X_full = X
-    Y_full = np.sin(X_full)
-    Y = Y/Y.max()
-    Y_full = Y_full/Y_full.max()
-    deg_free = 1000
-
-    #GP
-    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GPRegression(X, Y.copy(), kernel=kernelgp)
-    mgp.ensure_default_constraints()
-    mgp.randomize()
-    mgp.optimize()
-
-    kernelst = kernelgp.copy()
-    real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
-
-    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=real_stu_t_std2)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
-
-    plt.figure(1)
-    plt.suptitle('Student likelihood')
-    m = GPy.models.GPRegression(X, Y.copy(), kernelst, likelihood=stu_t_likelihood)
-    m.constrain_fixed('rbf_var', mgp._get_params()[0])
-    m.constrain_fixed('rbf_len', mgp._get_params()[1])
-    m.constrain_positive('t_noise')
-    #m.ensure_default_constraints()
-
-    m.update_likelihood_approximation()
-    print "T std2 {} converted from original data, LL: {}".format(real_stu_t_std2, m.log_likelihood())
-    plt.subplot(231)
-    m.plot()
-    plt.title('Student t original data noise')
-
-    #Fix student t noise variance to same a GP
-    gp_noise = mgp._get_params()[2]
-    m['t_noise_std2'] = gp_noise
-    m.update_likelihood_approximation()
-    print "T std2 {} same as GP noise, LL: {}".format(gp_noise, m.log_likelihood())
-    plt.subplot(232)
-    m.plot()
-    plt.title('Student t GP noise')
-
-    #Fix student t noise to variance converted from the GP
-    real_stu_t_std2gp = (gp_noise)*((deg_free - 2)/float(deg_free))
-    m['t_noise_std2'] = real_stu_t_std2gp
-    m.update_likelihood_approximation()
-    print "T std2 {} converted to student t noise from GP noise, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.log_likelihood())
-    plt.subplot(233)
-    m.plot()
-    plt.title('Student t GP noise converted')
-
-    m.constrain_positive('t_noise_std2')
-    m.randomize()
-    m.update_likelihood_approximation()
-    plt.subplot(234)
-    m.plot()
-    plt.title('Student t fixed rbf')
-    m.optimize()
-    print "T std2 {} var {} after optimising, LL: {}".format(m.likelihood.likelihood_function.sigma2, m.likelihood.likelihood_function.variance, m.log_likelihood())
-    plt.subplot(235)
-    m.plot()
-    plt.title('Student t fixed rbf optimised')
-
-    plt.figure(2)
-    mrbf = m.copy()
-    mrbf.unconstrain('')
-    mrbf.constrain_fixed('t_noise', m.likelihood.likelihood_function.sigma2)
-    gp_var = mgp._get_params()[0]
-    gp_len = mgp._get_params()[1]
-    mrbf.constrain_fixed('rbf_var', gp_var)
-    mrbf.constrain_positive('rbf_len')
-    mrbf.randomize()
-    print "Before optimize"
-    print mrbf
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-    mrbf.checkgrad(verbose=1)
-    plt.subplot(121)
-    mrbf.plot()
-    plt.title('Student t fixed noise')
-    mrbf.optimize()
-    print "After optimize"
-    print mrbf
-    plt.subplot(122)
-    mrbf.plot()
-    plt.title('Student t fixed noise optimized')
-    print mrbf
-
-    plt.figure(3)
-    print "GP noise {} after optimising, LL: {}".format(gp_noise, mgp.log_likelihood())
-    plt.suptitle('Gaussian likelihood optimised')
-    mgp.plot()
-    print "Real std: {}".format(real_std)
-    print "Real variance {}".format(real_std**2)
-
-    #import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-
-    print "Len should be: {}".format(gp_len)
-    return mrbf
-
-def debug_student_t_noise_approx():
-    plot = False
-    real_var = 0.1
-    #Start a function, any function
-    #X = np.linspace(0.0, 10.0, 50)[:, None]
-    X = np.random.rand(100)[:, None]
-    #X = np.random.rand(100)[:, None]
-    #X = np.array([0.5, 1])[:, None]
-    Y = np.sin(X*2*np.pi) + np.random.randn(*X.shape)*real_var + 1
-    #Y = X + np.random.randn(*X.shape)*real_var
-    #ty = np.array([1., 9.97733584, 4.17841363])[:, None]
-    #Y = ty
-
-    X_full = X
-    Y_full = np.sin(X_full) + 1
-
-    Y = Y/Y.max()
-
-    #Add student t random noise to datapoints
-    deg_free = 100
-
-    real_sd = np.sqrt(real_var)
-    print "Real noise std: ", real_sd
-
-    initial_var_guess = 0.3
-    #t_rv = t(deg_free, loc=0, scale=real_var)
-    #noise = t_rvrvs(size=Y.shape)
-    #Y += noise
-
-    plt.close('all')
-    # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1]) #+ GPy.kern.white(X.shape[1])
-    #kernel1 = GPy.kern.linear(X.shape[1]) + GPy.kern.white(X.shape[1])
-    kernel2 = kernel1.copy()
-    kernel3 = kernel1.copy()
-    kernel4 = kernel1.copy()
-    kernel5 = kernel1.copy()
-    kernel6 = kernel1.copy()
-
-    print "Clean Gaussian"
-    #A GP should completely break down due to the points as they get a lot of weight
-    # create simple GP model
-    #m = GPy.models.GPRegression(X, Y, kernel=kernel1)
-    ## optimize
-    #m.ensure_default_constraints()
-    #m.optimize()
-    ## plot
-    #if plot:
-        #plt.figure(1)
-        #plt.suptitle('Gaussian likelihood')
-        #plt.subplot(131)
-        #m.plot()
-        #plt.plot(X_full, Y_full)
-    #print m
-
-    real_stu_t_std = np.sqrt(real_var*((deg_free - 2)/float(deg_free)))
-    edited_real_sd = real_stu_t_std**2 #initial_var_guess #real_sd
-    #edited_real_sd = real_sd
-
-    print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
-
-    m = GPy.models.GPRegression(X, Y, kernel6, likelihood=stu_t_likelihood)
-    #m['rbf_len'] = 1.5
-    #m.constrain_fixed('rbf_v', 1.0898)
-    #m.constrain_fixed('rbf_l', 0.2651)
-    #m.constrain_fixed('t_noise_std2', edited_real_sd)
-    #m.constrain_positive('rbf')
-    m.constrain_positive('t_noise_std2')
-    #m.constrain_positive('')
-    #m.constrain_bounded('t_noi', 0.001, 10)
-    #m.constrain_fixed('t_noi', real_stu_t_std)
-    #m.constrain_fixed('white', 0.01)
-    #m.constrain_fixed('t_no', 0.01)
-    #m['rbf_var'] = 0.20446332
-    #m['rbf_leng'] = 0.85776241
-    #m['t_noise'] = 0.667083294421005
-    m.ensure_default_constraints()
-    m.update_likelihood_approximation()
-    #m.optimize(messages=True)
-    print(m)
-    #return m
-    #m.optimize('lbfgsb', messages=True, callback=m._update_params_callback)
-    if plot:
-        plt.suptitle('Student-t likelihood')
-        plt.subplot(132)
-        m.plot()
-        plt.plot(X_full, Y_full)
-        plt.ylim(-2.5, 2.5)
-    print "Real noise std: ", real_sd
-    print "or Real noise std: ", real_stu_t_std
-    return m
-
-    #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
-    #m = GPy.models.GPRegression(X, stu_t_likelihood, kernel3)
-    #m.ensure_default_constraints()
-    #m.update_likelihood_approximation()
-    #m.optimize()
-    #print(m)
-    #if plot:
-        #plt.subplot(133)
-        #m.plot()
-        #plt.plot(X_full, Y_full)
-        #plt.ylim(-2.5, 2.5)
-
-    #plt.show()
-
 def student_t_approx():
     """
     Example of regressing with a student t likelihood
@@ -415,8 +19,10 @@ def student_t_approx():
 
     Y = Y/Y.max()
 
+    #Slightly noisy data
     Yc[75:80] += 1
 
+    #Very noisy data
     #Yc[10] += 100
     #Yc[25] += 10
     #Yc[23] += 10
@@ -427,22 +33,12 @@ def student_t_approx():
     #Add student t random noise to datapoints
     deg_free = 5
     print "Real noise: ", real_std
-
     initial_var_guess = 0.5
+
     #t_rv = t(deg_free, loc=0, scale=real_var)
     #noise = t_rvrvs(size=Y.shape)
     #Y += noise
 
-    #Add some extreme value noise to some of the datapoints
-    #percent_corrupted = 0.15
-    #corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
-    #indices = np.arange(Y.shape[0])
-    #np.random.shuffle(indices)
-    #corrupted_indices = indices[:corrupted_datums]
-    #print corrupted_indices
-    #noise = t_rv.rvs(size=(len(corrupted_indices), 1))
-    #Y[corrupted_indices] += noise
-
     plt.figure(1)
     plt.suptitle('Gaussian likelihood')
     # Kernel object
@@ -459,6 +55,7 @@ def student_t_approx():
     m = GPy.models.GPRegression(X, Y, kernel=kernel1)
     # optimize
     m.ensure_default_constraints()
+    m.constrain_fixed('white', 1e-4)
     m.randomize()
     m.optimize()
     # plot
@@ -473,6 +70,7 @@ def student_t_approx():
     print "Corrupt Gaussian"
     m = GPy.models.GPRegression(X, Yc, kernel=kernel2)
     m.ensure_default_constraints()
+    m.constrain_fixed('white', 1e-4)
     m.randomize()
     m.optimize()
     ax = plt.subplot(212)
@@ -492,6 +90,7 @@ def student_t_approx():
     m = GPy.models.GPRegression(X, Y.copy(), kernel6, likelihood=stu_t_likelihood)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
+    m.constrain_fixed('white', 1e-4)
     m.randomize()
     #m.update_likelihood_approximation()
     m.optimize()
@@ -510,7 +109,6 @@ def student_t_approx():
     m.constrain_positive('t_noise')
     m.constrain_fixed('white', 1e-4)
     m.randomize()
-    #m.update_likelihood_approximation()
     for a in range(1):
         m.randomize()
         m_start = m.copy()
@@ -523,7 +121,6 @@ def student_t_approx():
     plt.ylim(-1.5, 1.5)
     plt.title('Student-t rasm corrupt')
 
-    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
     return m
 
     #with a student t distribution, since it has heavy tails it should work well
@@ -545,38 +142,6 @@ def student_t_approx():
 
     return m
 
-
-def noisy_laplace_approx():
-    """
-    Example of regressing with a student t likelihood
-    """
-    #Start a function, any function
-    X = np.sort(np.random.uniform(0, 15, 70))[:, None]
-    Y = np.sin(X)
-
-    #Add some extreme value noise to some of the datapoints
-    percent_corrupted = 0.05
-    corrupted_datums = int(np.round(Y.shape[0] * percent_corrupted))
-    indices = np.arange(Y.shape[0])
-    np.random.shuffle(indices)
-    corrupted_indices = indices[:corrupted_datums]
-    print corrupted_indices
-    noise = np.random.uniform(-10, 10, (len(corrupted_indices), 1))
-    Y[corrupted_indices] += noise
-
-    #A GP should completely break down due to the points as they get a lot of weight
-    # create simple GP model
-    m = GPy.models.GPRegression(X, Y)
-
-    # optimize
-    m.ensure_default_constraints()
-    m.optimize()
-    # plot
-    m.plot()
-    print m
-
-    #with a student t distribution, since it has heavy tails it should work well
-
 def gaussian_f_check():
     plt.close('all')
     X = np.linspace(0, 1, 50)[:, None]
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 46203506..46ca66bb 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -178,7 +178,7 @@ class Laplace(likelihood):
 
         self.Wi_K_i = self.W12BiW12
         self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        self.lik = self.noise_model.link_function(self.data, self.f_hat, extra_data=self.extra_data)
+        self.lik = self.noise_model.lik_function(self.data, self.f_hat, extra_data=self.extra_data)
         self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
 
         Z_tilde = (+ self.lik
@@ -289,7 +289,7 @@ class Laplace(likelihood):
         old_obj = np.inf
 
         def obj(Ki_f, f):
-            return -0.5*np.dot(Ki_f.T, f) + self.noise_model.link_function(self.data, f, extra_data=self.extra_data)
+            return -0.5*np.dot(Ki_f.T, f) + self.noise_model.lik_function(self.data, f, extra_data=self.extra_data)
 
         difference = np.inf
         epsilon = 1e-6
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index 38729883..f4251ff3 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -76,7 +76,7 @@ class Gaussian(NoiseDistribution):
         new_sigma2 = self.predictive_variance(mu,sigma)
         return new_sigma2*(mu/sigma**2 + self.gp_link.transf(mu)/self.variance)
 
-    def _predictive_variance_analytical(self,mu,sigma):
+    def _predictive_variance_analytical(self,mu,sigma,predictive_mean=None):
         return 1./(1./self.variance + 1./sigma**2)
 
     def _mass(self,gp,obs):
@@ -116,8 +116,8 @@ class Gaussian(NoiseDistribution):
     def _d2variance_dgp2(self,gp):
         return 0
 
-    def link_function(self, y, f, extra_data=None):
-        """link_function $\ln p(y|f)$
+    def lik_function(self, y, f, extra_data=None):
+        """lik_function $\ln p(y|f)$
         $$\ln p(y_{i}|f_{i}) = \ln $$
 
         :y: data
@@ -128,10 +128,9 @@ class Gaussian(NoiseDistribution):
         """
         assert y.shape == f.shape
         e = y - f
-        eeT = np.dot(e, e.T)
         objective = (- 0.5*self.D*np.log(2*np.pi)
                      - 0.5*self.ln_det_K
-                     - (0.5/self.variance)*np.dot(e.T, e) # As long as K is diagonal
+                     - (0.5/self.variance)*np.sum(np.square(e)) # As long as K is diagonal
                      )
         return np.sum(objective)
 
@@ -146,14 +145,14 @@ class Gaussian(NoiseDistribution):
 
         """
         assert y.shape == f.shape
-        s2_i = (1.0/self.variance)*self.I
-        grad = np.dot(s2_i, y) - np.dot(s2_i, f)
+        s2_i = (1.0/self.variance)
+        grad = s2_i*y - s2_i*f
         return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
         """
         Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
-        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+        i.e. second derivative lik_function at y given f f_j  w.r.t f and f_j
 
         Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
         (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
@@ -164,13 +163,12 @@ class Gaussian(NoiseDistribution):
         :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
         """
         assert y.shape == f.shape
-        s2_i = (1.0/self.variance)*self.I
-        hess = np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
+        hess = -(1.0/self.variance)*np.ones((self.N, 1))
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
         """
-        Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+        Third order derivative lik_function (log-likelihood ) at y given f f_j w.r.t f and f_j
 
         $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
         """
diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index 89620987..000168e1 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -15,10 +15,8 @@ class StudentT(NoiseDistribution):
 
     For nomanclature see Bayesian Data Analysis 2003 p576
 
-    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2)$$
-
     .. math::
-        Fill in maths
+        \\ln p(y_{i}|f_{i}) = \\ln \\Gamma(\\frac{v+1}{2}) - \\ln \\Gamma(\\frac{v}{2})\\sqrt{v \\pi}\\sigma - \\frac{v+1}{2}\\ln (1 + \\frac{1}{v}\\left(\\frac{y_{i} - f_{i}}{\\sigma}\\right)^2)
 
     """
     def __init__(self,gp_link=None,analytical_mean=True,analytical_variance=True, deg_free=5, sigma2=2):
@@ -42,16 +40,20 @@ class StudentT(NoiseDistribution):
     def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * self.sigma2
 
-    def link_function(self, y, f, extra_data=None):
-        """link_function $\ln p(y|f)$
-        $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
+    def lik_function(self, y, f, extra_data=None):
+        """
+        Log Likelihood Function
 
-        For wolfram alpha import parts for derivative of sigma are -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+        .. math::
+            \\ln p(y_{i}|f_{i}) = \\ln \\Gamma(\\frac{v+1}{2}) - \\ln \\Gamma(\\frac{v}{2})\\sqrt{v \\pi}\sigma - \\frac{v+1}{2}\\ln (1 + \\frac{1}{v}\\left(\\frac{y_{i} - f_{i}}{\\sigma}\\right)^2
 
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: float(likelihood evaluated for this point)
+        :param y: data
+        :type y: NxD matrix
+        :param f: latent variables f
+        :type f: NxD matrix
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: likelihood evaluated for this point
+        :rtype: float
 
         """
         assert y.shape == f.shape
@@ -65,14 +67,18 @@ class StudentT(NoiseDistribution):
 
     def dlik_df(self, y, f, extra_data=None):
         """
-        Gradient of the link function at y, given f w.r.t f
+        Gradient of the log likelihood function at y, given f w.r.t f
 
-        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
+        .. math::
+            \\frac{d \\ln p(y_{i}|f_{i})}{df} = \\frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \\sigma^{2}v}
 
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
+        :param y: data
+        :type y: NxD matrix
+        :param f: latent variables f
+        :type f: NxD matrix
+        :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: gradient of likelihood evaluated at points
+        :rtype: 1xN array
 
         """
         assert y.shape == f.shape
@@ -82,18 +88,23 @@ class StudentT(NoiseDistribution):
 
     def d2lik_d2f(self, y, f, extra_data=None):
         """
-        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
-        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
+        Hessian at y, given f, w.r.t f the hessian will be 0 unless i == j
+        i.e. second derivative lik_function at y given f_{i} f_{j}  w.r.t f_{i} and f_{j}
 
-        Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-        (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = \\frac{(v+1)((y_{i}-f_{i})^{2} - \\sigma^{2}v)}{((y_{i}-f_{i})^{2} + \\sigma^{2}v)^{2}}
 
-        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
+        :param y: data
+        :type y: NxD matrix
+        :param f: latent variables f
+        :type f: NxD matrix
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: 1xN array
 
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
         """
         assert y.shape == f.shape
         e = y - f
@@ -102,9 +113,18 @@ class StudentT(NoiseDistribution):
 
     def d3lik_d3f(self, y, f, extra_data=None):
         """
-        Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+        Third order derivative log-likelihood function at y given f w.r.t f
 
-        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|f_{i})}{d^{3}f} = \\frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \\sigma^{2} v))}{((y_{i} - f_{i}) + \\sigma^{2} v)^3}
+
+        :param y: data
+        :type y: NxD matrix
+        :param f: latent variables f
+        :type f: NxD matrix
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: 1xN array
         """
         assert y.shape == f.shape
         e = y - f
@@ -115,23 +135,39 @@ class StudentT(NoiseDistribution):
 
     def dlik_dvar(self, y, f, extra_data=None):
         """
-        Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
+        Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise)
 
-        Terms relavent to derivatives wrt sigma are:
-        -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
+        .. math::
+            \\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = -\\frac{1}{\\sigma} + \\frac{(1+v)(y_{i}-f_{i})^2}{\\sigma^3 v(1 + \\frac{1}{v}(\\frac{(y_{i} - f_{i})}{\\sigma^2})^2)}
 
-        $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
+        :param y: data
+        :type y: NxD matrix
+        :param f: latent variables f
+        :type f: NxD matrix
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :rtype: 1x1 array
         """
         assert y.shape == f.shape
         e = y - f
         dlik_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
-        return np.sum(dlik_dvar) #May not want to sum over all dimensions if using many D?
+        #FIXME: May not want to sum over all dimensions if using many D?
+        return np.sum(dlik_dvar)
 
     def dlik_df_dvar(self, y, f, extra_data=None):
         """
-        Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
+        Derivative of the dlik_df w.r.t variance parameter (t_noise)
 
-        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|f_{i})}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \\sigma^2 v)^2}
+
+        :param y: data
+        :type y: NxD matrix
+        :param f: latent variables f
+        :type f: NxD matrix
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :rtype: 1xN array
         """
         assert y.shape == f.shape
         e = y - f
@@ -180,6 +216,7 @@ class StudentT(NoiseDistribution):
         #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
         true_var = sigma**2 + self.variance
 
+        print true_var
         return true_var
 
     def _predictive_mean_analytical(self, mu, var):
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index 6d720f87..debb3c27 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -66,7 +66,7 @@ class LaplaceTests(unittest.TestCase):
     def setUp(self):
         self.N = 5
         self.D = 1
-        self.X = np.linspace(0, self.D, self.N)[:, None]
+        self.X = np.random.rand(self.N, self.D)
 
         self.real_std = 0.1
         noise = np.random.randn(*self.X.shape)*self.real_std
@@ -93,7 +93,7 @@ class LaplaceTests(unittest.TestCase):
 
     def test_gaussian_dlik_df(self):
         print "\n{}".format(inspect.stack()[0][3])
-        link = functools.partial(self.gauss.link_function, self.Y)
+        link = functools.partial(self.gauss.lik_function, self.Y)
         dlik_df = functools.partial(self.gauss.dlik_df, self.Y)
         grad = GradientChecker(link, dlik_df, self.f.copy(), 'f')
         grad.randomize()
@@ -128,6 +128,8 @@ class LaplaceTests(unittest.TestCase):
         grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
+        grad.checkgrad()
+
         self.assertTrue(grad.checkgrad())
 
     def test_gaussian_d3lik_d3f(self):
@@ -142,7 +144,7 @@ class LaplaceTests(unittest.TestCase):
     def test_gaussian_dlik_dvar(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.gauss.link_function, self.gauss.dlik_dvar,
+                dparam_checkgrad(self.gauss.lik_function, self.gauss.dlik_dvar,
                     [self.var], args=(self.Y, self.f), constrain_positive=True,
                     randomize=False, verbose=True)
                 )
@@ -159,19 +161,21 @@ class LaplaceTests(unittest.TestCase):
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
                 dparam_checkgrad(self.gauss.d2lik_d2f, self.gauss.d2lik_d2f_dvar,
-                    [self.var], args=(self.Y, self.f), constrain_positive=True,
+                    [self.var], args=(self.Y, self.f.copy()), constrain_positive=True,
                     randomize=True, verbose=True)
                 )
 
     def test_studentt_dlik_df(self):
         print "\n{}".format(inspect.stack()[0][3])
-        link = functools.partial(self.stu_t.link_function, self.Y)
+        link = functools.partial(self.stu_t.lik_function, self.Y)
         dlik_df = functools.partial(self.stu_t.dlik_df, self.Y)
         grad = GradientChecker(link, dlik_df, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
+    """ Gradchecker fault """
+    @unittest.expectedFailure
     def test_studentt_d2lik_d2f(self):
         print "\n{}".format(inspect.stack()[0][3])
         dlik_df = functools.partial(self.stu_t.dlik_df, self.Y)
@@ -193,7 +197,7 @@ class LaplaceTests(unittest.TestCase):
     def test_studentt_dlik_dvar(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.stu_t.link_function, self.stu_t.dlik_dvar,
+                dparam_checkgrad(self.stu_t.lik_function, self.stu_t.dlik_dvar,
                     [self.var], args=(self.Y.copy(), self.f.copy()),
                     constrain_positive=True, randomize=True, verbose=True)
                 )
@@ -220,6 +224,7 @@ class LaplaceTests(unittest.TestCase):
         kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
         gauss_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.gauss)
         m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=gauss_laplace)
+        import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
         m.ensure_default_constraints()
         m.randomize()
         m.checkgrad(verbose=1, step=self.step)
@@ -242,7 +247,7 @@ class LaplaceTests(unittest.TestCase):
     def test_studentt_rbf(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.Y = self.Y/self.Y.max()
-        white_var = 1
+        white_var = 0.001
         kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
         stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
         m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
@@ -254,10 +259,12 @@ class LaplaceTests(unittest.TestCase):
         print m
         self.assertTrue(m.checkgrad(step=self.step))
 
+    """ With small variances its likely the implicit part isn't perfectly correct? """
+    @unittest.expectedFailure
     def test_studentt_rbf_smallvar(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.Y = self.Y/self.Y.max()
-        white_var = 1
+        white_var = 0.001
         kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
         stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
         m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
@@ -265,8 +272,7 @@ class LaplaceTests(unittest.TestCase):
         m.constrain_positive('t_noise')
         m.constrain_fixed('white', white_var)
         m['t_noise'] = 0.01
-        m.checkgrad(verbose=1, step=self.step)
-        print m
+        m.checkgrad(verbose=1)
         self.assertTrue(m.checkgrad(step=self.step))
 
 if __name__ == "__main__":
diff --git a/doc/GPy.examples.rst b/doc/GPy.examples.rst
index 4fd3528f..288ff631 100644
--- a/doc/GPy.examples.rst
+++ b/doc/GPy.examples.rst
@@ -20,6 +20,14 @@ GPy.examples.dimensionality_reduction module
     :undoc-members:
     :show-inheritance:
 
+GPy.examples.laplace_approximations module
+------------------------------------------
+
+.. automodule:: GPy.examples.laplace_approximations
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.examples.regression module
 ------------------------------
 
diff --git a/doc/GPy.kern.parts.rst b/doc/GPy.kern.parts.rst
index ec0661b4..650fe5cb 100644
--- a/doc/GPy.kern.parts.rst
+++ b/doc/GPy.kern.parts.rst
@@ -28,6 +28,14 @@ GPy.kern.parts.Matern52 module
     :undoc-members:
     :show-inheritance:
 
+GPy.kern.parts.ODE_1 module
+---------------------------
+
+.. automodule:: GPy.kern.parts.ODE_1
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.kern.parts.bias module
 --------------------------
 
@@ -44,6 +52,14 @@ GPy.kern.parts.coregionalize module
     :undoc-members:
     :show-inheritance:
 
+GPy.kern.parts.eq_ode1 module
+-----------------------------
+
+.. automodule:: GPy.kern.parts.eq_ode1
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.kern.parts.exponential module
 ---------------------------------
 
diff --git a/doc/GPy.likelihoods.noise_models.rst b/doc/GPy.likelihoods.noise_models.rst
index d1a4f451..c16ee7d1 100644
--- a/doc/GPy.likelihoods.noise_models.rst
+++ b/doc/GPy.likelihoods.noise_models.rst
@@ -60,6 +60,14 @@ GPy.likelihoods.noise_models.poisson_noise module
     :undoc-members:
     :show-inheritance:
 
+GPy.likelihoods.noise_models.student_t_noise module
+---------------------------------------------------
+
+.. automodule:: GPy.likelihoods.noise_models.student_t_noise
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 
 Module contents
 ---------------
diff --git a/doc/GPy.likelihoods.rst b/doc/GPy.likelihoods.rst
index c3da2650..2e7da879 100644
--- a/doc/GPy.likelihoods.rst
+++ b/doc/GPy.likelihoods.rst
@@ -43,6 +43,14 @@ GPy.likelihoods.gaussian_mixed_noise module
     :undoc-members:
     :show-inheritance:
 
+GPy.likelihoods.laplace module
+------------------------------
+
+.. automodule:: GPy.likelihoods.laplace
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.likelihoods.likelihood module
 ---------------------------------
 
@@ -51,6 +59,14 @@ GPy.likelihoods.likelihood module
     :undoc-members:
     :show-inheritance:
 
+GPy.likelihoods.likelihood_functions module
+-------------------------------------------
+
+.. automodule:: GPy.likelihoods.likelihood_functions
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.likelihoods.noise_model_constructors module
 -----------------------------------------------
 
diff --git a/doc/GPy.testing.rst b/doc/GPy.testing.rst
index bd5258b7..ef25ba60 100644
--- a/doc/GPy.testing.rst
+++ b/doc/GPy.testing.rst
@@ -4,6 +4,14 @@ GPy.testing package
 Submodules
 ----------
 
+GPy.testing.bcgplvm_tests module
+--------------------------------
+
+.. automodule:: GPy.testing.bcgplvm_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.testing.bgplvm_tests module
 -------------------------------
 
@@ -44,6 +52,14 @@ GPy.testing.kernel_tests module
     :undoc-members:
     :show-inheritance:
 
+GPy.testing.laplace_tests module
+--------------------------------
+
+.. automodule:: GPy.testing.laplace_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.testing.mapping_tests module
 --------------------------------
 
diff --git a/doc/GPy.util.rst b/doc/GPy.util.rst
index c86280a7..5aca7cf9 100644
--- a/doc/GPy.util.rst
+++ b/doc/GPy.util.rst
@@ -43,6 +43,14 @@ GPy.util.decorators module
     :undoc-members:
     :show-inheritance:
 
+GPy.util.erfcx module
+---------------------
+
+.. automodule:: GPy.util.erfcx
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.linalg module
 ----------------------
 
@@ -51,6 +59,14 @@ GPy.util.linalg module
     :undoc-members:
     :show-inheritance:
 
+GPy.util.ln_diff_erfs module
+----------------------------
+
+.. automodule:: GPy.util.ln_diff_erfs
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.misc module
 --------------------
 
@@ -99,6 +115,14 @@ GPy.util.squashers module
     :undoc-members:
     :show-inheritance:
 
+GPy.util.symbolic module
+------------------------
+
+.. automodule:: GPy.util.symbolic
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.univariate_Gaussian module
 -----------------------------------
 

From 4925d8a0d94d240f5674399f8014fd2b725083c6 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 4 Oct 2013 15:38:59 +0100
Subject: [PATCH 094/252] Doccing and testing for D dimensional input (not
 multiple dimensional Y yet)

---
 .../noise_models/student_t_noise.py           | 50 +++++++++++--------
 GPy/testing/laplace_tests.py                  | 15 +++---
 2 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index 000168e1..dc78b582 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -48,9 +48,9 @@ class StudentT(NoiseDistribution):
             \\ln p(y_{i}|f_{i}) = \\ln \\Gamma(\\frac{v+1}{2}) - \\ln \\Gamma(\\frac{v}{2})\\sqrt{v \\pi}\sigma - \\frac{v+1}{2}\\ln (1 + \\frac{1}{v}\\left(\\frac{y_{i} - f_{i}}{\\sigma}\\right)^2
 
         :param y: data
-        :type y: NxD matrix
+        :type y: Nx1 matrix
         :param f: latent variables f
-        :type f: NxD matrix
+        :type f: Nx1 matrix
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: likelihood evaluated for this point
         :rtype: float
@@ -73,12 +73,12 @@ class StudentT(NoiseDistribution):
             \\frac{d \\ln p(y_{i}|f_{i})}{df} = \\frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \\sigma^{2}v}
 
         :param y: data
-        :type y: NxD matrix
+        :type y: Nx1 matrix
         :param f: latent variables f
-        :type f: NxD matrix
+        :type f: Nx1 matrix
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: gradient of likelihood evaluated at points
-        :rtype: 1xN array
+        :rtype: Nx1 array
 
         """
         assert y.shape == f.shape
@@ -95,12 +95,12 @@ class StudentT(NoiseDistribution):
             \\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = \\frac{(v+1)((y_{i}-f_{i})^{2} - \\sigma^{2}v)}{((y_{i}-f_{i})^{2} + \\sigma^{2}v)^{2}}
 
         :param y: data
-        :type y: NxD matrix
+        :type y: Nx1 matrix
         :param f: latent variables f
-        :type f: NxD matrix
+        :type f: Nx1 matrix
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
-        :rtype: 1xN array
+        :rtype: Nx1 array
 
         .. Note::
             Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
@@ -119,12 +119,12 @@ class StudentT(NoiseDistribution):
             \\frac{d^{3} \\ln p(y_{i}|f_{i})}{d^{3}f} = \\frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \\sigma^{2} v))}{((y_{i} - f_{i}) + \\sigma^{2} v)^3}
 
         :param y: data
-        :type y: NxD matrix
+        :type y: Nx1 matrix
         :param f: latent variables f
-        :type f: NxD matrix
+        :type f: Nx1 matrix
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: third derivative of likelihood evaluated at points f
-        :rtype: 1xN array
+        :rtype: Nx1 array
         """
         assert y.shape == f.shape
         e = y - f
@@ -138,15 +138,17 @@ class StudentT(NoiseDistribution):
         Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise)
 
         .. math::
-            \\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = -\\frac{1}{\\sigma} + \\frac{(1+v)(y_{i}-f_{i})^2}{\\sigma^3 v(1 + \\frac{1}{v}(\\frac{(y_{i} - f_{i})}{\\sigma^2})^2)}
+            \\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = \\frac{v((y_{i} - f_{i})^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - f_{i})^{2})}
+
+        -\\frac{1}{\\sigma} + \\frac{(1+v)(y_{i}-f_{i})^2}{\\sigma^3 v(1 + \\frac{1}{v}(\\frac{(y_{i} - f_{i})}{\\sigma^2})^2)}
 
         :param y: data
-        :type y: NxD matrix
+        :type y: Nx1 matrix
         :param f: latent variables f
-        :type f: NxD matrix
+        :type f: Nx1 matrix
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
-        :rtype: 1x1 array
+        :rtype: float
         """
         assert y.shape == f.shape
         e = y - f
@@ -162,12 +164,12 @@ class StudentT(NoiseDistribution):
             \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|f_{i})}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \\sigma^2 v)^2}
 
         :param y: data
-        :type y: NxD matrix
+        :type y: Nx1 matrix
         :param f: latent variables f
-        :type f: NxD matrix
+        :type f: Nx1 matrix
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
-        :rtype: 1xN array
+        :rtype: Nx1 array
         """
         assert y.shape == f.shape
         e = y - f
@@ -178,7 +180,16 @@ class StudentT(NoiseDistribution):
         """
         Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
 
-        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f}) = \\frac{2\\sigma v(v + 1)(\\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \\sigma^2 v)^3}
+
+        :param y: data
+        :type y: Nx1 matrix
+        :param f: latent variables f
+        :type f: Nx1 matrix
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
+        :rtype: Nx1 array
         """
         assert y.shape == f.shape
         e = y - f
@@ -216,7 +227,6 @@ class StudentT(NoiseDistribution):
         #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
         true_var = sigma**2 + self.variance
 
-        print true_var
         return true_var
 
     def _predictive_mean_analytical(self, mu, var):
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index debb3c27..e1876296 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -65,16 +65,16 @@ def dparam_checkgrad(func, dfunc, params, args, constrain_positive=True, randomi
 class LaplaceTests(unittest.TestCase):
     def setUp(self):
         self.N = 5
-        self.D = 1
-        self.X = np.random.rand(self.N, self.D)
+        self.D = 3
+        self.X = np.random.rand(self.N, self.D)*10
 
         self.real_std = 0.1
-        noise = np.random.randn(*self.X.shape)*self.real_std
-        self.Y = np.sin(self.X*2*np.pi) + noise
+        noise = np.random.randn(*self.X[:, 0].shape)*self.real_std
+        self.Y = (np.sin(self.X[:, 0]*2*np.pi) + noise)[:, None]
         #self.Y = np.array([[1.0]])#np.sin(self.X*2*np.pi) + noise
         self.var = 0.2
 
-        self.f = np.random.rand(self.N, self.D)
+        self.f = np.random.rand(self.N, 1)
         #self.f = np.array([[3.0]])#np.sin(self.X*2*np.pi) + noise
 
         self.var = np.random.rand(1)
@@ -109,6 +109,8 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
+    """ Gradchecker fault """
+    @unittest.expectedFailure
     def test_gaussian_d2lik_d2f_2(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.Y = None
@@ -174,8 +176,6 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    """ Gradchecker fault """
-    @unittest.expectedFailure
     def test_studentt_d2lik_d2f(self):
         print "\n{}".format(inspect.stack()[0][3])
         dlik_df = functools.partial(self.stu_t.dlik_df, self.Y)
@@ -224,7 +224,6 @@ class LaplaceTests(unittest.TestCase):
         kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
         gauss_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.gauss)
         m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=gauss_laplace)
-        import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
         m.ensure_default_constraints()
         m.randomize()
         m.checkgrad(verbose=1, step=self.step)

From 91f194cd29874be61c11067552c7034b3ca2ac04 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 4 Oct 2013 16:32:04 +0100
Subject: [PATCH 095/252] More doc strings

---
 GPy/likelihoods/laplace.py                    |   9 +-
 GPy/likelihoods/noise_model_constructors.py   |  11 +-
 .../noise_models/gaussian_noise.py            | 104 ++++++++++++++----
 .../noise_models/student_t_noise.py           |  34 +++---
 4 files changed, 110 insertions(+), 48 deletions(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 46ca66bb..11b1731b 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -203,8 +203,9 @@ class Laplace(likelihood):
         """
         The laplace approximation algorithm, find K and expand hessian
         For nomenclature see Rasmussen & Williams 2006 - modified for numerical stability
-        :param K: Covariance matrix evaluated at locations X
-        :type K: NxD matrix
+
+        :param K: Prior covariance matrix evaluated at locations X
+        :type K: NxN matrix
         """
         self.K = K.copy()
 
@@ -236,8 +237,8 @@ class Laplace(likelihood):
         Rasmussen suggests the use of a numerically stable positive definite matrix B
         Which has a positive diagonal element and can be easyily inverted
 
-        :param K: Covariance matrix evaluated at locations X
-        :type K: NxD matrix
+        :param K: Prior covariance matrix evaluated at locations X
+        :type K: NxN matrix
         :param W: Negative hessian at a point (diagonal matrix)
         :type W: Vector of diagonal values of hessian (1xN)
         :param a: Matrix to calculate W12BiW12a
diff --git a/GPy/likelihoods/noise_model_constructors.py b/GPy/likelihoods/noise_model_constructors.py
index 05d8db55..26d07391 100644
--- a/GPy/likelihoods/noise_model_constructors.py
+++ b/GPy/likelihoods/noise_model_constructors.py
@@ -90,7 +90,9 @@ def gaussian(gp_link=None, variance=2, D=None, N=None):
     Construct a Gaussian likelihood
 
     :param gp_link: a GPy gp_link function
-    :param variance: scalar, variance
+    :param variance: variance
+    :type variance: scalar
+    :returns: Gaussian noise model:
     """
     if gp_link is None:
         gp_link = noise_models.gp_transformations.Identity()
@@ -104,8 +106,11 @@ def student_t(gp_link=None, deg_free=5, sigma2=2):
     Construct a Student t likelihood
 
     :param gp_link: a GPy gp_link function
-    :param deg_free: scalar, degrees of freedom
-    :param sigma2: scalar, variance
+    :param deg_free: degrees of freedom of student-t
+    :type deg_free: scalar
+    :param sigma2: variance
+    :type sigma2: scalar
+    :returns: Student-T noise model
     """
     if gp_link is None:
         gp_link = noise_models.gp_transformations.Identity()
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index f4251ff3..2ca6c373 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -117,14 +117,19 @@ class Gaussian(NoiseDistribution):
         return 0
 
     def lik_function(self, y, f, extra_data=None):
-        """lik_function $\ln p(y|f)$
-        $$\ln p(y_{i}|f_{i}) = \ln $$
+        """
+        Log likelihood function
 
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: float(likelihood evaluated for this point)
+        .. math::
+            \\ln p(y_{i}|f_{i}) = -\\frac{D \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - f_{i})^{T}\\sigma^{-2}(y_{i} - f_{i})}{2}
 
+        :param y: data
+        :type y: Nx1 array
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: likelihood evaluated for this point
+        :rtype: float
         """
         assert y.shape == f.shape
         e = y - f
@@ -138,10 +143,16 @@ class Gaussian(NoiseDistribution):
         """
         Gradient of the link function at y, given f w.r.t f
 
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
+        .. math::
+            \\frac{d \\ln p(y_{i}|f_{i})}{df} = \\frac{1}{\\sigma^{2}}(y_{i} - f_{i})
+
+        :param y: data
+        :type y: Nx1 array
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
 
         """
         assert y.shape == f.shape
@@ -151,16 +162,23 @@ class Gaussian(NoiseDistribution):
 
     def d2lik_d2f(self, y, f, extra_data=None):
         """
-        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
-        i.e. second derivative lik_function at y given f f_j  w.r.t f and f_j
+        Hessian at y, given f, w.r.t f the hessian will be 0 unless i == j
+        i.e. second derivative lik_function at y given f_{i} f_{j}  w.r.t f_{i} and f_{j}
 
-        Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-        (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
 
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
+        :param y: data
+        :type y: Nx1 array
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
         """
         assert y.shape == f.shape
         hess = -(1.0/self.variance)*np.ones((self.N, 1))
@@ -168,9 +186,18 @@ class Gaussian(NoiseDistribution):
 
     def d3lik_d3f(self, y, f, extra_data=None):
         """
-        Third order derivative lik_function (log-likelihood ) at y given f f_j w.r.t f and f_j
+        Third order derivative log-likelihood function at y given f w.r.t f
 
-        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|f_{i})}{d^{3}f} = 0
+
+        :param y: data
+        :type y: Nx1 array
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
         """
         assert y.shape == f.shape
         d3lik_d3f = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
@@ -178,7 +205,18 @@ class Gaussian(NoiseDistribution):
 
     def dlik_dvar(self, y, f, extra_data=None):
         """
-        Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
+        Gradient of the log-likelihood function at y given f, w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = \\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - f_{i})^{2}}{2\\sigma^{4}}
+
+        :param y: data
+        :type y: Nx1 array
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :rtype: float
         """
         assert y.shape == f.shape
         e = y - f
@@ -188,7 +226,18 @@ class Gaussian(NoiseDistribution):
 
     def dlik_df_dvar(self, y, f, extra_data=None):
         """
-        Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
+        Derivative of the dlik_df w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|f_{i})}{df}) = \\frac{1}{\\sigma^{4}}(-y_{i} + f_{i})
+
+        :param y: data
+        :type y: Nx1 array
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :rtype: Nx1 array
         """
         assert y.shape == f.shape
         s_4 = 1.0/(self.variance**2)
@@ -197,9 +246,18 @@ class Gaussian(NoiseDistribution):
 
     def d2lik_d2f_dvar(self, y, f, extra_data=None):
         """
-        Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
+        Gradient of the hessian (d2lik_d2f) w.r.t variance parameter (noise_variance)
 
-        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f}) = \\frac{1}{\\sigma^{4}}
+
+        :param y: data
+        :type y: Nx1 array
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
+        :rtype: Nx1 array
         """
         assert y.shape == f.shape
         dlik_hess_dsigma = np.diag((1.0/(self.variance**2))*self.I)[:, None]
diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index dc78b582..0ba517a6 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -48,9 +48,9 @@ class StudentT(NoiseDistribution):
             \\ln p(y_{i}|f_{i}) = \\ln \\Gamma(\\frac{v+1}{2}) - \\ln \\Gamma(\\frac{v}{2})\\sqrt{v \\pi}\sigma - \\frac{v+1}{2}\\ln (1 + \\frac{1}{v}\\left(\\frac{y_{i} - f_{i}}{\\sigma}\\right)^2
 
         :param y: data
-        :type y: Nx1 matrix
+        :type y: Nx1 array
         :param f: latent variables f
-        :type f: Nx1 matrix
+        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: likelihood evaluated for this point
         :rtype: float
@@ -73,9 +73,9 @@ class StudentT(NoiseDistribution):
             \\frac{d \\ln p(y_{i}|f_{i})}{df} = \\frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \\sigma^{2}v}
 
         :param y: data
-        :type y: Nx1 matrix
+        :type y: Nx1 array
         :param f: latent variables f
-        :type f: Nx1 matrix
+        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: gradient of likelihood evaluated at points
         :rtype: Nx1 array
@@ -95,9 +95,9 @@ class StudentT(NoiseDistribution):
             \\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = \\frac{(v+1)((y_{i}-f_{i})^{2} - \\sigma^{2}v)}{((y_{i}-f_{i})^{2} + \\sigma^{2}v)^{2}}
 
         :param y: data
-        :type y: Nx1 matrix
+        :type y: Nx1 array
         :param f: latent variables f
-        :type f: Nx1 matrix
+        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
         :rtype: Nx1 array
@@ -119,9 +119,9 @@ class StudentT(NoiseDistribution):
             \\frac{d^{3} \\ln p(y_{i}|f_{i})}{d^{3}f} = \\frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \\sigma^{2} v))}{((y_{i} - f_{i}) + \\sigma^{2} v)^3}
 
         :param y: data
-        :type y: Nx1 matrix
+        :type y: Nx1 array
         :param f: latent variables f
-        :type f: Nx1 matrix
+        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: third derivative of likelihood evaluated at points f
         :rtype: Nx1 array
@@ -140,12 +140,10 @@ class StudentT(NoiseDistribution):
         .. math::
             \\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = \\frac{v((y_{i} - f_{i})^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - f_{i})^{2})}
 
-        -\\frac{1}{\\sigma} + \\frac{(1+v)(y_{i}-f_{i})^2}{\\sigma^3 v(1 + \\frac{1}{v}(\\frac{(y_{i} - f_{i})}{\\sigma^2})^2)}
-
         :param y: data
-        :type y: Nx1 matrix
+        :type y: Nx1 array
         :param f: latent variables f
-        :type f: Nx1 matrix
+        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
         :rtype: float
@@ -164,9 +162,9 @@ class StudentT(NoiseDistribution):
             \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|f_{i})}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \\sigma^2 v)^2}
 
         :param y: data
-        :type y: Nx1 matrix
+        :type y: Nx1 array
         :param f: latent variables f
-        :type f: Nx1 matrix
+        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
         :rtype: Nx1 array
@@ -178,15 +176,15 @@ class StudentT(NoiseDistribution):
 
     def d2lik_d2f_dvar(self, y, f, extra_data=None):
         """
-        Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
+        Gradient of the hessian (d2lik_d2f) w.r.t variance parameter (t_noise)
 
         .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f}) = \\frac{2\\sigma v(v + 1)(\\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \\sigma^2 v)^3}
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - f_{i})^{2})}{(\\sigma^{2}v + (y_{i} - f_{i})^{2})^{3}}
 
         :param y: data
-        :type y: Nx1 matrix
+        :type y: Nx1 array
         :param f: latent variables f
-        :type f: Nx1 matrix
+        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
         :rtype: Nx1 array

From ec36007564a1f335a48607cc95e362bfc0a3fd80 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 4 Oct 2013 16:33:23 +0100
Subject: [PATCH 096/252] Removed fit as it is unused

---
 GPy/likelihoods/likelihood.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index 61f7d8aa..a86eaac6 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -34,9 +34,6 @@ class likelihood(Parameterized):
     def _set_params(self, x):
         raise NotImplementedError
 
-    def fit(self):
-        raise NotImplementedError
-
     def fit_full(self, K):
         """
         No approximations needed by default

From 4738467a955124ae6ea3942aff9201627784f1a1 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 4 Oct 2013 19:31:23 +0100
Subject: [PATCH 097/252] Docs

---
 GPy/likelihoods/noise_models/gaussian_noise.py      | 10 ++++++++--
 GPy/likelihoods/noise_models/noise_distributions.py | 10 +++++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index 2ca6c373..df351cf1 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -94,7 +94,10 @@ class Gaussian(NoiseDistribution):
 
     def _mean(self,gp):
         """
-        Mass (or density) function
+        Expected value of y under the Mass (or density) function p(y|f)
+
+        .. math::
+            E_{p(y|f)}[y]
         """
         return self.gp_link.transf(gp)
 
@@ -106,7 +109,10 @@ class Gaussian(NoiseDistribution):
 
     def _variance(self,gp):
         """
-        Mass (or density) function
+        Variance of y under the Mass (or density) function p(y|f)
+
+        .. math::
+            Var_{p(y|f)}[y]
         """
         return self.variance
 
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 33a79ce8..c5297172 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -248,19 +248,27 @@ class NoiseDistribution(object):
 
     def _predictive_mean_analytical(self,mu,sigma):
         """
+        Predictive mean
+        .. math::
+            E(Y^{*}|Y) = E( E(Y^{*}|f^{*}, Y) )
+
         If available, this function computes the predictive mean analytically.
         """
         pass
 
     def _predictive_variance_analytical(self,mu,sigma):
         """
+        Predictive variance
+        .. math::
+            V(Y^{*}| Y) = E( V(Y^{*}|f^{*}, Y) ) + V( E(Y^{*}|f^{*}, Y) )
+
         If available, this function computes the predictive variance analytically.
         """
         pass
 
     def _predictive_mean_numerical(self,mu,sigma):
         """
-        Laplace approximation to the predictive mean: E(Y_star) = E( E(Y_star|f_star) )
+        Laplace approximation to the predictive mean: E(Y_star|Y) = E( E(Y_star|f_star, Y) )
 
         :param mu: cavity distribution mean
         :param sigma: cavity distribution standard deviation

From 77bca5547055bb76ef66b9ba132661bbdc631761 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 7 Oct 2013 15:28:40 +0100
Subject: [PATCH 098/252] Beginning to merge lik_functions and derivatives with
 richardos

---
 .../noise_models/gaussian_noise.py            | 29 +++++++++++---
 GPy/testing/laplace_tests.py                  | 39 ++++++++++++++++---
 2 files changed, 57 insertions(+), 11 deletions(-)

diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index df351cf1..afd5d297 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -33,7 +33,8 @@ class Gaussian(NoiseDistribution):
         self.I = np.eye(self.N)
         self.covariance_matrix = self.I * self.variance
         self.Ki = self.I*(1.0 / self.variance)
-        self.ln_det_K = np.sum(np.log(np.diag(self.covariance_matrix)))
+        #self.ln_det_K = np.sum(np.log(np.diag(self.covariance_matrix)))
+        self.ln_det_K = self.N*np.log(self.variance)
 
     def _laplace_gradients(self, y, f, extra_data=None):
         #must be listed in same order as 'get_param_names'
@@ -81,10 +82,26 @@ class Gaussian(NoiseDistribution):
 
     def _mass(self,gp,obs):
         #return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) )
-        return stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance))
+        #Assumes no covariance, exp, sum, log for numerical stability
+        return np.exp(np.sum(np.log(stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance)))))
 
-    def _nlog_mass(self,gp,obs):
-        return .5*((self.gp_link.transf(gp)-obs)**2/self.variance + np.log(2.*np.pi*self.variance))
+    def _nlog_mass(self,gp,obs, extra_data=None):
+        """
+        Negative Log likelihood function
+
+        .. math::
+            \\-ln p(y_{i}|f_{i}) = +\\frac{D \\ln 2\\pi}{2} + \\frac{\\ln |K|}{2} + \\frac{(y_{i} - f_{i})^{T}\\sigma^{-2}(y_{i} - f_{i})}{2}
+
+        :param y: data
+        :type y: Nx1 array
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        assert gp.shape == obs.shape
+        return .5*(np.sum((self.gp_link.transf(gp)-obs)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi))
 
     def _dnlog_mass_dgp(self,gp,obs):
         return (self.gp_link.transf(gp)-obs)/self.variance * self.gp_link.dtransf_df(gp)
@@ -139,7 +156,7 @@ class Gaussian(NoiseDistribution):
         """
         assert y.shape == f.shape
         e = y - f
-        objective = (- 0.5*self.D*np.log(2*np.pi)
+        objective = (- 0.5*self.N*np.log(2*np.pi)
                      - 0.5*self.ln_det_K
                      - (0.5/self.variance)*np.sum(np.square(e)) # As long as K is diagonal
                      )
@@ -206,7 +223,7 @@ class Gaussian(NoiseDistribution):
         :rtype: Nx1 array
         """
         assert y.shape == f.shape
-        d3lik_d3f = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
+        d3lik_d3f = np.diagonal(0*self.I)[:, None]
         return d3lik_d3f
 
     def dlik_dvar(self, y, f, extra_data=None):
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index e1876296..acd60b4a 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -64,18 +64,16 @@ def dparam_checkgrad(func, dfunc, params, args, constrain_positive=True, randomi
 
 class LaplaceTests(unittest.TestCase):
     def setUp(self):
-        self.N = 5
+        self.N = 50
         self.D = 3
         self.X = np.random.rand(self.N, self.D)*10
 
         self.real_std = 0.1
         noise = np.random.randn(*self.X[:, 0].shape)*self.real_std
         self.Y = (np.sin(self.X[:, 0]*2*np.pi) + noise)[:, None]
-        #self.Y = np.array([[1.0]])#np.sin(self.X*2*np.pi) + noise
-        self.var = 0.2
-
         self.f = np.random.rand(self.N, 1)
-        #self.f = np.array([[3.0]])#np.sin(self.X*2*np.pi) + noise
+
+        self.var = 0.2
 
         self.var = np.random.rand(1)
         self.stu_t = GPy.likelihoods.student_t(deg_free=5, sigma2=self.var)
@@ -91,6 +89,37 @@ class LaplaceTests(unittest.TestCase):
         self.f = None
         self.X = None
 
+    def test_lik_mass(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        np.testing.assert_almost_equal(
+                                np.sum(self.gauss._nlog_mass(self.f.copy(), self.Y.copy())),
+                                -self.gauss.lik_function(self.Y.copy(), self.f.copy()))
+
+    def test_mass_nlog_mass(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        np.testing.assert_almost_equal(
+                               -np.log(self.gauss._mass(self.f.copy(), self.Y.copy())),
+                               self.gauss._nlog_mass(self.f.copy(), self.Y.copy()))
+
+    def test_gaussian_dnlog_mass_dgp(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        link = functools.partial(self.gauss._nlog_mass, obs=self.Y)
+        dlik_df = functools.partial(self.gauss._dnlog_mass_dgp, obs=self.Y)
+        grad = GradientChecker(link, dlik_df, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
+
+    def test_gaussian_d2nlog_mass_d2gp(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        link = functools.partial(self.gauss._dnlog_mass_dgp, obs=self.Y)
+        dlik_df = functools.partial(self.gauss._d2nlog_mass_dgp2, obs=self.Y)
+        grad = GradientChecker(link, dlik_df, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
+
+
     def test_gaussian_dlik_df(self):
         print "\n{}".format(inspect.stack()[0][3])
         link = functools.partial(self.gauss.lik_function, self.Y)

From 76debef6b87ebddc2661272866d0ea0b068a2a03 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 7 Oct 2013 17:59:40 +0100
Subject: [PATCH 099/252] Finished tearing gaussian noise down, time for
 student t

---
 GPy/likelihoods/laplace.py                    |  12 +-
 .../noise_models/gaussian_noise.py            | 293 ++++++++----------
 .../noise_models/gp_transformations.py        |  15 +-
 .../noise_models/student_t_noise.py           |  16 +-
 GPy/testing/laplace_tests.py                  |  63 +++-
 5 files changed, 208 insertions(+), 191 deletions(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 11b1731b..26365467 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -76,7 +76,7 @@ class Laplace(likelihood):
         return self.noise_model._set_params(p)
 
     def _shared_gradients_components(self):
-        d3lik_d3fhat = self.noise_model.d3lik_d3f(self.data, self.f_hat, extra_data=self.extra_data)
+        d3lik_d3fhat = -self.noise_model._d3nlog_mass_dgp3(self.f_hat, self.data, extra_data=self.extra_data)
         dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
         I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
         return dL_dfhat, I_KW_i
@@ -89,7 +89,7 @@ class Laplace(likelihood):
         :rtype: Matrix (1 x num_kernel_params)
         """
         dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlp = self.noise_model.dlik_df(self.data, self.f_hat)
+        dlp = -self.noise_model._dnlog_mass_dgp(self.data, self.f_hat)
 
         #Explicit
         #expl_a = np.dot(self.Ki_f, self.Ki_f.T)
@@ -178,7 +178,7 @@ class Laplace(likelihood):
 
         self.Wi_K_i = self.W12BiW12
         self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        self.lik = self.noise_model.lik_function(self.data, self.f_hat, extra_data=self.extra_data)
+        self.lik = -self.noise_model._nlog_mass(self.f_hat, self.data, extra_data=self.extra_data)
         self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
 
         Z_tilde = (+ self.lik
@@ -237,7 +237,7 @@ class Laplace(likelihood):
         Rasmussen suggests the use of a numerically stable positive definite matrix B
         Which has a positive diagonal element and can be easyily inverted
 
-        :param K: Prior covariance matrix evaluated at locations X
+        :param K: Prior Covariance matrix evaluated at locations X
         :type K: NxN matrix
         :param W: Negative hessian at a point (diagonal matrix)
         :type W: Vector of diagonal values of hessian (1xN)
@@ -290,7 +290,7 @@ class Laplace(likelihood):
         old_obj = np.inf
 
         def obj(Ki_f, f):
-            return -0.5*np.dot(Ki_f.T, f) + self.noise_model.lik_function(self.data, f, extra_data=self.extra_data)
+            return -0.5*np.dot(Ki_f.T, f) - self.noise_model._nlog_mass(f, self.data, extra_data=self.extra_data)
 
         difference = np.inf
         epsilon = 1e-6
@@ -302,7 +302,7 @@ class Laplace(likelihood):
             W = -self.noise_model.d2lik_d2f(self.data, f, extra_data=self.extra_data)
 
             W_f = W*f
-            grad = self.noise_model.dlik_df(self.data, f, extra_data=self.extra_data)
+            grad = -self.noise_model._dnlog_mass_dgp(f, self.data, extra_data=self.extra_data)
 
             b = W_f + grad
             W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index afd5d297..51b7c6a1 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -38,9 +38,9 @@ class Gaussian(NoiseDistribution):
 
     def _laplace_gradients(self, y, f, extra_data=None):
         #must be listed in same order as 'get_param_names'
-        derivs = ([self.dlik_dvar(y, f, extra_data=extra_data)],
-                  [self.dlik_df_dvar(y, f, extra_data=extra_data)],
-                  [self.d2lik_d2f_dvar(y, f, extra_data=extra_data)]
+        derivs = ([-self._dnlog_mass_dvar(f, y, extra_data=extra_data)],
+                  [-self._dnlog_mass_dgp_dvar(f, y, extra_data=extra_data)],
+                  [-self._d2nlog_mass_dgp2_dvar(f, y, extra_data=extra_data)]
                  ) # lists as we might learn many parameters
         # ensure we have gradients for every parameter we want to optimize
         assert len(derivs[0]) == len(self._get_param_names())
@@ -80,22 +80,23 @@ class Gaussian(NoiseDistribution):
     def _predictive_variance_analytical(self,mu,sigma,predictive_mean=None):
         return 1./(1./self.variance + 1./sigma**2)
 
-    def _mass(self,gp,obs):
+    def _mass(self, gp, obs):
         #return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) )
         #Assumes no covariance, exp, sum, log for numerical stability
         return np.exp(np.sum(np.log(stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance)))))
 
-    def _nlog_mass(self,gp,obs, extra_data=None):
+    def _nlog_mass(self, gp, obs, extra_data=None):
         """
         Negative Log likelihood function
+        Chained with link function deriative
 
         .. math::
-            \\-ln p(y_{i}|f_{i}) = +\\frac{D \\ln 2\\pi}{2} + \\frac{\\ln |K|}{2} + \\frac{(y_{i} - f_{i})^{T}\\sigma^{-2}(y_{i} - f_{i})}{2}
+            \\-ln p(y_{i}|\\lambda(f_{i})) = +\\frac{D \\ln 2\\pi}{2} + \\frac{\\ln |K|}{2} + \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
 
-        :param y: data
-        :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
+        :param gp: latent variables (f)
+        :type gp: Nx1 array
+        :param obs: data (y)
+        :type obs: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: likelihood evaluated for this point
         :rtype: float
@@ -103,12 +104,133 @@ class Gaussian(NoiseDistribution):
         assert gp.shape == obs.shape
         return .5*(np.sum((self.gp_link.transf(gp)-obs)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi))
 
-    def _dnlog_mass_dgp(self,gp,obs):
+    def _dnlog_mass_dgp(self, gp, obs, extra_data=None):
+        """
+        Negative Gradient of the link function at y, given f w.r.t f
+        Chained with link function deriative
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|f_{i})}{df} = \\frac{1}{\\sigma^{2}}(y_{i} - f_{i})
+            \\frac{d \\-ln p(y_{i}|f_{i})}{df} = -\\frac{1}{\\sigma^{2}}(y_{i} - \\lambda(f_{i}))\\frac{d\\lambda(f_{i})}{df_{i}}
+
+        :param gp: latent variables (f)
+        :type gp: Nx1 array
+        :param obs: data (y)
+        :type obs: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: gradient of negative likelihood evaluated at points
+        :rtype: Nx1 array
+        """
+        assert gp.shape == obs.shape
         return (self.gp_link.transf(gp)-obs)/self.variance * self.gp_link.dtransf_df(gp)
 
-    def _d2nlog_mass_dgp2(self,gp,obs):
+    def _d2nlog_mass_dgp2(self, gp, obs, extra_data=None):
+        """
+        Negative Hessian at y, given f, w.r.t f the hessian will be 0 unless i == j
+        i.e. second derivative _nlog_mass at y given f_{i} f_{j}  w.r.t f_{i} and f_{j}
+        Chained with link function deriative
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
+
+        :param gp: latent variables (f)
+        :type gp: Nx1 array
+        :param obs: data (y)
+        :type obs: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+        """
+        assert gp.shape == obs.shape
+        #FIXME: Why squared?
         return ((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)/self.variance
 
+    def _d3nlog_mass_dgp3(self, gp, obs, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given f w.r.t f
+        Chained with link function deriative
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|f_{i})}{d^{3}f} = 0
+
+        :param gp: latent variables (f)
+        :type gp: Nx1 array
+        :param obs: data (y)
+        :type obs: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert gp.shape == obs.shape
+        d2lambda_df2 = self.gp_link.d2transf_df2(gp)
+        return ((self.gp_link.transf(gp)-obs)*self.gp_link.d3transf_df3(gp) - self.gp_link.dtransf_df(gp)*d2lambda_df2 + d2lambda_df2)/self.variance
+
+    def _dnlog_mass_dvar(self, gp, obs, extra_data=None):
+        """
+        Gradient of the negative log-likelihood function at y given f, w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = \\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - f_{i})^{2}}{2\\sigma^{4}}
+
+        :param gp: latent variables (f)
+        :type gp: Nx1 array
+        :param obs: data (y)
+        :type obs: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :rtype: float
+        """
+        assert gp.shape == obs.shape
+        e = (obs - self.gp_link.transf(gp))
+        s_4 = 1.0/(self.variance**2)
+        dnlik_dsigma = 0.5*self.N/self.variance - 0.5*s_4*np.dot(e.T, e)
+        return np.sum(dnlik_dsigma) # Sure about this sum?
+
+    def _dnlog_mass_dgp_dvar(self, gp, obs, extra_data=None):
+        """
+        Derivative of the dlik_df w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|f_{i})}{df}) = \\frac{1}{\\sigma^{4}}(-y_{i} + f_{i})
+
+        :param y: data
+        :type y: Nx1 array
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert gp.shape == obs.shape
+        s_4 = 1.0/(self.variance**2)
+        dnlik_grad_dsigma = s_4*(obs - self.gp_link.transf(gp))*self.gp_link.dtransf_df(gp)
+        return dnlik_grad_dsigma
+
+    def _d2nlog_mass_dgp2_dvar(self, gp, obs, extra_data=None):
+        """
+        Gradient of the hessian (d2lik_d2f) w.r.t variance parameter (noise_variance)
+
+        .. math::
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f}) = \\frac{1}{\\sigma^{4}}
+
+        :param gp: latent variables (f)
+        :type gp: Nx1 array
+        :param obs: data (y)
+        :type obs: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
+        :rtype: Nx1 array
+        """
+        assert gp.shape == obs.shape
+        s_4 = 1.0/(self.variance**2)
+        #FIXME: Why squared?
+        dnlik_hess_dvar = -s_4*((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)
+        return dnlik_hess_dvar
+
     def _mean(self,gp):
         """
         Expected value of y under the Mass (or density) function p(y|f)
@@ -138,150 +260,3 @@ class Gaussian(NoiseDistribution):
 
     def _d2variance_dgp2(self,gp):
         return 0
-
-    def lik_function(self, y, f, extra_data=None):
-        """
-        Log likelihood function
-
-        .. math::
-            \\ln p(y_{i}|f_{i}) = -\\frac{D \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - f_{i})^{T}\\sigma^{-2}(y_{i} - f_{i})}{2}
-
-        :param y: data
-        :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: likelihood evaluated for this point
-        :rtype: float
-        """
-        assert y.shape == f.shape
-        e = y - f
-        objective = (- 0.5*self.N*np.log(2*np.pi)
-                     - 0.5*self.ln_det_K
-                     - (0.5/self.variance)*np.sum(np.square(e)) # As long as K is diagonal
-                     )
-        return np.sum(objective)
-
-    def dlik_df(self, y, f, extra_data=None):
-        """
-        Gradient of the link function at y, given f w.r.t f
-
-        .. math::
-            \\frac{d \\ln p(y_{i}|f_{i})}{df} = \\frac{1}{\\sigma^{2}}(y_{i} - f_{i})
-
-        :param y: data
-        :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: gradient of likelihood evaluated at points
-        :rtype: Nx1 array
-
-        """
-        assert y.shape == f.shape
-        s2_i = (1.0/self.variance)
-        grad = s2_i*y - s2_i*f
-        return grad
-
-    def d2lik_d2f(self, y, f, extra_data=None):
-        """
-        Hessian at y, given f, w.r.t f the hessian will be 0 unless i == j
-        i.e. second derivative lik_function at y given f_{i} f_{j}  w.r.t f_{i} and f_{j}
-
-        .. math::
-            \\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
-
-        :param y: data
-        :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
-        :rtype: Nx1 array
-
-        .. Note::
-            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-            (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
-        """
-        assert y.shape == f.shape
-        hess = -(1.0/self.variance)*np.ones((self.N, 1))
-        return hess
-
-    def d3lik_d3f(self, y, f, extra_data=None):
-        """
-        Third order derivative log-likelihood function at y given f w.r.t f
-
-        .. math::
-            \\frac{d^{3} \\ln p(y_{i}|f_{i})}{d^{3}f} = 0
-
-        :param y: data
-        :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: third derivative of likelihood evaluated at points f
-        :rtype: Nx1 array
-        """
-        assert y.shape == f.shape
-        d3lik_d3f = np.diagonal(0*self.I)[:, None]
-        return d3lik_d3f
-
-    def dlik_dvar(self, y, f, extra_data=None):
-        """
-        Gradient of the log-likelihood function at y given f, w.r.t variance parameter (noise_variance)
-
-        .. math::
-            \\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = \\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - f_{i})^{2}}{2\\sigma^{4}}
-
-        :param y: data
-        :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
-        :rtype: float
-        """
-        assert y.shape == f.shape
-        e = y - f
-        s_4 = 1.0/(self.variance**2)
-        dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.dot(e.T, e)
-        return np.sum(dlik_dsigma) # Sure about this sum?
-
-    def dlik_df_dvar(self, y, f, extra_data=None):
-        """
-        Derivative of the dlik_df w.r.t variance parameter (noise_variance)
-
-        .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|f_{i})}{df}) = \\frac{1}{\\sigma^{4}}(-y_{i} + f_{i})
-
-        :param y: data
-        :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
-        :rtype: Nx1 array
-        """
-        assert y.shape == f.shape
-        s_4 = 1.0/(self.variance**2)
-        dlik_grad_dsigma = -np.dot(s_4*self.I, y) + np.dot(s_4*self.I, f)
-        return dlik_grad_dsigma
-
-    def d2lik_d2f_dvar(self, y, f, extra_data=None):
-        """
-        Gradient of the hessian (d2lik_d2f) w.r.t variance parameter (noise_variance)
-
-        .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f}) = \\frac{1}{\\sigma^{4}}
-
-        :param y: data
-        :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
-        :rtype: Nx1 array
-        """
-        assert y.shape == f.shape
-        dlik_hess_dsigma = np.diag((1.0/(self.variance**2))*self.I)[:, None]
-        return dlik_hess_dsigma
diff --git a/GPy/likelihoods/noise_models/gp_transformations.py b/GPy/likelihoods/noise_models/gp_transformations.py
index e95e9df7..c6e316e8 100644
--- a/GPy/likelihoods/noise_models/gp_transformations.py
+++ b/GPy/likelihoods/noise_models/gp_transformations.py
@@ -24,19 +24,25 @@ class GPTransformation(object):
         """
         Gaussian process tranformation function, latent space -> output space
         """
-        pass
+        raise NotImplementedError
 
     def dtransf_df(self,f):
         """
         derivative of transf(f) w.r.t. f
         """
-        pass
+        raise NotImplementedError
 
     def d2transf_df2(self,f):
         """
         second derivative of transf(f) w.r.t. f
         """
-        pass
+        raise NotImplementedError
+
+    def d3transf_df3(self,f):
+        """
+        third derivative of transf(f) w.r.t. f
+        """
+        raise NotImplementedError
 
 class Identity(GPTransformation):
     """
@@ -54,6 +60,9 @@ class Identity(GPTransformation):
     def d2transf_df2(self,f):
         return 0
 
+    def d3transf_df3(self,f):
+        return 0
+
 
 class Probit(GPTransformation):
     """
diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index 0ba517a6..c4319313 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -40,30 +40,30 @@ class StudentT(NoiseDistribution):
     def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * self.sigma2
 
-    def lik_function(self, y, f, extra_data=None):
+    def _nlog_mass(self, gp, obs, extra_data=None):
         """
         Log Likelihood Function
 
         .. math::
             \\ln p(y_{i}|f_{i}) = \\ln \\Gamma(\\frac{v+1}{2}) - \\ln \\Gamma(\\frac{v}{2})\\sqrt{v \\pi}\sigma - \\frac{v+1}{2}\\ln (1 + \\frac{1}{v}\\left(\\frac{y_{i} - f_{i}}{\\sigma}\\right)^2
 
-        :param y: data
-        :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
+        :param gp: latent variables (f)
+        :type gp: Nx1 array
+        :param obs: data (y)
+        :type obs: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: likelihood evaluated for this point
         :rtype: float
 
         """
-        assert y.shape == f.shape
-        e = y - f
+        assert gp.shape == obs.shape
+        e = obs - self.gp_link.transf(gp)
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - 0.5*np.log(self.sigma2 * self.v * np.pi)
                      - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
                     )
-        return np.sum(objective)
+        return -np.sum(objective)
 
     def dlik_df(self, y, f, extra_data=None):
         """
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index acd60b4a..1154052e 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -64,7 +64,7 @@ def dparam_checkgrad(func, dfunc, params, args, constrain_positive=True, randomi
 
 class LaplaceTests(unittest.TestCase):
     def setUp(self):
-        self.N = 50
+        self.N = 5
         self.D = 3
         self.X = np.random.rand(self.N, self.D)*10
 
@@ -101,6 +101,25 @@ class LaplaceTests(unittest.TestCase):
                                -np.log(self.gauss._mass(self.f.copy(), self.Y.copy())),
                                self.gauss._nlog_mass(self.f.copy(), self.Y.copy()))
 
+    def test_mass_dnlog_mass_dgp_ndlik_df(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        np.testing.assert_almost_equal(
+                               self.gauss._dnlog_mass_dgp(gp=self.f.copy(), obs=self.Y.copy()),
+                               -self.gauss.dlik_df(y=self.Y.copy(), f=self.f.copy()))
+
+    def test_mass_d2nlog_mass_dgp2_nd2lik_d2f(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        np.testing.assert_almost_equal(
+                               self.gauss._d2nlog_mass_dgp2(gp=self.f.copy(), obs=self.Y.copy()),
+                               -self.gauss.d2lik_d2f(y=self.Y.copy(), f=self.f.copy()))
+
+    def test_mass_d2nlog_mass_dgp3_nd2lik_d3f(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        np.testing.assert_almost_equal(
+                               self.gauss._d3nlog_mass_dgp3(gp=self.f.copy(), obs=self.Y.copy()),
+                               -self.gauss.d3lik_d3f(y=self.Y.copy(), f=self.f.copy()))
+
+
     def test_gaussian_dnlog_mass_dgp(self):
         print "\n{}".format(inspect.stack()[0][3])
         link = functools.partial(self.gauss._nlog_mass, obs=self.Y)
@@ -119,24 +138,38 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-
-    def test_gaussian_dlik_df(self):
+    def test_gaussian_d3nlog_mass_d3gp(self):
         print "\n{}".format(inspect.stack()[0][3])
-        link = functools.partial(self.gauss.lik_function, self.Y)
-        dlik_df = functools.partial(self.gauss.dlik_df, self.Y)
-        grad = GradientChecker(link, dlik_df, self.f.copy(), 'f')
+        link = functools.partial(self.gauss._d2nlog_mass_dgp2, obs=self.Y)
+        dlik_df = functools.partial(self.gauss._d3nlog_mass_dgp3, obs=self.Y)
+        grad = GradientChecker(link, dlik_df, self.f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    def test_gaussian_d2lik_d2f(self):
+    def test_gaussian_dnlog_mass_dvar(self):
         print "\n{}".format(inspect.stack()[0][3])
-        dlik_df = functools.partial(self.gauss.dlik_df, self.Y)
-        d2lik_d2f = functools.partial(self.gauss.d2lik_d2f, self.Y)
-        grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
+        self.assertTrue(
+                dparam_checkgrad(self.gauss._nlog_mass, self.gauss._dnlog_mass_dvar,
+                    [self.var], args=(self.Y, self.f), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
+
+    def test_gaussian_dnlog_mass_dgp_dvar(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.assertTrue(
+                dparam_checkgrad(self.gauss._dnlog_mass_dgp, self.gauss._dnlog_mass_dgp_dvar,
+                    [self.var], args=(self.Y, self.f), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
+
+    def test_gaussian_d2nlog_mass_d2gp_dvar(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.assertTrue(
+                dparam_checkgrad(self.gauss._d2nlog_mass_dgp2, self.gauss._d2nlog_mass_dgp2_dvar,
+                    [self.var], args=(self.Y, self.f), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
 
     """ Gradchecker fault """
     @unittest.expectedFailure
@@ -154,8 +187,8 @@ class LaplaceTests(unittest.TestCase):
         self.f = np.random.rand(self.N, 1)
         self.gauss = GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N)
 
-        dlik_df = functools.partial(self.gauss.dlik_df, self.Y)
-        d2lik_d2f = functools.partial(self.gauss.d2lik_d2f, self.Y)
+        dlik_df = functools.partial(self.gauss._dnlog_mass_dgp, obs=self.Y)
+        d2lik_d2f = functools.partial(self.gauss._d2nlog_mass_dgp2, obs=self.Y)
         grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)

From 39eb0368d8880b9a0afe058bbbacee981c4af8a9 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 8 Oct 2013 12:30:14 +0100
Subject: [PATCH 100/252] changes Nparts for num_parts in kern

---
 GPy/kern/kern.py            | 12 ++++++------
 GPy/testing/kernel_tests.py | 12 ++++++++++--
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 5a8882dd..d6611a51 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -31,7 +31,7 @@ class kern(Parameterized):
 
         """
         self.parts = parts
-        self.Nparts = len(parts)
+        self.num_parts = len(parts)
         self.num_params = sum([p.num_params for p in self.parts])
 
         self.input_dim = input_dim
@@ -61,7 +61,7 @@ class kern(Parameterized):
         here just all the indices, rest can get recomputed
         """
         return Parameterized.getstate(self) + [self.parts,
-                self.Nparts,
+                self.num_parts,
                 self.num_params,
                 self.input_dim,
                 self.input_slices,
@@ -73,7 +73,7 @@ class kern(Parameterized):
         self.input_slices = state.pop()
         self.input_dim = state.pop()
         self.num_params = state.pop()
-        self.Nparts = state.pop()
+        self.num_parts = state.pop()
         self.parts = state.pop()
         Parameterized.setstate(self, state)
 
@@ -308,7 +308,7 @@ class kern(Parameterized):
 
     def K(self, X, X2=None, which_parts='all'):
         if which_parts == 'all':
-            which_parts = [True] * self.Nparts
+            which_parts = [True] * self.num_parts
         assert X.shape[1] == self.input_dim
         if X2 is None:
             target = np.zeros((X.shape[0], X.shape[0]))
@@ -359,7 +359,7 @@ class kern(Parameterized):
     def Kdiag(self, X, which_parts='all'):
         """Compute the diagonal of the covariance function for inputs X."""
         if which_parts == 'all':
-            which_parts = [True] * self.Nparts
+            which_parts = [True] * self.num_parts
         assert X.shape[1] == self.input_dim
         target = np.zeros(X.shape[0])
         [p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self.parts, self.input_slices, which_parts) if part_on]
@@ -497,7 +497,7 @@ class kern(Parameterized):
 
     def plot(self, x=None, plot_limits=None, which_parts='all', resolution=None, *args, **kwargs):
         if which_parts == 'all':
-            which_parts = [True] * self.Nparts
+            which_parts = [True] * self.num_parts
         if self.input_dim == 1:
             if x is None:
                 x = np.zeros((1, 1))
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index 87d4a20e..71daf0e8 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -7,6 +7,13 @@ import GPy
 
 verbose = False
 
+try:
+    import sympy
+    SYMPY_AVAILABLE=True
+except ImportError:
+    SYMPY_AVAILABLE=False
+
+
 class KernelTests(unittest.TestCase):
     def test_kerneltie(self):
         K = GPy.kern.rbf(5, ARD=True)
@@ -22,8 +29,9 @@ class KernelTests(unittest.TestCase):
         self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
 
     def test_rbf_sympykernel(self):
-        kern = GPy.kern.rbf_sympy(5)
-        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
+        if SYMPY_AVAILABLE:
+            kern = GPy.kern.rbf_sympy(5)
+            self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
 
     def test_rbf_invkernel(self):
         kern = GPy.kern.rbf_inv(5)

From a59d980327c5c583264b168b0ff7c7290cae790c Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 8 Oct 2013 14:49:18 +0100
Subject: [PATCH 101/252] Nparam changes to num_params

---
 GPy/core/fitc.py                        | 2 +-
 GPy/core/sparse_gp.py                   | 2 +-
 GPy/kern/parts/periodic_Matern32.py     | 2 +-
 GPy/kern/parts/periodic_Matern52.py     | 2 +-
 GPy/kern/parts/periodic_exponential.py  | 2 +-
 GPy/likelihoods/ep.py                   | 2 +-
 GPy/likelihoods/ep_mixed_noise.py       | 2 +-
 GPy/likelihoods/gaussian.py             | 2 +-
 GPy/likelihoods/gaussian_mixed_noise.py | 8 ++++----
 GPy/models/mrd.py                       | 4 ++--
 10 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/GPy/core/fitc.py b/GPy/core/fitc.py
index c9cf6eb2..0d294d07 100644
--- a/GPy/core/fitc.py
+++ b/GPy/core/fitc.py
@@ -126,7 +126,7 @@ class FITC(SparseGP):
             self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T,self.Z,self.X[i:i+1,:])
 
         # the partial derivative vector for the likelihood
-        if self.likelihood.Nparams == 0:
+        if self.likelihood.num_params == 0:
             # save computation here.
             self.partial_for_likelihood = None
         elif self.likelihood.is_heteroscedastic:
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index d4b33ed2..9251fcd6 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -156,7 +156,7 @@ class SparseGP(GPBase):
 
 
         # the partial derivative vector for the likelihood
-        if self.likelihood.Nparams == 0:
+        if self.likelihood.num_params == 0:
             # save computation here.
             self.partial_for_likelihood = None
         elif self.likelihood.is_heteroscedastic:
diff --git a/GPy/kern/parts/periodic_Matern32.py b/GPy/kern/parts/periodic_Matern32.py
index 5693085d..0de57f82 100644
--- a/GPy/kern/parts/periodic_Matern32.py
+++ b/GPy/kern/parts/periodic_Matern32.py
@@ -113,7 +113,7 @@ class PeriodicMatern32(Kernpart):
 
     @silence_errors
     def dK_dtheta(self,dL_dK,X,X2,target):
-        """derivative of the covariance matrix with respect to the parameters (shape is Nxnum_inducingxNparam)"""
+        """derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)"""
         if X2 is None: X2 = X
         FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
         FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
diff --git a/GPy/kern/parts/periodic_Matern52.py b/GPy/kern/parts/periodic_Matern52.py
index 7b5ae846..882084fd 100644
--- a/GPy/kern/parts/periodic_Matern52.py
+++ b/GPy/kern/parts/periodic_Matern52.py
@@ -115,7 +115,7 @@ class PeriodicMatern52(Kernpart):
 
     @silence_errors
     def dK_dtheta(self,dL_dK,X,X2,target):
-        """derivative of the covariance matrix with respect to the parameters (shape is Nxnum_inducingxNparam)"""
+        """derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)"""
         if X2 is None: X2 = X
         FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
         FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
diff --git a/GPy/kern/parts/periodic_exponential.py b/GPy/kern/parts/periodic_exponential.py
index 36b7b9ac..201def6d 100644
--- a/GPy/kern/parts/periodic_exponential.py
+++ b/GPy/kern/parts/periodic_exponential.py
@@ -111,7 +111,7 @@ class PeriodicExponential(Kernpart):
 
     @silence_errors
     def dK_dtheta(self,dL_dK,X,X2,target):
-        """derivative of the covariance matrix with respect to the parameters (shape is Nxnum_inducingxNparam)"""
+        """derivative of the covariance matrix with respect to the parameters (shape is N x num_inducing x num_params)"""
         if X2 is None: X2 = X
         FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
         FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
diff --git a/GPy/likelihoods/ep.py b/GPy/likelihoods/ep.py
index d242e583..4fedd66b 100644
--- a/GPy/likelihoods/ep.py
+++ b/GPy/likelihoods/ep.py
@@ -18,7 +18,7 @@ class EP(likelihood):
         self.data = data
         self.num_data, self.output_dim = self.data.shape
         self.is_heteroscedastic = True
-        self.Nparams = 0
+        self.num_params = 0
         self._transf_data = self.noise_model._preprocess_values(data)
 
         #Initial values - Likelihood approximation parameters:
diff --git a/GPy/likelihoods/ep_mixed_noise.py b/GPy/likelihoods/ep_mixed_noise.py
index ffc8cb51..f5452512 100644
--- a/GPy/likelihoods/ep_mixed_noise.py
+++ b/GPy/likelihoods/ep_mixed_noise.py
@@ -31,7 +31,7 @@ class EP_Mixed_Noise(likelihood):
         self.data = np.vstack(data_list)
         self.N, self.output_dim = self.data.shape
         self.is_heteroscedastic = True
-        self.Nparams = 0#FIXME
+        self.num_params = 0#FIXME
         self._transf_data = np.vstack([noise_model._preprocess_values(data) for noise_model,data in zip(noise_model_list,data_list)])
         #TODO non-gaussian index
 
diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index 8f66d074..da13ddb0 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -15,7 +15,7 @@ class Gaussian(likelihood):
     """
     def __init__(self, data, variance=1., normalize=False):
         self.is_heteroscedastic = False
-        self.Nparams = 1
+        self.num_params = 1
         self.Z = 0. # a correction factor which accounts for the approximation made
         N, self.output_dim = data.shape
 
diff --git a/GPy/likelihoods/gaussian_mixed_noise.py b/GPy/likelihoods/gaussian_mixed_noise.py
index 4df01ec2..696867c0 100644
--- a/GPy/likelihoods/gaussian_mixed_noise.py
+++ b/GPy/likelihoods/gaussian_mixed_noise.py
@@ -23,14 +23,14 @@ class Gaussian_Mixed_Noise(likelihood):
     :type normalize: False|True
     """
     def __init__(self, data_list, noise_params=None, normalize=True):
-        self.Nparams = len(data_list)
+        self.num_params = len(data_list)
         self.n_list = [data.size for data in data_list]
-        self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.Nparams),self.n_list)])
+        self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.num_params),self.n_list)])
 
         if noise_params is None:
-            noise_params = [1.] * self.Nparams
+            noise_params = [1.] * self.num_params
         else:
-            assert self.Nparams == len(noise_params), 'Number of noise parameters does not match the number of noise models.'
+            assert self.num_params == len(noise_params), 'Number of noise parameters does not match the number of noise models.'
 
         self.noise_model_list = [Gaussian(Y,variance=v,normalize = normalize) for Y,v in zip(data_list,noise_params)]
         self.n_params = [noise_model._get_params().size for noise_model in self.noise_model_list]
diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py
index be191e9b..1435028f 100644
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@@ -211,8 +211,8 @@ class MRD(Model):
 #         g.Z = Z.reshape(self.num_inducing, self.input_dim)
 #
 #     def _set_kern_params(self, g, p):
-#         g.kern._set_params(p[:g.kern.Nparam])
-#         g.likelihood._set_params(p[g.kern.Nparam:])
+#         g.kern._set_params(p[:g.kern.num_params])
+#         g.likelihood._set_params(p[g.kern.num_params:])
 
     def _set_params(self, x):
         start = 0; end = self.NQ

From 1a46026015f8f4d72ab2c9519f7a960bd74c2c2c Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Wed, 9 Oct 2013 11:14:42 +0100
Subject: [PATCH 102/252] Fixed stick datasets bug ... but sympykern is
 currently in a rewrite so will be broken

---
 GPy/kern/constructors.py    |  23 +++++-
 GPy/kern/kern.py            |   5 ++
 GPy/kern/parts/kernpart.py  |   7 +-
 GPy/kern/parts/sympykern.py | 138 ++++++++++++++++++++----------------
 GPy/testing/kernel_tests.py |   8 +++
 GPy/util/datasets.py        |   4 +-
 6 files changed, 120 insertions(+), 65 deletions(-)

diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index e6952186..a1252052 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -302,8 +302,8 @@ if sympy_available:
         Z = sp.symbols('z_:' + str(input_dim))
         variance = sp.var('variance',positive=True)
         if ARD:
-            lengthscales = [sp.var('lengthscale_%i' % i, positive=True) for i in range(input_dim)]
-            dist_string = ' + '.join(['(x_%i-z_%i)**2/lengthscale_%i**2' % (i, i, i) for i in range(input_dim)])
+            lengthscales = sp.symbols('lengthscale_:' + str(input_dim))
+            dist_string = ' + '.join(['(x_%i-z_%i)**2/lengthscale%i**2' % (i, i, i) for i in range(input_dim)])
             dist = parse_expr(dist_string)
             f =  variance*sp.exp(-dist/2.)
         else:
@@ -313,6 +313,25 @@ if sympy_available:
             f =  variance*sp.exp(-dist/(2*lengthscale**2))
         return kern(input_dim, [spkern(input_dim, f, name='rbf_sympy')])
 
+    def eq_sympy(input_dim, output_dim, ARD=False, variance=1., lengthscale=1.):
+        """
+        Exponentiated quadratic with multiple outputs.
+        """
+        X = sp.symbols('x_:' + str(input_dim))
+        Z = sp.symbols('z_:' + str(input_dim))
+        variance = sp.var('variance',positive=True)
+        if ARD:
+            lengthscales = [sp.var('lengthscale%i_i lengthscale%i_j' % i, positive=True) for i in range(input_dim)]
+            dist_string = ' + '.join(['(x_%i-z_%i)**2/(lengthscale%i_i*lengthscale%i_j)' % (i, i, i) for i in range(input_dim)])
+            dist = parse_expr(dist_string)
+            f =  variance*sp.exp(-dist/2.)
+        else:
+            lengthscale = sp.var('lengthscale_i lengthscale_j',positive=True)
+            dist_string = ' + '.join(['(x_%i-z_%i)**2' % (i, i) for i in range(input_dim)])
+            dist = parse_expr(dist_string)
+            f =  variance*sp.exp(-dist/(2*lengthscale**2))
+        return kern(input_dim, [spkern(input_dim, f, name='eq_sympy')])
+
     def sinc(input_dim, ARD=False, variance=1., lengthscale=1.):
         """
         TODO: Not clear why this isn't working, suggests argument of sinc is not a number.
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 5a8882dd..97084aa9 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -672,8 +672,13 @@ def kern_test(kern, X=None, X2=None, verbose=False):
     pass_checks = True
     if X==None:
         X = np.random.randn(10, kern.input_dim)
+        for ind in kern.output_indicator:
+            X[:, ind] = np.random.randint(kern.output_dim, X.shape[0])
     if X2==None:
         X2 = np.random.randn(20, kern.input_dim)
+        for ind in kern.output_indicator:
+            X2[:, ind] = np.random.randint(kern.output_dim, X2.shape[0])
+
     if verbose:
         print("Checking covariance function is positive definite.")
     result = Kern_check_model(kern, X=X).is_positive_definite()
diff --git a/GPy/kern/parts/kernpart.py b/GPy/kern/parts/kernpart.py
index 475d835f..95deeb81 100644
--- a/GPy/kern/parts/kernpart.py
+++ b/GPy/kern/parts/kernpart.py
@@ -5,15 +5,20 @@
 class Kernpart(object):
     def __init__(self,input_dim):
         """
-        The base class for a kernpart: a positive definite function which forms part of a kernel
+        The base class for a kernpart: a positive definite function which forms part of a covariance function (kernel).
 
         :param input_dim: the number of input dimensions to the function
         :type input_dim: int
 
         Do not instantiate.
         """
+        # stores indices of any inputs that are for indicating outputs
+        self.output_indicator = []
+        # the input dimensionality for the covariance
         self.input_dim = input_dim
+        # the number of optimisable parameters
         self.num_params = 1
+        # the name of the covariance function.
         self.name = 'unnamed'
 
     def _get_params(self):
diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py
index dc6a5390..a9f73436 100644
--- a/GPy/kern/parts/sympykern.py
+++ b/GPy/kern/parts/sympykern.py
@@ -27,7 +27,7 @@ class spkern(Kernpart):
      - to handle multiple inputs, call them x_1, z_1, etc
      - to handle multpile correlated outputs, you'll need to add parameters with an index, such as lengthscale_i and lengthscale_j.
     """
-    def __init__(self,input_dim, k=None, output_dim=1, name=None, param=None):
+    def __init__(self, input_dim, k=None, output_dim=1, name=None, param=None):
         if name is None:
             self.name='sympykern'
         else:
@@ -44,7 +44,9 @@ class spkern(Kernpart):
         assert len(self._sp_x)==len(self._sp_z)
         self.input_dim = len(self._sp_x)
         if output_dim > 1:
+            self.output_indicator=[self.input_dim]
             self.input_dim += 1
+            
         assert self.input_dim == input_dim
         self.output_dim = output_dim
         # extract parameter names
@@ -63,26 +65,28 @@ class spkern(Kernpart):
             self._sp_theta = [theta for theta in thetas if theta not in self._sp_theta_i and theta not in self._sp_theta_j]
             
             self.num_split_params = len(self._sp_theta_i)
-            self._split_param_names = ["%s"%theta.name[:-2] for theta in self._sp_theta_i]
-            for params in self._split_param_names:
-                setattr(self, params, np.ones(self.output_dim))
+            self._split_theta_names = ["%s"%theta.name[:-2] for theta in self._sp_theta_i]
+            for theta in self._split_theta_names:
+                setattr(self, theta, np.ones(self.output_dim))
             
             self.num_shared_params = len(self._sp_theta)
             self.num_params = self.num_shared_params+self.num_split_params*self.output_dim
             
         else:
             self.num_split_params = 0
-            self._split_param_names = []
+            self._split_theta_names = []
             self._sp_theta = thetas
             self.num_shared_params = len(self._sp_theta)
             self.num_params = self.num_shared_params
-
-        #deal with param
-        if param is None:
-            param = np.ones(self.num_params)
-            
-        assert param.size==self.num_params
-        self._set_params(param)
+        
+        for theta in self._sp_theta:
+            val = 1.0
+            if param is not None:
+                if param.has_key(theta):
+                    val = param[theta]
+            setattr(self, theta, val)
+        #deal with param            
+        self._set_params(self._get_params())
 
         #Differentiate!
         self._sp_dk_dtheta = [sp.diff(k,theta).simplify() for theta in self._sp_theta]
@@ -90,53 +94,29 @@ class spkern(Kernpart):
             self._sp_dk_dtheta_i = [sp.diff(k,theta).simplify() for theta in self._sp_theta_i]
             
         self._sp_dk_dx = [sp.diff(k,xi).simplify() for xi in self._sp_x]
-        #self._sp_dk_dz = [sp.diff(k,zi) for zi in self._sp_z]
 
-        #self.compute_psi_stats()
+        if False:
+            self.compute_psi_stats()
+
         self._gen_code()
 
-        self.weave_kwargs = {\
-            'support_code':self._function_code,\
-            'include_dirs':[tempfile.gettempdir(), os.path.join(current_dir,'parts/')],\
-            'headers':['"sympy_helpers.h"'],\
-            'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp")],\
-            #'extra_compile_args':['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5'],\
-            'extra_compile_args':[],\
-            'extra_link_args':['-lgomp'],\
+        if False:
+            extra_compile_args = ['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5']
+        else:
+            extra_compile_args = []
+            
+        self.weave_kwargs = {
+            'support_code':self._function_code,
+            'include_dirs':[tempfile.gettempdir(), os.path.join(current_dir,'parts/')],
+            'headers':['"sympy_helpers.h"'],
+            'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp")],
+            'extra_compile_args':extra_compile_args,
+            'extra_link_args':['-lgomp'],
             'verbose':True}
 
     def __add__(self,other):
         return spkern(self._sp_k+other._sp_k)
 
-    def compute_psi_stats(self):
-        #define some normal distributions
-        mus = [sp.var('mu_%i'%i,real=True) for i in range(self.input_dim)]
-        Ss = [sp.var('S_%i'%i,positive=True) for i in range(self.input_dim)]
-        normals = [(2*sp.pi*Si)**(-0.5)*sp.exp(-0.5*(xi-mui)**2/Si) for xi, mui, Si in zip(self._sp_x, mus, Ss)]
-
-        #do some integration!
-        #self._sp_psi0 = ??
-        self._sp_psi1 = self._sp_k
-        for i in range(self.input_dim):
-            print 'perfoming integrals %i of %i'%(i+1,2*self.input_dim)
-            sys.stdout.flush()
-            self._sp_psi1 *= normals[i]
-            self._sp_psi1 = sp.integrate(self._sp_psi1,(self._sp_x[i],-sp.oo,sp.oo))
-            clear_cache()
-        self._sp_psi1 = self._sp_psi1.simplify()
-
-        #and here's psi2 (eek!)
-        zprime = [sp.Symbol('zp%i'%i) for i in range(self.input_dim)]
-        self._sp_psi2 = self._sp_k.copy()*self._sp_k.copy().subs(zip(self._sp_z,zprime))
-        for i in range(self.input_dim):
-            print 'perfoming integrals %i of %i'%(self.input_dim+i+1,2*self.input_dim)
-            sys.stdout.flush()
-            self._sp_psi2 *= normals[i]
-            self._sp_psi2 = sp.integrate(self._sp_psi2,(self._sp_x[i],-sp.oo,sp.oo))
-            clear_cache()
-        self._sp_psi2 = self._sp_psi2.simplify()
-
-
     def _gen_code(self):
         #generate c functions from sympy objects        
         argument_sequence = self._sp_x+self._sp_z+self._sp_theta
@@ -201,8 +181,10 @@ class spkern(Kernpart):
         
         # Code to compute diagonal of covariance.
         diag_arg_string = re.sub('Z','X',arg_string)
+        diag_arg_string = re.sub('int jj','//int jj',diag_arg_string)
         diag_arg_string = re.sub('j','i',diag_arg_string)
-        diag_precompute_string = re.sub('Z','X',precompute_string)
+        diag_precompute_string = re.sub('int jj','//int jj',precompute_string)
+        diag_precompute_string = re.sub('Z','X',diag_precompute_string)
         diag_precompute_string = re.sub('j','i',diag_precompute_string)
         # Code to do the looping for Kdiag
         self._Kdiag_code =\
@@ -245,6 +227,7 @@ class spkern(Kernpart):
 
         # Code to compute gradients for Kdiag TODO: needs clean up
         diag_func_string = re.sub('Z','X',func_string,count=0)
+        diag_func_string = re.sub('int jj','//int jj',diag_func_string)
         diag_func_string = re.sub('j','i',diag_func_string)
         diag_func_string = re.sub('partial\[i\*num_inducing\+i\]','partial[i]',diag_func_string)
         self._dKdiag_dtheta_code =\
@@ -279,6 +262,7 @@ class spkern(Kernpart):
   
 
         diag_gradient_funcs = re.sub('Z','X',gradient_funcs,count=0)
+        diag_gradient_funcs = re.sub('int jj','//int jj',diag_gradient_funcs)
         diag_gradient_funcs = re.sub('j','i',diag_gradient_funcs)
         diag_gradient_funcs = re.sub('partial\[i\*num_inducing\+i\]','2*partial[i]',diag_gradient_funcs)
 
@@ -312,7 +296,7 @@ class spkern(Kernpart):
         if partial is not None:
             arg_names += ['partial']
         if self.output_dim>1:
-            arg_names += self._split_param_names
+            arg_names += self._split_theta_names
             arg_names += ['output_dim']
         return arg_names
         
@@ -320,7 +304,7 @@ class spkern(Kernpart):
         param, output_dim = self._shared_params, self.output_dim
 
         # Need to extract parameters first
-        for split_params in self._split_param_names:
+        for split_params in self._split_theta_names:
             locals()[split_params] = getattr(self, split_params)
         arg_names = self._get_arg_names(Z, partial)        
         weave.inline(code=code, arg_names=arg_names,**self.weave_kwargs)
@@ -353,21 +337,55 @@ class spkern(Kernpart):
     def dKdiag_dX(self,partial,X,target):
         self._weave.inline(self._dKdiag_dX_code, X, target, Z, partial)
 
-    def _set_params(self,param):
-        #print param.flags['C_CONTIGUOUS']
+    def compute_psi_stats(self):
+        #define some normal distributions
+        mus = [sp.var('mu_%i'%i,real=True) for i in range(self.input_dim)]
+        Ss = [sp.var('S_%i'%i,positive=True) for i in range(self.input_dim)]
+        normals = [(2*sp.pi*Si)**(-0.5)*sp.exp(-0.5*(xi-mui)**2/Si) for xi, mui, Si in zip(self._sp_x, mus, Ss)]
+
+        #do some integration!
+        #self._sp_psi0 = ??
+        self._sp_psi1 = self._sp_k
+        for i in range(self.input_dim):
+            print 'perfoming integrals %i of %i'%(i+1,2*self.input_dim)
+            sys.stdout.flush()
+            self._sp_psi1 *= normals[i]
+            self._sp_psi1 = sp.integrate(self._sp_psi1,(self._sp_x[i],-sp.oo,sp.oo))
+            clear_cache()
+        self._sp_psi1 = self._sp_psi1.simplify()
+
+        #and here's psi2 (eek!)
+        zprime = [sp.Symbol('zp%i'%i) for i in range(self.input_dim)]
+        self._sp_psi2 = self._sp_k.copy()*self._sp_k.copy().subs(zip(self._sp_z,zprime))
+        for i in range(self.input_dim):
+            print 'perfoming integrals %i of %i'%(self.input_dim+i+1,2*self.input_dim)
+            sys.stdout.flush()
+            self._sp_psi2 *= normals[i]
+            self._sp_psi2 = sp.integrate(self._sp_psi2,(self._sp_x[i],-sp.oo,sp.oo))
+            clear_cache()
+        self._sp_psi2 = self._sp_psi2.simplify()
+
+
+    def _set_params(self,param):        
         assert param.size == (self.num_params)
-        self._shared_params = param[0:self.num_shared_params]
+        for i, shared_params in enumerate(self._sp_theta):
+            start = i
+            end = i+1
+            setattr(self, shared_params, param[start:end])
+            
         if self.output_dim>1:
-            for i, split_params in enumerate(self._split_param_names):
+            for i, split_params in enumerate(self._split_theta_names):
                 start = self.num_shared_params + i*self.output_dim
                 end = self.num_shared_params + (i+1)*self.output_dim
                 setattr(self, split_params, param[start:end])
 
 
     def _get_params(self):
-        params = self._shared_params
+        params = np.zeros(0)
+        for shared_params in self._sp_theta:
+            params = np.hstack((params, getattr(self, shared_params)))
         if self.output_dim>1:
-            for split_params in self._split_param_names:
+            for split_params in self._split_theta_names:
                 params = np.hstack((params, getattr(self, split_params).flatten()))
         return params
 
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index 87d4a20e..e0a87169 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -25,6 +25,14 @@ class KernelTests(unittest.TestCase):
         kern = GPy.kern.rbf_sympy(5)
         self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
 
+    def test_eq_sympykernel(self):
+        kern = GPy.kern.eq_sympy(5, 3)
+        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
+
+    def test_sinckernel(self):
+        kern = GPy.kern.sinc(5)
+        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
+
     def test_rbf_invkernel(self):
         kern = GPy.kern.rbf_inv(5)
         self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index 79bc3fc3..2ff168b3 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -491,11 +491,11 @@ def ripley_synth(data_set='ripley_prnn_data'):
 def osu_run1(data_set='osu_run1', sample_every=4):
     if not data_available(data_set):
         download_data(data_set)
-    zip = zipfile.ZipFile(os.path.join(data_path, data_set, 'sprintTXT.ZIP'), 'r')
+    zip = zipfile.ZipFile(os.path.join(data_path, data_set, 'run1TXT.ZIP'), 'r')
     path = os.path.join(data_path, data_set)
     for name in zip.namelist():
         zip.extract(name, path)
-    Y, connect = GPy.util.mocap.load_text_data('Aug210107', path)
+    Y, connect = GPy.util.mocap.load_text_data('Aug210106', path)
     Y = Y[0:-1:sample_every, :]
     return data_details_return({'Y': Y, 'connect' : connect}, data_set)
 

From de0a5d0e70643ddd4a2d2901c740041af81ca981 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Wed, 9 Oct 2013 12:07:39 +0100
Subject: [PATCH 103/252] Some fixes and changes to the sympykern.

---
 GPy/kern/constructors.py    | 17 ++++++++++-------
 GPy/kern/kern.py            | 10 +++++-----
 GPy/kern/parts/kernpart.py  |  2 --
 GPy/kern/parts/sympykern.py | 22 ++++++++++++----------
 GPy/testing/kernel_tests.py |  2 +-
 5 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index a1252052..62c29744 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -317,20 +317,23 @@ if sympy_available:
         """
         Exponentiated quadratic with multiple outputs.
         """
-        X = sp.symbols('x_:' + str(input_dim))
-        Z = sp.symbols('z_:' + str(input_dim))
+        real_input_dim = input_dim
+        if output_dim>1:
+            real_input_dim -= 1
+        X = sp.symbols('x_:' + str(real_input_dim))
+        Z = sp.symbols('z_:' + str(real_input_dim))
         variance = sp.var('variance',positive=True)
         if ARD:
-            lengthscales = [sp.var('lengthscale%i_i lengthscale%i_j' % i, positive=True) for i in range(input_dim)]
-            dist_string = ' + '.join(['(x_%i-z_%i)**2/(lengthscale%i_i*lengthscale%i_j)' % (i, i, i) for i in range(input_dim)])
+            lengthscales = [sp.var('lengthscale%i_i lengthscale%i_j' % i, positive=True) for i in range(real_input_dim)]
+            dist_string = ' + '.join(['(x_%i-z_%i)**2/(lengthscale%i_i*lengthscale%i_j)' % (i, i, i) for i in range(real_input_dim)])
             dist = parse_expr(dist_string)
             f =  variance*sp.exp(-dist/2.)
         else:
             lengthscale = sp.var('lengthscale_i lengthscale_j',positive=True)
-            dist_string = ' + '.join(['(x_%i-z_%i)**2' % (i, i) for i in range(input_dim)])
+            dist_string = ' + '.join(['(x_%i-z_%i)**2' % (i, i) for i in range(real_input_dim)])
             dist = parse_expr(dist_string)
-            f =  variance*sp.exp(-dist/(2*lengthscale**2))
-        return kern(input_dim, [spkern(input_dim, f, name='eq_sympy')])
+            f =  variance*sp.exp(-dist/(2*lengthscale_i*lengthscale_j))
+        return kern(input_dim, [spkern(input_dim, f, output_dim=output_dim, name='eq_sympy')])
 
     def sinc(input_dim, ARD=False, variance=1., lengthscale=1.):
         """
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index ff7dd1c1..08f36109 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -658,7 +658,7 @@ class Kern_check_dKdiag_dX(Kern_check_model):
     def _set_params(self, x):
         self.X=x.reshape(self.X.shape)
 
-def kern_test(kern, X=None, X2=None, verbose=False):
+def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False):
     """This function runs on kernels to check the correctness of their implementation. It checks that the covariance function is positive definite for a randomly generated data set.
 
     :param kern: the kernel to be tested.
@@ -672,12 +672,12 @@ def kern_test(kern, X=None, X2=None, verbose=False):
     pass_checks = True
     if X==None:
         X = np.random.randn(10, kern.input_dim)
-        for ind in kern.output_indicator:
-            X[:, ind] = np.random.randint(kern.output_dim, X.shape[0])
+        if output_ind is not None:
+            X[:, output_ind] = np.random.randint(kern.output_dim, X.shape[0])
     if X2==None:
         X2 = np.random.randn(20, kern.input_dim)
-        for ind in kern.output_indicator:
-            X2[:, ind] = np.random.randint(kern.output_dim, X2.shape[0])
+        if output_ind is not None:
+            X2[:, output_ind] = np.random.randint(kern.output_dim, X2.shape[0])
 
     if verbose:
         print("Checking covariance function is positive definite.")
diff --git a/GPy/kern/parts/kernpart.py b/GPy/kern/parts/kernpart.py
index 95deeb81..f6777083 100644
--- a/GPy/kern/parts/kernpart.py
+++ b/GPy/kern/parts/kernpart.py
@@ -12,8 +12,6 @@ class Kernpart(object):
 
         Do not instantiate.
         """
-        # stores indices of any inputs that are for indicating outputs
-        self.output_indicator = []
         # the input dimensionality for the covariance
         self.input_dim = input_dim
         # the number of optimisable parameters
diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py
index a9f73436..09ab9934 100644
--- a/GPy/kern/parts/sympykern.py
+++ b/GPy/kern/parts/sympykern.py
@@ -44,7 +44,6 @@ class spkern(Kernpart):
         assert len(self._sp_x)==len(self._sp_z)
         self.input_dim = len(self._sp_x)
         if output_dim > 1:
-            self.output_indicator=[self.input_dim]
             self.input_dim += 1
             
         assert self.input_dim == input_dim
@@ -84,7 +83,7 @@ class spkern(Kernpart):
             if param is not None:
                 if param.has_key(theta):
                     val = param[theta]
-            setattr(self, theta, val)
+            setattr(self, theta.name, val)
         #deal with param            
         self._set_params(self._get_params())
 
@@ -146,7 +145,7 @@ class spkern(Kernpart):
             reverse_arg_list = list(arg_list)
             reverse_arg_list.reverse()
 
-        param_arg_list = ["param[%i]"%i for i in range(self.num_shared_params)]
+        param_arg_list = [shared_params.name for shared_params in self._sp_theta]
         arg_list += param_arg_list
 
         precompute_list=[]
@@ -201,11 +200,12 @@ class spkern(Kernpart):
         """%(diag_precompute_string,diag_arg_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
 
         # Code to compute gradients
-        func_list = ([' '*16 + 'target[%i] += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in  enumerate(self._sp_theta)])
+        func_list = []
         if self.output_dim>1:
             func_list += [' '*16 + "int %s=(int)%s[%s*input_dim+output_dim];"%(index, var, index2) for index, var, index2 in zip(['ii', 'jj'], ['X', 'Z'], ['i', 'j'])]
             func_list += [' '*16 + 'target[%i+ii] += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, arg_string) for i, theta in enumerate(self._sp_theta_i)]
             func_list += [' '*16 + 'target[%i+jj] += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, reverse_arg_string) for i, theta in enumerate(self._sp_theta_i)]
+        func_list += ([' '*16 + 'target[%i] += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in  enumerate(self._sp_theta)])
         func_string = '\n'.join(func_list) 
 
         self._dK_dtheta_code =\
@@ -290,7 +290,9 @@ class spkern(Kernpart):
         #TODO: insert multiple functions here via string manipulation
         #TODO: similar functions for psi_stats
     def _get_arg_names(self, Z=None, partial=None):
-        arg_names = ['target','X','param']
+        arg_names = ['target','X']
+        for shared_params in self._sp_theta:
+            arg_names += [shared_params.name]
         if Z is not None:
             arg_names += ['Z']
         if partial is not None:
@@ -301,7 +303,9 @@ class spkern(Kernpart):
         return arg_names
         
     def _weave_inline(self, code, X, target, Z=None, partial=None):
-        param, output_dim = self._shared_params, self.output_dim
+        output_dim = self.output_dim
+        for shared_params in self._sp_theta:
+            locals()[shared_params.name] = getattr(self, shared_params.name)
 
         # Need to extract parameters first
         for split_params in self._split_theta_names:
@@ -369,9 +373,7 @@ class spkern(Kernpart):
     def _set_params(self,param):        
         assert param.size == (self.num_params)
         for i, shared_params in enumerate(self._sp_theta):
-            start = i
-            end = i+1
-            setattr(self, shared_params, param[start:end])
+            setattr(self, shared_params.name, param[i])
             
         if self.output_dim>1:
             for i, split_params in enumerate(self._split_theta_names):
@@ -383,7 +385,7 @@ class spkern(Kernpart):
     def _get_params(self):
         params = np.zeros(0)
         for shared_params in self._sp_theta:
-            params = np.hstack((params, getattr(self, shared_params)))
+            params = np.hstack((params, getattr(self, shared_params.name)))
         if self.output_dim>1:
             for split_params in self._split_theta_names:
                 params = np.hstack((params, getattr(self, split_params).flatten()))
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index 5c45ae20..f64dac2b 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -34,7 +34,7 @@ class KernelTests(unittest.TestCase):
             self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
 
     def test_eq_sympykernel(self):
-        kern = GPy.kern.eq_sympy(5, 3)
+        kern = GPy.kern.eq_sympy(5, 3, output_ind=4)
         self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
 
     def test_sinckernel(self):

From 1f37ec41514cd9746dde9ef95b04fc2510e62879 Mon Sep 17 00:00:00 2001
From: Ricardo <acq11ra@sheffield.ac.uk>
Date: Thu, 10 Oct 2013 18:00:11 +0100
Subject: [PATCH 104/252] Missing term in the likelihood.

---
 GPy/core/fitc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/core/fitc.py b/GPy/core/fitc.py
index c9cf6eb2..c5350271 100644
--- a/GPy/core/fitc.py
+++ b/GPy/core/fitc.py
@@ -159,7 +159,7 @@ class FITC(SparseGP):
         A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.beta_star)) - 0.5 * np.sum(self.V_star * self.likelihood.Y)
         C = -self.output_dim * (np.sum(np.log(np.diag(self.LB))))
         D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
-        return A + C + D
+        return A + C + D + self.likelihood.Z
 
     def _log_likelihood_gradients(self):
         pass

From 6945ad7aa14d498d8e6ba4d39029f4cc21a88d89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Fusi?= <nicolo.fusi@gmail.com>
Date: Fri, 11 Oct 2013 16:19:27 -0700
Subject: [PATCH 105/252] Seems to work on windows now

not everything works yet, but I've identified the main issues. Still
TODO: handle missing OMP libraries gracefully
---
 GPy/util/linalg.py |  4 +++-
 GPy/util/misc.py   | 20 +++++++++++---------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 4e7f7fff..213cd047 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -325,6 +325,7 @@ def symmetrify(A, upper=False):
     """
     N, M = A.shape
     assert N == M
+    
     c_contig_code = """
     int iN;
     for (int i=1; i<N; i++){
@@ -343,6 +344,8 @@ def symmetrify(A, upper=False):
       }
     }
     """
+
+    N = int(N) # for safe type casting
     if A.flags['C_CONTIGUOUS'] and upper:
         weave.inline(f_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
     elif A.flags['C_CONTIGUOUS'] and not upper:
@@ -403,4 +406,3 @@ def backsub_both_sides(L, X, transpose='left'):
     else:
         tmp, _ = lapack.dtrtrs(L, np.asfortranarray(X), lower=1, trans=0)
         return lapack.dtrtrs(L, np.asfortranarray(tmp.T), lower=1, trans=0)[0].T
-
diff --git a/GPy/util/misc.py b/GPy/util/misc.py
index 72edf99f..5866ecf9 100644
--- a/GPy/util/misc.py
+++ b/GPy/util/misc.py
@@ -61,7 +61,7 @@ def fast_array_equal(A, B):
     int i, j;
     return_val = 1;
 
-    #pragma omp parallel for private(i, j)
+    // #pragma omp parallel for private(i, j)
     for(i=0;i<N;i++){
        for(j=0;j<D;j++){
           if(A(i, j) != B(i, j)){
@@ -76,7 +76,7 @@ def fast_array_equal(A, B):
     int i, j, z;
     return_val = 1;
 
-    #pragma omp parallel for private(i, j, z)
+    // #pragma omp parallel for private(i, j, z)
     for(i=0;i<N;i++){
        for(j=0;j<D;j++){
          for(z=0;z<Q;z++){
@@ -90,7 +90,7 @@ def fast_array_equal(A, B):
     """
 
     support_code = """
-    #include <omp.h>
+    // #include <omp.h>
     #include <math.h>
     """
 
@@ -107,15 +107,17 @@ def fast_array_equal(A, B):
         return False
     elif A.shape == B.shape:
         if A.ndim == 2:
-            N, D = A.shape
-            value = weave.inline(code2, support_code=support_code, libraries=['gomp'],
+            N, D = [int(i) for i in A.shape]
+            value = weave.inline(code2, support_code=support_code,
                                  arg_names=['A', 'B', 'N', 'D'],
-                                 type_converters=weave.converters.blitz,**weave_options)
+                                 type_converters=weave.converters.blitz)
+            # libraries=['gomp'], **weave_options)
         elif A.ndim == 3:
-            N, D, Q = A.shape
-            value = weave.inline(code3, support_code=support_code, libraries=['gomp'],
+            N, D, Q = [int(i) for i in A.shape]
+            value = weave.inline(code3, support_code=support_code,
                                  arg_names=['A', 'B', 'N', 'D', 'Q'],
-                                 type_converters=weave.converters.blitz,**weave_options)
+                                 type_converters=weave.converters.blitz)
+            #libraries=['gomp'], **weave_options)
         else:
             value = np.array_equal(A,B)
 

From a92780cb89cfea5ff2fb57d97356b6889079e9cc Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Mon, 14 Oct 2013 05:59:15 +0100
Subject: [PATCH 106/252] Added olivetti faces data set. It required adding
 netpbmfile.py a bsd licensed pgm file reader from Christoph Gohlke, which
 doesn't seem to have a spearate installer. Also modified image_show to assume
 by default that array ordering is python instead of fortran. Modified
 brendan_faces demo to explicilty force fortran ordering. Notified Teo of
 change.

---
 GPy/examples/dimensionality_reduction.py |  31 ++-
 GPy/util/__init__.py                     |   2 +
 GPy/util/datasets.py                     |  87 ++++--
 GPy/util/netpbmfile.py                   | 331 +++++++++++++++++++++++
 GPy/util/visualize.py                    |  61 +++--
 5 files changed, 458 insertions(+), 54 deletions(-)
 create mode 100644 GPy/util/netpbmfile.py

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 005b131f..8aaeb4ae 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -327,31 +327,52 @@ def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
         m.plot_scales("MRD Scales")
     return m
 
+
+
 def brendan_faces():
     from GPy import kern
     data = GPy.util.datasets.brendan_faces()
     Q = 2
-    Y = data['Y'][0:-1:10, :]
-    # Y = data['Y']
+    Y = data['Y']
     Yn = Y - Y.mean()
     Yn /= Yn.std()
 
     m = GPy.models.GPLVM(Yn, Q)
-    # m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=100)
 
     # optimize
     m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped())
 
-    m.optimize('scg', messages=1, max_f_eval=10000)
+    m.optimize('scg', messages=1, max_iters=10)
 
     ax = m.plot_latent(which_indices=(0, 1))
     y = m.likelihood.Y[0, :]
-    data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, invert=False, scale=False)
+    data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
     lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
     raw_input('Press enter to finish')
 
     return m
+
+def olivetti_faces():
+    from GPy import kern
+    data = GPy.util.datasets.olivetti_faces()
+    Q = 2
+    Y = data['Y']
+    Yn = Y - Y.mean()
+    Yn /= Yn.std()
+
+    m = GPy.models.GPLVM(Yn, Q)
+    m.optimize('scg', messages=1, max_iters=1000)
+
+    ax = m.plot_latent(which_indices=(0, 1))
+    y = m.likelihood.Y[0, :]
+    data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
+    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+    raw_input('Press enter to finish')
+
+    return m
+
 def stick_play(range=None, frame_rate=15):
+
     data = GPy.util.datasets.osu_run1()
     # optimize
     if range == None:
diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index 99548268..db9b7362 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -14,3 +14,5 @@ import visualize
 import decorators
 import classification
 import latent_space_visualizations
+
+import netpbmfile
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index 2ff168b3..45ed694c 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -8,17 +8,12 @@ import zipfile
 import tarfile
 import datetime
 
-ipython_notebook = False
-if ipython_notebook:
-    import IPython.core.display
-    def ipynb_input(varname, prompt=''):
-        """Prompt user for input and assign string val to given variable name."""
-        js_code = ("""
-            var value = prompt("{prompt}","");
-            var py_code = "{varname} = '" + value + "'";
-            IPython.notebook.kernel.execute(py_code);
-        """).format(prompt=prompt, varname=varname)
-        return IPython.core.display.Javascript(js_code)
+ipython_available=True
+try:
+    import IPython
+except ImportError:
+    ipython_available=False
+
 
 import sys, urllib
 
@@ -34,8 +29,11 @@ data_path = os.path.join(os.path.dirname(__file__), 'datasets')
 default_seed = 10000
 overide_manual_authorize=False
 neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/'
+sam_url = 'http://www.cs.nyu.edu/~roweis/data/'
 cmu_url = 'http://mocap.cs.cmu.edu/subjects/'
-# Note: there may be a better way of storing data resources. One of the pythonistas will need to take a look.
+
+# Note: there may be a better way of storing data resources, for the
+# moment we are storing them in a dictionary.
 data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
                                        'files' : [['ankurDataPoseSilhouette.mat']],
                                        'license' : None,
@@ -49,7 +47,7 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
                                       'license' : None,
                                       'size' : 51276
                                       },
-                  'brendan_faces' : {'urls' : ['http://www.cs.nyu.edu/~roweis/data/'],
+                  'brendan_faces' : {'urls' : [sam_url],
                                      'files': [['frey_rawface.mat']],
                                      'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.',
                                      'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""",
@@ -93,6 +91,12 @@ The database was created with funding from NSF EIA-0196217.""",
                                             'details' : """Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.""",
                                             'license' : None,
                                             'size' : 21949154},
+                  'olivetti_faces' : {'urls' : [neil_url + 'olivetti_faces/', sam_url],
+                                      'files' : [['att_faces.zip'], ['olivettifaces.mat']],
+                                            'citation' : 'Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994',
+                                            'details' : """Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. """,
+                                            'license': None,
+                                            'size' : 8561331},
                   'olympic_marathon_men' : {'urls' : [neil_url + 'olympic_marathon_men/'],
                                             'files' : [['olympicMarathonTimes.csv']],
                                             'citation' : None,
@@ -144,23 +148,32 @@ The database was created with funding from NSF EIA-0196217.""",
                   }
 
 
-def prompt_user():
+def prompt_user(prompt):
     """Ask user for agreeing to data set licenses."""
     # raw_input returns the empty string for "enter"
     yes = set(['yes', 'y'])
     no = set(['no','n'])
-    choice = ''
-    if ipython_notebook:
-        ipynb_input(choice, prompt='provide your answer here')
-    else:
+
+    try:
+        print(prompt)
         choice = raw_input().lower()
+        # would like to test for exception here, but not sure if we can do that without importing IPython
+    except: 
+        print('Stdin is not implemented.')
+        print('You need to set')
+        print('overide_manual_authorize=True')
+        print('to proceed with the download. Please set that variable and continue.')
+        raise
+
+    
     if choice in yes:
         return True
     elif choice in no:
         return False
     else:
-        sys.stdout.write("Please respond with 'yes', 'y' or 'no', 'n'")
-        return prompt_user()
+        print("Your response was a " + choice)
+        print("Please respond with 'yes', 'y' or 'no', 'n'")
+        #return prompt_user()
 
 
 def data_available(dataset_name=None):
@@ -212,15 +225,14 @@ def authorize_download(dataset_name=None):
             print('You must also agree to the following license:')
             print(dr['license'])
             print('')
-        print('Do you wish to proceed with the download? [yes/no]')
-        return prompt_user()
+        return prompt_user('Do you wish to proceed with the download? [yes/no]')
 
 def download_data(dataset_name=None):
     """Check with the user that the are happy with terms and conditions for the data set, then download it."""
 
     dr = data_resources[dataset_name]
     if not authorize_download(dataset_name):
-        return False
+        raise Exception("Permission to download data set denied.")
 
     if dr.has_key('suffices'):
         for url, files, suffices in zip(dr['urls'], dr['files'], dr['suffices']):
@@ -489,12 +501,12 @@ def ripley_synth(data_set='ripley_prnn_data'):
     return data_details_return({'X': X, 'y': y, 'Xtest': Xtest, 'ytest': ytest, 'info': 'Synthetic data generated by Ripley for a two class classification problem.'}, data_set)
 
 def osu_run1(data_set='osu_run1', sample_every=4):
+    path = os.path.join(data_path, data_set)
     if not data_available(data_set):
         download_data(data_set)
-    zip = zipfile.ZipFile(os.path.join(data_path, data_set, 'run1TXT.ZIP'), 'r')
-    path = os.path.join(data_path, data_set)
-    for name in zip.namelist():
-        zip.extract(name, path)
+        zip = zipfile.ZipFile(os.path.join(data_path, data_set, 'run1TXT.ZIP'), 'r')
+        for name in zip.namelist():
+            zip.extract(name, path)
     Y, connect = GPy.util.mocap.load_text_data('Aug210106', path)
     Y = Y[0:-1:sample_every, :]
     return data_details_return({'Y': Y, 'connect' : connect}, data_set)
@@ -579,6 +591,24 @@ def toy_linear_1d_classification(seed=default_seed):
     X = (np.r_[x1, x2])[:, None]
     return {'X': X, 'Y':  sample_class(2.*X), 'F': 2.*X, 'seed' : seed}
 
+def olivetti_faces(data_set='olivetti_faces'):
+    path = os.path.join(data_path, data_set)
+    if not data_available(data_set):
+        download_data(data_set)
+        zip = zipfile.ZipFile(os.path.join(path, 'att_faces.zip'), 'r')
+        for name in zip.namelist():
+            zip.extract(name, path)
+    Y = []
+    lbls = []
+    for subject in range(40):
+        for image in range(10):
+            image_path = os.path.join(path, 'orl_faces', 's'+str(subject+1), str(image+1) + '.pgm')
+            Y.append(GPy.util.netpbmfile.imread(image_path).flatten())
+            lbls.append(subject)
+    Y = np.asarray(Y)
+    lbls = np.asarray(lbls)[:, None]
+    return data_details_return({'Y': Y, 'lbls' : lbls, 'info': "ORL Faces processed to 64x64 images."}, data_set)
+    
 def olympic_100m_men(data_set='rogers_girolami_data'):
     if not data_available(data_set):
         download_data(data_set)
@@ -586,7 +616,8 @@ def olympic_100m_men(data_set='rogers_girolami_data'):
         tar_file = os.path.join(path, 'firstcoursemldata.tar.gz')
         tar = tarfile.open(tar_file)
         print('Extracting file.')
-        tar.extractall(path=path)
+        tar.extractall(path=path)    
+ 
         tar.close()
     olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male100']
 
diff --git a/GPy/util/netpbmfile.py b/GPy/util/netpbmfile.py
new file mode 100644
index 00000000..030bd574
--- /dev/null
+++ b/GPy/util/netpbmfile.py
@@ -0,0 +1,331 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# netpbmfile.py
+
+# Copyright (c) 2011-2013, Christoph Gohlke
+# Copyright (c) 2011-2013, The Regents of the University of California
+# Produced at the Laboratory for Fluorescence Dynamics.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of the copyright holders nor the names of any
+#   contributors may be used to endorse or promote products derived
+#   from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+"""Read and write image data from respectively to Netpbm files.
+
+This implementation follows the Netpbm format specifications at
+http://netpbm.sourceforge.net/doc/. No gamma correction is performed.
+
+The following image formats are supported: PBM (bi-level), PGM (grayscale),
+PPM (color), PAM (arbitrary), XV thumbnail (RGB332, read-only).
+
+:Author:
+  `Christoph Gohlke <http://www.lfd.uci.edu/~gohlke/>`_
+
+:Organization:
+  Laboratory for Fluorescence Dynamics, University of California, Irvine
+
+:Version: 2013.01.18
+
+Requirements
+------------
+* `CPython 2.7, 3.2 or 3.3 <http://www.python.org>`_
+* `Numpy 1.7 <http://www.numpy.org>`_
+* `Matplotlib 1.2 <http://www.matplotlib.org>`_  (optional for plotting)
+
+Examples
+--------
+>>> im1 = numpy.array([[0, 1],[65534, 65535]], dtype=numpy.uint16)
+>>> imsave('_tmp.pgm', im1)
+>>> im2 = imread('_tmp.pgm')
+>>> assert numpy.all(im1 == im2)
+
+"""
+
+from __future__ import division, print_function
+
+import sys
+import re
+import math
+from copy import deepcopy
+
+import numpy
+
+__version__ = '2013.01.18'
+__docformat__ = 'restructuredtext en'
+__all__ = ['imread', 'imsave', 'NetpbmFile']
+
+
+def imread(filename, *args, **kwargs):
+    """Return image data from Netpbm file as numpy array.
+
+    `args` and `kwargs` are arguments to NetpbmFile.asarray().
+
+    Examples
+    --------
+    >>> image = imread('_tmp.pgm')
+
+    """
+    try:
+        netpbm = NetpbmFile(filename)
+        image = netpbm.asarray()
+    finally:
+        netpbm.close()
+    return image
+
+
+def imsave(filename, data, maxval=None, pam=False):
+    """Write image data to Netpbm file.
+
+    Examples
+    --------
+    >>> image = numpy.array([[0, 1],[65534, 65535]], dtype=numpy.uint16)
+    >>> imsave('_tmp.pgm', image)
+
+    """
+    try:
+        netpbm = NetpbmFile(data, maxval=maxval)
+        netpbm.write(filename, pam=pam)
+    finally:
+        netpbm.close()
+
+
+class NetpbmFile(object):
+    """Read and write Netpbm PAM, PBM, PGM, PPM, files."""
+
+    _types = {b'P1': b'BLACKANDWHITE', b'P2': b'GRAYSCALE', b'P3': b'RGB',
+              b'P4': b'BLACKANDWHITE', b'P5': b'GRAYSCALE', b'P6': b'RGB',
+              b'P7 332': b'RGB', b'P7': b'RGB_ALPHA'}
+
+    def __init__(self, arg=None, **kwargs):
+        """Initialize instance from filename, open file, or numpy array."""
+        for attr in ('header', 'magicnum', 'width', 'height', 'maxval',
+                     'depth', 'tupltypes', '_filename', '_fh', '_data'):
+            setattr(self, attr, None)
+        if arg is None:
+            self._fromdata([], **kwargs)
+        elif isinstance(arg, basestring):
+            self._fh = open(arg, 'rb')
+            self._filename = arg
+            self._fromfile(self._fh, **kwargs)
+        elif hasattr(arg, 'seek'):
+            self._fromfile(arg, **kwargs)
+            self._fh = arg
+        else:
+            self._fromdata(arg, **kwargs)
+
+    def asarray(self, copy=True, cache=False, **kwargs):
+        """Return image data from file as numpy array."""
+        data = self._data
+        if data is None:
+            data = self._read_data(self._fh, **kwargs)
+            if cache:
+                self._data = data
+            else:
+                return data
+        return deepcopy(data) if copy else data
+
+    def write(self, arg, **kwargs):
+        """Write instance to file."""
+        if hasattr(arg, 'seek'):
+            self._tofile(arg, **kwargs)
+        else:
+            with open(arg, 'wb') as fid:
+                self._tofile(fid, **kwargs)
+
+    def close(self):
+        """Close open file. Future asarray calls might fail."""
+        if self._filename and self._fh:
+            self._fh.close()
+            self._fh = None
+
+    def __del__(self):
+        self.close()
+
+    def _fromfile(self, fh):
+        """Initialize instance from open file."""
+        fh.seek(0)
+        data = fh.read(4096)
+        if (len(data) < 7) or not (b'0' < data[1:2] < b'8'):
+            raise ValueError("Not a Netpbm file:\n%s" % data[:32])
+        try:
+            self._read_pam_header(data)
+        except Exception:
+            try:
+                self._read_pnm_header(data)
+            except Exception:
+                raise ValueError("Not a Netpbm file:\n%s" % data[:32])
+
+    def _read_pam_header(self, data):
+        """Read PAM header and initialize instance."""
+        regroups = re.search(
+            b"(^P7[\n\r]+(?:(?:[\n\r]+)|(?:#.*)|"
+            b"(HEIGHT\s+\d+)|(WIDTH\s+\d+)|(DEPTH\s+\d+)|(MAXVAL\s+\d+)|"
+            b"(?:TUPLTYPE\s+\w+))*ENDHDR\n)", data).groups()
+        self.header = regroups[0]
+        self.magicnum = b'P7'
+        for group in regroups[1:]:
+            key, value = group.split()
+            setattr(self, unicode(key).lower(), int(value))
+        matches = re.findall(b"(TUPLTYPE\s+\w+)", self.header)
+        self.tupltypes = [s.split(None, 1)[1] for s in matches]
+
+    def _read_pnm_header(self, data):
+        """Read PNM header and initialize instance."""
+        bpm = data[1:2] in b"14"
+        regroups = re.search(b"".join((
+            b"(^(P[123456]|P7 332)\s+(?:#.*[\r\n])*",
+            b"\s*(\d+)\s+(?:#.*[\r\n])*",
+            b"\s*(\d+)\s+(?:#.*[\r\n])*" * (not bpm),
+            b"\s*(\d+)\s(?:\s*#.*[\r\n]\s)*)")), data).groups() + (1, ) * bpm
+        self.header = regroups[0]
+        self.magicnum = regroups[1]
+        self.width = int(regroups[2])
+        self.height = int(regroups[3])
+        self.maxval = int(regroups[4])
+        self.depth = 3 if self.magicnum in b"P3P6P7 332" else 1
+        self.tupltypes = [self._types[self.magicnum]]
+
+    def _read_data(self, fh, byteorder='>'):
+        """Return image data from open file as numpy array."""
+        fh.seek(len(self.header))
+        data = fh.read()
+        dtype = 'u1' if self.maxval < 256 else byteorder + 'u2'
+        depth = 1 if self.magicnum == b"P7 332" else self.depth
+        shape = [-1, self.height, self.width, depth]
+        size = numpy.prod(shape[1:])
+        if self.magicnum in b"P1P2P3":
+            data = numpy.array(data.split(None, size)[:size], dtype)
+            data = data.reshape(shape)
+        elif self.maxval == 1:
+            shape[2] = int(math.ceil(self.width / 8))
+            data = numpy.frombuffer(data, dtype).reshape(shape)
+            data = numpy.unpackbits(data, axis=-2)[:, :, :self.width, :]
+        else:
+            data = numpy.frombuffer(data, dtype)
+            data = data[:size * (data.size // size)].reshape(shape)
+        if data.shape[0] < 2:
+            data = data.reshape(data.shape[1:])
+        if data.shape[-1] < 2:
+            data = data.reshape(data.shape[:-1])
+        if self.magicnum == b"P7 332":
+            rgb332 = numpy.array(list(numpy.ndindex(8, 8, 4)), numpy.uint8)
+            rgb332 *= [36, 36, 85]
+            data = numpy.take(rgb332, data, axis=0)
+        return data
+
+    def _fromdata(self, data, maxval=None):
+        """Initialize instance from numpy array."""
+        data = numpy.array(data, ndmin=2, copy=True)
+        if data.dtype.kind not in "uib":
+            raise ValueError("not an integer type: %s" % data.dtype)
+        if data.dtype.kind == 'i' and numpy.min(data) < 0:
+            raise ValueError("data out of range: %i" % numpy.min(data))
+        if maxval is None:
+            maxval = numpy.max(data)
+            maxval = 255 if maxval < 256 else 65535
+        if maxval < 0 or maxval > 65535:
+            raise ValueError("data out of range: %i" % maxval)
+        data = data.astype('u1' if maxval < 256 else '>u2')
+        self._data = data
+        if data.ndim > 2 and data.shape[-1] in (3, 4):
+            self.depth = data.shape[-1]
+            self.width = data.shape[-2]
+            self.height = data.shape[-3]
+            self.magicnum = b'P7' if self.depth == 4 else b'P6'
+        else:
+            self.depth = 1
+            self.width = data.shape[-1]
+            self.height = data.shape[-2]
+            self.magicnum = b'P5' if maxval > 1 else b'P4'
+        self.maxval = maxval
+        self.tupltypes = [self._types[self.magicnum]]
+        self.header = self._header()
+
+    def _tofile(self, fh, pam=False):
+        """Write Netbm file."""
+        fh.seek(0)
+        fh.write(self._header(pam))
+        data = self.asarray(copy=False)
+        if self.maxval == 1:
+            data = numpy.packbits(data, axis=-1)
+        data.tofile(fh)
+
+    def _header(self, pam=False):
+        """Return file header as byte string."""
+        if pam or self.magicnum == b'P7':
+            header = "\n".join((
+                "P7",
+                "HEIGHT %i" % self.height,
+                "WIDTH %i" % self.width,
+                "DEPTH %i" % self.depth,
+                "MAXVAL %i" % self.maxval,
+                "\n".join("TUPLTYPE %s" % unicode(i) for i in self.tupltypes),
+                "ENDHDR\n"))
+        elif self.maxval == 1:
+            header = "P4 %i %i\n" % (self.width, self.height)
+        elif self.depth == 1:
+            header = "P5 %i %i %i\n" % (self.width, self.height, self.maxval)
+        else:
+            header = "P6 %i %i %i\n" % (self.width, self.height, self.maxval)
+        if sys.version_info[0] > 2:
+            header = bytes(header, 'ascii')
+        return header
+
+    def __str__(self):
+        """Return information about instance."""
+        return unicode(self.header)
+
+
+if sys.version_info[0] > 2:
+    basestring = str
+    unicode = lambda x: str(x, 'ascii')
+
+if __name__ == "__main__":
+    # Show images specified on command line or all images in current directory
+    from glob import glob
+    from matplotlib import pyplot
+    files = sys.argv[1:] if len(sys.argv) > 1 else glob('*.p*m')
+    for fname in files:
+        try:
+            pam = NetpbmFile(fname)
+            img = pam.asarray(copy=False)
+            if False:
+                pam.write('_tmp.pgm.out', pam=True)
+                img2 = imread('_tmp.pgm.out')
+                assert numpy.all(img == img2)
+                imsave('_tmp.pgm.out', img)
+                img2 = imread('_tmp.pgm.out')
+                assert numpy.all(img == img2)
+            pam.close()
+        except ValueError as e:
+            print(fname, e)
+            continue
+        _shape = img.shape
+        if img.ndim > 3 or (img.ndim > 2 and img.shape[-1] not in (3, 4)):
+            img = img[0]
+        cmap = 'gray' if pam.maxval > 1 else 'binary'
+        pyplot.imshow(img, cmap, interpolation='nearest')
+        pyplot.title("%s %s %s %s" % (fname, unicode(pam.magicnum),
+                                      _shape, img.dtype))
+        pyplot.show()
diff --git a/GPy/util/visualize.py b/GPy/util/visualize.py
index 7a519555..ecdf78ce 100644
--- a/GPy/util/visualize.py
+++ b/GPy/util/visualize.py
@@ -246,17 +246,36 @@ class lvm_dimselect(lvm):
 
 
 class image_show(matplotlib_show):
-    """Show a data vector as an image."""
-    def __init__(self, vals, axes=None, dimensions=(16,16), transpose=False, invert=False, scale=False, palette=[], presetMean = 0., presetSTD = -1., selectImage=0):
+    """Show a data vector as an image. This visualizer rehapes the output vector and displays it as an image.
+
+    :param vals: the values of the output to display.
+    :type vals: ndarray
+    :param axes: the axes to show the output on.
+    :type vals: axes handle
+    :param dimensions: the dimensions that the image needs to be transposed to for display.
+    :type dimensions: tuple
+    :param transpose: whether to transpose the image before display.
+    :type bool: default is False.
+    :param order: whether array is in Fortan ordering ('F') or Python ordering ('C'). Default is python ('C').
+    :type order: string
+    :param invert: whether to invert the pixels or not (default False).
+    :type invert: bool
+    :param palette: a palette to use for the image.
+    :param preset_mean: the preset mean of a scaled image.
+    :type preset_mean: double
+    :param preset_std: the preset standard deviation of a scaled image.
+    :type preset_std: double"""
+    def __init__(self, vals, axes=None, dimensions=(16,16), transpose=False, order='C', invert=False, scale=False, palette=[], preset_mean = 0., preset_std = -1., select_image=0):
         matplotlib_show.__init__(self, vals, axes)
         self.dimensions = dimensions
         self.transpose = transpose
+        self.order = order
         self.invert = invert
         self.scale = scale
         self.palette = palette
-        self.presetMean = presetMean
-        self.presetSTD = presetSTD
-        self.selectImage = selectImage # This is used when the y vector contains multiple images concatenated.
+        self.preset_mean = preset_mean
+        self.preset_std = preset_std
+        self.select_image = select_image # This is used when the y vector contains multiple images concatenated.
 
         self.set_image(self.vals)
         if not self.palette == []: # Can just show the image (self.set_image() took care of setting the palette)
@@ -272,22 +291,22 @@ class image_show(matplotlib_show):
 
     def set_image(self, vals):
         dim = self.dimensions[0] * self.dimensions[1]
-        nImg = np.sqrt(vals[0,].size/dim)
-        if nImg > 1 and nImg.is_integer(): # Show a mosaic of images
-            nImg = np.int(nImg)
-            self.vals = np.zeros((self.dimensions[0]*nImg, self.dimensions[1]*nImg))
-            for iR in range(nImg):
-                for iC in range(nImg):
-                    currImgId = iR*nImg + iC
-                    currImg = np.reshape(vals[0,dim*currImgId+np.array(range(dim))], self.dimensions, order='F')
-                    firstRow = iR*self.dimensions[0]
-                    lastRow = (iR+1)*self.dimensions[0]
-                    firstCol = iC*self.dimensions[1]
-                    lastCol = (iC+1)*self.dimensions[1]
-                    self.vals[firstRow:lastRow, firstCol:lastCol] = currImg
+        num_images = np.sqrt(vals[0,].size/dim)
+        if num_images > 1 and num_images.is_integer(): # Show a mosaic of images
+            num_images = np.int(num_images)
+            self.vals = np.zeros((self.dimensions[0]*num_images, self.dimensions[1]*num_images))
+            for iR in range(num_images):
+                for iC in range(num_images):
+                    cur_img_id = iR*num_images + iC
+                    cur_img = np.reshape(vals[0,dim*cur_img_id+np.array(range(dim))], self.dimensions, order=self.order)
+                    first_row = iR*self.dimensions[0]
+                    last_row = (iR+1)*self.dimensions[0]
+                    first_col = iC*self.dimensions[1]
+                    last_col = (iC+1)*self.dimensions[1]
+                    self.vals[first_row:last_row, first_col:last_col] = cur_img
 
         else: 
-            self.vals = np.reshape(vals[0,dim*self.selectImage+np.array(range(dim))], self.dimensions, order='F')
+            self.vals = np.reshape(vals[0,dim*self.select_image+np.array(range(dim))], self.dimensions, order=self.order)
         if self.transpose:
             self.vals = self.vals.T
         # if not self.scale:
@@ -296,8 +315,8 @@ class image_show(matplotlib_show):
             self.vals = -self.vals
 
         # un-normalizing, for visualisation purposes:
-        if self.presetSTD >= 0: # The Mean is assumed to be in the range (0,255)
-            self.vals = self.vals*self.presetSTD + self.presetMean
+        if self.preset_std >= 0: # The Mean is assumed to be in the range (0,255)
+            self.vals = self.vals*self.preset_std + self.preset_mean
             # Clipping the values:
             self.vals[self.vals < 0] = 0
             self.vals[self.vals > 255] = 255

From fe30db1331cd5f4ac20b5e36de0cdf68ba867bfa Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Mon, 14 Oct 2013 09:37:35 +0100
Subject: [PATCH 107/252] Updated sympy code, multioutput grad checks pass
 apart from wrt X. Similar problems with prediction as to sinc covariance,
 needs investigation.

---
 GPy/examples/dimensionality_reduction.py |  4 +-
 GPy/kern/constructors.py                 |  8 ++-
 GPy/kern/parts/sympykern.py              | 81 +++++++++++++++--------
 GPy/util/datasets.py                     | 83 +++++++++++++++++++-----
 4 files changed, 124 insertions(+), 52 deletions(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 8aaeb4ae..298607b6 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -327,8 +327,6 @@ def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
         m.plot_scales("MRD Scales")
     return m
 
-
-
 def brendan_faces():
     from GPy import kern
     data = GPy.util.datasets.brendan_faces()
@@ -342,7 +340,7 @@ def brendan_faces():
     # optimize
     m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped())
 
-    m.optimize('scg', messages=1, max_iters=10)
+    m.optimize('scg', messages=1, max_iters=1000)
 
     ax = m.plot_latent(which_indices=(0, 1))
     y = m.likelihood.Y[0, :]
diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 62c29744..c6a6672f 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -322,17 +322,19 @@ if sympy_available:
             real_input_dim -= 1
         X = sp.symbols('x_:' + str(real_input_dim))
         Z = sp.symbols('z_:' + str(real_input_dim))
-        variance = sp.var('variance',positive=True)
+        scale = sp.var('scale_i scale_j',positive=True)
         if ARD:
             lengthscales = [sp.var('lengthscale%i_i lengthscale%i_j' % i, positive=True) for i in range(real_input_dim)]
-            dist_string = ' + '.join(['(x_%i-z_%i)**2/(lengthscale%i_i*lengthscale%i_j)' % (i, i, i) for i in range(real_input_dim)])
+            shared_lengthscales = [sp.var('shared_lengthscale%i' % i, positive=True) for i in range(real_input_dim)]
+            dist_string = ' + '.join(['(x_%i-z_%i)**2/(shared_lengthscale%i**2 + lengthscale%i_i*lengthscale%i_j)' % (i, i, i) for i in range(real_input_dim)])
             dist = parse_expr(dist_string)
             f =  variance*sp.exp(-dist/2.)
         else:
             lengthscale = sp.var('lengthscale_i lengthscale_j',positive=True)
+            shared_lengthscale = sp.var('shared_lengthscale',positive=True)
             dist_string = ' + '.join(['(x_%i-z_%i)**2' % (i, i) for i in range(real_input_dim)])
             dist = parse_expr(dist_string)
-            f =  variance*sp.exp(-dist/(2*lengthscale_i*lengthscale_j))
+            f =  scale_i*scale_j*sp.exp(-dist/(2*(shared_lengthscale**2 + lengthscale_i*lengthscale_j)))
         return kern(input_dim, [spkern(input_dim, f, output_dim=output_dim, name='eq_sympy')])
 
     def sinc(input_dim, ARD=False, variance=1., lengthscale=1.):
diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py
index 09ab9934..ea603eab 100644
--- a/GPy/kern/parts/sympykern.py
+++ b/GPy/kern/parts/sympykern.py
@@ -43,9 +43,9 @@ class spkern(Kernpart):
         assert all([z.name=='z_%i'%i for i,z in enumerate(self._sp_z)])
         assert len(self._sp_x)==len(self._sp_z)
         self.input_dim = len(self._sp_x)
+        self._real_input_dim = self.input_dim
         if output_dim > 1:
             self.input_dim += 1
-            
         assert self.input_dim == input_dim
         self.output_dim = output_dim
         # extract parameter names
@@ -139,8 +139,10 @@ class spkern(Kernpart):
         self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code)
 
         # This is the basic argument construction for the C code.
-        arg_list = (["X[i*input_dim+%s]"%x.name[2:] for x in self._sp_x]
-                    + ["Z[j*input_dim+%s]"%z.name[2:] for z in self._sp_z])
+        #arg_list = (["X[i*input_dim+%s]"%x.name[2:] for x in self._sp_x]
+        #            + ["Z[j*input_dim+%s]"%z.name[2:] for z in self._sp_z])
+        arg_list = (["X2(i, %s)"%x.name[2:] for x in self._sp_x]
+                    + ["Z2(j, %s)"%z.name[2:] for z in self._sp_z])
         if self.output_dim>1:
             reverse_arg_list = list(arg_list)
             reverse_arg_list.reverse()
@@ -151,17 +153,21 @@ class spkern(Kernpart):
         precompute_list=[]
         if self.output_dim > 1:
             reverse_arg_list+=list(param_arg_list)
-            split_param_arg_list = ["%s[%s]"%(theta.name[:-2],index) for index in ['ii', 'jj'] for theta in self._sp_theta_i]
-            split_param_reverse_arg_list = ["%s[%s]"%(theta.name[:-2],index) for index in ['jj', 'ii'] for theta in self._sp_theta_i]
+            split_param_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['ii', 'jj'] for theta in self._sp_theta_i]
+            split_param_reverse_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['jj', 'ii'] for theta in self._sp_theta_i]
             arg_list += split_param_arg_list
             reverse_arg_list += split_param_reverse_arg_list
-            precompute_list += [' '*16+"int %s=(int)%s[%s*input_dim+output_dim];"%(index, var, index2) for index, var, index2 in zip(['ii', 'jj'], ['X', 'Z'], ['i', 'j'])]
+            # Extract the right output indices from the inputs.
+            c_define_output_indices = [' '*16 + "int %s=(int)%s(%s, %i);"%(index, var, index2, self.input_dim-1) for index, var, index2 in zip(['ii', 'jj'], ['X2', 'Z2'], ['i', 'j'])]
+            precompute_list += c_define_output_indices
             reverse_arg_string = ", ".join(reverse_arg_list)
         arg_string = ", ".join(arg_list)
         precompute_string = "\n".join(precompute_list)
         # Here's the code to do the looping for K
         self._K_code =\
         """
+        // _K_code
+        // Code for computing the covariance function.
         int i;
         int j;
         int N = target_array->dimensions[0];
@@ -171,7 +177,8 @@ class spkern(Kernpart):
         for (i=0;i<N;i++){
             for (j=0;j<num_inducing;j++){
 %s
-                target[i*num_inducing+j] = k(%s);
+                //target[i*num_inducing+j] = 
+                TARGET2(i, j) += k(%s);
             }
         }
         %s
@@ -188,28 +195,33 @@ class spkern(Kernpart):
         # Code to do the looping for Kdiag
         self._Kdiag_code =\
         """
+        // _Kdiag_code
+        // Code for computing diagonal of covariance function.
         int i;
         int N = target_array->dimensions[0];
         int input_dim = X_array->dimensions[1];
         //#pragma omp parallel for
         for (i=0;i<N;i++){
                 %s
-                target[i] = k(%s);
+                //target[i] =
+                TARGET1(i)=k(%s);
         }
         %s
         """%(diag_precompute_string,diag_arg_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
 
         # Code to compute gradients
-        func_list = []
+        grad_func_list = []
         if self.output_dim>1:
-            func_list += [' '*16 + "int %s=(int)%s[%s*input_dim+output_dim];"%(index, var, index2) for index, var, index2 in zip(['ii', 'jj'], ['X', 'Z'], ['i', 'j'])]
-            func_list += [' '*16 + 'target[%i+ii] += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, arg_string) for i, theta in enumerate(self._sp_theta_i)]
-            func_list += [' '*16 + 'target[%i+jj] += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, reverse_arg_string) for i, theta in enumerate(self._sp_theta_i)]
-        func_list += ([' '*16 + 'target[%i] += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in  enumerate(self._sp_theta)])
-        func_string = '\n'.join(func_list) 
+            grad_func_list += c_define_output_indices
+            grad_func_list += [' '*16 + 'TARGET1(%i+ii) += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, arg_string) for i, theta in enumerate(self._sp_theta_i)]
+            grad_func_list += [' '*16 + 'TARGET1(%i+jj) += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, reverse_arg_string) for i, theta in enumerate(self._sp_theta_i)]
+        grad_func_list += ([' '*16 + 'TARGET1(%i) += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in  enumerate(self._sp_theta)])
+        grad_func_string = '\n'.join(grad_func_list) 
 
         self._dK_dtheta_code =\
         """
+        // _dK_dtheta_code
+        // Code for computing gradient of covariance with respect to parameters.
         int i;
         int j;
         int N = partial_array->dimensions[0];
@@ -222,16 +234,18 @@ class spkern(Kernpart):
             }
         }
         %s
-        """%(func_string,"/*"+str(self._sp_k)+"*/") # adding a string representation forces recompile when needed
+        """%(grad_func_string,"/*"+str(self._sp_k)+"*/") # adding a string representation forces recompile when needed
 
 
         # Code to compute gradients for Kdiag TODO: needs clean up
-        diag_func_string = re.sub('Z','X',func_string,count=0)
-        diag_func_string = re.sub('int jj','//int jj',diag_func_string)
-        diag_func_string = re.sub('j','i',diag_func_string)
-        diag_func_string = re.sub('partial\[i\*num_inducing\+i\]','partial[i]',diag_func_string)
+        diag_grad_func_string = re.sub('Z','X',grad_func_string,count=0)
+        diag_grad_func_string = re.sub('int jj','//int jj',diag_grad_func_string)
+        diag_grad_func_string = re.sub('j','i',diag_grad_func_string)
+        diag_grad_func_string = re.sub('partial\[i\*num_inducing\+i\]','partial[i]',diag_grad_func_string)
         self._dKdiag_dtheta_code =\
         """
+        // _dKdiag_dtheta_code
+        // Code for computing gradient of diagonal with respect to parameters.
         int i;
         int N = partial_array->dimensions[0];
         int input_dim = X_array->dimensions[1];
@@ -239,13 +253,19 @@ class spkern(Kernpart):
                 %s
         }
         %s
-        """%(diag_func_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
+        """%(diag_grad_func_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
 
-        # Code for gradients wrt X
-        gradient_funcs = "\n".join(["target[i*input_dim+%i] += partial[i*num_inducing+j]*dk_dx%i(%s);"%(q,q,arg_string) for q in range(self.input_dim)])
+        # Code for gradients wrt X, TODO: may need to deal with special case where one input is actually an output.
+        gradX_func_list = []
+        if self.output_dim>1:
+            gradX_func_list += c_define_output_indices
+        gradX_func_list += ["TARGET2(i, %i) += partial[i*num_inducing+j]*dk_dx_%i(%s);"%(q,q,arg_string) for q in range(self._real_input_dim)]
+        gradX_func_string = "\n".join(gradX_func_list)
 
         self._dK_dX_code = \
         """
+        // _dK_dX_code
+        // Code for computing gradient of covariance with respect to inputs.
         int i;
         int j;
         int N = partial_array->dimensions[0];
@@ -258,24 +278,26 @@ class spkern(Kernpart):
           }
         }
         %s
-        """%(gradient_funcs,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
+        """%(gradX_func_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
   
 
-        diag_gradient_funcs = re.sub('Z','X',gradient_funcs,count=0)
-        diag_gradient_funcs = re.sub('int jj','//int jj',diag_gradient_funcs)
-        diag_gradient_funcs = re.sub('j','i',diag_gradient_funcs)
-        diag_gradient_funcs = re.sub('partial\[i\*num_inducing\+i\]','2*partial[i]',diag_gradient_funcs)
+        diag_gradX_func_string = re.sub('Z','X',gradX_func_string,count=0)
+        diag_gradX_func_string = re.sub('int jj','//int jj',diag_gradX_func_string)
+        diag_gradX_func_string = re.sub('j','i',diag_gradX_func_string)
+        diag_gradX_func_string = re.sub('partial\[i\*num_inducing\+i\]','2*partial[i]',diag_gradX_func_string)
 
         # Code for gradients of Kdiag wrt X
         self._dKdiag_dX_code= \
         """
+        // _dKdiag_dX_code
+        // Code for computing gradient of diagonal with respect to inputs.
         int N = partial_array->dimensions[0];
         int input_dim = X_array->dimensions[1];
         for (int i=0;i<N; i++){
             %s
         }
         %s
-        """%(diag_gradient_funcs,"/*"+str(self._sp_k)+"*/") #adding a
+        """%(diag_gradX_func_string,"/*"+str(self._sp_k)+"*/") #adding a
         # string representation forces recompile when needed Get rid
         # of Zs in argument for diagonal. TODO: Why wasn't
         # diag_func_string called here? Need to check that.
@@ -285,6 +307,9 @@ class spkern(Kernpart):
         self._K_code_X = self._K_code.replace('Z[', 'X[')
         self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z[', 'X[')
         self._dK_dX_code_X = self._dK_dX_code.replace('Z[', 'X[').replace('+= partial[', '+= 2*partial[')
+        self._K_code_X = self._K_code.replace('Z2(', 'X2(')
+        self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z2(', 'X2(')
+        self._dK_dX_code_X = self._dK_dX_code.replace('Z2(', 'X2(')
 
 
         #TODO: insert multiple functions here via string manipulation
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index 45ed694c..a6a97457 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -609,24 +609,8 @@ def olivetti_faces(data_set='olivetti_faces'):
     lbls = np.asarray(lbls)[:, None]
     return data_details_return({'Y': Y, 'lbls' : lbls, 'info': "ORL Faces processed to 64x64 images."}, data_set)
     
-def olympic_100m_men(data_set='rogers_girolami_data'):
-    if not data_available(data_set):
-        download_data(data_set)
-        path = os.path.join(data_path, data_set)
-        tar_file = os.path.join(path, 'firstcoursemldata.tar.gz')
-        tar = tarfile.open(tar_file)
-        print('Extracting file.')
-        tar.extractall(path=path)    
- 
-        tar.close()
-    olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male100']
-
-    X = olympic_data[:, 0][:, None]
-    Y = olympic_data[:, 1][:, None]
-    return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m men from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set)
-
-def olympic_100m_women(data_set='rogers_girolami_data'):
-    if not data_available(data_set):
+def download_rogers_girolami_data():
+    if not data_available('rogers_girolami_data'):
         download_data(data_set)
         path = os.path.join(data_path, data_set)
         tar_file = os.path.join(path, 'firstcoursemldata.tar.gz')
@@ -634,12 +618,55 @@ def olympic_100m_women(data_set='rogers_girolami_data'):
         print('Extracting file.')
         tar.extractall(path=path)
         tar.close()
+
+def olympic_100m_men(data_set='rogers_girolami_data'):
+    download_rogers_girolami_data()
+    olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male100']
+
+    X = olympic_data[:, 0][:, None]
+    Y = olympic_data[:, 1][:, None]
+    return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m men from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set)
+
+def olympic_100m_women(data_set='rogers_girolami_data'):
+    download_rogers_girolami_data()
     olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['female100']
 
     X = olympic_data[:, 0][:, None]
     Y = olympic_data[:, 1][:, None]
     return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m women from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set)
 
+def olympic_200m_women(data_set='rogers_girolami_data'):
+    download_rogers_girolami_data()
+    olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['female200']
+
+    X = olympic_data[:, 0][:, None]
+    Y = olympic_data[:, 1][:, None]
+    return data_details_return({'X': X, 'Y': Y, 'info': "Olympic 200 m winning times for women from 1896 until 2008. Data is from Rogers and Girolami's First Course in Machine Learning."}, data_set)
+
+def olympic_200m_men(data_set='rogers_girolami_data'):
+    download_rogers_girolami_data()
+    olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male200']
+
+    X = olympic_data[:, 0][:, None]
+    Y = olympic_data[:, 1][:, None]
+    return data_details_return({'X': X, 'Y': Y, 'info': "Male 200 m winning times for women from 1896 until 2008. Data is from Rogers and Girolami's First Course in Machine Learning."}, data_set)
+
+def olympic_400m_women(data_set='rogers_girolami_data'):
+    download_rogers_girolami_data()
+    olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['female400']
+
+    X = olympic_data[:, 0][:, None]
+    Y = olympic_data[:, 1][:, None]
+    return data_details_return({'X': X, 'Y': Y, 'info': "Olympic 400 m winning times for women until 2008. Data is from Rogers and Girolami's First Course in Machine Learning."}, data_set)
+
+def olympic_400m_men(data_set='rogers_girolami_data'):
+    download_rogers_girolami_data()
+    olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male400']
+
+    X = olympic_data[:, 0][:, None]
+    Y = olympic_data[:, 1][:, None]
+    return data_details_return({'X': X, 'Y': Y, 'info': "Male 400 m winning times for women until 2008. Data is from Rogers and Girolami's First Course in Machine Learning."}, data_set)
+
 def olympic_marathon_men(data_set='olympic_marathon_men'):
     if not data_available(data_set):
         download_data(data_set)
@@ -648,6 +675,26 @@ def olympic_marathon_men(data_set='olympic_marathon_men'):
     Y = olympics[:, 1:2]
     return data_details_return({'X': X, 'Y': Y}, data_set)
 
+def olympics():
+    """All olympics sprint winning times for multiple output prediction."""
+    X = np.zeros((0, 2))
+    Y = np.zeros((0, 1))
+    for i, dataset in enumerate([olympic_100m_men,
+                              olympic_100m_women,
+                              olympic_200m_men,
+                              olympic_200m_women,
+                              olympic_400m_men,
+                              olympic_400m_women]):
+        data = dataset()
+        year = data['X']
+        time = data['Y']
+        X = np.vstack((X, np.hstack((year, np.ones_like(year)*i))))
+        Y = np.vstack((Y, time))
+    data['X'] = X
+    data['Y'] = Y
+    data['info'] = "Olympics sprint event winning for men and women to 2008. Data is from Rogers and Girolami's First Course in Machine Learning."
+    return data
+
 # def movielens_small(partNo=1,seed=default_seed):
 #     np.random.seed(seed=seed)
 

From c3de628e995f06baa77321dd6a861f792924468b Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Mon, 14 Oct 2013 17:11:39 +0100
Subject: [PATCH 108/252] docstrinfs in kern.py

---
 GPy/kern/kern.py               | 53 ++++++++++++++++++++++++----------
 GPy/kern/parts/hierarchical.py |  2 +-
 2 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 08f36109..805c6b43 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -79,15 +79,14 @@ class kern(Parameterized):
 
 
     def plot_ARD(self, fignum=None, ax=None, title='', legend=False):
-        """If an ARD kernel is present, it bar-plots the ARD parameters.
+        """If an ARD kernel is present, plot a bar representation using matplotlib
 
         :param fignum: figure number of the plot
         :param ax: matplotlib axis to plot on
-        :param title: 
-            title of the plot, 
+        :param title:
+            title of the plot,
             pass '' to not print a title
             pass None for a generic title
-
         """
         if ax is None:
             fig = pb.figure(fignum)
@@ -152,6 +151,13 @@ class kern(Parameterized):
         return ax
 
     def _transform_gradients(self, g):
+        """
+        Apply the transformations of the kernel so that the returned vector
+        represents the gradient in the transformed space (i.e. that given by
+        get_params_transformed())
+
+        :param g: the gradient vector for the current model, usually created by dK_dtheta
+        """
         x = self._get_params()
         [np.put(x, i, x * t.gradfactor(x[i])) for i, t in zip(self.constrained_indices, self.constraints)]
         [np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]]
@@ -162,7 +168,9 @@ class kern(Parameterized):
             return g
 
     def compute_param_slices(self):
-        """create a set of slices that can index the parameters of each part."""
+        """
+        Create a set of slices that can index the parameters of each part.
+        """
         self.param_slices = []
         count = 0
         for p in self.parts:
@@ -170,14 +178,19 @@ class kern(Parameterized):
             count += p.num_params
 
     def __add__(self, other):
-        """
-        Shortcut for `add`.
-        """
+        """ Overloading of the '+' operator. for more control, see self.add """
         return self.add(other)
 
     def add(self, other, tensor=False):
         """
-        Add another kernel to this one. Both kernels are defined on the same _space_
+        Add another kernel to this one.
+
+        If Tensor is False, both kernels are defined on the same _space_. then
+        the created kernel will have the same number of inputs as self and
+        other (which must be the same).
+
+        If Tensor is True, then the dimensions are stacked 'horizontally', so
+        that the resulting kernel has self.input_dim + other.input_dim
 
         :param other: the other kernel to be added
         :type other: GPy.kern
@@ -210,9 +223,7 @@ class kern(Parameterized):
         return newkern
 
     def __mul__(self, other):
-        """
-        Shortcut for `prod`.
-        """
+        """ Here we overload the '*' operator. See self.prod for more information"""
         return self.prod(other)
 
     def __pow__(self, other, tensor=False):
@@ -228,7 +239,7 @@ class kern(Parameterized):
         :param other: the other kernel to be added
         :type other: GPy.kern
         :param tensor: whether or not to use the tensor space (default is false).
-        :type tensor: bool 
+        :type tensor: bool
 
         """
         K1 = self.copy()
@@ -307,6 +318,17 @@ class kern(Parameterized):
         return sum([[name + '_' + n for n in k._get_param_names()] for name, k in zip(names, self.parts)], [])
 
     def K(self, X, X2=None, which_parts='all'):
+        """
+        Compute the kernel function.
+
+        :param X: the first set of inputs to the kernel
+        :param X2: (optional) the second set of arguments to the kernel. If X2
+                   is None, this is passed throgh to the 'part' object, which
+                   handles this as X2 == X.
+        :param which_parts: a list of booleans detailing whether to include
+                            each of the part functions. By default, 'all'
+                            indicates [True]*self.num_parts
+        """
         if which_parts == 'all':
             which_parts = [True] * self.num_parts
         assert X.shape[1] == self.input_dim
@@ -321,7 +343,7 @@ class kern(Parameterized):
     def dK_dtheta(self, dL_dK, X, X2=None):
         """
         Compute the gradient of the covariance function with respect to the parameters.
-        
+
         :param dL_dK: An array of gradients of the objective function with respect to the covariance function.
         :type dL_dK: Np.ndarray (num_samples x num_inducing)
         :param X: Observed data inputs
@@ -329,6 +351,7 @@ class kern(Parameterized):
         :param X2: Observed data inputs (optional, defaults to X)
         :type X2: np.ndarray (num_inducing x input_dim)
 
+        returns: dL_dtheta
         """
         assert X.shape[1] == self.input_dim
         target = np.zeros(self.num_params)
@@ -340,7 +363,7 @@ class kern(Parameterized):
         return self._transform_gradients(target)
 
     def dK_dX(self, dL_dK, X, X2=None):
-        """Compute the gradient of the covariance function with respect to X.
+        """Compute the gradient of the objective function with respect to X.
 
         :param dL_dK: An array of gradients of the objective function with respect to the covariance function.
         :type dL_dK: np.ndarray (num_samples x num_inducing)
diff --git a/GPy/kern/parts/hierarchical.py b/GPy/kern/parts/hierarchical.py
index ab96fdd7..c629f6b9 100644
--- a/GPy/kern/parts/hierarchical.py
+++ b/GPy/kern/parts/hierarchical.py
@@ -7,7 +7,7 @@ from independent_outputs import index_to_slices
 
 class Hierarchical(Kernpart):
     """
-    A kernel part which can reopresent a hierarchy of indepencnce: a gerenalisation of independent_outputs
+    A kernel part which can reopresent a hierarchy of indepencnce: a generalisation of independent_outputs
 
     """
     def __init__(self,parts):

From da2a88826d670f4284d466dd291d539b9428cf47 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Mon, 14 Oct 2013 22:09:41 +0100
Subject: [PATCH 109/252] Basic sim code functional.

---
 GPy/core/model.py           |  2 +-
 GPy/kern/constructors.py    |  4 +--
 GPy/kern/parts/sympykern.py | 67 ++++++++++++++++++++++++++-----------
 GPy/util/symbolic.py        | 12 ++++++-
 4 files changed, 62 insertions(+), 23 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 7aff8f4d..c1ab7b6a 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -259,7 +259,7 @@ class Model(Parameterized):
         these terms are present in the name the parameter is
         constrained positive.
         """
-        positive_strings = ['variance', 'lengthscale', 'precision', 'kappa']
+        positive_strings = ['variance', 'lengthscale', 'precision', 'decay', 'kappa']
         # param_names = self._get_param_names()
         currently_constrained = self.all_constrained_indices()
         to_make_positive = []
diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index c6a6672f..392f43ba 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -330,11 +330,11 @@ if sympy_available:
             dist = parse_expr(dist_string)
             f =  variance*sp.exp(-dist/2.)
         else:
-            lengthscale = sp.var('lengthscale_i lengthscale_j',positive=True)
+            lengthscales = sp.var('lengthscale_i lengthscale_j',positive=True)
             shared_lengthscale = sp.var('shared_lengthscale',positive=True)
             dist_string = ' + '.join(['(x_%i-z_%i)**2' % (i, i) for i in range(real_input_dim)])
             dist = parse_expr(dist_string)
-            f =  scale_i*scale_j*sp.exp(-dist/(2*(shared_lengthscale**2 + lengthscale_i*lengthscale_j)))
+            f =  scale_i*scale_j*sp.exp(-dist/(2*(lengthscale_i**2 + lengthscale_j**2 + shared_lengthscale**2)))
         return kern(input_dim, [spkern(input_dim, f, output_dim=output_dim, name='eq_sympy')])
 
     def sinc(input_dim, ARD=False, variance=1., lengthscale=1.):
diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py
index ea603eab..88c179aa 100644
--- a/GPy/kern/parts/sympykern.py
+++ b/GPy/kern/parts/sympykern.py
@@ -117,6 +117,9 @@ class spkern(Kernpart):
         return spkern(self._sp_k+other._sp_k)
 
     def _gen_code(self):
+        """Generates the C functions necessary for computing the covariance function using the sympy objects as input."""
+        #TODO: maybe generate one C function only to save compile time? Also easier to take that as a basis and hand craft other covariances??
+
         #generate c functions from sympy objects        
         argument_sequence = self._sp_x+self._sp_z+self._sp_theta
         code_list = [('k',self._sp_k)]
@@ -138,15 +141,20 @@ class spkern(Kernpart):
         # Substitute any known derivatives which sympy doesn't compute
         self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code)
 
-        # This is the basic argument construction for the C code.
-        #arg_list = (["X[i*input_dim+%s]"%x.name[2:] for x in self._sp_x]
-        #            + ["Z[j*input_dim+%s]"%z.name[2:] for z in self._sp_z])
+
+        ############################################################
+        # This is the basic argument construction for the C code.  #
+        ############################################################
+        
         arg_list = (["X2(i, %s)"%x.name[2:] for x in self._sp_x]
                     + ["Z2(j, %s)"%z.name[2:] for z in self._sp_z])
+
+        # for multiple outputs need to also provide these arguments reversed.
         if self.output_dim>1:
             reverse_arg_list = list(arg_list)
             reverse_arg_list.reverse()
 
+        # Add in any 'shared' parameters to the list.
         param_arg_list = [shared_params.name for shared_params in self._sp_theta]
         arg_list += param_arg_list
 
@@ -163,6 +171,15 @@ class spkern(Kernpart):
             reverse_arg_string = ", ".join(reverse_arg_list)
         arg_string = ", ".join(arg_list)
         precompute_string = "\n".join(precompute_list)
+
+        # Code to compute argments string needed when only X is provided.
+        X_arg_string = re.sub('Z','X',arg_string)
+        # Code to compute argument string when only diagonal is required.
+        diag_arg_string = re.sub('int jj','//int jj',X_arg_string)
+        diag_arg_string = re.sub('j','i',diag_arg_string)
+        diag_precompute_string = precompute_list[0]
+
+
         # Here's the code to do the looping for K
         self._K_code =\
         """
@@ -184,14 +201,28 @@ class spkern(Kernpart):
         %s
         """%(precompute_string,arg_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
 
-        
-        # Code to compute diagonal of covariance.
-        diag_arg_string = re.sub('Z','X',arg_string)
-        diag_arg_string = re.sub('int jj','//int jj',diag_arg_string)
-        diag_arg_string = re.sub('j','i',diag_arg_string)
-        diag_precompute_string = re.sub('int jj','//int jj',precompute_string)
-        diag_precompute_string = re.sub('Z','X',diag_precompute_string)
-        diag_precompute_string = re.sub('j','i',diag_precompute_string)
+        self._K_code_X = """
+        // _K_code_X
+        // Code for computing the covariance function.
+        int i;
+        int j;
+        int N = target_array->dimensions[0];
+        int num_inducing = target_array->dimensions[1];
+        int input_dim = X_array->dimensions[1];
+        //#pragma omp parallel for private(j)
+        for (i=0;i<N;i++){
+            %s // int ii=(int)X2(i, 1);
+            TARGET2(i, i) += k(%s);
+            for (j=0;j<i;j++){
+              %s //int jj=(int)X2(j, 1);
+              double kval = k(%s); //double kval = k(X2(i, 0), X2(j, 0), shared_lengthscale, LENGTHSCALE1(ii), SCALE1(ii), LENGTHSCALE1(jj), SCALE1(jj));
+              TARGET2(i, j) += kval;
+              TARGET2(j, i) += kval;
+            }
+        }
+        /*%s*/
+        """%(diag_precompute_string, diag_arg_string, re.sub('Z2', 'X2', precompute_list[1]), X_arg_string,str(self._sp_k)) #adding a string representation forces recompile when needed
+
         # Code to do the looping for Kdiag
         self._Kdiag_code =\
         """
@@ -213,9 +244,9 @@ class spkern(Kernpart):
         grad_func_list = []
         if self.output_dim>1:
             grad_func_list += c_define_output_indices
-            grad_func_list += [' '*16 + 'TARGET1(%i+ii) += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, arg_string) for i, theta in enumerate(self._sp_theta_i)]
-            grad_func_list += [' '*16 + 'TARGET1(%i+jj) += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, reverse_arg_string) for i, theta in enumerate(self._sp_theta_i)]
-        grad_func_list += ([' '*16 + 'TARGET1(%i) += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in  enumerate(self._sp_theta)])
+            grad_func_list += [' '*16 + 'TARGET1(%i+ii) += PARTIAL2(i, j)*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, arg_string) for i, theta in enumerate(self._sp_theta_i)]
+            grad_func_list += [' '*16 + 'TARGET1(%i+jj) += PARTIAL2(i, j)*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, reverse_arg_string) for i, theta in enumerate(self._sp_theta_i)]
+        grad_func_list += ([' '*16 + 'TARGET1(%i) += PARTIAL2(i, j)*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in  enumerate(self._sp_theta)])
         grad_func_string = '\n'.join(grad_func_list) 
 
         self._dK_dtheta_code =\
@@ -241,7 +272,7 @@ class spkern(Kernpart):
         diag_grad_func_string = re.sub('Z','X',grad_func_string,count=0)
         diag_grad_func_string = re.sub('int jj','//int jj',diag_grad_func_string)
         diag_grad_func_string = re.sub('j','i',diag_grad_func_string)
-        diag_grad_func_string = re.sub('partial\[i\*num_inducing\+i\]','partial[i]',diag_grad_func_string)
+        diag_grad_func_string = re.sub('PARTIAL2\(i, i\)','PARTIAL1(i)',diag_grad_func_string)
         self._dKdiag_dtheta_code =\
         """
         // _dKdiag_dtheta_code
@@ -259,7 +290,7 @@ class spkern(Kernpart):
         gradX_func_list = []
         if self.output_dim>1:
             gradX_func_list += c_define_output_indices
-        gradX_func_list += ["TARGET2(i, %i) += partial[i*num_inducing+j]*dk_dx_%i(%s);"%(q,q,arg_string) for q in range(self._real_input_dim)]
+        gradX_func_list += ["TARGET2(i, %i) += PARTIAL2(i, j)*dk_dx_%i(%s);"%(q,q,arg_string) for q in range(self._real_input_dim)]
         gradX_func_string = "\n".join(gradX_func_list)
 
         self._dK_dX_code = \
@@ -284,7 +315,7 @@ class spkern(Kernpart):
         diag_gradX_func_string = re.sub('Z','X',gradX_func_string,count=0)
         diag_gradX_func_string = re.sub('int jj','//int jj',diag_gradX_func_string)
         diag_gradX_func_string = re.sub('j','i',diag_gradX_func_string)
-        diag_gradX_func_string = re.sub('partial\[i\*num_inducing\+i\]','2*partial[i]',diag_gradX_func_string)
+        diag_gradX_func_string = re.sub('PARTIAL2\(i, i\)','2*PARTIAL1(i)',diag_gradX_func_string)
 
         # Code for gradients of Kdiag wrt X
         self._dKdiag_dX_code= \
@@ -304,10 +335,8 @@ class spkern(Kernpart):
         #self._dKdiag_dX_code = self._dKdiag_dX_code.replace('Z[j', 'X[i')
 
         # Code to use when only X is provided. 
-        self._K_code_X = self._K_code.replace('Z[', 'X[')
         self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z[', 'X[')
         self._dK_dX_code_X = self._dK_dX_code.replace('Z[', 'X[').replace('+= partial[', '+= 2*partial[')
-        self._K_code_X = self._K_code.replace('Z2(', 'X2(')
         self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z2(', 'X2(')
         self._dK_dX_code_X = self._dK_dX_code.replace('Z2(', 'X2(')
 
diff --git a/GPy/util/symbolic.py b/GPy/util/symbolic.py
index 8b368a77..10c59a5e 100644
--- a/GPy/util/symbolic.py
+++ b/GPy/util/symbolic.py
@@ -22,9 +22,19 @@ class ln_diff_erf(Function):
 class sim_h(Function):
     nargs = 5
 
+    def fdiff(self, argindex=1):
+        pass
+    
     @classmethod
     def eval(cls, t, tprime, d_i, d_j, l):
-        return exp((d_j/2*l)**2)/(d_i+d_j)*(exp(-d_j*(tprime - t))*(erf((tprime-t)/l - d_j/2*l) + erf(t/l + d_j/2*l)) - exp(-(d_j*tprime + d_i))*(erf(tprime/l - d_j/2*l) + erf(d_j/2*l)))
+        # putting in the is_Number stuff forces it to look for a fdiff method for derivative.
+        return (exp((d_j/2*l)**2)/(d_i+d_j)
+                *(exp(-d_j*(tprime - t))
+                  *(erf((tprime-t)/l - d_j/2*l)
+                    + erf(t/l + d_j/2*l))
+                  - exp(-(d_j*tprime + d_i))
+                  *(erf(tprime/l - d_j/2*l)
+                    + erf(d_j/2*l))))
 
 class erfc(Function):
     nargs = 1

From 491eb7243a5ea35b08dc2ba827703ac7f869f188 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Tue, 15 Oct 2013 05:49:11 +0100
Subject: [PATCH 110/252] Added xw_pen data.

---
 GPy/util/datasets.py | 14 ++++++++++++++
 GPy/util/symbolic.py | 26 +++++++++++++++++++-------
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index a6a97457..d13e9f6c 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -145,6 +145,12 @@ The database was created with funding from NSF EIA-0196217.""",
                                         'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
                                         'license' : None,
                                         'size' : 24229368},
+                  'xw_pen' : {'urls' : [neil_url + 'xw_pen/'],
+                                        'files' : [['xw_pen_15.csv']],
+                                        'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""",
+                                        'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005',
+                                        'license' : None,
+                                        'size' : 3410}
                   }
 
 
@@ -608,6 +614,14 @@ def olivetti_faces(data_set='olivetti_faces'):
     Y = np.asarray(Y)
     lbls = np.asarray(lbls)[:, None]
     return data_details_return({'Y': Y, 'lbls' : lbls, 'info': "ORL Faces processed to 64x64 images."}, data_set)
+
+def xw_pen(data_set='xw_pen'):
+    if not data_available(data_set):
+        download_data(data_set)
+    Y = np.loadtxt(os.path.join(data_path, data_set, 'xw_pen_15.csv'), delimiter=',')
+    X = np.arange(485)[:, None]
+    return data_details_return({'Y': Y, 'X': X, 'info': "Tilt data from a personalized digital assistant pen."}, data_set)
+
     
 def download_rogers_girolami_data():
     if not data_available('rogers_girolami_data'):
diff --git a/GPy/util/symbolic.py b/GPy/util/symbolic.py
index 10c59a5e..0b5ca381 100644
--- a/GPy/util/symbolic.py
+++ b/GPy/util/symbolic.py
@@ -28,13 +28,25 @@ class sim_h(Function):
     @classmethod
     def eval(cls, t, tprime, d_i, d_j, l):
         # putting in the is_Number stuff forces it to look for a fdiff method for derivative.
-        return (exp((d_j/2*l)**2)/(d_i+d_j)
-                *(exp(-d_j*(tprime - t))
-                  *(erf((tprime-t)/l - d_j/2*l)
-                    + erf(t/l + d_j/2*l))
-                  - exp(-(d_j*tprime + d_i))
-                  *(erf(tprime/l - d_j/2*l)
-                    + erf(d_j/2*l))))
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+            if (t is S.NaN
+                or tprime is S.NaN
+                or d_i is S.NaN
+                or d_j is S.NaN
+                or l is S.NaN):
+                return S.NaN
+            else:
+                return (exp((d_j/2*l)**2)/(d_i+d_j)
+                        *(exp(-d_j*(tprime - t))
+                          *(erf((tprime-t)/l - d_j/2*l)
+                            + erf(t/l + d_j/2*l))
+                          - exp(-(d_j*tprime + d_i))
+                          *(erf(tprime/l - d_j/2*l)
+                            + erf(d_j/2*l))))
 
 class erfc(Function):
     nargs = 1

From a4c0a941becf8f7818a525ecd6915bf008a3cf0d Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Tue, 15 Oct 2013 05:53:39 +0100
Subject: [PATCH 111/252] Added xw_pen data.

---
 GPy/util/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index d13e9f6c..f5947179 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -620,7 +620,7 @@ def xw_pen(data_set='xw_pen'):
         download_data(data_set)
     Y = np.loadtxt(os.path.join(data_path, data_set, 'xw_pen_15.csv'), delimiter=',')
     X = np.arange(485)[:, None]
-    return data_details_return({'Y': Y, 'X': X, 'info': "Tilt data from a personalized digital assistant pen."}, data_set)
+    return data_details_return({'Y': Y, 'X': X, 'info': "Tilt data from a personalized digital assistant pen. Plot in original paper showed regression between time steps 175 and 275."}, data_set)
 
     
 def download_rogers_girolami_data():

From 96f189113ac037bbb709535c9c75997571c225f6 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 15 Oct 2013 12:25:19 +0100
Subject: [PATCH 112/252] Started on chaining, must remember to chain
 _laplace_gradients aswell!

---
 GPy/likelihoods/laplace.py                    |  14 +-
 .../noise_models/gaussian_noise.py            | 155 +++++-----
 .../noise_models/student_t_noise.py           | 126 +++++----
 GPy/testing/laplace_tests.py                  | 265 +++++++++++-------
 4 files changed, 325 insertions(+), 235 deletions(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 26365467..f4233554 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -76,7 +76,7 @@ class Laplace(likelihood):
         return self.noise_model._set_params(p)
 
     def _shared_gradients_components(self):
-        d3lik_d3fhat = -self.noise_model._d3nlog_mass_dgp3(self.f_hat, self.data, extra_data=self.extra_data)
+        d3lik_d3fhat = self.noise_model.d3logpdf_df3(self.f_hat, self.data, extra_data=self.extra_data)
         dL_dfhat = 0.5*(np.diag(self.Ki_W_i)[:, None]*d3lik_d3fhat).T #why isn't this -0.5?
         I_KW_i = np.eye(self.N) - np.dot(self.K, self.Wi_K_i)
         return dL_dfhat, I_KW_i
@@ -89,7 +89,7 @@ class Laplace(likelihood):
         :rtype: Matrix (1 x num_kernel_params)
         """
         dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlp = -self.noise_model._dnlog_mass_dgp(self.data, self.f_hat)
+        dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data)
 
         #Explicit
         #expl_a = np.dot(self.Ki_f, self.Ki_f.T)
@@ -178,7 +178,7 @@ class Laplace(likelihood):
 
         self.Wi_K_i = self.W12BiW12
         self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        self.lik = -self.noise_model._nlog_mass(self.f_hat, self.data, extra_data=self.extra_data)
+        self.lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
         self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
 
         Z_tilde = (+ self.lik
@@ -223,7 +223,7 @@ class Laplace(likelihood):
         Compute the variables required to compute gaussian Y variables
         """
         #At this point get the hessian matrix (or vector as W is diagonal)
-        self.W = -self.noise_model.d2lik_d2f(self.data, self.f_hat, extra_data=self.extra_data)
+        self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
 
         #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
         self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))
@@ -290,7 +290,7 @@ class Laplace(likelihood):
         old_obj = np.inf
 
         def obj(Ki_f, f):
-            return -0.5*np.dot(Ki_f.T, f) - self.noise_model._nlog_mass(f, self.data, extra_data=self.extra_data)
+            return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data)
 
         difference = np.inf
         epsilon = 1e-6
@@ -299,10 +299,10 @@ class Laplace(likelihood):
         i = 0
 
         while difference > epsilon and i < MAX_ITER:
-            W = -self.noise_model.d2lik_d2f(self.data, f, extra_data=self.extra_data)
+            W = -self.noise_model.d2logpdf_df2(f, self.data, extra_data=self.extra_data)
 
             W_f = W*f
-            grad = -self.noise_model._dnlog_mass_dgp(f, self.data, extra_data=self.extra_data)
+            grad = self.noise_model.dlogpdf_df(f, self.data, extra_data=self.extra_data)
 
             b = W_f + grad
             W12BiW12Kb, _ = self._compute_B_statistics(K, W.copy(), np.dot(K, b))
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index 51b7c6a1..7b2e1a85 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -80,63 +80,82 @@ class Gaussian(NoiseDistribution):
     def _predictive_variance_analytical(self,mu,sigma,predictive_mean=None):
         return 1./(1./self.variance + 1./sigma**2)
 
-    def _mass(self, gp, obs):
+    def _mass(self, link_f, y):
+        #FIXME: Careful now passing link_f in not gp (f)!
         #return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) )
         #Assumes no covariance, exp, sum, log for numerical stability
-        return np.exp(np.sum(np.log(stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance)))))
+        #return np.exp(np.sum(np.log(stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance)))))
+        #return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
+        return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
 
-    def _nlog_mass(self, gp, obs, extra_data=None):
+    def _nlog_mass(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in likelihood.py for link function evaluation\
+                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")
+
+    def _dnlog_mass_dgp(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in likelihood.py for link function evaluation\
+                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")
+
+    def _d2nlog_mass_dgp2(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in likelihood.py for link function evaluation\
+                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")
+
+    def logpdf(self, link_f, y, extra_data=None):
         """
-        Negative Log likelihood function
-        Chained with link function deriative
+        Log likelihood function
 
         .. math::
-            \\-ln p(y_{i}|\\lambda(f_{i})) = +\\frac{D \\ln 2\\pi}{2} + \\frac{\\ln |K|}{2} + \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
+            \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
 
-        :param gp: latent variables (f)
-        :type gp: Nx1 array
-        :param obs: data (y)
-        :type obs: Nx1 array
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: likelihood evaluated for this point
         :rtype: float
         """
-        assert gp.shape == obs.shape
-        return .5*(np.sum((self.gp_link.transf(gp)-obs)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi))
+        assert link_f.shape == y.shape
+        return -0.5*(np.sum((y-link_f)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi))
 
-    def _dnlog_mass_dgp(self, gp, obs, extra_data=None):
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
         """
-        Negative Gradient of the link function at y, given f w.r.t f
-        Chained with link function deriative
+        Gradient of the pdf at y, given link(f) w.r.t link(f)
 
         .. math::
             \\frac{d \\ln p(y_{i}|f_{i})}{df} = \\frac{1}{\\sigma^{2}}(y_{i} - f_{i})
-            \\frac{d \\-ln p(y_{i}|f_{i})}{df} = -\\frac{1}{\\sigma^{2}}(y_{i} - \\lambda(f_{i}))\\frac{d\\lambda(f_{i})}{df_{i}}
 
-        :param gp: latent variables (f)
-        :type gp: Nx1 array
-        :param obs: data (y)
-        :type obs: Nx1 array
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: gradient of negative likelihood evaluated at points
         :rtype: Nx1 array
         """
-        assert gp.shape == obs.shape
-        return (self.gp_link.transf(gp)-obs)/self.variance * self.gp_link.dtransf_df(gp)
+        assert link_f.shape == y.shape
+        s2_i = (1.0/self.variance)
+        grad = s2_i*y - s2_i*link_f
+        return grad
 
-    def _d2nlog_mass_dgp2(self, gp, obs, extra_data=None):
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
         """
-        Negative Hessian at y, given f, w.r.t f the hessian will be 0 unless i == j
+        Hessian at y, given link_f, w.r.t link_f the hessian will be 0 unless i == j
         i.e. second derivative _nlog_mass at y given f_{i} f_{j}  w.r.t f_{i} and f_{j}
-        Chained with link function deriative
 
         .. math::
             \\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
 
-        :param gp: latent variables (f)
-        :type gp: Nx1 array
-        :param obs: data (y)
-        :type obs: Nx1 array
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
         :rtype: Nx1 array
@@ -145,91 +164,89 @@ class Gaussian(NoiseDistribution):
             Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
             (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
         """
-        assert gp.shape == obs.shape
-        #FIXME: Why squared?
-        return ((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)/self.variance
+        assert link_f.shape == y.shape
+        hess = -(1.0/self.variance)*np.ones((self.N, 1))
+        return hess
 
-    def _d3nlog_mass_dgp3(self, gp, obs, extra_data=None):
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
         """
-        Third order derivative log-likelihood function at y given f w.r.t f
-        Chained with link function deriative
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
 
         .. math::
             \\frac{d^{3} \\ln p(y_{i}|f_{i})}{d^{3}f} = 0
 
-        :param gp: latent variables (f)
-        :type gp: Nx1 array
-        :param obs: data (y)
-        :type obs: Nx1 array
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: third derivative of likelihood evaluated at points f
         :rtype: Nx1 array
         """
-        assert gp.shape == obs.shape
-        d2lambda_df2 = self.gp_link.d2transf_df2(gp)
-        return ((self.gp_link.transf(gp)-obs)*self.gp_link.d3transf_df3(gp) - self.gp_link.dtransf_df(gp)*d2lambda_df2 + d2lambda_df2)/self.variance
+        assert link_f.shape == y.shape
+        d3logpdf_dlink3 = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
+        return d3logpdf_dlink3
 
-    def _dnlog_mass_dvar(self, gp, obs, extra_data=None):
+    def dlogpdf_dvar(self, link_f, y, extra_data=None):
         """
-        Gradient of the negative log-likelihood function at y given f, w.r.t variance parameter (noise_variance)
+        Gradient of the negative log-likelihood function at y given link(f), w.r.t variance parameter (noise_variance)
 
         .. math::
             \\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = \\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - f_{i})^{2}}{2\\sigma^{4}}
 
-        :param gp: latent variables (f)
-        :type gp: Nx1 array
-        :param obs: data (y)
-        :type obs: Nx1 array
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
         :rtype: float
         """
-        assert gp.shape == obs.shape
-        e = (obs - self.gp_link.transf(gp))
+        assert link_f.shape == y.shape
+        e = y - link_f
         s_4 = 1.0/(self.variance**2)
-        dnlik_dsigma = 0.5*self.N/self.variance - 0.5*s_4*np.dot(e.T, e)
-        return np.sum(dnlik_dsigma) # Sure about this sum?
+        dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.dot(e.T, e)
+        return np.sum(dlik_dsigma) # Sure about this sum?
 
-    def _dnlog_mass_dgp_dvar(self, gp, obs, extra_data=None):
+    def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
         """
-        Derivative of the dlik_df w.r.t variance parameter (noise_variance)
+        Derivative of the dlogpdf_dlink w.r.t variance parameter (noise_variance)
 
         .. math::
             \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|f_{i})}{df}) = \\frac{1}{\\sigma^{4}}(-y_{i} + f_{i})
 
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
         :rtype: Nx1 array
         """
-        assert gp.shape == obs.shape
+        assert link_f.shape == y.shape
         s_4 = 1.0/(self.variance**2)
-        dnlik_grad_dsigma = s_4*(obs - self.gp_link.transf(gp))*self.gp_link.dtransf_df(gp)
-        return dnlik_grad_dsigma
+        dlik_grad_dsigma = -np.dot(s_4*self.I, y) + np.dot(s_4*self.I, link_f)
+        return dlik_grad_dsigma
 
-    def _d2nlog_mass_dgp2_dvar(self, gp, obs, extra_data=None):
+    def d2logpdf_dlink2_dvar(self, link_f, y, extra_data=None):
         """
-        Gradient of the hessian (d2lik_d2f) w.r.t variance parameter (noise_variance)
+        Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (noise_variance)
 
         .. math::
             \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f}) = \\frac{1}{\\sigma^{4}}
 
-        :param gp: latent variables (f)
-        :type gp: Nx1 array
-        :param obs: data (y)
-        :type obs: Nx1 array
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
         :rtype: Nx1 array
         """
-        assert gp.shape == obs.shape
+        assert link_f.shape == y.shape
         s_4 = 1.0/(self.variance**2)
-        #FIXME: Why squared?
-        dnlik_hess_dvar = -s_4*((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)
-        return dnlik_hess_dvar
+        d2logpdf_dlink2_dvar = np.diag(s_4*self.I)[:, None]
+        return d2logpdf_dlink2_dvar
 
     def _mean(self,gp):
         """
diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index c4319313..dcd41fda 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -40,64 +40,82 @@ class StudentT(NoiseDistribution):
     def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * self.sigma2
 
-    def _nlog_mass(self, gp, obs, extra_data=None):
+    def _nlog_mass(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in likelihood.py for link function evaluation\
+                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")
+
+    def _dnlog_mass_dgp(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in likelihood.py for link function evaluation\
+                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")
+
+    def _d2nlog_mass_dgp2(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in likelihood.py for link function evaluation\
+                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")
+
+    def logpdf(self, link_f, y, extra_data=None):
         """
         Log Likelihood Function
 
         .. math::
             \\ln p(y_{i}|f_{i}) = \\ln \\Gamma(\\frac{v+1}{2}) - \\ln \\Gamma(\\frac{v}{2})\\sqrt{v \\pi}\sigma - \\frac{v+1}{2}\\ln (1 + \\frac{1}{v}\\left(\\frac{y_{i} - f_{i}}{\\sigma}\\right)^2
 
-        :param gp: latent variables (f)
-        :type gp: Nx1 array
-        :param obs: data (y)
-        :type obs: Nx1 array
+        :param link_f: latent variables (link(f))
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: likelihood evaluated for this point
         :rtype: float
 
         """
-        assert gp.shape == obs.shape
-        e = obs - self.gp_link.transf(gp)
+        assert link_f.shape == y.shape
+        e = y - link_f
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
                      - 0.5*np.log(self.sigma2 * self.v * np.pi)
                      - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
                     )
-        return -np.sum(objective)
+        return np.sum(objective)
 
-    def dlik_df(self, y, f, extra_data=None):
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
         """
-        Gradient of the log likelihood function at y, given f w.r.t f
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
 
         .. math::
             \\frac{d \\ln p(y_{i}|f_{i})}{df} = \\frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \\sigma^{2}v}
 
+        :param link_f: latent variables (f)
+        :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: gradient of likelihood evaluated at points
         :rtype: Nx1 array
 
         """
-        assert y.shape == f.shape
-        e = y - f
+        assert y.shape == link_f.shape
+        e = y - link_f
         grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
         return grad
 
-    def d2lik_d2f(self, y, f, extra_data=None):
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
         """
-        Hessian at y, given f, w.r.t f the hessian will be 0 unless i == j
+        Hessian at y, given link(f), w.r.t link(f) the hessian will be 0 unless i == j
         i.e. second derivative lik_function at y given f_{i} f_{j}  w.r.t f_{i} and f_{j}
 
         .. math::
             \\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = \\frac{(v+1)((y_{i}-f_{i})^{2} - \\sigma^{2}v)}{((y_{i}-f_{i})^{2} + \\sigma^{2}v)^{2}}
 
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
         :rtype: Nx1 array
@@ -106,101 +124,101 @@ class StudentT(NoiseDistribution):
             Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
             (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
         """
-        assert y.shape == f.shape
-        e = y - f
+        assert y.shape == link_f.shape
+        e = y - link_f
         hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
         return hess
 
-    def d3lik_d3f(self, y, f, extra_data=None):
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
         """
         Third order derivative log-likelihood function at y given f w.r.t f
 
         .. math::
             \\frac{d^{3} \\ln p(y_{i}|f_{i})}{d^{3}f} = \\frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \\sigma^{2} v))}{((y_{i} - f_{i}) + \\sigma^{2} v)^3}
 
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: third derivative of likelihood evaluated at points f
         :rtype: Nx1 array
         """
-        assert y.shape == f.shape
-        e = y - f
-        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
+        assert y.shape == link_f.shape
+        e = y - link_f
+        d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
                        ((e**2 + self.sigma2*self.v)**3)
                     )
-        return d3lik_d3f
+        return d3lik_dlink3
 
-    def dlik_dvar(self, y, f, extra_data=None):
+    def dlogpdf_dvar(self, link_f, y, extra_data=None):
         """
         Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise)
 
         .. math::
             \\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = \\frac{v((y_{i} - f_{i})^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - f_{i})^{2})}
 
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
         :rtype: float
         """
-        assert y.shape == f.shape
-        e = y - f
-        dlik_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
-        #FIXME: May not want to sum over all dimensions if using many D?
-        return np.sum(dlik_dvar)
+        assert y.shape == link_f.shape
+        e = y - link_f
+        dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
+        #FIXME: Careful as this hasn't been chained with dlink_var, not sure if we want link functions on our parameters?! Shouldn't need them with constraints
+        return np.sum(dlogpdf_dvar)
 
-    def dlik_df_dvar(self, y, f, extra_data=None):
+    def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
         """
-        Derivative of the dlik_df w.r.t variance parameter (t_noise)
+        Derivative of the dlogpdf_dlink w.r.t variance parameter (t_noise)
 
         .. math::
             \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|f_{i})}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \\sigma^2 v)^2}
 
+        :param link_f: latent variables link_f
+        :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
         :rtype: Nx1 array
         """
-        assert y.shape == f.shape
-        e = y - f
-        dlik_grad_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
-        return dlik_grad_dvar
+        assert y.shape == link_f.shape
+        e = y - link_f
+        dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
+        return dlogpdf_dlink_dvar
 
-    def d2lik_d2f_dvar(self, y, f, extra_data=None):
+    def d2logpdf_dlink2_dvar(self, link_f, y, extra_data=None):
         """
-        Gradient of the hessian (d2lik_d2f) w.r.t variance parameter (t_noise)
+        Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (t_noise)
 
         .. math::
             \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - f_{i})^{2})}{(\\sigma^{2}v + (y_{i} - f_{i})^{2})^{3}}
 
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param f: latent variables f
-        :type f: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
         :rtype: Nx1 array
         """
-        assert y.shape == f.shape
-        e = y - f
-        dlik_hess_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
+        assert y.shape == link_f.shape
+        e = y - link_f
+        d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
                               / ((self.sigma2*self.v + (e**2))**3)
                            )
-        return dlik_hess_dvar
+        return d2logpdf_dlink2_dvar
 
     def _laplace_gradients(self, y, f, extra_data=None):
         #must be listed in same order as 'get_param_names'
-        derivs = ([self.dlik_dvar(y, f, extra_data=extra_data)],
-                  [self.dlik_df_dvar(y, f, extra_data=extra_data)],
-                  [self.d2lik_d2f_dvar(y, f, extra_data=extra_data)]
+        derivs = ([self.dlogpdf_dvar(f, y, extra_data=extra_data)],
+                  [self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)],
+                  [self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)]
                  ) # lists as we might learn many parameters
         # ensure we have gradients for every parameter we want to optimize
         assert len(derivs[0]) == len(self._get_param_names())
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index 1154052e..936241b1 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -89,91 +89,124 @@ class LaplaceTests(unittest.TestCase):
         self.f = None
         self.X = None
 
-    def test_lik_mass(self):
+    def test_mass_logpdf(self):
         print "\n{}".format(inspect.stack()[0][3])
         np.testing.assert_almost_equal(
-                                np.sum(self.gauss._nlog_mass(self.f.copy(), self.Y.copy())),
-                                -self.gauss.lik_function(self.Y.copy(), self.f.copy()))
+                               np.log(self.gauss._mass(self.f.copy(), self.Y.copy())),
+                               self.gauss.logpdf(self.f.copy(), self.Y.copy()))
 
-    def test_mass_nlog_mass(self):
+
+    """ dGauss_df's """
+    @unittest.skip("Not Implemented Yet")
+    def test_gaussian_dlogpdf_df(self):
+        #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
-        np.testing.assert_almost_equal(
-                               -np.log(self.gauss._mass(self.f.copy(), self.Y.copy())),
-                               self.gauss._nlog_mass(self.f.copy(), self.Y.copy()))
-
-    def test_mass_dnlog_mass_dgp_ndlik_df(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        np.testing.assert_almost_equal(
-                               self.gauss._dnlog_mass_dgp(gp=self.f.copy(), obs=self.Y.copy()),
-                               -self.gauss.dlik_df(y=self.Y.copy(), f=self.f.copy()))
-
-    def test_mass_d2nlog_mass_dgp2_nd2lik_d2f(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        np.testing.assert_almost_equal(
-                               self.gauss._d2nlog_mass_dgp2(gp=self.f.copy(), obs=self.Y.copy()),
-                               -self.gauss.d2lik_d2f(y=self.Y.copy(), f=self.f.copy()))
-
-    def test_mass_d2nlog_mass_dgp3_nd2lik_d3f(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        np.testing.assert_almost_equal(
-                               self.gauss._d3nlog_mass_dgp3(gp=self.f.copy(), obs=self.Y.copy()),
-                               -self.gauss.d3lik_d3f(y=self.Y.copy(), f=self.f.copy()))
-
-
-    def test_gaussian_dnlog_mass_dgp(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        link = functools.partial(self.gauss._nlog_mass, obs=self.Y)
-        dlik_df = functools.partial(self.gauss._dnlog_mass_dgp, obs=self.Y)
-        grad = GradientChecker(link, dlik_df, self.f.copy(), 'g')
+        logpdf = functools.partial(self.gauss.logpdf, y=self.Y)
+        dlogpdf_df = functools.partial(self.gauss.dlogpdf_df, y=self.Y)
+        grad = GradientChecker(logpdf, dlogpdf_df, self.f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    def test_gaussian_d2nlog_mass_d2gp(self):
+    @unittest.skip("Not Implemented Yet")
+    def test_gaussian_d2logpdf_df2(self):
+        #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
-        link = functools.partial(self.gauss._dnlog_mass_dgp, obs=self.Y)
-        dlik_df = functools.partial(self.gauss._d2nlog_mass_dgp2, obs=self.Y)
-        grad = GradientChecker(link, dlik_df, self.f.copy(), 'g')
+        dlogpdf_df = functools.partial(self.gauss.dlogpdf_df, y=self.Y)
+        d2logpdf_df2 = functools.partial(self.gauss.d2logpdf_df2, y=self.Y)
+        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, self.f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    def test_gaussian_d3nlog_mass_d3gp(self):
+    @unittest.skip("Not Implemented Yet")
+    def test_gaussian_d3logpdf_df3(self):
+        #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
-        link = functools.partial(self.gauss._d2nlog_mass_dgp2, obs=self.Y)
-        dlik_df = functools.partial(self.gauss._d3nlog_mass_dgp3, obs=self.Y)
-        grad = GradientChecker(link, dlik_df, self.f.copy(), 'g')
+        d2logpdf_df2 = functools.partial(self.gauss.d2logpdf_df2, y=self.Y)
+        d3logpdf_df3 = functools.partial(self.gauss.d3logpdf_df3, y=self.Y)
+        grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, self.f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    def test_gaussian_dnlog_mass_dvar(self):
+    @unittest.skip("Not Implemented Yet")
+    def test_gaussian_dlogpdf_df_dvar(self):
+        #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.gauss._nlog_mass, self.gauss._dnlog_mass_dvar,
-                    [self.var], args=(self.Y, self.f), constrain_positive=True,
+                dparam_checkgrad(self.gauss.dlogpdf_df, self.gauss.dlogpdf_df_dvar,
+                    [self.var], args=(self.f, self.Y), constrain_positive=True,
                     randomize=False, verbose=True)
                 )
 
-    def test_gaussian_dnlog_mass_dgp_dvar(self):
+    @unittest.skip("Not Implemented Yet")
+    def test_gaussian_d2logpdf2_df2_dvar(self):
+        #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.gauss._dnlog_mass_dgp, self.gauss._dnlog_mass_dgp_dvar,
-                    [self.var], args=(self.Y, self.f), constrain_positive=True,
+                dparam_checkgrad(self.gauss.d2logpdf_df2, self.gauss.d2logpdf_df2_dvar,
+                    [self.var], args=(self.f, self.Y), constrain_positive=True,
                     randomize=False, verbose=True)
                 )
 
-    def test_gaussian_d2nlog_mass_d2gp_dvar(self):
+
+    """ dGauss_dlink's """
+    def test_gaussian_dlogpdf_dlink(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        logpdf = functools.partial(self.gauss.logpdf, y=self.Y)
+        dlogpdf_dlink = functools.partial(self.gauss.dlogpdf_dlink, y=self.Y)
+        grad = GradientChecker(logpdf, dlogpdf_dlink, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
+
+    def test_gaussian_d2logpdf_dlink2(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        dlogpdf_dlink = functools.partial(self.gauss.dlogpdf_dlink, y=self.Y)
+        d2logpdf_dlink2 = functools.partial(self.gauss.d2logpdf_dlink2, y=self.Y)
+        grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
+
+    def test_gaussian_d3logpdf_dlink3(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        d2logpdf_dlink2 = functools.partial(self.gauss.d2logpdf_dlink2, y=self.Y)
+        d3logpdf_dlink3 = functools.partial(self.gauss.d3logpdf_dlink3, y=self.Y)
+        grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
+
+    def test_gaussian_dlogpdf_dvar(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.gauss._d2nlog_mass_dgp2, self.gauss._d2nlog_mass_dgp2_dvar,
-                    [self.var], args=(self.Y, self.f), constrain_positive=True,
+                dparam_checkgrad(self.gauss.logpdf, self.gauss.dlogpdf_dvar,
+                    [self.var], args=(self.f, self.Y), constrain_positive=True,
                     randomize=False, verbose=True)
                 )
 
+    def test_gaussian_dlogpdf_dlink_dvar(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.assertTrue(
+                dparam_checkgrad(self.gauss.dlogpdf_dlink, self.gauss.dlogpdf_dlink_dvar,
+                    [self.var], args=(self.f, self.Y), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
+
+    def test_gaussian_d2logpdf2_dlink2_dvar(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.assertTrue(
+                dparam_checkgrad(self.gauss.d2logpdf_dlink2, self.gauss.d2logpdf_dlink2_dvar,
+                    [self.var], args=(self.f, self.Y), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
+
+
     """ Gradchecker fault """
     @unittest.expectedFailure
-    def test_gaussian_d2lik_d2f_2(self):
+    def test_gaussian_d2logpdf_df2_2(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.Y = None
         self.gauss = None
@@ -187,99 +220,121 @@ class LaplaceTests(unittest.TestCase):
         self.f = np.random.rand(self.N, 1)
         self.gauss = GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N)
 
-        dlik_df = functools.partial(self.gauss._dnlog_mass_dgp, obs=self.Y)
-        d2lik_d2f = functools.partial(self.gauss._d2nlog_mass_dgp2, obs=self.Y)
-        grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        grad.checkgrad()
-
-        self.assertTrue(grad.checkgrad())
-
-    def test_gaussian_d3lik_d3f(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        d2lik_d2f = functools.partial(self.gauss.d2lik_d2f, self.Y)
-        d3lik_d3f = functools.partial(self.gauss.d3lik_d3f, self.Y)
-        grad = GradientChecker(d2lik_d2f, d3lik_d3f, self.f.copy(), 'f')
+        dlogpdf_df = functools.partial(self.gauss.dlogpdf_df, y=self.Y)
+        d2logpdf_df2 = functools.partial(self.gauss.d2logpdf_df2, y=self.Y)
+        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, self.f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    def test_gaussian_dlik_dvar(self):
+    """ dStudentT_df's """
+    @unittest.skip("Not Implemented Yet")
+    def test_studentt_dlogpdf_df(self):
+        #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.gauss.lik_function, self.gauss.dlik_dvar,
-                    [self.var], args=(self.Y, self.f), constrain_positive=True,
-                    randomize=False, verbose=True)
-                )
-
-    def test_gaussian_dlik_df_dvar(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.gauss.dlik_df, self.gauss.dlik_df_dvar,
-                    [self.var], args=(self.Y.copy(), self.f.copy()), constrain_positive=True,
-                    randomize=False, verbose=True)
-                )
-
-    def test_gaussian_d2lik_d2f_dvar(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.gauss.d2lik_d2f, self.gauss.d2lik_d2f_dvar,
-                    [self.var], args=(self.Y, self.f.copy()), constrain_positive=True,
-                    randomize=True, verbose=True)
-                )
-
-    def test_studentt_dlik_df(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        link = functools.partial(self.stu_t.lik_function, self.Y)
-        dlik_df = functools.partial(self.stu_t.dlik_df, self.Y)
-        grad = GradientChecker(link, dlik_df, self.f.copy(), 'f')
+        link = functools.partial(self.stu_t.logpdf, y=self.Y)
+        dlogpdf_df = functools.partial(self.stu_t.dlogpdf_df, y=self.Y)
+        grad = GradientChecker(link, dlogpdf_df, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    def test_studentt_d2lik_d2f(self):
+    @unittest.skip("Not Implemented Yet")
+    def test_studentt_d2logpdf_df2(self):
+        #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
-        dlik_df = functools.partial(self.stu_t.dlik_df, self.Y)
-        d2lik_d2f = functools.partial(self.stu_t.d2lik_d2f, self.Y)
-        grad = GradientChecker(dlik_df, d2lik_d2f, self.f.copy(), 'f')
+        dlogpdf_df = functools.partial(self.stu_t.dlogpdf_df, y=self.Y)
+        d2logpdf_df2 = functools.partial(self.stu_t.d2logpdf_df2, y=self.Y)
+        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
+    @unittest.skip("Not Implemented Yet")
     def test_studentt_d3lik_d3f(self):
+        #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
-        d2lik_d2f = functools.partial(self.stu_t.d2lik_d2f, self.Y)
-        d3lik_d3f = functools.partial(self.stu_t.d3lik_d3f, self.Y)
-        grad = GradientChecker(d2lik_d2f, d3lik_d3f, self.f.copy(), 'f')
+        d2logpdf_df2 = functools.partial(self.stu_t.d2logpdf_d2f, y=self.Y)
+        d3logpdf_df3 = functools.partial(self.stu_t.d3logpdf_d3f, y=self.Y)
+        grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    def test_studentt_dlik_dvar(self):
+    @unittest.skip("Not Implemented Yet")
+    def test_studentt_dlogpdf_df_dvar(self):
+        #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.stu_t.lik_function, self.stu_t.dlik_dvar,
+                dparam_checkgrad(self.stu_t.dlogpdf_df, self.stu_t.dlogpdf_df_dvar,
                     [self.var], args=(self.Y.copy(), self.f.copy()),
                     constrain_positive=True, randomize=True, verbose=True)
                 )
 
-    def test_studentt_dlik_df_dvar(self):
+    @unittest.skip("Not Implemented Yet")
+    def test_studentt_d2logpdf_df2_dvar(self):
+        #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.stu_t.dlik_df, self.stu_t.dlik_df_dvar,
+                dparam_checkgrad(self.stu_t.d2logpdf_df2, self.stu_t.d2logpdf_df2_dvar,
                     [self.var], args=(self.Y.copy(), self.f.copy()),
                     constrain_positive=True, randomize=True, verbose=True)
                 )
 
-    def test_studentt_d2lik_d2f_dvar(self):
+    """ dStudentT_dlink's """
+    def test_studentt_dlogpdf_dlink(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        logpdf = functools.partial(self.stu_t.logpdf, y=self.Y)
+        dlogpdf_dlink = functools.partial(self.stu_t.dlogpdf_dlink, y=self.Y)
+        grad = GradientChecker(logpdf, dlogpdf_dlink, self.f.copy(), 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
+
+    def test_studentt_d2logpdf_dlink2(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        dlogpdf_dlink = functools.partial(self.stu_t.dlogpdf_dlink, y=self.Y)
+        d2logpdf_dlink2 = functools.partial(self.stu_t.d2logpdf_dlink2, y=self.Y)
+        grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, self.f.copy(), 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
+
+    def test_studentt_d3logpdf_dlink3(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        d2logpdf_dlink2 = functools.partial(self.stu_t.d2logpdf_dlink2, y=self.Y)
+        d3logpdf_dlink3 = functools.partial(self.stu_t.d3logpdf_dlink3, y=self.Y)
+        grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, self.f.copy(), 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
+
+    def test_studentt_dlogpdf_dvar(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.stu_t.d2lik_d2f, self.stu_t.d2lik_d2f_dvar,
+                dparam_checkgrad(self.stu_t.logpdf, self.stu_t.dlogpdf_dvar,
                     [self.var], args=(self.Y.copy(), self.f.copy()),
                     constrain_positive=True, randomize=True, verbose=True)
                 )
 
+    def test_studentt_dlogpdf_dlink_dvar(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.assertTrue(
+                dparam_checkgrad(self.stu_t.dlogpdf_dlink, self.stu_t.dlogpdf_dlink_dvar,
+                    [self.var], args=(self.Y.copy(), self.f.copy()),
+                    constrain_positive=True, randomize=True, verbose=True)
+                )
+
+    def test_studentt_d2logpdf_dlink2_dvar(self):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.assertTrue(
+                dparam_checkgrad(self.stu_t.d2logpdf_dlink2, self.stu_t.d2logpdf_dlink2_dvar,
+                    [self.var], args=(self.Y.copy(), self.f.copy()),
+                    constrain_positive=True, randomize=True, verbose=True)
+                )
+
+
+    """ Grad check whole models (grad checking Laplace not just noise models """
     def test_gauss_rbf(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.Y = self.Y/self.Y.max()

From 03443245713db87edf475aba2718990e8cda373e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 15 Oct 2013 18:58:41 +0100
Subject: [PATCH 113/252] Still tidying up, laplace now working again, gaussian
 and student_t likelihoods now done

---
 GPy/likelihoods/laplace.py                    | 10 +--
 .../noise_models/gaussian_noise.py            | 30 +++----
 .../noise_models/noise_distributions.py       | 86 +++++++++++++++++++
 .../noise_models/student_t_noise.py           | 47 +++-------
 GPy/testing/laplace_tests.py                  | 48 +++++------
 GPy/util/misc.py                              | 27 ++++++
 6 files changed, 167 insertions(+), 81 deletions(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index f4233554..8019e430 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -89,7 +89,7 @@ class Laplace(likelihood):
         :rtype: Matrix (1 x num_kernel_params)
         """
         dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data)
+        dlp = self.noise_model.dlogpdf_df(self.f_hat, self.data, extra_data=self.extra_data)
 
         #Explicit
         #expl_a = np.dot(self.Ki_f, self.Ki_f.T)
@@ -121,20 +121,20 @@ class Laplace(likelihood):
         :rtype: array of derivatives (1 x num_likelihood_params)
         """
         dL_dfhat, I_KW_i = self._shared_gradients_components()
-        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.data, self.f_hat)
+        dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
 
         num_params = len(dlik_dthetaL)
         # make space for one derivative for each likelihood parameter
         dL_dthetaL = np.zeros(num_params)
         for thetaL_i in range(num_params):
             #Explicit
-            dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
+            dL_dthetaL_exp = ( np.sum(dlik_dthetaL[:, thetaL_i])
                              #- 0.5*np.trace(mdot(self.Ki_W_i, (self.K, np.diagflat(dlik_hess_dthetaL[thetaL_i]))))
-                             + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[thetaL_i])
+                             + np.dot(0.5*np.diag(self.Ki_W_i)[:,None].T, dlik_hess_dthetaL[:, thetaL_i])
                              )
 
             #Implicit
-            dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[thetaL_i])
+            dfhat_dthetaL = mdot(I_KW_i, self.K, dlik_grad_dthetaL[:, thetaL_i])
             dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
             dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
 
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index 7b2e1a85..8bce30b7 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -36,18 +36,6 @@ class Gaussian(NoiseDistribution):
         #self.ln_det_K = np.sum(np.log(np.diag(self.covariance_matrix)))
         self.ln_det_K = self.N*np.log(self.variance)
 
-    def _laplace_gradients(self, y, f, extra_data=None):
-        #must be listed in same order as 'get_param_names'
-        derivs = ([-self._dnlog_mass_dvar(f, y, extra_data=extra_data)],
-                  [-self._dnlog_mass_dgp_dvar(f, y, extra_data=extra_data)],
-                  [-self._d2nlog_mass_dgp2_dvar(f, y, extra_data=extra_data)]
-                 ) # lists as we might learn many parameters
-        # ensure we have gradients for every parameter we want to optimize
-        assert len(derivs[0]) == len(self._get_param_names())
-        assert len(derivs[1]) == len(self._get_param_names())
-        assert len(derivs[2]) == len(self._get_param_names())
-        return derivs
-
     def _gradients(self,partial):
         return np.zeros(1)
         #return np.sum(partial)
@@ -106,9 +94,9 @@ class Gaussian(NoiseDistribution):
                             rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
                             its derivatives")
 
-    def logpdf(self, link_f, y, extra_data=None):
+    def logpdf_link(self, link_f, y, extra_data=None):
         """
-        Log likelihood function
+        Log likelihood function given link(f)
 
         .. math::
             \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
@@ -187,7 +175,7 @@ class Gaussian(NoiseDistribution):
         d3logpdf_dlink3 = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
         return d3logpdf_dlink3
 
-    def dlogpdf_dvar(self, link_f, y, extra_data=None):
+    def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
         """
         Gradient of the negative log-likelihood function at y given link(f), w.r.t variance parameter (noise_variance)
 
@@ -248,6 +236,18 @@ class Gaussian(NoiseDistribution):
         d2logpdf_dlink2_dvar = np.diag(s_4*self.I)[:, None]
         return d2logpdf_dlink2_dvar
 
+    def dlogpdf_link_dtheta(self, f, y, extra_data=None):
+        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, extra_data=extra_data)
+        return np.asarray([[dlogpdf_dvar]])
+
+    def dlogpdf_dlink_dtheta(self, f, y, extra_data=None):
+        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)
+        return dlogpdf_dlink_dvar
+
+    def d2logpdf_dlink2_dtheta(self, f, y, extra_data=None):
+        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)
+        return d2logpdf_dlink2_dvar
+
     def _mean(self,gp):
         """
         Expected value of y under the Mass (or density) function p(y|f)
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 29b71795..6b36f42b 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -9,6 +9,7 @@ import pylab as pb
 from GPy.util.plot import gpplot
 from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import gp_transformations
+from GPy.util.misc import chain_1, chain_2, chain_3
 
 
 class NoiseDistribution(object):
@@ -398,6 +399,89 @@ class NoiseDistribution(object):
         """
         return sp.optimize.fmin_ncg(self._nlog_joint_predictive_scaled,x0=(mu,self.gp_link.transf(mu)),fprime=self._gradient_nlog_joint_predictive,fhess=self._hessian_nlog_joint_predictive,args=(mu,sigma),disp=False)
 
+    def logpdf(self, f, y, extra_data=None):
+        """
+        Evaluates the link function link(f) then computes the log likelihood using it
+        """
+        link_f = self.gp_link.transf(f)
+        return self.logpdf_link(f, y, extra_data=extra_data)
+
+    def dlogpdf_df(self, f, y, extra_data=None):
+        """
+        TODO: Doc strings
+        """
+        link_f = self.gp_link.transf(f)
+        dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
+        dlink_df = self.gp_link.dtransf_df(f)
+        return chain_1(dlogpdf_dlink, dlink_df)
+
+    def d2logpdf_df2(self, f, y, extra_data=None):
+        """
+        TODO: Doc strings
+        """
+        link_f = self.gp_link.transf(f)
+        d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, extra_data=extra_data)
+        dlink_df = self.gp_link.dtransf_df(f)
+        dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
+        d2link_df2 = self.gp_link.d2transf_df2(f)
+        return chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)
+
+    def d3logpdf_df3(self, f, y, extra_data=None):
+        """
+        TODO: Doc strings
+        """
+        link_f = self.gp_link.transf(f)
+        d3logpdf_dlink3 = self.d3logpdf_dlink3(link_f, y, extra_data=extra_data)
+        dlink_df = self.gp_link.dtransf_df(f)
+        d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, extra_data=extra_data)
+        d2link_df2 = self.gp_link.d2transf_df2(f)
+        dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
+        d3link_df3 = self.gp_link.d3transf_df3(f)
+        return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
+
+    def dlogpdf_dtheta(self, f, y, extra_data=None):
+        link_f = self.gp_link.transf(f)
+        return self.dlogpdf_link_dtheta(link_f, y, extra_data=extra_data)
+
+    def dlogpdf_df_dtheta(self, f, y, extra_data=None):
+        link_f = self.gp_link.transf(f)
+        dlink_df = self.gp_link.dtransf_df(f)
+        dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
+        return chain_1(dlogpdf_dlink_dtheta, dlink_df)
+
+    def d2logpdf_df2_dtheta(self, f, y, extra_data=None):
+        link_f = self.gp_link.transf(f)
+        dlink_df = self.gp_link.dtransf_df(f)
+        d2link_df2 = self.gp_link.d2transf_df2(f) #FIXME: I THINK ITS THIS
+        d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, extra_data=extra_data)
+        dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
+        return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
+        #return chain_1(d2logpdf_dlink2_dtheta, d2link_df2)
+
+    def _laplace_gradients(self, f, y, extra_data=None):
+        #link_f = self.gp_link.transf(f)
+        #dlink_df = self.gp_link.dtransf_df(f)
+        #d2link_df2 = self.gp_link.d2transf_df2(f)
+
+        #dlogpdf_dtheta = self.dlogpdf_dtheta(link_f, y, extra_data=extra_data)
+        #dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
+        #d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, extra_data=extra_data)
+
+        ##now chain them all with dlink_df etc
+        #dlogpdf_df_dtheta = chain_1(dlogpdf_dlink_dtheta, dlink_df)
+        #d2logpdf_df2_dtheta = chain_1(d2logpdf_dlink2_dtheta, d2link_df2)
+
+        dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, extra_data=extra_data)
+        dlogpdf_df_dtheta = self.dlogpdf_df_dtheta(f, y, extra_data=extra_data)
+        d2logpdf_df2_dtheta = self.d2logpdf_df2_dtheta(f, y, extra_data=extra_data)
+
+        #Parameters are stacked vertically. Must be listed in same order as 'get_param_names'
+        # ensure we have gradients for every parameter we want to optimize
+        assert dlogpdf_dtheta.shape[1] == len(self._get_param_names())
+        assert dlogpdf_df_dtheta.shape[1] == len(self._get_param_names())
+        assert d2logpdf_df2_dtheta.shape[1] == len(self._get_param_names())
+        return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
+
     def predictive_values(self,mu,var):
         """
         Compute  mean, variance and conficence interval (percentiles 5 and 95) of the  prediction.
@@ -433,3 +517,5 @@ class NoiseDistribution(object):
         """
         pass
 
+
+
diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index dcd41fda..0e881a8d 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -40,27 +40,9 @@ class StudentT(NoiseDistribution):
     def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * self.sigma2
 
-    def _nlog_mass(self, link_f, y, extra_data=None):
-        NotImplementedError("Deprecated, now doing chain in likelihood.py for link function evaluation\
-                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
-                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
-                            its derivatives")
-
-    def _dnlog_mass_dgp(self, link_f, y, extra_data=None):
-        NotImplementedError("Deprecated, now doing chain in likelihood.py for link function evaluation\
-                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
-                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
-                            its derivatives")
-
-    def _d2nlog_mass_dgp2(self, link_f, y, extra_data=None):
-        NotImplementedError("Deprecated, now doing chain in likelihood.py for link function evaluation\
-                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
-                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
-                            its derivatives")
-
-    def logpdf(self, link_f, y, extra_data=None):
+    def logpdf_link(self, link_f, y, extra_data=None):
         """
-        Log Likelihood Function
+        Log Likelihood Function given link(f)
 
         .. math::
             \\ln p(y_{i}|f_{i}) = \\ln \\Gamma(\\frac{v+1}{2}) - \\ln \\Gamma(\\frac{v}{2})\\sqrt{v \\pi}\sigma - \\frac{v+1}{2}\\ln (1 + \\frac{1}{v}\\left(\\frac{y_{i} - f_{i}}{\\sigma}\\right)^2
@@ -151,7 +133,7 @@ class StudentT(NoiseDistribution):
                     )
         return d3lik_dlink3
 
-    def dlogpdf_dvar(self, link_f, y, extra_data=None):
+    def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
         """
         Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise)
 
@@ -169,7 +151,6 @@ class StudentT(NoiseDistribution):
         assert y.shape == link_f.shape
         e = y - link_f
         dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
-        #FIXME: Careful as this hasn't been chained with dlink_var, not sure if we want link functions on our parameters?! Shouldn't need them with constraints
         return np.sum(dlogpdf_dvar)
 
     def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
@@ -214,17 +195,17 @@ class StudentT(NoiseDistribution):
                            )
         return d2logpdf_dlink2_dvar
 
-    def _laplace_gradients(self, y, f, extra_data=None):
-        #must be listed in same order as 'get_param_names'
-        derivs = ([self.dlogpdf_dvar(f, y, extra_data=extra_data)],
-                  [self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)],
-                  [self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)]
-                 ) # lists as we might learn many parameters
-        # ensure we have gradients for every parameter we want to optimize
-        assert len(derivs[0]) == len(self._get_param_names())
-        assert len(derivs[1]) == len(self._get_param_names())
-        assert len(derivs[2]) == len(self._get_param_names())
-        return derivs
+    def dlogpdf_link_dtheta(self, f, y, extra_data=None):
+        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, extra_data=extra_data)
+        return np.asarray([[dlogpdf_dvar]])
+
+    def dlogpdf_dlink_dtheta(self, f, y, extra_data=None):
+        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, extra_data=extra_data)
+        return dlogpdf_dlink_dvar
+
+    def d2logpdf_dlink2_dtheta(self, f, y, extra_data=None):
+        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, extra_data=extra_data)
+        return d2logpdf_dlink2_dvar
 
     def _predictive_variance_analytical(self, mu, sigma, predictive_mean=None):
         """
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index 936241b1..dbdd34f3 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -80,7 +80,7 @@ class LaplaceTests(unittest.TestCase):
         self.gauss = GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N)
 
         #Make a bigger step as lower bound can be quite curved
-        self.step = 1e-4
+        self.step = 1e-3
 
     def tearDown(self):
         self.stu_t = None
@@ -97,7 +97,6 @@ class LaplaceTests(unittest.TestCase):
 
 
     """ dGauss_df's """
-    @unittest.skip("Not Implemented Yet")
     def test_gaussian_dlogpdf_df(self):
         #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
@@ -108,7 +107,6 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    @unittest.skip("Not Implemented Yet")
     def test_gaussian_d2logpdf_df2(self):
         #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
@@ -119,7 +117,6 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    @unittest.skip("Not Implemented Yet")
     def test_gaussian_d3logpdf_df3(self):
         #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
@@ -130,22 +127,20 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    @unittest.skip("Not Implemented Yet")
     def test_gaussian_dlogpdf_df_dvar(self):
         #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.gauss.dlogpdf_df, self.gauss.dlogpdf_df_dvar,
+                dparam_checkgrad(self.gauss.dlogpdf_df, self.gauss.dlogpdf_df_dtheta,
                     [self.var], args=(self.f, self.Y), constrain_positive=True,
                     randomize=False, verbose=True)
                 )
 
-    @unittest.skip("Not Implemented Yet")
     def test_gaussian_d2logpdf2_df2_dvar(self):
         #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.gauss.d2logpdf_df2, self.gauss.d2logpdf_df2_dvar,
+                dparam_checkgrad(self.gauss.d2logpdf_df2, self.gauss.d2logpdf_df2_dtheta,
                     [self.var], args=(self.f, self.Y), constrain_positive=True,
                     randomize=False, verbose=True)
                 )
@@ -182,7 +177,7 @@ class LaplaceTests(unittest.TestCase):
     def test_gaussian_dlogpdf_dvar(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.gauss.logpdf, self.gauss.dlogpdf_dvar,
+                dparam_checkgrad(self.gauss.logpdf, self.gauss.dlogpdf_dtheta,
                     [self.var], args=(self.f, self.Y), constrain_positive=True,
                     randomize=False, verbose=True)
                 )
@@ -190,7 +185,7 @@ class LaplaceTests(unittest.TestCase):
     def test_gaussian_dlogpdf_dlink_dvar(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.gauss.dlogpdf_dlink, self.gauss.dlogpdf_dlink_dvar,
+                dparam_checkgrad(self.gauss.dlogpdf_dlink, self.gauss.dlogpdf_dlink_dtheta,
                     [self.var], args=(self.f, self.Y), constrain_positive=True,
                     randomize=False, verbose=True)
                 )
@@ -198,7 +193,7 @@ class LaplaceTests(unittest.TestCase):
     def test_gaussian_d2logpdf2_dlink2_dvar(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.gauss.d2logpdf_dlink2, self.gauss.d2logpdf_dlink2_dvar,
+                dparam_checkgrad(self.gauss.d2logpdf_dlink2, self.gauss.d2logpdf_dlink2_dtheta,
                     [self.var], args=(self.f, self.Y), constrain_positive=True,
                     randomize=False, verbose=True)
                 )
@@ -228,7 +223,6 @@ class LaplaceTests(unittest.TestCase):
         self.assertTrue(grad.checkgrad())
 
     """ dStudentT_df's """
-    @unittest.skip("Not Implemented Yet")
     def test_studentt_dlogpdf_df(self):
         #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
@@ -239,7 +233,6 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    @unittest.skip("Not Implemented Yet")
     def test_studentt_d2logpdf_df2(self):
         #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
@@ -250,34 +243,31 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    @unittest.skip("Not Implemented Yet")
     def test_studentt_d3lik_d3f(self):
         #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
-        d2logpdf_df2 = functools.partial(self.stu_t.d2logpdf_d2f, y=self.Y)
-        d3logpdf_df3 = functools.partial(self.stu_t.d3logpdf_d3f, y=self.Y)
+        d2logpdf_df2 = functools.partial(self.stu_t.d2logpdf_df2, y=self.Y)
+        d3logpdf_df3 = functools.partial(self.stu_t.d3logpdf_df3, y=self.Y)
         grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, self.f.copy(), 'f')
         grad.randomize()
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    @unittest.skip("Not Implemented Yet")
     def test_studentt_dlogpdf_df_dvar(self):
         #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.stu_t.dlogpdf_df, self.stu_t.dlogpdf_df_dvar,
-                    [self.var], args=(self.Y.copy(), self.f.copy()),
+                dparam_checkgrad(self.stu_t.dlogpdf_df, self.stu_t.dlogpdf_df_dtheta,
+                    [self.var], args=(self.f.copy(), self.Y.copy()),
                     constrain_positive=True, randomize=True, verbose=True)
                 )
 
-    @unittest.skip("Not Implemented Yet")
     def test_studentt_d2logpdf_df2_dvar(self):
         #FIXME: Needs non-identity Link function
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.stu_t.d2logpdf_df2, self.stu_t.d2logpdf_df2_dvar,
-                    [self.var], args=(self.Y.copy(), self.f.copy()),
+                dparam_checkgrad(self.stu_t.d2logpdf_df2, self.stu_t.d2logpdf_df2_dtheta,
+                    [self.var], args=(self.f.copy(), self.Y.copy()),
                     constrain_positive=True, randomize=True, verbose=True)
                 )
 
@@ -312,24 +302,24 @@ class LaplaceTests(unittest.TestCase):
     def test_studentt_dlogpdf_dvar(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.stu_t.logpdf, self.stu_t.dlogpdf_dvar,
-                    [self.var], args=(self.Y.copy(), self.f.copy()),
+                dparam_checkgrad(self.stu_t.logpdf, self.stu_t.dlogpdf_dtheta,
+                    [self.var], args=(self.f.copy(), self.Y.copy()),
                     constrain_positive=True, randomize=True, verbose=True)
                 )
 
     def test_studentt_dlogpdf_dlink_dvar(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.stu_t.dlogpdf_dlink, self.stu_t.dlogpdf_dlink_dvar,
-                    [self.var], args=(self.Y.copy(), self.f.copy()),
+                dparam_checkgrad(self.stu_t.dlogpdf_dlink, self.stu_t.dlogpdf_dlink_dtheta,
+                    [self.var], args=(self.f.copy(), self.Y.copy()),
                     constrain_positive=True, randomize=True, verbose=True)
                 )
 
     def test_studentt_d2logpdf_dlink2_dvar(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.assertTrue(
-                dparam_checkgrad(self.stu_t.d2logpdf_dlink2, self.stu_t.d2logpdf_dlink2_dvar,
-                    [self.var], args=(self.Y.copy(), self.f.copy()),
+                dparam_checkgrad(self.stu_t.d2logpdf_dlink2, self.stu_t.d2logpdf_dlink2_dtheta,
+                    [self.var], args=(self.f.copy(), self.Y.copy()),
                     constrain_positive=True, randomize=True, verbose=True)
                 )
 
@@ -388,7 +378,9 @@ class LaplaceTests(unittest.TestCase):
         m.constrain_positive('t_noise')
         m.constrain_fixed('white', white_var)
         m['t_noise'] = 0.01
+        m.randomize()
         m.checkgrad(verbose=1)
+        print m
         self.assertTrue(m.checkgrad(step=self.step))
 
 if __name__ == "__main__":
diff --git a/GPy/util/misc.py b/GPy/util/misc.py
index 5866ecf9..885f9e83 100644
--- a/GPy/util/misc.py
+++ b/GPy/util/misc.py
@@ -4,6 +4,33 @@
 import numpy as np
 from scipy import weave
 
+def chain_1(df_dg, dg_dx):
+    """
+    Generic chaining function for first derivative
+
+    .. math::
+        \\frac{d(f . g)}{dx} = \\frac{df}{dg} \\frac{dg}{dx}
+    """
+    return df_dg * dg_dx
+
+def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
+    """
+    Generic chaining function for second derivative
+
+    .. math::
+        \\frac{d^{2}(f . g)}{dx^{2}} = \\frac{d^{2}f}{dg^{2}}(\\frac{dg}{dx})^{2} + \\frac{df}{dg}\\frac{d^{2}g}{dx^{2}}
+    """
+    return d2f_dg2*(dg_dx**2) + df_dg*d2g_dx2
+
+def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
+    """
+    Generic chaining function for third derivative
+
+    .. math::
+        \\frac{d^{3}(f . g)}{dx^{3}} = \\frac{d^{3}f}{dg^{3}}(\\frac{dg}{dx})^{3} + 3\\frac{d^{2}f}{dg^{2}}\\frac{dg}{dx}\\frac{d^{2}g}{dx^{2}} + \\frac{df}{dg}\\frac{d^{3}g}{dx^{3}}
+    """
+    return d3f_dg3*(dg_dx**3) + 3*d2f_dg2*dg_dx*d2g_dx2 + df_dg*d3g_dx3
+
 def opt_wrapper(m, **kwargs):
     """
     This function just wraps the optimization procedure of a GPy

From dc12fb43b73c641012b53ffcba80a1f4987ba9cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Fusi?= <nicolo.fusi@gmail.com>
Date: Tue, 15 Oct 2013 16:03:56 -0700
Subject: [PATCH 114/252] Added configuration file

this was done to solve the OpenMP problem on Windows/mac, but I think it
is useful in general. All unit tests pass except the sympy kern ones.
---
 GPy/examples/dimensionality_reduction.py |  2 +-
 GPy/gpy_config.cfg                       |  7 +++
 GPy/kern/parts/linear.py                 | 74 +++++++++++++++---------
 GPy/kern/parts/rbf.py                    | 49 ++++++++++++----
 GPy/kern/parts/rbf_inv.py                | 48 ++++++++++-----
 GPy/util/config.py                       | 17 ++++++
 GPy/util/misc.py                         | 50 +++++++++++-----
 7 files changed, 179 insertions(+), 68 deletions(-)
 create mode 100644 GPy/gpy_config.cfg
 create mode 100644 GPy/util/config.py

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 298607b6..bde249c8 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -26,7 +26,7 @@ def BGPLVM(seed=default_seed):
     lik = Gaussian(Y, normalize=True)
 
     k = GPy.kern.rbf_inv(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
-    # k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
+    # k = GPy.kern.linear(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
     # k = GPy.kern.rbf(Q, ARD = False)  + GPy.kern.white(Q, 0.00001)
 
     m = GPy.models.BayesianGPLVM(lik, Q, kernel=k, num_inducing=num_inducing)
diff --git a/GPy/gpy_config.cfg b/GPy/gpy_config.cfg
new file mode 100644
index 00000000..8683f96c
--- /dev/null
+++ b/GPy/gpy_config.cfg
@@ -0,0 +1,7 @@
+# This is the configuration file for GPy
+
+[parallel]
+# Enable openmp support. This speeds up some computations, depending on the number
+# of cores available. Setting up a compiler with openmp support can be difficult on 
+# some platforms, hence this option.
+openmp=True
diff --git a/GPy/kern/parts/linear.py b/GPy/kern/parts/linear.py
index ffcbcf5e..ab96bb31 100644
--- a/GPy/kern/parts/linear.py
+++ b/GPy/kern/parts/linear.py
@@ -7,6 +7,7 @@ import numpy as np
 from ...util.linalg import tdot
 from ...util.misc import fast_array_equal
 from scipy import weave
+from ...util.config import *
 
 class Linear(Kernpart):
     """
@@ -51,6 +52,26 @@ class Linear(Kernpart):
         self._Z, self._mu, self._S = np.empty(shape=(3, 1))
         self._X, self._X2, self._params = np.empty(shape=(3, 1))
 
+        # a set of optional args to pass to weave
+        weave_options_openmp = {'headers'           : ['<omp.h>'],
+                                'extra_compile_args': ['-fopenmp -O3'],
+                                'extra_link_args'   : ['-lgomp'],
+                                'libraries': ['gomp']}
+        weave_options_noopenmp = {'extra_compile_args': ['-O3']}
+
+
+        if config.getboolean('parallel', 'openmp'):
+            self.weave_options = weave_options_openmp
+            self.weave_support_code =  """
+            #include <omp.h>
+            #include <math.h>
+            """
+        else:
+            self.weave_options = weave_options_noopenmp
+            self.weave_support_code = """
+            #include <math.h>
+            """
+
     def _get_params(self):
         return self.variances
 
@@ -190,11 +211,17 @@ class Linear(Kernpart):
         #target_mu_dummy += (dL_dpsi2[:, :, :, None] * muAZZA).sum(1).sum(1)
         #target_S_dummy += (dL_dpsi2[:, :, :, None] * self.ZA[None, :, None, :] * self.ZA[None, None, :, :]).sum(1).sum(1)
 
+
+        if config.getboolean('parallel', 'openmp'):
+            pragma_string = "#pragma omp parallel for private(m,mm,q,qq,factor,tmp)"
+        else:
+            pragma_string = ''
+
         #Using weave, we can exploiut the symmetry of this problem:
         code = """
         int n, m, mm,q,qq;
         double factor,tmp;
-        #pragma omp parallel for private(m,mm,q,qq,factor,tmp)
+        %s
         for(n=0;n<N;n++){
           for(m=0;m<num_inducing;m++){
             for(mm=0;mm<=m;mm++){
@@ -218,19 +245,13 @@ class Linear(Kernpart):
             }
           }
         }
-        """
-        support_code = """
-        #include <omp.h>
-        #include <math.h>
-        """
-        weave_options = {'headers'           : ['<omp.h>'],
-                         'extra_compile_args': ['-fopenmp -O3'],  #-march=native'],
-                         'extra_link_args'   : ['-lgomp']}
+        """ % pragma_string
 
-        N,num_inducing,input_dim = mu.shape[0],Z.shape[0],mu.shape[1]
-        weave.inline(code, support_code=support_code, libraries=['gomp'],
-                     arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'],
-                     type_converters=weave.converters.blitz,**weave_options)
+
+        N,num_inducing,input_dim = int(mu.shape[0]),int(Z.shape[0]),int(mu.shape[1])
+        weave.inline(code, support_code=self.weave_support_code,
+                    arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'],
+                    type_converters=weave.converters.blitz,**self.weave_options)
 
 
     def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
@@ -240,9 +261,15 @@ class Linear(Kernpart):
         #dummy_target += psi2_dZ.sum(0).sum(0)
 
         AZA = self.variances*self.ZAinner
+
+        if config.getboolean('parallel', 'openmp'):
+            pragma_string = '#pragma omp parallel for private(n,mm,q)'
+        else:
+            pragma_string = ''
+
         code="""
         int n,m,mm,q;
-        #pragma omp parallel for private(n,mm,q)
+        %s
         for(m=0;m<num_inducing;m++){
           for(q=0;q<input_dim;q++){
             for(mm=0;mm<num_inducing;mm++){
@@ -252,22 +279,13 @@ class Linear(Kernpart):
             }
           }
         }
-        """
-        support_code = """
-        #include <omp.h>
-        #include <math.h>
-        """
-        weave_options = {'headers'           : ['<omp.h>'],
-                         'extra_compile_args': ['-fopenmp -O3'],  #-march=native'],
-                         'extra_link_args'   : ['-lgomp']}
+        """ % pragma_string
 
-        N,num_inducing,input_dim = mu.shape[0],Z.shape[0],mu.shape[1]
-        weave.inline(code, support_code=support_code, libraries=['gomp'],
+
+        N,num_inducing,input_dim = int(mu.shape[0]),int(Z.shape[0]),int(mu.shape[1])
+        weave.inline(code, support_code=self.weave_support_code, 
                      arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'],
-                     type_converters=weave.converters.blitz,**weave_options)
-
-
-
+                     type_converters=weave.converters.blitz,**self.weave_options)
 
 
     #---------------------------------------#
diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py
index 855e2b71..585d687f 100644
--- a/GPy/kern/parts/rbf.py
+++ b/GPy/kern/parts/rbf.py
@@ -7,6 +7,7 @@ import numpy as np
 from scipy import weave
 from ...util.linalg import tdot
 from ...util.misc import fast_array_equal
+from ...util.config import *
 
 class RBF(Kernpart):
     """
@@ -57,12 +58,27 @@ class RBF(Kernpart):
         self._X, self._X2, self._params = np.empty(shape=(3, 1))
 
         # a set of optional args to pass to weave
-        self.weave_options = {'headers'           : ['<omp.h>'],
-                         'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
-                         'extra_link_args'   : ['-lgomp']}
+        weave_options_openmp = {'headers'           : ['<omp.h>'],
+                                'extra_compile_args': ['-fopenmp -O3'],
+                                'extra_link_args'   : ['-lgomp'],
+                                'libraries': ['gomp']}
+        weave_options_noopenmp = {'extra_compile_args': ['-O3']}
 
 
 
+        if config.getboolean('parallel', 'openmp'):
+            self.weave_options = weave_options_openmp
+            self.weave_support_code =  """
+            #include <omp.h>
+            #include <math.h>
+            """
+        else:
+            self.weave_options = weave_options_noopenmp
+            self.weave_support_code = """
+            #include <math.h>
+            """
+
+
     def _get_params(self):
         return np.hstack((self.variance, self.lengthscale))
 
@@ -110,7 +126,7 @@ class RBF(Kernpart):
                   target(q+1) += var_len3(q)*tmp;
                 }
                 """
-                num_data, num_inducing, input_dim = X.shape[0], X.shape[0], self.input_dim
+                num_data, num_inducing, input_dim = int(X.shape[0]), int(X.shape[0]), int(self.input_dim)
                 weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
             else:
                 code = """
@@ -126,7 +142,7 @@ class RBF(Kernpart):
                   target(q+1) += var_len3(q)*tmp;
                 }
                 """
-                num_data, num_inducing, input_dim = X.shape[0], X2.shape[0], self.input_dim
+                num_data, num_inducing, input_dim = int(X.shape[0]), int(X2.shape[0]), int(self.input_dim)
                 # [np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.input_dim)]
                 weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
         else:
@@ -287,10 +303,16 @@ class RBF(Kernpart):
             lengthscale2 = self.lengthscale2
         else:
             lengthscale2 = np.ones(input_dim) * self.lengthscale2
+
+        if config.getboolean('parallel', 'openmp'):
+            pragma_string = '#pragma omp parallel for private(tmp)'
+        else:
+            pragma_string = ''
+
         code = """
         double tmp;
 
-        #pragma omp parallel for private(tmp)
+        %s
         for (int n=0; n<N; n++){
             for (int m=0; m<num_inducing; m++){
                for (int mm=0; mm<(m+1); mm++){
@@ -320,13 +342,20 @@ class RBF(Kernpart):
             }
         }
 
-        """
+        """ % pragma_string
+
+        if config.getboolean('parallel', 'openmp'):
+            pragma_string = '#include <omp.h>'
+        else:
+            pragma_string = ''
 
         support_code = """
-        #include <omp.h>
+        %s
         #include <math.h>
-        """
-        weave.inline(code, support_code=support_code, libraries=['gomp'],
+        """ % pragma_string
+
+        N, num_inducing, input_dim = int(N), int(num_inducing), int(input_dim)
+        weave.inline(code, support_code=support_code,
                      arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'],
                      type_converters=weave.converters.blitz, **self.weave_options)
 
diff --git a/GPy/kern/parts/rbf_inv.py b/GPy/kern/parts/rbf_inv.py
index 0433e96c..1cc05aaa 100644
--- a/GPy/kern/parts/rbf_inv.py
+++ b/GPy/kern/parts/rbf_inv.py
@@ -7,6 +7,8 @@ import numpy as np
 import hashlib
 from scipy import weave
 from ...util.linalg import tdot
+from ...util.config import *
+
 
 class RBFInv(RBF):
     """
@@ -58,11 +60,23 @@ class RBFInv(RBF):
         self._X, self._X2, self._params = np.empty(shape=(3, 1))
 
         # a set of optional args to pass to weave
-        self.weave_options = {'headers'           : ['<omp.h>'],
-                         'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
-                         'extra_link_args'   : ['-lgomp']}
-
+        weave_options_openmp = {'headers'           : ['<omp.h>'],
+                                'extra_compile_args': ['-fopenmp -O3'],
+                                'extra_link_args'   : ['-lgomp'],
+                                'libraries': ['gomp']}
+        weave_options_noopenmp = {'extra_compile_args': ['-O3']}
 
+        if config.getboolean('parallel', 'openmp'):
+            self.weave_options = weave_options_openmp
+            self.weave_support_code =  """
+            #include <omp.h>
+            #include <math.h>
+            """
+        else:
+            self.weave_options = weave_options_noopenmp
+            self.weave_support_code = """
+            #include <math.h>
+            """
 
     def _get_params(self):
         return np.hstack((self.variance, self.inv_lengthscale))
@@ -109,7 +123,7 @@ class RBFInv(RBF):
                   target(q+1) += var_len3(q)*tmp*(-len2(q));
                 }
                 """
-                num_data, num_inducing, input_dim = X.shape[0], X.shape[0], self.input_dim
+                num_data, num_inducing, input_dim = int(X.shape[0]), int(X.shape[0]), int(self.input_dim)
                 weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3', 'len2'], type_converters=weave.converters.blitz, **self.weave_options)
             else:
                 code = """
@@ -125,7 +139,7 @@ class RBFInv(RBF):
                   target(q+1) += var_len3(q)*tmp*(-len2(q));
                 }
                 """
-                num_data, num_inducing, input_dim = X.shape[0], X2.shape[0], self.input_dim
+                num_data, num_inducing, input_dim = int(X.shape[0]), int(X2.shape[0]), int(self.input_dim)
                 # [np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.input_dim)]
                 weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3', 'len2'], type_converters=weave.converters.blitz, **self.weave_options)
         else:
@@ -133,7 +147,7 @@ class RBFInv(RBF):
 
     def dK_dX(self, dL_dK, X, X2, target):
         self._K_computations(X, X2)
-        if X2 is None:            
+        if X2 is None:
             _K_dist = 2*(X[:, None, :] - X[None, :, :])
         else:
             _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
@@ -263,8 +277,8 @@ class RBFInv(RBF):
             self._Z, self._mu, self._S = Z, mu, S
 
     def weave_psi2(self, mu, Zhat):
-        N, input_dim = mu.shape
-        num_inducing = Zhat.shape[0]
+        N, input_dim = int(mu.shape[0]), int(mu.shape[1])
+        num_inducing = int(Zhat.shape[0])
 
         mudist = np.empty((N, num_inducing, num_inducing, input_dim))
         mudist_sq = np.empty((N, num_inducing, num_inducing, input_dim))
@@ -279,10 +293,16 @@ class RBFInv(RBF):
             inv_lengthscale2 = self.inv_lengthscale2
         else:
             inv_lengthscale2 = np.ones(input_dim) * self.inv_lengthscale2
+
+        if config.getboolean('parallel', 'openmp'):
+            pragma_string = '#pragma omp parallel for private(tmp)'
+        else:
+            pragma_string = ''
+
         code = """
         double tmp;
 
-        #pragma omp parallel for private(tmp)
+        %s
         for (int n=0; n<N; n++){
             for (int m=0; m<num_inducing; m++){
                for (int mm=0; mm<(m+1); mm++){
@@ -312,13 +332,9 @@ class RBFInv(RBF):
             }
         }
 
-        """
+        """ % pragma_string
 
-        support_code = """
-        #include <omp.h>
-        #include <math.h>
-        """
-        weave.inline(code, support_code=support_code, libraries=['gomp'],
+        weave.inline(code, support_code=self.weave_support_code,
                      arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'inv_lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'],
                      type_converters=weave.converters.blitz, **self.weave_options)
 
diff --git a/GPy/util/config.py b/GPy/util/config.py
new file mode 100644
index 00000000..d2ed7543
--- /dev/null
+++ b/GPy/util/config.py
@@ -0,0 +1,17 @@
+#
+# This loads the configuration
+#
+import ConfigParser
+import os
+config = ConfigParser.ConfigParser()
+
+user_file = os.path.join(os.getenv('HOME'),'.gpy_config.cfg')
+default_file = os.path.join('..','gpy_config.cfg')
+
+# 1. check if the user has a ~/.gpy_config.cfg
+if os.path.isfile(user_file):
+    config.read(user_file)
+else:
+    # 2. if not, use the default one
+    path = os.path.dirname(__file__)
+    config.read(os.path.join(path,default_file))
diff --git a/GPy/util/misc.py b/GPy/util/misc.py
index 5866ecf9..d3f23b75 100644
--- a/GPy/util/misc.py
+++ b/GPy/util/misc.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 from scipy import weave
+from config import *
 
 def opt_wrapper(m, **kwargs):
     """
@@ -57,11 +58,18 @@ def kmm_init(X, m = 10):
     return X[inducing]
 
 def fast_array_equal(A, B):
+
+
+    if config.getboolean('parallel', 'openmp'):
+        pragma_string = '#pragma omp parallel for private(i, j)'
+    else:
+        pragma_string = ''
+
     code2="""
     int i, j;
     return_val = 1;
 
-    // #pragma omp parallel for private(i, j)
+    %s
     for(i=0;i<N;i++){
        for(j=0;j<D;j++){
           if(A(i, j) != B(i, j)){
@@ -70,13 +78,18 @@ def fast_array_equal(A, B):
           }
        }
     }
-    """
+    """ % pragma_string
+
+    if config.getboolean('parallel', 'openmp'):
+        pragma_string = '#pragma omp parallel for private(i, j, z)'
+    else:
+        pragma_string = ''
 
     code3="""
     int i, j, z;
     return_val = 1;
 
-    // #pragma omp parallel for private(i, j, z)
+    %s
     for(i=0;i<N;i++){
        for(j=0;j<D;j++){
          for(z=0;z<Q;z++){
@@ -87,20 +100,33 @@ def fast_array_equal(A, B):
           }
        }
     }
-    """
+    """ % pragma_string
+
+    if config.getboolean('parallel', 'openmp'):
+        pragma_string = '#include <omp.h>'
+    else:
+        pragma_string = ''
 
     support_code = """
-    // #include <omp.h>
+    %s
     #include <math.h>
-    """
+    """ % pragma_string
 
-    weave_options = {'headers'           : ['<omp.h>'],
-                     'extra_compile_args': ['-fopenmp -O3'],
-                     'extra_link_args'   : ['-lgomp']}
 
+    weave_options_openmp = {'headers'           : ['<omp.h>'],
+                            'extra_compile_args': ['-fopenmp -O3'],
+                            'extra_link_args'   : ['-lgomp'],
+                            'libraries': ['gomp']}
+    weave_options_noopenmp = {'extra_compile_args': ['-O3']}
+
+    if config.getboolean('parallel', 'openmp'):
+        weave_options = weave_options_openmp
+    else:
+        weave_options = weave_options_noopenmp
 
     value = False
 
+
     if (A == None) and (B == None):
         return True
     elif ((A == None) and (B != None)) or ((A != None) and (B == None)):
@@ -110,14 +136,12 @@ def fast_array_equal(A, B):
             N, D = [int(i) for i in A.shape]
             value = weave.inline(code2, support_code=support_code,
                                  arg_names=['A', 'B', 'N', 'D'],
-                                 type_converters=weave.converters.blitz)
-            # libraries=['gomp'], **weave_options)
+                                 type_converters=weave.converters.blitz, **weave_options)
         elif A.ndim == 3:
             N, D, Q = [int(i) for i in A.shape]
             value = weave.inline(code3, support_code=support_code,
                                  arg_names=['A', 'B', 'N', 'D', 'Q'],
-                                 type_converters=weave.converters.blitz)
-            #libraries=['gomp'], **weave_options)
+                                 type_converters=weave.converters.blitz, **weave_options)
         else:
             value = np.array_equal(A,B)
 

From 6e28fdf4fd83aa511fe9751ccd14e317ae83c117 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 16 Oct 2013 15:35:14 +0100
Subject: [PATCH 115/252] Fixed some bugs, added third derivative for log
 transformation, and did some doccing

---
 .../noise_models/gaussian_noise.py            |  17 ++-
 .../noise_models/gp_transformations.py        |   7 +
 .../noise_models/noise_distributions.py       | 122 ++++++++++++++++--
 GPy/testing/laplace_tests.py                  |   7 +-
 doc/GPy.testing.rst                           |   8 ++
 doc/GPy.util.rst                              |  16 +++
 6 files changed, 155 insertions(+), 22 deletions(-)

diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index 8bce30b7..5811f916 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -68,7 +68,7 @@ class Gaussian(NoiseDistribution):
     def _predictive_variance_analytical(self,mu,sigma,predictive_mean=None):
         return 1./(1./self.variance + 1./sigma**2)
 
-    def _mass(self, link_f, y):
+    def pdf_link(self, link_f, y, extra_data=None):
         #FIXME: Careful now passing link_f in not gp (f)!
         #return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) )
         #Assumes no covariance, exp, sum, log for numerical stability
@@ -76,21 +76,26 @@ class Gaussian(NoiseDistribution):
         #return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
         return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
 
+    def _mass(self, link_f, y, extra_data=None):
+        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
+                            Please negate your function and use pdf in noise_model.py, if implementing a likelihood\
+                            rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
+                            its derivatives")
     def _nlog_mass(self, link_f, y, extra_data=None):
-        NotImplementedError("Deprecated, now doing chain in likelihood.py for link function evaluation\
+        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
                             Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
                             rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
                             its derivatives")
 
     def _dnlog_mass_dgp(self, link_f, y, extra_data=None):
-        NotImplementedError("Deprecated, now doing chain in likelihood.py for link function evaluation\
-                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
+        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
+                            Please negate your function and use dlogpdf_df in noise_model.py, if implementing a likelihood\
                             rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
                             its derivatives")
 
     def _d2nlog_mass_dgp2(self, link_f, y, extra_data=None):
-        NotImplementedError("Deprecated, now doing chain in likelihood.py for link function evaluation\
-                            Please negate your function and use logpdf in noise_model.py, if implementing a likelihood\
+        NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
+                            Please negate your function and use d2logpdf_df2 in noise_model.py, if implementing a likelihood\
                             rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
                             its derivatives")
 
diff --git a/GPy/likelihoods/noise_models/gp_transformations.py b/GPy/likelihoods/noise_models/gp_transformations.py
index c6e316e8..b9db75ce 100644
--- a/GPy/likelihoods/noise_models/gp_transformations.py
+++ b/GPy/likelihoods/noise_models/gp_transformations.py
@@ -80,6 +80,10 @@ class Probit(GPTransformation):
     def d2transf_df2(self,f):
         return -f * std_norm_pdf(f)
 
+    def d3transf_df3(self,f):
+        f2 = f**2
+        return -(1/(np.sqrt(2*np.pi)))*np.exp(-0.5*(f2))*(f2-1)
+
 class Log(GPTransformation):
     """
     .. math::
@@ -96,6 +100,9 @@ class Log(GPTransformation):
     def d2transf_df2(self,f):
         return np.exp(f)
 
+    def d3transf_df3(self,f):
+        return np.exp(f)
+
 class Log_ex_1(GPTransformation):
     """
     .. math::
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 6b36f42b..0516a735 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -399,16 +399,82 @@ class NoiseDistribution(object):
         """
         return sp.optimize.fmin_ncg(self._nlog_joint_predictive_scaled,x0=(mu,self.gp_link.transf(mu)),fprime=self._gradient_nlog_joint_predictive,fhess=self._hessian_nlog_joint_predictive,args=(mu,sigma),disp=False)
 
-    def logpdf(self, f, y, extra_data=None):
+    def pdf_link(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def logpdf_link(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def dlogpdf_link_dtheta(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def dlogpdf_dlink_dtheta(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+    def d2logpdf_dlink2_dtheta(self, link_f, y, extra_data=None):
+        raise NotImplementedError
+
+
+    def pdf(self, f, y, extra_data=None):
         """
-        Evaluates the link function link(f) then computes the log likelihood using it
+        Evaluates the link function link(f) then computes the likelihood (pdf) using it
+
+        .. math:
+            p(y|\\lambda(f))
+
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: likelihood evaluated for this point
+        :rtype: float
         """
         link_f = self.gp_link.transf(f)
-        return self.logpdf_link(f, y, extra_data=extra_data)
+        return self.pdf_link(link_f, y, extra_data=extra_data)
+
+    def logpdf(self, f, y, extra_data=None):
+        """
+        Evaluates the link function link(f) then computes the log likelihood (log pdf) using it
+
+        .. math:
+            \\log p(y|\\lambda(f))
+
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: log likelihood evaluated for this point
+        :rtype: float
+        """
+        link_f = self.gp_link.transf(f)
+        return self.logpdf_link(link_f, y, extra_data=extra_data)
 
     def dlogpdf_df(self, f, y, extra_data=None):
         """
-        TODO: Doc strings
+        Evaluates the link function link(f) then computes the derivative of log likelihood using it
+        Uses the Faa di Bruno's formula for the chain rule
+
+        .. math::
+            \\frac{d\\log p(y|\\lambda(f))}{df} = \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d\\lambda(f)}{df}
+
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: derivative of log likelihood evaluated for this point
+        :rtype: float
         """
         link_f = self.gp_link.transf(f)
         dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
@@ -417,7 +483,19 @@ class NoiseDistribution(object):
 
     def d2logpdf_df2(self, f, y, extra_data=None):
         """
-        TODO: Doc strings
+        Evaluates the link function link(f) then computes the second derivative of log likelihood using it
+        Uses the Faa di Bruno's formula for the chain rule
+
+        .. math::
+            \\frac{d^{2}\\log p(y|\\lambda(f))}{df^{2}} = \\frac{d^{2}\\log p(y|\\lambda(f))}{d^{2}\\lambda(f)}\\left(\\frac{d\\lambda(f)}{df}\\right)^{2} + \\frac{d\\log p(y|\\lambda(f))}{d\\lambda(f)}\\frac{d^{2}\\lambda(f)}{df^{2}}
+
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: second derivative of log likelihood evaluated for this point
+        :rtype: float
         """
         link_f = self.gp_link.transf(f)
         d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, extra_data=extra_data)
@@ -428,7 +506,19 @@ class NoiseDistribution(object):
 
     def d3logpdf_df3(self, f, y, extra_data=None):
         """
-        TODO: Doc strings
+        Evaluates the link function link(f) then computes the third derivative of log likelihood using it
+        Uses the Faa di Bruno's formula for the chain rule
+
+        .. math::
+            \\frac{d^{3}\\log p(y|\\lambda(f))}{df^{3}} = \\frac{d^{3}\\log p(y|\\lambda(f)}{d\\lambda(f)^{3}}\\left(\\frac{d\\lambda(f)}{df}\\right)^{3} + 3\\frac{d^{2}\\log p(y|\\lambda(f)}{d\\lambda(f)^{2}}\\frac{d\\lambda(f)}{df}\\frac{d^{2}\\lambda(f)}{df^{2}} + \\frac{d\\log p(y|\\lambda(f)}{d\\lambda(f)}\\frac{d^{3}\\lambda(f)}{df^{3}}
+
+        :param f: latent variables f
+        :type f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: third derivative of log likelihood evaluated for this point
+        :rtype: float
         """
         link_f = self.gp_link.transf(f)
         d3logpdf_dlink3 = self.d3logpdf_dlink3(link_f, y, extra_data=extra_data)
@@ -440,23 +530,33 @@ class NoiseDistribution(object):
         return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
 
     def dlogpdf_dtheta(self, f, y, extra_data=None):
+        """
+        TODO: Doc strings
+        """
         link_f = self.gp_link.transf(f)
         return self.dlogpdf_link_dtheta(link_f, y, extra_data=extra_data)
 
     def dlogpdf_df_dtheta(self, f, y, extra_data=None):
+        """
+        TODO: Doc strings
+        """
         link_f = self.gp_link.transf(f)
         dlink_df = self.gp_link.dtransf_df(f)
         dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
         return chain_1(dlogpdf_dlink_dtheta, dlink_df)
 
     def d2logpdf_df2_dtheta(self, f, y, extra_data=None):
+        """
+        TODO: Doc strings
+        """
         link_f = self.gp_link.transf(f)
         dlink_df = self.gp_link.dtransf_df(f)
-        d2link_df2 = self.gp_link.d2transf_df2(f) #FIXME: I THINK ITS THIS
+        d2link_df2 = self.gp_link.d2transf_df2(f)
         d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, extra_data=extra_data)
         dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
-        return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
+        #FIXME: Why isn't this chain_1?
         #return chain_1(d2logpdf_dlink2_dtheta, d2link_df2)
+        return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
 
     def _laplace_gradients(self, f, y, extra_data=None):
         #link_f = self.gp_link.transf(f)
@@ -508,14 +608,10 @@ class NoiseDistribution(object):
         q3 = np.vstack(q3)
         return pred_mean, pred_var, q1, q3
 
-
     def samples(self, gp):
         """
         Returns a set of samples of observations based on a given value of the latent variable.
 
         :param gp: latent variable
         """
-        pass
-
-
-
+        raise NotImplementedError
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index dbdd34f3..1f20d9ae 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -4,6 +4,7 @@ import GPy
 from GPy.models import GradientChecker
 import functools
 import inspect
+from GPy.likelihoods.noise_models import gp_transformations
 
 def dparam_partial(inst_func, *args):
     """
@@ -77,7 +78,7 @@ class LaplaceTests(unittest.TestCase):
 
         self.var = np.random.rand(1)
         self.stu_t = GPy.likelihoods.student_t(deg_free=5, sigma2=self.var)
-        self.gauss = GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N)
+        self.gauss = GPy.likelihoods.gaussian(gp_transformations.Log(), variance=self.var, D=self.D, N=self.N)
 
         #Make a bigger step as lower bound can be quite curved
         self.step = 1e-3
@@ -92,7 +93,7 @@ class LaplaceTests(unittest.TestCase):
     def test_mass_logpdf(self):
         print "\n{}".format(inspect.stack()[0][3])
         np.testing.assert_almost_equal(
-                               np.log(self.gauss._mass(self.f.copy(), self.Y.copy())),
+                               np.log(self.gauss.pdf(self.f.copy(), self.Y.copy())),
                                self.gauss.logpdf(self.f.copy(), self.Y.copy()))
 
 
@@ -149,7 +150,7 @@ class LaplaceTests(unittest.TestCase):
     """ dGauss_dlink's """
     def test_gaussian_dlogpdf_dlink(self):
         print "\n{}".format(inspect.stack()[0][3])
-        logpdf = functools.partial(self.gauss.logpdf, y=self.Y)
+        logpdf = functools.partial(self.gauss.logpdf_link, y=self.Y)
         dlogpdf_dlink = functools.partial(self.gauss.dlogpdf_dlink, y=self.Y)
         grad = GradientChecker(logpdf, dlogpdf_dlink, self.f.copy(), 'g')
         grad.randomize()
diff --git a/doc/GPy.testing.rst b/doc/GPy.testing.rst
index ef25ba60..078a41a2 100644
--- a/doc/GPy.testing.rst
+++ b/doc/GPy.testing.rst
@@ -76,6 +76,14 @@ GPy.testing.mrd_tests module
     :undoc-members:
     :show-inheritance:
 
+GPy.testing.noise_distributions module
+--------------------------------------
+
+.. automodule:: GPy.testing.noise_distributions
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.testing.prior_tests module
 ------------------------------
 
diff --git a/doc/GPy.util.rst b/doc/GPy.util.rst
index 5aca7cf9..f2aaed7f 100644
--- a/doc/GPy.util.rst
+++ b/doc/GPy.util.rst
@@ -27,6 +27,14 @@ GPy.util.classification module
     :undoc-members:
     :show-inheritance:
 
+GPy.util.config module
+----------------------
+
+.. automodule:: GPy.util.config
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.datasets module
 ------------------------
 
@@ -91,6 +99,14 @@ GPy.util.multioutput module
     :undoc-members:
     :show-inheritance:
 
+GPy.util.netpbmfile module
+--------------------------
+
+.. automodule:: GPy.util.netpbmfile
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.util.plot module
 --------------------
 

From 208b6862bd23dafee21ec8d649dc2c27fefdbe87 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 16 Oct 2013 18:42:36 +0100
Subject: [PATCH 116/252] Tidying up laplace_tests.py

---
 .../noise_models/noise_distributions.py       |  11 +-
 GPy/testing/laplace_tests.py                  | 569 +++++++++---------
 2 files changed, 305 insertions(+), 275 deletions(-)

diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 0516a735..5b92e2b5 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -415,7 +415,10 @@ class NoiseDistribution(object):
         raise NotImplementedError
 
     def dlogpdf_link_dtheta(self, link_f, y, extra_data=None):
-        raise NotImplementedError
+        if len(self._get_params()) == 0:
+            pass
+        else:
+            raise NotImplementedError
 
     def dlogpdf_dlink_dtheta(self, link_f, y, extra_data=None):
         raise NotImplementedError
@@ -474,7 +477,7 @@ class NoiseDistribution(object):
         :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
         :returns: derivative of log likelihood evaluated for this point
-        :rtype: float
+        :rtype: 1xN array
         """
         link_f = self.gp_link.transf(f)
         dlogpdf_dlink = self.dlogpdf_dlink(link_f, y, extra_data=extra_data)
@@ -494,8 +497,8 @@ class NoiseDistribution(object):
         :param y: data
         :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: second derivative of log likelihood evaluated for this point
-        :rtype: float
+        :returns: second derivative of log likelihood evaluated for this point (diagonal only)
+        :rtype: 1xN array
         """
         link_f = self.gp_link.transf(f)
         d2logpdf_dlink2 = self.d2logpdf_dlink2(link_f, y, extra_data=extra_data)
diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/laplace_tests.py
index 1f20d9ae..9f430741 100644
--- a/GPy/testing/laplace_tests.py
+++ b/GPy/testing/laplace_tests.py
@@ -63,7 +63,305 @@ def dparam_checkgrad(func, dfunc, params, args, constrain_positive=True, randomi
     return gradchecking
 
 
+from nose.tools import with_setup
+class TestNoiseModels(object):
+    """
+    Generic model checker
+    """
+    def setUp(self):
+        self.N = 5
+        self.D = 3
+        self.X = np.random.rand(self.N, self.D)*10
+
+        self.real_std = 0.1
+        noise = np.random.randn(*self.X[:, 0].shape)*self.real_std
+        self.Y = (np.sin(self.X[:, 0]*2*np.pi) + noise)[:, None]
+        self.f = np.random.rand(self.N, 1)
+
+        self.var = 0.2
+
+        self.var = np.random.rand(1)
+
+        #Make a bigger step as lower bound can be quite curved
+        self.step = 1e-3
+
+    def tearDown(self):
+        self.Y = None
+        self.f = None
+        self.X = None
+
+    def test_noise_models(self):
+        self.setUp()
+        """
+        Dictionary where we nest models we would like to check
+            Name: {
+                "model": model_instance,
+                "grad_params": {
+                    "names": [names_of_params_we_want, to_grad_check],
+                    "vals": [values_of_params, to_start_at],
+                    "constrain_positive": [boolean_values, of_whether_to_constrain]
+                    },
+                "laplace": boolean_of_whether_model_should_work_for_laplace
+                }
+        """
+        noise_models = {"Student_t_default": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constrain_positive": [True]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_small_var": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [0.01],
+                                "constrain_positive": [True]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_approx_gauss": {
+                            "model": GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constrain_positive": [True]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_log": {
+                            "model": GPy.likelihoods.student_t(gp_link=gp_transformations.Log(), deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constrain_positive": [True]
+                                },
+                            "laplace": True
+                            },
+                        "Gaussian_default": {
+                            "model": GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constrain_positive": [True]
+                                },
+                            "laplace": True
+                            },
+                        "Gaussian_log": {
+                            "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log(), variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constrain_positive": [True]
+                                },
+                            "laplace": True
+                            }
+                        }
+
+        for name, attributes in noise_models.iteritems():
+            model = attributes["model"]
+            params = attributes["grad_params"]
+            param_vals = params["vals"]
+            param_names= params["names"]
+            constrain_positive = params["constrain_positive"]
+            laplace = attributes["laplace"]
+
+            if len(param_vals) > 1:
+                raise NotImplementedError("Cannot support multiple params in likelihood yet!")
+
+            #Required by all
+            #Normal derivatives
+            yield self.t_logpdf, model
+            yield self.t_dlogpdf_df, model
+            yield self.t_d2logpdf_df2, model
+            #Link derivatives
+            yield self.t_dlogpdf_dlink, model
+            yield self.t_d2logpdf_dlink2, model
+            yield self.t_d3logpdf_dlink3, model
+            if laplace:
+                #Laplace only derivatives
+                yield self.t_d3logpdf_df3, model
+                #Params
+                yield self.t_dlogpdf_dparams, model, param_vals
+                yield self.t_dlogpdf_df_dparams, model, param_vals
+                yield self.t_d2logpdf2_df2_dparams, model, param_vals
+                #Link params
+                yield self.t_dlogpdf_link_dparams, model, param_vals
+                yield self.t_dlogpdf_dlink_dparams, model, param_vals
+                yield self.t_d2logpdf2_dlink2_dparams, model, param_vals
+
+                #laplace likelihood gradcheck
+                yield self.t_laplace_fit_rbf_white, model, param_vals, param_names, constrain_positive
+
+        self.tearDown()
+
+    #############
+    # dpdf_df's #
+    #############
+    @with_setup(setUp, tearDown)
+    def t_logpdf(self, model):
+        print "\n{}".format(inspect.stack()[0][3])
+        np.testing.assert_almost_equal(
+                               np.log(model.pdf(self.f.copy(), self.Y.copy())),
+                               model.logpdf(self.f.copy(), self.Y.copy()))
+
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_df(self, model):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.description = "\n{}".format(inspect.stack()[0][3])
+        logpdf = functools.partial(model.logpdf, y=self.Y)
+        dlogpdf_df = functools.partial(model.dlogpdf_df, y=self.Y)
+        grad = GradientChecker(logpdf, dlogpdf_df, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf_df2(self, model):
+        print "\n{}".format(inspect.stack()[0][3])
+        dlogpdf_df = functools.partial(model.dlogpdf_df, y=self.Y)
+        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=self.Y)
+        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d3logpdf_df3(self, model):
+        print "\n{}".format(inspect.stack()[0][3])
+        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=self.Y)
+        d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=self.Y)
+        grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    ##############
+    # df_dparams #
+    ##############
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_dparams(self, model, params):
+        print "\n{}".format(inspect.stack()[0][3])
+        assert (
+                dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
+                    params, args=(self.f, self.Y), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_df_dparams(self, model, params):
+        print "\n{}".format(inspect.stack()[0][3])
+        assert (
+                dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
+                    params, args=(self.f, self.Y), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf2_df2_dparams(self, model, params):
+        print "\n{}".format(inspect.stack()[0][3])
+        assert (
+                dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
+                    params, args=(self.f, self.Y), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
+
+    ################
+    # dpdf_dlink's #
+    ################
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_dlink(self, model):
+        print "\n{}".format(inspect.stack()[0][3])
+        logpdf = functools.partial(model.logpdf_link, y=self.Y)
+        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=self.Y)
+        grad = GradientChecker(logpdf, dlogpdf_dlink, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf_dlink2(self, model):
+        print "\n{}".format(inspect.stack()[0][3])
+        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=self.Y)
+        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=self.Y)
+        grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d3logpdf_dlink3(self, model):
+        print "\n{}".format(inspect.stack()[0][3])
+        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=self.Y)
+        d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=self.Y)
+        grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, self.f.copy(), 'g')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    #################
+    # dlink_dparams #
+    #################
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_link_dparams(self, model, params):
+        print "\n{}".format(inspect.stack()[0][3])
+        assert (
+                dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta,
+                    params, args=(self.f, self.Y), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_dlink_dparams(self, model, params):
+        print "\n{}".format(inspect.stack()[0][3])
+        assert (
+                dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta,
+                    params, args=(self.f, self.Y), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf2_dlink2_dparams(self, model, params):
+        print "\n{}".format(inspect.stack()[0][3])
+        assert (
+                dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta,
+                    params, args=(self.f, self.Y), constrain_positive=True,
+                    randomize=False, verbose=True)
+                )
+
+    ################
+    # laplace test #
+    ################
+    @with_setup(setUp, tearDown)
+    def t_laplace_fit_rbf_white(self, model, param_vals, param_names, constrain_positive):
+        print "\n{}".format(inspect.stack()[0][3])
+        self.Y = self.Y/self.Y.max()
+        white_var = 0.001
+        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
+        laplace_likelihood = GPy.likelihoods.Laplace(self.Y.copy(), model)
+        m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=laplace_likelihood)
+        m.ensure_default_constraints()
+        m.constrain_fixed('white', white_var)
+
+        for param_num in range(len(param_names)):
+            name = param_names[param_num]
+            if constrain_positive[param_num]:
+                m.constrain_positive(name)
+            m[name] = param_vals[param_num]
+
+        m.randomize()
+        m.checkgrad(verbose=1, step=self.step)
+        print m
+        assert m.checkgrad(step=self.step)
+
+
 class LaplaceTests(unittest.TestCase):
+    """
+    Specific likelihood tests, not general enough for the above tests
+    """
+
     def setUp(self):
         self.N = 5
         self.D = 3
@@ -90,116 +388,6 @@ class LaplaceTests(unittest.TestCase):
         self.f = None
         self.X = None
 
-    def test_mass_logpdf(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        np.testing.assert_almost_equal(
-                               np.log(self.gauss.pdf(self.f.copy(), self.Y.copy())),
-                               self.gauss.logpdf(self.f.copy(), self.Y.copy()))
-
-
-    """ dGauss_df's """
-    def test_gaussian_dlogpdf_df(self):
-        #FIXME: Needs non-identity Link function
-        print "\n{}".format(inspect.stack()[0][3])
-        logpdf = functools.partial(self.gauss.logpdf, y=self.Y)
-        dlogpdf_df = functools.partial(self.gauss.dlogpdf_df, y=self.Y)
-        grad = GradientChecker(logpdf, dlogpdf_df, self.f.copy(), 'g')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
-
-    def test_gaussian_d2logpdf_df2(self):
-        #FIXME: Needs non-identity Link function
-        print "\n{}".format(inspect.stack()[0][3])
-        dlogpdf_df = functools.partial(self.gauss.dlogpdf_df, y=self.Y)
-        d2logpdf_df2 = functools.partial(self.gauss.d2logpdf_df2, y=self.Y)
-        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, self.f.copy(), 'g')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
-
-    def test_gaussian_d3logpdf_df3(self):
-        #FIXME: Needs non-identity Link function
-        print "\n{}".format(inspect.stack()[0][3])
-        d2logpdf_df2 = functools.partial(self.gauss.d2logpdf_df2, y=self.Y)
-        d3logpdf_df3 = functools.partial(self.gauss.d3logpdf_df3, y=self.Y)
-        grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, self.f.copy(), 'g')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
-
-    def test_gaussian_dlogpdf_df_dvar(self):
-        #FIXME: Needs non-identity Link function
-        print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.gauss.dlogpdf_df, self.gauss.dlogpdf_df_dtheta,
-                    [self.var], args=(self.f, self.Y), constrain_positive=True,
-                    randomize=False, verbose=True)
-                )
-
-    def test_gaussian_d2logpdf2_df2_dvar(self):
-        #FIXME: Needs non-identity Link function
-        print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.gauss.d2logpdf_df2, self.gauss.d2logpdf_df2_dtheta,
-                    [self.var], args=(self.f, self.Y), constrain_positive=True,
-                    randomize=False, verbose=True)
-                )
-
-
-    """ dGauss_dlink's """
-    def test_gaussian_dlogpdf_dlink(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        logpdf = functools.partial(self.gauss.logpdf_link, y=self.Y)
-        dlogpdf_dlink = functools.partial(self.gauss.dlogpdf_dlink, y=self.Y)
-        grad = GradientChecker(logpdf, dlogpdf_dlink, self.f.copy(), 'g')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
-
-    def test_gaussian_d2logpdf_dlink2(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        dlogpdf_dlink = functools.partial(self.gauss.dlogpdf_dlink, y=self.Y)
-        d2logpdf_dlink2 = functools.partial(self.gauss.d2logpdf_dlink2, y=self.Y)
-        grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, self.f.copy(), 'g')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
-
-    def test_gaussian_d3logpdf_dlink3(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        d2logpdf_dlink2 = functools.partial(self.gauss.d2logpdf_dlink2, y=self.Y)
-        d3logpdf_dlink3 = functools.partial(self.gauss.d3logpdf_dlink3, y=self.Y)
-        grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, self.f.copy(), 'g')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
-
-    def test_gaussian_dlogpdf_dvar(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.gauss.logpdf, self.gauss.dlogpdf_dtheta,
-                    [self.var], args=(self.f, self.Y), constrain_positive=True,
-                    randomize=False, verbose=True)
-                )
-
-    def test_gaussian_dlogpdf_dlink_dvar(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.gauss.dlogpdf_dlink, self.gauss.dlogpdf_dlink_dtheta,
-                    [self.var], args=(self.f, self.Y), constrain_positive=True,
-                    randomize=False, verbose=True)
-                )
-
-    def test_gaussian_d2logpdf2_dlink2_dvar(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.gauss.d2logpdf_dlink2, self.gauss.d2logpdf_dlink2_dtheta,
-                    [self.var], args=(self.f, self.Y), constrain_positive=True,
-                    randomize=False, verbose=True)
-                )
-
-
     """ Gradchecker fault """
     @unittest.expectedFailure
     def test_gaussian_d2logpdf_df2_2(self):
@@ -223,167 +411,6 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
-    """ dStudentT_df's """
-    def test_studentt_dlogpdf_df(self):
-        #FIXME: Needs non-identity Link function
-        print "\n{}".format(inspect.stack()[0][3])
-        link = functools.partial(self.stu_t.logpdf, y=self.Y)
-        dlogpdf_df = functools.partial(self.stu_t.dlogpdf_df, y=self.Y)
-        grad = GradientChecker(link, dlogpdf_df, self.f.copy(), 'f')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
-
-    def test_studentt_d2logpdf_df2(self):
-        #FIXME: Needs non-identity Link function
-        print "\n{}".format(inspect.stack()[0][3])
-        dlogpdf_df = functools.partial(self.stu_t.dlogpdf_df, y=self.Y)
-        d2logpdf_df2 = functools.partial(self.stu_t.d2logpdf_df2, y=self.Y)
-        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, self.f.copy(), 'f')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
-
-    def test_studentt_d3lik_d3f(self):
-        #FIXME: Needs non-identity Link function
-        print "\n{}".format(inspect.stack()[0][3])
-        d2logpdf_df2 = functools.partial(self.stu_t.d2logpdf_df2, y=self.Y)
-        d3logpdf_df3 = functools.partial(self.stu_t.d3logpdf_df3, y=self.Y)
-        grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, self.f.copy(), 'f')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
-
-    def test_studentt_dlogpdf_df_dvar(self):
-        #FIXME: Needs non-identity Link function
-        print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.stu_t.dlogpdf_df, self.stu_t.dlogpdf_df_dtheta,
-                    [self.var], args=(self.f.copy(), self.Y.copy()),
-                    constrain_positive=True, randomize=True, verbose=True)
-                )
-
-    def test_studentt_d2logpdf_df2_dvar(self):
-        #FIXME: Needs non-identity Link function
-        print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.stu_t.d2logpdf_df2, self.stu_t.d2logpdf_df2_dtheta,
-                    [self.var], args=(self.f.copy(), self.Y.copy()),
-                    constrain_positive=True, randomize=True, verbose=True)
-                )
-
-    """ dStudentT_dlink's """
-    def test_studentt_dlogpdf_dlink(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        logpdf = functools.partial(self.stu_t.logpdf, y=self.Y)
-        dlogpdf_dlink = functools.partial(self.stu_t.dlogpdf_dlink, y=self.Y)
-        grad = GradientChecker(logpdf, dlogpdf_dlink, self.f.copy(), 'f')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
-
-    def test_studentt_d2logpdf_dlink2(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        dlogpdf_dlink = functools.partial(self.stu_t.dlogpdf_dlink, y=self.Y)
-        d2logpdf_dlink2 = functools.partial(self.stu_t.d2logpdf_dlink2, y=self.Y)
-        grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, self.f.copy(), 'f')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
-
-    def test_studentt_d3logpdf_dlink3(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        d2logpdf_dlink2 = functools.partial(self.stu_t.d2logpdf_dlink2, y=self.Y)
-        d3logpdf_dlink3 = functools.partial(self.stu_t.d3logpdf_dlink3, y=self.Y)
-        grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, self.f.copy(), 'f')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        self.assertTrue(grad.checkgrad())
-
-    def test_studentt_dlogpdf_dvar(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.stu_t.logpdf, self.stu_t.dlogpdf_dtheta,
-                    [self.var], args=(self.f.copy(), self.Y.copy()),
-                    constrain_positive=True, randomize=True, verbose=True)
-                )
-
-    def test_studentt_dlogpdf_dlink_dvar(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.stu_t.dlogpdf_dlink, self.stu_t.dlogpdf_dlink_dtheta,
-                    [self.var], args=(self.f.copy(), self.Y.copy()),
-                    constrain_positive=True, randomize=True, verbose=True)
-                )
-
-    def test_studentt_d2logpdf_dlink2_dvar(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        self.assertTrue(
-                dparam_checkgrad(self.stu_t.d2logpdf_dlink2, self.stu_t.d2logpdf_dlink2_dtheta,
-                    [self.var], args=(self.f.copy(), self.Y.copy()),
-                    constrain_positive=True, randomize=True, verbose=True)
-                )
-
-
-    """ Grad check whole models (grad checking Laplace not just noise models """
-    def test_gauss_rbf(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        self.Y = self.Y/self.Y.max()
-        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
-        gauss_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.gauss)
-        m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=gauss_laplace)
-        m.ensure_default_constraints()
-        m.randomize()
-        m.checkgrad(verbose=1, step=self.step)
-        self.assertTrue(m.checkgrad(step=self.step))
-
-    def test_studentt_approx_gauss_rbf(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        self.Y = self.Y/self.Y.max()
-        self.stu_t = GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var)
-        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
-        stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
-        m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
-        m.ensure_default_constraints()
-        m.constrain_positive('t_noise')
-        m.randomize()
-        m.checkgrad(verbose=1, step=self.step)
-        print m
-        self.assertTrue(m.checkgrad(step=self.step))
-
-    def test_studentt_rbf(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        self.Y = self.Y/self.Y.max()
-        white_var = 0.001
-        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
-        stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
-        m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
-        m.ensure_default_constraints()
-        m.constrain_positive('t_noise')
-        m.constrain_fixed('white', white_var)
-        m.randomize()
-        m.checkgrad(verbose=1, step=self.step)
-        print m
-        self.assertTrue(m.checkgrad(step=self.step))
-
-    """ With small variances its likely the implicit part isn't perfectly correct? """
-    @unittest.expectedFailure
-    def test_studentt_rbf_smallvar(self):
-        print "\n{}".format(inspect.stack()[0][3])
-        self.Y = self.Y/self.Y.max()
-        white_var = 0.001
-        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
-        stu_t_laplace = GPy.likelihoods.Laplace(self.Y.copy(), self.stu_t)
-        m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=stu_t_laplace)
-        m.ensure_default_constraints()
-        m.constrain_positive('t_noise')
-        m.constrain_fixed('white', white_var)
-        m['t_noise'] = 0.01
-        m.randomize()
-        m.checkgrad(verbose=1)
-        print m
-        self.assertTrue(m.checkgrad(step=self.step))
-
 if __name__ == "__main__":
     print "Running unit tests"
     unittest.main()

From e65548f38503bbbf460251f8a608a3ec925fe420 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 16 Oct 2013 18:43:14 +0100
Subject: [PATCH 117/252] Renamed laplace_tests to likelihoods_tests

---
 GPy/testing/{laplace_tests.py => likelihoods_tests.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename GPy/testing/{laplace_tests.py => likelihoods_tests.py} (100%)

diff --git a/GPy/testing/laplace_tests.py b/GPy/testing/likelihoods_tests.py
similarity index 100%
rename from GPy/testing/laplace_tests.py
rename to GPy/testing/likelihoods_tests.py

From afd38df1eff037f0d27168320616533dc1ab189c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 17 Oct 2013 14:31:24 +0100
Subject: [PATCH 118/252] Added pdf_link's for gaussian and student t, added
 third derivatives for transformations and tests for them

---
 GPy/likelihoods/likelihood_functions.py       | 551 ------------------
 .../noise_models/gaussian_noise.py            |  41 +-
 .../noise_models/gp_transformations.py        |  22 +-
 .../noise_models/noise_distributions.py       |  15 +-
 .../noise_models/student_t_noise.py           |  26 +-
 GPy/testing/gp_transformation_tests.py        |  61 ++
 GPy/testing/likelihoods_tests.py              |  46 +-
 GPy/util/univariate_Gaussian.py               |  34 +-
 doc/GPy.likelihoods.rst                       |   8 -
 doc/GPy.testing.rst                           |  14 +-
 10 files changed, 203 insertions(+), 615 deletions(-)
 delete mode 100644 GPy/likelihoods/likelihood_functions.py
 create mode 100644 GPy/testing/gp_transformation_tests.py

diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
deleted file mode 100644
index dbdd3fa6..00000000
--- a/GPy/likelihoods/likelihood_functions.py
+++ /dev/null
@@ -1,551 +0,0 @@
-# Copyright (c) 2012, 2013 Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from scipy import stats, integrate
-import scipy as sp
-import pylab as pb
-from ..util.plot import gpplot
-from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
-import link_functions
-from scipy.special import gammaln, gamma
-
-class LikelihoodFunction(object):
-    """
-    Likelihood class for doing Expectation propagation
-
-    :param Y: observed output (Nx1 numpy.darray)
-    ..Note:: Y values allowed depend on the LikelihoodFunction used
-    """
-    def __init__(self,link):
-        if link == self._analytical:
-            self.moments_match = self._moments_match_analytical
-        else:
-            assert isinstance(link,link_functions.LinkFunction)
-            self.link = link
-            self.moments_match = self._moments_match_numerical
-        self.log_concave = True
-
-    def _preprocess_values(self,Y):
-        return Y
-
-    def _product(self,gp,obs,mu,sigma):
-        return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._distribution(gp,obs)
-
-    def _nlog_product(self,gp,obs,mu,sigma):
-        return -(-.5*(gp-mu)**2/sigma**2 + self._log_distribution(gp,obs))
-
-    def _locate(self,obs,mu,sigma):
-        """
-        Golden Search to find the mode in the _product function (cavity x exact likelihood) and define a grid around it for numerical integration
-        """
-        golden_A = -1 if obs == 0 else np.array([np.log(obs),mu]).min() #Lower limit
-        golden_B = np.array([np.log(obs),mu]).max() #Upper limit
-        return sp.optimize.golden(self._nlog_product, args=(obs,mu,sigma), brack=(golden_A,golden_B)) #Better to work with _nlog_product than with _product
-
-    def _moments_match_numerical(self,obs,tau,v):
-        """
-        Simpson's Rule is used to calculate the moments mumerically, it needs a grid of points as input.
-        """
-        mu = v/tau
-        sigma = np.sqrt(1./tau)
-        opt = self._locate(obs,mu,sigma)
-        width = 3./np.log(max(obs,2))
-        A = opt - width #Grid's lower limit
-        B = opt + width #Grid's Upper limit
-        K =  10*int(np.log(max(obs,150))) #Number of points in the grid
-        h = (B-A)/K # length of the intervals
-        grid_x = np.hstack([np.linspace(opt-width,opt,K/2+1)[1:-1], np.linspace(opt,opt+width,K/2+1)]) # grid of points (X axis)
-        x = np.hstack([A,B,grid_x[range(1,K,2)],grid_x[range(2,K-1,2)]]) # grid_x rearranged, just to make Simpson's algorithm easier
-        _aux1 = self._product(A,obs,mu,sigma)
-        _aux2 = self._product(B,obs,mu,sigma)
-        _aux3 = 4*self._product(grid_x[range(1,K,2)],obs,mu,sigma)
-        _aux4 = 2*self._product(grid_x[range(2,K-1,2)],obs,mu,sigma)
-        zeroth = np.hstack((_aux1,_aux2,_aux3,_aux4)) # grid of points (Y axis) rearranged
-        first = zeroth*x
-        second = first*x
-        Z_hat = sum(zeroth)*h/3 # Zero-th moment
-        mu_hat = sum(first)*h/(3*Z_hat) # First moment
-        m2 = sum(second)*h/(3*Z_hat) # Second moment
-        sigma2_hat = m2 - mu_hat**2 # Second central moment
-        return float(Z_hat), float(mu_hat), float(sigma2_hat)
-
-class Binomial(LikelihoodFunction):
-    """
-    Probit likelihood
-    Y is expected to take values in {-1,1}
-    -----
-    $$
-    L(x) = \\Phi (Y_i*f_i)
-    $$
-    """
-    def __init__(self,link=None):
-        self._analytical = link_functions.Probit
-        if not link:
-            link = self._analytical
-        super(Binomial, self).__init__(link)
-
-    def _distribution(self,gp,obs):
-        pass
-
-    def _log_distribution(self,gp,obs):
-        pass
-
-    def _preprocess_values(self,Y):
-        """
-        Check if the values of the observations correspond to the values
-        assumed by the likelihood function.
-
-        ..Note:: Binary classification algorithm works better with classes {-1,1}
-        """
-        Y_prep = Y.copy()
-        Y1 = Y[Y.flatten()==1].size
-        Y2 = Y[Y.flatten()==0].size
-        assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.'
-        Y_prep[Y.flatten() == 0] = -1
-        return Y_prep
-
-    def _moments_match_analytical(self,data_i,tau_i,v_i):
-        """
-        Moments match of the marginal approximation in EP algorithm
-
-        :param i: number of observation (int)
-        :param tau_i: precision of the cavity distribution (float)
-        :param v_i: mean/variance of the cavity distribution (float)
-        """
-        z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
-        Z_hat = std_norm_cdf(z)
-        phi = std_norm_pdf(z)
-        mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
-        sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
-        return Z_hat, mu_hat, sigma2_hat
-
-    def predictive_values(self,mu,var):
-        """
-        Compute  mean, variance and conficence interval (percentiles 5 and 95) of the  prediction
-        :param mu: mean of the latent variable
-        :param var: variance of the latent variable
-        """
-        mu = mu.flatten()
-        var = var.flatten()
-        mean = stats.norm.cdf(mu/np.sqrt(1+var))
-        norm_025 = [stats.norm.ppf(.025,m,v) for m,v in zip(mu,var)]
-        norm_975 = [stats.norm.ppf(.975,m,v) for m,v in zip(mu,var)]
-        p_025 = stats.norm.cdf(norm_025/np.sqrt(1+var))
-        p_975 = stats.norm.cdf(norm_975/np.sqrt(1+var))
-        return mean[:,None], np.nan*var, p_025[:,None], p_975[:,None] # TODO: var
-
-class Poisson(LikelihoodFunction):
-    """
-    Poisson likelihood
-    Y is expected to take values in {0,1,2,...}
-    -----
-    $$
-    L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
-    $$
-    """
-    def __init__(self,link=None):
-        self._analytical = None
-        if not link:
-            link = link_functions.Log()
-        super(Poisson, self).__init__(link)
-
-    def _distribution(self,gp,obs):
-        return stats.poisson.pmf(obs,self.link.inv_transf(gp))
-
-    def _log_distribution(self,gp,obs):
-        return - self.link.inv_transf(gp) + obs * self.link.log_inv_transf(gp)
-
-    def predictive_values(self,mu,var):
-        """
-        Compute  mean, and conficence interval (percentiles 5 and 95) of the  prediction
-        """
-        mean = self.link.transf(mu)#np.exp(mu*self.scale + self.location)
-        tmp = stats.poisson.ppf(np.array([.025,.975]),mean)
-        p_025 = tmp[:,0]
-        p_975 = tmp[:,1]
-        return mean,np.nan*mean,p_025,p_975 # better variance here TODO
-
-class StudentT(LikelihoodFunction):
-    """Student t likelihood distribution
-    For nomanclature see Bayesian Data Analysis 2003 p576
-
-    $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2)$$
-
-    Laplace:
-    Needs functions to calculate
-    ln p(yi|fi)
-    dln p(yi|fi)_dfi
-    d2ln p(yi|fi)_d2fifj
-    """
-    def __init__(self, deg_free=5, sigma2=2, link=None):
-        self._analytical = None
-        if not link:
-            link = link_functions.Nothing()
-
-        super(StudentT, self).__init__(link)
-        self.v = deg_free
-        self.sigma2 = sigma2
-
-        self._set_params(np.asarray(sigma2))
-        self.log_concave = False
-
-    def _get_params(self):
-        return np.asarray(self.sigma2)
-
-    def _get_param_names(self):
-        return ["t_noise_std2"]
-
-    def _set_params(self, x):
-        self.sigma2 = float(x)
-
-    @property
-    def variance(self, extra_data=None):
-        return (self.v / float(self.v - 2)) * self.sigma2
-
-    def link_function(self, y, f, extra_data=None):
-        """link_function $\ln p(y|f)$
-        $$\ln p(y_{i}|f_{i}) = \ln \Gamma(\frac{v+1}{2}) - \ln \Gamma(\frac{v}{2})\sqrt{v \pi}\sigma - \frac{v+1}{2}\ln (1 + \frac{1}{v}\left(\frac{y_{i} - f_{i}}{\sigma}\right)^2$$
-
-        For wolfram alpha import parts for derivative of sigma are -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: float(likelihood evaluated for this point)
-
-        """
-        assert y.shape == f.shape
-        e = y - f
-        objective = (+ gammaln((self.v + 1) * 0.5)
-                     - gammaln(self.v * 0.5)
-                     - 0.5*np.log(self.sigma2 * self.v * np.pi)
-                     - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
-                    )
-        return np.sum(objective)
-
-    def dlik_df(self, y, f, extra_data=None):
-        """
-        Gradient of the link function at y, given f w.r.t f
-
-        $$\frac{dp(y_{i}|f_{i})}{df} = \frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \sigma^{2}v}$$
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: gradient of likelihood evaluated at points
-
-        """
-        assert y.shape == f.shape
-        e = y - f
-        grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
-        return grad
-
-    def d2lik_d2f(self, y, f, extra_data=None):
-        """
-        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
-        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
-
-        Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-        (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
-
-        $$\frac{d^{2}p(y_{i}|f_{i})}{d^{3}f} = \frac{(v+1)((y_{i}-f_{i})^{2} - \sigma^{2}v)}{((y_{i}-f_{i})^{2} + \sigma^{2}v)^{2}}$$
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
-        """
-        assert y.shape == f.shape
-        e = y - f
-        hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
-        return hess
-
-    def d3lik_d3f(self, y, f, extra_data=None):
-        """
-        Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
-
-        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
-        """
-        assert y.shape == f.shape
-        e = y - f
-        d3lik_d3f = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
-                       ((e**2 + self.sigma2*self.v)**3)
-                    )
-        return d3lik_d3f
-
-    def dlik_dvar(self, y, f, extra_data=None):
-        """
-        Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
-
-        Terms relavent to derivatives wrt sigma are:
-        -log(sqrt(v*pi)*s) -(1/2)*(v + 1)*log(1 + (1/v)*((y-f)/(s))^2))
-
-        $$\frac{dp(y_{i}|f_{i})}{d\sigma} = -\frac{1}{\sigma} + \frac{(1+v)(y_{i}-f_{i})^2}{\sigma^3 v(1 + \frac{1}{v}(\frac{(y_{i} - f_{i})}{\sigma^2})^2)}$$
-        """
-        assert y.shape == f.shape
-        e = y - f
-        dlik_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
-        return np.sum(dlik_dvar) #May not want to sum over all dimensions if using many D?
-
-    def dlik_df_dvar(self, y, f, extra_data=None):
-        """
-        Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
-
-        $$\frac{d}{d\sigma}(\frac{dp(y_{i}|f_{i})}{df}) = \frac{-2\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \sigma^2 v)^2}$$
-        """
-        assert y.shape == f.shape
-        e = y - f
-        dlik_grad_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
-        return dlik_grad_dvar
-
-    def d2lik_d2f_dvar(self, y, f, extra_data=None):
-        """
-        Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
-
-        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
-        """
-        assert y.shape == f.shape
-        e = y - f
-        dlik_hess_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
-                              / ((self.sigma2*self.v + (e**2))**3)
-                           )
-        return dlik_hess_dvar
-
-    def _gradients(self, y, f, extra_data=None):
-        #must be listed in same order as 'get_param_names'
-        derivs = ([self.dlik_dvar(y, f, extra_data=extra_data)],
-                  [self.dlik_df_dvar(y, f, extra_data=extra_data)],
-                  [self.d2lik_d2f_dvar(y, f, extra_data=extra_data)]
-                 ) # lists as we might learn many parameters
-        # ensure we have gradients for every parameter we want to optimize
-        assert len(derivs[0]) == len(self._get_param_names())
-        assert len(derivs[1]) == len(self._get_param_names())
-        assert len(derivs[2]) == len(self._get_param_names())
-        return derivs
-
-    def predictive_values(self, mu, var):
-        """
-        Compute  mean, and conficence interval (percentiles 5 and 95) of the prediction
-
-        Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
-        (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
-        *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
-        """
-
-        #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
-        #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
-        #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
-        #Which was also given to us as (var)
-        #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
-        #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
-        true_var = var + self.variance
-
-        #Now we have an analytical solution for the variances of the distribution p(y*|f*)p(f*) around our test points but we now
-        #need the 95 and 5 percentiles.
-        #FIXME: Hack, just pretend p(y*|f*)p(f*) is a gaussian and use the gaussian's percentiles
-        p_025 = mu - 2.*np.sqrt(true_var)
-        p_975 = mu + 2.*np.sqrt(true_var)
-
-        return mu, np.nan*mu, p_025, p_975
-
-    def sample_predicted_values(self, mu, var):
-        """ Experimental sample approches and numerical integration """
-        #p_025 = stats.t.ppf(.025, mu)
-        #p_975 = stats.t.ppf(.975, mu)
-
-        num_test_points = mu.shape[0]
-        #Each mu is the latent point f* at the test point x*,
-        #and the var is the gaussian variance at this point
-        #Take lots of samples from this, so we have lots of possible values
-        #for latent point f* for each test point x* weighted by how likely we were to pick it
-        print "Taking %d samples of f*".format(num_test_points)
-        num_f_samples = 10
-        num_y_samples = 10
-        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
-        print "Student t means shape: ", student_t_means.shape
-
-        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
-        #from a student t centred on this point, sample many points from this distribution
-        #centred on f*
-        #for test_point, f in enumerate(student_t_means):
-            #print test_point
-            #print f.shape
-            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
-                                            #scale=self.sigma,
-                                            #size=(num_f_samples, num_y_samples))
-            #print student_t_samples.shape
-
-        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
-                                        scale=self.sigma,
-                                        size=(num_test_points, num_y_samples, num_f_samples))
-        student_t_samples = np.reshape(student_t_samples,
-                                       (num_test_points, num_y_samples*num_f_samples))
-
-        #Now take the 97.5 and 0.25 percentile of these points
-        p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None]
-        p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None]
-
-        ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
-        def t_gaussian(f, mu, var):
-            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
-                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
-                    )
-
-        def t_gauss_int(mu, var):
-            print "Mu: ", mu
-            print "var: ", var
-            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
-            print "Result: ", result
-            return result[0]
-
-        vec_t_gauss_int = np.vectorize(t_gauss_int)
-
-        p = vec_t_gauss_int(mu, var)
-        p_025 = mu - p
-        p_975 = mu + p
-        return mu, np.nan*mu, p_025, p_975
-
-class Gaussian(LikelihoodFunction):
-    """
-    Gaussian likelihood - this is a test class for approximation schemes
-    """
-    def __init__(self, variance, D, N, link=None):
-        self._analytical = None
-        if not link:
-            link = link_functions.Nothing()
-
-        super(Gaussian, self).__init__(link)
-        self.D = D
-        self.N = N
-        self._variance = float(variance)
-        self._set_params(np.asarray(variance))
-
-        #Don't support normalizing yet
-        self._bias = np.zeros((1, self.D))
-        self._scale = np.ones((1, self.D))
-
-    def _get_params(self):
-        return np.asarray(self._variance)
-
-    def _get_param_names(self):
-        return ["noise_variance"]
-
-    def _set_params(self, x):
-        self._variance = float(x)
-        self.I = np.eye(self.N)
-        self.covariance_matrix = self.I * self._variance
-        self.Ki = self.I*(1.0 / self._variance)
-        self.ln_det_K = np.sum(np.log(np.diag(self.covariance_matrix)))
-
-    def link_function(self, y, f, extra_data=None):
-        """link_function $\ln p(y|f)$
-        $$\ln p(y_{i}|f_{i}) = \ln $$
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: float(likelihood evaluated for this point)
-
-        """
-        assert y.shape == f.shape
-        e = y - f
-        eeT = np.dot(e, e.T)
-        objective = (- 0.5*self.D*np.log(2*np.pi)
-                     - 0.5*self.ln_det_K
-                     #- 0.5*np.dot(np.dot(e.T, self.Ki), e)
-                     - (0.5/self._variance)*np.dot(e.T, e) # As long as K is diagonal
-                     )
-        return np.sum(objective)
-
-    def dlik_df(self, y, f, extra_data=None):
-        """
-        Gradient of the link function at y, given f w.r.t f
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: gradient of likelihood evaluated at points
-
-        """
-        assert y.shape == f.shape
-        s2_i = (1.0/self._variance)*self.I
-        grad = np.dot(s2_i, y) - np.dot(s2_i, f)
-        return grad
-
-    def d2lik_d2f(self, y, f, extra_data=None):
-        """
-        Hessian at this point (if we are only looking at the link function not the prior) the hessian will be 0 unless i == j
-        i.e. second derivative link_function at y given f f_j  w.r.t f and f_j
-
-        Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-        (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
-
-        :y: data
-        :f: latent variables f
-        :extra_data: extra_data which is not used in student t distribution
-        :returns: array which is diagonal of covariance matrix (second derivative of likelihood evaluated at points)
-        """
-        assert y.shape == f.shape
-        s2_i = (1.0/self._variance)*self.I
-        hess = np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
-        return hess
-
-    def d3lik_d3f(self, y, f, extra_data=None):
-        """
-        Third order derivative link_function (log-likelihood ) at y given f f_j w.r.t f and f_j
-
-        $$\frac{d^{3}p(y_{i}|f_{i})}{d^{3}f} = \frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \sigma^{2} v))}{((y_{i} - f_{i}) + \sigma^{2} v)^3}$$
-        """
-        assert y.shape == f.shape
-        d3lik_d3f = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
-        return d3lik_d3f
-
-    def dlik_dvar(self, y, f, extra_data=None):
-        """
-        Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
-        """
-        assert y.shape == f.shape
-        e = y - f
-        s_4 = 1.0/(self._variance**2)
-        dlik_dsigma = -0.5*self.N/self._variance + 0.5*s_4*np.dot(e.T, e)
-        return np.sum(dlik_dsigma) # Sure about this sum?
-
-    def dlik_df_dvar(self, y, f, extra_data=None):
-        """
-        Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
-        """
-        assert y.shape == f.shape
-        s_4 = 1.0/(self._variance**2)
-        dlik_grad_dsigma = -np.dot(s_4*self.I, y) + np.dot(s_4*self.I, f)
-        return dlik_grad_dsigma
-
-    def d2lik_d2f_dvar(self, y, f, extra_data=None):
-        """
-        Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
-
-        $$\frac{d}{d\sigma}(\frac{d^{2}p(y_{i}|f_{i})}{d^{2}f}) = \frac{2\sigma v(v + 1)(\sigma^2 v - 3(y-f)^2)}{((y-f)^2 + \sigma^2 v)^3}$$
-        """
-        assert y.shape == f.shape
-        dlik_hess_dsigma = np.diag((1.0/(self._variance**2))*self.I)[:, None]
-        return dlik_hess_dsigma
-
-    def _gradients(self, y, f, extra_data=None):
-        #must be listed in same order as 'get_param_names'
-        derivs = ([self.dlik_dvar(y, f, extra_data=extra_data)],
-                  [self.dlik_df_dvar(y, f, extra_data=extra_data)],
-                  [self.d2lik_d2f_dvar(y, f, extra_data=extra_data)]
-                 ) # lists as we might learn many parameters
-        # ensure we have gradients for every parameter we want to optimize
-        assert len(derivs[0]) == len(self._get_param_names())
-        assert len(derivs[1]) == len(self._get_param_names())
-        assert len(derivs[2]) == len(self._get_param_names())
-        return derivs
-
-    def predictive_values(self, mu, var):
-        mean = mu * self._scale + self._bias
-        true_var = (var + self._variance) * self._scale ** 2
-        _5pc = mean - 2.*np.sqrt(true_var)
-        _95pc = mean + 2.*np.sqrt(true_var)
-        return mean, true_var, _5pc, _95pc
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index 5811f916..2dd0cd64 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -68,14 +68,6 @@ class Gaussian(NoiseDistribution):
     def _predictive_variance_analytical(self,mu,sigma,predictive_mean=None):
         return 1./(1./self.variance + 1./sigma**2)
 
-    def pdf_link(self, link_f, y, extra_data=None):
-        #FIXME: Careful now passing link_f in not gp (f)!
-        #return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) )
-        #Assumes no covariance, exp, sum, log for numerical stability
-        #return np.exp(np.sum(np.log(stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance)))))
-        #return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
-        return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
-
     def _mass(self, link_f, y, extra_data=None):
         NotImplementedError("Deprecated, now doing chain in noise_model.py for link function evaluation\
                             Please negate your function and use pdf in noise_model.py, if implementing a likelihood\
@@ -99,6 +91,25 @@ class Gaussian(NoiseDistribution):
                             rederivate the derivative without doing the chain and put in logpdf, dlogpdf_dlink or\
                             its derivatives")
 
+    def pdf_link(self, link_f, y, extra_data=None):
+        """
+        Likelihood function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        #Assumes no covariance, exp, sum, log for numerical stability
+        return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
+
+
     def logpdf_link(self, link_f, y, extra_data=None):
         """
         Log likelihood function given link(f)
@@ -111,7 +122,7 @@ class Gaussian(NoiseDistribution):
         :param y: data
         :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: likelihood evaluated for this point
+        :returns: log likelihood evaluated for this point
         :rtype: float
         """
         assert link_f.shape == y.shape
@@ -129,7 +140,7 @@ class Gaussian(NoiseDistribution):
         :param y: data
         :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: gradient of negative likelihood evaluated at points
+        :returns: gradient of log likelihood evaluated at points
         :rtype: Nx1 array
         """
         assert link_f.shape == y.shape
@@ -150,7 +161,7 @@ class Gaussian(NoiseDistribution):
         :param y: data
         :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points f)
         :rtype: Nx1 array
 
         .. Note::
@@ -173,7 +184,7 @@ class Gaussian(NoiseDistribution):
         :param y: data
         :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: third derivative of likelihood evaluated at points f
+        :returns: third derivative of log likelihood evaluated at points f
         :rtype: Nx1 array
         """
         assert link_f.shape == y.shape
@@ -192,7 +203,7 @@ class Gaussian(NoiseDistribution):
         :param y: data
         :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :returns: derivative of log likelihood evaluated at points f w.r.t variance parameter
         :rtype: float
         """
         assert link_f.shape == y.shape
@@ -213,7 +224,7 @@ class Gaussian(NoiseDistribution):
         :param y: data
         :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
+        :returns: derivative of log likelihood evaluated at points f w.r.t variance parameter
         :rtype: Nx1 array
         """
         assert link_f.shape == y.shape
@@ -233,7 +244,7 @@ class Gaussian(NoiseDistribution):
         :param y: data
         :type y: Nx1 array
         :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
+        :returns: derivative of log hessian evaluated at points f and f_j w.r.t variance parameter
         :rtype: Nx1 array
         """
         assert link_f.shape == y.shape
diff --git a/GPy/likelihoods/noise_models/gp_transformations.py b/GPy/likelihoods/noise_models/gp_transformations.py
index b9db75ce..65730418 100644
--- a/GPy/likelihoods/noise_models/gp_transformations.py
+++ b/GPy/likelihoods/noise_models/gp_transformations.py
@@ -55,13 +55,13 @@ class Identity(GPTransformation):
         return f
 
     def dtransf_df(self,f):
-        return 1.
+        return np.ones_like(f)
 
     def d2transf_df2(self,f):
-        return 0
+        return np.zeros_like(f)
 
     def d3transf_df3(self,f):
-        return 0
+        return np.zeros_like(f)
 
 
 class Probit(GPTransformation):
@@ -82,7 +82,7 @@ class Probit(GPTransformation):
 
     def d3transf_df3(self,f):
         f2 = f**2
-        return -(1/(np.sqrt(2*np.pi)))*np.exp(-0.5*(f2))*(f2-1)
+        return -(1/(np.sqrt(2*np.pi)))*np.exp(-0.5*(f2))*(1-f2)
 
 class Log(GPTransformation):
     """
@@ -120,15 +120,23 @@ class Log_ex_1(GPTransformation):
         aux = np.exp(f)/(1.+np.exp(f))
         return aux*(1.-aux)
 
+    def d3transf_df3(self,f):
+        aux = np.exp(f)/(1.+np.exp(f))
+        daux_df = aux*(1.-aux)
+        return daux_df - (2.*aux*daux_df)
+
 class Reciprocal(GPTransformation):
-    def transf(sefl,f):
+    def transf(self,f):
         return 1./f
 
     def dtransf_df(self,f):
-        return -1./f**2
+        return -1./(f**2)
 
     def d2transf_df2(self,f):
-        return 2./f**3
+        return 2./(f**3)
+
+    def d3transf_df3(self,f):
+        return -6./(f**4)
 
 class Heaviside(GPTransformation):
     """
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 5b92e2b5..dc3a7de5 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -415,18 +415,23 @@ class NoiseDistribution(object):
         raise NotImplementedError
 
     def dlogpdf_link_dtheta(self, link_f, y, extra_data=None):
-        if len(self._get_params()) == 0:
-            pass
-        else:
-            raise NotImplementedError
+        """
+        Need to check if it should even exist by checking length of getparams
+        """
+        raise NotImplementedError
 
     def dlogpdf_dlink_dtheta(self, link_f, y, extra_data=None):
+        """
+        Need to check if it should even exist by checking length of getparams
+        """
         raise NotImplementedError
 
     def d2logpdf_dlink2_dtheta(self, link_f, y, extra_data=None):
+        """
+        Need to check if it should even exist by checking length of getparams
+        """
         raise NotImplementedError
 
-
     def pdf(self, f, y, extra_data=None):
         """
         Evaluates the link function link(f) then computes the likelihood (pdf) using it
diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index 0e881a8d..87cfb235 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -40,12 +40,36 @@ class StudentT(NoiseDistribution):
     def variance(self, extra_data=None):
         return (self.v / float(self.v - 2)) * self.sigma2
 
+    def pdf_link(self, link_f, y, extra_data=None):
+        """
+        Likelihood function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in student t distribution - not used
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        assert link_f.shape == y.shape
+        e = y - link_f
+        #Careful gamma(big_number) is infinity!
+        objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
+                     / (np.sqrt(self.v * np.pi * self.sigma2)))
+                     * ((1 + (1./float(self.v))*((e**2)/float(self.sigma2)))**(-0.5*(self.v + 1)))
+                    )
+        return np.prod(objective)
+
     def logpdf_link(self, link_f, y, extra_data=None):
         """
         Log Likelihood Function given link(f)
 
         .. math::
-            \\ln p(y_{i}|f_{i}) = \\ln \\Gamma(\\frac{v+1}{2}) - \\ln \\Gamma(\\frac{v}{2})\\sqrt{v \\pi}\sigma - \\frac{v+1}{2}\\ln (1 + \\frac{1}{v}\\left(\\frac{y_{i} - f_{i}}{\\sigma}\\right)^2
+            \\ln p(y_{i}|f_{i}) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)
 
         :param link_f: latent variables (link(f))
         :type link_f: Nx1 array
diff --git a/GPy/testing/gp_transformation_tests.py b/GPy/testing/gp_transformation_tests.py
new file mode 100644
index 00000000..42c0414b
--- /dev/null
+++ b/GPy/testing/gp_transformation_tests.py
@@ -0,0 +1,61 @@
+from nose.tools import with_setup
+from GPy.models import GradientChecker
+from GPy.likelihoods.noise_models import gp_transformations
+import inspect
+import unittest
+import numpy as np
+
+class TestTransformations(object):
+    """
+    Generic transformations checker
+    """
+    def setUp(self):
+        N = 30
+        self.fs = [np.random.rand(N, 1), float(np.random.rand(1))]
+
+
+    def tearDown(self):
+        self.fs = None
+
+    def test_transformations(self):
+        self.setUp()
+        transformations = [gp_transformations.Identity(),
+                           gp_transformations.Log(),
+                           gp_transformations.Probit(),
+                           gp_transformations.Log_ex_1(),
+                           gp_transformations.Reciprocal(),
+                           ]
+
+        for transformation in transformations:
+            for f in self.fs:
+                yield self.t_dtransf_df, transformation, f
+                yield self.t_d2transf_df2, transformation, f
+                yield self.t_d3transf_df3, transformation, f
+
+    @with_setup(setUp, tearDown)
+    def t_dtransf_df(self, transformation, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        grad = GradientChecker(transformation.transf, transformation.dtransf_df, f, 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2transf_df2(self, transformation, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        grad = GradientChecker(transformation.dtransf_df, transformation.d2transf_df2, f, 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d3transf_df3(self, transformation, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        grad = GradientChecker(transformation.d2transf_df2, transformation.d3transf_df3, f, 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+#if __name__ == "__main__":
+    #print "Running unit tests"
+    #unittest.main()
diff --git a/GPy/testing/likelihoods_tests.py b/GPy/testing/likelihoods_tests.py
index 9f430741..84e5f036 100644
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@@ -113,6 +113,15 @@ class TestNoiseModels(object):
                                 },
                             "laplace": True
                             },
+                        "Student_t_1_var": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [1],
+                                "constrain_positive": [True]
+                                },
+                            "laplace": True
+                            },
                         "Student_t_small_var": {
                             "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
                             "grad_params": {
@@ -157,6 +166,24 @@ class TestNoiseModels(object):
                                 "constrain_positive": [True]
                                 },
                             "laplace": True
+                            },
+                        "Gaussian_probit": {
+                            "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Probit(), variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constrain_positive": [True]
+                                },
+                            "laplace": True
+                            },
+                        "Gaussian_log_ex": {
+                            "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constrain_positive": [True]
+                                },
+                            "laplace": True
                             }
                         }
 
@@ -179,10 +206,10 @@ class TestNoiseModels(object):
             #Link derivatives
             yield self.t_dlogpdf_dlink, model
             yield self.t_d2logpdf_dlink2, model
-            yield self.t_d3logpdf_dlink3, model
             if laplace:
                 #Laplace only derivatives
                 yield self.t_d3logpdf_df3, model
+                yield self.t_d3logpdf_dlink3, model
                 #Params
                 yield self.t_dlogpdf_dparams, model, param_vals
                 yield self.t_dlogpdf_df_dparams, model, param_vals
@@ -203,6 +230,7 @@ class TestNoiseModels(object):
     @with_setup(setUp, tearDown)
     def t_logpdf(self, model):
         print "\n{}".format(inspect.stack()[0][3])
+        print model
         np.testing.assert_almost_equal(
                                np.log(model.pdf(self.f.copy(), self.Y.copy())),
                                model.logpdf(self.f.copy(), self.Y.copy()))
@@ -216,6 +244,7 @@ class TestNoiseModels(object):
         grad = GradientChecker(logpdf, dlogpdf_df, self.f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
+        print model
         assert grad.checkgrad()
 
     @with_setup(setUp, tearDown)
@@ -226,6 +255,7 @@ class TestNoiseModels(object):
         grad = GradientChecker(dlogpdf_df, d2logpdf_df2, self.f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
+        print model
         assert grad.checkgrad()
 
     @with_setup(setUp, tearDown)
@@ -236,6 +266,7 @@ class TestNoiseModels(object):
         grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, self.f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
+        print model
         assert grad.checkgrad()
 
     ##############
@@ -244,6 +275,7 @@ class TestNoiseModels(object):
     @with_setup(setUp, tearDown)
     def t_dlogpdf_dparams(self, model, params):
         print "\n{}".format(inspect.stack()[0][3])
+        print model
         assert (
                 dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
                     params, args=(self.f, self.Y), constrain_positive=True,
@@ -253,6 +285,7 @@ class TestNoiseModels(object):
     @with_setup(setUp, tearDown)
     def t_dlogpdf_df_dparams(self, model, params):
         print "\n{}".format(inspect.stack()[0][3])
+        print model
         assert (
                 dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
                     params, args=(self.f, self.Y), constrain_positive=True,
@@ -262,6 +295,7 @@ class TestNoiseModels(object):
     @with_setup(setUp, tearDown)
     def t_d2logpdf2_df2_dparams(self, model, params):
         print "\n{}".format(inspect.stack()[0][3])
+        print model
         assert (
                 dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
                     params, args=(self.f, self.Y), constrain_positive=True,
@@ -279,6 +313,7 @@ class TestNoiseModels(object):
         grad = GradientChecker(logpdf, dlogpdf_dlink, self.f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
+        print grad
         assert grad.checkgrad()
 
     @with_setup(setUp, tearDown)
@@ -289,6 +324,7 @@ class TestNoiseModels(object):
         grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, self.f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
+        print grad
         assert grad.checkgrad()
 
     @with_setup(setUp, tearDown)
@@ -299,6 +335,7 @@ class TestNoiseModels(object):
         grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, self.f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
+        print grad
         assert grad.checkgrad()
 
     #################
@@ -307,6 +344,7 @@ class TestNoiseModels(object):
     @with_setup(setUp, tearDown)
     def t_dlogpdf_link_dparams(self, model, params):
         print "\n{}".format(inspect.stack()[0][3])
+        print model
         assert (
                 dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta,
                     params, args=(self.f, self.Y), constrain_positive=True,
@@ -316,6 +354,7 @@ class TestNoiseModels(object):
     @with_setup(setUp, tearDown)
     def t_dlogpdf_dlink_dparams(self, model, params):
         print "\n{}".format(inspect.stack()[0][3])
+        print model
         assert (
                 dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta,
                     params, args=(self.f, self.Y), constrain_positive=True,
@@ -325,6 +364,7 @@ class TestNoiseModels(object):
     @with_setup(setUp, tearDown)
     def t_d2logpdf2_dlink2_dparams(self, model, params):
         print "\n{}".format(inspect.stack()[0][3])
+        print model
         assert (
                 dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta,
                     params, args=(self.f, self.Y), constrain_positive=True,
@@ -379,7 +419,7 @@ class LaplaceTests(unittest.TestCase):
         self.gauss = GPy.likelihoods.gaussian(gp_transformations.Log(), variance=self.var, D=self.D, N=self.N)
 
         #Make a bigger step as lower bound can be quite curved
-        self.step = 1e-3
+        self.step = 1e-6
 
     def tearDown(self):
         self.stu_t = None
@@ -388,8 +428,6 @@ class LaplaceTests(unittest.TestCase):
         self.f = None
         self.X = None
 
-    """ Gradchecker fault """
-    @unittest.expectedFailure
     def test_gaussian_d2logpdf_df2_2(self):
         print "\n{}".format(inspect.stack()[0][3])
         self.Y = None
diff --git a/GPy/util/univariate_Gaussian.py b/GPy/util/univariate_Gaussian.py
index 5a5880d5..702ab25c 100644
--- a/GPy/util/univariate_Gaussian.py
+++ b/GPy/util/univariate_Gaussian.py
@@ -13,24 +13,32 @@ def std_norm_cdf(x):
     Cumulative standard Gaussian distribution
     Based on Abramowitz, M. and Stegun, I. (1970)
     """
+    #Generalize for many x
+    x = np.asarray(x).copy()
+    cdf_x = np.zeros_like(x)
+    N = x.size
     support_code = "#include <math.h>"
     code = """
 
-    double sign = 1.0;
-    if (x < 0.0){
-        sign = -1.0;
-        x = -x;
+    double sign, t, erf;
+    for (int i=0; i<N; i++){
+        sign = 1.0;
+        if (x[i] < 0.0){
+            sign = -1.0;
+            x[i] = -x[i];
+        }
+        x[i] = x[i]/sqrt(2.0);
+
+        t = 1.0/(1.0 +  0.3275911*x[i]);
+
+        erf = 1. - exp(-x[i]*x[i])*t*(0.254829592 + t*(-0.284496736 + t*(1.421413741 + t*(-1.453152027 + t*(1.061405429)))));
+
+        //return_val = 0.5*(1.0 + sign*erf);
+        cdf_x[i] = 0.5*(1.0 + sign*erf);
     }
-    x = x/sqrt(2.0);
-
-    double t = 1.0/(1.0 +  0.3275911*x);
-
-    double erf = 1. - exp(-x*x)*t*(0.254829592 + t*(-0.284496736 + t*(1.421413741 + t*(-1.453152027 + t*(1.061405429)))));
-
-    return_val = 0.5*(1.0 + sign*erf);
     """
-    x = float(x)
-    return weave.inline(code,arg_names=['x'],support_code=support_code)
+    weave.inline(code, arg_names=['x', 'cdf_x', 'N'], support_code=support_code)
+    return cdf_x
 
 def inv_std_norm_cdf(x):
     """
diff --git a/doc/GPy.likelihoods.rst b/doc/GPy.likelihoods.rst
index 2e7da879..34d98739 100644
--- a/doc/GPy.likelihoods.rst
+++ b/doc/GPy.likelihoods.rst
@@ -59,14 +59,6 @@ GPy.likelihoods.likelihood module
     :undoc-members:
     :show-inheritance:
 
-GPy.likelihoods.likelihood_functions module
--------------------------------------------
-
-.. automodule:: GPy.likelihoods.likelihood_functions
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
 GPy.likelihoods.noise_model_constructors module
 -----------------------------------------------
 
diff --git a/doc/GPy.testing.rst b/doc/GPy.testing.rst
index 078a41a2..2d41d5fc 100644
--- a/doc/GPy.testing.rst
+++ b/doc/GPy.testing.rst
@@ -52,10 +52,10 @@ GPy.testing.kernel_tests module
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.laplace_tests module
---------------------------------
+GPy.testing.likelihoods_tests module
+------------------------------------
 
-.. automodule:: GPy.testing.laplace_tests
+.. automodule:: GPy.testing.likelihoods_tests
     :members:
     :undoc-members:
     :show-inheritance:
@@ -76,14 +76,6 @@ GPy.testing.mrd_tests module
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.noise_distributions module
---------------------------------------
-
-.. automodule:: GPy.testing.noise_distributions
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
 GPy.testing.prior_tests module
 ------------------------------
 

From f3fd9f13252c1244cfb19d1a6427be6813156635 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 17 Oct 2013 15:04:55 +0100
Subject: [PATCH 119/252] Rename Binomial to Bernoulli (maybe generalise it
 with the constant later, but tilted distribution may change)

---
 GPy/examples/classification.py                |  2 +-
 GPy/likelihoods/noise_model_constructors.py   |  9 ++---
 GPy/likelihoods/noise_models/__init__.py      |  2 +-
 .../{binomial_noise.py => bernoulli_noise.py} |  6 ++--
 GPy/models/fitc_classification.py             |  4 +--
 GPy/models/gp_classification.py               |  4 +--
 GPy/models/sparse_gp_classification.py        |  4 +--
 GPy/testing/unit_tests.py                     |  2 +-
 GPy/util/datasets.py                          | 34 +++++++++----------
 9 files changed, 34 insertions(+), 33 deletions(-)
 rename GPy/likelihoods/noise_models/{binomial_noise.py => bernoulli_noise.py} (95%)

diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index da2ffb24..0630537b 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -116,7 +116,7 @@ def toy_heaviside(seed=default_seed):
     Y[Y.flatten() == -1] = 0
 
     # Model definition
-    noise_model = GPy.likelihoods.binomial(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
+    noise_model = GPy.likelihoods.bernoulli(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
     likelihood = GPy.likelihoods.EP(Y,noise_model)
     m = GPy.models.GPClassification(data['X'], likelihood=likelihood)
 
diff --git a/GPy/likelihoods/noise_model_constructors.py b/GPy/likelihoods/noise_model_constructors.py
index 26d07391..95247c03 100644
--- a/GPy/likelihoods/noise_model_constructors.py
+++ b/GPy/likelihoods/noise_model_constructors.py
@@ -4,9 +4,9 @@
 import numpy as np
 import noise_models
 
-def binomial(gp_link=None):
+def bernoulli(gp_link=None):
     """
-    Construct a binomial likelihood
+    Construct a bernoulli likelihood
 
     :param gp_link: a GPy gp_link function
     """
@@ -27,11 +27,12 @@ def binomial(gp_link=None):
         analytical_mean = False
         analytical_variance = False
 
-    return noise_models.binomial_noise.Binomial(gp_link,analytical_mean,analytical_variance)
+    return noise_models.bernoulli_noise.Bernoulli(gp_link,analytical_mean,analytical_variance)
 
 def exponential(gp_link=None):
+
     """
-    Construct a binomial likelihood
+    Construct a exponential likelihood
 
     :param gp_link: a GPy gp_link function
     """
diff --git a/GPy/likelihoods/noise_models/__init__.py b/GPy/likelihoods/noise_models/__init__.py
index 54f3f61a..d1d134dc 100644
--- a/GPy/likelihoods/noise_models/__init__.py
+++ b/GPy/likelihoods/noise_models/__init__.py
@@ -1,5 +1,5 @@
 import noise_distributions
-import binomial_noise
+import bernoulli_noise
 import exponential_noise
 import gaussian_noise
 import gamma_noise
diff --git a/GPy/likelihoods/noise_models/binomial_noise.py b/GPy/likelihoods/noise_models/bernoulli_noise.py
similarity index 95%
rename from GPy/likelihoods/noise_models/binomial_noise.py
rename to GPy/likelihoods/noise_models/bernoulli_noise.py
index c0bb8be4..1d45c82e 100644
--- a/GPy/likelihoods/noise_models/binomial_noise.py
+++ b/GPy/likelihoods/noise_models/bernoulli_noise.py
@@ -9,7 +9,7 @@ from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import gp_transformations
 from noise_distributions import NoiseDistribution
 
-class Binomial(NoiseDistribution):
+class Bernoulli(NoiseDistribution):
     """
     Probit likelihood
     Y is expected to take values in {-1,1}
@@ -19,7 +19,7 @@ class Binomial(NoiseDistribution):
     $$
     """
     def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
-        super(Binomial, self).__init__(gp_link,analytical_mean,analytical_variance)
+        super(Bernoulli, self).__init__(gp_link,analytical_mean,analytical_variance)
 
     def _preprocess_values(self,Y):
         """
@@ -31,7 +31,7 @@ class Binomial(NoiseDistribution):
         Y_prep = Y.copy()
         Y1 = Y[Y.flatten()==1].size
         Y2 = Y[Y.flatten()==0].size
-        assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.'
+        assert Y1 + Y2 == Y.size, 'Bernoulli likelihood is meant to be used only with outputs in {0,1}.'
         Y_prep[Y.flatten() == 0] = -1
         return Y_prep
 
diff --git a/GPy/models/fitc_classification.py b/GPy/models/fitc_classification.py
index ee92a1b4..0aa21db9 100644
--- a/GPy/models/fitc_classification.py
+++ b/GPy/models/fitc_classification.py
@@ -16,7 +16,7 @@ class FITCClassification(FITC):
 
     :param X: input observations
     :param Y: observed values
-    :param likelihood: a GPy likelihood, defaults to Binomial with probit link function
+    :param likelihood: a GPy likelihood, defaults to Bernoulli with probit link function
     :param kernel: a GPy kernel, defaults to rbf+white
     :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
     :type normalize_X: False|True
@@ -31,7 +31,7 @@ class FITCClassification(FITC):
             kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3)
 
         if likelihood is None:
-            noise_model = likelihoods.binomial()
+            noise_model = likelihoods.bernoulli()
             likelihood = likelihoods.EP(Y, noise_model)
         elif Y is not None:
             if not all(Y.flatten() == likelihood.data.flatten()):
diff --git a/GPy/models/gp_classification.py b/GPy/models/gp_classification.py
index fce51cfa..7fc61bb7 100644
--- a/GPy/models/gp_classification.py
+++ b/GPy/models/gp_classification.py
@@ -15,7 +15,7 @@ class GPClassification(GP):
 
     :param X: input observations
     :param Y: observed values, can be None if likelihood is not None
-    :param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
+    :param likelihood: a GPy likelihood, defaults to Bernoulli with Probit link_function
     :param kernel: a GPy kernel, defaults to rbf
     :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
     :type normalize_X: False|True
@@ -31,7 +31,7 @@ class GPClassification(GP):
             kernel = kern.rbf(X.shape[1])
 
         if likelihood is None:
-            noise_model = likelihoods.binomial()
+            noise_model = likelihoods.bernoulli()
             likelihood = likelihoods.EP(Y, noise_model)
         elif Y is not None:
             if not all(Y.flatten() == likelihood.data.flatten()):
diff --git a/GPy/models/sparse_gp_classification.py b/GPy/models/sparse_gp_classification.py
index 50c2f935..9274aacc 100644
--- a/GPy/models/sparse_gp_classification.py
+++ b/GPy/models/sparse_gp_classification.py
@@ -16,7 +16,7 @@ class SparseGPClassification(SparseGP):
 
     :param X: input observations
     :param Y: observed values
-    :param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
+    :param likelihood: a GPy likelihood, defaults to Bernoulli with probit link_function
     :param kernel: a GPy kernel, defaults to rbf+white
     :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
     :type normalize_X: False|True
@@ -31,7 +31,7 @@ class SparseGPClassification(SparseGP):
             kernel = kern.rbf(X.shape[1])# + kern.white(X.shape[1],1e-3)
 
         if likelihood is None:
-            noise_model = likelihoods.binomial()
+            noise_model = likelihoods.bernoulli()
             likelihood = likelihoods.EP(Y, noise_model)
         elif Y is not None:
             if not all(Y.flatten() == likelihood.data.flatten()):
diff --git a/GPy/testing/unit_tests.py b/GPy/testing/unit_tests.py
index e4d9e063..818cb56e 100644
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@@ -209,7 +209,7 @@ class GradientTests(unittest.TestCase):
         Z = np.linspace(0, 15, 4)[:, None]
         kernel = GPy.kern.rbf(1)
         m = GPy.models.SparseGPClassification(X,Y,kernel=kernel,Z=Z)
-        #distribution = GPy.likelihoods.likelihood_functions.Binomial()
+        #distribution = GPy.likelihoods.likelihood_functions.Bernoulli()
         #likelihood = GPy.likelihoods.EP(Y, distribution)
         #m = GPy.core.SparseGP(X, likelihood, kernel, Z)
         #m.ensure_default_constraints()
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index f5947179..565f8e76 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -17,13 +17,13 @@ except ImportError:
 
 import sys, urllib
 
-def reporthook(a,b,c): 
+def reporthook(a,b,c):
     # ',' at the end of the line is important!
     #print "% 3.1f%% of %d bytes\r" % (min(100, float(a * b) / c * 100), c),
     #you can also use sys.stdout.write
     sys.stdout.write("\r% 3.1f%% of %d bytes" % (min(100, float(a * b) / c * 100), c))
     sys.stdout.flush()
-     
+
 # Global variables
 data_path = os.path.join(os.path.dirname(__file__), 'datasets')
 default_seed = 10000
@@ -39,7 +39,7 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
                                        'license' : None,
                                        'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""",
                                        'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""},
-                   
+
                   'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'],
                                       'files' : [['Index', 'housing.data', 'housing.names']],
                                       'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.""",
@@ -164,14 +164,14 @@ def prompt_user(prompt):
         print(prompt)
         choice = raw_input().lower()
         # would like to test for exception here, but not sure if we can do that without importing IPython
-    except: 
+    except:
         print('Stdin is not implemented.')
         print('You need to set')
         print('overide_manual_authorize=True')
         print('to proceed with the download. Please set that variable and continue.')
         raise
 
-    
+
     if choice in yes:
         return True
     elif choice in no:
@@ -189,7 +189,7 @@ def data_available(dataset_name=None):
             if not os.path.exists(os.path.join(data_path, dataset_name, file)):
                 return False
     return True
-            
+
 def download_url(url, store_directory, save_name = None, messages = True, suffix=''):
     """Download a file from a url and save it to disk."""
     i = url.rfind('/')
@@ -249,18 +249,18 @@ def download_data(dataset_name=None):
             for file in files:
                 download_url(os.path.join(url,file), dataset_name, dataset_name)
     return True
-                  
+
 def data_details_return(data, data_set):
     """Update the data component of the data dictionary with details drawn from the data_resources."""
     data.update(data_resources[data_set])
     return data
 
-    
+
 def cmu_urls_files(subj_motions, messages = True):
     '''
-    Find which resources are missing on the local disk for the requested CMU motion capture motions. 
+    Find which resources are missing on the local disk for the requested CMU motion capture motions.
     '''
-    
+
     subjects_num = subj_motions[0]
     motions_num = subj_motions[1]
 
@@ -280,15 +280,15 @@ def cmu_urls_files(subj_motions, messages = True):
             motions[i].append(curMot)
 
     all_skels = []
-    
+
     assert len(subjects) == len(motions)
-    
+
     all_motions = []
-            
+
     for i in range(len(subjects)):
         skel_dir = os.path.join(data_path, 'cmu_mocap')
         cur_skel_file = os.path.join(skel_dir, subjects[i] + '.asf')
-        
+
         url_required = False
         file_download = []
         if not os.path.exists(cur_skel_file):
@@ -332,7 +332,7 @@ if gpxpy_available:
             points = [point for track in gpx.tracks for segment in track.segments for point in segment.points]
             data = [[(point.time-datetime.datetime(2013,8,21)).total_seconds(), point.latitude, point.longitude, point.elevation] for point in points]
             X.append(np.asarray(data)[::sample_every, :])
-            gpx_file.close()        
+            gpx_file.close()
         return data_details_return({'X' : X, 'info' : 'Data is an array containing time in seconds, latitude, longitude and elevation in that order.'}, data_set)
 
 del gpxpy_available
@@ -408,7 +408,7 @@ def oil(data_set='three_phase_oil_flow'):
     return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'Xtest' : Xtest, 'Xvalid': Xvalid, 'Yvalid': Yvalid}, data_set)
     #else:
     # throw an error
-    
+
 def oil_100(seed=default_seed, data_set = 'three_phase_oil_flow'):
     np.random.seed(seed=seed)
     data = oil()
@@ -622,7 +622,7 @@ def xw_pen(data_set='xw_pen'):
     X = np.arange(485)[:, None]
     return data_details_return({'Y': Y, 'X': X, 'info': "Tilt data from a personalized digital assistant pen. Plot in original paper showed regression between time steps 175 and 275."}, data_set)
 
-    
+
 def download_rogers_girolami_data():
     if not data_available('rogers_girolami_data'):
         download_data(data_set)

From 1848653fceab54028bf6ab7026e7aa83ad9df9bf Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 17 Oct 2013 17:44:08 +0100
Subject: [PATCH 120/252] Added more options to generic tests (constraining
 link function values as bernoulli requies R^{0,1}) and implemented new
 gradients for bernoulli

---
 .../noise_models/bernoulli_noise.py           | 104 ++++++++
 .../noise_models/gaussian_noise.py            |  60 ++---
 .../noise_models/student_t_noise.py           |   8 +-
 GPy/testing/likelihoods_tests.py              | 234 +++++++++++-------
 4 files changed, 285 insertions(+), 121 deletions(-)

diff --git a/GPy/likelihoods/noise_models/bernoulli_noise.py b/GPy/likelihoods/noise_models/bernoulli_noise.py
index 1d45c82e..fc7c5011 100644
--- a/GPy/likelihoods/noise_models/bernoulli_noise.py
+++ b/GPy/likelihoods/noise_models/bernoulli_noise.py
@@ -93,6 +93,110 @@ class Bernoulli(NoiseDistribution):
         p = self.gp_link.transf(gp)
         return (obs/p + (1.-obs)/(1.-p))*self.gp_link.d2transf_df2(gp) + ((1.-obs)/(1.-p)**2-obs/p**2)*self.gp_link.dtransf_df(gp)
 
+    def pdf_link(self, link_f, y, extra_data=None):
+        """
+        Likelihood function given link(f)
+
+        .. math::
+            \\p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in bernoulli
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        .. Note:
+            Each y_{i} must be in {0,1}
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        objective = (link_f**y) * ((1.-link_f)**(1.-y))
+        return np.exp(np.sum(np.log(objective)))
+
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log Likelihood function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = y_{i}\\log\\lambda(f_{i}) + (1-y_{i})\\log (1-f_{i})
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in bernoulli
+        :returns: log likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        objective = np.log(link_f**y) + np.log((1.-link_f)**(1.-y))
+        return np.sum(objective)
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the pdf at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - \\frac{(1 - y_{i})}{(1 - \\lambda(f_{i}))}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: gradient of log likelihood evaluated at points
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        grad = (y/link_f) - (1.-y)/(1-link_f)
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link_f, w.r.t link_f the hessian will be 0 unless i == j
+        i.e. second derivative logpdf at y given link(f_i) link(f_j)  w.r.t link(f_i) and link(f_j)
+
+
+        .. math::
+            \\frac{d^{2}\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)^{2}} = \\frac{-y_{i}}{\\lambda(f)^{2}} - \\frac{(1-y_{i})}{(1-\\lambda(f))^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f))
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        d2logpdf_dlink2 = -y/(link_f**2) - (1-y)/((1-link_f)**2)
+        return d2logpdf_dlink2
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f)^{3}} - \\frac{2(1-y_{i}}{(1-\\lambda(f))^{3}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data not used in gaussian
+        :returns: third derivative of log likelihood evaluated at points link(f)
+        :rtype: Nx1 array
+        """
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        d3logpdf_dlink3 = 2*(y/(link_f**3) - (1-y)/((1-link_f)**3))
+        return d3logpdf_dlink3
+
     def _mean(self,gp):
         """
         Mass (or density) function
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index 2dd0cd64..1c5ac1db 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -102,7 +102,7 @@ class Gaussian(NoiseDistribution):
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
+        :param extra_data: extra_data not used in gaussian
         :returns: likelihood evaluated for this point
         :rtype: float
         """
@@ -121,11 +121,11 @@ class Gaussian(NoiseDistribution):
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
+        :param extra_data: extra_data not used in gaussian
         :returns: log likelihood evaluated for this point
         :rtype: float
         """
-        assert link_f.shape == y.shape
+        assert np.asarray(link_f).shape == np.asarray(y).shape
         return -0.5*(np.sum((y-link_f)**2/self.variance) + self.ln_det_K + self.N*np.log(2.*np.pi))
 
     def dlogpdf_dlink(self, link_f, y, extra_data=None):
@@ -133,17 +133,17 @@ class Gaussian(NoiseDistribution):
         Gradient of the pdf at y, given link(f) w.r.t link(f)
 
         .. math::
-            \\frac{d \\ln p(y_{i}|f_{i})}{df} = \\frac{1}{\\sigma^{2}}(y_{i} - f_{i})
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{1}{\\sigma^{2}}(y_{i} - \\lambda(f_{i}))
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: gradient of log likelihood evaluated at points
+        :param extra_data: extra_data not used in gaussian
+        :returns: gradient of log likelihood evaluated at points link(f)
         :rtype: Nx1 array
         """
-        assert link_f.shape == y.shape
+        assert np.asarray(link_f).shape == np.asarray(y).shape
         s2_i = (1.0/self.variance)
         grad = s2_i*y - s2_i*link_f
         return grad
@@ -151,24 +151,24 @@ class Gaussian(NoiseDistribution):
     def d2logpdf_dlink2(self, link_f, y, extra_data=None):
         """
         Hessian at y, given link_f, w.r.t link_f the hessian will be 0 unless i == j
-        i.e. second derivative _nlog_mass at y given f_{i} f_{j}  w.r.t f_{i} and f_{j}
+        i.e. second derivative logpdf at y given link(f_i) link(f_j)  w.r.t link(f_i) and link(f_j)
 
         .. math::
-            \\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
+            \\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points f)
+        :param extra_data: extra_data not used in gaussian
+        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f))
         :rtype: Nx1 array
 
         .. Note::
             Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-            (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
         """
-        assert link_f.shape == y.shape
+        assert np.asarray(link_f).shape == np.asarray(y).shape
         hess = -(1.0/self.variance)*np.ones((self.N, 1))
         return hess
 
@@ -177,18 +177,18 @@ class Gaussian(NoiseDistribution):
         Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
 
         .. math::
-            \\frac{d^{3} \\ln p(y_{i}|f_{i})}{d^{3}f} = 0
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = 0
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: third derivative of log likelihood evaluated at points f
+        :param extra_data: extra_data not used in gaussian
+        :returns: third derivative of log likelihood evaluated at points link(f)
         :rtype: Nx1 array
         """
-        assert link_f.shape == y.shape
-        d3logpdf_dlink3 = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
+        assert np.asarray(link_f).shape == np.asarray(y).shape
+        d3logpdf_dlink3 = np.diagonal(0*self.I)[:, None]
         return d3logpdf_dlink3
 
     def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
@@ -196,17 +196,17 @@ class Gaussian(NoiseDistribution):
         Gradient of the negative log-likelihood function at y given link(f), w.r.t variance parameter (noise_variance)
 
         .. math::
-            \\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = \\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - f_{i})^{2}}{2\\sigma^{4}}
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - \\lambda(f_{i}))^{2}}{2\\sigma^{4}}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: derivative of log likelihood evaluated at points f w.r.t variance parameter
+        :param extra_data: extra_data not used in gaussian
+        :returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
         :rtype: float
         """
-        assert link_f.shape == y.shape
+        assert np.asarray(link_f).shape == np.asarray(y).shape
         e = y - link_f
         s_4 = 1.0/(self.variance**2)
         dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.dot(e.T, e)
@@ -217,17 +217,17 @@ class Gaussian(NoiseDistribution):
         Derivative of the dlogpdf_dlink w.r.t variance parameter (noise_variance)
 
         .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|f_{i})}{df}) = \\frac{1}{\\sigma^{4}}(-y_{i} + f_{i})
+            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)}) = \\frac{1}{\\sigma^{4}}(-y_{i} + \\lambda(f_{i}))
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: derivative of log likelihood evaluated at points f w.r.t variance parameter
+        :param extra_data: extra_data not used in gaussian
+        :returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
         :rtype: Nx1 array
         """
-        assert link_f.shape == y.shape
+        assert np.asarray(link_f).shape == np.asarray(y).shape
         s_4 = 1.0/(self.variance**2)
         dlik_grad_dsigma = -np.dot(s_4*self.I, y) + np.dot(s_4*self.I, link_f)
         return dlik_grad_dsigma
@@ -237,17 +237,17 @@ class Gaussian(NoiseDistribution):
         Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (noise_variance)
 
         .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f}) = \\frac{1}{\\sigma^{4}}
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}\\lambda(f)}) = \\frac{1}{\\sigma^{4}}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
-        :returns: derivative of log hessian evaluated at points f and f_j w.r.t variance parameter
+        :param extra_data: extra_data not used in gaussian
+        :returns: derivative of log hessian evaluated at points link(f_i) and link(f_j) w.r.t variance parameter
         :rtype: Nx1 array
         """
-        assert link_f.shape == y.shape
+        assert np.asarray(link_f).shape == np.asarray(y).shape
         s_4 = 1.0/(self.variance**2)
         d2logpdf_dlink2_dvar = np.diag(s_4*self.I)[:, None]
         return d2logpdf_dlink2_dvar
diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index 87cfb235..56f42ab2 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -55,7 +55,7 @@ class StudentT(NoiseDistribution):
         :returns: likelihood evaluated for this point
         :rtype: float
         """
-        assert link_f.shape == y.shape
+        assert np.asarray(link_f).shape == np.asarray(y).shape
         e = y - link_f
         #Careful gamma(big_number) is infinity!
         objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
@@ -80,7 +80,7 @@ class StudentT(NoiseDistribution):
         :rtype: float
 
         """
-        assert link_f.shape == y.shape
+        assert np.asarray(link_f).shape == np.asarray(y).shape
         e = y - link_f
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
@@ -113,7 +113,7 @@ class StudentT(NoiseDistribution):
     def d2logpdf_dlink2(self, link_f, y, extra_data=None):
         """
         Hessian at y, given link(f), w.r.t link(f) the hessian will be 0 unless i == j
-        i.e. second derivative lik_function at y given f_{i} f_{j}  w.r.t f_{i} and f_{j}
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
 
         .. math::
             \\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = \\frac{(v+1)((y_{i}-f_{i})^{2} - \\sigma^{2}v)}{((y_{i}-f_{i})^{2} + \\sigma^{2}v)^{2}}
@@ -128,7 +128,7 @@ class StudentT(NoiseDistribution):
 
         .. Note::
             Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
-            (the distribution for y_{i} depends only on f_{i} not on f_{j!=i}
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
         """
         assert y.shape == link_f.shape
         e = y - link_f
diff --git a/GPy/testing/likelihoods_tests.py b/GPy/testing/likelihoods_tests.py
index 84e5f036..449f3e90 100644
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@@ -5,6 +5,7 @@ from GPy.models import GradientChecker
 import functools
 import inspect
 from GPy.likelihoods.noise_models import gp_transformations
+from functools import partial
 
 def dparam_partial(inst_func, *args):
     """
@@ -24,7 +25,7 @@ def dparam_partial(inst_func, *args):
         return inst_func(*args)
     return functools.partial(param_func, inst_func=inst_func, args=args)
 
-def dparam_checkgrad(func, dfunc, params, args, constrain_positive=True, randomize=False, verbose=False):
+def dparam_checkgrad(func, dfunc, params, args, constraints=None, randomize=False, verbose=False):
     """
     checkgrad expects a f: R^N -> R^1 and df: R^N -> R^N
     However if we are holding other parameters fixed and moving something else
@@ -50,8 +51,10 @@ def dparam_checkgrad(func, dfunc, params, args, constrain_positive=True, randomi
             grad = GradientChecker(lambda x: np.atleast_1d(partial_f(x))[f_ind],
                                    lambda x : np.atleast_1d(partial_df(x))[fixed_val],
                                    param, 'p')
-            if constrain_positive:
-                grad.constrain_positive('p')
+            #This is not general for more than one param...
+            if constraints is not None:
+                for constraint in constraints:
+                    constraint('p', grad)
             if randomize:
                 grad.randomize()
             print grad
@@ -77,6 +80,7 @@ class TestNoiseModels(object):
         noise = np.random.randn(*self.X[:, 0].shape)*self.real_std
         self.Y = (np.sin(self.X[:, 0]*2*np.pi) + noise)[:, None]
         self.f = np.random.rand(self.N, 1)
+        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
 
         self.var = 0.2
 
@@ -92,6 +96,22 @@ class TestNoiseModels(object):
 
     def test_noise_models(self):
         self.setUp()
+
+        ####################################################
+        # Constraint wrappers so we can just list them off #
+        ####################################################
+        def constrain_negative(regex, model):
+            model.constrain_negative(regex)
+
+        def constrain_positive(regex, model):
+            model.constrain_positive(regex)
+
+        def constrain_bounded(regex, model, lower, upper):
+            """
+            Used like: partial(constrain_bounded, lower=0, upper=1)
+            """
+            model.constrain_bounded(regex, lower, upper)
+
         """
         Dictionary where we nest models we would like to check
             Name: {
@@ -99,9 +119,10 @@ class TestNoiseModels(object):
                 "grad_params": {
                     "names": [names_of_params_we_want, to_grad_check],
                     "vals": [values_of_params, to_start_at],
-                    "constrain_positive": [boolean_values, of_whether_to_constrain]
+                    "constrain": [constraint_wrappers, listed_here]
                     },
-                "laplace": boolean_of_whether_model_should_work_for_laplace
+                "laplace": boolean_of_whether_model_should_work_for_laplace,
+                "link_f_constraints": [constraint_wrappers, listed_here]
                 }
         """
         noise_models = {"Student_t_default": {
@@ -109,7 +130,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [self.var],
-                                "constrain_positive": [True]
+                                "constraints": [constrain_positive]
                                 },
                             "laplace": True
                             },
@@ -118,7 +139,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [1],
-                                "constrain_positive": [True]
+                                "constraints": [constrain_positive]
                                 },
                             "laplace": True
                             },
@@ -127,7 +148,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [0.01],
-                                "constrain_positive": [True]
+                                "constraints": [constrain_positive]
                                 },
                             "laplace": True
                             },
@@ -136,7 +157,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [self.var],
-                                "constrain_positive": [True]
+                                "constraints": [constrain_positive]
                                 },
                             "laplace": True
                             },
@@ -145,7 +166,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["t_noise"],
                                 "vals": [self.var],
-                                "constrain_positive": [True]
+                                "constraints": [constrain_positive]
                                 },
                             "laplace": True
                             },
@@ -154,7 +175,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["noise_model_variance"],
                                 "vals": [self.var],
-                                "constrain_positive": [True]
+                                "constraints": [constrain_positive]
                                 },
                             "laplace": True
                             },
@@ -163,7 +184,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["noise_model_variance"],
                                 "vals": [self.var],
-                                "constrain_positive": [True]
+                                "constraints": [constrain_positive]
                                 },
                             "laplace": True
                             },
@@ -172,7 +193,7 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["noise_model_variance"],
                                 "vals": [self.var],
-                                "constrain_positive": [True]
+                                "constraints": [constrain_positive]
                                 },
                             "laplace": True
                             },
@@ -181,18 +202,42 @@ class TestNoiseModels(object):
                             "grad_params": {
                                 "names": ["noise_model_variance"],
                                 "vals": [self.var],
-                                "constrain_positive": [True]
+                                "constraints": [constrain_positive]
                                 },
                             "laplace": True
-                            }
+                            },
+                        "Bernoulli_default": {
+                            "model": GPy.likelihoods.bernoulli(),
+                            "link_f_constraints": [partial(constrain_bounded, lower=0, upper=1)],
+                            "laplace": True,
+                            "Y": self.binary_Y,
                         }
+                    }
 
         for name, attributes in noise_models.iteritems():
             model = attributes["model"]
-            params = attributes["grad_params"]
-            param_vals = params["vals"]
-            param_names= params["names"]
-            constrain_positive = params["constrain_positive"]
+            if "grad_params" in attributes:
+                params = attributes["grad_params"]
+                param_vals = params["vals"]
+                param_names= params["names"]
+                param_constraints = params["constraints"]
+            else:
+                params = []
+                param_vals = []
+                param_names = []
+                constrain_positive = []
+            if "link_f_constraints" in attributes:
+                link_f_constraints = attributes["link_f_constraints"]
+            else:
+                link_f_constraints = []
+            if "Y" in attributes:
+                Y = attributes["Y"].copy()
+            else:
+                Y = self.Y.copy()
+            if "f" in attributes:
+                f = attributes["f"].copy()
+            else:
+                f = self.f.copy()
             laplace = attributes["laplace"]
 
             if len(param_vals) > 1:
@@ -200,27 +245,27 @@ class TestNoiseModels(object):
 
             #Required by all
             #Normal derivatives
-            yield self.t_logpdf, model
-            yield self.t_dlogpdf_df, model
-            yield self.t_d2logpdf_df2, model
+            yield self.t_logpdf, model, Y, f
+            yield self.t_dlogpdf_df, model, Y, f
+            yield self.t_d2logpdf_df2, model, Y, f
             #Link derivatives
-            yield self.t_dlogpdf_dlink, model
-            yield self.t_d2logpdf_dlink2, model
+            yield self.t_dlogpdf_dlink, model, Y, f, link_f_constraints
+            yield self.t_d2logpdf_dlink2, model, Y, f, link_f_constraints
             if laplace:
                 #Laplace only derivatives
-                yield self.t_d3logpdf_df3, model
-                yield self.t_d3logpdf_dlink3, model
+                yield self.t_d3logpdf_df3, model, Y, f
+                yield self.t_d3logpdf_dlink3, model, Y, f, link_f_constraints
                 #Params
-                yield self.t_dlogpdf_dparams, model, param_vals
-                yield self.t_dlogpdf_df_dparams, model, param_vals
-                yield self.t_d2logpdf2_df2_dparams, model, param_vals
+                yield self.t_dlogpdf_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_dlogpdf_df_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_d2logpdf2_df2_dparams, model, Y, f, param_vals, param_constraints
                 #Link params
-                yield self.t_dlogpdf_link_dparams, model, param_vals
-                yield self.t_dlogpdf_dlink_dparams, model, param_vals
-                yield self.t_d2logpdf2_dlink2_dparams, model, param_vals
+                yield self.t_dlogpdf_link_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_dlogpdf_dlink_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, param_vals, param_constraints
 
                 #laplace likelihood gradcheck
-                yield self.t_laplace_fit_rbf_white, model, param_vals, param_names, constrain_positive
+                yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
 
         self.tearDown()
 
@@ -228,42 +273,42 @@ class TestNoiseModels(object):
     # dpdf_df's #
     #############
     @with_setup(setUp, tearDown)
-    def t_logpdf(self, model):
+    def t_logpdf(self, model, Y, f):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         np.testing.assert_almost_equal(
-                               np.log(model.pdf(self.f.copy(), self.Y.copy())),
-                               model.logpdf(self.f.copy(), self.Y.copy()))
+                               np.log(model.pdf(f.copy(), Y.copy())),
+                               model.logpdf(f.copy(), Y.copy()))
 
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_df(self, model):
+    def t_dlogpdf_df(self, model, Y, f):
         print "\n{}".format(inspect.stack()[0][3])
         self.description = "\n{}".format(inspect.stack()[0][3])
-        logpdf = functools.partial(model.logpdf, y=self.Y)
-        dlogpdf_df = functools.partial(model.dlogpdf_df, y=self.Y)
-        grad = GradientChecker(logpdf, dlogpdf_df, self.f.copy(), 'g')
+        logpdf = functools.partial(model.logpdf, y=Y)
+        dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
+        grad = GradientChecker(logpdf, dlogpdf_df, f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
         print model
         assert grad.checkgrad()
 
     @with_setup(setUp, tearDown)
-    def t_d2logpdf_df2(self, model):
+    def t_d2logpdf_df2(self, model, Y, f):
         print "\n{}".format(inspect.stack()[0][3])
-        dlogpdf_df = functools.partial(model.dlogpdf_df, y=self.Y)
-        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=self.Y)
-        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, self.f.copy(), 'g')
+        dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
+        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
+        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
         print model
         assert grad.checkgrad()
 
     @with_setup(setUp, tearDown)
-    def t_d3logpdf_df3(self, model):
+    def t_d3logpdf_df3(self, model, Y, f):
         print "\n{}".format(inspect.stack()[0][3])
-        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=self.Y)
-        d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=self.Y)
-        grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, self.f.copy(), 'g')
+        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
+        d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=Y)
+        grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
         print model
@@ -273,32 +318,32 @@ class TestNoiseModels(object):
     # df_dparams #
     ##############
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_dparams(self, model, params):
+    def t_dlogpdf_dparams(self, model, Y, f, params, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
-                    params, args=(self.f, self.Y), constrain_positive=True,
+                    params, args=(f, Y), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_df_dparams(self, model, params):
+    def t_dlogpdf_df_dparams(self, model, Y, f, params, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
-                    params, args=(self.f, self.Y), constrain_positive=True,
+                    params, args=(f, Y), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
-    def t_d2logpdf2_df2_dparams(self, model, params):
+    def t_d2logpdf2_df2_dparams(self, model, Y, f, params, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
-                    params, args=(self.f, self.Y), constrain_positive=True,
+                    params, args=(f, Y), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
@@ -306,33 +351,48 @@ class TestNoiseModels(object):
     # dpdf_dlink's #
     ################
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_dlink(self, model):
+    def t_dlogpdf_dlink(self, model, Y, f, link_f_constraints):
         print "\n{}".format(inspect.stack()[0][3])
-        logpdf = functools.partial(model.logpdf_link, y=self.Y)
-        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=self.Y)
-        grad = GradientChecker(logpdf, dlogpdf_dlink, self.f.copy(), 'g')
+        logpdf = functools.partial(model.logpdf_link, y=Y)
+        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
+        grad = GradientChecker(logpdf, dlogpdf_dlink, f.copy(), 'g')
+
+        #Apply constraints to link_f values
+        for constraint in link_f_constraints:
+            constraint('g', grad)
+
+        grad.randomize()
+        print grad
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf_dlink2(self, model, Y, f, link_f_constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
+        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
+        grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, f.copy(), 'g')
+
+        #Apply constraints to link_f values
+        for constraint in link_f_constraints:
+            constraint('g', grad)
+
         grad.randomize()
         grad.checkgrad(verbose=1)
         print grad
         assert grad.checkgrad()
 
     @with_setup(setUp, tearDown)
-    def t_d2logpdf_dlink2(self, model):
+    def t_d3logpdf_dlink3(self, model, Y, f, link_f_constraints):
         print "\n{}".format(inspect.stack()[0][3])
-        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=self.Y)
-        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=self.Y)
-        grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, self.f.copy(), 'g')
-        grad.randomize()
-        grad.checkgrad(verbose=1)
-        print grad
-        assert grad.checkgrad()
+        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
+        d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=Y)
+        grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, f.copy(), 'g')
+
+        #Apply constraints to link_f values
+        for constraint in link_f_constraints:
+            constraint('g', grad)
 
-    @with_setup(setUp, tearDown)
-    def t_d3logpdf_dlink3(self, model):
-        print "\n{}".format(inspect.stack()[0][3])
-        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=self.Y)
-        d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=self.Y)
-        grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, self.f.copy(), 'g')
         grad.randomize()
         grad.checkgrad(verbose=1)
         print grad
@@ -342,32 +402,32 @@ class TestNoiseModels(object):
     # dlink_dparams #
     #################
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_link_dparams(self, model, params):
+    def t_dlogpdf_link_dparams(self, model, Y, f, params, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta,
-                    params, args=(self.f, self.Y), constrain_positive=True,
+                    params, args=(f, Y), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
-    def t_dlogpdf_dlink_dparams(self, model, params):
+    def t_dlogpdf_dlink_dparams(self, model, Y, f, params, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta,
-                    params, args=(self.f, self.Y), constrain_positive=True,
+                    params, args=(f, Y), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
-    def t_d2logpdf2_dlink2_dparams(self, model, params):
+    def t_d2logpdf2_dlink2_dparams(self, model, Y, f, params, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
         assert (
                 dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta,
-                    params, args=(self.f, self.Y), constrain_positive=True,
+                    params, args=(f, Y), constraints=param_constraints,
                     randomize=False, verbose=True)
                 )
 
@@ -375,26 +435,26 @@ class TestNoiseModels(object):
     # laplace test #
     ################
     @with_setup(setUp, tearDown)
-    def t_laplace_fit_rbf_white(self, model, param_vals, param_names, constrain_positive):
+    def t_laplace_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
         print "\n{}".format(inspect.stack()[0][3])
-        self.Y = self.Y/self.Y.max()
+        #Normalize
+        Y = Y/Y.max()
         white_var = 0.001
-        kernel = GPy.kern.rbf(self.X.shape[1]) + GPy.kern.white(self.X.shape[1])
-        laplace_likelihood = GPy.likelihoods.Laplace(self.Y.copy(), model)
-        m = GPy.models.GPRegression(self.X, self.Y.copy(), kernel, likelihood=laplace_likelihood)
+        kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), model)
+        m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=laplace_likelihood)
         m.ensure_default_constraints()
         m.constrain_fixed('white', white_var)
 
         for param_num in range(len(param_names)):
             name = param_names[param_num]
-            if constrain_positive[param_num]:
-                m.constrain_positive(name)
             m[name] = param_vals[param_num]
+            constraints[param_num](name, m)
 
         m.randomize()
-        m.checkgrad(verbose=1, step=self.step)
+        m.checkgrad(verbose=1, step=step)
         print m
-        assert m.checkgrad(step=self.step)
+        assert m.checkgrad(step=step)
 
 
 class LaplaceTests(unittest.TestCase):

From 10f3f7d14a9b3b9decb7bbff7f8fca9d50a421a5 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 17 Oct 2013 18:33:08 +0100
Subject: [PATCH 121/252] Refactored gradients wrt parameters slightly, need to
 future proof against _get_param_names() disappearing

---
 GPy/likelihoods/laplace.py                    |  5 ++-
 .../noise_models/noise_distributions.py       | 42 ++++++++++++-------
 2 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 8019e430..33594da8 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -123,7 +123,9 @@ class Laplace(likelihood):
         dL_dfhat, I_KW_i = self._shared_gradients_components()
         dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = self.noise_model._laplace_gradients(self.f_hat, self.data, extra_data=self.extra_data)
 
-        num_params = len(dlik_dthetaL)
+        #len(dlik_dthetaL)
+        num_params = len(self._get_param_names())
+        print num_params
         # make space for one derivative for each likelihood parameter
         dL_dthetaL = np.zeros(num_params)
         for thetaL_i in range(num_params):
@@ -138,6 +140,7 @@ class Laplace(likelihood):
             dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
             dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
 
+        print dL_dthetaL
         return dL_dthetaL
 
     def _compute_GP_variables(self):
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index dc3a7de5..0bb106b2 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -270,6 +270,7 @@ class NoiseDistribution(object):
     def _predictive_mean_numerical(self,mu,sigma):
         """
         Laplace approximation to the predictive mean: E(Y_star|Y) = E( E(Y_star|f_star, Y) )
+        if self.
 
         :param mu: cavity distribution mean
         :param sigma: cavity distribution standard deviation
@@ -541,32 +542,45 @@ class NoiseDistribution(object):
         """
         TODO: Doc strings
         """
-        link_f = self.gp_link.transf(f)
-        return self.dlogpdf_link_dtheta(link_f, y, extra_data=extra_data)
+        if len(self._get_param_names()) > 0:
+            link_f = self.gp_link.transf(f)
+            return self.dlogpdf_link_dtheta(link_f, y, extra_data=extra_data)
+        else:
+            #Is no parameters so return an empty array for its derivatives
+            return np.empty([1, 0])
 
     def dlogpdf_df_dtheta(self, f, y, extra_data=None):
         """
         TODO: Doc strings
         """
-        link_f = self.gp_link.transf(f)
-        dlink_df = self.gp_link.dtransf_df(f)
-        dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
-        return chain_1(dlogpdf_dlink_dtheta, dlink_df)
+        if len(self._get_param_names()) > 0:
+            link_f = self.gp_link.transf(f)
+            dlink_df = self.gp_link.dtransf_df(f)
+            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
+            return chain_1(dlogpdf_dlink_dtheta, dlink_df)
+        else:
+            #Is no parameters so return an empty array for its derivatives
+            return np.empty([f.shape[0], 0])
 
     def d2logpdf_df2_dtheta(self, f, y, extra_data=None):
         """
         TODO: Doc strings
         """
-        link_f = self.gp_link.transf(f)
-        dlink_df = self.gp_link.dtransf_df(f)
-        d2link_df2 = self.gp_link.d2transf_df2(f)
-        d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, extra_data=extra_data)
-        dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
-        #FIXME: Why isn't this chain_1?
-        #return chain_1(d2logpdf_dlink2_dtheta, d2link_df2)
-        return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
+        if len(self._get_param_names()) > 0:
+            link_f = self.gp_link.transf(f)
+            dlink_df = self.gp_link.dtransf_df(f)
+            d2link_df2 = self.gp_link.d2transf_df2(f)
+            d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, extra_data=extra_data)
+            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
+            #FIXME: Why isn't this chain_1?
+            #return chain_1(d2logpdf_dlink2_dtheta, d2link_df2)
+            return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
+        else:
+            #Is no parameters so return an empty array for its derivatives
+            return np.empty([f.shape[0], 0])
 
     def _laplace_gradients(self, f, y, extra_data=None):
+        #Bit nasty we recompute thesesome of these but it keeps it modular
         #link_f = self.gp_link.transf(f)
         #dlink_df = self.gp_link.dtransf_df(f)
         #d2link_df2 = self.gp_link.d2transf_df2(f)

From 0eee4b42d23aae7f4fa861dc8fe5e6bee2c4cd91 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 18 Oct 2013 14:08:37 +0100
Subject: [PATCH 122/252] Fixed a few laplace bits

---
 GPy/examples/classification.py                | 37 ++++++++++++++++++-
 GPy/likelihoods/laplace.py                    | 15 +++++---
 .../noise_models/bernoulli_noise.py           | 26 +++----------
 .../noise_models/student_t_noise.py           |  3 +-
 4 files changed, 52 insertions(+), 29 deletions(-)

diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index 0630537b..38559105 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -43,7 +43,7 @@ def oil(num_inducing=50, max_iters=100, kernel=None):
 
 def toy_linear_1d_classification(seed=default_seed):
     """
-    Simple 1D classification example
+    Simple 1D classification example using EP approximation
 
     :param seed: seed value for data generation (default is 4).
     :type seed: int
@@ -71,6 +71,41 @@ def toy_linear_1d_classification(seed=default_seed):
 
     return m
 
+def toy_linear_1d_classification_laplace(seed=default_seed):
+    """
+    Simple 1D classification example using Laplace approximation
+
+    :param seed: seed value for data generation (default is 4).
+    :type seed: int
+
+    """
+
+    data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
+    Y = data['Y'][:, 0:1]
+    Y[Y.flatten() == -1] = 0
+
+    bern_noise_model = GPy.likelihoods.bernoulli()
+    laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), bern_noise_model)
+
+    # Model definition
+    m = GPy.models.GPClassification(data['X'], Y, likelihood=laplace_likelihood)
+
+    print m
+    # Optimize
+    #m.update_likelihood_approximation()
+    # Parameters optimization:
+    m.optimize(messages=1)
+    #m.pseudo_EM()
+
+    # Plot
+    fig, axes = pb.subplots(2,1)
+    m.plot_f(ax=axes[0])
+    m.plot(ax=axes[1])
+    print(m)
+
+    return m
+
+
 def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
     """
     Sparse 1D classification example
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 33594da8..e6ffd78c 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -1,6 +1,14 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
+#
+#Parts of this file were influenced by the Matlab GPML framework written by
+#Carl Edward Rasmussen & Hannes Nickisch, however all bugs are our own.
+#
+#The GPML code is released under the FreeBSD License.
+#Copyright (c) 2005-2013 Carl Edward Rasmussen & Hannes Nickisch. All rights reserved.
+#
+#The code and associated documentation is available from
+#http://gaussianprocess.org/gpml/code.
 
 import numpy as np
 import scipy as sp
@@ -32,7 +40,6 @@ class Laplace(likelihood):
         :param noise_model: likelihood function - subclass of noise_model
         :type noise_model: noise_model
         :param extra_data: additional data used by some likelihood functions,
-                           for example survival likelihoods need censoring data
         """
         self.data = data
         self.noise_model = noise_model
@@ -125,7 +132,6 @@ class Laplace(likelihood):
 
         #len(dlik_dthetaL)
         num_params = len(self._get_param_names())
-        print num_params
         # make space for one derivative for each likelihood parameter
         dL_dthetaL = np.zeros(num_params)
         for thetaL_i in range(num_params):
@@ -140,7 +146,6 @@ class Laplace(likelihood):
             dL_dthetaL_imp = np.dot(dL_dfhat, dfhat_dthetaL)
             dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
 
-        print dL_dthetaL
         return dL_dthetaL
 
     def _compute_GP_variables(self):
@@ -265,7 +270,7 @@ class Laplace(likelihood):
         ln_B_det = 2*np.sum(np.log(np.diag(L)))
         return W12BiW12, ln_B_det
 
-    def rasm_mode(self, K, MAX_ITER=100):
+    def rasm_mode(self, K, MAX_ITER=30):
         """
         Rasmussen's numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
diff --git a/GPy/likelihoods/noise_models/bernoulli_noise.py b/GPy/likelihoods/noise_models/bernoulli_noise.py
index fc7c5011..7ef8aa82 100644
--- a/GPy/likelihoods/noise_models/bernoulli_noise.py
+++ b/GPy/likelihoods/noise_models/bernoulli_noise.py
@@ -58,6 +58,8 @@ class Bernoulli(NoiseDistribution):
             sigma2_hat = (1. - a*N/Z_hat - np.square(N/Z_hat))/tau_i
             if np.any(np.isnan([Z_hat, mu_hat, sigma2_hat])):
                 stop
+        else:
+            raise ValueError("Exact moment matching not available for link {}".format(self.gp_link.gp_transformations.__name__))
 
         return Z_hat, mu_hat, sigma2_hat
 
@@ -75,24 +77,6 @@ class Bernoulli(NoiseDistribution):
         else:
             raise NotImplementedError
 
-    def _mass(self,gp,obs):
-        #NOTE obs must be in {0,1}
-        p = self.gp_link.transf(gp)
-        return p**obs * (1.-p)**(1.-obs)
-
-    def _nlog_mass(self,gp,obs):
-        p = self.gp_link.transf(gp)
-        return obs*np.log(p) + (1.-obs)*np.log(1-p)
-
-    def _dnlog_mass_dgp(self,gp,obs):
-        p = self.gp_link.transf(gp)
-        dp = self.gp_link.dtransf_df(gp)
-        return obs/p * dp - (1.-obs)/(1.-p) * dp
-
-    def _d2nlog_mass_dgp2(self,gp,obs):
-        p = self.gp_link.transf(gp)
-        return (obs/p + (1.-obs)/(1.-p))*self.gp_link.d2transf_df2(gp) + ((1.-obs)/(1.-p)**2-obs/p**2)*self.gp_link.dtransf_df(gp)
-
     def pdf_link(self, link_f, y, extra_data=None):
         """
         Likelihood function given link(f)
@@ -109,7 +93,7 @@ class Bernoulli(NoiseDistribution):
         :rtype: float
 
         .. Note:
-            Each y_{i} must be in {0,1}
+            Each y_i must be in {0,1}
         """
         assert np.asarray(link_f).shape == np.asarray(y).shape
         objective = (link_f**y) * ((1.-link_f)**(1.-y))
@@ -131,7 +115,8 @@ class Bernoulli(NoiseDistribution):
         :rtype: float
         """
         assert np.asarray(link_f).shape == np.asarray(y).shape
-        objective = np.log(link_f**y) + np.log((1.-link_f)**(1.-y))
+        #objective = y*np.log(link_f) + (1.-y)*np.log(link_f)
+        objective = np.where(y==1, np.log(link_f), np.log(1-link_f))
         return np.sum(objective)
 
     def dlogpdf_dlink(self, link_f, y, extra_data=None):
@@ -222,7 +207,6 @@ class Bernoulli(NoiseDistribution):
     def _d2variance_dgp2(self,gp):
         return self.gp_link.d2transf_df2(gp)*(1. - 2.*self.gp_link.transf(gp)) - 2*self.gp_link.dtransf_df(gp)**2
 
-
     def samples(self, gp):
         """
         Returns a set of samples of observations based on a given value of the latent variable.
diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index 56f42ab2..49de781f 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -233,7 +233,7 @@ class StudentT(NoiseDistribution):
 
     def _predictive_variance_analytical(self, mu, sigma, predictive_mean=None):
         """
-        Compute  mean, and conficence interval (percentiles 5 and 95) of the prediction
+        Compute predictive variance of student_t*normal p(y*|f*)p(f*)
 
         Need to find what the variance is at the latent points for a student t*normal p(y*|f*)p(f*)
         (((g((v+1)/2))/(g(v/2)*s*sqrt(v*pi)))*(1+(1/v)*((y-f)/s)^2)^(-(v+1)/2))
@@ -313,4 +313,3 @@ class StudentT(NoiseDistribution):
         p_025 = mu - p
         p_975 = mu + p
         return mu, np.nan*mu, p_025, p_975
-

From ceb1f7490db77689575ef101df9a9324253ebee9 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 18 Oct 2013 16:11:47 +0100
Subject: [PATCH 123/252] Added quadrature numerical moment matching (but not
 predictive yet)

---
 .../noise_models/noise_distributions.py       | 54 ++++++++++++-------
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 0bb106b2..82071a50 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -10,6 +10,7 @@ from GPy.util.plot import gpplot
 from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import gp_transformations
 from GPy.util.misc import chain_1, chain_2, chain_3
+from scipy.integrate import quad
 
 
 class NoiseDistribution(object):
@@ -125,9 +126,41 @@ class NoiseDistribution(object):
         """
         If available, this function computes the moments analytically.
         """
-        pass
+        raise NotImplementedError
 
     def _moments_match_numerical(self,obs,tau,v):
+        """
+        Calculation of moments using quadrature
+
+        :param obs: observed output
+        :param tau: cavity distribution 1st natural parameter (precision)
+        :param v: cavity distribution 2nd natural paramenter (mu*precision)
+        """
+        #Compute first integral for zeroth moment
+        mu = v/tau
+        def int_1(f):
+            return self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
+        z, accuracy = quad(int_1, -np.inf, np.inf)
+        z /= np.sqrt(2*np.pi/tau)
+
+        #Compute second integral for first moment
+        def int_2(f):
+            return f*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
+        mean, accuracy = quad(int_2, -np.inf, np.inf)
+        mean /= np.sqrt(2*np.pi/tau)
+        mean /= z
+
+        #Compute integral for variance
+        def int_3(f):
+            return (f**2)*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
+        Ef2, accuracy = quad(int_3, -np.inf, np.inf)
+        Ef2 /= np.sqrt(2*np.pi/tau)
+        Ef2 /= z
+        variance = Ef2 - mean**2
+
+        return z, mean, variance
+
+    def _moments_match_numerical_laplace(self,obs,tau,v):
         """
         Lapace approximation to calculate the moments.
 
@@ -255,7 +288,7 @@ class NoiseDistribution(object):
 
         If available, this function computes the predictive mean analytically.
         """
-        pass
+        raise NotImplementedError
 
     def _predictive_variance_analytical(self,mu,sigma):
         """
@@ -265,7 +298,7 @@ class NoiseDistribution(object):
 
         If available, this function computes the predictive variance analytically.
         """
-        pass
+        raise NotImplementedError
 
     def _predictive_mean_numerical(self,mu,sigma):
         """
@@ -572,27 +605,12 @@ class NoiseDistribution(object):
             d2link_df2 = self.gp_link.d2transf_df2(f)
             d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, extra_data=extra_data)
             dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
-            #FIXME: Why isn't this chain_1?
-            #return chain_1(d2logpdf_dlink2_dtheta, d2link_df2)
             return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
         else:
             #Is no parameters so return an empty array for its derivatives
             return np.empty([f.shape[0], 0])
 
     def _laplace_gradients(self, f, y, extra_data=None):
-        #Bit nasty we recompute thesesome of these but it keeps it modular
-        #link_f = self.gp_link.transf(f)
-        #dlink_df = self.gp_link.dtransf_df(f)
-        #d2link_df2 = self.gp_link.d2transf_df2(f)
-
-        #dlogpdf_dtheta = self.dlogpdf_dtheta(link_f, y, extra_data=extra_data)
-        #dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(link_f, y, extra_data=extra_data)
-        #d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(link_f, y, extra_data=extra_data)
-
-        ##now chain them all with dlink_df etc
-        #dlogpdf_df_dtheta = chain_1(dlogpdf_dlink_dtheta, dlink_df)
-        #d2logpdf_df2_dtheta = chain_1(d2logpdf_dlink2_dtheta, d2link_df2)
-
         dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, extra_data=extra_data)
         dlogpdf_df_dtheta = self.dlogpdf_df_dtheta(f, y, extra_data=extra_data)
         d2logpdf_df2_dtheta = self.d2logpdf_df2_dtheta(f, y, extra_data=extra_data)

From a3422eae218ae7a4b97d48c8fc9afc6436fce250 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 22 Oct 2013 13:37:12 +0100
Subject: [PATCH 124/252] Doc stringing

---
 .../noise_models/bernoulli_noise.py           | 26 +++++++------
 .../noise_models/gaussian_noise.py            | 25 +++++++-----
 .../noise_models/noise_distributions.py       |  7 +---
 .../noise_models/student_t_noise.py           | 39 ++++++++++---------
 doc/GPy.likelihoods.noise_models.rst          |  6 +--
 doc/GPy.testing.rst                           |  8 ++++
 6 files changed, 61 insertions(+), 50 deletions(-)

diff --git a/GPy/likelihoods/noise_models/bernoulli_noise.py b/GPy/likelihoods/noise_models/bernoulli_noise.py
index 7ef8aa82..1d27d48b 100644
--- a/GPy/likelihoods/noise_models/bernoulli_noise.py
+++ b/GPy/likelihoods/noise_models/bernoulli_noise.py
@@ -11,12 +11,14 @@ from noise_distributions import NoiseDistribution
 
 class Bernoulli(NoiseDistribution):
     """
-    Probit likelihood
-    Y is expected to take values in {-1,1}
-    -----
-    $$
-    L(x) = \\Phi (Y_i*f_i)
-    $$
+    Bernoulli likelihood
+
+    .. math::
+        p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}}
+
+    .. Note::
+        Y is expected to take values in {-1,1}
+        Probit likelihood usually used
     """
     def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
         super(Bernoulli, self).__init__(gp_link,analytical_mean,analytical_variance)
@@ -82,7 +84,7 @@ class Bernoulli(NoiseDistribution):
         Likelihood function given link(f)
 
         .. math::
-            \\p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}}
+            p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})^{y_{i}}(1-f_{i})^{1-y_{i}}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
@@ -111,7 +113,7 @@ class Bernoulli(NoiseDistribution):
         :param y: data
         :type y: Nx1 array
         :param extra_data: extra_data not used in bernoulli
-        :returns: log likelihood evaluated for this point
+        :returns: log likelihood evaluated at points link(f)
         :rtype: float
         """
         assert np.asarray(link_f).shape == np.asarray(y).shape
@@ -130,8 +132,8 @@ class Bernoulli(NoiseDistribution):
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data not used in gaussian
-        :returns: gradient of log likelihood evaluated at points
+        :param extra_data: extra_data not used in bernoulli
+        :returns: gradient of log likelihood evaluated at points link(f)
         :rtype: Nx1 array
         """
         assert np.asarray(link_f).shape == np.asarray(y).shape
@@ -151,7 +153,7 @@ class Bernoulli(NoiseDistribution):
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data not used in gaussian
+        :param extra_data: extra_data not used in bernoulli
         :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points link(f))
         :rtype: Nx1 array
 
@@ -174,7 +176,7 @@ class Bernoulli(NoiseDistribution):
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data not used in gaussian
+        :param extra_data: extra_data not used in bernoulli
         :returns: third derivative of log likelihood evaluated at points link(f)
         :rtype: Nx1 array
         """
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index 1c5ac1db..63d3a52a 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -12,12 +12,15 @@ class Gaussian(NoiseDistribution):
     """
     Gaussian likelihood
 
-    :param mean: mean value of the Gaussian distribution
-    :param variance: mean value of the Gaussian distribution
+    .. math::
+        \\ln p(y_{i}|\\lambda(f_{i})) = -\\frac{N \\ln 2\\pi}{2} - \\frac{\\ln |K|}{2} - \\frac{(y_{i} - \\lambda(f_{i}))^{T}\\sigma^{-2}(y_{i} - \\lambda(f_{i}))}{2}
+
+    :param variance: variance value of the Gaussian distribution
+    :param N: Number of data points
+    :type N: int
     """
     def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,variance=1., D=None, N=None):
         self.variance = variance
-        self.D = D
         self.N = N
         self._set_params(np.asarray(variance))
         super(Gaussian, self).__init__(gp_link,analytical_mean,analytical_variance)
@@ -109,7 +112,6 @@ class Gaussian(NoiseDistribution):
         #Assumes no covariance, exp, sum, log for numerical stability
         return np.exp(np.sum(np.log(stats.norm.pdf(y, link_f, np.sqrt(self.variance)))))
 
-
     def logpdf_link(self, link_f, y, extra_data=None):
         """
         Log likelihood function given link(f)
@@ -150,9 +152,11 @@ class Gaussian(NoiseDistribution):
 
     def d2logpdf_dlink2(self, link_f, y, extra_data=None):
         """
-        Hessian at y, given link_f, w.r.t link_f the hessian will be 0 unless i == j
+        Hessian at y, given link_f, w.r.t link_f.
         i.e. second derivative logpdf at y given link(f_i) link(f_j)  w.r.t link(f_i) and link(f_j)
 
+        The hessian will be 0 unless i == j
+
         .. math::
             \\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}f} = -\\frac{1}{\\sigma^{2}}
 
@@ -193,10 +197,10 @@ class Gaussian(NoiseDistribution):
 
     def dlogpdf_link_dvar(self, link_f, y, extra_data=None):
         """
-        Gradient of the negative log-likelihood function at y given link(f), w.r.t variance parameter (noise_variance)
+        Gradient of the log-likelihood function at y given link(f), w.r.t variance parameter (noise_variance)
 
         .. math::
-            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - \\lambda(f_{i}))^{2}}{2\\sigma^{4}}
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\sigma^{2}} = -\\frac{N}{2\\sigma^{2}} + \\frac{(y_{i} - \\lambda(f_{i}))^{2}}{2\\sigma^{4}}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
@@ -209,7 +213,7 @@ class Gaussian(NoiseDistribution):
         assert np.asarray(link_f).shape == np.asarray(y).shape
         e = y - link_f
         s_4 = 1.0/(self.variance**2)
-        dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.dot(e.T, e)
+        dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.square(e)
         return np.sum(dlik_dsigma) # Sure about this sum?
 
     def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
@@ -228,8 +232,9 @@ class Gaussian(NoiseDistribution):
         :rtype: Nx1 array
         """
         assert np.asarray(link_f).shape == np.asarray(y).shape
-        s_4 = 1.0/(self.variance**2)
-        dlik_grad_dsigma = -np.dot(s_4*self.I, y) + np.dot(s_4*self.I, link_f)
+        s_4 = 1./(self.variance**2)
+        #dlik_grad_dsigma = -np.dot(s_4*self.I, y) + np.dot(s_4*self.I, link_f)
+        dlik_grad_dsigma = -s_4*y + s_4*link_f
         return dlik_grad_dsigma
 
     def d2logpdf_dlink2_dvar(self, link_f, y, extra_data=None):
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 82071a50..897986a5 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -12,14 +12,9 @@ import gp_transformations
 from GPy.util.misc import chain_1, chain_2, chain_3
 from scipy.integrate import quad
 
-
 class NoiseDistribution(object):
     """
-    Likelihood class for doing Expectation propagation
-
-    :param Y: observed output (Nx1 numpy.darray)
-
-    .. note:: Y values allowed depend on the LikelihoodFunction used
+    Likelihood class for doing approximations
     """
     def __init__(self,gp_link,analytical_mean=False,analytical_variance=False):
         assert isinstance(gp_link,gp_transformations.GPTransformation), "gp_link is not a valid GPTransformation."
diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index 49de781f..7937a507 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -16,7 +16,7 @@ class StudentT(NoiseDistribution):
     For nomanclature see Bayesian Data Analysis 2003 p576
 
     .. math::
-        \\ln p(y_{i}|f_{i}) = \\ln \\Gamma(\\frac{v+1}{2}) - \\ln \\Gamma(\\frac{v}{2})\\sqrt{v \\pi}\\sigma - \\frac{v+1}{2}\\ln (1 + \\frac{1}{v}\\left(\\frac{y_{i} - f_{i}}{\\sigma}\\right)^2)
+        p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
 
     """
     def __init__(self,gp_link=None,analytical_mean=True,analytical_variance=True, deg_free=5, sigma2=2):
@@ -45,13 +45,13 @@ class StudentT(NoiseDistribution):
         Likelihood function given link(f)
 
         .. math::
-            \\ln p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
+            p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \\lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
+        :param extra_data: extra_data which is not used in student t distribution
         :returns: likelihood evaluated for this point
         :rtype: float
         """
@@ -69,13 +69,13 @@ class StudentT(NoiseDistribution):
         Log Likelihood Function given link(f)
 
         .. math::
-            \\ln p(y_{i}|f_{i}) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)
+            \\ln p(y_{i}|\lambda(f_{i})) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)
 
         :param link_f: latent variables (link(f))
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
+        :param extra_data: extra_data which is not used in student t distribution
         :returns: likelihood evaluated for this point
         :rtype: float
 
@@ -94,13 +94,13 @@ class StudentT(NoiseDistribution):
         Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
 
         .. math::
-            \\frac{d \\ln p(y_{i}|f_{i})}{df} = \\frac{(v+1)(y_{i}-f_{i})}{(y_{i}-f_{i})^{2} + \\sigma^{2}v}
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{(v+1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v}
 
         :param link_f: latent variables (f)
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
+        :param extra_data: extra_data which is not used in student t distribution
         :returns: gradient of likelihood evaluated at points
         :rtype: Nx1 array
 
@@ -112,17 +112,18 @@ class StudentT(NoiseDistribution):
 
     def d2logpdf_dlink2(self, link_f, y, extra_data=None):
         """
-        Hessian at y, given link(f), w.r.t link(f) the hessian will be 0 unless i == j
+        Hessian at y, given link(f), w.r.t link(f)
         i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
 
         .. math::
-            \\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f} = \\frac{(v+1)((y_{i}-f_{i})^{2} - \\sigma^{2}v)}{((y_{i}-f_{i})^{2} + \\sigma^{2}v)^{2}}
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{(v+1)((y_{i}-\lambda(f_{i}))^{2} - \\sigma^{2}v)}{((y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v)^{2}}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
+        :param extra_data: extra_data which is not used in student t distribution
         :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
         :rtype: Nx1 array
 
@@ -137,16 +138,16 @@ class StudentT(NoiseDistribution):
 
     def d3logpdf_dlink3(self, link_f, y, extra_data=None):
         """
-        Third order derivative log-likelihood function at y given f w.r.t f
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
 
         .. math::
-            \\frac{d^{3} \\ln p(y_{i}|f_{i})}{d^{3}f} = \\frac{-2(v+1)((y_{i} - f_{i})^3 - 3(y_{i} - f_{i}) \\sigma^{2} v))}{((y_{i} - f_{i}) + \\sigma^{2} v)^3}
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{-2(v+1)((y_{i} - \lambda(f_{i}))^3 - 3(y_{i} - \lambda(f_{i})) \\sigma^{2} v))}{((y_{i} - \lambda(f_{i})) + \\sigma^{2} v)^3}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
+        :param extra_data: extra_data which is not used in student t distribution
         :returns: third derivative of likelihood evaluated at points f
         :rtype: Nx1 array
         """
@@ -162,13 +163,13 @@ class StudentT(NoiseDistribution):
         Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise)
 
         .. math::
-            \\frac{d \\ln p(y_{i}|f_{i})}{d\\sigma^{2}} = \\frac{v((y_{i} - f_{i})^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - f_{i})^{2})}
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{v((y_{i} - \lambda(f_{i}))^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
+        :param extra_data: extra_data which is not used in student t distribution
         :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
         :rtype: float
         """
@@ -182,13 +183,13 @@ class StudentT(NoiseDistribution):
         Derivative of the dlogpdf_dlink w.r.t variance parameter (t_noise)
 
         .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|f_{i})}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-f_{i})}{(y_{i}-f_{i})^2 + \\sigma^2 v)^2}
+            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^2 + \\sigma^2 v)^2}
 
         :param link_f: latent variables link_f
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
+        :param extra_data: extra_data which is not used in student t distribution
         :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
         :rtype: Nx1 array
         """
@@ -202,13 +203,13 @@ class StudentT(NoiseDistribution):
         Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (t_noise)
 
         .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|f_{i})}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - f_{i})^{2})}{(\\sigma^{2}v + (y_{i} - f_{i})^{2})^{3}}
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - \lambda(f_{i}))^{2})}{(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})^{3}}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
         :param y: data
         :type y: Nx1 array
-        :param extra_data: extra_data which is not used in student t distribution - not used
+        :param extra_data: extra_data which is not used in student t distribution
         :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
         :rtype: Nx1 array
         """
diff --git a/doc/GPy.likelihoods.noise_models.rst b/doc/GPy.likelihoods.noise_models.rst
index c16ee7d1..6fec5aff 100644
--- a/doc/GPy.likelihoods.noise_models.rst
+++ b/doc/GPy.likelihoods.noise_models.rst
@@ -4,10 +4,10 @@ GPy.likelihoods.noise_models package
 Submodules
 ----------
 
-GPy.likelihoods.noise_models.binomial_noise module
---------------------------------------------------
+GPy.likelihoods.noise_models.bernoulli_noise module
+---------------------------------------------------
 
-.. automodule:: GPy.likelihoods.noise_models.binomial_noise
+.. automodule:: GPy.likelihoods.noise_models.bernoulli_noise
     :members:
     :undoc-members:
     :show-inheritance:
diff --git a/doc/GPy.testing.rst b/doc/GPy.testing.rst
index 2d41d5fc..98b001c0 100644
--- a/doc/GPy.testing.rst
+++ b/doc/GPy.testing.rst
@@ -36,6 +36,14 @@ GPy.testing.examples_tests module
     :undoc-members:
     :show-inheritance:
 
+GPy.testing.gp_transformation_tests module
+------------------------------------------
+
+.. automodule:: GPy.testing.gp_transformation_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 GPy.testing.gplvm_tests module
 ------------------------------
 

From eacf622ac74de38ccdd18c97dc27d4521409d40e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 22 Oct 2013 13:51:16 +0100
Subject: [PATCH 125/252] Fixed breakage of dvar, tidied up to make more
 efficient

---
 GPy/likelihoods/noise_models/gaussian_noise.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index 63d3a52a..83cc2f47 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -213,7 +213,7 @@ class Gaussian(NoiseDistribution):
         assert np.asarray(link_f).shape == np.asarray(y).shape
         e = y - link_f
         s_4 = 1.0/(self.variance**2)
-        dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.square(e)
+        dlik_dsigma = -0.5*self.N/self.variance + 0.5*s_4*np.sum(np.square(e))
         return np.sum(dlik_dsigma) # Sure about this sum?
 
     def dlogpdf_dlink_dvar(self, link_f, y, extra_data=None):
@@ -232,8 +232,7 @@ class Gaussian(NoiseDistribution):
         :rtype: Nx1 array
         """
         assert np.asarray(link_f).shape == np.asarray(y).shape
-        s_4 = 1./(self.variance**2)
-        #dlik_grad_dsigma = -np.dot(s_4*self.I, y) + np.dot(s_4*self.I, link_f)
+        s_4 = 1.0/(self.variance**2)
         dlik_grad_dsigma = -s_4*y + s_4*link_f
         return dlik_grad_dsigma
 

From 5f9d7eb70913a4664d22bc0324cfc45fba1d0f20 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 22 Oct 2013 15:22:27 +0100
Subject: [PATCH 126/252] Changed naming from old derivatives of likelihoods to
 new ones in noise distributions

---
 GPy/likelihoods/noise_models/noise_distributions.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 897986a5..58c44629 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -80,7 +80,7 @@ class NoiseDistribution(object):
         :param sigma: cavity distribution standard deviation
 
         """
-        return .5*((gp-mu)/sigma)**2 + self._nlog_mass(gp,obs)
+        return .5*((gp-mu)/sigma)**2 - self.logpdf(gp,obs)
 
     def _dnlog_product_dgp(self,gp,obs,mu,sigma):
         """
@@ -92,7 +92,7 @@ class NoiseDistribution(object):
         :param sigma: cavity distribution standard deviation
 
         """
-        return (gp - mu)/sigma**2 + self._dnlog_mass_dgp(gp,obs)
+        return (gp - mu)/sigma**2 - self.dlogpdf_df(gp,obs)
 
     def _d2nlog_product_dgp2(self,gp,obs,mu,sigma):
         """
@@ -104,7 +104,7 @@ class NoiseDistribution(object):
         :param sigma: cavity distribution standard deviation
 
         """
-        return 1./sigma**2 + self._d2nlog_mass_dgp2(gp,obs)
+        return 1./sigma**2 - self.d2logpdf_df2(gp,obs)
 
     def _product_mode(self,obs,mu,sigma):
         """
@@ -166,8 +166,8 @@ class NoiseDistribution(object):
         """
         mu = v/tau
         mu_hat = self._product_mode(obs,mu,np.sqrt(1./tau))
-        sigma2_hat = 1./(tau + self._d2nlog_mass_dgp2(mu_hat,obs))
-        Z_hat = np.exp(-.5*tau*(mu_hat-mu)**2) * self._mass(mu_hat,obs)*np.sqrt(tau*sigma2_hat)
+        sigma2_hat = 1./(tau - self.d2logpdf_df2(mu_hat,obs))
+        Z_hat = np.exp(-.5*tau*(mu_hat-mu)**2) * self.pdf(mu_hat,obs)*np.sqrt(tau*sigma2_hat)
         return Z_hat,mu_hat,sigma2_hat
 
     def _nlog_conditional_mean_scaled(self,gp,mu,sigma):

From 7c9eda482c1ee4e993855b6afc9dcdb84180f4ec Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 22 Oct 2013 15:30:56 +0100
Subject: [PATCH 127/252] Moved transf_data to make data -1 or 1 from 0 or 1
 for bernoulli with probit into the analytical moment match (but it 10%
 slower), needs removing from epmixednoise

---
 GPy/likelihoods/ep.py                         |  7 +++---
 .../noise_models/bernoulli_noise.py           | 24 ++++++++++++-------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/GPy/likelihoods/ep.py b/GPy/likelihoods/ep.py
index 4fedd66b..cfa00500 100644
--- a/GPy/likelihoods/ep.py
+++ b/GPy/likelihoods/ep.py
@@ -19,7 +19,6 @@ class EP(likelihood):
         self.num_data, self.output_dim = self.data.shape
         self.is_heteroscedastic = True
         self.num_params = 0
-        self._transf_data = self.noise_model._preprocess_values(data)
 
         #Initial values - Likelihood approximation parameters:
         #p(y|f) = t(f|tau_tilde,v_tilde)
@@ -134,7 +133,7 @@ class EP(likelihood):
                 self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
                 self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
                 #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
                 #Site parameters update
                 Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
                 Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
@@ -233,7 +232,7 @@ class EP(likelihood):
                 self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                 self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                 #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
                 #Site parameters update
                 Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                 Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
@@ -336,7 +335,7 @@ class EP(likelihood):
                 self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                 self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                 #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
                 #Site parameters update
                 Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                 Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
diff --git a/GPy/likelihoods/noise_models/bernoulli_noise.py b/GPy/likelihoods/noise_models/bernoulli_noise.py
index 1d27d48b..5a11ba37 100644
--- a/GPy/likelihoods/noise_models/bernoulli_noise.py
+++ b/GPy/likelihoods/noise_models/bernoulli_noise.py
@@ -45,18 +45,24 @@ class Bernoulli(NoiseDistribution):
         :param tau_i: precision of the cavity distribution (float)
         :param v_i: mean/variance of the cavity distribution (float)
         """
+        if data_i == 1:
+            sign = 1.
+        elif data_i == 0:
+            sign = -1
+        else:
+            raise ValueError("bad value for Bernouilli observation (0,1)")
         if isinstance(self.gp_link,gp_transformations.Probit):
-            z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
+            z = sign*v_i/np.sqrt(tau_i**2 + tau_i)
             Z_hat = std_norm_cdf(z)
             phi = std_norm_pdf(z)
-            mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
+            mu_hat = v_i/tau_i + sign*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
             sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
 
         elif isinstance(self.gp_link,gp_transformations.Heaviside):
-            a = data_i*v_i/np.sqrt(tau_i)
+            a = sign*v_i/np.sqrt(tau_i)
             Z_hat = std_norm_cdf(a)
             N = std_norm_pdf(a)
-            mu_hat = v_i/tau_i + data_i*N/Z_hat/np.sqrt(tau_i)
+            mu_hat = v_i/tau_i + sign*N/Z_hat/np.sqrt(tau_i)
             sigma2_hat = (1. - a*N/Z_hat - np.square(N/Z_hat))/tau_i
             if np.any(np.isnan([Z_hat, mu_hat, sigma2_hat])):
                 stop
@@ -97,7 +103,7 @@ class Bernoulli(NoiseDistribution):
         .. Note:
             Each y_i must be in {0,1}
         """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         objective = (link_f**y) * ((1.-link_f)**(1.-y))
         return np.exp(np.sum(np.log(objective)))
 
@@ -116,7 +122,7 @@ class Bernoulli(NoiseDistribution):
         :returns: log likelihood evaluated at points link(f)
         :rtype: float
         """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         #objective = y*np.log(link_f) + (1.-y)*np.log(link_f)
         objective = np.where(y==1, np.log(link_f), np.log(1-link_f))
         return np.sum(objective)
@@ -136,7 +142,7 @@ class Bernoulli(NoiseDistribution):
         :returns: gradient of log likelihood evaluated at points link(f)
         :rtype: Nx1 array
         """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         grad = (y/link_f) - (1.-y)/(1-link_f)
         return grad
 
@@ -161,7 +167,7 @@ class Bernoulli(NoiseDistribution):
             Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
             (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
         """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         d2logpdf_dlink2 = -y/(link_f**2) - (1-y)/((1-link_f)**2)
         return d2logpdf_dlink2
 
@@ -180,7 +186,7 @@ class Bernoulli(NoiseDistribution):
         :returns: third derivative of log likelihood evaluated at points link(f)
         :rtype: Nx1 array
         """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         d3logpdf_dlink3 = 2*(y/(link_f**3) - (1-y)/((1-link_f)**3))
         return d3logpdf_dlink3
 

From 22c24c0abe149d6961f61037158686997c31f996 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 22 Oct 2013 15:33:14 +0100
Subject: [PATCH 128/252] Use bfgs for laplace instead

---
 GPy/examples/classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index 38559105..d4f55d4a 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -94,7 +94,7 @@ def toy_linear_1d_classification_laplace(seed=default_seed):
     # Optimize
     #m.update_likelihood_approximation()
     # Parameters optimization:
-    m.optimize(messages=1)
+    m.optimize('bfgs', messages=1)
     #m.pseudo_EM()
 
     # Plot

From c0b94f051b458fdf27e41b2b4631421180b8883c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 22 Oct 2013 17:22:23 +0100
Subject: [PATCH 129/252] Added numerical mean and variance with quadrature,
 about to clean up

---
 .../noise_models/noise_distributions.py       | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 58c44629..d5c9af0a 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -296,6 +296,23 @@ class NoiseDistribution(object):
         raise NotImplementedError
 
     def _predictive_mean_numerical(self,mu,sigma):
+        """
+        Quadrature calculation of the predictive mean: E(Y_star|Y) = E( E(Y_star|f_star, Y) )
+
+        :param mu: mean of posterior
+        :param sigma: standard deviation of posterior
+
+        """
+        sigma2 = sigma**2
+        #Compute first moment
+        def int_mean(f):
+            return self._mean(f)*np.exp(-(0.5/sigma2)*np.square(f - mu))
+        scaled_mean, accuracy = quad(int_mean, -np.inf, np.inf)
+        mean = scaled_mean / np.sqrt(2*np.pi*(sigma2))
+
+        return mean
+
+    def _predictive_mean_numerical_laplace(self,mu,sigma):
         """
         Laplace approximation to the predictive mean: E(Y_star|Y) = E( E(Y_star|f_star, Y) )
         if self.
@@ -336,6 +353,40 @@ class NoiseDistribution(object):
         """
         Laplace approximation to the predictive variance: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
 
+        :param mu: mean of posterior
+        :param sigma: standard deviation of posterior
+        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
+
+        """
+        sigma2 = sigma**2
+        normalizer = np.sqrt(2*np.pi*sigma2)
+
+        # E( V(Y_star|f_star) )
+        #Compute expected value of variance
+        def int_var(f):
+            return self._variance(f)*np.exp(-(0.5/sigma2)*np.square(f - mu))
+        scaled_exp_variance, accuracy = quad(int_var, -np.inf, np.inf)
+        exp_var = scaled_exp_variance / normalizer
+
+        #V( E(Y_star|f_star) ) =  E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2
+        if predictive_mean is None:
+            predictive_mean = self.predictive_mean(mu,sigma)
+
+        predictive_mean_sq = predictive_mean**2
+        def int_pred_mean_sq(f):
+            return predictive_mean_sq*np.exp(-(0.5/(sigma2))*np.square(f - mu))
+
+        scaled_exp_exp2, accuracy = quad(int_pred_mean_sq, -np.inf, np.inf)
+        exp_exp2 = scaled_exp_exp2 / normalizer
+
+        var_exp = exp_exp2 - predictive_mean**2
+        # V(Y_star | f_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
+        return exp_var + var_exp
+
+    def _predictive_variance_numerical_laplace(self,mu,sigma,predictive_mean=None):
+        """
+        Laplace approximation to the predictive variance: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
+
         :param mu: cavity distribution mean
         :param sigma: cavity distribution standard deviation
         :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.

From 9b99061b09b631bbe2f66a0a39f7e6b353e6e1bc Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Tue, 22 Oct 2013 17:31:20 +0100
Subject: [PATCH 130/252] Tore out code no longer used from noise_distributions
 due to rewriting using quadrature

---
 .../noise_models/noise_distributions.py       | 301 ------------------
 1 file changed, 301 deletions(-)

diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index d5c9af0a..c7ade68f 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -56,67 +56,6 @@ class NoiseDistribution(object):
         """
         return Y
 
-    def _product(self,gp,obs,mu,sigma):
-        """
-        Product between the cavity distribution and a likelihood factor.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._mass(gp,obs)
-
-    def _nlog_product_scaled(self,gp,obs,mu,sigma):
-        """
-        Negative log-product between the cavity distribution and a likelihood factor.
-
-        .. note:: The constant term in the Gaussian distribution is ignored.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return .5*((gp-mu)/sigma)**2 - self.logpdf(gp,obs)
-
-    def _dnlog_product_dgp(self,gp,obs,mu,sigma):
-        """
-        Derivative wrt latent variable of the log-product between the cavity distribution and a likelihood factor.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - self.dlogpdf_df(gp,obs)
-
-    def _d2nlog_product_dgp2(self,gp,obs,mu,sigma):
-        """
-        Second derivative wrt latent variable of the log-product between the cavity distribution and a likelihood factor.
-
-        :param gp: latent variable
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - self.d2logpdf_df2(gp,obs)
-
-    def _product_mode(self,obs,mu,sigma):
-        """
-        Newton's CG method to find the mode in _product (cavity x likelihood factor).
-
-        :param obs: observed output
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return sp.optimize.fmin_ncg(self._nlog_product_scaled,x0=mu,fprime=self._dnlog_product_dgp,fhess=self._d2nlog_product_dgp2,args=(obs,mu,sigma),disp=False)
-
     def _moments_match_analytical(self,obs,tau,v):
         """
         If available, this function computes the moments analytically.
@@ -155,126 +94,6 @@ class NoiseDistribution(object):
 
         return z, mean, variance
 
-    def _moments_match_numerical_laplace(self,obs,tau,v):
-        """
-        Lapace approximation to calculate the moments.
-
-        :param obs: observed output
-        :param tau: cavity distribution 1st natural parameter (precision)
-        :param v: cavity distribution 2nd natural paramenter (mu*precision)
-
-        """
-        mu = v/tau
-        mu_hat = self._product_mode(obs,mu,np.sqrt(1./tau))
-        sigma2_hat = 1./(tau - self.d2logpdf_df2(mu_hat,obs))
-        Z_hat = np.exp(-.5*tau*(mu_hat-mu)**2) * self.pdf(mu_hat,obs)*np.sqrt(tau*sigma2_hat)
-        return Z_hat,mu_hat,sigma2_hat
-
-    def _nlog_conditional_mean_scaled(self,gp,mu,sigma):
-        """
-        Negative logarithm of the l.v.'s predictive distribution times the output's mean given the l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        .. note:: This function helps computing E(Y_star) = E(E(Y_star|f_star))
-
-        """
-        return .5*((gp - mu)/sigma)**2 - np.log(self._mean(gp))
-
-    def _dnlog_conditional_mean_dgp(self,gp,mu,sigma):
-        """
-        Derivative of _nlog_conditional_mean_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - self._dmean_dgp(gp)/self._mean(gp)
-
-    def _d2nlog_conditional_mean_dgp2(self,gp,mu,sigma):
-        """
-        Second derivative of _nlog_conditional_mean_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - self._d2mean_dgp2(gp)/self._mean(gp) + (self._dmean_dgp(gp)/self._mean(gp))**2
-
-    def _nlog_exp_conditional_variance_scaled(self,gp,mu,sigma):
-        """
-        Negative logarithm of the l.v.'s predictive distribution times the output's variance given the l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        .. note:: This function helps computing E(V(Y_star|f_star))
-
-        """
-        return .5*((gp - mu)/sigma)**2 - np.log(self._variance(gp))
-
-    def _dnlog_exp_conditional_variance_dgp(self,gp,mu,sigma):
-        """
-        Derivative of _nlog_exp_conditional_variance_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - self._dvariance_dgp(gp)/self._variance(gp)
-
-    def _d2nlog_exp_conditional_variance_dgp2(self,gp,mu,sigma):
-        """
-        Second derivative of _nlog_exp_conditional_variance_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - self._d2variance_dgp2(gp)/self._variance(gp) + (self._dvariance_dgp(gp)/self._variance(gp))**2
-
-    def _nlog_exp_conditional_mean_sq_scaled(self,gp,mu,sigma):
-        """
-        Negative logarithm of the l.v.'s predictive distribution times the output's mean squared given the l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        .. note:: This function helps computing E( E(Y_star|f_star)**2 )
-
-        """
-        return .5*((gp - mu)/sigma)**2 - 2*np.log(self._mean(gp))
-
-    def _dnlog_exp_conditional_mean_sq_dgp(self,gp,mu,sigma):
-        """
-        Derivative of _nlog_exp_conditional_mean_sq_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return (gp - mu)/sigma**2 - 2*self._dmean_dgp(gp)/self._mean(gp)
-
-    def _d2nlog_exp_conditional_mean_sq_dgp2(self,gp,mu,sigma):
-        """
-        Second derivative of _nlog_exp_conditional_mean_sq_scaled wrt. l.v.
-
-        :param gp: latent variable
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        return 1./sigma**2 - 2*( self._d2mean_dgp2(gp)/self._mean(gp) - (self._dmean_dgp(gp)/self._mean(gp))**2 )
-
     def _predictive_mean_analytical(self,mu,sigma):
         """
         Predictive mean
@@ -312,43 +131,6 @@ class NoiseDistribution(object):
 
         return mean
 
-    def _predictive_mean_numerical_laplace(self,mu,sigma):
-        """
-        Laplace approximation to the predictive mean: E(Y_star|Y) = E( E(Y_star|f_star, Y) )
-        if self.
-
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        maximum = sp.optimize.fmin_ncg(self._nlog_conditional_mean_scaled,x0=self._mean(mu),fprime=self._dnlog_conditional_mean_dgp,fhess=self._d2nlog_conditional_mean_dgp2,args=(mu,sigma),disp=False)
-        mean = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma))*sigma)
-        """
-
-        pb.figure()
-        x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)])
-        f = np.array([np.exp(-self._nlog_conditional_mean_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x])
-        pb.plot(x,f,'b-')
-        sigma2 = 1./self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma)
-        f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2)
-        k = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2)
-        pb.plot(x,f2*mean,'r-')
-        pb.vlines(maximum,0,f.max())
-        """
-        return mean
-
-    def _predictive_mean_sq(self,mu,sigma):
-        """
-        Laplace approximation to the predictive mean squared: E(Y_star**2) = E( E(Y_star|f_star)**2 )
-
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-
-        """
-        maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_mean_sq_scaled,x0=self._mean(mu),fprime=self._dnlog_exp_conditional_mean_sq_dgp,fhess=self._d2nlog_exp_conditional_mean_sq_dgp2,args=(mu,sigma),disp=False)
-        mean_squared = np.exp(-self._nlog_exp_conditional_mean_sq_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_mean_sq_dgp2(maximum,mu,sigma))*sigma)
-        return mean_squared
-
     def _predictive_variance_numerical(self,mu,sigma,predictive_mean=None):
         """
         Laplace approximation to the predictive variance: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
@@ -383,38 +165,6 @@ class NoiseDistribution(object):
         # V(Y_star | f_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
         return exp_var + var_exp
 
-    def _predictive_variance_numerical_laplace(self,mu,sigma,predictive_mean=None):
-        """
-        Laplace approximation to the predictive variance: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
-
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
-
-        """
-        # E( V(Y_star|f_star) )
-        maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_variance_scaled,x0=self._variance(mu),fprime=self._dnlog_exp_conditional_variance_dgp,fhess=self._d2nlog_exp_conditional_variance_dgp2,args=(mu,sigma),disp=False)
-        exp_var = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma))*sigma)
-
-        """
-        pb.figure()
-        x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)])
-        f = np.array([np.exp(-self._nlog_exp_conditional_variance_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x])
-        pb.plot(x,f,'b-')
-        sigma2 = 1./self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma)
-        f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2)
-        k = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2)
-        pb.plot(x,f2*exp_var,'r--')
-        pb.vlines(maximum,0,f.max())
-        """
-
-        #V( E(Y_star|f_star) ) =  E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star)**2 )
-        exp_exp2 = self._predictive_mean_sq(mu,sigma)
-        if predictive_mean is None:
-            predictive_mean = self.predictive_mean(mu,sigma)
-        var_exp = exp_exp2 - predictive_mean**2
-        return exp_var + var_exp
-
     def _predictive_percentiles(self,p,mu,sigma):
         """
         Percentiles of the predictive distribution
@@ -428,57 +178,6 @@ class NoiseDistribution(object):
         qf = stats.norm.ppf(p,mu,sigma)
         return self.gp_link.transf(qf)
 
-    def _nlog_joint_predictive_scaled(self,x,mu,sigma):
-        """
-        Negative logarithm of the joint predictive distribution (latent variable and output).
-
-        :param x: tuple (latent variable,output)
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
-
-        """
-        return self._nlog_product_scaled(x[0],x[1],mu,sigma)
-
-    def _gradient_nlog_joint_predictive(self,x,mu,sigma):
-        """
-        Gradient of _nlog_joint_predictive_scaled.
-
-        :param x: tuple (latent variable,output)
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
-
-        .. note: Only available when the output is continuous
-
-        """
-        assert not self.discrete, "Gradient not available for discrete outputs."
-        return np.array((self._dnlog_product_dgp(gp=x[0],obs=x[1],mu=mu,sigma=sigma),self._dnlog_mass_dobs(obs=x[1],gp=x[0])))
-
-    def _hessian_nlog_joint_predictive(self,x,mu,sigma):
-        """
-        Hessian of _nlog_joint_predictive_scaled.
-
-        :param x: tuple (latent variable,output)
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
-
-        .. note: Only available when the output is continuous
-
-        """
-        assert not self.discrete, "Hessian not available for discrete outputs."
-        cross_derivative = self._d2nlog_mass_dcross(gp=x[0],obs=x[1])
-        return np.array((self._d2nlog_product_dgp2(gp=x[0],obs=x[1],mu=mu,sigma=sigma),cross_derivative,cross_derivative,self._d2nlog_mass_dobs2(obs=x[1],gp=x[0]))).reshape(2,2)
-
-    def _joint_predictive_mode(self,mu,sigma):
-        """
-        Negative logarithm of the joint predictive distribution (latent variable and output).
-
-        :param x: tuple (latent variable,output)
-        :param mu: latent variable's predictive mean
-        :param sigma: latent variable's predictive standard deviation
-
-        """
-        return sp.optimize.fmin_ncg(self._nlog_joint_predictive_scaled,x0=(mu,self.gp_link.transf(mu)),fprime=self._gradient_nlog_joint_predictive,fhess=self._hessian_nlog_joint_predictive,args=(mu,sigma),disp=False)
-
     def pdf_link(self, link_f, y, extra_data=None):
         raise NotImplementedError
 

From 7ecf2337324ffaa5e8b45fed8653ac9d24c13600 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 23 Oct 2013 12:08:59 +0100
Subject: [PATCH 131/252] Removed derivatives of variance wrt gp and
 derivatives of means with respect to gp from noise models

---
 GPy/likelihoods/noise_models/bernoulli_noise.py     | 12 ------------
 GPy/likelihoods/noise_models/exponential_noise.py   | 12 ------------
 GPy/likelihoods/noise_models/gamma_noise.py         | 12 ------------
 GPy/likelihoods/noise_models/gaussian_noise.py      | 12 ------------
 GPy/likelihoods/noise_models/noise_distributions.py |  4 ++--
 GPy/likelihoods/noise_models/poisson_noise.py       | 12 ------------
 6 files changed, 2 insertions(+), 62 deletions(-)

diff --git a/GPy/likelihoods/noise_models/bernoulli_noise.py b/GPy/likelihoods/noise_models/bernoulli_noise.py
index 5a11ba37..77242333 100644
--- a/GPy/likelihoods/noise_models/bernoulli_noise.py
+++ b/GPy/likelihoods/noise_models/bernoulli_noise.py
@@ -196,12 +196,6 @@ class Bernoulli(NoiseDistribution):
         """
         return self.gp_link.transf(gp)
 
-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
     def _variance(self,gp):
         """
         Mass (or density) function
@@ -209,12 +203,6 @@ class Bernoulli(NoiseDistribution):
         p = self.gp_link.transf(gp)
         return p*(1.-p)
 
-    def _dvariance_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)*(1. - 2.*self.gp_link.transf(gp))
-
-    def _d2variance_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)*(1. - 2.*self.gp_link.transf(gp)) - 2*self.gp_link.dtransf_df(gp)**2
-
     def samples(self, gp):
         """
         Returns a set of samples of observations based on a given value of the latent variable.
diff --git a/GPy/likelihoods/noise_models/exponential_noise.py b/GPy/likelihoods/noise_models/exponential_noise.py
index 56e63c75..450c11be 100644
--- a/GPy/likelihoods/noise_models/exponential_noise.py
+++ b/GPy/likelihoods/noise_models/exponential_noise.py
@@ -49,20 +49,8 @@ class Exponential(NoiseDistribution):
         """
         return self.gp_link.transf(gp)
 
-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
     def _variance(self,gp):
         """
         Mass (or density) function
         """
         return self.gp_link.transf(gp)**2
-
-    def _dvariance_dgp(self,gp):
-        return 2*self.gp_link.transf(gp)*self.gp_link.dtransf_df(gp)
-
-    def _d2variance_dgp2(self,gp):
-        return 2 * (self.gp_link.dtransf_df(gp)**2 + self.gp_link.transf(gp)*self.gp_link.d2transf_df2(gp))
diff --git a/GPy/likelihoods/noise_models/gamma_noise.py b/GPy/likelihoods/noise_models/gamma_noise.py
index 6bf0dd7b..5229cb4f 100644
--- a/GPy/likelihoods/noise_models/gamma_noise.py
+++ b/GPy/likelihoods/noise_models/gamma_noise.py
@@ -52,20 +52,8 @@ class Gamma(NoiseDistribution):
         """
         return self.gp_link.transf(gp)
 
-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
     def _variance(self,gp):
         """
         Mass (or density) function
         """
         return self.gp_link.transf(gp)/self.beta
-
-    def _dvariance_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)/self.beta
-
-    def _d2variance_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)/self.beta
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index 83cc2f47..0ce8ffd9 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -277,12 +277,6 @@ class Gaussian(NoiseDistribution):
         """
         return self.gp_link.transf(gp)
 
-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
     def _variance(self,gp):
         """
         Variance of y under the Mass (or density) function p(y|f)
@@ -291,9 +285,3 @@ class Gaussian(NoiseDistribution):
             Var_{p(y|f)}[y]
         """
         return self.variance
-
-    def _dvariance_dgp(self,gp):
-        return 0
-
-    def _d2variance_dgp2(self,gp):
-        return 0
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index c7ade68f..59465a5b 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -371,8 +371,8 @@ class NoiseDistribution(object):
         """
         Compute  mean, variance and conficence interval (percentiles 5 and 95) of the  prediction.
 
-        :param mu: mean of the latent variable
-        :param var: variance of the latent variable
+        :param mu: mean of the latent variable, f
+        :param var: variance of the latent variable, f
 
         """
         if isinstance(mu,float) or isinstance(mu,int):
diff --git a/GPy/likelihoods/noise_models/poisson_noise.py b/GPy/likelihoods/noise_models/poisson_noise.py
index 33de84cd..80d7951b 100644
--- a/GPy/likelihoods/noise_models/poisson_noise.py
+++ b/GPy/likelihoods/noise_models/poisson_noise.py
@@ -50,20 +50,8 @@ class Poisson(NoiseDistribution):
         """
         return self.gp_link.transf(gp)
 
-    def _dmean_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2mean_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)
-
     def _variance(self,gp):
         """
         Mass (or density) function
         """
         return self.gp_link.transf(gp)
-
-    def _dvariance_dgp(self,gp):
-        return self.gp_link.dtransf_df(gp)
-
-    def _d2variance_dgp2(self,gp):
-        return self.gp_link.d2transf_df2(gp)

From 6678bca011dff22516db7b463c655860bf49cb9b Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 23 Oct 2013 13:28:08 +0100
Subject: [PATCH 132/252] Fixed bug in gradient checker where it worked
 differently given a integer parameter to a float

---
 GPy/models/gradient_checker.py   | 2 +-
 GPy/testing/likelihoods_tests.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/GPy/models/gradient_checker.py b/GPy/models/gradient_checker.py
index face9589..64b8b2fb 100644
--- a/GPy/models/gradient_checker.py
+++ b/GPy/models/gradient_checker.py
@@ -75,7 +75,7 @@ class GradientChecker(Model):
             self.names = names
             self.shapes = [get_shape(x0)]
         for name, xi in zip(self.names, at_least_one_element(x0)):
-            self.__setattr__(name, xi)
+            self.__setattr__(name, numpy.float_(xi))
 #         self._param_names = []
 #         for name, shape in zip(self.names, self.shapes):
 #             self._param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
diff --git a/GPy/testing/likelihoods_tests.py b/GPy/testing/likelihoods_tests.py
index 449f3e90..9a3dfd16 100644
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@@ -321,6 +321,7 @@ class TestNoiseModels(object):
     def t_dlogpdf_dparams(self, model, Y, f, params, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
+        print param_constraints
         assert (
                 dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
                     params, args=(f, Y), constraints=param_constraints,
@@ -331,6 +332,7 @@ class TestNoiseModels(object):
     def t_dlogpdf_df_dparams(self, model, Y, f, params, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
+        print param_constraints
         assert (
                 dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
                     params, args=(f, Y), constraints=param_constraints,
@@ -341,6 +343,7 @@ class TestNoiseModels(object):
     def t_d2logpdf2_df2_dparams(self, model, Y, f, params, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
+        #print param_constraints
         assert (
                 dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
                     params, args=(f, Y), constraints=param_constraints,

From 3e0b597486d356adeb484c676c29cfcb881c908d Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 23 Oct 2013 14:39:33 +0100
Subject: [PATCH 133/252] Updated boston tests (more folds, allow a bias as the
 datasets are not normalized once split) and more folds. Tweaked some laplace
 line search parameters, added basis tests for ep

---
 GPy/examples/laplace_approximations.py | 45 ++++++++++-----------
 GPy/likelihoods/laplace.py             | 10 +++--
 GPy/testing/likelihoods_tests.py       | 56 +++++++++++++++++++++-----
 3 files changed, 75 insertions(+), 36 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index ea3a9f8e..2f163583 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -193,6 +193,8 @@ def gaussian_f_check():
 def boston_example():
     import sklearn
     from sklearn.cross_validation import KFold
+    optimizer='bfgs'
+    messages=0
     data = datasets.boston_housing()
     X = data['X'].copy()
     Y = data['Y'].copy()
@@ -200,9 +202,9 @@ def boston_example():
     X = X/X.std(axis=0)
     Y = Y-Y.mean()
     Y = Y/Y.std()
-    num_folds = 10
+    num_folds = 30
     kf = KFold(len(Y), n_folds=num_folds, indices=True)
-    score_folds = np.zeros((6, num_folds))
+    score_folds = np.zeros((7, num_folds))
     def rmse(Y, Ystar):
         return np.sqrt(np.mean((Y-Ystar)**2))
     for n, (train, test) in enumerate(kf):
@@ -212,18 +214,19 @@ def boston_example():
         noise = 1e-1 #np.exp(-2)
         rbf_len = 0.5
         data_axis_plot = 4
-        plot = True
+        plot = False
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+        kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
 
         #Gaussian GP
         print "Gauss GP"
-        kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
-        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp)
+        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
         mgp.ensure_default_constraints()
         mgp.constrain_fixed('white', 1e-5)
         mgp['rbf_len'] = rbf_len
         mgp['noise'] = noise
         print mgp
-        mgp.optimize(messages=1)
+        mgp.optimize(optimizer=optimizer,messages=messages)
         Y_test_pred = mgp.predict(X_test)
         score_folds[0, n] = rmse(Y_test, Y_test_pred[0])
         print mgp
@@ -235,11 +238,10 @@ def boston_example():
             plt.title('GP gauss')
 
         print "Gaussian Laplace GP"
-        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         N, D = Y_train.shape
         g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
         g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
-        mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=g_likelihood)
+        mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=g_likelihood)
         mg.ensure_default_constraints()
         mg.constrain_positive('noise_variance')
         mg.constrain_fixed('white', 1e-5)
@@ -247,7 +249,7 @@ def boston_example():
         mg['noise'] = noise
         print mg
         try:
-            mg.optimize(messages=1)
+            mg.optimize(optimizer=optimizer, messages=messages)
         except Exception:
             print "Blew up"
         Y_test_pred = mg.predict(X_test)
@@ -263,10 +265,9 @@ def boston_example():
         #Student T
         deg_free = 1
         print "Student-T GP {}df".format(deg_free)
-        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
         stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
-        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
+        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
         mstu_t.ensure_default_constraints()
         mstu_t.constrain_fixed('white', 1e-5)
         mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
@@ -274,7 +275,7 @@ def boston_example():
         mstu_t['t_noise'] = noise
         print mstu_t
         try:
-            mstu_t.optimize(messages=1)
+            mstu_t.optimize(optimizer=optimizer, messages=messages)
         except Exception:
             print "Blew up"
         Y_test_pred = mstu_t.predict(X_test)
@@ -287,12 +288,11 @@ def boston_example():
             plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
             plt.title('Stu t {}df'.format(deg_free))
 
-        deg_free = 2
+        deg_free = 8
         print "Student-T GP {}df".format(deg_free)
-        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
         stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
-        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
+        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
         mstu_t.ensure_default_constraints()
         mstu_t.constrain_fixed('white', 1e-5)
         mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
@@ -300,7 +300,7 @@ def boston_example():
         mstu_t['t_noise'] = noise
         print mstu_t
         try:
-            mstu_t.optimize(messages=1)
+            mstu_t.optimize(optimizer=optimizer, messages=messages)
         except Exception:
             print "Blew up"
         Y_test_pred = mstu_t.predict(X_test)
@@ -316,10 +316,9 @@ def boston_example():
         #Student t likelihood
         deg_free = 3
         print "Student-T GP {}df".format(deg_free)
-        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
         stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
-        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
+        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
         mstu_t.ensure_default_constraints()
         mstu_t.constrain_fixed('white', 1e-5)
         mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
@@ -327,7 +326,7 @@ def boston_example():
         mstu_t['t_noise'] = noise
         print mstu_t
         try:
-            mstu_t.optimize(messages=1)
+            mstu_t.optimize(optimizer=optimizer, messages=messages)
         except Exception:
             print "Blew up"
         Y_test_pred = mstu_t.predict(X_test)
@@ -342,10 +341,9 @@ def boston_example():
 
         deg_free = 5
         print "Student-T GP {}df".format(deg_free)
-        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
         stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
-        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu, likelihood=stu_t_likelihood)
+        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
         mstu_t.ensure_default_constraints()
         mstu_t.constrain_fixed('white', 1e-5)
         mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
@@ -353,7 +351,7 @@ def boston_example():
         mstu_t['t_noise'] = noise
         print mstu_t
         try:
-            mstu_t.optimize(messages=1)
+            mstu_t.optimize(optimizer=optimizer, messages=messages)
         except Exception:
             print "Blew up"
         Y_test_pred = mstu_t.predict(X_test)
@@ -366,9 +364,10 @@ def boston_example():
             plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
             plt.title('Stu t {}df'.format(deg_free))
 
+        score_folds[6, n] = rmse(Y_test, np.mean(Y_train))
 
 
-
+    print "Average scores: {}".format(np.mean(score_folds, 1))
     import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
     return score_folds
 
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index e6ffd78c..05b4ff02 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -301,9 +301,9 @@ class Laplace(likelihood):
             return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-6
-        step_size = 1
-        rs = 0
+        epsilon = 1e-5
+        #step_size = 1
+        #rs = 0
         i = 0
 
         while difference > epsilon and i < MAX_ITER:
@@ -330,7 +330,9 @@ class Laplace(likelihood):
 
             i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
             #Find the stepsize that minimizes the objective function using a brent line search
-            new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':30}).fun
+            #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
+            #steps than get this exact then make a step, if B was bigger it might be the other way around though
+            new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
             f = self.f.copy()
             Ki_f = self.Ki_f.copy()
 
diff --git a/GPy/testing/likelihoods_tests.py b/GPy/testing/likelihoods_tests.py
index 9a3dfd16..fff5dcac 100644
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@@ -30,9 +30,9 @@ def dparam_checkgrad(func, dfunc, params, args, constraints=None, randomize=Fals
     checkgrad expects a f: R^N -> R^1 and df: R^N -> R^N
     However if we are holding other parameters fixed and moving something else
     We need to check the gradient of each of the fixed parameters
-    (f and y for example) seperately.
-    Whilst moving another parameter. otherwise f: gives back R^N and
-    df: gives back R^NxM where M is
+    (f and y for example) seperately,  whilst moving another parameter.
+    Otherwise f: gives back R^N and
+              df: gives back R^NxM where M is
     The number of parameters and N is the number of data
     Need to take a slice out from f and a slice out of df
     """
@@ -48,6 +48,8 @@ def dparam_checkgrad(func, dfunc, params, args, constraints=None, randomize=Fals
             #dlik and dlik_dvar gives back 1 value for each
             f_ind = min(fnum, fixed_val+1) - 1
             print "fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val)
+            #Make grad checker with this param moving, note that set_params is NOT being called
+            #The parameter is being set directly with __setattr__
             grad = GradientChecker(lambda x: np.atleast_1d(partial_f(x))[f_ind],
                                    lambda x : np.atleast_1d(partial_df(x))[fixed_val],
                                    param, 'p')
@@ -57,8 +59,8 @@ def dparam_checkgrad(func, dfunc, params, args, constraints=None, randomize=Fals
                     constraint('p', grad)
             if randomize:
                 grad.randomize()
-            print grad
             if verbose:
+                print grad
                 grad.checkgrad(verbose=1)
             if not grad.checkgrad():
                 gradchecking = False
@@ -122,6 +124,7 @@ class TestNoiseModels(object):
                     "constrain": [constraint_wrappers, listed_here]
                     },
                 "laplace": boolean_of_whether_model_should_work_for_laplace,
+                "ep": boolean_of_whether_model_should_work_for_laplace,
                 "link_f_constraints": [constraint_wrappers, listed_here]
                 }
         """
@@ -177,7 +180,8 @@ class TestNoiseModels(object):
                                 "vals": [self.var],
                                 "constraints": [constrain_positive]
                                 },
-                            "laplace": True
+                            "laplace": True,
+                            "ep": True
                             },
                         "Gaussian_log": {
                             "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log(), variance=self.var, D=self.D, N=self.N),
@@ -211,6 +215,7 @@ class TestNoiseModels(object):
                             "link_f_constraints": [partial(constrain_bounded, lower=0, upper=1)],
                             "laplace": True,
                             "Y": self.binary_Y,
+                            "ep": True
                         }
                     }
 
@@ -238,7 +243,14 @@ class TestNoiseModels(object):
                 f = attributes["f"].copy()
             else:
                 f = self.f.copy()
-            laplace = attributes["laplace"]
+            if "laplace" in attributes:
+                laplace = attributes["laplace"]
+            else:
+                laplace = False
+            if "ep" in attributes:
+                ep = attributes["ep"]
+            else:
+                ep = False
 
             if len(param_vals) > 1:
                 raise NotImplementedError("Cannot support multiple params in likelihood yet!")
@@ -266,6 +278,10 @@ class TestNoiseModels(object):
 
                 #laplace likelihood gradcheck
                 yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
+            if ep:
+                #ep likelihood gradcheck
+                yield self.t_ep_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
+
 
         self.tearDown()
 
@@ -321,7 +337,6 @@ class TestNoiseModels(object):
     def t_dlogpdf_dparams(self, model, Y, f, params, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
-        print param_constraints
         assert (
                 dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
                     params, args=(f, Y), constraints=param_constraints,
@@ -332,7 +347,6 @@ class TestNoiseModels(object):
     def t_dlogpdf_df_dparams(self, model, Y, f, params, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
-        print param_constraints
         assert (
                 dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
                     params, args=(f, Y), constraints=param_constraints,
@@ -343,7 +357,6 @@ class TestNoiseModels(object):
     def t_d2logpdf2_df2_dparams(self, model, Y, f, params, param_constraints):
         print "\n{}".format(inspect.stack()[0][3])
         print model
-        #print param_constraints
         assert (
                 dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
                     params, args=(f, Y), constraints=param_constraints,
@@ -459,6 +472,31 @@ class TestNoiseModels(object):
         print m
         assert m.checkgrad(step=step)
 
+    ###########
+    # EP test #
+    ###########
+    @with_setup(setUp, tearDown)
+    def t_ep_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
+        print "\n{}".format(inspect.stack()[0][3])
+        #Normalize
+        Y = Y/Y.max()
+        white_var = 0.001
+        kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        ep_likelihood = GPy.likelihoods.EP(Y.copy(), model)
+        m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=ep_likelihood)
+        m.ensure_default_constraints()
+        m.constrain_fixed('white', white_var)
+
+        for param_num in range(len(param_names)):
+            name = param_names[param_num]
+            m[name] = param_vals[param_num]
+            constraints[param_num](name, m)
+
+        m.randomize()
+        m.checkgrad(verbose=1, step=step)
+        print m
+        assert m.checkgrad(step=step)
+
 
 class LaplaceTests(unittest.TestCase):
     """

From 7b6a56f83c60b19ed4e24058790d46f19fb8d16c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 23 Oct 2013 18:39:48 +0100
Subject: [PATCH 134/252] Added log predictive density, ln p(y*|D)

---
 GPy/core/gp_base.py                           | 15 ++++++++++
 GPy/likelihoods/ep.py                         | 16 +++++++++++
 GPy/likelihoods/gaussian.py                   | 20 +++++++++++++
 GPy/likelihoods/laplace.py                    | 16 +++++++++++
 GPy/likelihoods/likelihood.py                 | 16 +++++++++++
 .../noise_models/noise_distributions.py       | 28 +++++++++++++++++++
 6 files changed, 111 insertions(+)

diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index 083f9980..7cf62e69 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -418,3 +418,18 @@ class GPBase(Model):
 
         index = np.ones((X.shape[0],1))*output
         return np.hstack((X,index))
+
+    def log_predictive_density(self, x_test, y_test):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param x_test: test observations (x_{*})
+        :type x_test: (Nx1) array
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        """
+        mu_star, var_star = self._raw_predict(x_test)
+        return self.likelihood.log_predictive_density(y_test, mu_star, var_star)
diff --git a/GPy/likelihoods/ep.py b/GPy/likelihoods/ep.py
index cfa00500..32575813 100644
--- a/GPy/likelihoods/ep.py
+++ b/GPy/likelihoods/ep.py
@@ -54,6 +54,22 @@ class EP(likelihood):
             raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
         return self.noise_model.predictive_values(mu,var)
 
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
+
     def _get_params(self):
         #return np.zeros(0)
         return self.noise_model._get_params()
diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index 8b9ac776..85c028b4 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -90,5 +90,25 @@ class Gaussian(likelihood):
             _95pc = mean + 2.*np.sqrt(true_var)
         return mean, true_var, _5pc, _95pc
 
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+
+        .. Note:
+            Works as if each test point was provided individually, i.e. not full_cov
+        """
+        y_rescaled = (y_test - self._offset)/self._scale
+        return -0.5*np.log(2*np.pi) -0.5*np.log(var_star + self._variance) -0.5*(np.square(y_rescaled - mu_star))/(var_star + self._variance)
+
     def _gradients(self, partial):
         return np.sum(partial)
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 05b4ff02..047d7f74 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -73,6 +73,22 @@ class Laplace(likelihood):
                     with an Laplace likelihood")
         return self.noise_model.predictive_values(mu, var)
 
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
+
     def _get_params(self):
         return np.asarray(self.noise_model._get_params())
 
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index a86eaac6..5e7c8c68 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -51,3 +51,19 @@ class likelihood(Parameterized):
 
     def predictive_values(self, mu, var):
         raise NotImplementedError
+
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        raise NotImplementedError
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 59465a5b..3cd46013 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -62,6 +62,34 @@ class NoiseDistribution(object):
         """
         raise NotImplementedError
 
+    def log_predictive_density(self, y_test, mu_star, var_star):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        """
+        assert y_test.shape==mu_star.shape
+        assert y_test.shape==var_star.shape
+        assert y_test.shape[1] == 1
+        def integral_generator(y, m, v):
+            """Generate a function which can be integrated to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
+            def f(f_star):
+                return self.pdf(f_star, y)*np.exp(-(1./(2*v))*np.square(m-f_star))
+            return f
+
+        scaled_p_ystar, accuracy = zip(*[quad(integral_generator(y, m, v), -np.inf, np.inf) for y, m, v in zip(y_test.flatten(), mu_star.flatten(), var_star.flatten())])
+        scaled_p_ystar = np.array(scaled_p_ystar).reshape(-1,1)
+        p_ystar = scaled_p_ystar/np.sqrt(2*np.pi*var_star)
+        return np.log(p_ystar)
+
     def _moments_match_numerical(self,obs,tau,v):
         """
         Calculation of moments using quadrature

From 8c222bef866c617199cc392ed18fa22aa805265d Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 23 Oct 2013 18:40:13 +0100
Subject: [PATCH 135/252] Updated laplace example to use predictive density
 aswell as RMSE

---
 GPy/examples/laplace_approximations.py | 190 ++++++++++---------------
 1 file changed, 79 insertions(+), 111 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 2f163583..b5d0e8f8 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -196,6 +196,7 @@ def boston_example():
     optimizer='bfgs'
     messages=0
     data = datasets.boston_housing()
+    degrees_freedoms = [3, 5, 8, 10]
     X = data['X'].copy()
     Y = data['Y'].copy()
     X = X-X.mean(axis=0)
@@ -204,7 +205,9 @@ def boston_example():
     Y = Y/Y.std()
     num_folds = 30
     kf = KFold(len(Y), n_folds=num_folds, indices=True)
-    score_folds = np.zeros((7, num_folds))
+    num_models = len(degrees_freedoms) + 3 #3 for baseline, gaussian, gaussian laplace approx
+    score_folds = np.zeros((num_models, num_folds))
+    pred_density = score_folds.copy()
     def rmse(Y, Ystar):
         return np.sqrt(np.mean((Y-Ystar)**2))
     for n, (train, test) in enumerate(kf):
@@ -218,6 +221,9 @@ def boston_example():
         kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
         kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
 
+        #Baseline
+        score_folds[0, n] = rmse(Y_test, np.mean(Y_train))
+
         #Gaussian GP
         print "Gauss GP"
         mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
@@ -228,9 +234,10 @@ def boston_example():
         print mgp
         mgp.optimize(optimizer=optimizer,messages=messages)
         Y_test_pred = mgp.predict(X_test)
-        score_folds[0, n] = rmse(Y_test, Y_test_pred[0])
+        score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
+        pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test))
         print mgp
-        print score_folds
+        print pred_density
         if plot:
             plt.figure()
             plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
@@ -253,8 +260,9 @@ def boston_example():
         except Exception:
             print "Blew up"
         Y_test_pred = mg.predict(X_test)
-        score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
-        print score_folds
+        score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
+        pred_density[2, n] = np.mean(mg.log_predictive_density(X_test, Y_test))
+        print pred_density
         print mg
         if plot:
             plt.figure()
@@ -262,114 +270,74 @@ def boston_example():
             plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
             plt.title('Lap gauss')
 
-        #Student T
-        deg_free = 1
-        print "Student-T GP {}df".format(deg_free)
-        t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
-        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
-        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
-        mstu_t.ensure_default_constraints()
-        mstu_t.constrain_fixed('white', 1e-5)
-        mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
-        mstu_t['rbf_len'] = rbf_len
-        mstu_t['t_noise'] = noise
-        print mstu_t
-        try:
-            mstu_t.optimize(optimizer=optimizer, messages=messages)
-        except Exception:
-            print "Blew up"
-        Y_test_pred = mstu_t.predict(X_test)
-        score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
-        print score_folds
-        print mstu_t
-        if plot:
-            plt.figure()
-            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-            plt.title('Stu t {}df'.format(deg_free))
-
-        deg_free = 8
-        print "Student-T GP {}df".format(deg_free)
-        t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
-        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
-        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
-        mstu_t.ensure_default_constraints()
-        mstu_t.constrain_fixed('white', 1e-5)
-        mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
-        mstu_t['rbf_len'] = rbf_len
-        mstu_t['t_noise'] = noise
-        print mstu_t
-        try:
-            mstu_t.optimize(optimizer=optimizer, messages=messages)
-        except Exception:
-            print "Blew up"
-        Y_test_pred = mstu_t.predict(X_test)
-        score_folds[3, n] = rmse(Y_test, Y_test_pred[0])
-        print score_folds
-        print mstu_t
-        if plot:
-            plt.figure()
-            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-            plt.title('Stu t {}df'.format(deg_free))
-
-        #Student t likelihood
-        deg_free = 3
-        print "Student-T GP {}df".format(deg_free)
-        t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
-        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
-        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
-        mstu_t.ensure_default_constraints()
-        mstu_t.constrain_fixed('white', 1e-5)
-        mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
-        mstu_t['rbf_len'] = rbf_len
-        mstu_t['t_noise'] = noise
-        print mstu_t
-        try:
-            mstu_t.optimize(optimizer=optimizer, messages=messages)
-        except Exception:
-            print "Blew up"
-        Y_test_pred = mstu_t.predict(X_test)
-        score_folds[4, n] = rmse(Y_test, Y_test_pred[0])
-        print score_folds
-        print mstu_t
-        if plot:
-            plt.figure()
-            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-            plt.title('Stu t {}df'.format(deg_free))
-
-        deg_free = 5
-        print "Student-T GP {}df".format(deg_free)
-        t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=noise)
-        stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
-        mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
-        mstu_t.ensure_default_constraints()
-        mstu_t.constrain_fixed('white', 1e-5)
-        mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
-        mstu_t['rbf_len'] = rbf_len
-        mstu_t['t_noise'] = noise
-        print mstu_t
-        try:
-            mstu_t.optimize(optimizer=optimizer, messages=messages)
-        except Exception:
-            print "Blew up"
-        Y_test_pred = mstu_t.predict(X_test)
-        score_folds[5, n] = rmse(Y_test, Y_test_pred[0])
-        print score_folds
-        print mstu_t
-        if plot:
-            plt.figure()
-            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-            plt.title('Stu t {}df'.format(deg_free))
-
-        score_folds[6, n] = rmse(Y_test, np.mean(Y_train))
-
+        for stu_num, df in enumerate(degrees_freedoms):
+            #Student T
+            print "Student-T GP {}df".format(df)
+            t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise)
+            stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
+            mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
+            mstu_t.ensure_default_constraints()
+            mstu_t.constrain_fixed('white', 1e-5)
+            mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
+            mstu_t['rbf_len'] = rbf_len
+            mstu_t['t_noise'] = noise
+            print mstu_t
+            try:
+                mstu_t.optimize(optimizer=optimizer, messages=messages)
+            except Exception:
+                print "Blew up"
+            Y_test_pred = mstu_t.predict(X_test)
+            score_folds[3+stu_num, n] = rmse(Y_test, Y_test_pred[0])
+            pred_density[3+stu_num, n] = np.mean(mstu_t.log_predictive_density(X_test, Y_test))
+            print pred_density
+            print mstu_t
+            if plot:
+                plt.figure()
+                plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+                plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+                plt.title('Stu t {}df'.format(df))
 
     print "Average scores: {}".format(np.mean(score_folds, 1))
-    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
-    return score_folds
+    print "Average pred density: {}".format(np.mean(pred_density, 1))
+
+    #Plotting
+    stu_t_legends = ['Student T, df={}'.format(df) for df in degrees_freedoms]
+    legends = ['Baseline', 'Gaussian', 'Laplace Approx Gaussian'] + stu_t_legends
+
+    #Plot boxplots for RMSE density
+    fig = plt.figure()
+    ax=fig.add_subplot(111)
+    plt.title('RMSE')
+    bp = ax.boxplot(score_folds.T, notch=0, sym='+', vert=1, whis=1.5)
+    plt.setp(bp['boxes'], color='black')
+    plt.setp(bp['whiskers'], color='black')
+    plt.setp(bp['fliers'], color='red', marker='+')
+    xtickNames = plt.setp(ax, xticklabels=legends)
+    plt.setp(xtickNames, rotation=45, fontsize=8)
+    ax.set_ylabel('RMSE')
+    ax.set_xlabel('Distribution')
+    #Make grid and put it below boxes
+    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
+              alpha=0.5)
+    ax.set_axisbelow(True)
+
+    #Plot boxplots for predictive density
+    fig = plt.figure()
+    ax=fig.add_subplot(111)
+    plt.title('Predictive density')
+    bp = ax.boxplot(pred_density[1:,:].T, notch=0, sym='+', vert=1, whis=1.5)
+    plt.setp(bp['boxes'], color='black')
+    plt.setp(bp['whiskers'], color='black')
+    plt.setp(bp['fliers'], color='red', marker='+')
+    xtickNames = plt.setp(ax, xticklabels=legends[1:])
+    plt.setp(xtickNames, rotation=45, fontsize=8)
+    ax.set_ylabel('Mean Log probability P(Y*|Y)')
+    ax.set_xlabel('Distribution')
+    #Make grid and put it below boxes
+    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
+              alpha=0.5)
+    ax.set_axisbelow(True)
+    return score_folds, pred_density
 
 def precipitation_example():
     import sklearn

From 9ce51e94f6c5cd34e7b20083877a46b07114ea91 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 24 Oct 2013 15:19:09 +0100
Subject: [PATCH 136/252] Removed unnecessary laplace examples

---
 GPy/examples/laplace_approximations.py | 56 +-------------------------
 1 file changed, 1 insertion(+), 55 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index b5d0e8f8..b30d100f 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -142,54 +142,6 @@ def student_t_approx():
 
     return m
 
-def gaussian_f_check():
-    plt.close('all')
-    X = np.linspace(0, 1, 50)[:, None]
-    real_std = 0.2
-    noise = np.random.randn(*X.shape)*real_std
-    Y = np.sin(X*2*np.pi) + noise
-
-    kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
-    mgp.ensure_default_constraints()
-    mgp.randomize()
-    mgp.optimize()
-    print "Gaussian"
-    print mgp
-
-    kernelg = kernelgp.copy()
-    #kernelst += GPy.kern.bias(X.shape[1])
-    N, D = X.shape
-    g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=0.1, N=N, D=D)
-    g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution)
-    m = GPy.models.GPRegression(X, Y, kernelg, likelihood=g_likelihood)
-    m.likelihood.X = X
-    #m['rbf_v'] = mgp._get_params()[0]
-    #m['rbf_l'] = mgp._get_params()[1] + 1
-    m.ensure_default_constraints()
-    #m.constrain_fixed('rbf_v', mgp._get_params()[0])
-    #m.constrain_fixed('rbf_l', mgp._get_params()[1])
-    #m.constrain_bounded('t_no', 2*real_std**2, 1e3)
-    #m.constrain_positive('bias')
-    m.constrain_positive('noise_var')
-    #m['noise_variance'] = 0.1
-    #m.likelihood.X = X
-    m.randomize()
-    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
-    plt.figure()
-    ax = plt.subplot(211)
-    m.plot(ax=ax)
-
-    m.optimize()
-    ax = plt.subplot(212)
-    m.plot(ax=ax)
-
-    print "final optimised gaussian"
-    print m
-    print "real GP"
-    print mgp
-    import ipdb; ipdb.set_trace() ### XXX BREAKPOINT
-
 def boston_example():
     import sklearn
     from sklearn.cross_validation import KFold
@@ -337,7 +289,7 @@ def boston_example():
     ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
               alpha=0.5)
     ax.set_axisbelow(True)
-    return score_folds, pred_density
+    return mstu
 
 def precipitation_example():
     import sklearn
@@ -359,9 +311,3 @@ def precipitation_example():
     for n, (train, test) in enumerate(kf):
         X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
         print "Fold {}".format(n)
-
-
-def plot_f_approx(model):
-    plt.figure()
-    model.plot(ax=plt.gca())
-    plt.plot(model.X, model.likelihood.f_hat, c='g')

From de9e5e7fb0869e4bcb5bc927e32bdd8bf72f5a39 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 24 Oct 2013 15:21:40 +0100
Subject: [PATCH 137/252] Minor clean up

---
 GPy/examples/laplace_approximations.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index b30d100f..96b423f0 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -155,13 +155,15 @@ def boston_example():
     X = X/X.std(axis=0)
     Y = Y-Y.mean()
     Y = Y/Y.std()
-    num_folds = 30
+    num_folds = 10
     kf = KFold(len(Y), n_folds=num_folds, indices=True)
     num_models = len(degrees_freedoms) + 3 #3 for baseline, gaussian, gaussian laplace approx
     score_folds = np.zeros((num_models, num_folds))
     pred_density = score_folds.copy()
+
     def rmse(Y, Ystar):
         return np.sqrt(np.mean((Y-Ystar)**2))
+
     for n, (train, test) in enumerate(kf):
         X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
         print "Fold {}".format(n)
@@ -184,7 +186,7 @@ def boston_example():
         mgp['rbf_len'] = rbf_len
         mgp['noise'] = noise
         print mgp
-        mgp.optimize(optimizer=optimizer,messages=messages)
+        mgp.optimize(optimizer=optimizer, messages=messages)
         Y_test_pred = mgp.predict(X_test)
         score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
         pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test))
@@ -289,7 +291,7 @@ def boston_example():
     ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
               alpha=0.5)
     ax.set_axisbelow(True)
-    return mstu
+    return mstu_t
 
 def precipitation_example():
     import sklearn

From a46121c430c4fee5300d652d3e8ce249bf52d0ab Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 24 Oct 2013 15:49:20 +0100
Subject: [PATCH 138/252] Was a bug in the examples_tests.py, fixed and added
 brendan faces to ignore list

---
 GPy/testing/examples_tests.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/GPy/testing/examples_tests.py b/GPy/testing/examples_tests.py
index 989251a7..15dbe234 100644
--- a/GPy/testing/examples_tests.py
+++ b/GPy/testing/examples_tests.py
@@ -37,9 +37,8 @@ def model_checkgrads(model):
 
 def model_instance(model):
     #assert isinstance(model, GPy.core.model)
-    return isinstance(model, GPy.core.model)
+    return isinstance(model, GPy.core.model.Model)
 
-@nottest
 def test_models():
     examples_path = os.path.dirname(GPy.examples.__file__)
     # Load modules
@@ -54,7 +53,7 @@ def test_models():
         print "After"
         print functions
         for example in functions:
-            if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100']:
+            if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100', 'brendan_faces']:
                 print "SKIPPING"
                 continue
 

From 33b6a7d24fbec9400ee55fe9e669c74ed0d52e66 Mon Sep 17 00:00:00 2001
From: James Hensman <james@jamess-mbp.lan>
Date: Thu, 24 Oct 2013 19:32:37 +0100
Subject: [PATCH 139/252] turned omp off by default as discussed

---
 GPy/gpy_config.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/gpy_config.cfg b/GPy/gpy_config.cfg
index 8683f96c..d52edd28 100644
--- a/GPy/gpy_config.cfg
+++ b/GPy/gpy_config.cfg
@@ -4,4 +4,4 @@
 # Enable openmp support. This speeds up some computations, depending on the number
 # of cores available. Setting up a compiler with openmp support can be difficult on 
 # some platforms, hence this option.
-openmp=True
+openmp=False

From bddb22f4afc799699f18d431126068753197a7f2 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Thu, 24 Oct 2013 21:30:23 +0100
Subject: [PATCH 140/252] docstrings and removal of duplicated plotting code in
 gp_base

---
 GPy/core/gp_base.py | 191 ++++++--------------------------------------
 1 file changed, 25 insertions(+), 166 deletions(-)

diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index 083f9980..12e71c93 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -9,7 +9,9 @@ from ..likelihoods import Gaussian, Gaussian_Mixed_Noise
 class GPBase(Model):
     """
     Gaussian process base model for holding shared behaviour between
-    sparse_GP and GP models.
+    sparse_GP and GP models, and potentially other models in the future.
+
+    Here we define some functions that are use
     """
     def __init__(self, X, likelihood, kernel, normalize_X=False):
         self.X = X
@@ -34,29 +36,6 @@ class GPBase(Model):
         # All leaf nodes should call self._set_params(self._get_params()) at
         # the end
 
-    def getstate(self):
-        """
-        Get the current state of the class, here we return everything that is needed to recompute the model.
-        """
-        return Model.getstate(self) + [self.X,
-                self.num_data,
-                self.input_dim,
-                self.kern,
-                self.likelihood,
-                self.output_dim,
-                self._Xoffset,
-                self._Xscale]
-
-    def setstate(self, state):
-        self._Xscale = state.pop()
-        self._Xoffset = state.pop()
-        self.output_dim = state.pop()
-        self.likelihood = state.pop()
-        self.kern = state.pop()
-        self.input_dim = state.pop()
-        self.num_data = state.pop()
-        self.X = state.pop()
-        Model.setstate(self, state)
 
     def posterior_samples_f(self,X,size=10,which_parts='all',full_cov=True):
         """
@@ -269,152 +248,32 @@ class GPBase(Model):
         else:
             raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
 
-    def plot_single_output_f(self, output=None, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
+    def getstate(self):
         """
-        For a specific output, in a multioutput model, this function works just as plot_f on single output models.
-
-        :param output: which output to plot (for multiple output models only)
-        :type output: integer (first output is 0)
-        :param samples: the number of a posteriori samples to plot
-        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
-        :param which_data: which if the training data to plot (default all)
-        :type which_data: 'all' or a slice object to slice self.X, self.Y
-        :param which_parts: which of the kernel functions to plot (additively)
-        :type which_parts: 'all', or list of bools
-        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
-        :type resolution: int
-        :param full_cov:
-        :type full_cov: bool
-                :param fignum: figure to plot on.
-        :type fignum: figure number
-        :param ax: axes to plot on.
-        :type ax: axes handle
+        Get the curent state of the class. This is only used to efficiently
+        pickle the model. See also self.setstate
         """
-        assert output is not None, "An output must be specified."
-        assert len(self.likelihood.noise_model_list) > output, "The model has only %s outputs." %(self.output_dim + 1)
+        return Model.getstate(self) + [self.X,
+                self.num_data,
+                self.input_dim,
+                self.kern,
+                self.likelihood,
+                self.output_dim,
+                self._Xoffset,
+                self._Xscale]
 
-        if which_data == 'all':
-            which_data = slice(None)
-
-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-
-        if self.X.shape[1] == 2:
-            Xu = self.X[self.X[:,-1]==output ,0:1]
-            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
-            Xnew_indexed = self._add_output_index(Xnew,output)
-
-            m, v = self._raw_predict(Xnew_indexed, which_parts=which_parts)
-
-            if samples:
-                Ysim = self.posterior_samples_f(Xnew_indexed, samples, which_parts=which_parts, full_cov=True)
-                for yi in Ysim.T:
-                    ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
-
-            gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax)
-            ax.plot(Xu[which_data], self.likelihood.Y[self.likelihood.index==output][:,None], 'kx', mew=1.5)
-            ax.set_xlim(xmin, xmax)
-            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            ax.set_ylim(ymin, ymax)
-
-        elif self.X.shape[1] == 3:
-            raise NotImplementedError, "Plots not implemented for multioutput models with 2D inputs...yet"
-            #if samples:
-            #    warnings.warn("Samples only implemented for 1 dimensional inputs.")
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
-
-
-    def plot_single_output(self, output=None, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
+    def setstate(self, state):
         """
-        For a specific output, in a multioutput model, this function works just as plot_f on single output models.
-
-        :param output: which output to plot (for multiple output models only)
-        :type output: integer (first output is 0)
-        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
-        :type plot_limits: np.array
-        :param which_data: which if the training data to plot (default all)
-        :type which_data: 'all' or a slice object to slice self.X, self.Y
-        :param which_parts: which of the kernel functions to plot (additively)
-        :type which_parts: 'all', or list of bools
-        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
-        :type resolution: int
-        :param levels: number of levels to plot in a contour plot.
-        :type levels: int
-        :param samples: the number of a posteriori samples to plot
-        :type samples: int
-        :param fignum: figure to plot on.
-        :type fignum: figure number
-        :param ax: axes to plot on.
-        :type ax: axes handle
-        :type output: integer (first output is 0)
-        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
-        :type fixed_inputs: a list of tuples
-        :param linecol: color of line to plot.
-        :type linecol:
-        :param fillcol: color of fill
-        :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
+        Set the state of the model. Used for efficient pickling
         """
-        assert output is not None, "An output must be specified."
-        assert len(self.likelihood.noise_model_list) > output, "The model has only %s outputs." %(self.output_dim + 1)
-        if which_data == 'all':
-            which_data = slice(None)
-
-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-
-        if self.X.shape[1] == 2:
-            resolution = resolution or 200
-
-            Xu = self.X[self.X[:,-1]==output,:] #keep the output of interest
-            Xu = self.X * self._Xscale + self._Xoffset
-            Xu = self.X[self.X[:,-1]==output ,0:1] #get rid of the index column
-
-            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
-            Xnew_indexed = self._add_output_index(Xnew,output)
+        self._Xscale = state.pop()
+        self._Xoffset = state.pop()
+        self.output_dim = state.pop()
+        self.likelihood = state.pop()
+        self.kern = state.pop()
+        self.input_dim = state.pop()
+        self.num_data = state.pop()
+        self.X = state.pop()
+        Model.setstate(self, state)
 
 
-            m, v, lower, upper = self.predict(Xnew_indexed, which_parts=which_parts,noise_model=output)
-
-            if samples: #NOTE not tested with fixed_inputs
-                Ysim = self.posterior_samples(Xnew_indexed, samples, which_parts=which_parts, full_cov=True,noise_model=output)
-                for yi in Ysim.T:
-                    ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
-
-            for d in range(m.shape[1]):
-                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
-                ax.plot(Xu[which_data], self.likelihood.noise_model_list[output].data, 'kx', mew=1.5)
-            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            ax.set_xlim(xmin, xmax)
-            ax.set_ylim(ymin, ymax)
-
-        elif self.X.shape[1] == 3:
-            raise NotImplementedError, "Plots not implemented for multioutput models with 2D inputs...yet"
-            #if samples:
-            #    warnings.warn("Samples only implemented for 1 dimensional inputs.")
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
-
-
-    def _add_output_index(self,X,output):
-        """
-        In a multioutput model, appends an index column to X to specify the output it is related to.
-
-        :param X: Input data
-        :type X: np.ndarray, N x self.input_dim
-        :param output: output X is related to
-        :type output: integer in {0,..., output_dim-1}
-
-        .. Note:: For multiple non-independent outputs models only.
-        """
-
-        assert hasattr(self,'multioutput'), 'This function is for multiple output models only.'
-
-        index = np.ones((X.shape[0],1))*output
-        return np.hstack((X,index))

From 683f45366b451298e03e1cb839ff50fd1312bdd0 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Thu, 24 Oct 2013 21:58:51 +0100
Subject: [PATCH 141/252] some tidying in gp.py

---
 GPy/core/gp.py        |  21 +++---
 GPy/core/sparse_gp.py | 168 ++++--------------------------------------
 2 files changed, 22 insertions(+), 167 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 67eb7c69..2ea09117 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -27,12 +27,6 @@ class GP(GPBase):
         GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
         self._set_params(self._get_params())
 
-    def getstate(self):
-        return GPBase.getstate(self)
-
-    def setstate(self, state):
-        GPBase.setstate(self, state)
-        self._set_params(self._get_params())
 
     def _set_params(self, p):
         self.kern._set_params_transformed(p[:self.kern.num_params_transformed()])
@@ -101,12 +95,7 @@ class GP(GPBase):
 
         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
-        #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
-        if not isinstance(self.likelihood,EP):
-            tmp = np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
-        else:
-            tmp = np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
-        return tmp
+        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
 
     def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
         """
@@ -193,3 +182,11 @@ class GP(GPBase):
         """
         Xnew = self._add_output_index(Xnew, output)
         return self.predict(Xnew, which_parts=which_parts, full_cov=full_cov, likelihood_args=likelihood_args)
+
+    def getstate(self):
+        return GPBase.getstate(self)
+
+    def setstate(self, state):
+        GPBase.setstate(self, state)
+        self._set_params(self._get_params())
+
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 9251fcd6..8c8df30c 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -52,23 +52,6 @@ class SparseGP(GPBase):
 
         self._const_jitter = None
 
-    def getstate(self):
-        """
-        Get the current state of the class,
-        here just all the indices, rest can get recomputed
-        """
-        return GPBase.getstate(self) + [self.Z,
-                self.num_inducing,
-                self.has_uncertain_inputs,
-                self.X_variance]
-
-    def setstate(self, state):
-        self.X_variance = state.pop()
-        self.has_uncertain_inputs = state.pop()
-        self.num_inducing = state.pop()
-        self.Z = state.pop()
-        GPBase.setstate(self, state)
-
     def _compute_kernel_matrices(self):
         # kernel computations, using BGPLVM notation
         self.Kmm = self.kern.K(self.Z)
@@ -87,7 +70,6 @@ class SparseGP(GPBase):
 
         # factor Kmm
         self._Lm = jitchol(self.Kmm + self._const_jitter)
-        # TODO: no white kernel needed anymore, all noise in likelihood --------
 
         # The rather complex computations of self._A
         if self.has_uncertain_inputs:
@@ -421,145 +403,21 @@ class SparseGP(GPBase):
         else:
             raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
 
-    def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
+    def getstate(self):
         """
-        For a specific output, predict the function at the new point(s) Xnew.
-
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        :param output: output to predict
-        :type output: integer in {0,..., num_outputs-1}
-        :param which_parts:  specifies which outputs kernel(s) to use in prediction
-        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-        :type full_cov: bool
-        :rtype: posterior mean,  a Numpy array, Nnew x self.input_dim
-        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.input_dim
-
-        .. Note:: For multiple output models only
+        Get the current state of the class,
+        here just all the indices, rest can get recomputed
         """
+        return GPBase.getstate(self) + [self.Z,
+                self.num_inducing,
+                self.has_uncertain_inputs,
+                self.X_variance]
 
-        assert hasattr(self,'multioutput')
-        index = np.ones_like(Xnew)*output
-        Xnew = np.hstack((Xnew,index))
-
-        # normalize X values
-        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
-        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
-
-        # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, noise_model = output)
-        return mean, var, _025pm, _975pm
-
-    def _raw_predict_single_output(self, _Xnew, output=0, X_variance_new=None, which_parts='all', full_cov=False,stop=False):
-        """
-        Internal helper function for making predictions for a specific output,
-        does not account for normalization or likelihood
-        ---------
-
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        :param output: output to predict
-        :type output: integer in {0,..., num_outputs-1}
-        :param which_parts:  specifies which outputs kernel(s) to use in prediction
-        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-
-        .. Note:: For multiple output models only
-        """
-        Bi, _ = dpotri(self.LB, lower=0)  # WTH? this lower switch should be 1, but that doesn't work!
-        symmetrify(Bi)
-        Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi)
-
-        if self.Cpsi1V is None:
-            psi1V = np.dot(self.psi1.T,self.likelihood.V)
-            tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
-            tmp, _ = dpotrs(self.LB, tmp, lower=1)
-            self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1)
-
-        assert hasattr(self,'multioutput')
-        index = np.ones_like(_Xnew)*output
-        _Xnew = np.hstack((_Xnew,index))
-
-        if X_variance_new is None:
-            Kx = self.kern.K(self.Z, _Xnew, which_parts=which_parts)
-            mu = np.dot(Kx.T, self.Cpsi1V)
-            if full_cov:
-                Kxx = self.kern.K(_Xnew, which_parts=which_parts)
-                var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting
-            else:
-                Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
-                var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0)
-        else:
-            Kx = self.kern.psi1(self.Z, _Xnew, X_variance_new)
-            mu = np.dot(Kx, self.Cpsi1V)
-            if full_cov:
-                raise NotImplementedError, "TODO"
-            else:
-                Kxx = self.kern.psi0(self.Z, _Xnew, X_variance_new)
-                psi2 = self.kern.psi2(self.Z, _Xnew, X_variance_new)
-                var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
-
-        return mu, var[:, None]
+    def setstate(self, state):
+        self.X_variance = state.pop()
+        self.has_uncertain_inputs = state.pop()
+        self.num_inducing = state.pop()
+        self.Z = state.pop()
+        GPBase.setstate(self, state)
 
 
-    def plot_single_output_f(self, output=None, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
-
-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-        if fignum is None and ax is None:
-                fignum = fig.num
-        if which_data is 'all':
-            which_data = slice(None)
-
-        GPBase.plot_single_output_f(self, output=output, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, full_cov=full_cov, fignum=fignum, ax=ax)
-
-        if self.X.shape[1] == 2:
-            if self.has_uncertain_inputs:
-                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
-                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
-                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
-            Zu = self.Z * self._Xscale + self._Xoffset
-            Zu = Zu[Zu[:,1]==output,0:1]
-            ax.plot(Zu[:,0], np.zeros_like(Zu[:,0]) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
-
-        elif self.X.shape[1] == 2:
-            Zu = self.Z * self._Xscale + self._Xoffset
-            Zu = Zu[Zu[:,1]==output,0:2]
-            ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
-
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
-
-    def plot_single_output(self, output=None, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-        if fignum is None and ax is None:
-                fignum = fig.num
-        if which_data is 'all':
-            which_data = slice(None)
-
-        GPBase.plot_single_output(self, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, levels=20, fignum=fignum, ax=ax, output=output)
-
-        if self.X.shape[1] == 2:
-            if self.has_uncertain_inputs:
-                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
-                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
-                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
-            Zu = self.Z * self._Xscale + self._Xoffset
-            Zu = Zu[Zu[:,1]==output,0:1]
-            ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
-
-        elif self.X.shape[1] == 3:
-            Zu = self.Z * self._Xscale + self._Xoffset
-            Zu = Zu[Zu[:,1]==output,0:1]
-            ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

From eeb5f59fca5936be0eb80a414f67497f52a8f59c Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Thu, 24 Oct 2013 22:06:07 +0100
Subject: [PATCH 142/252] improved docstrings in svigp

---
 GPy/core/svigp.py | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py
index c5ea9c6b..9f27f465 100644
--- a/GPy/core/svigp.py
+++ b/GPy/core/svigp.py
@@ -18,30 +18,16 @@ class SVIGP(GPBase):
     Stochastic Variational inference in a Gaussian Process
 
     :param X: inputs
-    :type X: np.ndarray (N x Q)
+    :type X: np.ndarray (num_data x num_inputs)
     :param Y: observed data
-    :type Y: np.ndarray of observations (N x D)
-    :param batchsize: the size of a h
-
-    Additional kwargs are used as for a sparse GP. They include:
-
+    :type Y: np.ndarray of observations (num_data x output_dim)
+    :param batchsize: the size of a minibatch
     :param q_u: canonical parameters of the distribution squasehd into a 1D array
     :type q_u: np.ndarray
-    :param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
-    :type M: int
     :param kernel: the kernel/covariance function. See link kernels
     :type kernel: a GPy kernel
-    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (M x Q) | None
-    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
-    :type X_uncertainty: np.ndarray (N x Q) | None
-    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
-    :param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
-    :type M: int
-    :param beta: noise precision. TODO: ignore beta if doing EP
-    :type beta: float
-    :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
-    :type normalize_(X|Y): bool
+    :param Z: inducing inputs
+    :type Z: np.ndarray (num_inducing x num_inputs)
 
     """
 

From 7190e0e6bb4f3e4aebcab8ce9360b2f1cbe3aa04 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Thu, 24 Oct 2013 22:13:52 +0100
Subject: [PATCH 143/252] general tidying in models

---
 GPy/models/bayesian_gplvm.py | 25 ++++++++++---------
 GPy/models/bcgplvm.py        |  2 +-
 GPy/models/gp_regression.py  |  2 --
 GPy/models/gplvm.py          | 16 ++++++------
 GPy/models/mrd.py            | 47 ++++++++++++++++++------------------
 5 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index d4d29711..21b46a8a 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -49,18 +49,6 @@ class BayesianGPLVM(SparseGP, GPLVM):
         SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
         self.ensure_default_constraints()
 
-    def getstate(self):
-        """
-        Get the current state of the class,
-        here just all the indices, rest can get recomputed
-        """
-        return SparseGP.getstate(self) + [self.init]
-
-    def setstate(self, state):
-        self._const_jitter = None
-        self.init = state.pop()
-        SparseGP.setstate(self, state)
-
     def _get_param_names(self):
         X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
         S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
@@ -285,6 +273,19 @@ class BayesianGPLVM(SparseGP, GPLVM):
         fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
         return fig
 
+    def getstate(self):
+        """
+        Get the current state of the class,
+        here just all the indices, rest can get recomputed
+        """
+        return SparseGP.getstate(self) + [self.init]
+
+    def setstate(self, state):
+        self._const_jitter = None
+        self.init = state.pop()
+        SparseGP.setstate(self, state)
+
+
 def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
     """
     objective function for fitting the latent variables for test points
diff --git a/GPy/models/bcgplvm.py b/GPy/models/bcgplvm.py
index 9f5866c3..92db6953 100644
--- a/GPy/models/bcgplvm.py
+++ b/GPy/models/bcgplvm.py
@@ -7,7 +7,7 @@ import pylab as pb
 import sys, pdb
 from ..core import GP
 from ..models import GPLVM
-from ..mappings import *
+from ..mappings import Kernel
 
 
 class BCGPLVM(GPLVM):
diff --git a/GPy/models/gp_regression.py b/GPy/models/gp_regression.py
index 86e1f7de..1644b661 100644
--- a/GPy/models/gp_regression.py
+++ b/GPy/models/gp_regression.py
@@ -39,5 +39,3 @@ class GPRegression(GP):
 
     def setstate(self, state):
         return GP.setstate(self, state)
-
-    pass
diff --git a/GPy/models/gplvm.py b/GPy/models/gplvm.py
index ad78d51f..795389a7 100644
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@@ -44,12 +44,6 @@ class GPLVM(GP):
             Xr[:PC.shape[0], :PC.shape[1]] = PC
         return Xr
 
-    def getstate(self):
-        return GP.getstate(self)
-
-    def setstate(self, state):
-        GP.setstate(self, state)
-
     def _get_param_names(self):
         return sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], []) + GP._get_param_names(self)
 
@@ -68,7 +62,7 @@ class GPLVM(GP):
     def jacobian(self,X):
         target = np.zeros((X.shape[0],X.shape[1],self.output_dim))
         for i in range(self.output_dim):
-        	target[:,:,i]=self.kern.dK_dX(np.dot(self.Ki,self.likelihood.Y[:,i])[None, :],X,self.X)
+        	target[:,:,i] = self.kern.dK_dX(np.dot(self.Ki,self.likelihood.Y[:,i])[None, :],X,self.X)
         return target
    
     def magnification(self,X):
@@ -91,3 +85,11 @@ class GPLVM(GP):
 
     def plot_magnification(self, *args, **kwargs):
         return util.plot_latent.plot_magnification(self, *args, **kwargs)
+
+    def getstate(self):
+        return GP.getstate(self)
+
+    def setstate(self, state):
+        GP.setstate(self, state)
+
+
diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py
index 1435028f..2aaa731c 100644
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@@ -81,29 +81,6 @@ class MRD(Model):
         Model.__init__(self)
         self.ensure_default_constraints()
 
-    def getstate(self):
-        return Model.getstate(self) + [self.names,
-                self.bgplvms,
-                self.gref,
-                self.nparams,
-                self.input_dim,
-                self.num_inducing,
-                self.num_data,
-                self.NQ,
-                self.MQ]
-
-    def setstate(self, state):
-        self.MQ = state.pop()
-        self.NQ = state.pop()
-        self.num_data = state.pop()
-        self.num_inducing = state.pop()
-        self.input_dim = state.pop()
-        self.nparams = state.pop()
-        self.gref = state.pop()
-        self.bgplvms = state.pop()
-        self.names = state.pop()
-        Model.setstate(self, state)
-
     @property
     def X(self):
         return self.gref.X
@@ -371,4 +348,28 @@ class MRD(Model):
         pylab.draw()
         fig.tight_layout()
 
+    def getstate(self):
+        return Model.getstate(self) + [self.names,
+                self.bgplvms,
+                self.gref,
+                self.nparams,
+                self.input_dim,
+                self.num_inducing,
+                self.num_data,
+                self.NQ,
+                self.MQ]
+
+    def setstate(self, state):
+        self.MQ = state.pop()
+        self.NQ = state.pop()
+        self.num_data = state.pop()
+        self.num_inducing = state.pop()
+        self.input_dim = state.pop()
+        self.nparams = state.pop()
+        self.gref = state.pop()
+        self.bgplvms = state.pop()
+        self.names = state.pop()
+        Model.setstate(self, state)
+
+
 

From dc2a8a531ef954bdd154827c75fa10d71b69cd14 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Fri, 25 Oct 2013 09:51:41 +0100
Subject: [PATCH 144/252] started changing the plotting in examples to remove
 plot_single_output

---
 GPy/examples/regression.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index 3bf2377e..ca4f506d 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -57,8 +57,8 @@ def coregionalization_toy(max_iters=100):
     m.optimize(max_iters=max_iters)
 
     fig, axes = pb.subplots(2,1)
-    m.plot_single_output(output=0,ax=axes[0])
-    m.plot_single_output(output=1,ax=axes[1])
+    m.plot(fixed_inputs=[(1,0)],ax=axes[0])
+    m.plot(fixed_inputs=[(1,1)],ax=axes[1])
     axes[0].set_title('Output 0')
     axes[1].set_title('Output 1')
     return m

From 8ef36258321df6e324c79c0153f7930eac17bb7a Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 25 Oct 2013 12:21:11 +0100
Subject: [PATCH 145/252] Reimplemented gradients for exponential, seems to
 work for laplace now, needs a visual test though

---
 GPy/likelihoods/noise_model_constructors.py   |   2 +-
 .../noise_models/exponential_noise.py         | 116 +++++++++++++++---
 .../noise_models/noise_distributions.py       |   9 --
 .../noise_models/student_t_noise.py           |  32 +++--
 GPy/testing/likelihoods_tests.py              |   7 ++
 5 files changed, 134 insertions(+), 32 deletions(-)

diff --git a/GPy/likelihoods/noise_model_constructors.py b/GPy/likelihoods/noise_model_constructors.py
index 95247c03..e626c6a3 100644
--- a/GPy/likelihoods/noise_model_constructors.py
+++ b/GPy/likelihoods/noise_model_constructors.py
@@ -37,7 +37,7 @@ def exponential(gp_link=None):
     :param gp_link: a GPy gp_link function
     """
     if gp_link is None:
-        gp_link = noise_models.gp_transformations.Identity()
+        gp_link = noise_models.gp_transformations.Log_ex_1()
 
     analytical_mean = False
     analytical_variance = False
diff --git a/GPy/likelihoods/noise_models/exponential_noise.py b/GPy/likelihoods/noise_models/exponential_noise.py
index 450c11be..8e916353 100644
--- a/GPy/likelihoods/noise_models/exponential_noise.py
+++ b/GPy/likelihoods/noise_models/exponential_noise.py
@@ -24,24 +24,112 @@ class Exponential(NoiseDistribution):
     def _preprocess_values(self,Y):
         return Y
 
-    def _mass(self,gp,obs):
+    def pdf_link(self, link_f, y, extra_data=None):
         """
-        Mass (or density) function
-        """
-        return np.exp(-obs/self.gp_link.transf(gp))/self.gp_link.transf(gp)
+        Likelihood function given link(f)
 
-    def _nlog_mass(self,gp,obs):
-        """
-        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
-        """
-        return obs/self.gp_link.transf(gp) + np.log(self.gp_link.transf(gp))
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\lambda(f_{i})\\exp (-y\\lambda(f_{i}))
 
-    def _dnlog_mass_dgp(self,gp,obs):
-        return ( 1./self.gp_link.transf(gp) - obs/self.gp_link.transf(gp)**2) * self.gp_link.dtransf_df(gp)
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        return np.exp(np.sum(np.log(link_f*np.exp(-y*link_f))))
+        #return np.exp(np.sum(-y/link_f - np.log(link_f) ))
 
-    def _d2nlog_mass_dgp2(self,gp,obs):
-        fgp = self.gp_link.transf(gp)
-        return (2*obs/fgp**3 - 1./fgp**2) * self.gp_link.dtransf_df(gp)**2 + ( 1./fgp - obs/fgp**2) * self.gp_link.d2transf_df2(gp)
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log Likelihood Function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\lambda(f_{i})) = \\ln \\lambda(f_{i}) - y_{i}\\lambda(f_{i})
+
+        :param link_f: latent variables (link(f))
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        logpdf_link = np.sum(np.log(link_f) - y*link_f)
+        #logpdf_link = np.sum(-np.log(link_f) - y/link_f)
+        return logpdf_link
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{1}{\\lambda(f)} - y_{i}
+
+        :param link_f: latent variables (f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        grad = 1./link_f - y
+        #grad = y/(link_f**2) - 1./link_f
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link(f), w.r.t link(f)
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\frac{1}{\\lambda(f_{i})^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        hess = -1./(link_f**2)
+        #hess = -2*y/(link_f**3) + 1/(link_f**2)
+        return hess
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2}{\\lambda(f_{i})^{3}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in exponential distribution
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d3lik_dlink3 = 2./(link_f**3)
+        #d3lik_dlink3 = 6*y/(link_f**4) - 2./(link_f**3)
+        return d3lik_dlink3
 
     def _mean(self,gp):
         """
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 3cd46013..165f8d2e 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -222,21 +222,12 @@ class NoiseDistribution(object):
         raise NotImplementedError
 
     def dlogpdf_link_dtheta(self, link_f, y, extra_data=None):
-        """
-        Need to check if it should even exist by checking length of getparams
-        """
         raise NotImplementedError
 
     def dlogpdf_dlink_dtheta(self, link_f, y, extra_data=None):
-        """
-        Need to check if it should even exist by checking length of getparams
-        """
         raise NotImplementedError
 
     def d2logpdf_dlink2_dtheta(self, link_f, y, extra_data=None):
-        """
-        Need to check if it should even exist by checking length of getparams
-        """
         raise NotImplementedError
 
     def pdf(self, f, y, extra_data=None):
diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index 7937a507..f268c644 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -55,7 +55,7 @@ class StudentT(NoiseDistribution):
         :returns: likelihood evaluated for this point
         :rtype: float
         """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         e = y - link_f
         #Careful gamma(big_number) is infinity!
         objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
@@ -80,7 +80,7 @@ class StudentT(NoiseDistribution):
         :rtype: float
 
         """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         e = y - link_f
         objective = (+ gammaln((self.v + 1) * 0.5)
                      - gammaln(self.v * 0.5)
@@ -105,7 +105,7 @@ class StudentT(NoiseDistribution):
         :rtype: Nx1 array
 
         """
-        assert y.shape == link_f.shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         e = y - link_f
         grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
         return grad
@@ -131,7 +131,7 @@ class StudentT(NoiseDistribution):
             Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
             (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
         """
-        assert y.shape == link_f.shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         e = y - link_f
         hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
         return hess
@@ -151,7 +151,7 @@ class StudentT(NoiseDistribution):
         :returns: third derivative of likelihood evaluated at points f
         :rtype: Nx1 array
         """
-        assert y.shape == link_f.shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         e = y - link_f
         d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
                        ((e**2 + self.sigma2*self.v)**3)
@@ -173,7 +173,7 @@ class StudentT(NoiseDistribution):
         :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
         :rtype: float
         """
-        assert y.shape == link_f.shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         e = y - link_f
         dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
         return np.sum(dlogpdf_dvar)
@@ -193,7 +193,7 @@ class StudentT(NoiseDistribution):
         :returns: derivative of likelihood evaluated at points f w.r.t variance parameter
         :rtype: Nx1 array
         """
-        assert y.shape == link_f.shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         e = y - link_f
         dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
         return dlogpdf_dlink_dvar
@@ -213,7 +213,7 @@ class StudentT(NoiseDistribution):
         :returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
         :rtype: Nx1 array
         """
-        assert y.shape == link_f.shape
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         e = y - link_f
         d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
                               / ((self.sigma2*self.v + (e**2))**3)
@@ -314,3 +314,19 @@ class StudentT(NoiseDistribution):
         p_025 = mu - p
         p_975 = mu + p
         return mu, np.nan*mu, p_025, p_975
+
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param size: number of samples to compute
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        f = self.gp_link.transf(gp)
+        #student_t_samples = stats.t.rvs(self.v, loc=f,
+                                        #scale=np.sqrt(self.sigma2),
+                                        #size=(num_test_points, num_y_samples, num_f_samples))
+        #Ysim = np.array([np.random.binomial(1,self.gp_link.transf(gpj),size=1) for gpj in gp])
+        return Ysim.reshape(orig_shape)
diff --git a/GPy/testing/likelihoods_tests.py b/GPy/testing/likelihoods_tests.py
index fff5dcac..c3ea6a43 100644
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@@ -83,6 +83,7 @@ class TestNoiseModels(object):
         self.Y = (np.sin(self.X[:, 0]*2*np.pi) + noise)[:, None]
         self.f = np.random.rand(self.N, 1)
         self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
+        self.positive_Y = np.exp(self.Y.copy())
 
         self.var = 0.2
 
@@ -216,6 +217,12 @@ class TestNoiseModels(object):
                             "laplace": True,
                             "Y": self.binary_Y,
                             "ep": True
+                            },
+                        "Exponential_default": {
+                            "model": GPy.likelihoods.exponential(),
+                            "link_f_constraints": [constrain_positive],
+                            "Y": self.positive_Y,
+                            "laplace": True,
                         }
                     }
 

From 2fdb60287f768db6e08ae3c515ad711cf5f61376 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 25 Oct 2013 15:08:53 +0100
Subject: [PATCH 146/252] Added derivatives for poisson and a couple of
 examples, need to fix for EP.

---
 GPy/examples/regression.py                    |  44 ++++++
 GPy/likelihoods/noise_models/poisson_noise.py | 132 +++++++++++++++---
 GPy/testing/likelihoods_tests.py              |  11 ++
 3 files changed, 169 insertions(+), 18 deletions(-)

diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index ca4f506d..2978ebdc 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -270,6 +270,50 @@ def toy_rbf_1d_50(max_iters=100):
     print(m)
     return m
 
+def toy_poisson_rbf_1d(optimizer='bfgs', max_nb_eval_optim=100):
+    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
+    X = np.linspace(0,10)[:, None]
+    F = np.round(X*3-4)
+    F = np.where(F > 0, F, 0)
+    eps = np.random.randint(0,4, F.shape[0])[:, None]
+    Y = F + eps
+
+    noise_model = GPy.likelihoods.poisson()
+    likelihood = GPy.likelihoods.EP(Y,noise_model)
+
+    # create simple GP Model
+    m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
+
+    # optimize
+    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
+    # plot
+    m.plot()
+    print(m)
+    return m
+
+def toy_poisson_rbf_1d_laplace(optimizer='bfgs', max_nb_eval_optim=100):
+    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
+    X = np.linspace(0,10)[:, None]
+    F = np.round(X*3-4)
+    F = np.where(F > 0, F, 0)
+    eps = np.random.randint(0,4, F.shape[0])[:, None]
+    Y = F + eps
+
+    noise_model = GPy.likelihoods.poisson()
+    likelihood = GPy.likelihoods.Laplace(Y,noise_model)
+
+    # create simple GP Model
+    m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
+
+    # optimize
+    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
+    # plot
+    m.plot()
+    print(m)
+    return m
+
+
+
 def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
     # Create an artificial dataset where the values in the targets (Y)
     # only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
diff --git a/GPy/likelihoods/noise_models/poisson_noise.py b/GPy/likelihoods/noise_models/poisson_noise.py
index 80d7951b..fba00417 100644
--- a/GPy/likelihoods/noise_models/poisson_noise.py
+++ b/GPy/likelihoods/noise_models/poisson_noise.py
@@ -1,7 +1,7 @@
+from __future__ import division
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-
 import numpy as np
 from scipy import stats,special
 import scipy as sp
@@ -14,9 +14,10 @@ class Poisson(NoiseDistribution):
     Poisson likelihood
 
     .. math::
-        L(x) = \\exp(\\lambda) * \\frac{\\lambda^Y_i}{Y_i!}
+        p(y_{i}|\\lambda(f_{i})) = \\frac{\\lambda(f_{i})^{y_{i}}}{y_{i}!}e^{-\\lambda(f_{i})}
 
-    ..Note: Y is expected to take values in {0,1,2,...}
+    .. Note::
+        Y is expected to take values in {0,1,2,...}
     """
     def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
         super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance)
@@ -24,25 +25,108 @@ class Poisson(NoiseDistribution):
     def _preprocess_values(self,Y): #TODO
         return Y
 
-    def _mass(self,gp,obs):
+    def pdf_link(self, link_f, y, extra_data=None):
         """
-        Mass (or density) function
-        """
-        return stats.poisson.pmf(obs,self.gp_link.transf(gp))
+        Likelihood function given link(f)
 
-    def _nlog_mass(self,gp,obs):
-        """
-        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
-        """
-        return self.gp_link.transf(gp) - obs * np.log(self.gp_link.transf(gp)) + np.log(special.gamma(obs+1))
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\frac{\\lambda(f_{i})^{y_{i}}}{y_{i}!}e^{-\\lambda(f_{i})}
 
-    def _dnlog_mass_dgp(self,gp,obs):
-        return self.gp_link.dtransf_df(gp) * (1. - obs/self.gp_link.transf(gp))
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        return np.prod(stats.poisson.pmf(y,link_f))
 
-    def _d2nlog_mass_dgp2(self,gp,obs):
-        d2_df = self.gp_link.d2transf_df2(gp)
-        transf = self.gp_link.transf(gp)
-        return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
+    def logpdf_link(self, link_f, y, extra_data=None):
+        """
+        Log Likelihood Function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\lambda(f_{i})) = -\\lambda(f_{i}) + y_{i}\\log \\lambda(f_{i}) - \\log y_{i}!
+
+        :param link_f: latent variables (link(f))
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        return np.sum(-link_f + y*np.log(link_f) - special.gammaln(y+1))
+
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - 1
+
+        :param link_f: latent variables (f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        return y/link_f - 1
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link(f), w.r.t link(f)
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{-y_{i}}{\\lambda(f_{i})^{2}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        hess = -y/(link_f**2)
+        return hess
+        #d2_df = self.gp_link.d2transf_df2(gp)
+        #transf = self.gp_link.transf(gp)
+        #return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f_{i})^{3}}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d3lik_dlink3 = 2*y/(link_f)**3
+        return d3lik_dlink3
 
     def _mean(self,gp):
         """
@@ -55,3 +139,15 @@ class Poisson(NoiseDistribution):
         Mass (or density) function
         """
         return self.gp_link.transf(gp)
+
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param size: number of samples to compute
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        Ysim = np.array([np.random.poisson(self.gp_link.transf(gpj),size=1) for gpj in gp])
+        return Ysim.reshape(orig_shape)
diff --git a/GPy/testing/likelihoods_tests.py b/GPy/testing/likelihoods_tests.py
index c3ea6a43..155842fd 100644
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@@ -84,6 +84,10 @@ class TestNoiseModels(object):
         self.f = np.random.rand(self.N, 1)
         self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
         self.positive_Y = np.exp(self.Y.copy())
+        self.integer_Y = np.round(self.X[:, 0]*3-3)[:, None] + np.random.randint(0,3, self.X.shape[0])[:, None]
+        self.integer_Y = np.where(self.integer_Y > 0, self.integer_Y, 0)
+        print self.integer_Y
+        print self.Y
 
         self.var = 0.2
 
@@ -223,6 +227,13 @@ class TestNoiseModels(object):
                             "link_f_constraints": [constrain_positive],
                             "Y": self.positive_Y,
                             "laplace": True,
+                        },
+                        "Poisson_default": {
+                            "model": GPy.likelihoods.poisson(),
+                            "link_f_constraints": [constrain_positive],
+                            "Y": self.integer_Y,
+                            "laplace": True,
+                            "ep": False #Should work though...
                         }
                     }
 

From 1fe92b2515af5b57e7231f84cdd1a4c7b0366713 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Sat, 26 Oct 2013 15:01:35 +0100
Subject: [PATCH 147/252] fixed up plot in GP_base

---
 GPy/core/gp_base.py | 59 +++++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 21 deletions(-)

diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index 12e71c93..ca1e75af 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -162,7 +162,7 @@ class GPBase(Model):
         Plot the posterior of the GP.
           - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
           - In two dimsensions, a contour-plot shows the mean predicted function
-          - Not implemented in higher dimensions
+          - In higher dimensions, use fixed_inputs to plot the GP  with some of the inputs fixed.
 
         Can plot only part of the data and part of the posterior functions
         using which_data and which_functions
@@ -198,52 +198,69 @@ class GPBase(Model):
             fig = pb.figure(num=fignum)
             ax = fig.add_subplot(111)
 
-        plotdims = self.input_dim - len(fixed_inputs)
-        if plotdims == 1:
+        #work out what the inputs are for plotting (1D or 2D)
+        fixed_dims = np.array([i for i,v in fixed_inputs])
+        free_dims = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
+
+        #one dimensional plotting
+        if len(free_dims) == 1:
+
+            #define the frame on which to plot
             resolution = resolution or 200
-
             Xu = self.X * self._Xscale + self._Xoffset #NOTE self.X are the normalized values now
-
-            fixed_dims = np.array([i for i,v in fixed_inputs])
-            freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
-
-            Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
+            Xnew, xmin, xmax = x_frame1D(Xu[:,free_dims], plot_limits=plot_limits)
             Xgrid = np.empty((Xnew.shape[0],self.input_dim))
-            Xgrid[:,freedim] = Xnew
+            Xgrid[:,free_dims] = Xnew
             for i,v in fixed_inputs:
                 Xgrid[:,i] = v
 
+            #make a prediction on the frame and plot it
             m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts)
+            for d in range(m.shape[1]):
+                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
+                ax.plot(Xu[which_data,free_dims], self.likelihood.data[which_data, d], 'kx', mew=1.5)
 
+            #optionally plot some samples
             if samples: #NOTE not tested with fixed_inputs
                 Ysim = self.posterior_samples(Xgrid, samples, which_parts=which_parts, full_cov=True)
                 for yi in Ysim.T:
                     ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
                     #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
 
-            for d in range(m.shape[1]):
-                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
-                ax.plot(Xu[which_data,freedim], self.likelihood.data[which_data, d], 'kx', mew=1.5)
+
+            #set the limits of the plot to some sensible values
             ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
             ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
             ax.set_xlim(xmin, xmax)
             ax.set_ylim(ymin, ymax)
 
-        elif self.X.shape[1] == 2:
+        #2D plotting
+        elif len(free_dims) == 2:
 
+            #define the frame for plotting on
             resolution = resolution or 50
-            Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
+            Xu = self.X * self._Xscale + self._Xoffset #NOTE self.X are the normalized values now
+            Xnew, _, _, xmin, xmax = x_frame2D(Xu[:,free_dims], plot_limits, resolution)
+            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
+            Xgrid[:,free_dims] = Xnew
+            for i,v in fixed_inputs:
+                Xgrid[:,i] = v
             x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
-            m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
-            m = m.reshape(resolution, resolution).T
-            ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
-            Yf = self.likelihood.Y.flatten()
-            ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable
+
+            #predict on the frame and plot
+            m, _, _, _ = self.predict(Xgrid, which_parts=which_parts)
+            for d in range(m.shape[1]):
+                m_d = m[:,d].reshape(resolution, resolution).T
+                ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
+                Y_d = self.likelihood.Y[:,d]
+                ax.scatter(self.X[:, free_dims[0]], self.X[:, free_dims[1]], 40, Y_d, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
+
+            #set the limits of the plot to some sensible values
             ax.set_xlim(xmin[0], xmax[0])
             ax.set_ylim(xmin[1], xmax[1])
 
             if samples:
-                warnings.warn("Samples only implemented for 1 dimensional inputs.")
+                warnings.warn("Samples are rather difficult to plot for 2D inputs...")
 
         else:
             raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

From eedeaa4492fc0ce5fccd4598be5079398b9acb82 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Sat, 26 Oct 2013 19:57:21 +0100
Subject: [PATCH 148/252] fixed up the plotting

---
 GPy/core/gp_base.py | 124 +++++++++++++++-----------------------------
 1 file changed, 43 insertions(+), 81 deletions(-)

diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index ca1e75af..7b84b547 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -89,90 +89,43 @@ class GPBase(Model):
 
         return Ysim
 
-    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
+    def plot_f(self, *args, **kwargs):
         """
-        Plot the GP's view of the world, where the data is normalized and the
-          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
-          - In two dimsensions, a contour-plot shows the mean predicted function
-          - Not implemented in higher dimensions
+        Plot the GP's view of the world, where the data is normalized and before applying a likelihood.
 
-        :param samples: the number of a posteriori samples to plot
-        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
-        :param which_data: which if the training data to plot (default all)
-        :type which_data: 'all' or a slice object to slice self.X, self.Y
-        :param which_parts: which of the kernel functions to plot (additively)
-        :type which_parts: 'all', or list of bools
-        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
-        :type resolution: int
-        :param full_cov:
-        :type full_cov: bool
-                :param fignum: figure to plot on.
-        :type fignum: figure number
-        :param ax: axes to plot on.
-        :type ax: axes handle
+        This is a convenience function: we simply call self.plot with the
+        argument use_raw_predict set True. All args and kwargs are passed on to
+        plot.
 
-        :param output: which output to plot (for multiple output models only)
-        :type output: integer (first output is 0)
+        see also: gp_base.plot
         """
-        if which_data == 'all':
-            which_data = slice(None)
-
-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-
-        if self.X.shape[1] == 1:
-            resolution = resolution or 200
-            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
-
-            m, v = self._raw_predict(Xnew, which_parts=which_parts)
-            if samples:
-                Ysim = self.posterior_samples_f(Xnew, samples, which_parts=which_parts, full_cov=True)
-                for yi in Ysim.T:
-                    ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
-            gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax)
-
-            ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
-            ax.set_xlim(xmin, xmax)
-            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            ax.set_ylim(ymin, ymax)
-
-        elif self.X.shape[1] == 2:
-
-            resolution = resolution or 50
-            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
-            m, v = self._raw_predict(Xnew, which_parts=which_parts)
-            m = m.reshape(resolution, resolution).T
-            ax.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
-            ax.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max()) # @UndefinedVariable
-            ax.set_xlim(xmin[0], xmax[0])
-            ax.set_ylim(xmin[1], xmax[1])
-
-            if samples:
-                warnings.warn("Samples only implemented for 1 dimensional inputs.")
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
-
-    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
-        """
-        Plot the GP with noise where the likelihood is Gaussian.
+        kwargs['use_raw_predict'] = True
+        self.plot(*args, **kwargs)
 
+    def plot(self, plot_limits=None, which_data_rows='all',
+            which_data_ycols='all', which_parts='all', fixed_inputs=[],
+            levels=20, samples=0, fignum=None, ax=None, resolution=None,
+            use_raw_predict=False,
+            linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
+        """ 
         Plot the posterior of the GP.
           - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
           - In two dimsensions, a contour-plot shows the mean predicted function
           - In higher dimensions, use fixed_inputs to plot the GP  with some of the inputs fixed.
 
         Can plot only part of the data and part of the posterior functions
-        using which_data and which_functions
+        using which_data_rowsm which_data_ycols and which_parts
 
         :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
         :type plot_limits: np.array
-        :param which_data: which if the training data to plot (default all)
-        :type which_data: 'all' or a slice object to slice self.X, self.Y
+        :param which_data_rows: which of the training data to plot (default all)
+        :type which_data_rows: 'all' or a slice object to slice self.X, self.Y
+        :param which_data_ycols: when the data has several columns (independant outputs), only plot these
+        :type which_data_rows: 'all' or a list of integers
         :param which_parts: which of the kernel functions to plot (additively)
         :type which_parts: 'all', or list of bools
+        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
+        :type fixed_inputs: a list of tuples
         :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
         :type resolution: int
         :param levels: number of levels to plot in a contour plot.
@@ -184,16 +137,18 @@ class GPBase(Model):
         :param ax: axes to plot on.
         :type ax: axes handle
         :type output: integer (first output is 0)
-        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
-        :type fixed_inputs: a list of tuples
         :param linecol: color of line to plot.
         :type linecol:
         :param fillcol: color of fill
         :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
         """
-        if which_data == 'all':
-            which_data = slice(None)
-
+        #deal with optional arguments
+        if which_data_rows == 'all':
+            which_data_rows = slice(None)
+        if which_data_ycols == 'all':
+            which_data_ycols = np.arange(self.output_dim)
+        if len(which_data_ycols)==0:
+            raise ValueError('No data selected for plotting')
         if ax is None:
             fig = pb.figure(num=fignum)
             ax = fig.add_subplot(111)
@@ -215,10 +170,15 @@ class GPBase(Model):
                 Xgrid[:,i] = v
 
             #make a prediction on the frame and plot it
-            m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts)
-            for d in range(m.shape[1]):
+            if use_raw_predict:
+                m, v = self._raw_predict(Xgrid, which_parts=which_parts)
+                lower = m - 2*np.sqrt(v)
+                upper = m + 2*np.sqrt(v)
+            else:
+                m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts)
+            for d in which_data_ycols:
                 gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
-                ax.plot(Xu[which_data,free_dims], self.likelihood.data[which_data, d], 'kx', mew=1.5)
+                ax.plot(Xu[which_data_rows,free_dims], self.likelihood.data[which_data_rows, d], 'kx', mew=1.5)
 
             #optionally plot some samples
             if samples: #NOTE not tested with fixed_inputs
@@ -227,7 +187,6 @@ class GPBase(Model):
                     ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
                     #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
 
-
             #set the limits of the plot to some sensible values
             ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
             ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
@@ -248,12 +207,15 @@ class GPBase(Model):
             x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
 
             #predict on the frame and plot
-            m, _, _, _ = self.predict(Xgrid, which_parts=which_parts)
-            for d in range(m.shape[1]):
+            if use_raw_predict:
+                m, _ = self._raw_predict(Xgrid, which_parts=which_parts)
+            else:
+                m, _, _, _ = self.predict(Xgrid, which_parts=which_parts)
+            for d in which_data_ycols:
                 m_d = m[:,d].reshape(resolution, resolution).T
                 ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
-                Y_d = self.likelihood.Y[:,d]
-                ax.scatter(self.X[:, free_dims[0]], self.X[:, free_dims[1]], 40, Y_d, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
+                Y_d = self.likelihood.Y[which_data_rows,d]
+                ax.scatter(self.X[which_data_rows, free_dims[0]], self.X[which_data_rows, free_dims[1]], 40, Y_d, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
 
             #set the limits of the plot to some sensible values
             ax.set_xlim(xmin[0], xmax[0])

From a889b0b7b5d7289489e79f6548bb1ac492de408c Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Sat, 26 Oct 2013 20:44:58 +0100
Subject: [PATCH 149/252] fixed up plotting in sparse_gp also

---
 GPy/core/sparse_gp.py | 83 +++++++++++++++++++++++++++++++++----------
 1 file changed, 65 insertions(+), 18 deletions(-)

diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 8c8df30c..e02da768 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -323,7 +323,10 @@ class SparseGP(GPBase):
         return mean, var, _025pm, _975pm
 
 
-    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
+    def plot_f(self, samples=0, plot_limits=None, which_data_rows='all',
+            which_data_cols='all', which_parts='all', resolution=None,
+            full_cov=False, fignum=None, ax=None):
+
         """
         Plot the GP's view of the world, where the data is normalized and the
           - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
@@ -332,8 +335,8 @@ class SparseGP(GPBase):
 
         :param samples: the number of a posteriori samples to plot
         :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
-        :param which_data: which if the training data to plot (default all)
-        :type which_data: 'all' or a slice object to slice self.X, self.Y
+        :param which_data_rows: which if the training data to plot (default all)
+        :type which_data_rows: 'all' or a slice object to slice self.X, self.Y
         :param which_parts: which of the kernel functions to plot (additively)
         :type which_parts: 'all', or list of bools
         :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
@@ -353,10 +356,10 @@ class SparseGP(GPBase):
             ax = fig.add_subplot(111)
         if fignum is None and ax is None:
                 fignum = fig.num
-        if which_data is 'all':
-            which_data = slice(None)
+        if which_data_rows is 'all':
+            which_data_rows = slice(None)
 
-        GPBase.plot_f(self, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, full_cov=full_cov, fignum=fignum, ax=ax)
+        GPBase.plot_f(self, samples=samples, plot_limits=plot_limits, which_data_rows=which_data_rows, which_data_ycols=which_data_ycols, which_parts=which_parts, resolution=resolution, full_cov=full_cov, fignum=fignum, ax=ax)
 
         if self.X.shape[1] == 1:
             if self.has_uncertain_inputs:
@@ -371,35 +374,79 @@ class SparseGP(GPBase):
             Zu = self.Z * self._Xscale + self._Xoffset
             ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
 
-
         else:
             raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
 
-    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
+    def plot(self, plot_limits=None, which_data_rows='all',
+            which_data_ycols='all', which_parts='all', fixed_inputs=[],
+            levels=20, samples=0, fignum=None, ax=None, resolution=None):
+        """ 
+        Plot the posterior of the sparse GP.
+          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
+          - In two dimsensions, a contour-plot shows the mean predicted function
+          - In higher dimensions, use fixed_inputs to plot the GP  with some of the inputs fixed.
+
+        Can plot only part of the data and part of the posterior functions
+        using which_data_rowsm which_data_ycols and which_parts
+
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
+        :type plot_limits: np.array
+        :param which_data_rows: which of the training data to plot (default all)
+        :type which_data_rows: 'all' or a slice object to slice self.X, self.Y
+        :param which_data_ycols: when the data has several columns (independant outputs), only plot these
+        :type which_data_rows: 'all' or a list of integers
+        :param which_parts: which of the kernel functions to plot (additively)
+        :type which_parts: 'all', or list of bools
+        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
+        :type fixed_inputs: a list of tuples
+        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+        :type resolution: int
+        :param levels: number of levels to plot in a contour plot.
+        :type levels: int
+        :param samples: the number of a posteriori samples to plot
+        :type samples: int
+        :param fignum: figure to plot on.
+        :type fignum: figure number
+        :param ax: axes to plot on.
+        :type ax: axes handle
+        :type output: integer (first output is 0)
+        :param linecol: color of line to plot.
+        :type linecol:
+        :param fillcol: color of fill
+        :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
+        """
+        #deal work out which ax to plot on
         if ax is None:
             fig = pb.figure(num=fignum)
             ax = fig.add_subplot(111)
-        if fignum is None and ax is None:
-                fignum = fig.num
-        if which_data is 'all':
-            which_data = slice(None)
 
-        GPBase.plot(self, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, levels=20, fignum=fignum, ax=ax)
+        #work out what the inputs are for plotting (1D or 2D)
+        fixed_dims = np.array([i for i,v in fixed_inputs])
+        free_dims = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
 
-        if self.X.shape[1] == 1:
+        #call the base plotting
+        GPBase.plot(self, samples=samples, plot_limits=plot_limits,
+                which_data_rows=which_data_rows,
+                which_data_ycols=which_data_ycols, fixed_inputs=fixed_inputs,
+                which_parts=which_parts, resolution=resolution, levels=20,
+                fignum=fignum, ax=ax)
+
+        if len(free_dims) == 1:
+            #plot errorbars for the uncertain inputs
             if self.has_uncertain_inputs:
                 Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
-                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
+                ax.errorbar(Xu[which_data_rows, 0], self.likelihood.data[which_data_rows, 0],
+                            xerr=2 * np.sqrt(self.X_variance[which_data_rows, 0]),
                             ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
+
+            #plot the inducing inputs
             Zu = self.Z * self._Xscale + self._Xoffset
             ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
 
-        elif self.X.shape[1] == 2:
+        elif len(free_dims) == 2:
             Zu = self.Z * self._Xscale + self._Xoffset
             ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
 
-
         else:
             raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
 

From 5a924ff5cb6ed13a310a7184100c0951ea69f323 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 28 Oct 2013 15:18:43 +0000
Subject: [PATCH 150/252] Rederived gamma distribution

---
 GPy/likelihoods/noise_models/gamma_noise.py | 128 +++++++++++++++++---
 GPy/testing/likelihoods_tests.py            |  12 +-
 2 files changed, 119 insertions(+), 21 deletions(-)

diff --git a/GPy/likelihoods/noise_models/gamma_noise.py b/GPy/likelihoods/noise_models/gamma_noise.py
index 5229cb4f..2e4e7d15 100644
--- a/GPy/likelihoods/noise_models/gamma_noise.py
+++ b/GPy/likelihoods/noise_models/gamma_noise.py
@@ -12,11 +12,11 @@ from noise_distributions import NoiseDistribution
 class Gamma(NoiseDistribution):
     """
     Gamma likelihood
-    Y is expected to take values in {0,1,2,...}
-    -----
-    $$
-    L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
-    $$
+
+    .. math::
+        p(y_{i}|\\lambda(f_{i})) = \\frac{\\beta^{\\alpha_{i}}}{\\Gamma(\\alpha_{i})}y_{i}^{\\alpha_{i}-1}e^{-\\beta y_{i}}\\\\
+        \\alpha_{i} = \\beta y_{i}
+
     """
     def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,beta=1.):
         self.beta = beta
@@ -25,26 +25,120 @@ class Gamma(NoiseDistribution):
     def _preprocess_values(self,Y):
         return Y
 
-    def _mass(self,gp,obs):
+    def pdf_link(self, link_f, y, extra_data=None):
         """
-        Mass (or density) function
+        Likelihood function given link(f)
+
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\frac{\\beta^{\\alpha_{i}}}{\\Gamma(\\alpha_{i})}y_{i}^{\\alpha_{i}-1}e^{-\\beta y_{i}}\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
         """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         #return stats.gamma.pdf(obs,a = self.gp_link.transf(gp)/self.variance,scale=self.variance)
-        alpha = self.gp_link.transf(gp)*self.beta
-        return obs**(alpha - 1.) * np.exp(-self.beta*obs) * self.beta**alpha / special.gamma(alpha)
+        alpha = link_f*self.beta
+        return (y**(alpha - 1.) * np.exp(-self.beta*y) * self.beta**alpha)/ special.gamma(alpha)
 
-    def _nlog_mass(self,gp,obs):
+    def logpdf_link(self, link_f, y, extra_data=None):
         """
-        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
+        Log Likelihood Function given link(f)
+
+        .. math::
+            \\ln p(y_{i}|\lambda(f_{i})) = \\alpha_{i}\\log \\beta - \\log \\Gamma(\\alpha_{i}) + (\\alpha_{i} - 1)\\log y_{i} - \\beta y_{i}\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables (link(f))
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in poisson distribution
+        :returns: likelihood evaluated for this point
+        :rtype: float
+
         """
-        alpha = self.gp_link.transf(gp)*self.beta
-        return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        #alpha = self.gp_link.transf(gp)*self.beta
+        #return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
+        alpha = link_f*self.beta
+        return alpha*np.log(self.beta) - np.log(special.gamma(alpha)) + (alpha - 1)*np.log(y) - self.beta*y
 
-    def _dnlog_mass_dgp(self,gp,obs):
-        return -self.gp_link.dtransf_df(gp)*self.beta*np.log(obs) + special.psi(self.gp_link.transf(gp)*self.beta) * self.gp_link.dtransf_df(gp)*self.beta
+    def dlogpdf_dlink(self, link_f, y, extra_data=None):
+        """
+        Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
 
-    def _d2nlog_mass_dgp2(self,gp,obs):
-        return -self.gp_link.d2transf_df2(gp)*self.beta*np.log(obs) + special.polygamma(1,self.gp_link.transf(gp)*self.beta)*(self.gp_link.dtransf_df(gp)*self.beta)**2 + special.psi(self.gp_link.transf(gp)*self.beta)*self.gp_link.d2transf_df2(gp)*self.beta
+        .. math::
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\beta (\\log \\beta y_{i}) - \\Psi(\\alpha_{i})\\beta\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables (f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in gamma distribution
+        :returns: gradient of likelihood evaluated at points
+        :rtype: Nx1 array
+
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        grad = self.beta*np.log(self.beta*y) - special.psi(self.beta*link_f)*self.beta
+        #old
+        #return -self.gp_link.dtransf_df(gp)*self.beta*np.log(obs) + special.psi(self.gp_link.transf(gp)*self.beta) * self.gp_link.dtransf_df(gp)*self.beta
+        return grad
+
+    def d2logpdf_dlink2(self, link_f, y, extra_data=None):
+        """
+        Hessian at y, given link(f), w.r.t link(f)
+        i.e. second derivative logpdf at y given link(f_i) and link(f_j)  w.r.t link(f_i) and link(f_j)
+        The hessian will be 0 unless i == j
+
+        .. math::
+            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\beta^{2}\\frac{d\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in gamma distribution
+        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        hess = -special.polygamma(1, self.beta*link_f)*(self.beta**2)
+        #old
+        #return -self.gp_link.d2transf_df2(gp)*self.beta*np.log(obs) + special.polygamma(1,self.gp_link.transf(gp)*self.beta)*(self.gp_link.dtransf_df(gp)*self.beta)**2 + special.psi(self.gp_link.transf(gp)*self.beta)*self.gp_link.d2transf_df2(gp)*self.beta
+        return hess
+
+    def d3logpdf_dlink3(self, link_f, y, extra_data=None):
+        """
+        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
+
+        .. math::
+            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = -\\beta^{3}\\frac{d^{2}\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
+            \\alpha_{i} = \\beta y_{i}
+
+        :param link_f: latent variables link(f)
+        :type link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param extra_data: extra_data which is not used in gamma distribution
+        :returns: third derivative of likelihood evaluated at points f
+        :rtype: Nx1 array
+        """
+        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
+        d3lik_dlink3 = -special.polygamma(2, self.beta*link_f)*(self.beta**3)
+        return d3lik_dlink3
 
     def _mean(self,gp):
         """
diff --git a/GPy/testing/likelihoods_tests.py b/GPy/testing/likelihoods_tests.py
index 155842fd..8d1466fb 100644
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@@ -84,10 +84,8 @@ class TestNoiseModels(object):
         self.f = np.random.rand(self.N, 1)
         self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
         self.positive_Y = np.exp(self.Y.copy())
-        self.integer_Y = np.round(self.X[:, 0]*3-3)[:, None] + np.random.randint(0,3, self.X.shape[0])[:, None]
-        self.integer_Y = np.where(self.integer_Y > 0, self.integer_Y, 0)
-        print self.integer_Y
-        print self.Y
+        tmp = np.round(self.X[:, 0]*3-3)[:, None] + np.random.randint(0,3, self.X.shape[0])[:, None]
+        self.integer_Y = np.where(tmp > 0, tmp, 0)
 
         self.var = 0.2
 
@@ -234,6 +232,12 @@ class TestNoiseModels(object):
                             "Y": self.integer_Y,
                             "laplace": True,
                             "ep": False #Should work though...
+                        },
+                        "Gamma_default": {
+                            "model": GPy.likelihoods.gamma(),
+                            "link_f_constraints": [constrain_positive],
+                            "Y": self.positive_Y,
+                            "laplace": True
                         }
                     }
 

From 336f8e11c48bb4e749b9f389907c450e44f02786 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 28 Oct 2013 15:22:06 +0000
Subject: [PATCH 151/252] Added sampling for predictive quantiles and also mean
 and variance where necessary

---
 GPy/examples/classification.py                |  1 +
 GPy/examples/regression.py                    | 20 +++---
 GPy/likelihoods/laplace.py                    |  2 +-
 .../noise_models/noise_distributions.py       | 69 +++++++++++--------
 4 files changed, 53 insertions(+), 39 deletions(-)

diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index d4f55d4a..05b6af74 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -61,6 +61,7 @@ def toy_linear_1d_classification(seed=default_seed):
     #m.update_likelihood_approximation()
     # Parameters optimization:
     #m.optimize()
+    #m.update_likelihood_approximation()
     m.pseudo_EM()
 
     # Plot
diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index 2978ebdc..a37e32c3 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -272,11 +272,10 @@ def toy_rbf_1d_50(max_iters=100):
 
 def toy_poisson_rbf_1d(optimizer='bfgs', max_nb_eval_optim=100):
     """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
-    X = np.linspace(0,10)[:, None]
-    F = np.round(X*3-4)
-    F = np.where(F > 0, F, 0)
-    eps = np.random.randint(0,4, F.shape[0])[:, None]
-    Y = F + eps
+    x_len = 400
+    X = np.linspace(0, 10, x_len)[:, None]
+    f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
+    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
 
     noise_model = GPy.likelihoods.poisson()
     likelihood = GPy.likelihoods.EP(Y,noise_model)
@@ -293,11 +292,10 @@ def toy_poisson_rbf_1d(optimizer='bfgs', max_nb_eval_optim=100):
 
 def toy_poisson_rbf_1d_laplace(optimizer='bfgs', max_nb_eval_optim=100):
     """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
-    X = np.linspace(0,10)[:, None]
-    F = np.round(X*3-4)
-    F = np.where(F > 0, F, 0)
-    eps = np.random.randint(0,4, F.shape[0])[:, None]
-    Y = F + eps
+    x_len = 30
+    X = np.linspace(0, 10, x_len)[:, None]
+    f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
+    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
 
     noise_model = GPy.likelihoods.poisson()
     likelihood = GPy.likelihoods.Laplace(Y,noise_model)
@@ -309,6 +307,8 @@ def toy_poisson_rbf_1d_laplace(optimizer='bfgs', max_nb_eval_optim=100):
     m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
     # plot
     m.plot()
+    # plot the real underlying rate function
+    pb.plot(X, np.exp(f_true), '--k', linewidth=2)
     print(m)
     return m
 
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 047d7f74..8a11b146 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 #
 #Parts of this file were influenced by the Matlab GPML framework written by
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 165f8d2e..77671f84 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -150,6 +150,8 @@ class NoiseDistribution(object):
         :param sigma: standard deviation of posterior
 
         """
+        #FIXME: Quadrature does not work!
+        raise NotImplementedError
         sigma2 = sigma**2
         #Compute first moment
         def int_mean(f):
@@ -193,19 +195,6 @@ class NoiseDistribution(object):
         # V(Y_star | f_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
         return exp_var + var_exp
 
-    def _predictive_percentiles(self,p,mu,sigma):
-        """
-        Percentiles of the predictive distribution
-
-        :parm p: lower tail probability
-        :param mu: cavity distribution mean
-        :param sigma: cavity distribution standard deviation
-        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
-
-        """
-        qf = stats.norm.ppf(p,mu,sigma)
-        return self.gp_link.transf(qf)
-
     def pdf_link(self, link_f, y, extra_data=None):
         raise NotImplementedError
 
@@ -386,26 +375,50 @@ class NoiseDistribution(object):
         assert d2logpdf_df2_dtheta.shape[1] == len(self._get_param_names())
         return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
 
-    def predictive_values(self,mu,var):
+    def predictive_values(self, mu, var, full_cov=False, num_samples=5000,
+                          sampling=False):
         """
         Compute  mean, variance and conficence interval (percentiles 5 and 95) of the  prediction.
 
-        :param mu: mean of the latent variable, f
-        :param var: variance of the latent variable, f
+        :param mu: mean of the latent variable, f, of posterior
+        :param var: variance of the latent variable, f, of posterior
+        :param full_cov: whether to use the full covariance or just the diagonal
+        :type full_cov: Boolean
+        :param num_samples: number of samples to use in computing quantiles and
+                            possibly mean variance
+        :type num_samples: integer
+        :param sampling: Whether to use samples for mean and variances anyway
+        :type sampling: Boolean
 
         """
-        if isinstance(mu,float) or isinstance(mu,int):
-            mu = [mu]
-            var = [var]
-        pred_mean = []
-        pred_var = []
-        q1 = []
-        q3 = []
-        for m,s in zip(mu,np.sqrt(var)):
-            pred_mean.append(self.predictive_mean(m,s))
-            pred_var.append(self.predictive_variance(m,s,pred_mean[-1]))
-            q1.append(self._predictive_percentiles(.025,m,s))
-            q3.append(self._predictive_percentiles(.975,m,s))
+
+        #Get gp_samples f* using posterior mean and variance
+        if not full_cov:
+            gp_samples = np.random.multivariate_normal(mu.flatten(), np.diag(var.flatten()),
+                                                        size=num_samples).T
+        else:
+            gp_samples = np.random.multivariate_normal(mu.flatten(), var,
+                                                           size=num_samples).T
+
+        #Push gp samples (f*) through likelihood to give p(y*|f*)
+        samples = self.samples(gp_samples)
+        axis=-1
+
+        if self.analytical_mean and not sampling:
+            pred_mean = self.predictive_mean(mu, np.sqrt(var))
+        else:
+            pred_mean = np.mean(samples, axis=axis)
+
+        if self.analytical_variance and not sampling:
+            pred_var = self.predictive_variance(mu, np.sqrt(var), pred_mean)
+        else:
+            pred_var = np.var(samples, axis=axis)
+
+        #Calculate quantiles from samples
+        q1 = np.percentile(samples, 2.5, axis=axis)
+        q3 = np.percentile(samples, 97.5, axis=axis)
+        print "WARNING: Using sampling to calculate predictive quantiles"
+
         pred_mean = np.vstack(pred_mean)
         pred_var = np.vstack(pred_var)
         q1 = np.vstack(q1)

From fc59ef4baf8044eb9496ef9b6d5919f8cadd9d57 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 28 Oct 2013 15:42:25 +0000
Subject: [PATCH 152/252] Tidying up and fixed objective being vector

---
 GPy/likelihoods/laplace.py                        | 8 ++++----
 GPy/likelihoods/noise_models/exponential_noise.py | 7 ++++---
 GPy/likelihoods/noise_models/gamma_noise.py       | 6 ++++--
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 8a11b146..7e570e52 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -340,8 +340,8 @@ class Laplace(likelihood):
                 Ki_f = old_Ki_f + step_size*dKi_f
                 f = np.dot(K, Ki_f)
                 # This is nasty, need to set something within an optimization though
-                self.Ki_f = Ki_f.copy()
-                self.f = f.copy()
+                self.tmp_Ki_f = Ki_f.copy()
+                self.tmp_f = f.copy()
                 return -obj(Ki_f, f)
 
             i_o = partial_func(inner_obj, old_Ki_f=old_Ki_f, dKi_f=dKi_f, K=K)
@@ -349,8 +349,8 @@ class Laplace(likelihood):
             #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
             #steps than get this exact then make a step, if B was bigger it might be the other way around though
             new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
-            f = self.f.copy()
-            Ki_f = self.Ki_f.copy()
+            f = self.tmp_f.copy()
+            Ki_f = self.tmp_Ki_f.copy()
 
             #Optimize without linesearch
             #f_old = f.copy()
diff --git a/GPy/likelihoods/noise_models/exponential_noise.py b/GPy/likelihoods/noise_models/exponential_noise.py
index 8e916353..e637cc02 100644
--- a/GPy/likelihoods/noise_models/exponential_noise.py
+++ b/GPy/likelihoods/noise_models/exponential_noise.py
@@ -40,7 +40,8 @@ class Exponential(NoiseDistribution):
         :rtype: float
         """
         assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        return np.exp(np.sum(np.log(link_f*np.exp(-y*link_f))))
+        log_objective = link_f*np.exp(-y*link_f)
+        return np.exp(np.sum(np.log(log_objective)))
         #return np.exp(np.sum(-y/link_f - np.log(link_f) ))
 
     def logpdf_link(self, link_f, y, extra_data=None):
@@ -60,9 +61,9 @@ class Exponential(NoiseDistribution):
 
         """
         assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        logpdf_link = np.sum(np.log(link_f) - y*link_f)
+        log_objective = np.log(link_f) - y*link_f
         #logpdf_link = np.sum(-np.log(link_f) - y/link_f)
-        return logpdf_link
+        return np.sum(log_objective)
 
     def dlogpdf_dlink(self, link_f, y, extra_data=None):
         """
diff --git a/GPy/likelihoods/noise_models/gamma_noise.py b/GPy/likelihoods/noise_models/gamma_noise.py
index 2e4e7d15..2be3106a 100644
--- a/GPy/likelihoods/noise_models/gamma_noise.py
+++ b/GPy/likelihoods/noise_models/gamma_noise.py
@@ -44,7 +44,8 @@ class Gamma(NoiseDistribution):
         assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
         #return stats.gamma.pdf(obs,a = self.gp_link.transf(gp)/self.variance,scale=self.variance)
         alpha = link_f*self.beta
-        return (y**(alpha - 1.) * np.exp(-self.beta*y) * self.beta**alpha)/ special.gamma(alpha)
+        objective = (y**(alpha - 1.) * np.exp(-self.beta*y) * self.beta**alpha)/ special.gamma(alpha)
+        return np.exp(np.sum(np.log(objective)))
 
     def logpdf_link(self, link_f, y, extra_data=None):
         """
@@ -67,7 +68,8 @@ class Gamma(NoiseDistribution):
         #alpha = self.gp_link.transf(gp)*self.beta
         #return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
         alpha = link_f*self.beta
-        return alpha*np.log(self.beta) - np.log(special.gamma(alpha)) + (alpha - 1)*np.log(y) - self.beta*y
+        log_objective = alpha*np.log(self.beta) - np.log(special.gamma(alpha)) + (alpha - 1)*np.log(y) - self.beta*y
+        return np.sum(log_objective)
 
     def dlogpdf_dlink(self, link_f, y, extra_data=None):
         """

From df9a546c73fbb2157e8c7ebf294dff5175909c2c Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 28 Oct 2013 16:17:17 +0000
Subject: [PATCH 153/252] Added sampling to student_t noise distribution, very
 slow and is possible to speed up. predictive mean analytical and variance
 need checking

---
 .../noise_models/student_t_noise.py           | 77 +++----------------
 1 file changed, 10 insertions(+), 67 deletions(-)

diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index f268c644..1d11e707 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -241,92 +241,35 @@ class StudentT(NoiseDistribution):
         *((1/(s*sqrt(2*pi)))*exp(-(1/(2*(s^2)))*((y-f)^2)))
         """
 
+        #FIXME: Not correct
         #We want the variance around test points y which comes from int p(y*|f*)p(f*) df*
         #Var(y*) = Var(E[y*|f*]) + E[Var(y*|f*)]
         #Since we are given f* (mu) which is our mean (expected) value of y*|f* then the variance is the variance around this
         #Which was also given to us as (var)
         #We also need to know the expected variance of y* around samples f*, this is the variance of the student t distribution
         #However the variance of the student t distribution is not dependent on f, only on sigma and the degrees of freedom
-        true_var = sigma**2 + self.variance
+        true_var = 1/(1/sigma**2 + 1/self.variance)
 
         return true_var
 
-    def _predictive_mean_analytical(self, mu, var):
+    def _predictive_mean_analytical(self, mu, sigma):
         """
         Compute mean of the prediction
         """
+        #FIXME: Not correct
         return mu
 
-    def sample_predicted_values(self, mu, var):
-        """ Experimental sample approches and numerical integration """
-        raise NotImplementedError
-        #p_025 = stats.t.ppf(.025, mu)
-        #p_975 = stats.t.ppf(.975, mu)
-
-        num_test_points = mu.shape[0]
-        #Each mu is the latent point f* at the test point x*,
-        #and the var is the gaussian variance at this point
-        #Take lots of samples from this, so we have lots of possible values
-        #for latent point f* for each test point x* weighted by how likely we were to pick it
-        print "Taking %d samples of f*".format(num_test_points)
-        num_f_samples = 10
-        num_y_samples = 10
-        student_t_means = np.random.normal(loc=mu, scale=np.sqrt(var), size=(num_test_points, num_f_samples))
-        print "Student t means shape: ", student_t_means.shape
-
-        #Now we have lots of f*, lets work out the likelihood of getting this by sampling
-        #from a student t centred on this point, sample many points from this distribution
-        #centred on f*
-        #for test_point, f in enumerate(student_t_means):
-            #print test_point
-            #print f.shape
-            #student_t_samples = stats.t.rvs(self.v, loc=f[:,None],
-                                            #scale=self.sigma,
-                                            #size=(num_f_samples, num_y_samples))
-            #print student_t_samples.shape
-
-        student_t_samples = stats.t.rvs(self.v, loc=student_t_means[:, None],
-                                        scale=self.sigma,
-                                        size=(num_test_points, num_y_samples, num_f_samples))
-        student_t_samples = np.reshape(student_t_samples,
-                                       (num_test_points, num_y_samples*num_f_samples))
-
-        #Now take the 97.5 and 0.25 percentile of these points
-        p_025 = stats.scoreatpercentile(student_t_samples, .025, axis=1)[:, None]
-        p_975 = stats.scoreatpercentile(student_t_samples, .975, axis=1)[:, None]
-
-        ##Alernenately we could sample from int p(y|f*)p(f*|x*) df*
-        def t_gaussian(f, mu, var):
-            return (((gamma((self.v+1)*0.5)) / (gamma(self.v*0.5)*self.sigma*np.sqrt(self.v*np.pi))) * ((1+(1/self.v)*(((mu-f)/self.sigma)**2))**(-(self.v+1)*0.5))
-                    * ((1/(np.sqrt(2*np.pi*var)))*np.exp(-(1/(2*var)) *((mu-f)**2)))
-                    )
-
-        def t_gauss_int(mu, var):
-            print "Mu: ", mu
-            print "var: ", var
-            result = integrate.quad(t_gaussian, 0.025, 0.975, args=(mu, var))
-            print "Result: ", result
-            return result[0]
-
-        vec_t_gauss_int = np.vectorize(t_gauss_int)
-
-        p = vec_t_gauss_int(mu, var)
-        p_025 = mu - p
-        p_975 = mu + p
-        return mu, np.nan*mu, p_025, p_975
-
     def samples(self, gp):
         """
         Returns a set of samples of observations based on a given value of the latent variable.
 
-        :param size: number of samples to compute
         :param gp: latent variable
         """
         orig_shape = gp.shape
         gp = gp.flatten()
-        f = self.gp_link.transf(gp)
-        #student_t_samples = stats.t.rvs(self.v, loc=f,
-                                        #scale=np.sqrt(self.sigma2),
-                                        #size=(num_test_points, num_y_samples, num_f_samples))
-        #Ysim = np.array([np.random.binomial(1,self.gp_link.transf(gpj),size=1) for gpj in gp])
-        return Ysim.reshape(orig_shape)
+        #FIXME: Very slow as we are computing a new random variable per input!
+        #Can't get it to sample all at the same time
+        student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
+        #student_t_samples = stats.t.rvs(self.v, loc=self.gp_link.transf(gp),
+                                        #scale=np.sqrt(self.sigma2))
+        return student_t_samples.reshape(orig_shape)

From 494d28d09a9279083bc1612a56b252b673e7b16f Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 28 Oct 2013 16:20:55 +0000
Subject: [PATCH 154/252] Ignoring examples tests again

---
 GPy/testing/examples_tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/GPy/testing/examples_tests.py b/GPy/testing/examples_tests.py
index 15dbe234..a525b1c9 100644
--- a/GPy/testing/examples_tests.py
+++ b/GPy/testing/examples_tests.py
@@ -39,6 +39,7 @@ def model_instance(model):
     #assert isinstance(model, GPy.core.model)
     return isinstance(model, GPy.core.model.Model)
 
+@nottest
 def test_models():
     examples_path = os.path.dirname(GPy.examples.__file__)
     # Load modules

From 11ee480cbf300ae597896ff60a60deef1ba8ed75 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 28 Oct 2013 16:47:17 +0000
Subject: [PATCH 155/252] Sped up sampling a lot for student t, bernoulli and
 poisson, added sampling for gaussian and exponential (untested)

---
 GPy/examples/laplace_approximations.py        | 19 -------------------
 .../noise_models/bernoulli_noise.py           |  4 ++--
 .../noise_models/exponential_noise.py         | 11 +++++++++++
 .../noise_models/gaussian_noise.py            | 11 +++++++++++
 .../noise_models/noise_distributions.py       |  2 +-
 GPy/likelihoods/noise_models/poisson_noise.py |  3 +--
 .../noise_models/student_t_noise.py           |  8 +++++---
 7 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 96b423f0..64185885 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -123,25 +123,6 @@ def student_t_approx():
 
     return m
 
-    #with a student t distribution, since it has heavy tails it should work well
-    #likelihood_function = student_t(deg_free=deg_free, sigma2=real_var)
-    #lap = Laplace(Y, likelihood_function)
-    #cov = kernel.K(X)
-    #lap.fit_full(cov)
-
-    #test_range = np.arange(0, 10, 0.1)
-    #plt.plot(test_range, t_rv.pdf(test_range))
-    #for i in xrange(X.shape[0]):
-        #mode = lap.f_hat[i]
-        #covariance = lap.hess_hat_i[i,i]
-        #scaling = np.exp(lap.ln_z_hat)
-        #normalised_approx = norm(loc=mode, scale=covariance)
-        #print "Normal with mode %f, and variance %f" % (mode, covariance)
-        #plt.plot(test_range, scaling*normalised_approx.pdf(test_range))
-    #plt.show()
-
-    return m
-
 def boston_example():
     import sklearn
     from sklearn.cross_validation import KFold
diff --git a/GPy/likelihoods/noise_models/bernoulli_noise.py b/GPy/likelihoods/noise_models/bernoulli_noise.py
index 77242333..2c4116da 100644
--- a/GPy/likelihoods/noise_models/bernoulli_noise.py
+++ b/GPy/likelihoods/noise_models/bernoulli_noise.py
@@ -207,10 +207,10 @@ class Bernoulli(NoiseDistribution):
         """
         Returns a set of samples of observations based on a given value of the latent variable.
 
-        :param size: number of samples to compute
         :param gp: latent variable
         """
         orig_shape = gp.shape
         gp = gp.flatten()
-        Ysim = np.array([np.random.binomial(1,self.gp_link.transf(gpj),size=1) for gpj in gp])
+        ns = np.ones_like(gp, dtype=int)
+        Ysim = np.random.binomial(ns, self.gp_link.transf(gp))
         return Ysim.reshape(orig_shape)
diff --git a/GPy/likelihoods/noise_models/exponential_noise.py b/GPy/likelihoods/noise_models/exponential_noise.py
index e637cc02..602ccea5 100644
--- a/GPy/likelihoods/noise_models/exponential_noise.py
+++ b/GPy/likelihoods/noise_models/exponential_noise.py
@@ -143,3 +143,14 @@ class Exponential(NoiseDistribution):
         Mass (or density) function
         """
         return self.gp_link.transf(gp)**2
+
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        Ysim = np.random.exponential(1.0/self.gp_link.transf(gp))
+        return Ysim.reshape(orig_shape)
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index 0ce8ffd9..fce84d27 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -285,3 +285,14 @@ class Gaussian(NoiseDistribution):
             Var_{p(y|f)}[y]
         """
         return self.variance
+
+    def samples(self, gp):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param gp: latent variable
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        Ysim = np.array([np.random.normal(self.gp_link.transf(gpj), scale=np.sqrt(self.variance), size=1) for gpj in gp])
+        return Ysim.reshape(orig_shape)
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 77671f84..77cc82a4 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -375,7 +375,7 @@ class NoiseDistribution(object):
         assert d2logpdf_df2_dtheta.shape[1] == len(self._get_param_names())
         return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
 
-    def predictive_values(self, mu, var, full_cov=False, num_samples=5000,
+    def predictive_values(self, mu, var, full_cov=False, num_samples=30000,
                           sampling=False):
         """
         Compute  mean, variance and conficence interval (percentiles 5 and 95) of the  prediction.
diff --git a/GPy/likelihoods/noise_models/poisson_noise.py b/GPy/likelihoods/noise_models/poisson_noise.py
index fba00417..b0300704 100644
--- a/GPy/likelihoods/noise_models/poisson_noise.py
+++ b/GPy/likelihoods/noise_models/poisson_noise.py
@@ -144,10 +144,9 @@ class Poisson(NoiseDistribution):
         """
         Returns a set of samples of observations based on a given value of the latent variable.
 
-        :param size: number of samples to compute
         :param gp: latent variable
         """
         orig_shape = gp.shape
         gp = gp.flatten()
-        Ysim = np.array([np.random.poisson(self.gp_link.transf(gpj),size=1) for gpj in gp])
+        Ysim = np.random.poisson(self.gp_link.transf(gp))
         return Ysim.reshape(orig_shape)
diff --git a/GPy/likelihoods/noise_models/student_t_noise.py b/GPy/likelihoods/noise_models/student_t_noise.py
index 1d11e707..daad7186 100644
--- a/GPy/likelihoods/noise_models/student_t_noise.py
+++ b/GPy/likelihoods/noise_models/student_t_noise.py
@@ -269,7 +269,9 @@ class StudentT(NoiseDistribution):
         gp = gp.flatten()
         #FIXME: Very slow as we are computing a new random variable per input!
         #Can't get it to sample all at the same time
-        student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
-        #student_t_samples = stats.t.rvs(self.v, loc=self.gp_link.transf(gp),
-                                        #scale=np.sqrt(self.sigma2))
+        #student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
+        dfs = np.ones_like(gp)*self.v
+        scales = np.ones_like(gp)*np.sqrt(self.sigma2)
+        student_t_samples = stats.t.rvs(dfs, loc=self.gp_link.transf(gp),
+                                        scale=scales)
         return student_t_samples.reshape(orig_shape)

From e7b79b1fb099283b1ce5c293227e81275791b0ec Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 28 Oct 2013 19:15:14 +0000
Subject: [PATCH 156/252] Removed ipython dependency from kern

---
 GPy/kern/parts/hetero.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/GPy/kern/parts/hetero.py b/GPy/kern/parts/hetero.py
index d3939563..c716eaad 100644
--- a/GPy/kern/parts/hetero.py
+++ b/GPy/kern/parts/hetero.py
@@ -1,7 +1,6 @@
 # Copyright (c) 2013, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-from IPython.core.debugger import Tracer; debug_here=Tracer()
 from kernpart import Kernpart
 import numpy as np
 from ...util.linalg import tdot

From f80b616d10642a9f0cc7cfcac4f85dccabeca41e Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 28 Oct 2013 19:21:38 +0000
Subject: [PATCH 157/252] Added dpotrs instead of cho_solve

---
 GPy/likelihoods/laplace.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 7e570e52..15f2b48e 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -12,10 +12,8 @@
 
 import numpy as np
 import scipy as sp
-from scipy.linalg import cho_solve
 from likelihood import likelihood
-from ..util.linalg import mdot, jitchol, pddet
-from scipy.linalg.lapack import dtrtrs
+from ..util.linalg import mdot, jitchol, pddet, dpotrs
 from functools import partial as partial_func
 
 class Laplace(likelihood):
@@ -282,7 +280,7 @@ class Laplace(likelihood):
         B = np.eye(self.N) + W_12*K*W_12.T
         L = jitchol(B)
 
-        W12BiW12= W_12*cho_solve((L, True), W_12*a)
+        W12BiW12, _ = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)
         ln_B_det = 2*np.sum(np.log(np.diag(L)))
         return W12BiW12, ln_B_det
 

From bd062329a84bc53154cc9ee493ed6f3ea2e032d8 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Mon, 28 Oct 2013 19:28:30 +0000
Subject: [PATCH 158/252] Fixed the dpotrs use..

---
 GPy/likelihoods/laplace.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 15f2b48e..6a44d5b6 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -280,7 +280,7 @@ class Laplace(likelihood):
         B = np.eye(self.N) + W_12*K*W_12.T
         L = jitchol(B)
 
-        W12BiW12, _ = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)
+        W12BiW12 = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
         ln_B_det = 2*np.sum(np.log(np.diag(L)))
         return W12BiW12, ln_B_det
 

From e5487bff19eb3ed902899d5321d0aeef7c1dec56 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Mon, 28 Oct 2013 21:41:10 +0000
Subject: [PATCH 159/252] fixed plotting isue with plot_f

---
 GPy/core/gp_base.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index 5b6b8f61..f07c4b96 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -99,13 +99,13 @@ class GPBase(Model):
 
         see also: gp_base.plot
         """
-        kwargs['use_raw_predict'] = True
+        kwargs['plot_raw'] = True
         self.plot(*args, **kwargs)
 
     def plot(self, plot_limits=None, which_data_rows='all',
             which_data_ycols='all', which_parts='all', fixed_inputs=[],
             levels=20, samples=0, fignum=None, ax=None, resolution=None,
-            use_raw_predict=False,
+            plot_raw=False,
             linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
         """ 
         Plot the posterior of the GP.
@@ -170,15 +170,17 @@ class GPBase(Model):
                 Xgrid[:,i] = v
 
             #make a prediction on the frame and plot it
-            if use_raw_predict:
+            if plot_raw:
                 m, v = self._raw_predict(Xgrid, which_parts=which_parts)
                 lower = m - 2*np.sqrt(v)
                 upper = m + 2*np.sqrt(v)
+                Y = self.likelihood.Y
             else:
                 m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts)
+                Y = self.likelihood.data
             for d in which_data_ycols:
                 gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
-                ax.plot(Xu[which_data_rows,free_dims], self.likelihood.data[which_data_rows, d], 'kx', mew=1.5)
+                ax.plot(Xu[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5)
 
             #optionally plot some samples
             if samples: #NOTE not tested with fixed_inputs
@@ -209,13 +211,14 @@ class GPBase(Model):
             #predict on the frame and plot
             if use_raw_predict:
                 m, _ = self._raw_predict(Xgrid, which_parts=which_parts)
+                Y = self.likelihood.Y
             else:
                 m, _, _, _ = self.predict(Xgrid, which_parts=which_parts)
+                Y = self.likelihood.data
             for d in which_data_ycols:
                 m_d = m[:,d].reshape(resolution, resolution).T
                 ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
-                Y_d = self.likelihood.Y[which_data_rows,d]
-                ax.scatter(self.X[which_data_rows, free_dims[0]], self.X[which_data_rows, free_dims[1]], 40, Y_d, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
+                ax.scatter(self.X[which_data_rows, free_dims[0]], self.X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
 
             #set the limits of the plot to some sensible values
             ax.set_xlim(xmin[0], xmax[0])

From ecfffc97e66fb85f4fe698037a43150fb906c25a Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Mon, 28 Oct 2013 22:11:08 +0000
Subject: [PATCH 160/252] even more data plotting

---
 GPy/core/gp_base.py   | 2 +-
 GPy/core/sparse_gp.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index f07c4b96..10d30358 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -190,7 +190,7 @@ class GPBase(Model):
                     #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
 
             #set the limits of the plot to some sensible values
-            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
+            ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
             ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
             ax.set_xlim(xmin, xmax)
             ax.set_ylim(ymin, ymax)
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index e02da768..5e381110 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -324,7 +324,7 @@ class SparseGP(GPBase):
 
 
     def plot_f(self, samples=0, plot_limits=None, which_data_rows='all',
-            which_data_cols='all', which_parts='all', resolution=None,
+            which_data_ycols='all', which_parts='all', resolution=None,
             full_cov=False, fignum=None, ax=None):
 
         """
@@ -359,7 +359,7 @@ class SparseGP(GPBase):
         if which_data_rows is 'all':
             which_data_rows = slice(None)
 
-        GPBase.plot_f(self, samples=samples, plot_limits=plot_limits, which_data_rows=which_data_rows, which_data_ycols=which_data_ycols, which_parts=which_parts, resolution=resolution, full_cov=full_cov, fignum=fignum, ax=ax)
+        GPBase.plot_f(self, samples=samples, plot_limits=plot_limits, which_data_rows=which_data_rows, which_data_ycols=which_data_ycols, which_parts=which_parts, resolution=resolution, fignum=fignum, ax=ax)
 
         if self.X.shape[1] == 1:
             if self.has_uncertain_inputs:
@@ -379,6 +379,7 @@ class SparseGP(GPBase):
 
     def plot(self, plot_limits=None, which_data_rows='all',
             which_data_ycols='all', which_parts='all', fixed_inputs=[],
+            plot_raw=False,
             levels=20, samples=0, fignum=None, ax=None, resolution=None):
         """ 
         Plot the posterior of the sparse GP.

From 490755130a850154ad6b38498462fc4cdff06bf7 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 31 Oct 2013 17:47:07 +0000
Subject: [PATCH 161/252] SPELLAFSDIUN

---
 GPy/likelihoods/__init__.py                        | 1 +
 GPy/likelihoods/noise_models/gp_transformations.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py
index 0cb62eb0..b98af4a3 100644
--- a/GPy/likelihoods/__init__.py
+++ b/GPy/likelihoods/__init__.py
@@ -2,6 +2,7 @@ from ep import EP
 from ep_mixed_noise import EP_Mixed_Noise
 from gaussian import Gaussian
 from gaussian_mixed_noise import Gaussian_Mixed_Noise
+import noise_models
 from noise_model_constructors import *
 # TODO: from Laplace import Laplace
 
diff --git a/GPy/likelihoods/noise_models/gp_transformations.py b/GPy/likelihoods/noise_models/gp_transformations.py
index e95e9df7..dc83c461 100644
--- a/GPy/likelihoods/noise_models/gp_transformations.py
+++ b/GPy/likelihoods/noise_models/gp_transformations.py
@@ -105,7 +105,7 @@ class Log_ex_1(GPTransformation):
         return aux*(1.-aux)
 
 class Reciprocal(GPTransformation):
-    def transf(sefl,f):
+    def transf(self,f):
         return 1./f
 
     def dtransf_df(self,f):

From d2d1d58db39a5d78907b21777a93d19b4d0c9cff Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 6 Nov 2013 15:26:09 +0000
Subject: [PATCH 162/252] BGPLVM test for crossterms

---
 GPy/examples/dimensionality_reduction.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index bde249c8..666209f9 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -12,10 +12,10 @@ from GPy.likelihoods.gaussian import Gaussian
 default_seed = np.random.seed(123344)
 
 def BGPLVM(seed=default_seed):
-    N = 5
-    num_inducing = 4
-    Q = 3
-    D = 2
+    N = 13
+    num_inducing = 5
+    Q = 6
+    D = 25
     # generate GPLVM-like data
     X = np.random.rand(N, Q)
     lengthscales = np.random.rand(Q)
@@ -25,9 +25,12 @@ def BGPLVM(seed=default_seed):
     Y = np.random.multivariate_normal(np.zeros(N), K, D).T
     lik = Gaussian(Y, normalize=True)
 
-    k = GPy.kern.rbf_inv(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
+    # k = GPy.kern.rbf_inv(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
     # k = GPy.kern.linear(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
     # k = GPy.kern.rbf(Q, ARD = False)  + GPy.kern.white(Q, 0.00001)
+    # k = GPy.kern.rbf(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.rbf(Q, .3, np.ones(Q) * .2, ARD=True)
+    k = GPy.kern.rbf(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.linear(Q, np.ones(Q) * .2, ARD=True)
+    # k = GPy.kern.rbf(Q, .5, 2., ARD=0) + GPy.kern.rbf(Q, .3, .2, ARD=0)
 
     m = GPy.models.BayesianGPLVM(lik, Q, kernel=k, num_inducing=num_inducing)
     m.lengthscales = lengthscales

From 3d991fd127ba6eb130021d3b16271a6e3426d234 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 7 Nov 2013 13:32:58 +0000
Subject: [PATCH 163/252] added variational distribution for latent space

---
 GPy/core/variational.py                   |  19 ++
 GPy/kern/kern.py                          | 243 ++++++++++++++--------
 GPy/testing/psi_stat_expectation_tests.py |  34 +--
 3 files changed, 195 insertions(+), 101 deletions(-)
 create mode 100644 GPy/core/variational.py

diff --git a/GPy/core/variational.py b/GPy/core/variational.py
new file mode 100644
index 00000000..74287dcf
--- /dev/null
+++ b/GPy/core/variational.py
@@ -0,0 +1,19 @@
+'''
+Created on 6 Nov 2013
+
+@author: maxz
+'''
+from parameterized import Parameterized
+from parameter import Param
+
+class Normal(Parameterized):
+    '''
+    Normal distribution for variational approximations.
+    
+    holds the means and variances for a factorizing multivariate normal distribution
+    '''
+    def __init__(self, name, means, variances):
+        Parameterized.__init__(self, name=name)
+        self.means = Param("mean", means)
+        self.variances = Param('variance', variances)
+        self.add_parameters(self.means, self.variances)
\ No newline at end of file
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 805c6b43..37839423 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -18,37 +18,37 @@ class kern(Parameterized):
         like which parameters live where.
 
         The technical code for kernels is divided into _parts_ (see
-        e.g. rbf.py). This object contains a list of parts, which are
-        computed additively. For multiplication, special _prod_ parts
+        e.g. rbf.py). This object contains a list of _parameters_, which are
+        computed additively. For multiplication, special _prod_ _parameters_
         are used.
 
         :param input_dim: The dimensionality of the kernel's input space
         :type input_dim: int
-        :param parts: the 'parts' (PD functions) of the kernel
-        :type parts: list of Kernpart objects
+        :param _parameters_: the '_parameters_' (PD functions) of the kernel
+        :type _parameters_: list of Kernpart objects
         :param input_slices: the slices on the inputs which apply to each kernel
         :type input_slices: list of slice objects, or list of bools
 
         """
-        self.parts = parts
+        self._parameters_ = parts
         self.num_parts = len(parts)
-        self.num_params = sum([p.num_params for p in self.parts])
+        self.num_params = sum([p.num_params for p in self._parameters_])
 
         self.input_dim = input_dim
 
-        part_names = [k.name for k in self.parts]
+        part_names = [k.name for k in self._parameters_]
         self.name=''
         for name in part_names:
             self.name += name + '+'
         self.name = self.name[:-1]
         # deal with input_slices
         if input_slices is None:
-            self.input_slices = [slice(None) for p in self.parts]
+            self.input_slices = [slice(None) for p in self._parameters_]
         else:
-            assert len(input_slices) == len(self.parts)
+            assert len(input_slices) == len(self._parameters_)
             self.input_slices = [sl if type(sl) is slice else slice(None) for sl in input_slices]
 
-        for p in self.parts:
+        for p in self._parameters_:
             assert isinstance(p, Kernpart), "bad kernel part"
 
         self.compute_param_slices()
@@ -60,7 +60,7 @@ class kern(Parameterized):
         Get the current state of the class,
         here just all the indices, rest can get recomputed
         """
-        return Parameterized.getstate(self) + [self.parts,
+        return Parameterized.getstate(self) + [self._parameters_,
                 self.num_parts,
                 self.num_params,
                 self.input_dim,
@@ -74,7 +74,7 @@ class kern(Parameterized):
         self.input_dim = state.pop()
         self.num_params = state.pop()
         self.num_parts = state.pop()
-        self.parts = state.pop()
+        self._parameters_ = state.pop()
         Parameterized.setstate(self, state)
 
 
@@ -99,7 +99,7 @@ class kern(Parameterized):
         xticklabels = []
         bars = []
         x0 = 0
-        for p in self.parts:
+        for p in self._parameters_:
             c = Tango.nextMedium()
             if hasattr(p, 'ARD') and p.ARD:
                 if title is None:
@@ -173,7 +173,7 @@ class kern(Parameterized):
         """
         self.param_slices = []
         count = 0
-        for p in self.parts:
+        for p in self._parameters_:
             self.param_slices.append(slice(count, count + p.num_params))
             count += p.num_params
 
@@ -202,7 +202,7 @@ class kern(Parameterized):
             other_input_indices = [sl.indices(other.input_dim) for sl in other.input_slices]
             other_input_slices = [slice(i[0] + self.input_dim, i[1] + self.input_dim, i[2]) for i in other_input_indices]
 
-            newkern = kern(D, self.parts + other.parts, self_input_slices + other_input_slices)
+            newkern = kern(D, self._parameters_ + other._parameters_, self_input_slices + other_input_slices)
 
             # transfer constraints:
             newkern.constrained_indices = self.constrained_indices + [x + self.num_params for x in other.constrained_indices]
@@ -213,7 +213,7 @@ class kern(Parameterized):
             newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices]
         else:
             assert self.input_dim == other.input_dim
-            newkern = kern(self.input_dim, self.parts + other.parts, self.input_slices + other.input_slices)
+            newkern = kern(self.input_dim, self._parameters_ + other._parameters_, self.input_slices + other.input_slices)
             # transfer constraints:
             newkern.constrained_indices = self.constrained_indices + [i + self.num_params  for i in other.constrained_indices]
             newkern.constraints = self.constraints + other.constraints
@@ -251,7 +251,7 @@ class kern(Parameterized):
             s1[sl1], s2[sl2] = [True], [True]
             slices += [s1 + s2]
 
-        newkernparts = [prod(k1, k2, tensor) for k1, k2 in itertools.product(K1.parts, K2.parts)]
+        newkernparts = [prod(k1, k2, tensor) for k1, k2 in itertools.product(K1._parameters_, K2._parameters_)]
 
         if tensor:
             newkern = kern(K1.input_dim + K2.input_dim, newkernparts, slices)
@@ -266,12 +266,12 @@ class kern(Parameterized):
         # Build the array that allows to go from the initial indices of the param to the new ones
         K1_param = []
         n = 0
-        for k1 in K1.parts:
+        for k1 in K1._parameters_:
             K1_param += [range(n, n + k1.num_params)]
             n += k1.num_params
         n = 0
         K2_param = []
-        for k2 in K2.parts:
+        for k2 in K2._parameters_:
             K2_param += [range(K1.num_params + n, K1.num_params + n + k2.num_params)]
             n += k2.num_params
         index_param = []
@@ -303,19 +303,19 @@ class kern(Parameterized):
             self.constrain(np.where(index_param == i)[0], t)
 
     def _get_params(self):
-        return np.hstack([p._get_params() for p in self.parts])
+        return np.hstack([p._get_params() for p in self._parameters_])
 
     def _set_params(self, x):
-        [p._set_params(x[s]) for p, s in zip(self.parts, self.param_slices)]
+        [p._set_params(x[s]) for p, s in zip(self._parameters_, self.param_slices)]
 
     def _get_param_names(self):
-        # this is a bit nasty: we want to distinguish between parts with the same name by appending a count
-        part_names = np.array([k.name for k in self.parts], dtype=np.str)
+        # this is a bit nasty: we want to distinguish between _parameters_ with the same name by appending a count
+        part_names = np.array([k.name for k in self._parameters_], dtype=np.str)
         counts = [np.sum(part_names == ni) for i, ni in enumerate(part_names)]
         cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)]
         names = [name + '_' + str(cum_count) if count > 1 else name for name, count, cum_count in zip(part_names, counts, cum_counts)]
 
-        return sum([[name + '_' + n for n in k._get_param_names()] for name, k in zip(names, self.parts)], [])
+        return sum([[name + '_' + n for n in k._get_param_names()] for name, k in zip(names, self._parameters_)], [])
 
     def K(self, X, X2=None, which_parts='all'):
         """
@@ -334,10 +334,10 @@ class kern(Parameterized):
         assert X.shape[1] == self.input_dim
         if X2 is None:
             target = np.zeros((X.shape[0], X.shape[0]))
-            [p.K(X[:, i_s], None, target=target) for p, i_s, part_i_used in zip(self.parts, self.input_slices, which_parts) if part_i_used]
+            [p.K(X[:, i_s], None, target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used]
         else:
             target = np.zeros((X.shape[0], X2.shape[0]))
-            [p.K(X[:, i_s], X2[:, i_s], target=target) for p, i_s, part_i_used in zip(self.parts, self.input_slices, which_parts) if part_i_used]
+            [p.K(X[:, i_s], X2[:, i_s], target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used]
         return target
 
     def dK_dtheta(self, dL_dK, X, X2=None):
@@ -356,9 +356,9 @@ class kern(Parameterized):
         assert X.shape[1] == self.input_dim
         target = np.zeros(self.num_params)
         if X2 is None:
-            [p.dK_dtheta(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self.parts, self.input_slices, self.param_slices)]
+            [p.dK_dtheta(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self.param_slices)]
         else:
-            [p.dK_dtheta(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self.parts, self.input_slices, self.param_slices)]
+            [p.dK_dtheta(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self.param_slices)]
 
         return self._transform_gradients(target)
 
@@ -374,9 +374,9 @@ class kern(Parameterized):
 
         target = np.zeros_like(X)
         if X2 is None: 
-            [p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
+            [p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
         else:
-            [p.dK_dX(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
+            [p.dK_dX(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
         return target
 
     def Kdiag(self, X, which_parts='all'):
@@ -385,7 +385,7 @@ class kern(Parameterized):
             which_parts = [True] * self.num_parts
         assert X.shape[1] == self.input_dim
         target = np.zeros(X.shape[0])
-        [p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self.parts, self.input_slices, which_parts) if part_on]
+        [p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self._parameters_, self.input_slices, which_parts) if part_on]
         return target
 
     def dKdiag_dtheta(self, dL_dKdiag, X):
@@ -393,131 +393,200 @@ class kern(Parameterized):
         assert X.shape[1] == self.input_dim
         assert dL_dKdiag.size == X.shape[0]
         target = np.zeros(self.num_params)
-        [p.dKdiag_dtheta(dL_dKdiag, X[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
+        [p.dKdiag_dtheta(dL_dKdiag, X[:, i_s], target[ps]) for p, i_s, ps in zip(self._parameters_, self.input_slices, self.param_slices)]
         return self._transform_gradients(target)
 
     def dKdiag_dX(self, dL_dKdiag, X):
         assert X.shape[1] == self.input_dim
         target = np.zeros_like(X)
-        [p.dKdiag_dX(dL_dKdiag, X[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
+        [p.dKdiag_dX(dL_dKdiag, X[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
         return target
 
     def psi0(self, Z, mu, S):
         target = np.zeros(mu.shape[0])
-        [p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
+        [p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)]
         return target
 
     def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S):
         target = np.zeros(self.num_params)
-        [p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)]
+        [p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self._parameters_, self.param_slices, self.input_slices)]
         return self._transform_gradients(target)
 
     def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S):
         target_mu, target_S = np.zeros_like(mu), np.zeros_like(S)
-        [p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
+        [p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
         return target_mu, target_S
 
     def psi1(self, Z, mu, S):
         target = np.zeros((mu.shape[0], Z.shape[0]))
-        [p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
+        [p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)]
         return target
 
     def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S):
         target = np.zeros((self.num_params))
-        [p.dpsi1_dtheta(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)]
+        [p.dpsi1_dtheta(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self._parameters_, self.param_slices, self.input_slices)]
         return self._transform_gradients(target)
 
     def dpsi1_dZ(self, dL_dpsi1, Z, mu, S):
         target = np.zeros_like(Z)
-        [p.dpsi1_dZ(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
+        [p.dpsi1_dZ(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
         return target
 
     def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S):
         """return shapes are num_samples,num_inducing,input_dim"""
         target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
-        [p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
+        [p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
         return target_mu, target_S
 
     def psi2(self, Z, mu, S):
         """
-        Computer the psi2 statistics for the covariance function.
-        
-        :param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
-        :param mu, S: np.ndarrays of means and variances (each num_samples x input_dim)
-        :returns psi2: np.ndarray (num_samples,num_inducing,num_inducing)
-
+        :param Z: np.ndarray of inducing inputs (M x Q)
+        :param mu, S: np.ndarrays of means and variances (each N x Q)
+        :returns psi2: np.ndarray (N,M,M)
         """
         target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
-        [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
+        [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)]
 
         # compute the "cross" terms
         # TODO: input_slices needed
-        crossterms = 0
+        from parts.white import White
+        from parts.rbf import RBF
+        from parts.rbf_inv import RBFInv
+        from parts.bias import Bias
+        from parts.linear import Linear
 
-        for [p1, i_s1], [p2, i_s2] in itertools.combinations(zip(self.parts, self.input_slices), 2):
-            if i_s1 == i_s2:
-                # TODO psi1 this must be faster/better/precached/more nice
-                tmp1 = np.zeros((mu.shape[0], Z.shape[0]))
-                p1.psi1(Z[:, i_s1], mu[:, i_s1], S[:, i_s1], tmp1)
-                tmp2 = np.zeros((mu.shape[0], Z.shape[0]))
-                p2.psi1(Z[:, i_s2], mu[:, i_s2], S[:, i_s2], tmp2)
-    
-                prod = np.multiply(tmp1, tmp2)
-                crossterms += prod[:, :, None] + prod[:, None, :]
-
-        # target += crossterms
-        return target + crossterms
+        for (p1, i1), (p2, i2) in itertools.combinations(itertools.izip(self._parameters_, self._param_slices_), 2):
+            # white doesn;t combine with anything
+            if isinstance(p1, White) or isinstance(p2, White):
+                pass
+            # rbf X bias
+            elif isinstance(p1, Bias) and isinstance(p2, (RBF, RBFInv)):
+                target += p1.variance * (p2._psi1[:, :, None] + p2._psi1[:, None, :])
+            elif isinstance(p2, Bias) and isinstance(p1, (RBF, RBFInv)):
+                target += p2.variance * (p1._psi1[:, :, None] + p1._psi1[:, None, :])
+            # linear X bias
+            elif isinstance(p1, Bias) and isinstance(p2, Linear):
+                tmp = np.zeros((mu.shape[0], Z.shape[0]))
+                p2.psi1(Z, mu, S, tmp)
+                target += p1.variance * (tmp[:, :, None] + tmp[:, None, :])
+            elif isinstance(p2, Bias) and isinstance(p1, Linear):
+                tmp = np.zeros((mu.shape[0], Z.shape[0]))
+                p1.psi1(Z, mu, S, tmp)
+                target += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
+            # rbf X linear
+            elif isinstance(p1, Linear) and isinstance(p2, (RBF, RBFInv)):
+                pass
+            elif isinstance(p2, Linear) and isinstance(p1, (RBF, RBFInv)):
+                raise NotImplementedError # TODO
+            elif isinstance(p1, (RBF, RBFInv)) and isinstance(p2, (RBF, RBFInv)):
+                raise NotImplementedError # TODO
+            elif isinstance(p2, (RBF, RBFInv)) and isinstance(p1, (RBF, RBFInv)):
+                raise NotImplementedError # TODO
+            else:
+                raise NotImplementedError, "psi2 cannot be computed for this kernel"
+        return target
 
     def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S):
-        """Gradient of the psi2 statistics with respect to the parameters."""
-        target = np.zeros(self.num_params)
-        [p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
+        target = np.zeros(self.Nparam)
+        [p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self._parameters_, self.input_slices, self.param_slices)]
 
         # compute the "cross" terms
         # TODO: better looping, input_slices
-        for i1, i2 in itertools.permutations(range(len(self.parts)), 2):
-            p1, p2 = self.parts[i1], self.parts[i2]
+        for i1, i2 in itertools.combinations(range(len(self._parameters_)), 2):
+            p1, p2 = self._parameters_[i1], self._parameters_[i2]
 #             ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2]
             ps1, ps2 = self.param_slices[i1], self.param_slices[i2]
 
-            tmp = np.zeros((mu.shape[0], Z.shape[0]))
-            p1.psi1(Z, mu, S, tmp)
-            p2.dpsi1_dtheta((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target[ps2])
+            # white doesn;t combine with anything
+            if p1.name == 'white' or p2.name == 'white':
+                pass
+            # rbf X bias
+            elif p1.name == 'bias' and p2.name == 'rbf':
+                p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target[ps2])
+                p1.dpsi1_dtheta(dL_dpsi2.sum(1) * p2._psi1 * 2., Z, mu, S, target[ps1])
+            elif p2.name == 'bias' and p1.name == 'rbf':
+                p1.dpsi1_dtheta(dL_dpsi2.sum(1) * p2.variance * 2., Z, mu, S, target[ps1])
+                p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1._psi1 * 2., Z, mu, S, target[ps2])
+            # linear X bias
+            elif p1.name == 'bias' and p2.name == 'linear':
+                p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target[ps2]) # [ps1])
+                psi1 = np.zeros((mu.shape[0], Z.shape[0]))
+                p2.psi1(Z, mu, S, psi1)
+                p1.dpsi1_dtheta(dL_dpsi2.sum(1) * psi1 * 2., Z, mu, S, target[ps1])
+            elif p2.name == 'bias' and p1.name == 'linear':
+                p1.dpsi1_dtheta(dL_dpsi2.sum(1) * p2.variance * 2., Z, mu, S, target[ps1])
+                psi1 = np.zeros((mu.shape[0], Z.shape[0]))
+                p1.psi1(Z, mu, S, psi1)
+                p2.dpsi1_dtheta(dL_dpsi2.sum(1) * psi1 * 2., Z, mu, S, target[ps2])
+            # rbf X linear
+            elif p1.name == 'linear' and p2.name == 'rbf':
+                raise NotImplementedError # TODO
+            elif p2.name == 'linear' and p1.name == 'rbf':
+                raise NotImplementedError # TODO
+            else:
+                raise NotImplementedError, "psi2 cannot be computed for this kernel"
 
         return self._transform_gradients(target)
 
     def dpsi2_dZ(self, dL_dpsi2, Z, mu, S):
         target = np.zeros_like(Z)
-        [p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
-        # target *= 2
+        [p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
 
         # compute the "cross" terms
         # TODO: we need input_slices here.
-        for p1, p2 in itertools.permutations(self.parts, 2):
-            if p1.name == 'linear' and p2.name == 'linear':
-                raise NotImplementedError("We don't handle linear/linear cross-terms")
-            tmp = np.zeros((mu.shape[0], Z.shape[0]))
-            p1.psi1(Z, mu, S, tmp)
-            p2.dpsi1_dZ((tmp[:, None, :] * dL_dpsi2).sum(1), Z, mu, S, target)
+        for p1, p2 in itertools.combinations(self._parameters_, 2):
+            # white doesn;t combine with anything
+            if p1.name == 'white' or p2.name == 'white':
+                pass
+            # rbf X bias
+            elif p1.name == 'bias' and p2.name == 'rbf':
+                p2.dpsi1_dX(dL_dpsi2.sum(1).T * p1.variance, Z, mu, S, target)
+            elif p2.name == 'bias' and p1.name == 'rbf':
+                p1.dpsi1_dZ(dL_dpsi2.sum(1).T * p2.variance, Z, mu, S, target)
+            # linear X bias
+            elif p1.name == 'bias' and p2.name == 'linear':
+                p2.dpsi1_dZ(dL_dpsi2.sum(1).T * p1.variance, Z, mu, S, target)
+            elif p2.name == 'bias' and p1.name == 'linear':
+                p1.dpsi1_dZ(dL_dpsi2.sum(1).T * p2.variance, Z, mu, S, target)
+            # rbf X linear
+            elif p1.name == 'linear' and p2.name == 'rbf':
+                raise NotImplementedError # TODO
+            elif p2.name == 'linear' and p1.name == 'rbf':
+                raise NotImplementedError # TODO
+            else:
+                raise NotImplementedError, "psi2 cannot be computed for this kernel"
 
-        return target * 2
+        return target * 2.
 
     def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S):
         target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
-        [p.dpsi2_dmuS(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
+        [p.dpsi2_dmuS(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
 
         # compute the "cross" terms
         # TODO: we need input_slices here.
-        for p1, p2 in itertools.permutations(self.parts, 2):
-            if p1.name == 'linear' and p2.name == 'linear':
-                raise NotImplementedError("We don't handle linear/linear cross-terms")
-
-            tmp = np.zeros((mu.shape[0], Z.shape[0]))
-            p1.psi1(Z, mu, S, tmp)
-            p2.dpsi1_dmuS((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target_mu, target_S)
+        for p1, p2 in itertools.combinations(self._parameters_, 2):
+            # white doesn;t combine with anything
+            if p1.name == 'white' or p2.name == 'white':
+                pass
+            # rbf X bias
+            elif p1.name == 'bias' and p2.name == 'rbf':
+                p2.dpsi1_dmuS(dL_dpsi2.sum(1).T * p1.variance * 2., Z, mu, S, target_mu, target_S)
+            elif p2.name == 'bias' and p1.name == 'rbf':
+                p1.dpsi1_dmuS(dL_dpsi2.sum(1).T * p2.variance * 2., Z, mu, S, target_mu, target_S)
+            # linear X bias
+            elif p1.name == 'bias' and p2.name == 'linear':
+                p2.dpsi1_dmuS(dL_dpsi2.sum(1).T * p1.variance * 2., Z, mu, S, target_mu, target_S)
+            elif p2.name == 'bias' and p1.name == 'linear':
+                p1.dpsi1_dmuS(dL_dpsi2.sum(1).T * p2.variance * 2., Z, mu, S, target_mu, target_S)
+            # rbf X linear
+            elif p1.name == 'linear' and p2.name == 'rbf':
+                raise NotImplementedError # TODO
+            elif p2.name == 'linear' and p1.name == 'rbf':
+                raise NotImplementedError # TODO
+            else:
+                raise NotImplementedError, "psi2 cannot be computed for this kernel"
 
         return target_mu, target_S
-
     def plot(self, x=None, plot_limits=None, which_parts='all', resolution=None, *args, **kwargs):
         if which_parts == 'all':
             which_parts = [True] * self.num_parts
diff --git a/GPy/testing/psi_stat_expectation_tests.py b/GPy/testing/psi_stat_expectation_tests.py
index bcdbd2af..16904927 100644
--- a/GPy/testing/psi_stat_expectation_tests.py
+++ b/GPy/testing/psi_stat_expectation_tests.py
@@ -28,8 +28,8 @@ def ard(p):
 class Test(unittest.TestCase):
     input_dim = 9
     num_inducing = 4
-    N = 3
-    Nsamples = 5e6
+    N = 30
+    Nsamples = 9e6
 
     def setUp(self):
         i_s_dim_list = [2,4,3]
@@ -45,20 +45,26 @@ class Test(unittest.TestCase):
                                          input_slices = input_slices
                                          )
         self.kerns = (
-                    input_slice_kern,
+#                     input_slice_kern,
 #                       (GPy.kern.rbf(self.input_dim, ARD=True) +
 #                        GPy.kern.linear(self.input_dim, ARD=True) +
 #                        GPy.kern.bias(self.input_dim) +
 #                        GPy.kern.white(self.input_dim)),
 #                     (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
-#                      GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
-#                      GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
-#                      GPy.kern.bias(self.input_dim) +
-#                      GPy.kern.white(self.input_dim)),
-#                       GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
+#                     GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
+#                     GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
+#                     GPy.kern.bias(self.input_dim) +
+#                     GPy.kern.white(self.input_dim)),
+        (GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
+                    GPy.kern.bias(self.input_dim, np.random.rand()) +
+                    GPy.kern.white(self.input_dim, np.random.rand())),
+                (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
+                    GPy.kern.bias(self.input_dim, np.random.rand()) +
+                    GPy.kern.white(self.input_dim, np.random.rand())),
+#                     GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
 #                       GPy.kern.linear(self.input_dim, ARD=False), GPy.kern.linear(self.input_dim, ARD=True),
 #                       GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim),
-#                       GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim),
+#                     GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim),
 #                       GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim) + GPy.kern.white(self.input_dim),
 #                       GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim) + GPy.kern.white(self.input_dim),
 #                       GPy.kern.bias(self.input_dim), GPy.kern.white(self.input_dim),
@@ -79,7 +85,7 @@ class Test(unittest.TestCase):
 
     def test_psi1(self):
         for kern in self.kerns:
-            Nsamples = np.floor(self.Nsamples/300.)
+            Nsamples = np.floor(self.Nsamples/self.N)
             psi1 = kern.psi1(self.Z, self.q_x_mean, self.q_x_variance)
             K_ = np.zeros((Nsamples, self.num_inducing))
             diffs = []
@@ -105,7 +111,7 @@ class Test(unittest.TestCase):
 
     def test_psi2(self):
         for kern in self.kerns:
-            Nsamples = self.Nsamples/10.
+            Nsamples = int(np.floor(self.Nsamples/self.N))
             psi2 = kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
             K_ = np.zeros((self.num_inducing, self.num_inducing))
             diffs = []
@@ -119,10 +125,10 @@ class Test(unittest.TestCase):
             try:
                 import pylab
                 pylab.figure(msg)
-                pylab.plot(diffs)
+                pylab.plot(diffs, marker='x', mew=1.3)
 #                 print msg, np.allclose(psi2.squeeze(), K_, rtol=1e-1, atol=.1)
-                self.assertTrue(np.allclose(psi2.squeeze(), K_,
-                                            rtol=1e-1, atol=.1),
+                self.assertTrue(np.allclose(psi2.squeeze(), K_),
+                                            #rtol=1e-1, atol=.1),
                                 msg=msg + ": not matching")
 #                 sys.stdout.write(".")
             except:

From d2db4c66885acdf51480032a43c4e11db09fb480 Mon Sep 17 00:00:00 2001
From: Ricardo <acq11ra@sheffield.ac.uk>
Date: Thu, 7 Nov 2013 17:34:41 +0000
Subject: [PATCH 164/252] passing **noise_args into predictive_values

---
 GPy/likelihoods/ep.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/likelihoods/ep.py b/GPy/likelihoods/ep.py
index 32575813..aa106067 100644
--- a/GPy/likelihoods/ep.py
+++ b/GPy/likelihoods/ep.py
@@ -49,10 +49,10 @@ class EP(likelihood):
         self.VVT_factor = self.V
         self.trYYT = 0.
 
-    def predictive_values(self,mu,var,full_cov):
+    def predictive_values(self,mu,var,full_cov,**noise_args):
         if full_cov:
             raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        return self.noise_model.predictive_values(mu,var)
+        return self.noise_model.predictive_values(mu,var,**noise_args)
 
     def log_predictive_density(self, y_test, mu_star, var_star):
         """

From ae6648e0cf2e786207c08e4bdf8ed63d9d62fddc Mon Sep 17 00:00:00 2001
From: Ricardo <acq11ra@sheffield.ac.uk>
Date: Thu, 7 Nov 2013 17:35:41 +0000
Subject: [PATCH 165/252] 2D plots fixed

---
 GPy/core/gp_base.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index 10d30358..b6e4ebc0 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -37,7 +37,7 @@ class GPBase(Model):
         # the end
 
 
-    def posterior_samples_f(self,X,size=10,which_parts='all',full_cov=True):
+    def posterior_samples_f(self,X,size=10,which_parts='all'):
         """
         Samples the posterior GP at the points X.
 
@@ -51,16 +51,13 @@ class GPBase(Model):
         :type full_cov: bool.
         :returns: Ysim: set of simulations, a Numpy array (N x samples).
         """
-        m, v = self._raw_predict(X, which_parts=which_parts, full_cov=full_cov)
+        m, v = self._raw_predict(X, which_parts=which_parts, full_cov=True)
         v = v.reshape(m.size,-1) if len(v.shape)==3 else v
-        if not full_cov:
-            Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
-        else:
-            Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
+        Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
 
         return Ysim
 
-    def posterior_samples(self,X,size=10,which_parts='all',full_cov=True,noise_model=None):
+    def posterior_samples(self,X,size=10,which_parts='all',noise_model=None):
         """
         Samples the posterior GP at the points X.
 
@@ -76,7 +73,7 @@ class GPBase(Model):
         :type noise_model: integer.
         :returns: Ysim: set of simulations, a Numpy array (N x samples).
         """
-        Ysim = self.posterior_samples_f(X, size, which_parts=which_parts, full_cov=full_cov)
+        Ysim = self.posterior_samples_f(X, size, which_parts=which_parts, full_cov=True)
         if isinstance(self.likelihood,Gaussian):
             noise_std = np.sqrt(self.likelihood._get_params())
             Ysim += np.random.normal(0,noise_std,Ysim.shape)
@@ -209,11 +206,11 @@ class GPBase(Model):
             x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
 
             #predict on the frame and plot
-            if use_raw_predict:
+            if plot_raw:
                 m, _ = self._raw_predict(Xgrid, which_parts=which_parts)
                 Y = self.likelihood.Y
             else:
-                m, _, _, _ = self.predict(Xgrid, which_parts=which_parts)
+                m, _, _, _ = self.predict(Xgrid, which_parts=which_parts,num_samples=100) #FIXME we need a balance between accuracy and speed to define num_samples
                 Y = self.likelihood.data
             for d in which_data_ycols:
                 m_d = m[:,d].reshape(resolution, resolution).T

From 4f6dfba5be0c8b27f3d6399888c4fb3ba3d4b339 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Fri, 8 Nov 2013 11:12:26 +0000
Subject: [PATCH 166/252] reverted broken kern

---
 GPy/kern/kern.py | 128 ++++++++++++++++++++++++-----------------------
 1 file changed, 66 insertions(+), 62 deletions(-)

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 37839423..7a4996d6 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -18,37 +18,37 @@ class kern(Parameterized):
         like which parameters live where.
 
         The technical code for kernels is divided into _parts_ (see
-        e.g. rbf.py). This object contains a list of _parameters_, which are
-        computed additively. For multiplication, special _prod_ _parameters_
+        e.g. rbf.py). This object contains a list of parts, which are
+        computed additively. For multiplication, special _prod_ parts
         are used.
 
         :param input_dim: The dimensionality of the kernel's input space
         :type input_dim: int
-        :param _parameters_: the '_parameters_' (PD functions) of the kernel
-        :type _parameters_: list of Kernpart objects
+        :param parts: the 'parts' (PD functions) of the kernel
+        :type parts: list of Kernpart objects
         :param input_slices: the slices on the inputs which apply to each kernel
         :type input_slices: list of slice objects, or list of bools
 
         """
-        self._parameters_ = parts
+        self.parts = parts
         self.num_parts = len(parts)
-        self.num_params = sum([p.num_params for p in self._parameters_])
+        self.num_params = sum([p.num_params for p in self.parts])
 
         self.input_dim = input_dim
 
-        part_names = [k.name for k in self._parameters_]
+        part_names = [k.name for k in self.parts]
         self.name=''
         for name in part_names:
             self.name += name + '+'
         self.name = self.name[:-1]
         # deal with input_slices
         if input_slices is None:
-            self.input_slices = [slice(None) for p in self._parameters_]
+            self.input_slices = [slice(None) for p in self.parts]
         else:
-            assert len(input_slices) == len(self._parameters_)
+            assert len(input_slices) == len(self.parts)
             self.input_slices = [sl if type(sl) is slice else slice(None) for sl in input_slices]
 
-        for p in self._parameters_:
+        for p in self.parts:
             assert isinstance(p, Kernpart), "bad kernel part"
 
         self.compute_param_slices()
@@ -60,7 +60,7 @@ class kern(Parameterized):
         Get the current state of the class,
         here just all the indices, rest can get recomputed
         """
-        return Parameterized.getstate(self) + [self._parameters_,
+        return Parameterized.getstate(self) + [self.parts,
                 self.num_parts,
                 self.num_params,
                 self.input_dim,
@@ -74,7 +74,7 @@ class kern(Parameterized):
         self.input_dim = state.pop()
         self.num_params = state.pop()
         self.num_parts = state.pop()
-        self._parameters_ = state.pop()
+        self.parts = state.pop()
         Parameterized.setstate(self, state)
 
 
@@ -99,7 +99,7 @@ class kern(Parameterized):
         xticklabels = []
         bars = []
         x0 = 0
-        for p in self._parameters_:
+        for p in self.parts:
             c = Tango.nextMedium()
             if hasattr(p, 'ARD') and p.ARD:
                 if title is None:
@@ -173,7 +173,7 @@ class kern(Parameterized):
         """
         self.param_slices = []
         count = 0
-        for p in self._parameters_:
+        for p in self.parts:
             self.param_slices.append(slice(count, count + p.num_params))
             count += p.num_params
 
@@ -202,7 +202,7 @@ class kern(Parameterized):
             other_input_indices = [sl.indices(other.input_dim) for sl in other.input_slices]
             other_input_slices = [slice(i[0] + self.input_dim, i[1] + self.input_dim, i[2]) for i in other_input_indices]
 
-            newkern = kern(D, self._parameters_ + other._parameters_, self_input_slices + other_input_slices)
+            newkern = kern(D, self.parts + other.parts, self_input_slices + other_input_slices)
 
             # transfer constraints:
             newkern.constrained_indices = self.constrained_indices + [x + self.num_params for x in other.constrained_indices]
@@ -213,7 +213,7 @@ class kern(Parameterized):
             newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices]
         else:
             assert self.input_dim == other.input_dim
-            newkern = kern(self.input_dim, self._parameters_ + other._parameters_, self.input_slices + other.input_slices)
+            newkern = kern(self.input_dim, self.parts + other.parts, self.input_slices + other.input_slices)
             # transfer constraints:
             newkern.constrained_indices = self.constrained_indices + [i + self.num_params  for i in other.constrained_indices]
             newkern.constraints = self.constraints + other.constraints
@@ -251,7 +251,7 @@ class kern(Parameterized):
             s1[sl1], s2[sl2] = [True], [True]
             slices += [s1 + s2]
 
-        newkernparts = [prod(k1, k2, tensor) for k1, k2 in itertools.product(K1._parameters_, K2._parameters_)]
+        newkernparts = [prod(k1, k2, tensor) for k1, k2 in itertools.product(K1.parts, K2.parts)]
 
         if tensor:
             newkern = kern(K1.input_dim + K2.input_dim, newkernparts, slices)
@@ -266,12 +266,12 @@ class kern(Parameterized):
         # Build the array that allows to go from the initial indices of the param to the new ones
         K1_param = []
         n = 0
-        for k1 in K1._parameters_:
+        for k1 in K1.parts:
             K1_param += [range(n, n + k1.num_params)]
             n += k1.num_params
         n = 0
         K2_param = []
-        for k2 in K2._parameters_:
+        for k2 in K2.parts:
             K2_param += [range(K1.num_params + n, K1.num_params + n + k2.num_params)]
             n += k2.num_params
         index_param = []
@@ -303,19 +303,19 @@ class kern(Parameterized):
             self.constrain(np.where(index_param == i)[0], t)
 
     def _get_params(self):
-        return np.hstack([p._get_params() for p in self._parameters_])
+        return np.hstack([p._get_params() for p in self.parts])
 
     def _set_params(self, x):
-        [p._set_params(x[s]) for p, s in zip(self._parameters_, self.param_slices)]
+        [p._set_params(x[s]) for p, s in zip(self.parts, self.param_slices)]
 
     def _get_param_names(self):
-        # this is a bit nasty: we want to distinguish between _parameters_ with the same name by appending a count
-        part_names = np.array([k.name for k in self._parameters_], dtype=np.str)
+        # this is a bit nasty: we want to distinguish between parts with the same name by appending a count
+        part_names = np.array([k.name for k in self.parts], dtype=np.str)
         counts = [np.sum(part_names == ni) for i, ni in enumerate(part_names)]
         cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)]
         names = [name + '_' + str(cum_count) if count > 1 else name for name, count, cum_count in zip(part_names, counts, cum_counts)]
 
-        return sum([[name + '_' + n for n in k._get_param_names()] for name, k in zip(names, self._parameters_)], [])
+        return sum([[name + '_' + n for n in k._get_param_names()] for name, k in zip(names, self.parts)], [])
 
     def K(self, X, X2=None, which_parts='all'):
         """
@@ -334,10 +334,10 @@ class kern(Parameterized):
         assert X.shape[1] == self.input_dim
         if X2 is None:
             target = np.zeros((X.shape[0], X.shape[0]))
-            [p.K(X[:, i_s], None, target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used]
+            [p.K(X[:, i_s], None, target=target) for p, i_s, part_i_used in zip(self.parts, self.input_slices, which_parts) if part_i_used]
         else:
             target = np.zeros((X.shape[0], X2.shape[0]))
-            [p.K(X[:, i_s], X2[:, i_s], target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used]
+            [p.K(X[:, i_s], X2[:, i_s], target=target) for p, i_s, part_i_used in zip(self.parts, self.input_slices, which_parts) if part_i_used]
         return target
 
     def dK_dtheta(self, dL_dK, X, X2=None):
@@ -356,9 +356,9 @@ class kern(Parameterized):
         assert X.shape[1] == self.input_dim
         target = np.zeros(self.num_params)
         if X2 is None:
-            [p.dK_dtheta(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self.param_slices)]
+            [p.dK_dtheta(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self.parts, self.input_slices, self.param_slices)]
         else:
-            [p.dK_dtheta(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self.param_slices)]
+            [p.dK_dtheta(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self.parts, self.input_slices, self.param_slices)]
 
         return self._transform_gradients(target)
 
@@ -374,9 +374,9 @@ class kern(Parameterized):
 
         target = np.zeros_like(X)
         if X2 is None: 
-            [p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
+            [p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
         else:
-            [p.dK_dX(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
+            [p.dK_dX(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
         return target
 
     def Kdiag(self, X, which_parts='all'):
@@ -385,7 +385,7 @@ class kern(Parameterized):
             which_parts = [True] * self.num_parts
         assert X.shape[1] == self.input_dim
         target = np.zeros(X.shape[0])
-        [p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self._parameters_, self.input_slices, which_parts) if part_on]
+        [p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self.parts, self.input_slices, which_parts) if part_on]
         return target
 
     def dKdiag_dtheta(self, dL_dKdiag, X):
@@ -393,49 +393,49 @@ class kern(Parameterized):
         assert X.shape[1] == self.input_dim
         assert dL_dKdiag.size == X.shape[0]
         target = np.zeros(self.num_params)
-        [p.dKdiag_dtheta(dL_dKdiag, X[:, i_s], target[ps]) for p, i_s, ps in zip(self._parameters_, self.input_slices, self.param_slices)]
+        [p.dKdiag_dtheta(dL_dKdiag, X[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
         return self._transform_gradients(target)
 
     def dKdiag_dX(self, dL_dKdiag, X):
         assert X.shape[1] == self.input_dim
         target = np.zeros_like(X)
-        [p.dKdiag_dX(dL_dKdiag, X[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
+        [p.dKdiag_dX(dL_dKdiag, X[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
         return target
 
     def psi0(self, Z, mu, S):
         target = np.zeros(mu.shape[0])
-        [p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)]
+        [p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
         return target
 
     def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S):
         target = np.zeros(self.num_params)
-        [p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self._parameters_, self.param_slices, self.input_slices)]
+        [p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)]
         return self._transform_gradients(target)
 
     def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S):
         target_mu, target_S = np.zeros_like(mu), np.zeros_like(S)
-        [p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
+        [p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
         return target_mu, target_S
 
     def psi1(self, Z, mu, S):
         target = np.zeros((mu.shape[0], Z.shape[0]))
-        [p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)]
+        [p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
         return target
 
     def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S):
         target = np.zeros((self.num_params))
-        [p.dpsi1_dtheta(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self._parameters_, self.param_slices, self.input_slices)]
+        [p.dpsi1_dtheta(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)]
         return self._transform_gradients(target)
 
     def dpsi1_dZ(self, dL_dpsi1, Z, mu, S):
         target = np.zeros_like(Z)
-        [p.dpsi1_dZ(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
+        [p.dpsi1_dZ(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
         return target
 
     def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S):
         """return shapes are num_samples,num_inducing,input_dim"""
         target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
-        [p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
+        [p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
         return target_mu, target_S
 
     def psi2(self, Z, mu, S):
@@ -445,7 +445,7 @@ class kern(Parameterized):
         :returns psi2: np.ndarray (N,M,M)
         """
         target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
-        [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)]
+        [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
 
         # compute the "cross" terms
         # TODO: input_slices needed
@@ -454,46 +454,49 @@ class kern(Parameterized):
         from parts.rbf_inv import RBFInv
         from parts.bias import Bias
         from parts.linear import Linear
+        from parts.fixed import Fixed
 
-        for (p1, i1), (p2, i2) in itertools.combinations(itertools.izip(self._parameters_, self._param_slices_), 2):
+        for (p1, i1), (p2, i2) in itertools.combinations(itertools.izip(self.parts, self.param_slices), 2):
             # white doesn;t combine with anything
             if isinstance(p1, White) or isinstance(p2, White):
                 pass
             # rbf X bias
-            elif isinstance(p1, Bias) and isinstance(p2, (RBF, RBFInv)):
+            elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
                 target += p1.variance * (p2._psi1[:, :, None] + p2._psi1[:, None, :])
-            elif isinstance(p2, Bias) and isinstance(p1, (RBF, RBFInv)):
+            elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
+                import ipdb;ipdb.set_trace()
+                tmp1 = p2.variance * (p1._psi1[:, :, None] + p1._psi1[:, None, :])
+
+                renorm = p1.variance*np.exp()
+                
+                tmp2 = asd
                 target += p2.variance * (p1._psi1[:, :, None] + p1._psi1[:, None, :])
             # linear X bias
-            elif isinstance(p1, Bias) and isinstance(p2, Linear):
+            elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, Linear):
                 tmp = np.zeros((mu.shape[0], Z.shape[0]))
                 p2.psi1(Z, mu, S, tmp)
                 target += p1.variance * (tmp[:, :, None] + tmp[:, None, :])
-            elif isinstance(p2, Bias) and isinstance(p1, Linear):
+            elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, Linear):
                 tmp = np.zeros((mu.shape[0], Z.shape[0]))
                 p1.psi1(Z, mu, S, tmp)
                 target += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
-            # rbf X linear
-            elif isinstance(p1, Linear) and isinstance(p2, (RBF, RBFInv)):
+            # rbf X any
+            elif isinstance(p1, (RBF, RBFInv)):
                 pass
-            elif isinstance(p2, Linear) and isinstance(p1, (RBF, RBFInv)):
-                raise NotImplementedError # TODO
-            elif isinstance(p1, (RBF, RBFInv)) and isinstance(p2, (RBF, RBFInv)):
-                raise NotImplementedError # TODO
-            elif isinstance(p2, (RBF, RBFInv)) and isinstance(p1, (RBF, RBFInv)):
+            elif isinstance(p2, (RBF, RBFInv)):
                 raise NotImplementedError # TODO
             else:
                 raise NotImplementedError, "psi2 cannot be computed for this kernel"
         return target
 
     def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S):
-        target = np.zeros(self.Nparam)
-        [p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self._parameters_, self.input_slices, self.param_slices)]
+        target = np.zeros(self.num_params)
+        [p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
 
         # compute the "cross" terms
         # TODO: better looping, input_slices
-        for i1, i2 in itertools.combinations(range(len(self._parameters_)), 2):
-            p1, p2 = self._parameters_[i1], self._parameters_[i2]
+        for i1, i2 in itertools.combinations(range(len(self.parts)), 2):
+            p1, p2 = self.parts[i1], self.parts[i2]
 #             ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2]
             ps1, ps2 = self.param_slices[i1], self.param_slices[i2]
 
@@ -518,7 +521,8 @@ class kern(Parameterized):
                 psi1 = np.zeros((mu.shape[0], Z.shape[0]))
                 p1.psi1(Z, mu, S, psi1)
                 p2.dpsi1_dtheta(dL_dpsi2.sum(1) * psi1 * 2., Z, mu, S, target[ps2])
-            # rbf X linear
+            # rbf X any
+            
             elif p1.name == 'linear' and p2.name == 'rbf':
                 raise NotImplementedError # TODO
             elif p2.name == 'linear' and p1.name == 'rbf':
@@ -530,11 +534,11 @@ class kern(Parameterized):
 
     def dpsi2_dZ(self, dL_dpsi2, Z, mu, S):
         target = np.zeros_like(Z)
-        [p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
+        [p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
 
         # compute the "cross" terms
         # TODO: we need input_slices here.
-        for p1, p2 in itertools.combinations(self._parameters_, 2):
+        for p1, p2 in itertools.combinations(self.parts, 2):
             # white doesn;t combine with anything
             if p1.name == 'white' or p2.name == 'white':
                 pass
@@ -560,11 +564,11 @@ class kern(Parameterized):
 
     def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S):
         target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
-        [p.dpsi2_dmuS(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
+        [p.dpsi2_dmuS(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
 
         # compute the "cross" terms
         # TODO: we need input_slices here.
-        for p1, p2 in itertools.combinations(self._parameters_, 2):
+        for p1, p2 in itertools.combinations(self.parts, 2):
             # white doesn;t combine with anything
             if p1.name == 'white' or p2.name == 'white':
                 pass

From 51ec4293e23b84780395c4760fa7ee6d14f27354 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Fri, 8 Nov 2013 11:17:34 +0000
Subject: [PATCH 167/252] in the middle of crossterms

---
 GPy/kern/kern.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 7a4996d6..619d1687 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -462,14 +462,10 @@ class kern(Parameterized):
                 pass
             # rbf X bias
             elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
-                target += p1.variance * (p2._psi1[:, :, None] + p2._psi1[:, None, :])
+                target += 2 * p1.variance * (p2._psi1[:, :, None] + p2._psi1[:, None, :])
             elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
-                import ipdb;ipdb.set_trace()
                 tmp1 = p2.variance * (p1._psi1[:, :, None] + p1._psi1[:, None, :])
-
                 renorm = p1.variance*np.exp()
-                
-                tmp2 = asd
                 target += p2.variance * (p1._psi1[:, :, None] + p1._psi1[:, None, :])
             # linear X bias
             elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, Linear):

From f4ecb47464714fccbb89dfe9246bb7575a568944 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Fri, 8 Nov 2013 14:08:19 +0000
Subject: [PATCH 168/252] added getstate/setstate for product kernel

---
 GPy/kern/parts/prod.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/GPy/kern/parts/prod.py b/GPy/kern/parts/prod.py
index 0549ea22..e386a292 100644
--- a/GPy/kern/parts/prod.py
+++ b/GPy/kern/parts/prod.py
@@ -130,3 +130,14 @@ class Prod(Kernpart):
                 self.k1.K(X[:,self.slice1],X2[:,self.slice1],self._K1)
                 self.k2.K(X[:,self.slice2],X2[:,self.slice2],self._K2)
 
+    def getstate(self):
+        return [self._get_params(), self.k1, self.k2, self.slice1, self.slice2, self.name]
+
+    def setstate(self, state):
+        params, self.k1, self.k2, self.slice1, self.slice2, self.name = state
+        self._X, self._X2, self._params = np.empty(shape=(3,1))
+        self._set_params(params)
+
+
+
+

From c3d84f1d9d0b9cf6a99b4f9fdbaf99ae656f24a0 Mon Sep 17 00:00:00 2001
From: Ricardo <acq11ra@sheffield.ac.uk>
Date: Fri, 8 Nov 2013 17:39:52 +0000
Subject: [PATCH 169/252] predictive_mean and predictive_variance now use
 gp_var as a parameter, rather than gp_std

---
 GPy/likelihoods/noise_models/bernoulli_noise.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/GPy/likelihoods/noise_models/bernoulli_noise.py b/GPy/likelihoods/noise_models/bernoulli_noise.py
index 2c4116da..17390e55 100644
--- a/GPy/likelihoods/noise_models/bernoulli_noise.py
+++ b/GPy/likelihoods/noise_models/bernoulli_noise.py
@@ -71,15 +71,19 @@ class Bernoulli(NoiseDistribution):
 
         return Z_hat, mu_hat, sigma2_hat
 
-    def _predictive_mean_analytical(self,mu,sigma):
+    def _predictive_mean_analytical(self,mu,variance):
+
         if isinstance(self.gp_link,gp_transformations.Probit):
-            return stats.norm.cdf(mu/np.sqrt(1+sigma**2))
+            return stats.norm.cdf(mu/np.sqrt(1+variance))
+
         elif isinstance(self.gp_link,gp_transformations.Heaviside):
-            return stats.norm.cdf(mu/sigma)
+            return stats.norm.cdf(mu/np.sqrt(variance))
+
         else:
             raise NotImplementedError
 
-    def _predictive_variance_analytical(self,mu,sigma, pred_mean):
+    def _predictive_variance_analytical(self,mu,variance, pred_mean):
+
         if isinstance(self.gp_link,gp_transformations.Heaviside):
             return 0.
         else:

From e3173c4ff43380d9a8f50585ad8d34ae58029c60 Mon Sep 17 00:00:00 2001
From: Ricardo <acq11ra@sheffield.ac.uk>
Date: Fri, 8 Nov 2013 17:40:27 +0000
Subject: [PATCH 170/252] numerical predictions fixed, sampling predictions are
 not working

---
 .../noise_models/noise_distributions.py       | 120 +++++++++---------
 1 file changed, 61 insertions(+), 59 deletions(-)

diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 77cc82a4..79d9ffeb 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -11,6 +11,7 @@ from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import gp_transformations
 from GPy.util.misc import chain_1, chain_2, chain_3
 from scipy.integrate import quad
+import warnings
 
 class NoiseDistribution(object):
     """
@@ -103,23 +104,27 @@ class NoiseDistribution(object):
         def int_1(f):
             return self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
         z, accuracy = quad(int_1, -np.inf, np.inf)
-        z /= np.sqrt(2*np.pi/tau)
+        #z /= np.sqrt(2*np.pi/tau)
 
         #Compute second integral for first moment
         def int_2(f):
             return f*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
         mean, accuracy = quad(int_2, -np.inf, np.inf)
-        mean /= np.sqrt(2*np.pi/tau)
+        #mean /= np.sqrt(2*np.pi/tau)
         mean /= z
 
         #Compute integral for variance
         def int_3(f):
             return (f**2)*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
         Ef2, accuracy = quad(int_3, -np.inf, np.inf)
-        Ef2 /= np.sqrt(2*np.pi/tau)
+        #Ef2 /= np.sqrt(2*np.pi/tau)
         Ef2 /= z
         variance = Ef2 - mean**2
 
+        #Add constant to the zeroth moment
+        #NOTE: this constant is not needed in the other moments because it cancells out.
+        z /= np.sqrt(2*np.pi/tau)
+
         return z, mean, variance
 
     def _predictive_mean_analytical(self,mu,sigma):
@@ -142,7 +147,7 @@ class NoiseDistribution(object):
         """
         raise NotImplementedError
 
-    def _predictive_mean_numerical(self,mu,sigma):
+    def _predictive_mean_numerical(self,mu,variance):
         """
         Quadrature calculation of the predictive mean: E(Y_star|Y) = E( E(Y_star|f_star, Y) )
 
@@ -150,49 +155,51 @@ class NoiseDistribution(object):
         :param sigma: standard deviation of posterior
 
         """
-        #FIXME: Quadrature does not work!
-        raise NotImplementedError
-        sigma2 = sigma**2
-        #Compute first moment
-        def int_mean(f):
-            return self._mean(f)*np.exp(-(0.5/sigma2)*np.square(f - mu))
-        scaled_mean, accuracy = quad(int_mean, -np.inf, np.inf)
-        mean = scaled_mean / np.sqrt(2*np.pi*(sigma2))
+        def int_mean(f,m,v):
+            return self._mean(f)*np.exp(-(0.5/v)*np.square(f - m))
+        scaled_mean = [quad(int_mean, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
+        mean = np.array(scaled_mean)[:,None] / np.sqrt(2*np.pi*(variance))
 
         return mean
 
-    def _predictive_variance_numerical(self,mu,sigma,predictive_mean=None):
+    def _predictive_variance_numerical(self,mu,variance,predictive_mean=None):
         """
-        Laplace approximation to the predictive variance: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
+        Numerical approximation to the predictive variance: V(Y_star)
+
+        The following variance decomposition is used:
+        V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
 
         :param mu: mean of posterior
         :param sigma: standard deviation of posterior
         :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
 
         """
-        sigma2 = sigma**2
-        normalizer = np.sqrt(2*np.pi*sigma2)
+        #sigma2 = sigma**2
+        normalizer = np.sqrt(2*np.pi*variance)
 
         # E( V(Y_star|f_star) )
-        #Compute expected value of variance
-        def int_var(f):
-            return self._variance(f)*np.exp(-(0.5/sigma2)*np.square(f - mu))
-        scaled_exp_variance, accuracy = quad(int_var, -np.inf, np.inf)
-        exp_var = scaled_exp_variance / normalizer
+        def int_var(f,m,v):
+            return self._variance(f)*np.exp(-(0.5/v)*np.square(f - m))
+        scaled_exp_variance = [quad(int_var, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
+        exp_var = np.array(scaled_exp_variance)[:,None] / normalizer
 
         #V( E(Y_star|f_star) ) =  E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2
+
+        #E( E(Y_star|f_star)**2 )
         if predictive_mean is None:
-            predictive_mean = self.predictive_mean(mu,sigma)
-
+            predictive_mean = self.predictive_mean(mu,variance)
         predictive_mean_sq = predictive_mean**2
-        def int_pred_mean_sq(f):
-            return predictive_mean_sq*np.exp(-(0.5/(sigma2))*np.square(f - mu))
 
-        scaled_exp_exp2, accuracy = quad(int_pred_mean_sq, -np.inf, np.inf)
-        exp_exp2 = scaled_exp_exp2 / normalizer
+        def int_pred_mean_sq(f,m,v,predictive_mean_sq):
+            return predictive_mean_sq*np.exp(-(0.5/v)*np.square(f - m))
 
-        var_exp = exp_exp2 - predictive_mean**2
-        # V(Y_star | f_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
+        scaled_exp_exp2 = [quad(int_pred_mean_sq, -np.inf, np.inf,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
+        exp_exp2 = np.array(scaled_exp_exp2)[:,None] / normalizer
+
+        #E( E(Y_star|f_star) )**2
+        var_exp = exp_exp2 - predictive_mean_sq
+
+        # V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
         return exp_var + var_exp
 
     def pdf_link(self, link_f, y, extra_data=None):
@@ -375,8 +382,7 @@ class NoiseDistribution(object):
         assert d2logpdf_df2_dtheta.shape[1] == len(self._get_param_names())
         return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
 
-    def predictive_values(self, mu, var, full_cov=False, num_samples=30000,
-                          sampling=False):
+    def predictive_values(self, mu, var, full_cov=False, sampling=False, num_samples=10000):
         """
         Compute  mean, variance and conficence interval (percentiles 5 and 95) of the  prediction.
 
@@ -392,37 +398,33 @@ class NoiseDistribution(object):
 
         """
 
-        #Get gp_samples f* using posterior mean and variance
-        if not full_cov:
-            gp_samples = np.random.multivariate_normal(mu.flatten(), np.diag(var.flatten()),
-                                                        size=num_samples).T
+        if sampling:
+            #Get gp_samples f* using posterior mean and variance
+            if not full_cov:
+                gp_samples = np.random.multivariate_normal(mu.flatten(), np.diag(var.flatten()),
+                                                            size=num_samples).T
+            else:
+                gp_samples = np.random.multivariate_normal(mu.flatten(), var,
+                                                               size=num_samples).T
+            #Push gp samples (f*) through likelihood to give p(y*|f*)
+            samples = self.samples(gp_samples)
+            axis=-1
+
+            #Calculate mean, variance and precentiles from samples
+            print "WARNING: Using sampling to calculate mean, variance and predictive quantiles."
+            pred_mean = np.mean(samples, axis=axis)[:,None]
+            pred_var = np.var(samples, axis=axis)[:,None]
+            q1 = np.percentile(samples, 2.5, axis=axis)[:,None]
+            q3 = np.percentile(samples, 97.5, axis=axis)[:,None]
+
         else:
-            gp_samples = np.random.multivariate_normal(mu.flatten(), var,
-                                                           size=num_samples).T
 
-        #Push gp samples (f*) through likelihood to give p(y*|f*)
-        samples = self.samples(gp_samples)
-        axis=-1
+            pred_mean = self.predictive_mean(mu, var)
+            pred_var = self.predictive_variance(mu, var, pred_mean)
+            print "WARNING: Predictive quantiles are only computed when sampling."
+            q1 = np.repeat(np.nan,pred_mean.size)[:,None]
+            q3 = q1.copy()
 
-        if self.analytical_mean and not sampling:
-            pred_mean = self.predictive_mean(mu, np.sqrt(var))
-        else:
-            pred_mean = np.mean(samples, axis=axis)
-
-        if self.analytical_variance and not sampling:
-            pred_var = self.predictive_variance(mu, np.sqrt(var), pred_mean)
-        else:
-            pred_var = np.var(samples, axis=axis)
-
-        #Calculate quantiles from samples
-        q1 = np.percentile(samples, 2.5, axis=axis)
-        q3 = np.percentile(samples, 97.5, axis=axis)
-        print "WARNING: Using sampling to calculate predictive quantiles"
-
-        pred_mean = np.vstack(pred_mean)
-        pred_var = np.vstack(pred_var)
-        q1 = np.vstack(q1)
-        q3 = np.vstack(q3)
         return pred_mean, pred_var, q1, q3
 
     def samples(self, gp):

From 604e60d5cfaeaa126cb549e88e7e9685f00a1d04 Mon Sep 17 00:00:00 2001
From: Ricardo <rick70x7@gmail.com>
Date: Mon, 11 Nov 2013 08:39:58 +0000
Subject: [PATCH 171/252] Bug fixed in numerical approx. to the predictive
 variance.

---
 .../noise_models/gp_transformations.py        |  2 ++
 .../noise_models/noise_distributions.py       | 21 ++++++++-----------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/GPy/likelihoods/noise_models/gp_transformations.py b/GPy/likelihoods/noise_models/gp_transformations.py
index 65730418..5155a69d 100644
--- a/GPy/likelihoods/noise_models/gp_transformations.py
+++ b/GPy/likelihoods/noise_models/gp_transformations.py
@@ -78,9 +78,11 @@ class Probit(GPTransformation):
         return std_norm_pdf(f)
 
     def d2transf_df2(self,f):
+        #FIXME
         return -f * std_norm_pdf(f)
 
     def d3transf_df3(self,f):
+        #FIXME
         f2 = f**2
         return -(1/(np.sqrt(2*np.pi)))*np.exp(-0.5*(f2))*(1-f2)
 
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 79d9ffeb..8ee7a2cd 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -99,31 +99,29 @@ class NoiseDistribution(object):
         :param tau: cavity distribution 1st natural parameter (precision)
         :param v: cavity distribution 2nd natural paramenter (mu*precision)
         """
-        #Compute first integral for zeroth moment
+        #Compute first integral for zeroth moment.
+        #NOTE constant np.sqrt(2*pi/tau) added at the end of the function
         mu = v/tau
         def int_1(f):
             return self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
-        z, accuracy = quad(int_1, -np.inf, np.inf)
-        #z /= np.sqrt(2*np.pi/tau)
+        z_scaled, accuracy = quad(int_1, -np.inf, np.inf)
 
         #Compute second integral for first moment
         def int_2(f):
             return f*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
         mean, accuracy = quad(int_2, -np.inf, np.inf)
-        #mean /= np.sqrt(2*np.pi/tau)
-        mean /= z
+        mean /= z_scaled
 
         #Compute integral for variance
         def int_3(f):
             return (f**2)*self.pdf(f, obs)*np.exp(-0.5*tau*np.square(mu-f))
         Ef2, accuracy = quad(int_3, -np.inf, np.inf)
-        #Ef2 /= np.sqrt(2*np.pi/tau)
-        Ef2 /= z
+        Ef2 /= z_scaled
         variance = Ef2 - mean**2
 
         #Add constant to the zeroth moment
         #NOTE: this constant is not needed in the other moments because it cancells out.
-        z /= np.sqrt(2*np.pi/tau)
+        z = z_scaled/np.sqrt(2*np.pi/tau)
 
         return z, mean, variance
 
@@ -185,18 +183,17 @@ class NoiseDistribution(object):
 
         #V( E(Y_star|f_star) ) =  E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2
 
-        #E( E(Y_star|f_star)**2 )
+        #E( E(Y_star|f_star) )**2
         if predictive_mean is None:
             predictive_mean = self.predictive_mean(mu,variance)
         predictive_mean_sq = predictive_mean**2
 
+        #E( E(Y_star|f_star)**2 )
         def int_pred_mean_sq(f,m,v,predictive_mean_sq):
-            return predictive_mean_sq*np.exp(-(0.5/v)*np.square(f - m))
-
+            return self._mean(f)**2*np.exp(-(0.5/v)*np.square(f - m))
         scaled_exp_exp2 = [quad(int_pred_mean_sq, -np.inf, np.inf,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
         exp_exp2 = np.array(scaled_exp_exp2)[:,None] / normalizer
 
-        #E( E(Y_star|f_star) )**2
         var_exp = exp_exp2 - predictive_mean_sq
 
         # V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )

From d7a4e34b3d6f0ea5590b57a4960b33971d678f62 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Mon, 11 Nov 2013 09:26:22 +0000
Subject: [PATCH 172/252] fixed product kern get and set state

---
 GPy/kern/parts/prod.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/GPy/kern/parts/prod.py b/GPy/kern/parts/prod.py
index e386a292..7441ae9f 100644
--- a/GPy/kern/parts/prod.py
+++ b/GPy/kern/parts/prod.py
@@ -19,7 +19,10 @@ class Prod(Kernpart):
     """
     def __init__(self,k1,k2,tensor=False):
         self.num_params = k1.num_params + k2.num_params
-        self.name = '['+k1.name + '**' + k2.name +']'
+        if tensor:
+            self.name = '['+k1.name + '**' + k2.name +']'
+        else:
+            self.name = '['+k1.name + '*' + k2.name +']'
         self.k1 = k1
         self.k2 = k2
         if tensor:
@@ -130,13 +133,12 @@ class Prod(Kernpart):
                 self.k1.K(X[:,self.slice1],X2[:,self.slice1],self._K1)
                 self.k2.K(X[:,self.slice2],X2[:,self.slice2],self._K2)
 
-    def getstate(self):
-        return [self._get_params(), self.k1, self.k2, self.slice1, self.slice2, self.name]
+    def __getstate__(self):
+        return [self.k1, self.k2, self.slice1, self.slice2, self.name, self.input_dim, self.num_params]
 
-    def setstate(self, state):
-        params, self.k1, self.k2, self.slice1, self.slice2, self.name = state
+    def __setstate__(self, state):
+        self.k1, self.k2, self.slice1, self.slice2, self.name, self.input_dim, self.num_params = state
         self._X, self._X2, self._params = np.empty(shape=(3,1))
-        self._set_params(params)
 
 
 

From 4be40da23a3086b004de75da1652c4f633bb715c Mon Sep 17 00:00:00 2001
From: Ricardo <acq11ra@sheffield.ac.uk>
Date: Mon, 11 Nov 2013 14:23:10 +0000
Subject: [PATCH 173/252] Changes in plot function: sampling vs numerical
 approximation

---
 GPy/core/gp_base.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index b6e4ebc0..cb968520 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -173,7 +173,8 @@ class GPBase(Model):
                 upper = m + 2*np.sqrt(v)
                 Y = self.likelihood.Y
             else:
-                m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts)
+                m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts,sampling=False) #Compute the exact mean
+                m_, v_, lower, upper = self.predict(Xgrid, which_parts=which_parts,sampling=True,num_samples=15000) #Apporximate the percentiles
                 Y = self.likelihood.data
             for d in which_data_ycols:
                 gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
@@ -210,7 +211,7 @@ class GPBase(Model):
                 m, _ = self._raw_predict(Xgrid, which_parts=which_parts)
                 Y = self.likelihood.Y
             else:
-                m, _, _, _ = self.predict(Xgrid, which_parts=which_parts,num_samples=100) #FIXME we need a balance between accuracy and speed to define num_samples
+                m, _, _, _ = self.predict(Xgrid, which_parts=which_parts,sampling=False)
                 Y = self.likelihood.data
             for d in which_data_ycols:
                 m_d = m[:,d].reshape(resolution, resolution).T

From 7184cee6afb4a8d1a1909e45f7814348b024e4d2 Mon Sep 17 00:00:00 2001
From: Ricardo <acq11ra@sheffield.ac.uk>
Date: Mon, 11 Nov 2013 14:23:55 +0000
Subject: [PATCH 174/252] Added **likelihood_params to predictive_values

---
 GPy/likelihoods/gaussian.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index 85c028b4..c12d8e6d 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -69,7 +69,7 @@ class Gaussian(likelihood):
             self.covariance_matrix = np.eye(self.N) * x
             self._variance = x
 
-    def predictive_values(self, mu, var, full_cov):
+    def predictive_values(self, mu, var, full_cov, **likelihood_args):
         """
         Un-normalize the prediction and add the likelihood variance, then return the 5%, 95% interval
         """

From e7c7ae8ff41af329c1fc5dc76d98bf4f4e7fb6d9 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 12 Nov 2013 12:06:38 +0000
Subject: [PATCH 175/252]  adding docstring for symmetric kern

---
 GPy/kern/constructors.py | 12 ++++++++++++
 GPy/kern/parts/prod.py   | 10 +++++-----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 392f43ba..b60c7479 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -450,6 +450,18 @@ def prod(k1,k2,tensor=False):
 def symmetric(k):
     """
     Construct a symmetric kernel from an existing kernel
+
+    The symmetric kernel works by adding two GP functions together, and computing the overall covariance.
+
+    Let f ~ GP(x | 0, k(x, x')). Now let g = f(x) + f(-x).
+
+    It's easy to see that g is a symmetric function: g(x) = g(-x).
+
+    by construction, g, is a gaussian Process with mean 0 and covariance
+
+    k(x, x') + k(-x, x') + k(x, -x') + k(-x, -x')
+
+    This constructor builds a covariance function of this form from the initial kernel
     """
     k_ = k.copy()
     k_.parts = [symmetric.Symmetric(p) for p in k.parts]
diff --git a/GPy/kern/parts/prod.py b/GPy/kern/parts/prod.py
index 7441ae9f..f517262c 100644
--- a/GPy/kern/parts/prod.py
+++ b/GPy/kern/parts/prod.py
@@ -133,12 +133,12 @@ class Prod(Kernpart):
                 self.k1.K(X[:,self.slice1],X2[:,self.slice1],self._K1)
                 self.k2.K(X[:,self.slice2],X2[:,self.slice2],self._K2)
 
-    def __getstate__(self):
-        return [self.k1, self.k2, self.slice1, self.slice2, self.name, self.input_dim, self.num_params]
+    #def __getstate__(self):
+        #return [self.k1, self.k2, self.slice1, self.slice2, self.name, self.input_dim, self.num_params]
 
-    def __setstate__(self, state):
-        self.k1, self.k2, self.slice1, self.slice2, self.name, self.input_dim, self.num_params = state
-        self._X, self._X2, self._params = np.empty(shape=(3,1))
+    #def __setstate__(self, state):
+        #self.k1, self.k2, self.slice1, self.slice2, self.name, self.input_dim, self.num_params = state
+        #self._X, self._X2, self._params = np.empty(shape=(3,1))
 
 
 

From 5fd031fd6351c7c202fa36a500d5129505f722e2 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 12 Nov 2013 12:17:55 +0000
Subject: [PATCH 176/252] added block matrix utility

---
 GPy/util/block_matrices.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 GPy/util/block_matrices.py

diff --git a/GPy/util/block_matrices.py b/GPy/util/block_matrices.py
new file mode 100644
index 00000000..8fd5f89d
--- /dev/null
+++ b/GPy/util/block_matrices.py
@@ -0,0 +1,24 @@
+import numpy as np
+
+def get_blocks(A, blocksizes):
+    assert (A.shape[0]==A.shape[1]) and len(A.shape)==2, "can;t blockify this non-square matrix"
+    N = np.sum(blocksizes)
+    assert A.shape[0] == N, "bad blocksizes"
+    num_blocks = len(blocksizes)
+    B = np.empty(shape=(num_blocks, num_blocks), dtype=np.object)
+    count_i = 0
+    for Bi, i in enumerate(blocksizes):
+        count_j = 0
+        for Bj, j in enumerate(blocksizes):
+            B[Bi, Bj] = A[count_i:count_i + i, count_j : count_j + j]
+            count_j += j
+        count_i += i
+    return B
+
+
+
+if __name__=='__main__':
+    A = np.zeros((5,5))
+    B = get_blocks(A,[2,3])
+    B[0,0] += 7
+    print B

From 73006c6eda072b2472de9ccdc8b1f3d5b639398e Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 12 Nov 2013 14:45:08 +0000
Subject: [PATCH 177/252] fixed up symmetric kern

---
 GPy/kern/constructors.py    | 2 +-
 GPy/kern/parts/symmetric.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index b60c7479..44f4ae3f 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -464,7 +464,7 @@ def symmetric(k):
     This constructor builds a covariance function of this form from the initial kernel
     """
     k_ = k.copy()
-    k_.parts = [symmetric.Symmetric(p) for p in k.parts]
+    k_.parts = [parts.symmetric.Symmetric(p) for p in k.parts]
     return k_
 
 def coregionalize(output_dim,rank=1, W=None, kappa=None):
diff --git a/GPy/kern/parts/symmetric.py b/GPy/kern/parts/symmetric.py
index bbdd5ac0..d836763d 100644
--- a/GPy/kern/parts/symmetric.py
+++ b/GPy/kern/parts/symmetric.py
@@ -56,7 +56,7 @@ class Symmetric(Kernpart):
         AX = np.dot(X,self.transform)
         if X2 is None:
             X2 = X
-            ZX2 = AX
+            AX2 = AX
         else:
             AX2 = np.dot(X2, self.transform)
         self.k.dK_dtheta(dL_dK,X,X2,target)

From df118a404df4ce8c3997b08bfe968e6fd922b3da Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 13 Nov 2013 11:41:58 +0000
Subject: [PATCH 178/252] changed how we search for config files on windows

---
 GPy/util/config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPy/util/config.py b/GPy/util/config.py
index d2ed7543..6fd4d005 100644
--- a/GPy/util/config.py
+++ b/GPy/util/config.py
@@ -5,7 +5,8 @@ import ConfigParser
 import os
 config = ConfigParser.ConfigParser()
 
-user_file = os.path.join(os.getenv('HOME'),'.gpy_config.cfg')
+home = os.getenv('HOME') or os.getenv('USERPROFILE')
+user_file = os.path.join(home,'.gpy_config.cfg')
 default_file = os.path.join('..','gpy_config.cfg')
 
 # 1. check if the user has a ~/.gpy_config.cfg

From 68709cfa77f3c90dc17b9ddf5de555b0d34889fd Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 13 Nov 2013 12:46:02 +0000
Subject: [PATCH 179/252] more fiddling with the windows path for config.

Where is the windows guru? out playing beach volley?
---
 GPy/util/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/util/config.py b/GPy/util/config.py
index 6fd4d005..960d6690 100644
--- a/GPy/util/config.py
+++ b/GPy/util/config.py
@@ -7,7 +7,7 @@ config = ConfigParser.ConfigParser()
 
 home = os.getenv('HOME') or os.getenv('USERPROFILE')
 user_file = os.path.join(home,'.gpy_config.cfg')
-default_file = os.path.join('..','gpy_config.cfg')
+default_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'gpy_config.cfg'))
 
 # 1. check if the user has a ~/.gpy_config.cfg
 if os.path.isfile(user_file):

From 0fa287c044af8c2bbbe9118cef66e18bf5343d64 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 13 Nov 2013 12:46:59 +0000
Subject: [PATCH 180/252] allowing the passing of 1D X to a GP. with warning of
 course

---
 GPy/core/gp_base.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index cb968520..548e2924 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -14,8 +14,11 @@ class GPBase(Model):
     Here we define some functions that are use
     """
     def __init__(self, X, likelihood, kernel, normalize_X=False):
+        if len(X.shape)==1:
+            X = X.reshape(-1,1)
+            warning.warn("One dimension output (N,) being reshaped to (N,1)")
         self.X = X
-        assert len(self.X.shape) == 2
+        assert len(self.X.shape) == 2, "too many dimensions for X input"
         self.num_data, self.input_dim = self.X.shape
         assert isinstance(kernel, kern.kern)
         self.kern = kernel

From 280f6560513d03f29ff6ee76adadc03a9f9895f5 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 13 Nov 2013 13:34:10 +0000
Subject: [PATCH 181/252] debugging the config paths

---
 GPy/util/config.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/GPy/util/config.py b/GPy/util/config.py
index 960d6690..cd29a8af 100644
--- a/GPy/util/config.py
+++ b/GPy/util/config.py
@@ -8,11 +8,12 @@ config = ConfigParser.ConfigParser()
 home = os.getenv('HOME') or os.getenv('USERPROFILE')
 user_file = os.path.join(home,'.gpy_config.cfg')
 default_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'gpy_config.cfg'))
+print user_file, os.path.isfile(user_file)
+print default_file, os.path.isfile(default_file)
 
 # 1. check if the user has a ~/.gpy_config.cfg
 if os.path.isfile(user_file):
     config.read(user_file)
 else:
     # 2. if not, use the default one
-    path = os.path.dirname(__file__)
-    config.read(os.path.join(path,default_file))
+    config.read(default_file)

From df97f7814efb0589868cfe9e6ef4026414fb5a83 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 13 Nov 2013 13:40:44 +0000
Subject: [PATCH 182/252] better handling of missing config files

---
 GPy/util/config.py | 5 ++++-
 MANIFEST.in        | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/GPy/util/config.py b/GPy/util/config.py
index cd29a8af..02796e0b 100644
--- a/GPy/util/config.py
+++ b/GPy/util/config.py
@@ -14,6 +14,9 @@ print default_file, os.path.isfile(default_file)
 # 1. check if the user has a ~/.gpy_config.cfg
 if os.path.isfile(user_file):
     config.read(user_file)
-else:
+elif os.path.isfile(default_file):
     # 2. if not, use the default one
     config.read(default_file)
+else:
+    #3. panic
+    raise ValueError, "no configuration file found"
diff --git a/MANIFEST.in b/MANIFEST.in
index c89284cd..8d5b2304 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,3 +2,5 @@ include *.txt
 recursive-include doc *.txt
 include *.md
 recursive-include doc *.md
+include *.cfg
+recursive-include doc *.cfg

From a5c7795487082179b5d38498d0de9249ed4a8163 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 13 Nov 2013 14:10:32 +0000
Subject: [PATCH 183/252] Added cfg file to manfiest and package_data

---
 MANIFEST.in | 2 ++
 setup.py    | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index c89284cd..8d5b2304 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,3 +2,5 @@ include *.txt
 recursive-include doc *.txt
 include *.md
 recursive-include doc *.md
+include *.cfg
+recursive-include doc *.cfg
diff --git a/setup.py b/setup.py
index 9ccf3990..27ebf975 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ setup(name = 'GPy',
       url = "http://sheffieldml.github.com/GPy/",
       packages = ['GPy', 'GPy.core', 'GPy.kern', 'GPy.util', 'GPy.models', 'GPy.inference', 'GPy.examples', 'GPy.likelihoods', 'GPy.testing', 'GPy.util.latent_space_visualizations', 'GPy.util.latent_space_visualizations.controllers', 'GPy.likelihoods.noise_models', 'GPy.kern.parts', 'GPy.mappings'],
       package_dir={'GPy': 'GPy'},
-      package_data = {'GPy': ['GPy/examples']},
+      package_data = {'GPy': ['GPy/examples', 'gpy_config.cfg']},
       py_modules = ['GPy.__init__'],
       long_description=read('README.md'),
       install_requires=['numpy>=1.6', 'scipy>=0.9','matplotlib>=1.1', 'nose'],

From e79794f6ef7f337dc3a500f13fe864b79293e8c5 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Thu, 14 Nov 2013 08:47:16 +0000
Subject: [PATCH 184/252] Part implementation of ode_eq functionality. Not yet
 numerically stable or efficient (some horrible use of cut and paste to get
 things working ...)

---
 GPy/kern/parts/sympy_helpers.cpp | 106 +++++++++++++++-
 GPy/kern/parts/sympy_helpers.h   |   7 ++
 GPy/util/symbolic.py             | 203 ++++++++++++++++++++++++++++---
 3 files changed, 299 insertions(+), 17 deletions(-)

diff --git a/GPy/kern/parts/sympy_helpers.cpp b/GPy/kern/parts/sympy_helpers.cpp
index e4df4d80..d21d2683 100644
--- a/GPy/kern/parts/sympy_helpers.cpp
+++ b/GPy/kern/parts/sympy_helpers.cpp
@@ -1,7 +1,8 @@
 #include <math.h>
 #include <float.h>
 #include <stdlib.h>
-
+#include <iostream>
+#include <stdexcept>
 double DiracDelta(double x){
   // TODO: this doesn't seem to be a dirac delta ... should return infinity. Neil
     if((x<0.000001) & (x>-0.000001))//go on, laugh at my c++ skills
@@ -14,6 +15,7 @@ double DiracDelta(double x,int foo){
 };
 
 double sinc(double x){
+  // compute the sinc function
   if (x==0)
     return 1.0;
   else 
@@ -21,6 +23,7 @@ double sinc(double x){
 }
 
 double sinc_grad(double x){
+  // compute the gradient of the sinc function.
   if (x==0)
     return 0.0;
   else 
@@ -28,6 +31,7 @@ double sinc_grad(double x){
 }
 
 double erfcx(double x){
+  // compute the scaled complex error function.
   double xneg=-sqrt(log(DBL_MAX/2));
   double xmax = 1/(sqrt(M_PI)*DBL_MIN);
   xmax = DBL_MAX<xmax ? DBL_MAX : xmax;
@@ -50,12 +54,108 @@ double erfcx(double x){
 }
 
 double ln_diff_erf(double x0, double x1){
+  // stably compute the log of difference between two erfs.
+  if (x1>x0)
+    throw std::runtime_error("Error: second argument must be smaller than first in ln_diff_err");
+  return log(erf(x0) - erf(x1));
   if (x0==x1)
-    return INFINITY;
+    return -INFINITY;
   else if(x0<0 && x1>0 || x0>0 && x1<0)
     return log(erf(x0)-erf(x1));
   else if(x1>0)
-    return log(erfcx(x1)-erfcx(x0)*exp(x1*x1)- x0*x0)-x1*x1;
+    return log(erfcx(x1)-erfcx(x0)*exp(x1*x1- x0*x0))-x1*x1;
   else 
     return log(erfcx(-x0)-erfcx(-x1)*exp(x0*x0 - x1*x1))-x0*x0;
 }
+
+double h(double t, double tprime, double d_i, double d_j, double l){
+  // Compute the h function for the sim covariance.
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - (t-tprime)/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  double sign_val = 1.0;
+  if(t/l==0)
+    sign_val = 0.0;
+  else if (t/l < 0)
+    sign_val = -1.0;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
+  
+  return sign_val*exp(half_l_di*half_l_di - d_i*(t-tprime) + ln_part_1 - log(d_i + d_j)) - sign_val*exp(half_l_di*half_l_di - d_i*t - d_j*tprime + ln_part_2 - log(d_i + d_j));
+}
+
+double dh_dl(double t, double tprime, double d_i, double d_j, double l){
+  // compute gradient of h function with respect to lengthscale for sim covariance
+  // TODO a lot of energy wasted recomputing things here, need to do this in a shared way somehow ... perhaps needs rewrite of sympykern.
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - (t-tprime)/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
+  double diff_t = t - tprime;
+  double l2 = l*l;
+  double hv = h(t, tprime, d_i, d_j, l);
+  return 0.5*d_i*d_i*l*hv + 2/(sqrt(M_PI)*(d_i+d_j))*((-diff_t/l2-d_i/2)*exp(-diff_t*diff_t/l2)+(-tprime/l2+d_i/2)*exp(-tprime*tprime/l2-d_i*t)-(-t/l2-d_i/2)*exp(-t*t/l2-d_j*tprime)-d_i/2*exp(-(d_i*t+d_j*tprime)));
+}
+
+double dh_dd_i(double t, double tprime, double d_i, double d_j, double l){
+  double diff_t = (t-tprime);
+  double l2 = l*l;
+  double hv = h(t, tprime, d_i, d_j, l);
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - (t-tprime)/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_1 = half_l_di;
+  arg_2 = half_l_di - t/l;
+  double sign_val = 1.0;
+  if(t/l==0)
+    sign_val = 0.0;
+  else if (t/l < 0)
+    sign_val = -1.0;
+  double ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l);
+
+  double base = ((0.5*d_i*l2*(d_i+d_j)-1)*hv 
+		 + (-diff_t*sign_val*exp(half_l_di*half_l_di
+					 -d_i*diff_t
+					 +ln_part_1)
+		    +t*sign_val*exp(half_l_di*half_l_di
+				    -d_i*t-d_j*tprime
+				    +ln_part_2))
+		 + l/sqrt(M_PI)*(-exp(-diff_t*diff_t/l2)
+			       +exp(-tprime*tprime/l2-d_i*t)
+			       +exp(-t*t/l2-d_j*tprime)
+			       -exp(-(d_i*t + d_j*tprime))));
+  return base/(d_i+d_j);
+}
+
+double dh_dd_j(double t, double tprime, double d_i, double d_j, double l){
+  double diff_t = (t-tprime);
+  double l2 = l*l;
+  double half_l_di = 0.5*l*d_i;
+  double hv = h(t, tprime, d_i, d_j, l);
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - (t-tprime)/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_1 = half_l_di;
+  arg_2 = half_l_di - t/l;
+  double sign_val = 1.0;
+  if(t/l==0)
+    sign_val = 0.0;
+  else if (t/l < 0)
+    sign_val = -1.0;
+  double ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l);
+  double base = tprime*sign_val*exp(half_l_di*half_l_di-(d_i*t+d_j*tprime)+ln_part_2)-hv;
+  return base/(d_i+d_j);
+}
+
+
+double dh_dt(double t, double tprime, double d_i, double d_j, double l){
+  return 0.0;
+}
+
+double dh_dtprime(double t, double tprime, double d_i, double d_j, double l){
+  return 0.0;
+}
diff --git a/GPy/kern/parts/sympy_helpers.h b/GPy/kern/parts/sympy_helpers.h
index 56220167..5e58d5d2 100644
--- a/GPy/kern/parts/sympy_helpers.h
+++ b/GPy/kern/parts/sympy_helpers.h
@@ -7,3 +7,10 @@ double sinc_grad(double x);
 
 double erfcx(double x);
 double ln_diff_erf(double x0, double x1);
+
+double h(double t, double tprime, double d_i, double d_j, double l);
+double dh_dl(double t, double tprime, double d_i, double d_j, double l);
+double dh_dd_i(double t, double tprime, double d_i, double d_j, double l);
+double dh_dd_j(double t, double tprime, double d_i, double d_j, double l);
+double dh_dt(double t, double tprime, double d_i, double d_j, double l);
+double dh_dtprime(double t, double tprime, double d_i, double d_j, double l);
diff --git a/GPy/util/symbolic.py b/GPy/util/symbolic.py
index 0b5ca381..d546f940 100644
--- a/GPy/util/symbolic.py
+++ b/GPy/util/symbolic.py
@@ -1,4 +1,4 @@
-from sympy import Function, S, oo, I, cos, sin, asin, log, erf,pi,exp
+from sympy import Function, S, oo, I, cos, sin, asin, log, erf,pi,exp,sqrt,sign
 
 
 class ln_diff_erf(Function):
@@ -19,15 +19,84 @@ class ln_diff_erf(Function):
         if x0.is_Number and x1.is_Number:            
             return log(erf(x0)-erf(x1))
 
-class sim_h(Function):
+class dh_dd_i(Function):
+    nargs = 5
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+
+            diff_t = (t-tprime)
+            l2 = l*l
+            h = h(t, tprime, d_i, d_j, l)
+            half_l_di = 0.5*l*d_i
+            arg_1 = half_l_di + tprime/l
+            arg_2 = half_l_di - (t-tprime)/l
+            ln_part_1 = ln_diff_erf(arg_1, arg_2)
+            arg_1 = half_l_di 
+            arg_2 = half_l_di - t/l
+            sign_val = sign(t/l)
+            ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+
+            base = ((0.5*d_i*l2*(d_i+d_j)-1)*h 
+                    + (-diff_t*sign_val*exp(half_l_di*half_l_di
+                                          -d_i*diff_t
+                                          +ln_part_1)
+                       +t*sign_val*exp(half_l_di*half_l_di
+                                          -d_i*t-d_j*tprime
+                                          +ln_part_2))
+                    + l/sqrt(pi)*(-exp(-diff_t*diff_t/l2)
+                                     +exp(-tprime*tprime/l2-d_i*t)
+                                     +exp(-t*t/l2-d_j*tprime)
+                                     -exp(-(d_i*t + d_j*tprime))))
+            return base/(d_i+d_j)
+
+class dh_dd_j(Function):
+    nargs = 5
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+            diff_t = (t-tprime)
+            l2 = l*l
+            half_l_di = 0.5*l*d_i
+            h = h(t, tprime, d_i, d_j, l)
+            arg_1 = half_l_di + tprime/l
+            arg_2 = half_l_di - (t-tprime)/l
+            ln_part_1 = ln_diff_erf(arg_1, arg_2)
+            arg_1 = half_l_di 
+            arg_2 = half_l_di - t/l
+            sign_val = sign(t/l)
+            ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+            sign_val = sign(t/l)
+            base = tprime*sign_val*exp(half_l_di*half_l_di-(d_i*t+d_j*tprime)+ln_part_2)-h
+            return base/(d_i+d_j)
+    
+class dh_dl(Function):
+    nargs = 5
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+
+            diff_t = (t-tprime)
+            l2 = l*l
+            h = h(t, tprime, d_i, d_j, l)
+            return 0.5*d_i*d_i*l*h + 2./(sqrt(pi)*(d_i+d_j))*((-diff_t/l2-d_i/2.)*exp(-diff_t*diff_t/l2)+(-tprime/l2+d_i/2.)*exp(-tprime*tprime/l2-d_i*t)-(-t/l2-d_i/2.)*exp(-t*t/l2-d_j*tprime)-d_i/2.*exp(-(d_i*t+d_j*tprime)))
+
+class dh_dt(Function):
     nargs = 5
-
-    def fdiff(self, argindex=1):
-        pass
-    
     @classmethod
     def eval(cls, t, tprime, d_i, d_j, l):
-        # putting in the is_Number stuff forces it to look for a fdiff method for derivative.
         if (t.is_Number
             and tprime.is_Number
             and d_i.is_Number
@@ -40,13 +109,119 @@ class sim_h(Function):
                 or l is S.NaN):
                 return S.NaN
             else:
-                return (exp((d_j/2*l)**2)/(d_i+d_j)
-                        *(exp(-d_j*(tprime - t))
-                          *(erf((tprime-t)/l - d_j/2*l)
-                            + erf(t/l + d_j/2*l))
-                          - exp(-(d_j*tprime + d_i))
-                          *(erf(tprime/l - d_j/2*l)
-                            + erf(d_j/2*l))))
+                half_l_di = 0.5*l*d_i
+                arg_1 = half_l_di + tprime/l
+                arg_2 = half_l_di - (t-tprime)/l
+                ln_part_1 = ln_diff_erf(arg_1, arg_2)
+                arg_1 = half_l_di 
+                arg_2 = half_l_di - t/l
+                sign_val = sign(t/l)
+                ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+
+                
+                return (sign_val*exp(half_l_di*half_l_di
+                                        - d_i*(t-tprime)
+                                        + ln_part_1
+                                        - log(d_i + d_j))
+                        - sign_val*exp(half_l_di*half_l_di
+                                          - d_i*t - d_j*tprime
+                                          + ln_part_2
+                                          - log(d_i + d_j))).diff(t)
+
+class dh_dtprime(Function):
+    nargs = 5
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+            if (t is S.NaN
+                or tprime is S.NaN
+                or d_i is S.NaN
+                or d_j is S.NaN
+                or l is S.NaN):
+                return S.NaN
+            else:
+                half_l_di = 0.5*l*d_i
+                arg_1 = half_l_di + tprime/l
+                arg_2 = half_l_di - (t-tprime)/l
+                ln_part_1 = ln_diff_erf(arg_1, arg_2)
+                arg_1 = half_l_di 
+                arg_2 = half_l_di - t/l
+                sign_val = sign(t/l)
+                ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+
+                
+                return (sign_val*exp(half_l_di*half_l_di
+                                        - d_i*(t-tprime)
+                                        + ln_part_1
+                                        - log(d_i + d_j))
+                        - sign_val*exp(half_l_di*half_l_di
+                                          - d_i*t - d_j*tprime
+                                          + ln_part_2
+                                          - log(d_i + d_j))).diff(tprime)
+
+
+class h(Function):
+    nargs = 5
+    def fdiff(self, argindex=5):
+        t, tprime, d_i, d_j, l = self.args
+        if argindex == 1:
+            return dh_dt(t, tprime, d_i, d_j, l)
+        elif argindex == 2:
+            return dh_dtprime(t, tprime, d_i, d_j, l)
+        elif argindex == 3:
+            return dh_dd_i(t, tprime, d_i, d_j, l)
+        elif argindex == 4:
+            return dh_dd_j(t, tprime, d_i, d_j, l)
+        elif argindex == 5:
+            return dh_dl(t, tprime, d_i, d_j, l)
+                                                                
+    
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        # putting in the is_Number stuff forces it to look for a fdiff method for derivative. If it's left out, then when asking for self.diff, it just does the diff on the eval symbolic terms directly. We want to avoid that because we are looking to ensure everything is numerically stable. Maybe it's because of the if statement that this happens? 
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+            if (t is S.NaN
+                or tprime is S.NaN
+                or d_i is S.NaN
+                or d_j is S.NaN
+                or l is S.NaN):
+                return S.NaN
+            else:
+                half_l_di = 0.5*l*d_i
+                arg_1 = half_l_di + tprime/l
+                arg_2 = half_l_di - (t-tprime)/l
+                ln_part_1 = ln_diff_erf(arg_1, arg_2)
+                arg_1 = half_l_di 
+                arg_2 = half_l_di - t/l
+                sign_val = sign(t/l)
+                ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+
+                
+                return (sign_val*exp(half_l_di*half_l_di
+                                        - d_i*(t-tprime)
+                                        + ln_part_1
+                                        - log(d_i + d_j))
+                        - sign_val*exp(half_l_di*half_l_di
+                                          - d_i*t - d_j*tprime
+                                          + ln_part_2
+                                          - log(d_i + d_j)))
+            
+                                  
+                # return (exp((d_j/2.*l)**2)/(d_i+d_j)
+                #         *(exp(-d_j*(tprime - t))
+                #           *(erf((tprime-t)/l - d_j/2.*l)
+                #             + erf(t/l + d_j/2.*l))
+                #           - exp(-(d_j*tprime + d_i))
+                #           *(erf(tprime/l - d_j/2.*l)
+                #             + erf(d_j/2.*l))))
 
 class erfc(Function):
     nargs = 1

From c12ca4c53d625d9ccf5da0ab749e53f145e60ab6 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Thu, 14 Nov 2013 08:54:05 +0000
Subject: [PATCH 185/252] a trial namespace renaming

---
 GPy/models/__init__.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index 10ce577b..a8be5890 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -1,18 +1,19 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-from gp_regression import GPRegression
-from gp_classification import GPClassification
-from sparse_gp_regression import SparseGPRegression
-from svigp_regression import SVIGPRegression
-from sparse_gp_classification import SparseGPClassification
-from fitc_classification import FITCClassification
-from gplvm import GPLVM
-from bcgplvm import BCGPLVM
-from sparse_gplvm import SparseGPLVM
-from warped_gp import WarpedGP
-from bayesian_gplvm import BayesianGPLVM
-from mrd import MRD
-from gradient_checker import GradientChecker
-from gp_multioutput_regression import GPMultioutputRegression
-from sparse_gp_multioutput_regression import SparseGPMultioutputRegression
+from gp_regression import GPRegression; _gp_regression = gp_regression ; del gp_regression 
+from gp_classification import GPClassification; _gp_classification = gp_classification ; del gp_classification 
+from sparse_gp_regression import SparseGPRegression; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression 
+from svigp_regression import SVIGPRegression; _svigp_regression = svigp_regression ; del svigp_regression 
+from sparse_gp_classification import SparseGPClassification; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification 
+from fitc_classification import FITCClassification; _fitc_classification = fitc_classification ; del fitc_classification 
+from gplvm import GPLVM; _gplvm = gplvm ; del gplvm 
+from bcgplvm import BCGPLVM; _bcgplvm = bcgplvm; del bcgplvm
+from sparse_gplvm import SparseGPLVM; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm 
+from warped_gp import WarpedGP; _warped_gp = warped_gp ; del warped_gp 
+from bayesian_gplvm import BayesianGPLVM; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm 
+from mrd import MRD; _mrd = mrd ; del mrd 
+from gradient_checker import GradientChecker; _gradient_checker = gradient_checker ; del gradient_checker 
+from gp_multioutput_regression import GPMultioutputRegression; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression 
+from sparse_gp_multioutput_regression import SparseGPMultioutputRegression; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression 
+

From d95137e0497cae4b5ba7deed862cbf686bb0f837 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 14 Nov 2013 09:01:05 +0000
Subject: [PATCH 186/252] half way through crossterm objective

---
 GPy/kern/kern.py                          | 17 ++++++++++++-----
 GPy/kern/parts/rbf.py                     | 10 ++++++++++
 GPy/testing/psi_stat_expectation_tests.py | 15 +++++++++------
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 619d1687..d686064a 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -462,10 +462,8 @@ class kern(Parameterized):
                 pass
             # rbf X bias
             elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
-                target += 2 * p1.variance * (p2._psi1[:, :, None] + p2._psi1[:, None, :])
+                target += p1.variance * (p2._psi1[:, :, None] + p2._psi1[:, None, :])
             elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
-                tmp1 = p2.variance * (p1._psi1[:, :, None] + p1._psi1[:, None, :])
-                renorm = p1.variance*np.exp()
                 target += p2.variance * (p1._psi1[:, :, None] + p1._psi1[:, None, :])
             # linear X bias
             elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, Linear):
@@ -478,12 +476,21 @@ class kern(Parameterized):
                 target += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
             # rbf X any
             elif isinstance(p1, (RBF, RBFInv)):
-                pass
+                psi11 = np.zeros((mu.shape[0], Z.shape[0]))
+                psi12 = np.zeros((mu.shape[0], Z.shape[0]))
+                p1.psi1(Z, mu, S, psi11)
+                p2.psi1(Z, mu, S, psi12)
+                
+                crossterms  = psi11[:, :, None] + psi12[:, None, :]
+                crossterms += psi12[:, :, None] + psi11[:, None, :]
+                
+                target += p1._crossterm_product_expectation(p2, Z, mu, S)
+                #import ipdb;ipdb.set_trace()
             elif isinstance(p2, (RBF, RBFInv)):
                 raise NotImplementedError # TODO
             else:
                 raise NotImplementedError, "psi2 cannot be computed for this kernel"
-        return target
+        return target        
 
     def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S):
         target = np.zeros(self.num_params)
diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py
index 585d687f..56a6b0eb 100644
--- a/GPy/kern/parts/rbf.py
+++ b/GPy/kern/parts/rbf.py
@@ -208,6 +208,16 @@ class RBF(Kernpart):
         self._psi_computations(Z, mu, S)
         target += self._psi2
 
+    def _crossterm_product_expectation(self, K, Z, mu, S):
+        # compute the crossterm expectation for K as the other kernel:
+        import ipdb;ipdb.set_trace()
+        Sigma = 1./self.lengthscale[None,:] + 1./S # is independent across M, 
+        M = (Z[None,:,:]/self.lengthscale[None,None,:] + (mu/S)[:,None,:]) / Sigma[:,None,:]
+        psi1_other = K.psi1()
+        self.variance
+        # return is [N x M x M]
+        return 
+
     def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
         """Shape N,num_inducing,num_inducing,Ntheta"""
         self._psi_computations(Z, mu, S)
diff --git a/GPy/testing/psi_stat_expectation_tests.py b/GPy/testing/psi_stat_expectation_tests.py
index 16904927..ae3d1022 100644
--- a/GPy/testing/psi_stat_expectation_tests.py
+++ b/GPy/testing/psi_stat_expectation_tests.py
@@ -27,7 +27,7 @@ def ard(p):
 @testing.deepTest(__test__())
 class Test(unittest.TestCase):
     input_dim = 9
-    num_inducing = 4
+    num_inducing = 13
     N = 30
     Nsamples = 9e6
 
@@ -51,13 +51,16 @@ class Test(unittest.TestCase):
 #                        GPy.kern.bias(self.input_dim) +
 #                        GPy.kern.white(self.input_dim)),
 #                     (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
-#                     GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
-#                     GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
+                    (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
+                     +GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True)
 #                     GPy.kern.bias(self.input_dim) +
 #                     GPy.kern.white(self.input_dim)),
-        (GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
-                    GPy.kern.bias(self.input_dim, np.random.rand()) +
-                    GPy.kern.white(self.input_dim, np.random.rand())),
+                    ),
+        (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
+         +GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
+         #+GPy.kern.bias(self.input_dim, np.random.rand())
+         #+GPy.kern.white(self.input_dim, np.random.rand())),
+         ),
                 (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
                     GPy.kern.bias(self.input_dim, np.random.rand()) +
                     GPy.kern.white(self.input_dim, np.random.rand())),

From a074763eb69597ae22b0b5f7b284a96685d12ea2 Mon Sep 17 00:00:00 2001
From: Nicolo Fusi <nicolo.fusi@gmail.com>
Date: Thu, 14 Nov 2013 12:28:26 -0800
Subject: [PATCH 187/252] fixed problem in warping

---
 GPy/util/warping_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index e05f39af..35ad3b80 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -222,7 +222,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         """
 
 
-        mpsi = psi.coSpy()
+        mpsi = psi.copy()
         d = psi[-1]
         mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
 

From b845c0d634a48f9e11e13cb6a3329629e84e28fd Mon Sep 17 00:00:00 2001
From: mu <mu@mu-DQ67SW.(none)>
Date: Mon, 18 Nov 2013 10:43:58 +0000
Subject: [PATCH 188/252] constructor and init for ODE_UY

---
 GPy/kern/constructors.py   | 17 +++++++++++++++++
 GPy/kern/parts/__init__.py |  1 +
 2 files changed, 18 insertions(+)

diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 392f43ba..1feec4df 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -588,3 +588,20 @@ def ODE_1(input_dim=1, varianceU=1.,  varianceY=1., lengthscaleU=None,  lengthsc
     """
     part = parts.ODE_1.ODE_1(input_dim, varianceU, varianceY, lengthscaleU, lengthscaleY)
     return kern(input_dim, [part])
+
+def ODE_UY(input_dim=2, varianceU=1.,  varianceY=1., lengthscaleU=None,  lengthscaleY=None):
+    """
+    kernel resultiong from a first order ODE with OU driving GP
+    :param input_dim: the number of input dimension, has to be equal to one
+    :type input_dim: int
+    :param input_lengthU: the number of input U length
+    :param varianceU: variance of the driving GP
+    :type varianceU: float
+    :param varianceY: 'variance' of the transfer function
+    :type varianceY: float
+    :param lengthscaleY: 'lengthscale' of the transfer function
+    :type lengthscaleY: float
+    :rtype: kernel object
+    """
+    part = parts.ODE_UY.ODE_UY(input_dim, varianceU, varianceY, lengthscaleU, lengthscaleY)
+    return kern(input_dim, [part])
\ No newline at end of file
diff --git a/GPy/kern/parts/__init__.py b/GPy/kern/parts/__init__.py
index 0a758f1e..3b020828 100644
--- a/GPy/kern/parts/__init__.py
+++ b/GPy/kern/parts/__init__.py
@@ -14,6 +14,7 @@ import Matern32
 import Matern52
 import mlp
 import ODE_1
+import ODE_UY
 import periodic_exponential
 import periodic_Matern32
 import periodic_Matern52

From 241ca0b628b5eb2cf8e00cde11fa842721fcbf6c Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Mon, 18 Nov 2013 16:39:43 +0000
Subject: [PATCH 189/252] Working eq_ode1 in sympy now.

---
 GPy/kern/parts/__init__.py       |   1 +
 GPy/kern/parts/sympy_helpers.cpp | 119 ++++++++++++++++++-------------
 GPy/kern/parts/sympy_helpers.py  |  71 ++++++++++++++++++
 GPy/util/symbolic.py             |   2 +-
 4 files changed, 141 insertions(+), 52 deletions(-)
 create mode 100644 GPy/kern/parts/sympy_helpers.py

diff --git a/GPy/kern/parts/__init__.py b/GPy/kern/parts/__init__.py
index 0a758f1e..54c5bba5 100644
--- a/GPy/kern/parts/__init__.py
+++ b/GPy/kern/parts/__init__.py
@@ -26,4 +26,5 @@ import rbf
 import rbf_inv
 import spline
 import symmetric
+import sympy_helpers
 import white
diff --git a/GPy/kern/parts/sympy_helpers.cpp b/GPy/kern/parts/sympy_helpers.cpp
index d21d2683..9f30eea9 100644
--- a/GPy/kern/parts/sympy_helpers.cpp
+++ b/GPy/kern/parts/sympy_helpers.cpp
@@ -1,3 +1,4 @@
+#include "Python.h"
 #include <math.h>
 #include <float.h>
 #include <stdlib.h>
@@ -29,24 +30,33 @@ double sinc_grad(double x){
   else 
     return (x*cos(x) - sin(x))/(x*x);
 }
-
 double erfcx(double x){
+  // Based on code by Soren Hauberg 2010 for Octave.
   // compute the scaled complex error function.
+  //return erfc(x)*exp(x*x);
   double xneg=-sqrt(log(DBL_MAX/2));
   double xmax = 1/(sqrt(M_PI)*DBL_MIN);
   xmax = DBL_MAX<xmax ? DBL_MAX : xmax;
   // Find values where erfcx can be evaluated
-  double t = 3.97886080735226 / (abs(x) + 3.97886080735226);
+  double t = 3.97886080735226 / (fabs(x) + 3.97886080735226);
   double u = t-0.5;
   double y = (((((((((u * 0.00127109764952614092 + 1.19314022838340944e-4) * u 
-	      - 0.003963850973605135)   * u - 8.70779635317295828e-4) * u 
-	    + 0.00773672528313526668) * u + 0.00383335126264887303) * u 
-	  - 0.0127223813782122755)  * u - 0.0133823644533460069)  * u 
-	+ 0.0161315329733252248)  * u + 0.0390976845588484035)  * u + 0.00249367200053503304;
+		     - 0.003963850973605135)   * u - 8.70779635317295828e-4) * u 
+		   + 0.00773672528313526668) * u + 0.00383335126264887303) * u 
+		 - 0.0127223813782122755)  * u - 0.0133823644533460069)  * u 
+	       + 0.0161315329733252248)  * u + 0.0390976845588484035)  * u + 0.00249367200053503304;
+  y = ((((((((((((y * u - 0.0838864557023001992) * u -		       
+		 0.119463959964325415) * u + 0.0166207924969367356) * u + 
+	       0.357524274449531043) * u + 0.805276408752910567)  * u + 
+	     1.18902982909273333)  * u + 1.37040217682338167)   * u +	
+	   1.31314653831023098)  * u + 1.07925515155856677)   * u +	
+	 0.774368199119538609) * u + 0.490165080585318424)  * u +	
+       0.275374741597376782) * t;
+
   if (x<xneg)
     return -INFINITY;
   else if (x<0)
-    return 2*exp(x*x)-y;
+    return 2.0*exp(x*x)-y;
   else if (x>xmax)
     return 0.0;
   else 
@@ -55,16 +65,19 @@ double erfcx(double x){
 
 double ln_diff_erf(double x0, double x1){
   // stably compute the log of difference between two erfs.
-  if (x1>x0)
-    throw std::runtime_error("Error: second argument must be smaller than first in ln_diff_err");
-  return log(erf(x0) - erf(x1));
-  if (x0==x1)
+  if (x1>x0){
+    PyErr_SetString(PyExc_RuntimeError,"second argument must be smaller than or equal to first in ln_diff_erf");
+    throw 1;
+  }
+  if (x0==x1){
+    PyErr_WarnEx(PyExc_RuntimeWarning,"divide by zero encountered in log", 1);
     return -INFINITY;
-  else if(x0<0 && x1>0 || x0>0 && x1<0)
+  }
+  else if(x0<0 && x1>0 || x0>0 && x1<0) //x0 and x1 have opposite signs
     return log(erf(x0)-erf(x1));
-  else if(x1>0)
-    return log(erfcx(x1)-erfcx(x0)*exp(x1*x1- x0*x0))-x1*x1;
-  else 
+  else if(x0>0) //x0 positive, x1 non-negative
+    return log(erfcx(x1)-erfcx(x0)*exp(x1*x1- x0*x0))-x1*x1; 
+  else //x0 and x1 non-positive
     return log(erfcx(-x0)-erfcx(-x1)*exp(x0*x0 - x1*x1))-x0*x0;
 }
 
@@ -80,26 +93,19 @@ double h(double t, double tprime, double d_i, double d_j, double l){
     sign_val = 0.0;
   else if (t/l < 0)
     sign_val = -1.0;
-  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
-  
-  return sign_val*exp(half_l_di*half_l_di - d_i*(t-tprime) + ln_part_1 - log(d_i + d_j)) - sign_val*exp(half_l_di*half_l_di - d_i*t - d_j*tprime + ln_part_2 - log(d_i + d_j));
-}
-
-double dh_dl(double t, double tprime, double d_i, double d_j, double l){
-  // compute gradient of h function with respect to lengthscale for sim covariance
-  // TODO a lot of energy wasted recomputing things here, need to do this in a shared way somehow ... perhaps needs rewrite of sympykern.
-  double half_l_di = 0.5*l*d_i;
-  double arg_1 = half_l_di + tprime/l;
-  double arg_2 = half_l_di - (t-tprime)/l;
-  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
   arg_2 = half_l_di - t/l;
   double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
-  double diff_t = t - tprime;
-  double l2 = l*l;
-  double hv = h(t, tprime, d_i, d_j, l);
-  return 0.5*d_i*d_i*l*hv + 2/(sqrt(M_PI)*(d_i+d_j))*((-diff_t/l2-d_i/2)*exp(-diff_t*diff_t/l2)+(-tprime/l2+d_i/2)*exp(-tprime*tprime/l2-d_i*t)-(-t/l2-d_i/2)*exp(-t*t/l2-d_j*tprime)-d_i/2*exp(-(d_i*t+d_j*tprime)));
+  // if either ln_part_1 or ln_part_2 are -inf, don't bother computing rest of that term.
+  double part_1 = 0.0;
+  if(isfinite(ln_part_1))
+    part_1 = sign_val*exp(half_l_di*half_l_di - d_i*(t-tprime) + ln_part_1 - log(d_i + d_j));
+  double part_2 = 0.0;
+  if(isfinite(ln_part_2))
+    part_2 = sign_val*exp(half_l_di*half_l_di - d_i*t - d_j*tprime + ln_part_2 - log(d_i + d_j));
+  return part_1 - part_2;
 }
 
+
 double dh_dd_i(double t, double tprime, double d_i, double d_j, double l){
   double diff_t = (t-tprime);
   double l2 = l*l;
@@ -116,41 +122,52 @@ double dh_dd_i(double t, double tprime, double d_i, double d_j, double l){
   else if (t/l < 0)
     sign_val = -1.0;
   double ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l);
-
-  double base = ((0.5*d_i*l2*(d_i+d_j)-1)*hv 
-		 + (-diff_t*sign_val*exp(half_l_di*half_l_di
-					 -d_i*diff_t
-					 +ln_part_1)
-		    +t*sign_val*exp(half_l_di*half_l_di
-				    -d_i*t-d_j*tprime
-				    +ln_part_2))
-		 + l/sqrt(M_PI)*(-exp(-diff_t*diff_t/l2)
-			       +exp(-tprime*tprime/l2-d_i*t)
-			       +exp(-t*t/l2-d_j*tprime)
-			       -exp(-(d_i*t + d_j*tprime))));
+  double base = (0.5*d_i*l2*(d_i+d_j)-1)*hv;
+  if(isfinite(ln_part_1))
+    base -= diff_t*sign_val*exp(half_l_di*half_l_di
+				-d_i*diff_t
+				+ln_part_1);
+  if(isfinite(ln_part_2))
+    base += t*sign_val*exp(half_l_di*half_l_di
+			   -d_i*t-d_j*tprime
+			   +ln_part_2);
+  base += l/sqrt(M_PI)*(-exp(-diff_t*diff_t/l2)
+			+exp(-tprime*tprime/l2-d_i*t)
+			+exp(-t*t/l2-d_j*tprime)
+			-exp(-(d_i*t + d_j*tprime)));
   return base/(d_i+d_j);
+
 }
 
 double dh_dd_j(double t, double tprime, double d_i, double d_j, double l){
-  double diff_t = (t-tprime);
-  double l2 = l*l;
   double half_l_di = 0.5*l*d_i;
   double hv = h(t, tprime, d_i, d_j, l);
-  double arg_1 = half_l_di + tprime/l;
-  double arg_2 = half_l_di - (t-tprime)/l;
-  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
-  arg_1 = half_l_di;
-  arg_2 = half_l_di - t/l;
   double sign_val = 1.0;
   if(t/l==0)
     sign_val = 0.0;
   else if (t/l < 0)
     sign_val = -1.0;
   double ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l);
-  double base = tprime*sign_val*exp(half_l_di*half_l_di-(d_i*t+d_j*tprime)+ln_part_2)-hv;
+  double base = -hv;
+  if(isfinite(ln_part_2))
+    base += tprime*sign_val*exp(half_l_di*half_l_di-(d_i*t+d_j*tprime)+ln_part_2);
   return base/(d_i+d_j);
 }
 
+double dh_dl(double t, double tprime, double d_i, double d_j, double l){
+  // compute gradient of h function with respect to lengthscale for sim covariance
+  // TODO a lot of energy wasted recomputing things here, need to do this in a shared way somehow ... perhaps needs rewrite of sympykern.
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - (t-tprime)/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
+  double diff_t = t - tprime;
+  double l2 = l*l;
+  double hv = h(t, tprime, d_i, d_j, l);
+  return 0.5*d_i*d_i*l*hv + 2/(sqrt(M_PI)*(d_i+d_j))*((-diff_t/l2-d_i/2)*exp(-diff_t*diff_t/l2)+(-tprime/l2+d_i/2)*exp(-tprime*tprime/l2-d_i*t)-(-t/l2-d_i/2)*exp(-t*t/l2-d_j*tprime)-d_i/2*exp(-(d_i*t+d_j*tprime)));
+}
 
 double dh_dt(double t, double tprime, double d_i, double d_j, double l){
   return 0.0;
diff --git a/GPy/kern/parts/sympy_helpers.py b/GPy/kern/parts/sympy_helpers.py
new file mode 100644
index 00000000..125dac58
--- /dev/null
+++ b/GPy/kern/parts/sympy_helpers.py
@@ -0,0 +1,71 @@
+# Code for testing functions written in sympy_helpers.cpp
+from scipy import weave
+import tempfile
+import os
+import numpy as np
+current_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
+extra_compile_args = []
+
+weave_kwargs = {
+    'support_code': "",
+    'include_dirs':[tempfile.gettempdir(), current_dir],
+    'headers':['"parts/sympy_helpers.h"'],
+    'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp")],
+    'extra_compile_args':extra_compile_args,
+    'extra_link_args':['-lgomp'],
+    'verbose':True}
+
+def erfcx(x):
+    code = """
+        // Code for computing scaled complementary erf
+        int i;
+        int dim;
+        int elements = Ntarget[0];
+        for (dim=1; dim<Dtarget; dim++)
+          elements *= Ntarget[dim];
+        for (i=0;i<elements;i++) 
+            target[i] = erfcx(x[i]);
+        """
+    x = np.asarray(x)
+    arg_names = ['target','x']
+    target = np.zeros_like(x)
+    weave.inline(code=code, arg_names=arg_names,**weave_kwargs)
+    return target
+
+def ln_diff_erf(x, y):
+    code = """
+        // Code for computing scaled complementary erf
+        int i;
+        int dim;
+        int elements = Ntarget[0];
+        for (dim=1; dim<Dtarget; dim++)
+          elements *= Ntarget[dim];
+        for (i=0;i<elements;i++) 
+          target[i] = ln_diff_erf(x[i], y[i]);
+        """
+    x = np.asarray(x)
+    y = np.asarray(y)
+    assert(x.shape==y.shape)
+    target = np.zeros_like(x)
+    arg_names = ['target','x', 'y']
+    weave.inline(code=code, arg_names=arg_names,**weave_kwargs)
+    return target
+
+def h(t, tprime, d_i, d_j, l):
+    code = """
+        // Code for computing the 1st order ODE h helper function.
+        int i;
+        int dim;
+        int elements = Ntarget[0];
+        for (dim=1; dim<Dtarget; dim++)
+          elements *= Ntarget[dim];
+        for (i=0;i<elements;i++) 
+          target[i] = h(t[i], tprime[i], d_i, d_j, l);
+        """
+    t = np.asarray(t)
+    tprime = np.asarray(tprime)
+    assert(tprime.shape==t.shape)
+    target = np.zeros_like(t)
+    arg_names = ['target','t', 'tprime', 'd_i', 'd_j', 'l']
+    weave.inline(code=code, arg_names=arg_names,**weave_kwargs)
+    return target
diff --git a/GPy/util/symbolic.py b/GPy/util/symbolic.py
index d546f940..395f9e3e 100644
--- a/GPy/util/symbolic.py
+++ b/GPy/util/symbolic.py
@@ -10,7 +10,7 @@ class ln_diff_erf(Function):
             return -2*exp(-x1**2)/(sqrt(pi)*(erf(x0)-erf(x1)))
         elif argindex == 1:
             x0, x1 = self.args
-            return 2*exp(-x0**2)/(sqrt(pi)*(erf(x0)-erf(x1)))
+            return 2.*exp(-x0**2)/(sqrt(pi)*(erf(x0)-erf(x1)))
         else:
             raise ArgumentIndexError(self, argindex)
         

From f46c72b79b752def2883143bcb90e1cb0394f0ee Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Tue, 19 Nov 2013 06:50:25 +0000
Subject: [PATCH 190/252] Bug fix for single output sympy kernel.

---
 GPy/kern/parts/__init__.py  |  2 +-
 GPy/kern/parts/sympykern.py | 15 +++++++++++----
 GPy/util/datasets.py        | 15 +++++++++++++--
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/GPy/kern/parts/__init__.py b/GPy/kern/parts/__init__.py
index d8e7f8e6..f278941a 100644
--- a/GPy/kern/parts/__init__.py
+++ b/GPy/kern/parts/__init__.py
@@ -14,7 +14,7 @@ import Matern32
 import Matern52
 import mlp
 import ODE_1
-import ODE_UY
+#import ODE_UY
 import periodic_exponential
 import periodic_Matern32
 import periodic_Matern52
diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py
index 88c179aa..7f7fba11 100644
--- a/GPy/kern/parts/sympykern.py
+++ b/GPy/kern/parts/sympykern.py
@@ -177,8 +177,15 @@ class spkern(Kernpart):
         # Code to compute argument string when only diagonal is required.
         diag_arg_string = re.sub('int jj','//int jj',X_arg_string)
         diag_arg_string = re.sub('j','i',diag_arg_string)
-        diag_precompute_string = precompute_list[0]
-
+        if precompute_string == '':
+            # if it's not multioutput, the precompute strings are set to zero
+            diag_precompute_string = ''
+            diag_precompute_replace = ''
+        else:
+            # for multioutput we need to extract the index of the output form the input.
+            diag_precompute_string = precompute_list[0]
+            diag_precompute_replace = precompute_list[1]
+        
 
         # Here's the code to do the looping for K
         self._K_code =\
@@ -215,13 +222,13 @@ class spkern(Kernpart):
             TARGET2(i, i) += k(%s);
             for (j=0;j<i;j++){
               %s //int jj=(int)X2(j, 1);
-              double kval = k(%s); //double kval = k(X2(i, 0), X2(j, 0), shared_lengthscale, LENGTHSCALE1(ii), SCALE1(ii), LENGTHSCALE1(jj), SCALE1(jj));
+              double kval = k(%s); //double kval = k(X2(i, 0), shared_lengthscale, LENGTHSCALE1(ii), SCALE1(ii));
               TARGET2(i, j) += kval;
               TARGET2(j, i) += kval;
             }
         }
         /*%s*/
-        """%(diag_precompute_string, diag_arg_string, re.sub('Z2', 'X2', precompute_list[1]), X_arg_string,str(self._sp_k)) #adding a string representation forces recompile when needed
+        """%(diag_precompute_string, diag_arg_string, re.sub('Z2', 'X2', diag_precompute_replace), X_arg_string,str(self._sp_k)) #adding a string representation forces recompile when needed
 
         # Code to do the looping for Kdiag
         self._Kdiag_code =\
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index 565f8e76..69f010f9 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -689,7 +689,7 @@ def olympic_marathon_men(data_set='olympic_marathon_men'):
     Y = olympics[:, 1:2]
     return data_details_return({'X': X, 'Y': Y}, data_set)
 
-def olympics():
+def olympic_sprints(data_set='rogers_girolami_data'):
     """All olympics sprint winning times for multiple output prediction."""
     X = np.zeros((0, 2))
     Y = np.zeros((0, 1))
@@ -707,7 +707,18 @@ def olympics():
     data['X'] = X
     data['Y'] = Y
     data['info'] = "Olympics sprint event winning for men and women to 2008. Data is from Rogers and Girolami's First Course in Machine Learning."
-    return data
+    return data_details_return({
+        'X': X,
+        'Y': Y,
+        'info': "Olympics sprint event winning for men and women to 2008. Data is from Rogers and Girolami's First Course in Machine Learning.",
+        'output_info': {
+          0:'100m Men', 
+          1:'100m Women', 
+          2:'200m Men', 
+          3:'200m Women', 
+          4:'400m Men', 
+          5:'400m Women'}
+        }, data_set)
 
 # def movielens_small(partNo=1,seed=default_seed):
 #     np.random.seed(seed=seed)

From 5f3b6bd204624941f130c5d85cf9ca3fc250afd2 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Tue, 19 Nov 2013 09:33:06 +0000
Subject: [PATCH 191/252] Moved data resource information to a json file.

---
 GPy/util/data_resources.json               | 319 +++++++++++++++++++++
 GPy/util/datasets.py                       | 131 +--------
 GPy/util/datasets/data_resources_create.py | 127 ++++++++
 3 files changed, 453 insertions(+), 124 deletions(-)
 create mode 100644 GPy/util/data_resources.json
 create mode 100644 GPy/util/datasets/data_resources_create.py

diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json
new file mode 100644
index 00000000..2b36b0c1
--- /dev/null
+++ b/GPy/util/data_resources.json
@@ -0,0 +1,319 @@
+{
+   "rogers_girolami_data":{
+      "files":[
+         [
+            "firstcoursemldata.tar.gz"
+         ]
+      ],
+      "license":null,
+      "citation":"A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146",
+      "details":"Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.",
+      "urls":[
+         "https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/"
+      ],
+      "suffices":[
+         [
+            "?dl=1"
+         ]
+      ],
+      "size":21949154
+   },
+   "ankur_pose_data":{
+      "files":[
+         [
+            "ankurDataPoseSilhouette.mat"
+         ]
+      ],
+      "citation":"3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.",
+      "license":null,
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/"
+      ],
+      "details":"Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."
+   },
+   "osu_accad":{
+      "files":[
+         [
+            "swagger1TXT.ZIP",
+            "handspring1TXT.ZIP",
+            "quickwalkTXT.ZIP",
+            "run1TXT.ZIP",
+            "sprintTXT.ZIP",
+            "dogwalkTXT.ZIP",
+            "camper_04TXT.ZIP",
+            "dance_KB3_TXT.ZIP",
+            "per20_TXT.ZIP",
+            "perTWO07_TXT.ZIP",
+            "perTWO13_TXT.ZIP",
+            "perTWO14_TXT.ZIP",
+            "perTWO15_TXT.ZIP",
+            "perTWO16_TXT.ZIP"
+         ],
+         [
+            "connections.txt"
+         ]
+      ],
+      "license":"Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).",
+      "citation":"The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.",
+      "details":"Motion capture data of different motions from the Open Motion Data Project at Ohio State University.",
+      "urls":[
+         "http://accad.osu.edu/research/mocap/data/",
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"
+      ],
+      "size":15922790
+   },
+   "isomap_face_data":{
+      "files":[
+         [
+            "face_data.mat"
+         ]
+      ],
+      "license":null,
+      "citation":"A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000",
+      "details":"Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/"
+      ],
+      "size":24229368
+   },
+   "boston_housing":{
+      "files":[
+         [
+            "Index",
+            "housing.data",
+            "housing.names"
+         ]
+      ],
+      "license":null,
+      "citation":"Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.",
+      "details":"The Boston Housing data relates house values in Boston to a range of input variables.",
+      "urls":[
+         "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/"
+      ],
+      "size":51276
+   },
+   "cmu_mocap_full":{
+      "files":[
+         [
+            "allasfamc.zip"
+         ]
+      ],
+      "license":"From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.",
+      "citation":"Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.\nThe database was created with funding from NSF EIA-0196217.",
+      "details":"CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.",
+      "urls":[
+         "http://mocap.cs.cmu.edu"
+      ],
+      "size":null
+   },
+   "brendan_faces":{
+      "files":[
+         [
+            "frey_rawface.mat"
+         ]
+      ],
+      "license":null,
+      "citation":"Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.",
+      "details":"A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.",
+      "urls":[
+         "http://www.cs.nyu.edu/~roweis/data/"
+      ],
+      "size":1100584
+   },
+   "olympic_marathon_men":{
+      "files":[
+         [
+            "olympicMarathonTimes.csv"
+         ]
+      ],
+      "license":null,
+      "citation":null,
+      "details":"Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/"
+      ],
+      "size":584
+   },
+   "pumadyn-32nm":{
+      "files":[
+         [
+            "pumadyn-32nm.tar.gz"
+         ]
+      ],
+      "license":"Data is made available by the Delve system at the University of Toronto",
+      "citation":"Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.",
+      "details":"Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.",
+      "urls":[
+         "ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/"
+      ],
+      "size":5861646
+   },
+   "ripley_prnn_data":{
+      "files":[
+         [
+            "Cushings.dat",
+            "README",
+            "crabs.dat",
+            "fglass.dat",
+            "fglass.grp",
+            "pima.te",
+            "pima.tr",
+            "pima.tr2",
+            "synth.te",
+            "synth.tr",
+            "viruses.dat",
+            "virus3.dat"
+         ]
+      ],
+      "license":null,
+      "citation":"Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7",
+      "details":"Data sets from Brian Ripley's Pattern Recognition and Neural Networks",
+      "urls":[
+         "http://www.stats.ox.ac.uk/pub/PRNN/"
+      ],
+      "size":93565
+   },
+   "three_phase_oil_flow":{
+      "files":[
+         [
+            "DataTrnLbls.txt",
+            "DataTrn.txt",
+            "DataTst.txt",
+            "DataTstLbls.txt",
+            "DataVdn.txt",
+            "DataVdnLbls.txt"
+         ]
+      ],
+      "license":null,
+      "citation":"Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593",
+      "details":"The three phase oil data used initially for demonstrating the Generative Topographic mapping.",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/"
+      ],
+      "size":712796
+   },
+   "robot_wireless":{
+      "files":[
+         [
+            "uw-floor.txt"
+         ]
+      ],
+      "license":null,
+      "citation":"WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.",
+      "details":"Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/"
+      ],
+      "size":284390
+   },
+   "xw_pen":{
+      "files":[
+         [
+            "xw_pen_15.csv"
+         ]
+      ],
+      "license":null,
+      "citation":"Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005",
+      "details":"Accelerometer pen data used for robust regression by Tipping and Lawrence.",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/"
+      ],
+      "size":3410
+   },
+   "swiss_roll":{
+      "files":[
+         [
+            "swiss_roll_data.mat"
+         ]
+      ],
+      "license":null,
+      "citation":"A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000",
+      "details":"Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.",
+      "urls":[
+         "http://isomap.stanford.edu/"
+      ],
+      "size":800256
+   },
+   "osu_run1":{
+      "files":[
+         [
+            "run1TXT.ZIP"
+         ],
+         [
+            "connections.txt"
+         ]
+      ],
+      "license":"Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).",
+      "citation":"The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.",
+      "details":"Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.",
+      "urls":[
+         "http://accad.osu.edu/research/mocap/data/",
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"
+      ],
+      "size":338103
+   },
+   "creep_rupture":{
+      "files":[
+         [
+            "creeprupt.tar"
+         ]
+      ],
+      "license":null,
+      "citation":"Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.",
+      "details":"Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.",
+      "urls":[
+         "http://www.msm.cam.ac.uk/map/data/tar/"
+      ],
+      "size":602797
+   },
+   "olivetti_faces":{
+      "files":[
+         [
+            "att_faces.zip"
+         ],
+         [
+            "olivettifaces.mat"
+         ]
+      ],
+      "license":null,
+      "citation":"Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994",
+      "details":"Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/",
+         "http://www.cs.nyu.edu/~roweis/data/"
+      ],
+      "size":8561331
+   },
+   "della_gatta":{
+      "files":[
+         [
+            "DellaGattadata.mat"
+         ]
+      ],
+      "license":null,
+      "citation":"Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008",
+      "details":"The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/"
+      ],
+      "size":3729650
+   },
+   "epomeo_gpx":{
+      "files":[
+         [
+            "endomondo_1.gpx",
+            "endomondo_2.gpx",
+            "garmin_watch_via_endomondo.gpx",
+            "viewranger_phone.gpx",
+            "viewranger_tablet.gpx"
+         ]
+      ],
+      "license":null,
+      "citation":"",
+      "details":"Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/"
+      ],
+      "size":2031872
+   }
+}
\ No newline at end of file
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index 69f010f9..f33a2e92 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -7,7 +7,7 @@ import urllib as url
 import zipfile
 import tarfile
 import datetime
-
+import json
 ipython_available=True
 try:
     import IPython
@@ -29,129 +29,10 @@ data_path = os.path.join(os.path.dirname(__file__), 'datasets')
 default_seed = 10000
 overide_manual_authorize=False
 neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/'
-sam_url = 'http://www.cs.nyu.edu/~roweis/data/'
-cmu_url = 'http://mocap.cs.cmu.edu/subjects/'
 
-# Note: there may be a better way of storing data resources, for the
-# moment we are storing them in a dictionary.
-data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
-                                       'files' : [['ankurDataPoseSilhouette.mat']],
-                                       'license' : None,
-                                       'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""",
-                                       'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""},
-
-                  'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'],
-                                      'files' : [['Index', 'housing.data', 'housing.names']],
-                                      'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.""",
-                                      'details' : """The Boston Housing data relates house values in Boston to a range of input variables.""",
-                                      'license' : None,
-                                      'size' : 51276
-                                      },
-                  'brendan_faces' : {'urls' : [sam_url],
-                                     'files': [['frey_rawface.mat']],
-                                     'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.',
-                                     'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""",
-                                     'license': None,
-                                     'size' : 1100584},
-                  'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'],
-                                 'files' : [['allasfamc.zip']],
-                                 'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.
-The database was created with funding from NSF EIA-0196217.""",
-                                 'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
-                                 'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
-                                 'size' : None},
-                  'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'],
-                                     'files' : [['creeprupt.tar']],
-                                     'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.',
-                                     'details' : """Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.""",
-                                     'license' : None,
-                                     'size' : 602797},
-                  'della_gatta' : {'urls' : [neil_url + 'della_gatta/'],
-                                   'files': [['DellaGattadata.mat']],
-                                   'citation' : 'Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008',
-                                   'details': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.",
-                                   'license':None,
-                                   'size':3729650},
-                  'epomeo_gpx' : {'urls' : [neil_url + 'epomeo_gpx/'],
-                                   'files': [['endomondo_1.gpx', 'endomondo_2.gpx', 'garmin_watch_via_endomondo.gpx','viewranger_phone.gpx','viewranger_tablet.gpx']],
-                                   'citation' : '',
-                                   'details': "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).",
-                                   'license':None,
-                                   'size': 2031872},
-                  'three_phase_oil_flow': {'urls' : [neil_url + 'three_phase_oil_flow/'],
-                                           'files' : [['DataTrnLbls.txt', 'DataTrn.txt', 'DataTst.txt', 'DataTstLbls.txt', 'DataVdn.txt', 'DataVdnLbls.txt']],
-                                           'citation' : 'Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593',
-                                           'details' : """The three phase oil data used initially for demonstrating the Generative Topographic mapping.""",
-                                           'license' : None,
-                                           'size' : 712796},
-                  'rogers_girolami_data' : {'urls' : ['https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/'],
-                                            'files' : [['firstcoursemldata.tar.gz']],
-                                            'suffices' : [['?dl=1']],
-                                            'citation' : 'A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146',
-                                            'details' : """Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.""",
-                                            'license' : None,
-                                            'size' : 21949154},
-                  'olivetti_faces' : {'urls' : [neil_url + 'olivetti_faces/', sam_url],
-                                      'files' : [['att_faces.zip'], ['olivettifaces.mat']],
-                                            'citation' : 'Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994',
-                                            'details' : """Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. """,
-                                            'license': None,
-                                            'size' : 8561331},
-                  'olympic_marathon_men' : {'urls' : [neil_url + 'olympic_marathon_men/'],
-                                            'files' : [['olympicMarathonTimes.csv']],
-                                            'citation' : None,
-                                            'details' : """Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data""",
-                                            'license': None,
-                                            'size' : 584},
-                  'osu_run1' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
-                                'files': [['run1TXT.ZIP'],['connections.txt']],
-                                'details' : "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.",
-                                'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
-                                'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
-                                'size': 338103},
-                  'osu_accad' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
-                                'files': [['swagger1TXT.ZIP','handspring1TXT.ZIP','quickwalkTXT.ZIP','run1TXT.ZIP','sprintTXT.ZIP','dogwalkTXT.ZIP','camper_04TXT.ZIP','dance_KB3_TXT.ZIP','per20_TXT.ZIP','perTWO07_TXT.ZIP','perTWO13_TXT.ZIP','perTWO14_TXT.ZIP','perTWO15_TXT.ZIP','perTWO16_TXT.ZIP'],['connections.txt']],
-                                'details' : "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.",
-                                'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
-                                'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
-                                'size': 15922790},
-                  'pumadyn-32nm' : {'urls' : ['ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/'],
-                                    'files' : [['pumadyn-32nm.tar.gz']],
-                                    'details' : """Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.""",
-                                    'citation' : """Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.""",
-                                    'license' : """Data is made available by the Delve system at the University of Toronto""",
-                                    'size' : 5861646},
-                  'robot_wireless' : {'urls' : [neil_url + 'robot_wireless/'],
-                                      'files' : [['uw-floor.txt']],
-                                      'citation' : """WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.""",
-                                      'details' : """Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.""",
-                                      'license' : None,
-                                      'size' : 284390},
-                  'swiss_roll' : {'urls' : ['http://isomap.stanford.edu/'],
-                                  'files' : [['swiss_roll_data.mat']],
-                                  'details' : """Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
-                                  'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
-                                  'license' : None,
-                                  'size' : 800256},
-                  'ripley_prnn_data' : {'urls' : ['http://www.stats.ox.ac.uk/pub/PRNN/'],
-                                        'files' : [['Cushings.dat', 'README', 'crabs.dat', 'fglass.dat', 'fglass.grp', 'pima.te', 'pima.tr', 'pima.tr2', 'synth.te', 'synth.tr', 'viruses.dat', 'virus3.dat']],
-                                        'details' : """Data sets from Brian Ripley's Pattern Recognition and Neural Networks""",
-                                        'citation': """Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7""",
-                                        'license' : None,
-                                        'size' : 93565},
-                  'isomap_face_data' : {'urls' : [neil_url + 'isomap_face_data/'],
-                                        'files' : [['face_data.mat']],
-                                        'details' : """Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
-                                        'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
-                                        'license' : None,
-                                        'size' : 24229368},
-                  'xw_pen' : {'urls' : [neil_url + 'xw_pen/'],
-                                        'files' : [['xw_pen_15.csv']],
-                                        'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""",
-                                        'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005',
-                                        'license' : None,
-                                        'size' : 3410}
-                  }
+# Read data resources from json file.
+json_data=open('data_resources.json').read()
+data_resources = json.loads(json_data)
 
 
 def prompt_user(prompt):
@@ -623,7 +504,7 @@ def xw_pen(data_set='xw_pen'):
     return data_details_return({'Y': Y, 'X': X, 'info': "Tilt data from a personalized digital assistant pen. Plot in original paper showed regression between time steps 175 and 275."}, data_set)
 
 
-def download_rogers_girolami_data():
+def download_rogers_girolami_data(data_set='rogers_girolami_data'):
     if not data_available('rogers_girolami_data'):
         download_data(data_set)
         path = os.path.join(data_path, data_set)
@@ -909,3 +790,5 @@ def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4, data_set=
     if sample_every != 1:
         info += ' Data is sub-sampled to every ' + str(sample_every) + ' frames.'
     return data_details_return({'Y': Y, 'lbls' : lbls, 'Ytest': Ytest, 'lblstest' : lblstest, 'info': info, 'skel': skel}, data_set)
+
+
diff --git a/GPy/util/datasets/data_resources_create.py b/GPy/util/datasets/data_resources_create.py
new file mode 100644
index 00000000..8ae62a85
--- /dev/null
+++ b/GPy/util/datasets/data_resources_create.py
@@ -0,0 +1,127 @@
+import json
+
+neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/'
+sam_url = 'http://www.cs.nyu.edu/~roweis/data/'
+cmu_url = 'http://mocap.cs.cmu.edu/subjects/'
+
+data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
+                                       'files' : [['ankurDataPoseSilhouette.mat']],
+                                       'license' : None,
+                                       'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""",
+                                       'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""},
+
+                  'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'],
+                                      'files' : [['Index', 'housing.data', 'housing.names']],
+                                      'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.""",
+                                      'details' : """The Boston Housing data relates house values in Boston to a range of input variables.""",
+                                      'license' : None,
+                                      'size' : 51276
+                                      },
+                  'brendan_faces' : {'urls' : [sam_url],
+                                     'files': [['frey_rawface.mat']],
+                                     'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.',
+                                     'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""",
+                                     'license': None,
+                                     'size' : 1100584},
+                  'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'],
+                                 'files' : [['allasfamc.zip']],
+                                 'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.
+The database was created with funding from NSF EIA-0196217.""",
+                                 'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
+                                 'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
+                                 'size' : None},
+                  'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'],
+                                     'files' : [['creeprupt.tar']],
+                                     'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.',
+                                     'details' : """Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.""",
+                                     'license' : None,
+                                     'size' : 602797},
+                  'della_gatta' : {'urls' : [neil_url + 'della_gatta/'],
+                                   'files': [['DellaGattadata.mat']],
+                                   'citation' : 'Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008',
+                                   'details': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.",
+                                   'license':None,
+                                   'size':3729650},
+                  'epomeo_gpx' : {'urls' : [neil_url + 'epomeo_gpx/'],
+                                   'files': [['endomondo_1.gpx', 'endomondo_2.gpx', 'garmin_watch_via_endomondo.gpx','viewranger_phone.gpx','viewranger_tablet.gpx']],
+                                   'citation' : '',
+                                   'details': "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).",
+                                   'license':None,
+                                   'size': 2031872},
+                  'three_phase_oil_flow': {'urls' : [neil_url + 'three_phase_oil_flow/'],
+                                           'files' : [['DataTrnLbls.txt', 'DataTrn.txt', 'DataTst.txt', 'DataTstLbls.txt', 'DataVdn.txt', 'DataVdnLbls.txt']],
+                                           'citation' : 'Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593',
+                                           'details' : """The three phase oil data used initially for demonstrating the Generative Topographic mapping.""",
+                                           'license' : None,
+                                           'size' : 712796},
+                  'rogers_girolami_data' : {'urls' : ['https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/'],
+                                            'files' : [['firstcoursemldata.tar.gz']],
+                                            'suffices' : [['?dl=1']],
+                                            'citation' : 'A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146',
+                                            'details' : """Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.""",
+                                            'license' : None,
+                                            'size' : 21949154},
+                  'olivetti_faces' : {'urls' : [neil_url + 'olivetti_faces/', sam_url],
+                                      'files' : [['att_faces.zip'], ['olivettifaces.mat']],
+                                            'citation' : 'Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994',
+                                            'details' : """Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. """,
+                                            'license': None,
+                                            'size' : 8561331},
+                  'olympic_marathon_men' : {'urls' : [neil_url + 'olympic_marathon_men/'],
+                                            'files' : [['olympicMarathonTimes.csv']],
+                                            'citation' : None,
+                                            'details' : """Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data""",
+                                            'license': None,
+                                            'size' : 584},
+                  'osu_run1' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
+                                'files': [['run1TXT.ZIP'],['connections.txt']],
+                                'details' : "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.",
+                                'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
+                                'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
+                                'size': 338103},
+                  'osu_accad' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
+                                'files': [['swagger1TXT.ZIP','handspring1TXT.ZIP','quickwalkTXT.ZIP','run1TXT.ZIP','sprintTXT.ZIP','dogwalkTXT.ZIP','camper_04TXT.ZIP','dance_KB3_TXT.ZIP','per20_TXT.ZIP','perTWO07_TXT.ZIP','perTWO13_TXT.ZIP','perTWO14_TXT.ZIP','perTWO15_TXT.ZIP','perTWO16_TXT.ZIP'],['connections.txt']],
+                                'details' : "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.",
+                                'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
+                                'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
+                                'size': 15922790},
+                  'pumadyn-32nm' : {'urls' : ['ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/'],
+                                    'files' : [['pumadyn-32nm.tar.gz']],
+                                    'details' : """Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.""",
+                                    'citation' : """Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.""",
+                                    'license' : """Data is made available by the Delve system at the University of Toronto""",
+                                    'size' : 5861646},
+                  'robot_wireless' : {'urls' : [neil_url + 'robot_wireless/'],
+                                      'files' : [['uw-floor.txt']],
+                                      'citation' : """WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.""",
+                                      'details' : """Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.""",
+                                      'license' : None,
+                                      'size' : 284390},
+                  'swiss_roll' : {'urls' : ['http://isomap.stanford.edu/'],
+                                  'files' : [['swiss_roll_data.mat']],
+                                  'details' : """Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
+                                  'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
+                                  'license' : None,
+                                  'size' : 800256},
+                  'ripley_prnn_data' : {'urls' : ['http://www.stats.ox.ac.uk/pub/PRNN/'],
+                                        'files' : [['Cushings.dat', 'README', 'crabs.dat', 'fglass.dat', 'fglass.grp', 'pima.te', 'pima.tr', 'pima.tr2', 'synth.te', 'synth.tr', 'viruses.dat', 'virus3.dat']],
+                                        'details' : """Data sets from Brian Ripley's Pattern Recognition and Neural Networks""",
+                                        'citation': """Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7""",
+                                        'license' : None,
+                                        'size' : 93565},
+                  'isomap_face_data' : {'urls' : [neil_url + 'isomap_face_data/'],
+                                        'files' : [['face_data.mat']],
+                                        'details' : """Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
+                                        'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
+                                        'license' : None,
+                                        'size' : 24229368},
+                  'xw_pen' : {'urls' : [neil_url + 'xw_pen/'],
+                                        'files' : [['xw_pen_15.csv']],
+                                        'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""",
+                                        'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005',
+                                        'license' : None,
+                                        'size' : 3410}
+                  }
+
+with open('data_resources.json', 'w') as file:
+    json.dump(data_resources, file)

From fca3287e9c5c042c044361bd35ceb87287aa843a Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 19 Nov 2013 16:54:07 +0000
Subject: [PATCH 192/252] added a path for the data resources. not all users
 will be working in the GPy directory.

---
 GPy/util/datasets.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index f33a2e92..732e2a1b 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -31,7 +31,8 @@ overide_manual_authorize=False
 neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/'
 
 # Read data resources from json file.
-json_data=open('data_resources.json').read()
+path = os.path.join(os.path.dirname(__file__), 'data_resources.json')
+json_data=open(path).read()
 data_resources = json.loads(json_data)
 
 

From 4948fb1345ac034af8e337ff5c90dfa406a5f478 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 20 Nov 2013 11:45:33 +0000
Subject: [PATCH 193/252] updated crossterms, rbf x any not working yet
 (derivatives)

---
 GPy/kern/kern.py      | 208 +++++++++++++++++++++++++++++-------------
 GPy/kern/parts/rbf.py |  21 ++---
 2 files changed, 155 insertions(+), 74 deletions(-)

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index d686064a..5cd5b6aa 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -456,7 +456,7 @@ class kern(Parameterized):
         from parts.linear import Linear
         from parts.fixed import Fixed
 
-        for (p1, i1), (p2, i2) in itertools.combinations(itertools.izip(self.parts, self.param_slices), 2):
+        for (p1, i1), (p2, i2) in itertools.combinations(itertools.izip(self.parts, self.input_slices), 2):
             # white doesn;t combine with anything
             if isinstance(p1, White) or isinstance(p2, White):
                 pass
@@ -466,28 +466,30 @@ class kern(Parameterized):
             elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
                 target += p2.variance * (p1._psi1[:, :, None] + p1._psi1[:, None, :])
             # linear X bias
-            elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, Linear):
+            elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (Linear, RBF, RBFInv)):
                 tmp = np.zeros((mu.shape[0], Z.shape[0]))
                 p2.psi1(Z, mu, S, tmp)
                 target += p1.variance * (tmp[:, :, None] + tmp[:, None, :])
-            elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, Linear):
+            elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (Linear, RBF, RBFInv)):
                 tmp = np.zeros((mu.shape[0], Z.shape[0]))
                 p1.psi1(Z, mu, S, tmp)
                 target += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
             # rbf X any
-            elif isinstance(p1, (RBF, RBFInv)):
-                psi11 = np.zeros((mu.shape[0], Z.shape[0]))
-                psi12 = np.zeros((mu.shape[0], Z.shape[0]))
+            elif False:#isinstance(p1, (RBF, RBFInv)) or isinstance(p2, (RBF, RBFInv)):
+                if isinstance(p2, (RBF, RBFInv)) and not isinstance(p1, (RBF, RBFInv)):
+                    p1t = p1; p1 = p2; p2 = p1t; del p1t  
+                N, M = mu.shape[0], Z.shape[0]; NM=N*M
+                psi11 = np.zeros((N, M))
+                psi12 = np.zeros((NM, M))
                 p1.psi1(Z, mu, S, psi11)
-                p2.psi1(Z, mu, S, psi12)
+                Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
+                Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
                 
-                crossterms  = psi11[:, :, None] + psi12[:, None, :]
-                crossterms += psi12[:, :, None] + psi11[:, None, :]
-                
-                target += p1._crossterm_product_expectation(p2, Z, mu, S)
+                p2.psi1(Z, Mu, Sigma, psi12)
+                eK2 = psi12.reshape(N, M, M)
+                crossterms = eK2 * (psi11[:, :, None] + psi11[:, None, :])
+                target += crossterms
                 #import ipdb;ipdb.set_trace()
-            elif isinstance(p2, (RBF, RBFInv)):
-                raise NotImplementedError # TODO
             else:
                 raise NotImplementedError, "psi2 cannot be computed for this kernel"
         return target        
@@ -496,40 +498,81 @@ class kern(Parameterized):
         target = np.zeros(self.num_params)
         [p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
 
+        from parts.white import White
+        from parts.rbf import RBF
+        from parts.rbf_inv import RBFInv
+        from parts.bias import Bias
+        from parts.linear import Linear
+        from parts.fixed import Fixed
+
         # compute the "cross" terms
         # TODO: better looping, input_slices
         for i1, i2 in itertools.combinations(range(len(self.parts)), 2):
             p1, p2 = self.parts[i1], self.parts[i2]
-#             ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2]
-            ps1, ps2 = self.param_slices[i1], self.param_slices[i2]
-
-            # white doesn;t combine with anything
-            if p1.name == 'white' or p2.name == 'white':
+            #ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2]
+            ps1, ps2 = self.param_slices[i1], self.param_slices[i2]            
+            if isinstance(p1, White) or isinstance(p2, White):
                 pass
             # rbf X bias
-            elif p1.name == 'bias' and p2.name == 'rbf':
+            elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
                 p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target[ps2])
                 p1.dpsi1_dtheta(dL_dpsi2.sum(1) * p2._psi1 * 2., Z, mu, S, target[ps1])
-            elif p2.name == 'bias' and p1.name == 'rbf':
+            elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
                 p1.dpsi1_dtheta(dL_dpsi2.sum(1) * p2.variance * 2., Z, mu, S, target[ps1])
                 p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1._psi1 * 2., Z, mu, S, target[ps2])
             # linear X bias
-            elif p1.name == 'bias' and p2.name == 'linear':
+            elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, Linear):
                 p2.dpsi1_dtheta(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target[ps2]) # [ps1])
                 psi1 = np.zeros((mu.shape[0], Z.shape[0]))
                 p2.psi1(Z, mu, S, psi1)
                 p1.dpsi1_dtheta(dL_dpsi2.sum(1) * psi1 * 2., Z, mu, S, target[ps1])
-            elif p2.name == 'bias' and p1.name == 'linear':
+            elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, Linear):
                 p1.dpsi1_dtheta(dL_dpsi2.sum(1) * p2.variance * 2., Z, mu, S, target[ps1])
                 psi1 = np.zeros((mu.shape[0], Z.shape[0]))
                 p1.psi1(Z, mu, S, psi1)
                 p2.dpsi1_dtheta(dL_dpsi2.sum(1) * psi1 * 2., Z, mu, S, target[ps2])
             # rbf X any
-            
-            elif p1.name == 'linear' and p2.name == 'rbf':
-                raise NotImplementedError # TODO
-            elif p2.name == 'linear' and p1.name == 'rbf':
-                raise NotImplementedError # TODO
+            elif False:#isinstance(p1, (RBF, RBFInv)) or isinstance(p2, (RBF, RBFInv)):
+                if isinstance(p2, (RBF, RBFInv)) and not isinstance(p1, (RBF, RBFInv)):
+                    # turn around to have rbf in front
+                    p1, p2 = self.parts[i2], self.parts[i1]
+                    ps1, ps2 = self.param_slices[i2], self.param_slices[i1]  
+                
+                N, M = mu.shape[0], Z.shape[0]; NM=N*M
+
+                psi11 = np.zeros((N, M))
+                p1.psi1(Z, mu, S, psi11)
+                
+                Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
+                Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
+                
+                tmp1 = np.zeros_like(target[ps1])
+                tmp2 = np.zeros_like(target[ps2])
+#                 for n in range(N):
+#                     for m in range(M):
+#                         for m_prime in range(M):
+#                             p1.dpsi1_dtheta((dL_dpsi2[n:n+1,m:m+1,m_prime:m_prime+1]*psi12_t.reshape(N,M,M)[n:n+1,m:m+1,m_prime:m_prime+1])[0], Z[m:m+1], mu[n:n+1], S[n:n+1], tmp2)#Z[m_prime:m_prime+1], mu[n:n+1], S[n:n+1], tmp2)
+#                             p1.dpsi1_dtheta((dL_dpsi2[n:n+1,m:m+1,m_prime:m_prime+1]*psi12_t.reshape(N,M,M)[n:n+1,m_prime:m_prime+1,m:m+1])[0], Z[m_prime:m_prime+1], mu[n:n+1], S[n:n+1], tmp2)
+#                             Mu, Sigma= Mu.reshape(N,M,self.input_dim), Sigma.reshape(N,M,self.input_dim)
+#                             p2.dpsi1_dtheta((dL_dpsi2[n:n+1,m:m+1,m_prime:m_prime+1]*(psi11[n:n+1,m_prime:m_prime+1]))[0], Z[m:m+1], Mu[n:n+1,m], Sigma[n:n+1,m], target[ps2])
+#                             p2.dpsi1_dtheta((dL_dpsi2[n:n+1,m:m+1,m_prime:m_prime+1]*(psi11[n:n+1,m:m+1]))[0], Z[m_prime:m_prime+1], Mu[n:n+1, m_prime], Sigma[n:n+1, m_prime], target[ps2])#Z[m_prime:m_prime+1], Mu[n+m:(n+m)+1], Sigma[n+m:(n+m)+1], target[ps2])
+                
+                if isinstance(p1, RBF) and isinstance(p2, RBF):
+                    psi12 = np.zeros((N, M))
+                    p2.psi1(Z, mu, S, psi12)
+                    Mu2, Sigma2 = p2._crossterm_mu_S(Z, mu, S)
+                    Mu2, Sigma2 = Mu2.reshape(NM,self.input_dim), Sigma2.reshape(NM,self.input_dim)
+                    p1.dpsi1_dtheta((dL_dpsi2*(psi12[:,:,None] + psi12[:,None,:])).reshape(NM,M), Z, Mu2, Sigma2, tmp1)
+                    pass
+
+                if isinstance(p1, RBF) and isinstance(p2, Linear):
+                    #import ipdb;ipdb.set_trace()
+                    pass
+                
+                p2.dpsi1_dtheta((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, tmp2)
+                
+                target[ps1] += tmp1
+                target[ps2] += tmp2                
             else:
                 raise NotImplementedError, "psi2 cannot be computed for this kernel"
 
@@ -539,61 +582,102 @@ class kern(Parameterized):
         target = np.zeros_like(Z)
         [p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
 
+        from parts.white import White
+        from parts.rbf import RBF
+        from parts.rbf_inv import RBFInv
+        from parts.bias import Bias
+        from parts.linear import Linear
+        from parts.fixed import Fixed
+
         # compute the "cross" terms
-        # TODO: we need input_slices here.
+        # TODO: better looping, input_slices
         for p1, p2 in itertools.combinations(self.parts, 2):
-            # white doesn;t combine with anything
-            if p1.name == 'white' or p2.name == 'white':
+            if isinstance(p1, White) or isinstance(p2, White):
                 pass
             # rbf X bias
-            elif p1.name == 'bias' and p2.name == 'rbf':
-                p2.dpsi1_dX(dL_dpsi2.sum(1).T * p1.variance, Z, mu, S, target)
-            elif p2.name == 'bias' and p1.name == 'rbf':
-                p1.dpsi1_dZ(dL_dpsi2.sum(1).T * p2.variance, Z, mu, S, target)
+            elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
+                p2.dpsi1_dZ(dL_dpsi2.sum(1) * p1.variance, Z, mu, S, target)
+            elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
+                p1.dpsi1_dZ(dL_dpsi2.sum(1) * p2.variance, Z, mu, S, target)
             # linear X bias
-            elif p1.name == 'bias' and p2.name == 'linear':
-                p2.dpsi1_dZ(dL_dpsi2.sum(1).T * p1.variance, Z, mu, S, target)
-            elif p2.name == 'bias' and p1.name == 'linear':
-                p1.dpsi1_dZ(dL_dpsi2.sum(1).T * p2.variance, Z, mu, S, target)
-            # rbf X linear
-            elif p1.name == 'linear' and p2.name == 'rbf':
-                raise NotImplementedError # TODO
-            elif p2.name == 'linear' and p1.name == 'rbf':
-                raise NotImplementedError # TODO
+            elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, Linear):
+                p2.dpsi1_dZ(dL_dpsi2.sum(1) * p1.variance, Z, mu, S, target)
+            elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, Linear):
+                p1.dpsi1_dZ(dL_dpsi2.sum(1) * p2.variance, Z, mu, S, target)
+            # rbf X any
+            elif False:#isinstance(p1, (RBF, RBFInv)) or isinstance(p2, (RBF, RBFInv)):
+                if isinstance(p2, (RBF, RBFInv)) and not isinstance(p1, (RBF, RBFInv)):
+                    p1t = p1; p1 = p2; p2 = p1t; del p1t  
+                N, M = mu.shape[0], Z.shape[0]; NM=N*M
+                psi11 = np.zeros((N, M))
+                psi12 = np.zeros((NM, M))
+                #psi12_t = np.zeros((N,M))
+                
+                p1.psi1(Z, mu, S, psi11)
+                Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
+                Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
+                
+                p2.psi1(Z, Mu, Sigma, psi12)
+                tmp1 = np.zeros_like(target)
+                p1.dpsi1_dZ((dL_dpsi2*psi12.reshape(N,M,M)).sum(1), Z, mu, S, tmp1)
+                p1.dpsi1_dZ((dL_dpsi2*psi12.reshape(N,M,M)).sum(2), Z, mu, S, tmp1)
+                target += tmp1
+                
+                #p2.dpsi1_dtheta((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, target)
+                p2.dpsi1_dZ((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, target)
             else:
                 raise NotImplementedError, "psi2 cannot be computed for this kernel"
-
-        return target * 2.
+        return target * 2
 
     def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S):
         target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
         [p.dpsi2_dmuS(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
 
+        from parts.white import White
+        from parts.rbf import RBF
+        from parts.rbf_inv import RBFInv
+        from parts.bias import Bias
+        from parts.linear import Linear
+        from parts.fixed import Fixed
+
         # compute the "cross" terms
-        # TODO: we need input_slices here.
+        # TODO: better looping, input_slices
         for p1, p2 in itertools.combinations(self.parts, 2):
-            # white doesn;t combine with anything
-            if p1.name == 'white' or p2.name == 'white':
+            if isinstance(p1, White) or isinstance(p2, White):
                 pass
             # rbf X bias
-            elif p1.name == 'bias' and p2.name == 'rbf':
-                p2.dpsi1_dmuS(dL_dpsi2.sum(1).T * p1.variance * 2., Z, mu, S, target_mu, target_S)
-            elif p2.name == 'bias' and p1.name == 'rbf':
-                p1.dpsi1_dmuS(dL_dpsi2.sum(1).T * p2.variance * 2., Z, mu, S, target_mu, target_S)
+            elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
+                p2.dpsi1_dmuS(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target_mu, target_S)
+            elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
+                p1.dpsi1_dmuS(dL_dpsi2.sum(1) * p2.variance * 2., Z, mu, S, target_mu, target_S)
             # linear X bias
-            elif p1.name == 'bias' and p2.name == 'linear':
-                p2.dpsi1_dmuS(dL_dpsi2.sum(1).T * p1.variance * 2., Z, mu, S, target_mu, target_S)
-            elif p2.name == 'bias' and p1.name == 'linear':
-                p1.dpsi1_dmuS(dL_dpsi2.sum(1).T * p2.variance * 2., Z, mu, S, target_mu, target_S)
-            # rbf X linear
-            elif p1.name == 'linear' and p2.name == 'rbf':
-                raise NotImplementedError # TODO
-            elif p2.name == 'linear' and p1.name == 'rbf':
-                raise NotImplementedError # TODO
+            elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, Linear):
+                p2.dpsi1_dmuS(dL_dpsi2.sum(1) * p1.variance * 2., Z, mu, S, target_mu, target_S)
+            elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, Linear):
+                p1.dpsi1_dmuS(dL_dpsi2.sum(1) * p2.variance * 2., Z, mu, S, target_mu, target_S)
+            # rbf X any
+            elif False:#isinstance(p1, (RBF, RBFInv)) or isinstance(p2, (RBF, RBFInv)):
+                if isinstance(p2, (RBF, RBFInv)) and not isinstance(p1, (RBF, RBFInv)):
+                    p1t = p1; p1 = p2; p2 = p1t; del p1t  
+                N, M = mu.shape[0], Z.shape[0]; NM=N*M
+                psi11 = np.zeros((N, M))
+                psi12 = np.zeros((NM, M))
+                #psi12_t = np.zeros((N,M))
+                
+                p1.psi1(Z, mu, S, psi11)
+                Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
+                Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
+                
+                p2.psi1(Z, Mu, Sigma, psi12)
+                p1.dpsi1_dmuS((dL_dpsi2*psi12.reshape(N,M,M)).sum(1), Z, mu, S, target_mu, target_S)
+                p1.dpsi1_dmuS((dL_dpsi2*psi12.reshape(N,M,M)).sum(2), Z, mu, S, target_mu, target_S)
+                
+                #p2.dpsi1_dtheta((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, target)
+                p2.dpsi1_dmuS((dL_dpsi2*(psi11[:,:,None])).sum(1)*2, Z, Mu.reshape(N,M,self.input_dim).sum(1), Sigma.reshape(N,M,self.input_dim).sum(1), target_mu, target_S)
             else:
                 raise NotImplementedError, "psi2 cannot be computed for this kernel"
-
         return target_mu, target_S
+    
     def plot(self, x=None, plot_limits=None, which_parts='all', resolution=None, *args, **kwargs):
         if which_parts == 'all':
             which_parts = [True] * self.num_parts
diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py
index 56a6b0eb..dbc689d5 100644
--- a/GPy/kern/parts/rbf.py
+++ b/GPy/kern/parts/rbf.py
@@ -186,7 +186,7 @@ class RBF(Kernpart):
         self._psi_computations(Z, mu, S)
         target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
         d_length = self._psi1[:,:,None] * ((self._psi1_dist_sq - 1.)/(self.lengthscale*self._psi1_denom) +1./self.lengthscale)
-        dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
+        dpsi1_dlength = d_length * np.atleast_3d(dL_dpsi1)
         if not self.ARD:
             target[1] += dpsi1_dlength.sum()
         else:
@@ -208,22 +208,19 @@ class RBF(Kernpart):
         self._psi_computations(Z, mu, S)
         target += self._psi2
 
-    def _crossterm_product_expectation(self, K, Z, mu, S):
+    def _crossterm_mu_S(self, Z, mu, S):
         # compute the crossterm expectation for K as the other kernel:
-        import ipdb;ipdb.set_trace()
-        Sigma = 1./self.lengthscale[None,:] + 1./S # is independent across M, 
-        M = (Z[None,:,:]/self.lengthscale[None,None,:] + (mu/S)[:,None,:]) / Sigma[:,None,:]
-        psi1_other = K.psi1()
-        self.variance
-        # return is [N x M x M]
-        return 
+        Sigma = 1./self.lengthscale2[None,None,:] + 1./S[:,None,:] # is independent across M, 
+        Sigma_tilde = (self.lengthscale2[None, :] + S)
+        M = (S*mu/Sigma_tilde)[:, None, :] + (self.lengthscale2[None,:]*Z)[None, :, :]/Sigma_tilde[:, None, :]
+        # make sure return is [N x M x Q]
+        return M, Sigma.repeat(Z.shape[0],1) 
 
     def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
         """Shape N,num_inducing,num_inducing,Ntheta"""
         self._psi_computations(Z, mu, S)
         d_var = 2.*self._psi2 / self.variance
         d_length = 2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] / self.lengthscale2) / (self.lengthscale * self._psi2_denom)
-
         target[0] += np.sum(dL_dpsi2 * d_var)
         dpsi2_dlength = d_length * dL_dpsi2[:, :, :, None]
         if not self.ARD:
@@ -306,8 +303,8 @@ class RBF(Kernpart):
         psi2 = np.empty((N, num_inducing, num_inducing))
 
         psi2_Zdist_sq = self._psi2_Zdist_sq
-        _psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim)
-        half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(N, self.input_dim)
+        _psi2_denom = self._psi2_denom.squeeze().reshape(-1, input_dim)
+        half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(-1, input_dim)
         variance_sq = float(np.square(self.variance))
         if self.ARD:
             lengthscale2 = self.lengthscale2

From 76bfbee5455a331db25cf4d7443ba760bf10d7d4 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 20 Nov 2013 11:58:30 +0000
Subject: [PATCH 194/252] psistattests update

---
 GPy/kern/kern.py                          |  3 ++
 GPy/testing/psi_stat_expectation_tests.py | 42 +++++++++++------------
 GPy/testing/psi_stat_gradient_tests.py    | 38 ++++++++++++++------
 3 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 5cd5b6aa..f021dc3a 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -412,6 +412,9 @@ class kern(Parameterized):
         [p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)]
         return self._transform_gradients(target)
 
+    def dpsi0_dZ(self, dL_dpsi0, Z, mu, S):
+        return np.zeros_like(Z)
+
     def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S):
         target_mu, target_S = np.zeros_like(mu), np.zeros_like(S)
         [p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
diff --git a/GPy/testing/psi_stat_expectation_tests.py b/GPy/testing/psi_stat_expectation_tests.py
index ae3d1022..90252197 100644
--- a/GPy/testing/psi_stat_expectation_tests.py
+++ b/GPy/testing/psi_stat_expectation_tests.py
@@ -28,8 +28,8 @@ def ard(p):
 class Test(unittest.TestCase):
     input_dim = 9
     num_inducing = 13
-    N = 30
-    Nsamples = 9e6
+    N = 300
+    Nsamples = 1e6
 
     def setUp(self):
         i_s_dim_list = [2,4,3]
@@ -50,20 +50,20 @@ class Test(unittest.TestCase):
 #                        GPy.kern.linear(self.input_dim, ARD=True) +
 #                        GPy.kern.bias(self.input_dim) +
 #                        GPy.kern.white(self.input_dim)),
-#                     (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
-                    (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
-                     +GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True)
-#                     GPy.kern.bias(self.input_dim) +
-#                     GPy.kern.white(self.input_dim)),
+                    (#GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
+                     GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True)
+                     +GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
+#                      +GPy.kern.bias(self.input_dim)
+#                      +GPy.kern.white(self.input_dim)),
                     ),
-        (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
-         +GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
-         #+GPy.kern.bias(self.input_dim, np.random.rand())
-         #+GPy.kern.white(self.input_dim, np.random.rand())),
-         ),
-                (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
-                    GPy.kern.bias(self.input_dim, np.random.rand()) +
-                    GPy.kern.white(self.input_dim, np.random.rand())),
+#                     (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
+#                      GPy.kern.bias(self.input_dim, np.random.rand())),
+#         (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
+#          +GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
+#          #+GPy.kern.bias(self.input_dim, np.random.rand())
+#          #+GPy.kern.white(self.input_dim, np.random.rand())),
+#         ),
+#                     GPy.kern.white(self.input_dim, np.random.rand())),
 #                     GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
 #                       GPy.kern.linear(self.input_dim, ARD=False), GPy.kern.linear(self.input_dim, ARD=True),
 #                       GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim),
@@ -120,25 +120,25 @@ class Test(unittest.TestCase):
             diffs = []
             for i, q_x_sample_stripe in enumerate(np.array_split(self.q_x_samples, self.Nsamples / Nsamples)):
                 K = kern.K(q_x_sample_stripe, self.Z)
-                K = (K[:, :, None] * K[:, None, :]).mean(0)
-                K_ += K
-                diffs.append(((psi2 - (K_ / (i + 1)))**2).mean())
-            K_ /= self.Nsamples / Nsamples
+                K = (K[:, :, None] * K[:, None, :])
+                K_ += K.sum(0) / self.Nsamples
+                diffs.append(((psi2 - (K_*self.Nsamples/((i+1)*Nsamples)))**2).mean())
+            #K_ /= self.Nsamples / Nsamples
             msg = "psi2: {}".format("+".join([p.name + ard(p) for p in kern.parts]))
             try:
                 import pylab
                 pylab.figure(msg)
-                pylab.plot(diffs, marker='x', mew=1.3)
+                pylab.plot(diffs, marker='x', mew=.2)
 #                 print msg, np.allclose(psi2.squeeze(), K_, rtol=1e-1, atol=.1)
                 self.assertTrue(np.allclose(psi2.squeeze(), K_),
                                             #rtol=1e-1, atol=.1),
                                 msg=msg + ": not matching")
 #                 sys.stdout.write(".")
             except:
-#                 import ipdb;ipdb.set_trace()
 #                 kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
 #                 sys.stdout.write("E")
                 print msg + ": not matching"
+                import ipdb;ipdb.set_trace()
                 pass
 
 if __name__ == "__main__":
diff --git a/GPy/testing/psi_stat_gradient_tests.py b/GPy/testing/psi_stat_gradient_tests.py
index de670f41..edb0f02e 100644
--- a/GPy/testing/psi_stat_gradient_tests.py
+++ b/GPy/testing/psi_stat_gradient_tests.py
@@ -40,10 +40,9 @@ class PsiStatModel(Model):
         return self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance).sum()
     def _log_likelihood_gradients(self):
         psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance)
-        try:
-            psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance)
-        except AttributeError:
-            psiZ = numpy.zeros(self.num_inducing * self.input_dim)
+        #psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim)
+        psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance)
+        #psiZ = numpy.ones(self.num_inducing * self.input_dim)
         thetagrad = self.kern.__getattribute__("d" + self.which + "_dtheta")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance).flatten()
         return numpy.hstack((psimu.flatten(), psiS.flatten(), psiZ.flatten(), thetagrad))
 
@@ -116,9 +115,9 @@ if __name__ == "__main__":
 #         m.randomize()
 # #         self.assertTrue(m.checkgrad())
         numpy.random.seed(0)
-        input_dim = 5
-        N = 50
-        num_inducing = 10
+        input_dim = 3
+        N = 3
+        num_inducing = 2
         D = 15
         X = numpy.random.randn(N, input_dim)
         X_var = .5 * numpy.ones_like(X) + .1 * numpy.clip(numpy.random.randn(*X.shape), 0, 1)
@@ -143,10 +142,27 @@ if __name__ == "__main__":
 #                          num_inducing=num_inducing, kernel=kernel)
 #         m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
 #                          num_inducing=num_inducing, kernel=GPy.kern.rbf(input_dim))
-        m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-                         num_inducing=num_inducing, kernel=GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
+#         m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+#                          num_inducing=num_inducing, kernel=GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
         # + GPy.kern.bias(input_dim))
-#         m4 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim))
+#         m = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+#                          num_inducing=num_inducing, 
+#                          kernel=(
+#             GPy.kern.rbf(input_dim, ARD=1) 
+#             +GPy.kern.linear(input_dim, ARD=1) 
+#             +GPy.kern.bias(input_dim))
+#                          )
+#         m.ensure_default_constraints()
+        m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+                         num_inducing=num_inducing, kernel=(
+            GPy.kern.rbf(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1) 
+            #+GPy.kern.linear(input_dim, numpy.random.rand(input_dim), ARD=1) 
+            #+GPy.kern.rbf(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1) 
+            #+GPy.kern.rbf(input_dim, numpy.random.rand(), numpy.random.rand(), ARD=0) 
+            +GPy.kern.bias(input_dim)
+            +GPy.kern.white(input_dim)
+            )
+            )
+        m2.ensure_default_constraints()
     else:
         unittest.main()

From f114b9fff588fb84c8908af82ea7ee8490a4e755 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 20 Nov 2013 12:47:06 +0000
Subject: [PATCH 195/252] rename models to _models and import models in
 models.py

---
 GPy/_models/__init__.py                       | 19 ++++++++++++++++
 GPy/{models => _models}/bayesian_gplvm.py     |  4 ++--
 GPy/{models => _models}/bcgplvm.py            |  0
 .../fitc_classification.py                    |  0
 GPy/{models => _models}/gp_classification.py  |  0
 .../gp_multioutput_regression.py              |  0
 GPy/{models => _models}/gp_regression.py      |  1 -
 GPy/{models => _models}/gplvm.py              | 15 +++++--------
 GPy/{models => _models}/gradient_checker.py   |  0
 GPy/{models => _models}/mrd.py                |  4 ++--
 .../sparse_gp_classification.py               |  0
 .../sparse_gp_multioutput_regression.py       |  0
 .../sparse_gp_regression.py                   |  0
 GPy/{models => _models}/sparse_gplvm.py       |  4 ++--
 GPy/{models => _models}/svigp_regression.py   |  0
 GPy/{models => _models}/warped_gp.py          |  0
 GPy/models.py                                 | 22 +++++++++++++++++++
 GPy/models/__init__.py                        | 19 ----------------
 18 files changed, 53 insertions(+), 35 deletions(-)
 create mode 100644 GPy/_models/__init__.py
 rename GPy/{models => _models}/bayesian_gplvm.py (99%)
 rename GPy/{models => _models}/bcgplvm.py (100%)
 rename GPy/{models => _models}/fitc_classification.py (100%)
 rename GPy/{models => _models}/gp_classification.py (100%)
 rename GPy/{models => _models}/gp_multioutput_regression.py (100%)
 rename GPy/{models => _models}/gp_regression.py (98%)
 rename GPy/{models => _models}/gplvm.py (87%)
 rename GPy/{models => _models}/gradient_checker.py (100%)
 rename GPy/{models => _models}/mrd.py (99%)
 rename GPy/{models => _models}/sparse_gp_classification.py (100%)
 rename GPy/{models => _models}/sparse_gp_multioutput_regression.py (100%)
 rename GPy/{models => _models}/sparse_gp_regression.py (100%)
 rename GPy/{models => _models}/sparse_gplvm.py (96%)
 rename GPy/{models => _models}/svigp_regression.py (100%)
 rename GPy/{models => _models}/warped_gp.py (100%)
 create mode 100644 GPy/models.py
 delete mode 100644 GPy/models/__init__.py

diff --git a/GPy/_models/__init__.py b/GPy/_models/__init__.py
new file mode 100644
index 00000000..6fc93631
--- /dev/null
+++ b/GPy/_models/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+# from gp_regression import GPRegression; _gp_regression = gp_regression ; del gp_regression 
+# from gp_classification import GPClassification; _gp_classification = gp_classification ; del gp_classification 
+# from sparse_gp_regression import SparseGPRegression; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression 
+# from svigp_regression import SVIGPRegression; _svigp_regression = svigp_regression ; del svigp_regression 
+# from sparse_gp_classification import SparseGPClassification; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification 
+# from fitc_classification import FITCClassification; _fitc_classification = fitc_classification ; del fitc_classification 
+# from gplvm import GPLVM; _gplvm = gplvm ; del gplvm 
+# from bcgplvm import BCGPLVM; _bcgplvm = bcgplvm; del bcgplvm
+# from sparse_gplvm import SparseGPLVM; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm 
+# from warped_gp import WarpedGP; _warped_gp = warped_gp ; del warped_gp 
+# from bayesian_gplvm import BayesianGPLVM; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm 
+# from mrd import MRD; _mrd = mrd ; del mrd 
+# from gradient_checker import GradientChecker; _gradient_checker = gradient_checker ; del gradient_checker 
+# from gp_multioutput_regression import GPMultioutputRegression; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression 
+# from sparse_gp_multioutput_regression import SparseGPMultioutputRegression; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression 
+
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/_models/bayesian_gplvm.py
similarity index 99%
rename from GPy/models/bayesian_gplvm.py
rename to GPy/_models/bayesian_gplvm.py
index 21b46a8a..2b299ad8 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/_models/bayesian_gplvm.py
@@ -2,14 +2,14 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-from ..core import SparseGP
+from ..core.sparse_gp import SparseGP
 from ..likelihoods import Gaussian
 from .. import kern
 import itertools
 from matplotlib.colors import colorConverter
 from GPy.inference.optimization import SCG
 from GPy.util import plot_latent, linalg
-from GPy.models.gplvm import GPLVM
+from .gplvm import GPLVM
 from GPy.util.plot_latent import most_significant_input_dimensions
 from matplotlib import pyplot
 
diff --git a/GPy/models/bcgplvm.py b/GPy/_models/bcgplvm.py
similarity index 100%
rename from GPy/models/bcgplvm.py
rename to GPy/_models/bcgplvm.py
diff --git a/GPy/models/fitc_classification.py b/GPy/_models/fitc_classification.py
similarity index 100%
rename from GPy/models/fitc_classification.py
rename to GPy/_models/fitc_classification.py
diff --git a/GPy/models/gp_classification.py b/GPy/_models/gp_classification.py
similarity index 100%
rename from GPy/models/gp_classification.py
rename to GPy/_models/gp_classification.py
diff --git a/GPy/models/gp_multioutput_regression.py b/GPy/_models/gp_multioutput_regression.py
similarity index 100%
rename from GPy/models/gp_multioutput_regression.py
rename to GPy/_models/gp_multioutput_regression.py
diff --git a/GPy/models/gp_regression.py b/GPy/_models/gp_regression.py
similarity index 98%
rename from GPy/models/gp_regression.py
rename to GPy/_models/gp_regression.py
index 633fc1c8..8b44c1ba 100644
--- a/GPy/models/gp_regression.py
+++ b/GPy/_models/gp_regression.py
@@ -2,7 +2,6 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
-import numpy as np
 from ..core import GP
 from .. import likelihoods
 from .. import kern
diff --git a/GPy/models/gplvm.py b/GPy/_models/gplvm.py
similarity index 87%
rename from GPy/models/gplvm.py
rename to GPy/_models/gplvm.py
index 795389a7..f27f861c 100644
--- a/GPy/models/gplvm.py
+++ b/GPy/_models/gplvm.py
@@ -4,15 +4,11 @@
 
 import numpy as np
 import pylab as pb
-import sys, pdb
 from .. import kern
-from ..core import Model
-from ..util.linalg import pdinv, PCA
-from ..core.priors import Gaussian as Gaussian_prior
+from ..core import priors
 from ..core import GP
 from ..likelihoods import Gaussian
 from .. import util
-from GPy.util import plot_latent
 
 
 class GPLVM(GP):
@@ -34,12 +30,13 @@ class GPLVM(GP):
             kernel = kern.rbf(input_dim, ARD=input_dim > 1) + kern.bias(input_dim, np.exp(-2))
         likelihood = Gaussian(Y, normalize=normalize_Y, variance=np.exp(-2.))
         GP.__init__(self, X, likelihood, kernel, normalize_X=False)
-        self.set_prior('.*X', Gaussian_prior(0, 1))
+        self.set_prior('.*X', priors.Gaussian(0, 1))
         self.ensure_default_constraints()
 
     def initialise_latent(self, init, input_dim, Y):
         Xr = np.random.randn(Y.shape[0], input_dim)
         if init == 'PCA':
+            from ..util.linalg import PCA
             PC = PCA(Y, input_dim)[0]
             Xr[:PC.shape[0], :PC.shape[1]] = PC
         return Xr
@@ -62,15 +59,15 @@ class GPLVM(GP):
     def jacobian(self,X):
         target = np.zeros((X.shape[0],X.shape[1],self.output_dim))
         for i in range(self.output_dim):
-        	target[:,:,i] = self.kern.dK_dX(np.dot(self.Ki,self.likelihood.Y[:,i])[None, :],X,self.X)
+            target[:,:,i] = self.kern.dK_dX(np.dot(self.Ki,self.likelihood.Y[:,i])[None, :],X,self.X)
         return target
    
     def magnification(self,X):
         target=np.zeros(X.shape[0])
         J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
-    	J=self.jacobian(X)
+        J=self.jacobian(X)
         for i in range(X.shape[0]):
-		    target[i]=np.sqrt(pb.det(np.dot(J[i,:,:],np.transpose(J[i,:,:]))))
+            target[i]=np.sqrt(pb.det(np.dot(J[i,:,:],np.transpose(J[i,:,:]))))
         return target
 
     def plot(self):
diff --git a/GPy/models/gradient_checker.py b/GPy/_models/gradient_checker.py
similarity index 100%
rename from GPy/models/gradient_checker.py
rename to GPy/_models/gradient_checker.py
diff --git a/GPy/models/mrd.py b/GPy/_models/mrd.py
similarity index 99%
rename from GPy/models/mrd.py
rename to GPy/_models/mrd.py
index 2aaa731c..b9c99a64 100644
--- a/GPy/models/mrd.py
+++ b/GPy/_models/mrd.py
@@ -9,8 +9,8 @@ from GPy.util.linalg import PCA
 import numpy
 import itertools
 import pylab
-from GPy.kern.kern import kern
-from GPy.models.bayesian_gplvm import BayesianGPLVM
+from ..kern import kern
+from bayesian_gplvm import BayesianGPLVM
 
 class MRD(Model):
     """
diff --git a/GPy/models/sparse_gp_classification.py b/GPy/_models/sparse_gp_classification.py
similarity index 100%
rename from GPy/models/sparse_gp_classification.py
rename to GPy/_models/sparse_gp_classification.py
diff --git a/GPy/models/sparse_gp_multioutput_regression.py b/GPy/_models/sparse_gp_multioutput_regression.py
similarity index 100%
rename from GPy/models/sparse_gp_multioutput_regression.py
rename to GPy/_models/sparse_gp_multioutput_regression.py
diff --git a/GPy/models/sparse_gp_regression.py b/GPy/_models/sparse_gp_regression.py
similarity index 100%
rename from GPy/models/sparse_gp_regression.py
rename to GPy/_models/sparse_gp_regression.py
diff --git a/GPy/models/sparse_gplvm.py b/GPy/_models/sparse_gplvm.py
similarity index 96%
rename from GPy/models/sparse_gplvm.py
rename to GPy/_models/sparse_gplvm.py
index 6e7e40b1..ab616d5a 100644
--- a/GPy/models/sparse_gplvm.py
+++ b/GPy/_models/sparse_gplvm.py
@@ -5,8 +5,8 @@
 import numpy as np
 import pylab as pb
 import sys, pdb
-from GPy.models.sparse_gp_regression import SparseGPRegression
-from GPy.models.gplvm import GPLVM
+from sparse_gp_regression import SparseGPRegression
+from gplvm import GPLVM
 # from .. import kern
 # from ..core import model
 # from ..util.linalg import pdinv, PCA
diff --git a/GPy/models/svigp_regression.py b/GPy/_models/svigp_regression.py
similarity index 100%
rename from GPy/models/svigp_regression.py
rename to GPy/_models/svigp_regression.py
diff --git a/GPy/models/warped_gp.py b/GPy/_models/warped_gp.py
similarity index 100%
rename from GPy/models/warped_gp.py
rename to GPy/_models/warped_gp.py
diff --git a/GPy/models.py b/GPy/models.py
new file mode 100644
index 00000000..9a847ea0
--- /dev/null
+++ b/GPy/models.py
@@ -0,0 +1,22 @@
+'''
+Created on 14 Nov 2013
+
+@author: maxz
+'''
+
+from _models.bayesian_gplvm import BayesianGPLVM
+from _models.gp_regression import GPRegression
+from _models.gp_classification import GPClassification#; _gp_classification = gp_classification ; del gp_classification 
+from _models.sparse_gp_regression import SparseGPRegression#; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression 
+from _models.svigp_regression import SVIGPRegression#; _svigp_regression = svigp_regression ; del svigp_regression 
+from _models.sparse_gp_classification import SparseGPClassification#; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification 
+from _models.fitc_classification import FITCClassification#; _fitc_classification = fitc_classification ; del fitc_classification 
+from _models.gplvm import GPLVM#; _gplvm = gplvm ; del gplvm 
+from _models.bcgplvm import BCGPLVM#; _bcgplvm = bcgplvm; del bcgplvm
+from _models.sparse_gplvm import SparseGPLVM#; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm 
+from _models.warped_gp import WarpedGP#; _warped_gp = warped_gp ; del warped_gp 
+from _models.bayesian_gplvm import BayesianGPLVM#; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm 
+from _models.mrd import MRD#; _mrd = mrd; del mrd 
+from _models.gradient_checker import GradientChecker#; _gradient_checker = gradient_checker ; del gradient_checker 
+from _models.gp_multioutput_regression import GPMultioutputRegression#; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression 
+from _models.sparse_gp_multioutput_regression import SparseGPMultioutputRegression#; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression 
diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
deleted file mode 100644
index a8be5890..00000000
--- a/GPy/models/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-from gp_regression import GPRegression; _gp_regression = gp_regression ; del gp_regression 
-from gp_classification import GPClassification; _gp_classification = gp_classification ; del gp_classification 
-from sparse_gp_regression import SparseGPRegression; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression 
-from svigp_regression import SVIGPRegression; _svigp_regression = svigp_regression ; del svigp_regression 
-from sparse_gp_classification import SparseGPClassification; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification 
-from fitc_classification import FITCClassification; _fitc_classification = fitc_classification ; del fitc_classification 
-from gplvm import GPLVM; _gplvm = gplvm ; del gplvm 
-from bcgplvm import BCGPLVM; _bcgplvm = bcgplvm; del bcgplvm
-from sparse_gplvm import SparseGPLVM; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm 
-from warped_gp import WarpedGP; _warped_gp = warped_gp ; del warped_gp 
-from bayesian_gplvm import BayesianGPLVM; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm 
-from mrd import MRD; _mrd = mrd ; del mrd 
-from gradient_checker import GradientChecker; _gradient_checker = gradient_checker ; del gradient_checker 
-from gp_multioutput_regression import GPMultioutputRegression; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression 
-from sparse_gp_multioutput_regression import SparseGPMultioutputRegression; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression 
-

From d4dff8360bd8770c853e709d9fc030b799c2d962 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 20 Nov 2013 12:47:55 +0000
Subject: [PATCH 196/252] testing imports update and expected failure for
 crossterms

---
 GPy/testing/bgplvm_tests.py            |  2 +-
 GPy/testing/psi_stat_gradient_tests.py | 34 ++++++++++++++++++--------
 GPy/testing/sparse_gplvm_tests.py      |  2 +-
 GPy/testing/unit_tests.py              |  2 ++
 4 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/GPy/testing/bgplvm_tests.py b/GPy/testing/bgplvm_tests.py
index a8777e11..1192448a 100644
--- a/GPy/testing/bgplvm_tests.py
+++ b/GPy/testing/bgplvm_tests.py
@@ -4,7 +4,7 @@
 import unittest
 import numpy as np
 import GPy
-from GPy.models.bayesian_gplvm import BayesianGPLVM
+from ..models import BayesianGPLVM
 
 class BGPLVMTests(unittest.TestCase):
     def test_bias_kern(self):
diff --git a/GPy/testing/psi_stat_gradient_tests.py b/GPy/testing/psi_stat_gradient_tests.py
index edb0f02e..e373aaa3 100644
--- a/GPy/testing/psi_stat_gradient_tests.py
+++ b/GPy/testing/psi_stat_gradient_tests.py
@@ -63,40 +63,54 @@ class DPsiStatTest(unittest.TestCase):
 
     def testPsi0(self):
         for k in self.kernels:
-            m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,
+            m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,\
                              num_inducing=self.num_inducing, kernel=k)
+            m.ensure_default_constraints()
+            m.randomize()
             assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k.parts)))
-
-#     def testPsi1(self):
-#         for k in self.kernels:
-#             m = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z,
-#                      num_inducing=self.num_inducing, kernel=k)
-#             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
+        
+    def testPsi1(self):
+        for k in self.kernels:
+            m = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z,
+                     num_inducing=self.num_inducing, kernel=k)
+            m.ensure_default_constraints()
+            m.randomize()
+            assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
 
     def testPsi2_lin(self):
         k = self.kernels[0]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     num_inducing=self.num_inducing, kernel=k)
+                 num_inducing=self.num_inducing, kernel=k)
+        m.ensure_default_constraints()
+        m.randomize()
         assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
     def testPsi2_lin_bia(self):
         k = self.kernels[3]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
+        m.ensure_default_constraints()
+        m.randomize()
         assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
     def testPsi2_rbf(self):
         k = self.kernels[1]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
+        m.ensure_default_constraints()
+        m.randomize()
         assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
     def testPsi2_rbf_bia(self):
         k = self.kernels[-1]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
+        m.ensure_default_constraints()
+        m.randomize()
         assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
     def testPsi2_bia(self):
         k = self.kernels[2]
         m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                      num_inducing=self.num_inducing, kernel=k)
+        m.ensure_default_constraints()
+        m.randomize()
         assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
 
 
@@ -134,8 +148,8 @@ if __name__ == "__main__":
 #                      num_inducing=num_inducing, kernel=k)
 #             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
 #
-#         m0 = PsiStatModel('psi0', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=GPy.kern.linear(input_dim))
+        m0 = PsiStatModel('psi0', X=X, X_variance=X_var, Z=Z,
+                         num_inducing=num_inducing, kernel=GPy.kern.rbf(input_dim)+GPy.kern.bias(input_dim))
 #         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
 #                          num_inducing=num_inducing, kernel=kernel)
 #         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
diff --git a/GPy/testing/sparse_gplvm_tests.py b/GPy/testing/sparse_gplvm_tests.py
index e27fccff..c3942b95 100644
--- a/GPy/testing/sparse_gplvm_tests.py
+++ b/GPy/testing/sparse_gplvm_tests.py
@@ -4,7 +4,7 @@
 import unittest
 import numpy as np
 import GPy
-from GPy.models.sparse_gplvm import SparseGPLVM
+from ..models import SparseGPLVM
 
 class sparse_GPLVMTests(unittest.TestCase):
     def test_bias_kern(self):
diff --git a/GPy/testing/unit_tests.py b/GPy/testing/unit_tests.py
index 818cb56e..69a15a7f 100644
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@@ -163,11 +163,13 @@ class GradientTests(unittest.TestCase):
         rbflin = GPy.kern.rbf(2) + GPy.kern.linear(2)
         self.check_model(rbflin, model_type='SparseGPRegression', dimension=2)
 
+    @unittest.expectedFailure
     def test_SparseGPRegression_rbf_linear_white_kern_2D_uncertain_inputs(self):
         ''' Testing the sparse GP regression with rbf, linear kernel on 2d data with uncertain inputs'''
         rbflin = GPy.kern.rbf(2) + GPy.kern.linear(2)
         self.check_model(rbflin, model_type='SparseGPRegression', dimension=2, uncertain_inputs=1)
 
+    @unittest.expectedFailure
     def test_SparseGPRegression_rbf_linear_white_kern_1D_uncertain_inputs(self):
         ''' Testing the sparse GP regression with rbf, linear kernel on 1d data with uncertain inputs'''
         rbflin = GPy.kern.rbf(1) + GPy.kern.linear(1)

From f04a4fa98bc394fb41c9e6914006f92c100ad280 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 20 Nov 2013 12:48:09 +0000
Subject: [PATCH 197/252] dim reduction imports

---
 GPy/examples/dimensionality_reduction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 666209f9..cdd69ab5 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -6,8 +6,8 @@ from matplotlib import pyplot as plt, cm
 
 import GPy
 from GPy.core.transformations import logexp
-from GPy.models.bayesian_gplvm import BayesianGPLVM
 from GPy.likelihoods.gaussian import Gaussian
+from GPy.models import BayesianGPLVM
 
 default_seed = np.random.seed(123344)
 

From 3a08c0d9ab546a7a5969c7c80e83f2fc90054329 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 20 Nov 2013 14:37:14 +0000
Subject: [PATCH 198/252] skipping crossterm tests instead of expected failure

---
 GPy/testing/unit_tests.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/GPy/testing/unit_tests.py b/GPy/testing/unit_tests.py
index 69a15a7f..9269a4c4 100644
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@@ -163,16 +163,18 @@ class GradientTests(unittest.TestCase):
         rbflin = GPy.kern.rbf(2) + GPy.kern.linear(2)
         self.check_model(rbflin, model_type='SparseGPRegression', dimension=2)
 
-    @unittest.expectedFailure
+    #@unittest.expectedFailure
     def test_SparseGPRegression_rbf_linear_white_kern_2D_uncertain_inputs(self):
         ''' Testing the sparse GP regression with rbf, linear kernel on 2d data with uncertain inputs'''
         rbflin = GPy.kern.rbf(2) + GPy.kern.linear(2)
+        raise unittest.SkipTest("This is not implemented yet!")
         self.check_model(rbflin, model_type='SparseGPRegression', dimension=2, uncertain_inputs=1)
 
-    @unittest.expectedFailure
+    #@unittest.expectedFailure
     def test_SparseGPRegression_rbf_linear_white_kern_1D_uncertain_inputs(self):
         ''' Testing the sparse GP regression with rbf, linear kernel on 1d data with uncertain inputs'''
         rbflin = GPy.kern.rbf(1) + GPy.kern.linear(1)
+        raise unittest.SkipTest("This is not implemented yet!")
         self.check_model(rbflin, model_type='SparseGPRegression', dimension=1, uncertain_inputs=1)
 
     def test_GPLVM_rbf_bias_white_kern_2D(self):

From f9e2a389e862a56d43ecefd96788982fae60be73 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Thu, 21 Nov 2013 20:20:03 +0000
Subject: [PATCH 199/252] Committing change for master check out.

---
 GPy/kern/parts/sympykern.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py
index 7f7fba11..7b98e47b 100644
--- a/GPy/kern/parts/sympykern.py
+++ b/GPy/kern/parts/sympykern.py
@@ -345,7 +345,7 @@ class spkern(Kernpart):
         self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z[', 'X[')
         self._dK_dX_code_X = self._dK_dX_code.replace('Z[', 'X[').replace('+= partial[', '+= 2*partial[')
         self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z2(', 'X2(')
-        self._dK_dX_code_X = self._dK_dX_code.replace('Z2(', 'X2(')
+        self._dK_dX_code_X = self._dK_dX_code_X.replace('Z2(', 'X2(')
 
 
         #TODO: insert multiple functions here via string manipulation

From a8cf725102af1dee769207472bbf59ccede8eec8 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Thu, 21 Nov 2013 20:51:28 +0000
Subject: [PATCH 200/252] removed some sympy stuff

---
 GPy/kern/constructors.py    | 22 -----------------
 GPy/kern/parts/sympykern.py |  8 +++---
 GPy/testing/kernel_tests.py |  6 +----
 GPy/util/symbolic.py        | 49 -------------------------------------
 4 files changed, 6 insertions(+), 79 deletions(-)

diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 083960b4..4ab06bba 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -292,7 +292,6 @@ except ImportError:
 if sympy_available:
     from parts.sympykern import spkern
     from sympy.parsing.sympy_parser import parse_expr
-    from GPy.util.symbolic import sinc
     
     def rbf_sympy(input_dim, ARD=False, variance=1., lengthscale=1.):
         """
@@ -337,27 +336,6 @@ if sympy_available:
             f =  scale_i*scale_j*sp.exp(-dist/(2*(lengthscale_i**2 + lengthscale_j**2 + shared_lengthscale**2)))
         return kern(input_dim, [spkern(input_dim, f, output_dim=output_dim, name='eq_sympy')])
 
-    def sinc(input_dim, ARD=False, variance=1., lengthscale=1.):
-        """
-        TODO: Not clear why this isn't working, suggests argument of sinc is not a number.
-        sinc covariance funciton
-        """
-        X = sp.symbols('x_:' + str(input_dim))
-        Z = sp.symbols('z_:' + str(input_dim))
-        variance = sp.var('variance',positive=True)
-        if ARD:
-            lengthscales = [sp.var('lengthscale_%i' % i, positive=True) for i in range(input_dim)]
-            dist_string = ' + '.join(['(x_%i-z_%i)**2/lengthscale_%i**2' % (i, i, i) for i in range(input_dim)])
-            dist = parse_expr(dist_string)
-            f =  variance*sinc(sp.pi*sp.sqrt(dist))
-        else:
-            lengthscale = sp.var('lengthscale',positive=True)
-            dist_string = ' + '.join(['(x_%i-z_%i)**2' % (i, i) for i in range(input_dim)])
-            dist = parse_expr(dist_string)
-            f =  variance*sinc(sp.pi*sp.sqrt(dist)/lengthscale)
-            
-        return kern(input_dim, [spkern(input_dim, f, name='sinc')])
-
     def sympykern(input_dim, k=None, output_dim=1, name=None, param=None):
         """
         A base kernel object, where all the hard work in done by sympy.
diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py
index 7f7fba11..d109fea7 100644
--- a/GPy/kern/parts/sympykern.py
+++ b/GPy/kern/parts/sympykern.py
@@ -11,6 +11,7 @@ import tempfile
 import pdb
 import ast
 from kernpart import Kernpart
+from ...util.config import config
 
 class spkern(Kernpart):
     """
@@ -110,8 +111,9 @@ class spkern(Kernpart):
             'headers':['"sympy_helpers.h"'],
             'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp")],
             'extra_compile_args':extra_compile_args,
-            'extra_link_args':['-lgomp'],
+            'extra_link_args':[],
             'verbose':True}
+        if config.getboolean('parallel', 'openmp'): self.weave_kwargs.append('-lgomp')
 
     def __add__(self,other):
         return spkern(self._sp_k+other._sp_k)
@@ -343,9 +345,9 @@ class spkern(Kernpart):
 
         # Code to use when only X is provided. 
         self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z[', 'X[')
-        self._dK_dX_code_X = self._dK_dX_code.replace('Z[', 'X[').replace('+= partial[', '+= 2*partial[')
+        self._dK_dX_code_X = self._dK_dX_code.replace('Z[', 'X[').replace('+= partial[', '+= 2*partial[') 
         self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z2(', 'X2(')
-        self._dK_dX_code_X = self._dK_dX_code.replace('Z2(', 'X2(')
+        self._dK_dX_code_X = self._dK_dX_code_X.replace('Z2(', 'X2(')
 
 
         #TODO: insert multiple functions here via string manipulation
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index f64dac2b..301fa54f 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -34,11 +34,7 @@ class KernelTests(unittest.TestCase):
             self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
 
     def test_eq_sympykernel(self):
-        kern = GPy.kern.eq_sympy(5, 3, output_ind=4)
-        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
-
-    def test_sinckernel(self):
-        kern = GPy.kern.sinc(5)
+        kern = GPy.kern.eq_sympy(5, 3)
         self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
 
     def test_rbf_invkernel(self):
diff --git a/GPy/util/symbolic.py b/GPy/util/symbolic.py
index 395f9e3e..4b660c7f 100644
--- a/GPy/util/symbolic.py
+++ b/GPy/util/symbolic.py
@@ -237,52 +237,3 @@ class erfcx(Function):
     def eval(cls, arg):
         return erfc(arg)*exp(arg*arg)
 
-class sinc_grad(Function):
-    nargs = 1
-    
-    def fdiff(self, argindex=1):
-        if argindex==1:
-            # Strictly speaking this should be computed separately, as it won't work when x=0. See http://calculus.subwiki.org/wiki/Sinc_function
-            return ((2-x*x)*sin(self.args[0]) - 2*x*cos(x))/(x*x*x)
-        else:
-            raise ArgumentIndexError(self, argindex)
-
-    
-    @classmethod
-    def eval(cls, x):
-        if x.is_Number:
-            if x is S.NaN:
-                return S.NaN
-            elif x is S.Zero:
-                return S.Zero
-            else:
-                return (x*cos(x) - sin(x))/(x*x)
-            
-class sinc(Function):
-    
-    nargs = 1
-    
-    def fdiff(self, argindex=1):
-        if argindex==1:
-            return sinc_grad(self.args[0])
-        else:
-            raise ArgumentIndexError(self, argindex)
-
-    
-    @classmethod
-    def eval(cls, arg):
-        if arg.is_Number:
-            if arg is S.NaN:
-                return S.NaN
-            elif arg is S.Zero:
-                return S.One
-            else:
-                return sin(arg)/arg
-
-        if arg.func is asin:
-            x = arg.args[0]
-            return x / arg
-
-    def _eval_is_real(self):
-        return self.args[0].is_real
-

From 1deb1bee86871df6ec70b92e2f7928450094dc27 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Thu, 21 Nov 2013 21:42:09 +0000
Subject: [PATCH 201/252] Merge with James's changes

---
 GPy/kern/constructors.py    | 22 ----------------------
 GPy/testing/kernel_tests.py |  9 +++------
 2 files changed, 3 insertions(+), 28 deletions(-)

diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 083960b4..4ab06bba 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -292,7 +292,6 @@ except ImportError:
 if sympy_available:
     from parts.sympykern import spkern
     from sympy.parsing.sympy_parser import parse_expr
-    from GPy.util.symbolic import sinc
     
     def rbf_sympy(input_dim, ARD=False, variance=1., lengthscale=1.):
         """
@@ -337,27 +336,6 @@ if sympy_available:
             f =  scale_i*scale_j*sp.exp(-dist/(2*(lengthscale_i**2 + lengthscale_j**2 + shared_lengthscale**2)))
         return kern(input_dim, [spkern(input_dim, f, output_dim=output_dim, name='eq_sympy')])
 
-    def sinc(input_dim, ARD=False, variance=1., lengthscale=1.):
-        """
-        TODO: Not clear why this isn't working, suggests argument of sinc is not a number.
-        sinc covariance funciton
-        """
-        X = sp.symbols('x_:' + str(input_dim))
-        Z = sp.symbols('z_:' + str(input_dim))
-        variance = sp.var('variance',positive=True)
-        if ARD:
-            lengthscales = [sp.var('lengthscale_%i' % i, positive=True) for i in range(input_dim)]
-            dist_string = ' + '.join(['(x_%i-z_%i)**2/lengthscale_%i**2' % (i, i, i) for i in range(input_dim)])
-            dist = parse_expr(dist_string)
-            f =  variance*sinc(sp.pi*sp.sqrt(dist))
-        else:
-            lengthscale = sp.var('lengthscale',positive=True)
-            dist_string = ' + '.join(['(x_%i-z_%i)**2' % (i, i) for i in range(input_dim)])
-            dist = parse_expr(dist_string)
-            f =  variance*sinc(sp.pi*sp.sqrt(dist)/lengthscale)
-            
-        return kern(input_dim, [spkern(input_dim, f, name='sinc')])
-
     def sympykern(input_dim, k=None, output_dim=1, name=None, param=None):
         """
         A base kernel object, where all the hard work in done by sympy.
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index f64dac2b..f75eb580 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -34,12 +34,9 @@ class KernelTests(unittest.TestCase):
             self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
 
     def test_eq_sympykernel(self):
-        kern = GPy.kern.eq_sympy(5, 3, output_ind=4)
-        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
-
-    def test_sinckernel(self):
-        kern = GPy.kern.sinc(5)
-        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
+        if SYMPY_AVAILABLE:
+            kern = GPy.kern.eq_sympy(5, 3, output_ind=4)
+            self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
 
     def test_rbf_invkernel(self):
         kern = GPy.kern.rbf_inv(5)

From fedaa5e1f1b6876ca6c41b7923a1e4b347a48f2d Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Thu, 21 Nov 2013 22:15:20 +0000
Subject: [PATCH 202/252] Fixed bug in sympy kernel and added sympolic.py back
 into utils __init__.py

---
 GPy/kern/parts/sympykern.py | 6 +++---
 GPy/util/__init__.py        | 1 +
 GPy/util/symbolic.py        | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/GPy/kern/parts/sympykern.py b/GPy/kern/parts/sympykern.py
index d109fea7..bcd52fe2 100644
--- a/GPy/kern/parts/sympykern.py
+++ b/GPy/kern/parts/sympykern.py
@@ -345,8 +345,8 @@ class spkern(Kernpart):
 
         # Code to use when only X is provided. 
         self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z[', 'X[')
-        self._dK_dX_code_X = self._dK_dX_code.replace('Z[', 'X[').replace('+= partial[', '+= 2*partial[') 
-        self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z2(', 'X2(')
+        self._dK_dX_code_X = self._dK_dX_code.replace('Z[', 'X[').replace('+= PARTIAL2(', '+= 2*PARTIAL2(') 
+        self._dK_dtheta_code_X = self._dK_dtheta_code_X.replace('Z2(', 'X2(')
         self._dK_dX_code_X = self._dK_dX_code_X.replace('Z2(', 'X2(')
 
 
@@ -402,7 +402,7 @@ class spkern(Kernpart):
             self._weave_inline(self._dK_dX_code, X, target, Z, partial)
 
     def dKdiag_dX(self,partial,X,target):
-        self._weave.inline(self._dKdiag_dX_code, X, target, Z, partial)
+        self._weave_inline(self._dKdiag_dX_code, X, target, Z=None, partial=partial)
 
     def compute_psi_stats(self):
         #define some normal distributions
diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index db9b7362..629b3f48 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -14,5 +14,6 @@ import visualize
 import decorators
 import classification
 import latent_space_visualizations
+import symbolic
 
 import netpbmfile
diff --git a/GPy/util/symbolic.py b/GPy/util/symbolic.py
index 4b660c7f..49c8c33a 100644
--- a/GPy/util/symbolic.py
+++ b/GPy/util/symbolic.py
@@ -1,4 +1,4 @@
-from sympy import Function, S, oo, I, cos, sin, asin, log, erf,pi,exp,sqrt,sign
+from sympy import Function, S, oo, I, cos, sin, asin, log, erf, pi, exp, sqrt, sign
 
 
 class ln_diff_erf(Function):

From 09de9d7195ca8f6770cd28d695d92d6b9682bfd9 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Thu, 21 Nov 2013 22:35:58 +0000
Subject: [PATCH 203/252] Added eq_ode1 to constructors.py

---
 GPy/kern/constructors.py    | 40 +++++++++++++++++++++++++++++++++----
 GPy/testing/kernel_tests.py |  5 +++++
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 4ab06bba..500ab92f 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -5,6 +5,7 @@ import numpy as np
 from kern import kern
 import parts
 
+
 def rbf_inv(input_dim,variance=1., inv_lengthscale=None,ARD=False):
     """
     Construct an RBF kernel
@@ -292,7 +293,8 @@ except ImportError:
 if sympy_available:
     from parts.sympykern import spkern
     from sympy.parsing.sympy_parser import parse_expr
-    
+    from GPy.util import symbolic
+
     def rbf_sympy(input_dim, ARD=False, variance=1., lengthscale=1.):
         """
         Radial Basis Function covariance.
@@ -312,9 +314,19 @@ if sympy_available:
             f =  variance*sp.exp(-dist/(2*lengthscale**2))
         return kern(input_dim, [spkern(input_dim, f, name='rbf_sympy')])
 
-    def eq_sympy(input_dim, output_dim, ARD=False, variance=1., lengthscale=1.):
+    def eq_sympy(input_dim, output_dim, ARD=False):
         """
-        Exponentiated quadratic with multiple outputs.
+        Latent force model covariance, exponentiated quadratic with multiple outputs. Derived from a diffusion equation with the initial spatial condition layed down by a Gaussian process with lengthscale given by shared_lengthscale.
+
+        See IEEE Trans Pattern Anal Mach Intell. 2013 Nov;35(11):2693-705. doi: 10.1109/TPAMI.2013.86. Linear latent force models using Gaussian processes. Alvarez MA, Luengo D, Lawrence ND.
+
+        :param input_dim: Dimensionality of the kernel
+        :type input_dim: int
+        :param output_dim: number of outputs in the covariance function.
+        :type output_dim: int
+        :param ARD: whether or not to user ARD (default False).
+        :type ARD: bool
+
         """
         real_input_dim = input_dim
         if output_dim>1:
@@ -325,7 +337,7 @@ if sympy_available:
         if ARD:
             lengthscales = [sp.var('lengthscale%i_i lengthscale%i_j' % i, positive=True) for i in range(real_input_dim)]
             shared_lengthscales = [sp.var('shared_lengthscale%i' % i, positive=True) for i in range(real_input_dim)]
-            dist_string = ' + '.join(['(x_%i-z_%i)**2/(shared_lengthscale%i**2 + lengthscale%i_i*lengthscale%i_j)' % (i, i, i) for i in range(real_input_dim)])
+            dist_string = ' + '.join(['(x_%i-z_%i)**2/(shared_lengthscale%i**2 + lengthscale%i_i**2 + lengthscale%i_j**2)' % (i, i, i) for i in range(real_input_dim)])
             dist = parse_expr(dist_string)
             f =  variance*sp.exp(-dist/2.)
         else:
@@ -336,6 +348,26 @@ if sympy_available:
             f =  scale_i*scale_j*sp.exp(-dist/(2*(lengthscale_i**2 + lengthscale_j**2 + shared_lengthscale**2)))
         return kern(input_dim, [spkern(input_dim, f, output_dim=output_dim, name='eq_sympy')])
 
+    def ode1_eq(output_dim=1):
+        """
+        Latent force model covariance, first order differential
+        equation driven by exponentiated quadratic.
+
+        See N. D. Lawrence, G. Sanguinetti and M. Rattray. (2007)
+        'Modelling transcriptional regulation using Gaussian
+        processes' in B. Schoelkopf, J. C. Platt and T. Hofmann (eds)
+        Advances in Neural Information Processing Systems, MIT Press,
+        Cambridge, MA, pp 785--792.
+
+        :param output_dim: number of outputs in the covariance function.
+        :type output_dim: int
+        """
+        input_dim = 2
+        x_0, z_0, decay_i, decay_j, scale_i, scale_j, lengthscale = sp.symbols('x_0, z_0, decay_i, decay_j, scale_i, scale_j, lengthscale')
+        f = scale_i*scale_j*(symbolic.h(x_0, z_0, decay_i, decay_j, lengthscale) 
+     + symbolic.h(z_0, x_0, decay_j, decay_i, lengthscale))
+        return kern(input_dim, [spkern(input_dim, f, output_dim=output_dim, name='ode1_eq')])
+
     def sympykern(input_dim, k=None, output_dim=1, name=None, param=None):
         """
         A base kernel object, where all the hard work in done by sympy.
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index a2194b65..92cad687 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -38,6 +38,11 @@ class KernelTests(unittest.TestCase):
             kern = GPy.kern.eq_sympy(5, 3)
             self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
 
+    def test_eq_ode1kernel(self):
+        if SYMPY_AVAILABLE:
+            kern = GPy.kern.eq_ode1(3)
+            self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
+
     def test_rbf_invkernel(self):
         kern = GPy.kern.rbf_inv(5)
         self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))

From 98b9dc0163e376b2e5b76872a8bc77c91916c591 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Thu, 21 Nov 2013 23:07:43 +0000
Subject: [PATCH 204/252] eq_ode1 working but test failing?

---
 GPy/kern/constructors.py    | 27 ---------------------------
 GPy/kern/kern.py            | 12 ++++++++----
 GPy/testing/kernel_tests.py |  8 ++++----
 3 files changed, 12 insertions(+), 35 deletions(-)

diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 500ab92f..05eaa028 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -150,33 +150,6 @@ def white(input_dim,variance=1.):
     part = parts.white.White(input_dim,variance)
     return kern(input_dim, [part])
 
-def eq_ode1(output_dim, W=None, rank=1,  kappa=None, length_scale=1., decay=None, delay=None):
-    """Covariance function for first order differential equation driven by an exponentiated quadratic covariance.
-
-    This outputs of this kernel have the form
-    .. math::
-       \frac{\text{d}y_j}{\text{d}t} = \sum_{i=1}^R w_{j,i} f_i(t-\delta_j) +\sqrt{\kappa_j}g_j(t) - d_jy_j(t)
-
-    where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`f_i(t)` and :math:`g_i(t)` are independent latent Gaussian processes goverened by an exponentiated quadratic covariance.
-    
-    :param output_dim: number of outputs driven by latent function.
-    :type output_dim: int
-    :param W: sensitivities of each output to the latent driving function. 
-    :type W: ndarray (output_dim x rank).
-    :param rank: If rank is greater than 1 then there are assumed to be a total of rank latent forces independently driving the system, each with identical covariance.
-    :type rank: int
-    :param decay: decay rates for the first order system. 
-    :type decay: array of length output_dim.
-    :param delay: delay between latent force and output response.
-    :type delay: array of length output_dim.
-    :param kappa: diagonal term that allows each latent output to have an independent component to the response.
-    :type kappa: array of length output_dim.
-    
-    .. Note: see first order differential equation examples in GPy.examples.regression for some usage.
-    """
-    part = parts.eq_ode1.Eq_ode1(output_dim, W, rank, kappa, length_scale, decay, delay)
-    return kern(2, [part])
-
 
 def exponential(input_dim,variance=1., lengthscale=None, ARD=False):
     """
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index f021dc3a..46bb01c8 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -747,7 +747,7 @@ class Kern_check_model(Model):
         if kernel==None:
             kernel = GPy.kern.rbf(1)
         if X==None:
-            X = np.random.randn(num_samples, kernel.input_dim)
+            X = np.random.normal(size=(num_samples, kernel.input_dim))
         if dL_dK==None:
             if X2==None:
                 dL_dK = np.ones((X.shape[0], X.shape[0]))
@@ -844,7 +844,7 @@ class Kern_check_dKdiag_dX(Kern_check_model):
     def _set_params(self, x):
         self.X=x.reshape(self.X.shape)
 
-def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False):
+def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False, X_positive=False):
     """This function runs on kernels to check the correctness of their implementation. It checks that the covariance function is positive definite for a randomly generated data set.
 
     :param kern: the kernel to be tested.
@@ -858,12 +858,16 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False):
     pass_checks = True
     if X==None:
         X = np.random.randn(10, kern.input_dim)
+        if X_positive:
+            X = abs(X)
         if output_ind is not None:
-            X[:, output_ind] = np.random.randint(kern.output_dim, X.shape[0])
+            X[:, output_ind] = np.random.randint(kern.parts[0].output_dim, X.shape[0])
     if X2==None:
         X2 = np.random.randn(20, kern.input_dim)
+        if X_positive:
+            X2 = abs(X2)
         if output_ind is not None:
-            X2[:, output_ind] = np.random.randint(kern.output_dim, X2.shape[0])
+            X2[:, output_ind] = np.random.randint(kern.parts[0].output_dim, X2.shape[0])
 
     if verbose:
         print("Checking covariance function is positive definite.")
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index 92cad687..5d2fbeec 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -36,12 +36,12 @@ class KernelTests(unittest.TestCase):
     def test_eq_sympykernel(self):
         if SYMPY_AVAILABLE:
             kern = GPy.kern.eq_sympy(5, 3)
-            self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
+            self.assertTrue(GPy.kern.kern_test(kern, output_ind=3, verbose=verbose))
 
-    def test_eq_ode1kernel(self):
+    def test_ode1_eqkernel(self):
         if SYMPY_AVAILABLE:
-            kern = GPy.kern.eq_ode1(3)
-            self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
+            kern = GPy.kern.ode1_eq(3)
+            self.assertTrue(GPy.kern.kern_test(kern, output_ind=1, verbose=verbose, X_positive=True))
 
     def test_rbf_invkernel(self):
         kern = GPy.kern.rbf_inv(5)

From 5b1f7002389f4fb2fc4c9e75e32cfb26a4e7680d Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Fri, 22 Nov 2013 08:58:29 +0000
Subject: [PATCH 205/252] changed nasty whitespace

---
 GPy/core/mapping.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py
index 0da93c7c..5f517706 100644
--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@@ -36,7 +36,6 @@ class Mapping(Parameterized):
 
     def df_dtheta(self, dL_df, X):
         """The gradient of the outputs of the multi-layer perceptron with respect to each of the parameters.
-        
         :param dL_df: gradient of the objective with respect to the function.
         :type dL_df: ndarray (num_data x output_dim)
         :param X: input locations where the function is evaluated.
@@ -44,14 +43,13 @@ class Mapping(Parameterized):
         :returns: Matrix containing gradients with respect to parameters of each output for each input data.
         :rtype: ndarray (num_params length)
         """
-        
         raise NotImplementedError
 
     def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue']):
         """
 
         Plot the mapping.
-        
+
         Plots the mapping associated with the model.
           - In one dimension, the function is plotted.
           - In two dimsensions, a contour-plot shows the function
@@ -110,7 +108,7 @@ class Mapping(Parameterized):
             for d in range(y.shape[1]):
                 ax.plot(Xnew, f[:, d], edgecol=linecol)
 
-        elif self.X.shape[1] == 2: 
+        elif self.X.shape[1] == 2:
             resolution = resolution or 50
             Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
             x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
@@ -135,14 +133,14 @@ class Mapping_check_model(Model):
             X = np.random.randn(num_samples, mapping.input_dim)
         if dL_df==None:
             dL_df = np.ones((num_samples, mapping.output_dim))
-        
+
         self.mapping=mapping
         self.X = X
         self.dL_df = dL_df
         self.num_params = self.mapping.num_params
         Model.__init__(self)
 
-        
+
     def _get_params(self):
         return self.mapping._get_params()
 
@@ -157,7 +155,7 @@ class Mapping_check_model(Model):
 
     def _log_likelihood_gradients(self):
         raise NotImplementedError, "This needs to be implemented to use the Mapping_check_model class."
-    
+
 class Mapping_check_df_dtheta(Mapping_check_model):
     """This class allows gradient checks for the gradient of a mapping with respect to parameters. """
     def __init__(self, mapping=None, dL_df=None, X=None):
@@ -175,13 +173,13 @@ class Mapping_check_df_dX(Mapping_check_model):
         if dL_df==None:
             dL_df = np.ones((self.X.shape[0],self.mapping.output_dim))
         self.num_params = self.X.shape[0]*self.mapping.input_dim
-        
+
     def _log_likelihood_gradients(self):
         return self.mapping.df_dX(self.dL_df, self.X).flatten()
 
     def _get_param_names(self):
         return ['X_'  +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
-                
+
     def _get_params(self):
         return self.X.flatten()
 

From 9feb1304091bc19b0c3d3121a90af84de36125fc Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Fri, 22 Nov 2013 08:59:29 +0000
Subject: [PATCH 206/252] formatting docstring

---
 GPy/core/mapping.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py
index 5f517706..7b2c89b9 100644
--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@@ -124,7 +124,11 @@ class Mapping(Parameterized):
 from GPy.core.model import Model
 
 class Mapping_check_model(Model):
-    """This is a dummy model class used as a base class for checking that the gradients of a given mapping are implemented correctly. It enables checkgradient() to be called independently on each mapping."""
+    """
+    This is a dummy model class used as a base class for checking that the
+    gradients of a given mapping are implemented correctly. It enables
+    checkgradient() to be called independently on each mapping.
+    """
     def __init__(self, mapping=None, dL_df=None, X=None):
         num_samples = 20
         if mapping==None:

From ae0f5134c2a2d76228f6c000b7dfba64173d11b6 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Fri, 22 Nov 2013 14:36:47 +0000
Subject: [PATCH 207/252] lots of medding with the likelihoods to get the tests
 working. the tests still don;t work

---
 GPy/likelihoods/laplace.py                    |  5 +-
 .../noise_models/bernoulli_noise.py           |  2 +
 .../noise_models/gaussian_noise.py            |  2 +
 .../noise_models/noise_distributions.py       |  2 +-
 GPy/testing/likelihoods_tests.py              | 63 ++++++++++---------
 5 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 6a44d5b6..6941de48 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -278,7 +278,10 @@ class Laplace(likelihood):
         #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
         B = np.eye(self.N) + W_12*K*W_12.T
-        L = jitchol(B)
+        try:
+            L = jitchol(B)
+        except:
+            import ipdb; ipdb.set_trace()
 
         W12BiW12 = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
         ln_B_det = 2*np.sum(np.log(np.diag(L)))
diff --git a/GPy/likelihoods/noise_models/bernoulli_noise.py b/GPy/likelihoods/noise_models/bernoulli_noise.py
index 17390e55..14f4adc8 100644
--- a/GPy/likelihoods/noise_models/bernoulli_noise.py
+++ b/GPy/likelihoods/noise_models/bernoulli_noise.py
@@ -22,6 +22,8 @@ class Bernoulli(NoiseDistribution):
     """
     def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
         super(Bernoulli, self).__init__(gp_link,analytical_mean,analytical_variance)
+        if isinstance(gp_link , (gp_transformations.Heaviside, gp_transformations.Probit)):
+            self.log_concave = True
 
     def _preprocess_values(self,Y):
         """
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
index fce84d27..3da6bcc8 100644
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -24,6 +24,8 @@ class Gaussian(NoiseDistribution):
         self.N = N
         self._set_params(np.asarray(variance))
         super(Gaussian, self).__init__(gp_link,analytical_mean,analytical_variance)
+        if isinstance(gp_link , gp_transformations.Identity):
+            self.log_concave = True
 
     def _get_params(self):
         return np.array([self.variance])
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 8ee7a2cd..a67d8792 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -33,7 +33,7 @@ class NoiseDistribution(object):
         else:
             self.predictive_variance = self._predictive_variance_numerical
 
-        self.log_concave = True
+        self.log_concave = False
 
     def _get_params(self):
         return np.zeros(0)
diff --git a/GPy/testing/likelihoods_tests.py b/GPy/testing/likelihoods_tests.py
index 8d1466fb..709fe002 100644
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@@ -186,33 +186,33 @@ class TestNoiseModels(object):
                             "laplace": True,
                             "ep": True
                             },
-                        "Gaussian_log": {
-                            "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log(), variance=self.var, D=self.D, N=self.N),
-                            "grad_params": {
-                                "names": ["noise_model_variance"],
-                                "vals": [self.var],
-                                "constraints": [constrain_positive]
-                                },
-                            "laplace": True
-                            },
-                        "Gaussian_probit": {
-                            "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Probit(), variance=self.var, D=self.D, N=self.N),
-                            "grad_params": {
-                                "names": ["noise_model_variance"],
-                                "vals": [self.var],
-                                "constraints": [constrain_positive]
-                                },
-                            "laplace": True
-                            },
-                        "Gaussian_log_ex": {
-                            "model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
-                            "grad_params": {
-                                "names": ["noise_model_variance"],
-                                "vals": [self.var],
-                                "constraints": [constrain_positive]
-                                },
-                            "laplace": True
-                            },
+                        #"Gaussian_log": {
+                            #"model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log(), variance=self.var, D=self.D, N=self.N),
+                            #"grad_params": {
+                                #"names": ["noise_model_variance"],
+                                #"vals": [self.var],
+                                #"constraints": [constrain_positive]
+                                #},
+                            #"laplace": True
+                            #},
+                        #"Gaussian_probit": {
+                            #"model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Probit(), variance=self.var, D=self.D, N=self.N),
+                            #"grad_params": {
+                                #"names": ["noise_model_variance"],
+                                #"vals": [self.var],
+                                #"constraints": [constrain_positive]
+                                #},
+                            #"laplace": True
+                            #},
+                        #"Gaussian_log_ex": {
+                            #"model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
+                            #"grad_params": {
+                                #"names": ["noise_model_variance"],
+                                #"vals": [self.var],
+                                #"constraints": [constrain_positive]
+                                #},
+                            #"laplace": True
+                            #},
                         "Bernoulli_default": {
                             "model": GPy.likelihoods.bernoulli(),
                             "link_f_constraints": [partial(constrain_bounded, lower=0, upper=1)],
@@ -253,6 +253,7 @@ class TestNoiseModels(object):
                 param_vals = []
                 param_names = []
                 constrain_positive = []
+                param_constraints = [] # ??? TODO: Saul to Fix.
             if "link_f_constraints" in attributes:
                 link_f_constraints = attributes["link_f_constraints"]
             else:
@@ -490,8 +491,14 @@ class TestNoiseModels(object):
             constraints[param_num](name, m)
 
         m.randomize()
-        m.checkgrad(verbose=1, step=step)
+        m.optimize(max_iters=8)
         print m
+        m.checkgrad(verbose=1, step=step)
+        if not m.checkgrad(step=step):
+            m.checkgrad(verbose=1, step=step)
+            import ipdb; ipdb.set_trace()
+            #NOTE this test appears to be stochastic for some likelihoods (student t?)
+            # appears to all be working in test mode right now...
         assert m.checkgrad(step=step)
 
     ###########

From 129917ec8c638806213368f651ee36db480a6d25 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Fri, 22 Nov 2013 14:37:14 +0000
Subject: [PATCH 208/252] removing ipdb statements

---
 GPy/testing/likelihoods_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/testing/likelihoods_tests.py b/GPy/testing/likelihoods_tests.py
index 709fe002..191dae57 100644
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@@ -496,7 +496,7 @@ class TestNoiseModels(object):
         m.checkgrad(verbose=1, step=step)
         if not m.checkgrad(step=step):
             m.checkgrad(verbose=1, step=step)
-            import ipdb; ipdb.set_trace()
+            #import ipdb; ipdb.set_trace()
             #NOTE this test appears to be stochastic for some likelihoods (student t?)
             # appears to all be working in test mode right now...
         assert m.checkgrad(step=step)

From aa7f1d53f9aa8f8b42304b13f4dba66c9ab5e0ce Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Mon, 25 Nov 2013 11:14:04 +0000
Subject: [PATCH 209/252] fixing up the blas detectino in linalg

---
 GPy/util/linalg.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index f68e1a0b..9db769e6 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -21,9 +21,9 @@ else:
 try:
     _blaslib = ctypes.cdll.LoadLibrary(np.core._dotblas.__file__) # @UndefinedVariable
     _blas_available = True
-    assert hasattr('dsyrk_',_blaslib)
-    assert hasattr('dsyr_',_blaslib)
-except:
+    assert hasattr(_blaslib, 'dsyrk_')
+    assert hasattr(_blaslib, 'dsyr_')
+except AssertionError:
     _blas_available = False
 
 def dtrtrs(A, B, lower=0, trans=0, unitdiag=0):

From 58ffdd813e9f3b868b8ad33fa39dcea945c0395a Mon Sep 17 00:00:00 2001
From: mu <m.niu@sheffield.ac.uk>
Date: Mon, 25 Nov 2013 13:58:06 +0000
Subject: [PATCH 210/252] ODE_UY

---
 GPy/kern/parts/ODE_UY.py   | 253 +++++++++++++++++++++++++++++++++++++
 GPy/kern/parts/__init__.py |   2 +-
 2 files changed, 254 insertions(+), 1 deletion(-)
 create mode 100644 GPy/kern/parts/ODE_UY.py

diff --git a/GPy/kern/parts/ODE_UY.py b/GPy/kern/parts/ODE_UY.py
new file mode 100644
index 00000000..8e0096d2
--- /dev/null
+++ b/GPy/kern/parts/ODE_UY.py
@@ -0,0 +1,253 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+from kernpart import Kernpart
+import numpy as np
+
+def index_to_slices(index):
+    """
+    take a numpy array of integers (index) and return a  nested list of slices such that the slices describe the start, stop points for each integer in the index. 
+
+    e.g.
+    >>> index = np.asarray([0,0,0,1,1,1,2,2,2])
+    returns
+    >>> [[slice(0,3,None)],[slice(3,6,None)],[slice(6,9,None)]]
+
+    or, a more complicated example
+    >>> index = np.asarray([0,0,1,1,0,2,2,2,1,1])
+    returns
+    >>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
+    """
+
+    #contruct the return structure
+    ind = np.asarray(index,dtype=np.int64)
+    ret = [[] for i in range(ind.max()+1)]
+
+    #find the switchpoints
+    ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
+    switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
+
+    [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
+    return ret
+
+class ODE_UY(Kernpart):
+    """
+    kernel resultiong from a first order ODE with OU driving GP
+
+    :param input_dim: the number of input dimension, has to be equal to one
+    :type input_dim: int
+    :param input_lengthU: the number of input U length
+    :type input_dim: int   
+    :param varianceU: variance of the driving GP
+    :type varianceU: float
+    :param lengthscaleU: lengthscale of the driving GP  (sqrt(3)/lengthscaleU)
+    :type lengthscaleU: float
+    :param varianceY: 'variance' of the transfer function
+    :type varianceY: float
+    :param lengthscaleY: 'lengthscale' of the transfer function (1/lengthscaleY)
+    :type lengthscaleY: float
+    :rtype: kernel object
+
+    """
+
+
+
+
+    def __init__(self, input_dim=2,varianceU=1., varianceY=1., lengthscaleU=None, lengthscaleY=None):
+        assert input_dim==2, "Only defined for input_dim = 1"
+        self.input_dim = input_dim
+        self.num_params = 4
+        self.name = 'ODE_UY'
+
+
+        if lengthscaleU is not None:
+            lengthscaleU = np.asarray(lengthscaleU)
+            assert lengthscaleU.size == 1, "lengthscaleU should be one dimensional"
+        else:
+            lengthscaleU = np.ones(1)
+        if lengthscaleY is not None:
+            lengthscaleY = np.asarray(lengthscaleY)
+            assert lengthscaleY.size == 1, "lengthscaleY should be one dimensional"
+        else:
+            lengthscaleY = np.ones(1)
+            #lengthscaleY = 0.5
+        self._set_params(np.hstack((varianceU, varianceY, lengthscaleU,lengthscaleY)))
+
+    def _get_params(self):
+        """return the value of the parameters."""
+        return np.hstack((self.varianceU,self.varianceY, self.lengthscaleU,self.lengthscaleY))
+
+    def _set_params(self, x):
+        """set the value of the parameters."""
+        assert x.size == self.num_params
+
+        self.varianceU = x[0]
+        self.varianceY = x[1]
+        self.lengthscaleU = x[2]
+        self.lengthscaleY = x[3]
+
+
+    def _get_param_names(self):
+        """return parameter names."""
+        return ['varianceU','varianceY', 'lengthscaleU', 'lengthscaleY']
+
+
+    def K(self, X, X2, target):
+        """Compute the covariance matrix between X and X2."""
+
+        X,slices = X[:,:-1],index_to_slices(X[:,-1])
+        if X2 is None:
+            X2,slices2 = X,slices
+        else:
+            X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
+
+
+        #rdist = X[:,0][:,None] - X2[:,0][:,None].T
+        rdist = X - X2.T
+        ly=1/self.lengthscaleY
+        lu=np.sqrt(3)/self.lengthscaleU
+        #iu=self.input_lengthU  #dimention of U
+        
+        Vu=self.varianceU
+        Vy=self.varianceY
+
+        kuu = lambda dist:Vu * (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))
+
+        k1 = lambda dist:np.exp(-ly*np.abs(dist))*(2*lu+ly)/(lu+ly)**2
+        k2 = lambda dist:(np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2 
+        k3 = lambda dist:np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
+        kyy = lambda dist:Vu*Vy*(k1(dist) + k2(dist) + k3(dist))
+
+        kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
+        kyup = lambda dist:Vu*Vy*(k1(dist)+k2(dist))    #t>0 kyu
+        kyun = lambda dist:Vu*Vy*(kyu3(dist))       #t<0 kyu
+
+        kuyp = lambda dist:Vu*Vy*(kyu3(dist))       #t>0 kuy
+        kuyn = lambda dist:Vu*Vy*(k1(dist)+k2(dist))      #t<0 kuy
+        
+        for i, s1 in enumerate(slices):
+            for j, s2 in enumerate(slices2):
+                for ss1 in s1:
+                    for ss2 in s2:
+                        if i==0 and j==0:
+                            target[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
+                        elif i==0 and j==1:
+                            target[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) )   )
+                        elif i==1 and j==1:
+                            target[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
+                        else:
+                            target[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[s1[0],s2[0]]) )   )
+
+
+        #KUU = kuu(np.abs(rdist[:iu,:iu]))
+
+        #KYY = kyy(np.abs(rdist[iu:,iu:]))
+
+        #KYU = np.where(rdist[iu:,:iu]>0,kyup(np.abs(rdist[iu:,:iu])),kyun(np.abs(rdist[iu:,:iu]) ))
+
+        #KUY = np.where(rdist[:iu,iu:]>0,kuyp(np.abs(rdist[:iu,iu:])),kuyn(np.abs(rdist[:iu,iu:]) ))
+
+        #ker=np.vstack((np.hstack([KUU,KUY]),np.hstack([KYU,KYY])))
+
+        #np.add(ker, target, target)
+
+    def Kdiag(self, X, target):
+        """Compute the diagonal of the covariance matrix associated to X."""
+        ly=1/self.lengthscaleY
+        lu=np.sqrt(3)/self.lengthscaleU
+        #ly=self.lengthscaleY
+        #lu=self.lengthscaleU
+        
+        k1 = (2*lu+ly)/(lu+ly)**2
+        k2 = (ly-2*lu + 2*lu-ly ) / (ly-lu)**2 
+        k3 = 1/(lu+ly) + (lu)/(lu+ly)**2 
+
+        slices = index_to_slices(X[:,-1])
+
+        for i, ss1 in enumerate(slices):
+            for s1 in ss1:
+                if i==0:
+                    target[s1]+= self.varianceU 
+                elif i==1:
+                    target[s1]+= self.varianceU*self.varianceY*(k1+k2+k3)
+                else:
+                    raise ValueError, "invalid input/output index"
+        
+        #target[slices[0][0]]+= self.varianceU   #matern32 diag
+        #target[slices[1][0]]+= self.varianceU*self.varianceY*(k1+k2+k3)  #  diag
+
+
+
+
+
+
+    def dK_dtheta(self, dL_dK, X, X2, target):
+        """derivative of the covariance matrix with respect to the parameters."""
+        if X2 is None: X2 = X
+        dist = np.abs(X - X2.T)
+
+        ly=1/self.lengthscaleY
+        lu=np.sqrt(3)/self.lengthscaleU
+        #ly=self.lengthscaleY
+        #lu=self.lengthscaleU
+
+        dk1theta1 = lambda dist: np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3
+        #c=np.sqrt(3)
+        #t1=c/lu
+        #t2=1/ly
+        #dk1theta1=np.exp(-dist*ly)*t2*( (2*c*t2+2*t1)/(c*t2+t1)**2 -2*(2*c*t2*t1+t1**2)/(c*t2+t1)**3   )
+
+        dk2theta1 = lambda dist: 1*( 
+            np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2) 
+            +np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3) 
+            +np.exp(-dist*ly)*2*(ly-lu)**(-2)
+            +np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
+            )
+      
+        dk3theta1 = lambda dist: np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)
+
+        dktheta1 = lambda dist: self.varianceU*self.varianceY*(dk1theta1+dk2theta1+dk3theta1)
+
+
+
+
+        dk1theta2 = lambda dist: np.exp(-ly*dist) * ((lu+ly)**(-2)) * (  (-dist)*(2*lu+ly)  +  1  +  (-2)*(2*lu+ly)/(lu+ly)  )
+
+        dk2theta2 =lambda dist:  1*(
+            np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
+            +np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
+            )
+
+        dk3theta2 = lambda dist: np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3
+
+        dktheta2 = lambda dist: self.varianceU*self.varianceY*(dk1theta2 + dk2theta2 +dk3theta2)
+
+
+
+        k1 = lambda dist: np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
+        k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2 
+        k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
+        dkdvar = k1+k2+k3
+
+        target[0] += np.sum(self.varianceY*dkdvar * dL_dK)
+        target[1] += np.sum(self.varianceU*dkdvar * dL_dK)
+        target[2] += np.sum(dktheta1*(-np.sqrt(3)*self.lengthscaleU**(-2)) * dL_dK)
+        target[3] += np.sum(dktheta2*(-self.lengthscaleY**(-2)) * dL_dK)
+
+
+    # def dKdiag_dtheta(self, dL_dKdiag, X, target):
+    #     """derivative of the diagonal of the covariance matrix with respect to the parameters."""
+    #     # NB: derivative of diagonal elements wrt lengthscale is 0
+    #     target[0] += np.sum(dL_dKdiag)
+
+    # def dK_dX(self, dL_dK, X, X2, target):
+    #     """derivative of the covariance matrix with respect to X."""
+    #     if X2 is None: X2 = X
+    #     dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
+    #     ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
+    #     dK_dX = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2))
+    #     target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
+
+    # def dKdiag_dX(self, dL_dKdiag, X, target):
+    #     pass
diff --git a/GPy/kern/parts/__init__.py b/GPy/kern/parts/__init__.py
index f278941a..d8e7f8e6 100644
--- a/GPy/kern/parts/__init__.py
+++ b/GPy/kern/parts/__init__.py
@@ -14,7 +14,7 @@ import Matern32
 import Matern52
 import mlp
 import ODE_1
-#import ODE_UY
+import ODE_UY
 import periodic_exponential
 import periodic_Matern32
 import periodic_Matern52

From c69f6a2059d6346622bfcf56aa76be2a1e68e05c Mon Sep 17 00:00:00 2001
From: mu <m.niu@sheffield.ac.uk>
Date: Tue, 26 Nov 2013 09:56:42 +0000
Subject: [PATCH 211/252] ODE_UY

---
 GPy/kern/parts/ODE_UY.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/GPy/kern/parts/ODE_UY.py b/GPy/kern/parts/ODE_UY.py
index 8e0096d2..f6c5e9d9 100644
--- a/GPy/kern/parts/ODE_UY.py
+++ b/GPy/kern/parts/ODE_UY.py
@@ -95,6 +95,8 @@ class ODE_UY(Kernpart):
 
     def K(self, X, X2, target):
         """Compute the covariance matrix between X and X2."""
+        # model :   a * dy/dt + b * y = U
+        #lu=sqrt(3)/theta1  ly=1/theta2  theta2= a/b :thetay   sigma2=1/(2ab) :sigmay   
 
         X,slices = X[:,:-1],index_to_slices(X[:,-1])
         if X2 is None:

From a81b5cfd505d6579b2dd8fa9630a1f5a1d79b50b Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Wed, 27 Nov 2013 10:07:08 +0000
Subject: [PATCH 212/252] Fixed test in kern.py to request correct output dim
 for multioutput covariances.

---
 GPy/kern/kern.py             | 4 ++--
 GPy/testing/bcgplvm_tests.py | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 46bb01c8..bf8ba612 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -861,13 +861,13 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False, X_positive=
         if X_positive:
             X = abs(X)
         if output_ind is not None:
-            X[:, output_ind] = np.random.randint(kern.parts[0].output_dim, X.shape[0])
+            X[:, output_ind] = np.random.randint(low=0,high=kern.parts[0].output_dim, size=X.shape[0])
     if X2==None:
         X2 = np.random.randn(20, kern.input_dim)
         if X_positive:
             X2 = abs(X2)
         if output_ind is not None:
-            X2[:, output_ind] = np.random.randint(kern.parts[0].output_dim, X2.shape[0])
+            X2[:, output_ind] = np.random.randint(low=0, high=kern.parts[0].output_dim, size=X2.shape[0])
 
     if verbose:
         print("Checking covariance function is positive definite.")
diff --git a/GPy/testing/bcgplvm_tests.py b/GPy/testing/bcgplvm_tests.py
index 94282a0b..a5bec821 100644
--- a/GPy/testing/bcgplvm_tests.py
+++ b/GPy/testing/bcgplvm_tests.py
@@ -15,7 +15,7 @@ class BCGPLVMTests(unittest.TestCase):
         k = GPy.kern.mlp(input_dim) + GPy.kern.bias(input_dim)
         bk = GPy.kern.rbf(output_dim)
         mapping = GPy.mappings.Kernel(output_dim=input_dim, X=Y, kernel=bk)
-        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
+        m = GPy._models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
         m.randomize()
         self.assertTrue(m.checkgrad())
         
@@ -28,7 +28,7 @@ class BCGPLVMTests(unittest.TestCase):
         k = GPy.kern.mlp(input_dim) + GPy.kern.bias(input_dim)
         bk = GPy.kern.rbf(output_dim)
         mapping = GPy.mappings.Linear(output_dim=input_dim, input_dim=output_dim)
-        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
+        m = GPy._models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
         m.randomize()
         self.assertTrue(m.checkgrad())
         
@@ -41,7 +41,7 @@ class BCGPLVMTests(unittest.TestCase):
         k = GPy.kern.mlp(input_dim) + GPy.kern.bias(input_dim)
         bk = GPy.kern.rbf(output_dim)
         mapping = GPy.mappings.MLP(output_dim=input_dim, input_dim=output_dim, hidden_dim=[5, 4, 7])
-        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
+        m = GPy._models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
         m.randomize()
         self.assertTrue(m.checkgrad())
 

From 6da3fc5a89b60d1f01f885f4c558e7f42ed7fe30 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Wed, 27 Nov 2013 11:17:33 +0000
Subject: [PATCH 213/252] Added gradient of sympy kernel, seems to pass tests,
 but know it's not numerically stable. Checking in before making numerically
 stable.

---
 GPy/kern/parts/sympy_helpers.cpp | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/GPy/kern/parts/sympy_helpers.cpp b/GPy/kern/parts/sympy_helpers.cpp
index 9f30eea9..9b0d5885 100644
--- a/GPy/kern/parts/sympy_helpers.cpp
+++ b/GPy/kern/parts/sympy_helpers.cpp
@@ -170,9 +170,25 @@ double dh_dl(double t, double tprime, double d_i, double d_j, double l){
 }
 
 double dh_dt(double t, double tprime, double d_i, double d_j, double l){
-  return 0.0;
+  // compute gradient of h function with respect to t.
+  double diff_t = t - tprime;
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - diff_t/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
+  
+  return (d_i*(erf(d_i*l/2) - erf(d_i*l/2 - t/l))*exp(-d_i*t - d_j*tprime) - d_i*(erf(d_i*l/2 + tprime/l) - erf(d_i*l/2 - (t - tprime)/l))*exp(-d_i*(t - tprime)) + 2*exp(-d_i*(t - tprime) - pow(d_i*l/2 - (t - tprime)/l, 2))/(sqrt(M_PI)*l) - 2*exp(-d_i*t - d_j*tprime - pow(d_i*l/2 - t/l,2))/(sqrt(M_PI)*l))*exp(d_i*l/2*d_i*l/2)/(d_i + d_j);
 }
 
 double dh_dtprime(double t, double tprime, double d_i, double d_j, double l){
-  return 0.0;
+  // compute gradient of h function with respect to tprime.
+  double diff_t = t - tprime;
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - diff_t/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+
+  return (d_i*(erf(d_i*l/2 + tprime/l) - erf(d_i*l/2 - (t - tprime)/l))*exp(-d_i*(t - tprime)) + d_j*(erf(d_i*l/2) - erf(d_i*l/2 - t/l))*exp(-d_i*t - d_j*tprime) + (-2*exp(-pow(d_i*l/2 - (t - tprime)/l,2)) + 2*exp(-pow(d_i*l/2 + tprime/l,2)))*exp(-d_i*(t - tprime))/(sqrt(M_PI)*l))*exp(d_i*l/2*d_i*l/2)/(d_i + d_j);
 }

From 557d296d4c2d77b06c078d9bcd02a3f40d2b3080 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Wed, 27 Nov 2013 11:21:08 +0000
Subject: [PATCH 214/252] Modified to improve part of stability, gradient
 checks still passing.

---
 GPy/kern/parts/sympy_helpers.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/kern/parts/sympy_helpers.cpp b/GPy/kern/parts/sympy_helpers.cpp
index 9b0d5885..d5e0205a 100644
--- a/GPy/kern/parts/sympy_helpers.cpp
+++ b/GPy/kern/parts/sympy_helpers.cpp
@@ -179,7 +179,7 @@ double dh_dt(double t, double tprime, double d_i, double d_j, double l){
   arg_2 = half_l_di - t/l;
   double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
   
-  return (d_i*(erf(d_i*l/2) - erf(d_i*l/2 - t/l))*exp(-d_i*t - d_j*tprime) - d_i*(erf(d_i*l/2 + tprime/l) - erf(d_i*l/2 - (t - tprime)/l))*exp(-d_i*(t - tprime)) + 2*exp(-d_i*(t - tprime) - pow(d_i*l/2 - (t - tprime)/l, 2))/(sqrt(M_PI)*l) - 2*exp(-d_i*t - d_j*tprime - pow(d_i*l/2 - t/l,2))/(sqrt(M_PI)*l))*exp(d_i*l/2*d_i*l/2)/(d_i + d_j);
+  return (d_i*exp(ln_part_2-d_i*t - d_j*tprime) - d_i*(erf(half_l_di + tprime/l) - erf(half_l_di - diff_t/l))*exp(-d_i*diff_t) + 2*exp(-d_i*diff_t - pow(half_l_di - diff_t/l, 2))/(sqrt(M_PI)*l) - 2*exp(-d_i*t - d_j*tprime - pow(half_l_di - t/l,2))/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
 }
 
 double dh_dtprime(double t, double tprime, double d_i, double d_j, double l){
@@ -190,5 +190,5 @@ double dh_dtprime(double t, double tprime, double d_i, double d_j, double l){
   double arg_2 = half_l_di - diff_t/l;
   double ln_part_1 = ln_diff_erf(arg_1, arg_2);
 
-  return (d_i*(erf(d_i*l/2 + tprime/l) - erf(d_i*l/2 - (t - tprime)/l))*exp(-d_i*(t - tprime)) + d_j*(erf(d_i*l/2) - erf(d_i*l/2 - t/l))*exp(-d_i*t - d_j*tprime) + (-2*exp(-pow(d_i*l/2 - (t - tprime)/l,2)) + 2*exp(-pow(d_i*l/2 + tprime/l,2)))*exp(-d_i*(t - tprime))/(sqrt(M_PI)*l))*exp(d_i*l/2*d_i*l/2)/(d_i + d_j);
+  return (d_i*(erf(half_l_di + tprime/l) - erf(half_l_di - diff_t/l))*exp(-d_i*diff_t) + d_j*(erf(half_l_di) - erf(half_l_di - t/l))*exp(-d_i*t - d_j*tprime) + (-2*exp(-pow(half_l_di - diff_t/l,2)) + 2*exp(-pow(half_l_di + tprime/l,2)))*exp(-d_i*diff_t)/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
 }

From ea05ba54bf5926392e2aa4f04cdd0c712f7e1b01 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Wed, 27 Nov 2013 11:25:42 +0000
Subject: [PATCH 215/252] sympykern kern_tests now passing, code is inefficient
 but should be numerically stable.

---
 GPy/kern/parts/sympy_helpers.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/GPy/kern/parts/sympy_helpers.cpp b/GPy/kern/parts/sympy_helpers.cpp
index d5e0205a..56aa6f21 100644
--- a/GPy/kern/parts/sympy_helpers.cpp
+++ b/GPy/kern/parts/sympy_helpers.cpp
@@ -80,7 +80,7 @@ double ln_diff_erf(double x0, double x1){
   else //x0 and x1 non-positive
     return log(erfcx(-x0)-erfcx(-x1)*exp(x0*x0 - x1*x1))-x0*x0;
 }
-
+// TODO: For all these computations of h things are very efficient at the moment. Need to recode sympykern to allow the precomputations to take place and all the gradients to be computed in one function. Not sure of best way forward for that yet. Neil
 double h(double t, double tprime, double d_i, double d_j, double l){
   // Compute the h function for the sim covariance.
   double half_l_di = 0.5*l*d_i;
@@ -179,7 +179,7 @@ double dh_dt(double t, double tprime, double d_i, double d_j, double l){
   arg_2 = half_l_di - t/l;
   double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
   
-  return (d_i*exp(ln_part_2-d_i*t - d_j*tprime) - d_i*(erf(half_l_di + tprime/l) - erf(half_l_di - diff_t/l))*exp(-d_i*diff_t) + 2*exp(-d_i*diff_t - pow(half_l_di - diff_t/l, 2))/(sqrt(M_PI)*l) - 2*exp(-d_i*t - d_j*tprime - pow(half_l_di - t/l,2))/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
+  return (d_i*exp(ln_part_2-d_i*t - d_j*tprime) - d_i*exp(ln_part_1-d_i*diff_t) + 2*exp(-d_i*diff_t - pow(half_l_di - diff_t/l, 2))/(sqrt(M_PI)*l) - 2*exp(-d_i*t - d_j*tprime - pow(half_l_di - t/l,2))/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
 }
 
 double dh_dtprime(double t, double tprime, double d_i, double d_j, double l){
@@ -189,6 +189,8 @@ double dh_dtprime(double t, double tprime, double d_i, double d_j, double l){
   double arg_1 = half_l_di + tprime/l;
   double arg_2 = half_l_di - diff_t/l;
   double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
 
-  return (d_i*(erf(half_l_di + tprime/l) - erf(half_l_di - diff_t/l))*exp(-d_i*diff_t) + d_j*(erf(half_l_di) - erf(half_l_di - t/l))*exp(-d_i*t - d_j*tprime) + (-2*exp(-pow(half_l_di - diff_t/l,2)) + 2*exp(-pow(half_l_di + tprime/l,2)))*exp(-d_i*diff_t)/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
+  return (d_i*exp(ln_part_1-d_i*diff_t) + d_j*exp(ln_part_2-d_i*t - d_j*tprime) + (-2*exp(-pow(half_l_di - diff_t/l,2)) + 2*exp(-pow(half_l_di + tprime/l,2)))*exp(-d_i*diff_t)/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
 }

From f9fa378aa08edb97c95d5775358d39325d235a4e Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 12:30:19 +0000
Subject: [PATCH 216/252] added some tips to the readme

---
 GPy/kern/kern.py |  1 +
 README.md        | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index bf8ba612..ed045534 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -862,6 +862,7 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False, X_positive=
             X = abs(X)
         if output_ind is not None:
             X[:, output_ind] = np.random.randint(low=0,high=kern.parts[0].output_dim, size=X.shape[0])
+            import ipdb; ipdb.set_trace()
     if X2==None:
         X2 = np.random.randn(20, kern.input_dim)
         if X_positive:
diff --git a/README.md b/README.md
index 0ff3d890..10ca8a83 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,31 @@ A Gaussian processes framework in Python.
 
 Continuous integration status: ![CI status](https://travis-ci.org/SheffieldML/GPy.png)
 
+Getting started
+===============
+Installing with pip
+-------------------
+The simplest way to install GPy is using pip. 
+pip install gpy
+
+Ubuntu
+------
+For the most part, the developers are using ubuntu. To install the required packages:
+sudo apt-get install python-numpy python-scipy python-matplotlib
+
+clone this git repository and add it to your path:
+    git clone git@github.com:SheffieldML/GPy.git \<destination\>
+    echo "PYTHONPATH=$PYTHONPATH:\<detination\> > ~/.bashrc
+
+Windows
+-------
+On windows, we recommend the ![anaconda python distribution](http://continuum.io/downloads). We've also had luck with ![enthought](http://www.enthought.com). git clone or unzip the source to a suitable directory, and add a PYTHONPATH environement variable. 
+
+OSX
+---
+everything appears to work out-of-the box using ![enthought](http://www.enthought.com) on osx Mavericks.
+
+
 
 Compiling documentation:
 ========================

From f8bc7a827fb67a50457d0b090573a252f180ff59 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Wed, 27 Nov 2013 12:31:01 +0000
Subject: [PATCH 217/252] Push minor fix to eq_sympy kernel test.

---
 GPy/kern/kern.py            | 2 ++
 GPy/testing/kernel_tests.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index bf8ba612..37a18f04 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -861,12 +861,14 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False, X_positive=
         if X_positive:
             X = abs(X)
         if output_ind is not None:
+            assert(output_ind<kern.input_dim)
             X[:, output_ind] = np.random.randint(low=0,high=kern.parts[0].output_dim, size=X.shape[0])
     if X2==None:
         X2 = np.random.randn(20, kern.input_dim)
         if X_positive:
             X2 = abs(X2)
         if output_ind is not None:
+            assert(output_ind<kern.input_dim)
             X2[:, output_ind] = np.random.randint(low=0, high=kern.parts[0].output_dim, size=X2.shape[0])
 
     if verbose:
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index 5d2fbeec..0fceac60 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -36,7 +36,7 @@ class KernelTests(unittest.TestCase):
     def test_eq_sympykernel(self):
         if SYMPY_AVAILABLE:
             kern = GPy.kern.eq_sympy(5, 3)
-            self.assertTrue(GPy.kern.kern_test(kern, output_ind=3, verbose=verbose))
+            self.assertTrue(GPy.kern.kern_test(kern, output_ind=4, verbose=verbose))
 
     def test_ode1_eqkernel(self):
         if SYMPY_AVAILABLE:

From 0f60fba125e91f41041ebb38b084b55626969fd6 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 27 Nov 2013 12:32:42 +0000
Subject: [PATCH 218/252] Fixed student_t approximation demo and changed
 convergence critera to difference of f

---
 GPy/examples/laplace_approximations.py |  2 +-
 GPy/likelihoods/laplace.py             | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index 64185885..ce47554d 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -106,7 +106,7 @@ def student_t_approx():
     corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
     m = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
     m.ensure_default_constraints()
-    m.constrain_positive('t_noise')
+    m.constrain_bounded('t_noise', 1e-6, 10.)
     m.constrain_fixed('white', 1e-4)
     m.randomize()
     for a in range(1):
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 6941de48..3aa78ffc 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -65,11 +65,10 @@ class Laplace(likelihood):
 
         self.old_Ki_f = None
 
-    def predictive_values(self, mu, var, full_cov):
+    def predictive_values(self,mu,var,full_cov,**noise_args):
         if full_cov:
-            raise NotImplementedError("Cannot make correlated predictions\
-                    with an Laplace likelihood")
-        return self.noise_model.predictive_values(mu, var)
+            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
+        return self.noise_model.predictive_values(mu,var,**noise_args)
 
     def log_predictive_density(self, y_test, mu_star, var_star):
         """
@@ -209,6 +208,7 @@ class Laplace(likelihood):
                    - 0.5*self.f_Ki_f
                    + 0.5*self.y_Wi_Ki_i_y
                   )
+        #print "Term, {}, {}, {}, {}, {}".format(self.lik, - 0.5*self.ln_B_det, + 0.5*self.ln_det_Wi_K, - 0.5*self.f_Ki_f, + 0.5*self.y_Wi_Ki_i_y)
 
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
@@ -380,8 +380,8 @@ class Laplace(likelihood):
 
             #difference = abs(new_obj - old_obj)
             #old_obj = new_obj.copy()
-            #difference = np.abs(np.sum(f - f_old))
-            difference = np.abs(np.sum(Ki_f - old_Ki_f))
+            difference = np.abs(np.sum(f - f_old))
+            #difference = np.abs(np.sum(Ki_f - old_Ki_f))
             old_Ki_f = Ki_f.copy()
             i += 1
 

From 3feba4f7b9f780eb93ece0f8f4fd1b45356d01ae Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 27 Nov 2013 12:38:12 +0000
Subject: [PATCH 219/252] fixed import errors in tests

---
 GPy/testing/bcgplvm_tests.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/GPy/testing/bcgplvm_tests.py b/GPy/testing/bcgplvm_tests.py
index a5bec821..94282a0b 100644
--- a/GPy/testing/bcgplvm_tests.py
+++ b/GPy/testing/bcgplvm_tests.py
@@ -15,7 +15,7 @@ class BCGPLVMTests(unittest.TestCase):
         k = GPy.kern.mlp(input_dim) + GPy.kern.bias(input_dim)
         bk = GPy.kern.rbf(output_dim)
         mapping = GPy.mappings.Kernel(output_dim=input_dim, X=Y, kernel=bk)
-        m = GPy._models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
+        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
         m.randomize()
         self.assertTrue(m.checkgrad())
         
@@ -28,7 +28,7 @@ class BCGPLVMTests(unittest.TestCase):
         k = GPy.kern.mlp(input_dim) + GPy.kern.bias(input_dim)
         bk = GPy.kern.rbf(output_dim)
         mapping = GPy.mappings.Linear(output_dim=input_dim, input_dim=output_dim)
-        m = GPy._models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
+        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
         m.randomize()
         self.assertTrue(m.checkgrad())
         
@@ -41,7 +41,7 @@ class BCGPLVMTests(unittest.TestCase):
         k = GPy.kern.mlp(input_dim) + GPy.kern.bias(input_dim)
         bk = GPy.kern.rbf(output_dim)
         mapping = GPy.mappings.MLP(output_dim=input_dim, input_dim=output_dim, hidden_dim=[5, 4, 7])
-        m = GPy._models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
+        m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
         m.randomize()
         self.assertTrue(m.checkgrad())
 

From 6fb7fe2352960f3d5b5ad1ccb18569ae3ebe9978 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 12:41:47 +0000
Subject: [PATCH 220/252] minor edits to the README

---
 README.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 10ca8a83..a3d98466 100644
--- a/README.md
+++ b/README.md
@@ -22,12 +22,13 @@ For the most part, the developers are using ubuntu. To install the required pack
 sudo apt-get install python-numpy python-scipy python-matplotlib
 
 clone this git repository and add it to your path:
-    git clone git@github.com:SheffieldML/GPy.git \<destination\>
-    echo "PYTHONPATH=$PYTHONPATH:\<detination\> > ~/.bashrc
+
+    git clone git@github.com:SheffieldML/GPy.git ~/gpy
+    echo "PYTHONPATH=$PYTHONPATH:~/gpy > ~/.bashrc
 
 Windows
 -------
-On windows, we recommend the ![anaconda python distribution](http://continuum.io/downloads). We've also had luck with ![enthought](http://www.enthought.com). git clone or unzip the source to a suitable directory, and add a PYTHONPATH environement variable. 
+On windows, we recommend the ![anaconda python distribution](http://continuum.io/downloads). We've also had luck with ![enthought](http://www.enthought.com). git clone or unzip the source to a suitable directory, and add a PYTHONPATH environment variable. 
 
 OSX
 ---

From 9231cf4bfc668e0f1aec337de913d776ef1d6373 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 13:02:24 +0000
Subject: [PATCH 221/252] more readme edits

---
 README.md | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index a3d98466..5bf6e44a 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,12 @@ Getting started
 ===============
 Installing with pip
 -------------------
-The simplest way to install GPy is using pip. 
-pip install gpy
+The simplest way to install GPy is using pip. ubuntu users can do:
+
+    sudo apt-get install python-pip
+    pip install gpy
+
+If you'd like to install from source, or want to contribute to the project (e.g. by sending pull requests via github), read on.
 
 Ubuntu
 ------
@@ -23,8 +27,9 @@ sudo apt-get install python-numpy python-scipy python-matplotlib
 
 clone this git repository and add it to your path:
 
-    git clone git@github.com:SheffieldML/GPy.git ~/gpy
-    echo "PYTHONPATH=$PYTHONPATH:~/gpy > ~/.bashrc
+    git clone git@github.com:SheffieldML/GPy.git ~/SheffieldML
+    echo 'PYTHONPATH=$PYTHONPATH:~/SheffieldML' >> ~/.bashrc
+
 
 Windows
 -------
@@ -32,8 +37,10 @@ On windows, we recommend the ![anaconda python distribution](http://continuum.io
 
 OSX
 ---
-everything appears to work out-of-the box using ![enthought](http://www.enthought.com) on osx Mavericks.
+Everything appears to work out-of-the box using ![enthought](http://www.enthought.com) on osx Mavericks. Download/clone GPy, and then add GPy to your PYTHONPATH
 
+    git clone git@github.com:SheffieldML/GPy.git ~/SheffieldML
+    echo 'PYTHONPATH=$PYTHONPATH:~/SheffieldML' >> ~/.profile
 
 
 Compiling documentation:

From 36cc17cf2407604c7eb62fc001bb2fa57fa9308f Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 13:09:27 +0000
Subject: [PATCH 222/252] more readme stuff

---
 README.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 5bf6e44a..2aada317 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,8 @@ If you'd like to install from source, or want to contribute to the project (e.g.
 Ubuntu
 ------
 For the most part, the developers are using ubuntu. To install the required packages:
-sudo apt-get install python-numpy python-scipy python-matplotlib
+
+    sudo apt-get install python-numpy python-scipy python-matplotlib
 
 clone this git repository and add it to your path:
 
@@ -33,7 +34,11 @@ clone this git repository and add it to your path:
 
 Windows
 -------
-On windows, we recommend the ![anaconda python distribution](http://continuum.io/downloads). We've also had luck with ![enthought](http://www.enthought.com). git clone or unzip the source to a suitable directory, and add a PYTHONPATH environment variable. 
+On windows, we recommend the ![anaconda python distribution](http://continuum.io/downloads). We've also had luck with ![enthought](http://www.enthought.com). git clone or unzip the source to a suitable directory, and add an approptiate PYTHONPATH environment variable. 
+
+On windows 7 (and possibly earlier versions) there's a bug in scipy version 0.13 which tries to write very long filenmnames. Reverting to scipy 0.12 seems to do the trick:
+
+    conda install scipy=0.12
 
 OSX
 ---

From 6673a8ae0218d81e5e972f025253ad073dcf8e82 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 13:10:15 +0000
Subject: [PATCH 223/252] more readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2aada317..27af0b0d 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ Windows
 -------
 On windows, we recommend the ![anaconda python distribution](http://continuum.io/downloads). We've also had luck with ![enthought](http://www.enthought.com). git clone or unzip the source to a suitable directory, and add an approptiate PYTHONPATH environment variable. 
 
-On windows 7 (and possibly earlier versions) there's a bug in scipy version 0.13 which tries to write very long filenmnames. Reverting to scipy 0.12 seems to do the trick:
+On windows 7 (and possibly earlier versions) there's a bug in scipy version 0.13 which tries to write very long filenames. Reverting to scipy 0.12 seems to do the trick:
 
     conda install scipy=0.12
 

From 77a0d61bf685e3d002e60be65d294ee86de86304 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 27 Nov 2013 13:12:05 +0000
Subject: [PATCH 224/252] gradientchecker added as a model

---
 GPy/models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/GPy/models.py b/GPy/models.py
index 9a847ea0..a56fb305 100644
--- a/GPy/models.py
+++ b/GPy/models.py
@@ -20,3 +20,4 @@ from _models.mrd import MRD#; _mrd = mrd; del mrd
 from _models.gradient_checker import GradientChecker#; _gradient_checker = gradient_checker ; del gradient_checker 
 from _models.gp_multioutput_regression import GPMultioutputRegression#; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression 
 from _models.sparse_gp_multioutput_regression import SparseGPMultioutputRegression#; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression 
+from _models.gradient_checker import GradientChecker
\ No newline at end of file

From 4be3f4482dbb64df59c38bbb039be3fd67f96910 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 27 Nov 2013 13:16:00 +0000
Subject: [PATCH 225/252] gradient checker comments and import updates

---
 GPy/_models/gradient_checker.py | 41 ++++++++++++++++-----------------
 GPy/kern/kern.py                |  9 ++++----
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/GPy/_models/gradient_checker.py b/GPy/_models/gradient_checker.py
index 64b8b2fb..dfd0640f 100644
--- a/GPy/_models/gradient_checker.py
+++ b/GPy/_models/gradient_checker.py
@@ -28,38 +28,37 @@ class GradientChecker(Model):
         :param df: Gradient of function to check
         :param x0:
             Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
-            Can be a list of arrays, if takes a list of arrays. This list will be passed
+            Can be a list of arrays, if f takes a list of arrays. This list will be passed
             to f and df in the same order as given here.
-            If only one argument, make sure not to pass a list!!!
-
+            If f takes only one argument, make sure not to pass a list for x0!!!
         :type x0: [array-like] | array-like | float | int
-        :param names:
+        :param list names:
             Names to print, when performing gradcheck. If a list was passed to x0
             a list of names with the same length is expected.
-        :param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
+        :param args kwargs: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
 
         Examples:
         ---------
-            from GPy.models import GradientChecker
-            N, M, Q = 10, 5, 3
+        from GPy.models import GradientChecker
+        N, M, Q = 10, 5, 3
 
-            Sinusoid:
+        Sinusoid:
 
-                X = numpy.random.rand(N, Q)
-                grad = GradientChecker(numpy.sin,numpy.cos,X,'x')
-                grad.checkgrad(verbose=1)
+            X = numpy.random.rand(N, Q)
+            grad = GradientChecker(numpy.sin,numpy.cos,X,'sin_in')
+            grad.checkgrad(verbose=1)
 
-            Using GPy:
+        Using GPy:
 
-                X, Z = numpy.random.randn(N,Q), numpy.random.randn(M,Q)
-                kern = GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True)
-                grad = GradientChecker(kern.K,
-                                       lambda x: 2*kern.dK_dX(numpy.ones((1,1)), x),
-                                       x0 = X.copy(),
-                                       names='X')
-                grad.checkgrad(verbose=1)
-                grad.randomize()
-                grad.checkgrad(verbose=1)
+            X, Z = numpy.random.randn(N,Q), numpy.random.randn(M,Q)
+            kern = GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True)
+            grad = GradientChecker(kern.K,
+                                   lambda x: kern.dK_dX(numpy.ones((1,1)), x),
+                                   x0 = X.copy(),
+                                   names=['X_input'])
+            grad.checkgrad(verbose=1)
+            grad.randomize()
+            grad.checkgrad(verbose=1)
         """
         Model.__init__(self)
         if isinstance(x0, (list, tuple)) and names is None:
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index f51c3c13..2c56f47a 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -737,15 +737,16 @@ class kern(Parameterized):
         else:
             raise NotImplementedError, "Cannot plot a kernel with more than two input dimensions"
 
-from GPy.core.model import Model
-
+from ..core.model import Model
 class Kern_check_model(Model):
     """This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel."""
     def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
         num_samples = 20
         num_samples2 = 10
         if kernel==None:
+            import GPy
             kernel = GPy.kern.rbf(1)
+            del GPy
         if X==None:
             X = np.random.normal(size=(num_samples, kernel.input_dim))
         if dL_dK==None:
@@ -760,7 +761,7 @@ class Kern_check_model(Model):
         self.dL_dK = dL_dK
         #self.constrained_indices=[]
         #self.constraints=[]
-        Model.__init__(self)
+        super(Kern_check_model, self).__init__()
 
     def is_positive_definite(self):
         v = np.linalg.eig(self.kernel.K(self.X))[0]
@@ -863,7 +864,6 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False, X_positive=
         if output_ind is not None:
             assert(output_ind<kern.input_dim)
             X[:, output_ind] = np.random.randint(low=0,high=kern.parts[0].output_dim, size=X.shape[0])
-            import ipdb; ipdb.set_trace()
     if X2==None:
         X2 = np.random.randn(20, kern.input_dim)
         if X_positive:
@@ -964,3 +964,4 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False, X_positive=
         return False
 
     return pass_checks
+del Model
\ No newline at end of file

From db9e5314e4cdaf9a7bb18e48001b98fb0853ca82 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 13:16:18 +0000
Subject: [PATCH 226/252] removed ipdb statement from kern, cleaned up some
 nasty whitespace

---
 GPy/kern/kern.py | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index f51c3c13..df1e3f47 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -487,12 +487,11 @@ class kern(Parameterized):
                 p1.psi1(Z, mu, S, psi11)
                 Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
                 Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
-                
+
                 p2.psi1(Z, Mu, Sigma, psi12)
                 eK2 = psi12.reshape(N, M, M)
                 crossterms = eK2 * (psi11[:, :, None] + psi11[:, None, :])
                 target += crossterms
-                #import ipdb;ipdb.set_trace()
             else:
                 raise NotImplementedError, "psi2 cannot be computed for this kernel"
         return target        
@@ -540,15 +539,15 @@ class kern(Parameterized):
                     # turn around to have rbf in front
                     p1, p2 = self.parts[i2], self.parts[i1]
                     ps1, ps2 = self.param_slices[i2], self.param_slices[i1]  
-                
+
                 N, M = mu.shape[0], Z.shape[0]; NM=N*M
 
                 psi11 = np.zeros((N, M))
                 p1.psi1(Z, mu, S, psi11)
-                
+
                 Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
                 Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
-                
+
                 tmp1 = np.zeros_like(target[ps1])
                 tmp2 = np.zeros_like(target[ps2])
 #                 for n in range(N):
@@ -559,7 +558,7 @@ class kern(Parameterized):
 #                             Mu, Sigma= Mu.reshape(N,M,self.input_dim), Sigma.reshape(N,M,self.input_dim)
 #                             p2.dpsi1_dtheta((dL_dpsi2[n:n+1,m:m+1,m_prime:m_prime+1]*(psi11[n:n+1,m_prime:m_prime+1]))[0], Z[m:m+1], Mu[n:n+1,m], Sigma[n:n+1,m], target[ps2])
 #                             p2.dpsi1_dtheta((dL_dpsi2[n:n+1,m:m+1,m_prime:m_prime+1]*(psi11[n:n+1,m:m+1]))[0], Z[m_prime:m_prime+1], Mu[n:n+1, m_prime], Sigma[n:n+1, m_prime], target[ps2])#Z[m_prime:m_prime+1], Mu[n+m:(n+m)+1], Sigma[n+m:(n+m)+1], target[ps2])
-                
+
                 if isinstance(p1, RBF) and isinstance(p2, RBF):
                     psi12 = np.zeros((N, M))
                     p2.psi1(Z, mu, S, psi12)
@@ -571,11 +570,11 @@ class kern(Parameterized):
                 if isinstance(p1, RBF) and isinstance(p2, Linear):
                     #import ipdb;ipdb.set_trace()
                     pass
-                
+
                 p2.dpsi1_dtheta((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, tmp2)
-                
+
                 target[ps1] += tmp1
-                target[ps2] += tmp2                
+                target[ps2] += tmp2
             else:
                 raise NotImplementedError, "psi2 cannot be computed for this kernel"
 
@@ -615,17 +614,17 @@ class kern(Parameterized):
                 psi11 = np.zeros((N, M))
                 psi12 = np.zeros((NM, M))
                 #psi12_t = np.zeros((N,M))
-                
+
                 p1.psi1(Z, mu, S, psi11)
                 Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
                 Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
-                
+
                 p2.psi1(Z, Mu, Sigma, psi12)
                 tmp1 = np.zeros_like(target)
                 p1.dpsi1_dZ((dL_dpsi2*psi12.reshape(N,M,M)).sum(1), Z, mu, S, tmp1)
                 p1.dpsi1_dZ((dL_dpsi2*psi12.reshape(N,M,M)).sum(2), Z, mu, S, tmp1)
                 target += tmp1
-                
+
                 #p2.dpsi1_dtheta((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, target)
                 p2.dpsi1_dZ((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, target)
             else:
@@ -666,21 +665,21 @@ class kern(Parameterized):
                 psi11 = np.zeros((N, M))
                 psi12 = np.zeros((NM, M))
                 #psi12_t = np.zeros((N,M))
-                
+
                 p1.psi1(Z, mu, S, psi11)
                 Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
                 Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
-                
+
                 p2.psi1(Z, Mu, Sigma, psi12)
                 p1.dpsi1_dmuS((dL_dpsi2*psi12.reshape(N,M,M)).sum(1), Z, mu, S, target_mu, target_S)
                 p1.dpsi1_dmuS((dL_dpsi2*psi12.reshape(N,M,M)).sum(2), Z, mu, S, target_mu, target_S)
-                
+
                 #p2.dpsi1_dtheta((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, target)
                 p2.dpsi1_dmuS((dL_dpsi2*(psi11[:,:,None])).sum(1)*2, Z, Mu.reshape(N,M,self.input_dim).sum(1), Sigma.reshape(N,M,self.input_dim).sum(1), target_mu, target_S)
             else:
                 raise NotImplementedError, "psi2 cannot be computed for this kernel"
         return target_mu, target_S
-    
+
     def plot(self, x=None, plot_limits=None, which_parts='all', resolution=None, *args, **kwargs):
         if which_parts == 'all':
             which_parts = [True] * self.num_parts
@@ -753,7 +752,7 @@ class Kern_check_model(Model):
                 dL_dK = np.ones((X.shape[0], X.shape[0]))
             else:
                 dL_dK = np.ones((X.shape[0], X2.shape[0]))
-        
+
         self.kernel=kernel
         self.X = X
         self.X2 = X2
@@ -768,7 +767,7 @@ class Kern_check_model(Model):
             return False
         else:
             return True
-        
+
     def _get_params(self):
         return self.kernel._get_params()
 
@@ -783,7 +782,7 @@ class Kern_check_model(Model):
 
     def _log_likelihood_gradients(self):
         raise NotImplementedError, "This needs to be implemented to use the kern_check_model class."
-    
+
 class Kern_check_dK_dtheta(Kern_check_model):
     """This class allows gradient checks for the gradient of a kernel with respect to parameters. """
     def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
@@ -798,7 +797,7 @@ class Kern_check_dKdiag_dtheta(Kern_check_model):
         Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
         if dL_dK==None:
             self.dL_dK = np.ones((self.X.shape[0]))
-        
+
     def log_likelihood(self):
         return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
 
@@ -815,7 +814,7 @@ class Kern_check_dK_dX(Kern_check_model):
 
     def _get_param_names(self):
         return ['X_'  +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
-                
+
     def _get_params(self):
         return self.X.flatten()
 
@@ -837,7 +836,7 @@ class Kern_check_dKdiag_dX(Kern_check_model):
 
     def _get_param_names(self):
         return ['X_'  +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
-                
+
     def _get_params(self):
         return self.X.flatten()
 
@@ -863,7 +862,6 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False, X_positive=
         if output_ind is not None:
             assert(output_ind<kern.input_dim)
             X[:, output_ind] = np.random.randint(low=0,high=kern.parts[0].output_dim, size=X.shape[0])
-            import ipdb; ipdb.set_trace()
     if X2==None:
         X2 = np.random.randn(20, kern.input_dim)
         if X_positive:

From f59125d4a138b8ef989e057842198fdb746f4a16 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 27 Nov 2013 13:21:11 +0000
Subject: [PATCH 227/252] Fixed step size for likelihood tests and allowed
 randomizing of laplace

---
 GPy/testing/likelihoods_tests.py | 36 ++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/GPy/testing/likelihoods_tests.py b/GPy/testing/likelihoods_tests.py
index 191dae57..77f78d9b 100644
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@@ -6,6 +6,8 @@ import functools
 import inspect
 from GPy.likelihoods.noise_models import gp_transformations
 from functools import partial
+#np.random.seed(300)
+np.random.seed(690)
 
 def dparam_partial(inst_func, *args):
     """
@@ -144,7 +146,7 @@ class TestNoiseModels(object):
                             "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
                             "grad_params": {
                                 "names": ["t_noise"],
-                                "vals": [1],
+                                "vals": [1.0],
                                 "constraints": [constrain_positive]
                                 },
                             "laplace": True
@@ -158,6 +160,15 @@ class TestNoiseModels(object):
                                 },
                             "laplace": True
                             },
+                        "Student_t_large_var": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [10.0],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
                         "Student_t_approx_gauss": {
                             "model": GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var),
                             "grad_params": {
@@ -315,9 +326,11 @@ class TestNoiseModels(object):
     def t_logpdf(self, model, Y, f):
         print "\n{}".format(inspect.stack()[0][3])
         print model
+        print model._get_params()
         np.testing.assert_almost_equal(
-                               np.log(model.pdf(f.copy(), Y.copy())),
-                               model.logpdf(f.copy(), Y.copy()))
+                               model.pdf(f.copy(), Y.copy()),
+                               np.exp(model.logpdf(f.copy(), Y.copy()))
+                               )
 
     @with_setup(setUp, tearDown)
     def t_dlogpdf_df(self, model, Y, f):
@@ -363,7 +376,7 @@ class TestNoiseModels(object):
         assert (
                 dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
                     params, args=(f, Y), constraints=param_constraints,
-                    randomize=False, verbose=True)
+                    randomize=True, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
@@ -373,7 +386,7 @@ class TestNoiseModels(object):
         assert (
                 dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
                     params, args=(f, Y), constraints=param_constraints,
-                    randomize=False, verbose=True)
+                    randomize=True, verbose=True)
                 )
 
     @with_setup(setUp, tearDown)
@@ -383,7 +396,7 @@ class TestNoiseModels(object):
         assert (
                 dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
                     params, args=(f, Y), constraints=param_constraints,
-                    randomize=False, verbose=True)
+                    randomize=True, verbose=True)
                 )
 
     ################
@@ -478,7 +491,7 @@ class TestNoiseModels(object):
         print "\n{}".format(inspect.stack()[0][3])
         #Normalize
         Y = Y/Y.max()
-        white_var = 0.001
+        white_var = 1e-6
         kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), model)
         m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=laplace_likelihood)
@@ -490,12 +503,13 @@ class TestNoiseModels(object):
             m[name] = param_vals[param_num]
             constraints[param_num](name, m)
 
+        print m
         m.randomize()
-        m.optimize(max_iters=8)
+        #m.optimize(max_iters=8)
         print m
         m.checkgrad(verbose=1, step=step)
-        if not m.checkgrad(step=step):
-            m.checkgrad(verbose=1, step=step)
+        #if not m.checkgrad(step=step):
+            #m.checkgrad(verbose=1, step=step)
             #import ipdb; ipdb.set_trace()
             #NOTE this test appears to be stochastic for some likelihoods (student t?)
             # appears to all be working in test mode right now...
@@ -509,7 +523,7 @@ class TestNoiseModels(object):
         print "\n{}".format(inspect.stack()[0][3])
         #Normalize
         Y = Y/Y.max()
-        white_var = 0.001
+        white_var = 1e-6
         kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
         ep_likelihood = GPy.likelihoods.EP(Y.copy(), model)
         m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=ep_likelihood)

From 133d69ff6735e0b30c8db04d28f87ed49f292ab3 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 13:29:19 +0000
Subject: [PATCH 228/252] changeing models to _models in setup.py

---
 GPy/examples/laplace_approximations.py | 2 +-
 setup.py                               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index ce47554d..f74e4d37 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -2,7 +2,7 @@ import GPy
 import numpy as np
 import matplotlib.pyplot as plt
 from GPy.util import datasets
-np.random.seed(1)
+#np.random.seed(1)
 
 def student_t_approx():
     """
diff --git a/setup.py b/setup.py
index 27ebf975..88ee6257 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@ setup(name = 'GPy',
       license = "BSD 3-clause",
       keywords = "machine-learning gaussian-processes kernels",
       url = "http://sheffieldml.github.com/GPy/",
-      packages = ['GPy', 'GPy.core', 'GPy.kern', 'GPy.util', 'GPy.models', 'GPy.inference', 'GPy.examples', 'GPy.likelihoods', 'GPy.testing', 'GPy.util.latent_space_visualizations', 'GPy.util.latent_space_visualizations.controllers', 'GPy.likelihoods.noise_models', 'GPy.kern.parts', 'GPy.mappings'],
+      packages = ['GPy', 'GPy.core', 'GPy.kern', 'GPy.util', 'GPy._models', 'GPy.inference', 'GPy.examples', 'GPy.likelihoods', 'GPy.testing', 'GPy.util.latent_space_visualizations', 'GPy.util.latent_space_visualizations.controllers', 'GPy.likelihoods.noise_models', 'GPy.kern.parts', 'GPy.mappings'],
       package_dir={'GPy': 'GPy'},
       package_data = {'GPy': ['GPy/examples', 'gpy_config.cfg']},
       py_modules = ['GPy.__init__'],

From ca4117322549d5a968c427fb23e093c4bba6a0d9 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 13:47:08 +0000
Subject: [PATCH 229/252] better warings for cathcing of blaslib detection

---
 GPy/util/linalg.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 9db769e6..cf210bba 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -12,6 +12,7 @@ import ctypes
 from ctypes import byref, c_char, c_int, c_double # TODO
 # import scipy.lib.lapack
 import scipy
+import warnings
 
 if np.all(np.float64((scipy.__version__).split('.')[:2]) >= np.array([0, 12])):
     import scipy.linalg.lapack as lapack
@@ -25,6 +26,9 @@ try:
     assert hasattr(_blaslib, 'dsyr_')
 except AssertionError:
     _blas_available = False
+except AttributeError e:
+    _blas_available = False
+    warnings.warn("warning: caught this exception:" + str(e))
 
 def dtrtrs(A, B, lower=0, trans=0, unitdiag=0):
     """

From f5329bb9b6ebc4b3296321ebbed4af2cba386601 Mon Sep 17 00:00:00 2001
From: Teo de Campos <teo@compbio1.(none)>
Date: Wed, 27 Nov 2013 14:06:50 +0000
Subject: [PATCH 230/252] Fixed exception handling bug in GPy/util/linalg.py:29

---
 GPy/util/linalg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index cf210bba..e3e421f6 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -26,7 +26,7 @@ try:
     assert hasattr(_blaslib, 'dsyr_')
 except AssertionError:
     _blas_available = False
-except AttributeError e:
+except AttributeError as e:
     _blas_available = False
     warnings.warn("warning: caught this exception:" + str(e))
 

From 042ebab81e5dfd83809a2b385d5e7f1300403bfb Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 14:12:54 +0000
Subject: [PATCH 231/252] argghdfklg

---
 GPy/util/linalg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index cf210bba..e3e421f6 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -26,7 +26,7 @@ try:
     assert hasattr(_blaslib, 'dsyr_')
 except AssertionError:
     _blas_available = False
-except AttributeError e:
+except AttributeError as e:
     _blas_available = False
     warnings.warn("warning: caught this exception:" + str(e))
 

From cfdd91ae7bb9376c2cfe6cf844ae497ce13296d7 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 14:21:18 +0000
Subject: [PATCH 232/252] improved detectino of sympy

---
 GPy/util/__init__.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index 629b3f48..2d2b6e17 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -14,6 +14,15 @@ import visualize
 import decorators
 import classification
 import latent_space_visualizations
-import symbolic
+
+try:
+    import sympy
+    _sympy_available = True
+    del sympy
+except ImportError as e:
+    _sympy_available = False
+
+if _sympy_available:
+    import symbolic
 
 import netpbmfile

From 557d4ea7eab2c4d26147321aa2e4fe7cc0e24f84 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 14:43:48 +0000
Subject: [PATCH 233/252] reverted the brent optimisation in laplace

(For the 1D linesearch using Brent)
---
 GPy/likelihoods/laplace.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 3aa78ffc..57160d64 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -349,7 +349,8 @@ class Laplace(likelihood):
             #Find the stepsize that minimizes the objective function using a brent line search
             #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
             #steps than get this exact then make a step, if B was bigger it might be the other way around though
-            new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
+            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
+            new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
             f = self.tmp_f.copy()
             Ki_f = self.tmp_Ki_f.copy()
 

From 0c3747dc4d42d7dfb157d2377636d2e5f93894eb Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 14:57:57 +0000
Subject: [PATCH 234/252] Fixed symmetry in checkgrad issue

---
 GPy/core/model.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 95d4565d..6fbc9623 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -453,7 +453,12 @@ class Model(Parameterized):
 
         if not verbose:
             # just check the global ratio
-            dx = step * np.sign(np.random.uniform(-1, 1, x.size))
+
+            #choose a random direction to find the linear approximation in
+            if x.size==2:
+                dx = step * np.ones(2) # random direction for 2 parameters can fail dure to symmetry
+            else:
+                dx = step * np.sign(np.random.uniform(-1, 1, x.size))
 
             # evaulate around the point x
             f1, g1 = self.objective_and_gradients(x + dx)

From eafcd50af5848f3cb8d9533c8f7a0229c01e42c7 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 15:00:42 +0000
Subject: [PATCH 235/252] changing the seed seems to fix Alan's bug.

---
 GPy/testing/likelihoods_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/testing/likelihoods_tests.py b/GPy/testing/likelihoods_tests.py
index 77f78d9b..9b7b7eb6 100644
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@@ -7,7 +7,7 @@ import inspect
 from GPy.likelihoods.noise_models import gp_transformations
 from functools import partial
 #np.random.seed(300)
-np.random.seed(690)
+np.random.seed(7)
 
 def dparam_partial(inst_func, *args):
     """

From 944703beff79e30ab46c212bbc102f60e6cf79bb Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 27 Nov 2013 15:02:30 +0000
Subject: [PATCH 236/252] dimensionality reduction example (oil) updated

---
 GPy/examples/dimensionality_reduction.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index cdd69ab5..0155ff94 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -165,19 +165,14 @@ def BGPLVM_oil(optimize=True, N=200, Q=7, num_inducing=40, max_iters=1000, plot=
 
     # optimize
     if optimize:
-        m.constrain_fixed('noise')
-        m.optimize('scg', messages=1, max_iters=200, gtol=.05)
-        m.constrain_positive('noise')
-        m.constrain_bounded('white', 1e-7, 1)
         m.optimize('scg', messages=1, max_iters=max_iters, gtol=.05)
 
     if plot:
         y = m.likelihood.Y[0, :]
         fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
-        plt.sca(latent_axes)
-        m.plot_latent()
+        m.plot_latent(ax=latent_axes)
         data_show = GPy.util.visualize.vector_show(y)
-        lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], m, data_show, latent_axes=latent_axes) # , sense_axes=sense_axes)
+        lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
         raw_input('Press enter to finish')
         plt.close(fig)
     return m

From 50e9034a6d7d9ea3a16df00d09182b8193d2fca9 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Wed, 27 Nov 2013 16:12:58 +0000
Subject: [PATCH 237/252] dimensionality reduction examples updated with
 optimize, plot and verbose

---
 GPy/_models/sparse_gplvm.py              |   4 +-
 GPy/examples/dimensionality_reduction.py | 473 ++++++++++-------------
 2 files changed, 216 insertions(+), 261 deletions(-)

diff --git a/GPy/_models/sparse_gplvm.py b/GPy/_models/sparse_gplvm.py
index ab616d5a..4e401ee3 100644
--- a/GPy/_models/sparse_gplvm.py
+++ b/GPy/_models/sparse_gplvm.py
@@ -66,5 +66,5 @@ class SparseGPLVM(SparseGPRegression, GPLVM):
         pb.plot(mu[:, 0] , mu[:, 1], 'ko')
 
     def plot_latent(self, *args, **kwargs):
-        input_1, input_2 = GPLVM.plot_latent(*args, **kwargs)
-        pb.plot(m.Z[:, input_1], m.Z[:, input_2], '^w')
+        GPLVM.plot_latent(self, *args, **kwargs)
+        #pb.plot(self.Z[:, input_1], self.Z[:, input_2], '^w')
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 0155ff94..9120805c 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -1,99 +1,93 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
+import numpy as _np
+default_seed = _np.random.seed(123344)
 
-import numpy as np
-from matplotlib import pyplot as plt, cm
-
-import GPy
-from GPy.core.transformations import logexp
-from GPy.likelihoods.gaussian import Gaussian
-from GPy.models import BayesianGPLVM
-
-default_seed = np.random.seed(123344)
-
-def BGPLVM(seed=default_seed):
-    N = 13
+def bgplvm_test_model(seed=default_seed, optimize=0, verbose=1, plot=0):
+    """
+    model for testing purposes. Samples from a GP with rbf kernel and learns 
+    the samples with a new kernel. Normally not for optimization, just model cheking
+    """
+    from GPy.likelihoods.gaussian import Gaussian
+    import GPy
+    
+    num_inputs = 13
     num_inducing = 5
-    Q = 6
-    D = 25
+    if plot: 
+        output_dim = 1
+        input_dim = 2
+    else: 
+        input_dim = 2
+        output_dim = 25
+    
     # generate GPLVM-like data
-    X = np.random.rand(N, Q)
-    lengthscales = np.random.rand(Q)
-    k = (GPy.kern.rbf(Q, .5, lengthscales, ARD=True)
-         + GPy.kern.white(Q, 0.01))
+    X = _np.random.rand(num_inputs, input_dim)
+    lengthscales = _np.random.rand(input_dim)
+    k = (GPy.kern.rbf(input_dim, .5, lengthscales, ARD=True)
+         + GPy.kern.white(input_dim, 0.01))
     K = k.K(X)
-    Y = np.random.multivariate_normal(np.zeros(N), K, D).T
+    Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, output_dim).T
     lik = Gaussian(Y, normalize=True)
 
-    # k = GPy.kern.rbf_inv(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
-    # k = GPy.kern.linear(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
-    # k = GPy.kern.rbf(Q, ARD = False)  + GPy.kern.white(Q, 0.00001)
-    # k = GPy.kern.rbf(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.rbf(Q, .3, np.ones(Q) * .2, ARD=True)
-    k = GPy.kern.rbf(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.linear(Q, np.ones(Q) * .2, ARD=True)
-    # k = GPy.kern.rbf(Q, .5, 2., ARD=0) + GPy.kern.rbf(Q, .3, .2, ARD=0)
+    k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
+    # k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
+    # k = GPy.kern.rbf(input_dim, ARD = False)  + GPy.kern.white(input_dim, 0.00001)
+    # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.rbf(input_dim, .3, _np.ones(input_dim) * .2, ARD=True)
+    # k = GPy.kern.rbf(input_dim, .5, 2., ARD=0) + GPy.kern.rbf(input_dim, .3, .2, ARD=0)
+    # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)
 
-    m = GPy.models.BayesianGPLVM(lik, Q, kernel=k, num_inducing=num_inducing)
+    m = GPy.models.BayesianGPLVM(lik, input_dim, kernel=k, num_inducing=num_inducing)
     m.lengthscales = lengthscales
-    # m.constrain_positive('(rbf|bias|noise|white|S)')
-    # m.constrain_fixed('S', 1)
 
-    # pb.figure()
-    # m.plot()
-    # pb.title('PCA initialisation')
-    # pb.figure()
-    # m.optimize(messages = 1)
-    # m.plot()
-    # pb.title('After optimisation')
-    # m.randomize()
-    # m.checkgrad(verbose=1)
+    if plot:
+        import matplotlib.pyplot as pb
+        m.plot()
+        pb.title('PCA initialisation')
+   
+    if optimize:
+        m.optimize('scg', messages=verbose)
+        if plot:
+            m.plot()
+            pb.title('After optimisation')
 
     return m
 
-def GPLVM_oil_100(optimize=True):
+def gplvm_oil_100(optimize=1, verbose=1, plot=1):
+    import GPy
     data = GPy.util.datasets.oil_100()
     Y = data['X']
-
     # create simple GP model
     kernel = GPy.kern.rbf(6, ARD=True) + GPy.kern.bias(6)
     m = GPy.models.GPLVM(Y, 6, kernel=kernel)
     m.data_labels = data['Y'].argmax(axis=1)
-
-    # optimize
-    if optimize:
-        m.optimize('scg', messages=1)
-
-    # plot
-    print(m)
-    m.plot_latent(labels=m.data_labels)
+    if optimize: m.optimize('scg', messages=verbose)
+    if plot: m.plot_latent(labels=m.data_labels)
     return m
 
-def sparseGPLVM_oil(optimize=True, N=100, Q=6, num_inducing=15, max_iters=50):
-    np.random.seed(0)
+def sparse_gplvm_oil(optimize=1, verbose=0, plot=1, N=100, Q=6, num_inducing=15, max_iters=50):
+    import GPy
+    _np.random.seed(0)
     data = GPy.util.datasets.oil()
-
     Y = data['X'][:N]
     Y = Y - Y.mean(0)
     Y /= Y.std(0)
-
-    # create simple GP model
+    # Create the model
     kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q)
     m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing)
-    m.data_labels = data['Y'].argmax(axis=1)
+    m.data_labels = data['Y'][:N].argmax(axis=1)
 
-    # optimize
-    if optimize:
-        m.optimize('scg', messages=1, max_iters=max_iters)
-
-    # plot
-    print(m)
-    # m.plot_latent(labels=m.data_labels)
+    if optimize: m.optimize('scg', messages=verbose, max_iters=max_iters)
+    if plot: 
+        m.plot_latent(labels=m.data_labels)
+        m.kern.plot_ARD()
     return m
 
-def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False):
+def swiss_roll(optimize=1, verbose=1, plot=1, N=1000, num_inducing=15, Q=4, sigma=.2):
+    import GPy
     from GPy.util.datasets import swiss_roll_generated
-    from GPy.core.transformations import logexp_clipped
+    from GPy.models import BayesianGPLVM
 
-    data = swiss_roll_generated(N=N, sigma=sigma)
+    data = swiss_roll_generated(num_samples=N, sigma=sigma)
     Y = data['Y']
     Y -= Y.mean()
     Y /= Y.std()
@@ -106,114 +100,98 @@ def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False
         iso = Isomap().fit(Y)
         X = iso.embedding_
         if Q > 2:
-            X = np.hstack((X, np.random.randn(N, Q - 2)))
+            X = _np.hstack((X, _np.random.randn(N, Q - 2)))
     except ImportError:
-        X = np.random.randn(N, Q)
+        X = _np.random.randn(N, Q)
 
     if plot:
-        from mpl_toolkits import mplot3d
-        import pylab
-        fig = pylab.figure("Swiss Roll Data")
+        import matplotlib.pyplot as plt
+        from mpl_toolkits.mplot3d import Axes3D  # @UnusedImport
+        fig = plt.figure("Swiss Roll Data")
         ax = fig.add_subplot(121, projection='3d')
         ax.scatter(*Y.T, c=c)
         ax.set_title("Swiss Roll")
 
         ax = fig.add_subplot(122)
         ax.scatter(*X.T[:2], c=c)
-        ax.set_title("Initialization")
-
+        ax.set_title("BGPLVM init")
 
     var = .5
-    S = (var * np.ones_like(X) + np.clip(np.random.randn(N, Q) * var ** 2,
+    S = (var * _np.ones_like(X) + _np.clip(_np.random.randn(N, Q) * var ** 2,
                                          - (1 - var),
                                          (1 - var))) + .001
-    Z = np.random.permutation(X)[:num_inducing]
+    Z = _np.random.permutation(X)[:num_inducing]
 
-    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))
+    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2))
 
     m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
     m.data_colors = c
     m.data_t = t
-
-    m['rbf_lengthscale'] = 1. # X.var(0).max() / X.var(0)
     m['noise_variance'] = Y.var() / 100.
-    m['bias_variance'] = 0.05
 
     if optimize:
-        m.optimize('scg', messages=1)
+        m.optimize('scg', messages=verbose, max_iters=2e3)
+    
+    if plot:
+        fig = plt.figure('fitted')
+        ax = fig.add_subplot(111)
+        s = m.input_sensitivity().argsort()[::-1][:2]
+        ax.scatter(*m.X.T[s], c=c)
+        
     return m
 
-def BGPLVM_oil(optimize=True, N=200, Q=7, num_inducing=40, max_iters=1000, plot=False, **k):
-    np.random.seed(0)
+def bgplvm_oil(optimize=1, verbose=1, plot=1, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
+    import GPy
+    from GPy.likelihoods import Gaussian
+    from matplotlib import pyplot as plt
+
+    _np.random.seed(0)
     data = GPy.util.datasets.oil()
 
-    # create simple GP model
-    kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2))
-
+    kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2))
     Y = data['X'][:N]
     Yn = Gaussian(Y, normalize=True)
-#     Yn = Y - Y.mean(0)
-#     Yn /= Yn.std(0)
-
     m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k)
     m.data_labels = data['Y'][:N].argmax(axis=1)
-
-    # m.constrain('variance|leng', logexp_clipped())
-    # m['.*lengt'] = m.X.var(0).max() / m.X.var(0)
     m['noise'] = Yn.Y.var() / 100.
 
-
-    # optimize
     if optimize:
-        m.optimize('scg', messages=1, max_iters=max_iters, gtol=.05)
+        m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05)
 
     if plot:
         y = m.likelihood.Y[0, :]
         fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
         m.plot_latent(ax=latent_axes)
         data_show = GPy.util.visualize.vector_show(y)
-        lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
+        lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], # @UnusedVariable
+            m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)  
         raw_input('Press enter to finish')
         plt.close(fig)
     return m
 
-def oil_100():
-    data = GPy.util.datasets.oil_100()
-    m = GPy.models.GPLVM(data['X'], 2)
-
-    # optimize
-    m.optimize(messages=1, max_iters=2)
-
-    # plot
-    print(m)
-    # m.plot_latent(labels=data['Y'].argmax(axis=1))
-    return m
-
-
-
 def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
-    x = np.linspace(0, 4 * np.pi, N)[:, None]
-    s1 = np.vectorize(lambda x: np.sin(x))
-    s2 = np.vectorize(lambda x: np.cos(x))
-    s3 = np.vectorize(lambda x:-np.exp(-np.cos(2 * x)))
-    sS = np.vectorize(lambda x: np.sin(2 * x))
+    x = _np.linspace(0, 4 * _np.pi, N)[:, None]
+    s1 = _np.vectorize(lambda x: _np.sin(x))
+    s2 = _np.vectorize(lambda x: _np.cos(x))
+    s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x)))
+    sS = _np.vectorize(lambda x: _np.sin(2 * x))
 
     s1 = s1(x)
     s2 = s2(x)
     s3 = s3(x)
     sS = sS(x)
 
-    S1 = np.hstack([s1, sS])
-    S2 = np.hstack([s2, s3, sS])
-    S3 = np.hstack([s3, sS])
+    S1 = _np.hstack([s1, sS])
+    S2 = _np.hstack([s2, s3, sS])
+    S3 = _np.hstack([s3, sS])
 
-    Y1 = S1.dot(np.random.randn(S1.shape[1], D1))
-    Y2 = S2.dot(np.random.randn(S2.shape[1], D2))
-    Y3 = S3.dot(np.random.randn(S3.shape[1], D3))
+    Y1 = S1.dot(_np.random.randn(S1.shape[1], D1))
+    Y2 = S2.dot(_np.random.randn(S2.shape[1], D2))
+    Y3 = S3.dot(_np.random.randn(S3.shape[1], D3))
 
-    Y1 += .3 * np.random.randn(*Y1.shape)
-    Y2 += .2 * np.random.randn(*Y2.shape)
-    Y3 += .25 * np.random.randn(*Y3.shape)
+    Y1 += .3 * _np.random.randn(*Y1.shape)
+    Y2 += .2 * _np.random.randn(*Y2.shape)
+    Y3 += .25 * _np.random.randn(*Y3.shape)
 
     Y1 -= Y1.mean(0)
     Y2 -= Y2.mean(0)
@@ -245,88 +223,74 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
 
     return slist, [S1, S2, S3], Ylist
 
-def bgplvm_simulation_matlab_compare():
-    from GPy.util.datasets import simulation_BGPLVM
-    sim_data = simulation_BGPLVM()
-    Y = sim_data['Y']
-    S = sim_data['S']
-    mu = sim_data['mu']
-    num_inducing, [_, Q] = 3, mu.shape
+# def bgplvm_simulation_matlab_compare():
+#     from GPy.util.datasets import simulation_BGPLVM
+#     from GPy import kern
+#     from GPy.models import BayesianGPLVM
+# 
+#     sim_data = simulation_BGPLVM()
+#     Y = sim_data['Y']
+#     mu = sim_data['mu']
+#     num_inducing, [_, Q] = 3, mu.shape
+# 
+#     k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2))
+#     m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k,
+#                        _debug=False)
+#     m.auto_scale_factor = True
+#     m['noise'] = Y.var() / 100.
+#     m['linear_variance'] = .01
+#     return m
 
-    from GPy.models import mrd
-    from GPy import kern
-    reload(mrd); reload(kern)
-    k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
-    m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k,
-#                        X=mu,
-#                        X_variance=S,
-                       _debug=False)
-    m.auto_scale_factor = True
-    m['noise'] = Y.var() / 100.
-    m['linear_variance'] = .01
-    return m
-
-def bgplvm_simulation(optimize='scg',
-                      plot=True,
+def bgplvm_simulation(optimize=1, verbose=1, 
+                      plot=1, plot_sim=False,
                       max_iters=2e4,
-                      plot_sim=False):
-#     from GPy.core.transformations import logexp_clipped
-    D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10
-    slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
-
-    from GPy.models import mrd
+                      ):
     from GPy import kern
-    reload(mrd); reload(kern)
+    from GPy.models import BayesianGPLVM
 
+    D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10
+    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
     Y = Ylist[0]
-
-    k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) # + kern.bias(Q)
+    k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
     m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
-
-    # m.constrain('variance|noise', logexp_clipped())
     m['noise'] = Y.var() / 100.
 
     if optimize:
         print "Optimizing model:"
-        m.optimize(optimize, max_iters=max_iters,
-                   messages=True, gtol=.05)
+        m.optimize('scg', messages=verbose, max_iters=max_iters,
+                   gtol=.05)
     if plot:
         m.plot_X_1d("BGPLVM Latent Space 1D")
         m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
     return m
 
-def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
+def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
+    from GPy import kern
+    from GPy.models import MRD
+    from GPy.likelihoods import Gaussian
+    
     D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
-    slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
-
+    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
     likelihood_list = [Gaussian(x, normalize=True) for x in Ylist]
 
-    from GPy.models import mrd
-    from GPy import kern
-
-    reload(mrd); reload(kern)
-
-    k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
-    m = mrd.MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
+    k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2))
+    m = MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
     m.ensure_default_constraints()
 
     for i, bgplvm in enumerate(m.bgplvms):
         m['{}_noise'.format(i)] = bgplvm.likelihood.Y.var() / 500.
 
-
-    # DEBUG
-    # np.seterr("raise")
-
     if optimize:
         print "Optimizing Model:"
-        m.optimize(messages=1, max_iters=8e3, gtol=.1)
+        m.optimize(messages=verbose, max_iters=8e3, gtol=.1)
     if plot:
         m.plot_X_1d("MRD Latent Space 1D")
         m.plot_scales("MRD Scales")
     return m
 
-def brendan_faces():
-    from GPy import kern
+def brendan_faces(optimize=True, verbose=True, plot=True):
+    import GPy
+    
     data = GPy.util.datasets.brendan_faces()
     Q = 2
     Y = data['Y']
@@ -338,18 +302,20 @@ def brendan_faces():
     # optimize
     m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped())
 
-    m.optimize('scg', messages=1, max_iters=1000)
+    if optimize: m.optimize('scg', messages=verbose, max_iters=1000)
 
-    ax = m.plot_latent(which_indices=(0, 1))
-    y = m.likelihood.Y[0, :]
-    data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
-    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
-    raw_input('Press enter to finish')
+    if plot:
+        ax = m.plot_latent(which_indices=(0, 1))
+        y = m.likelihood.Y[0, :]
+        data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        raw_input('Press enter to finish')
 
     return m
 
-def olivetti_faces():
-    from GPy import kern
+def olivetti_faces(optimize=True, verbose=True, plot=True):
+    import GPy
+    
     data = GPy.util.datasets.olivetti_faces()
     Q = 2
     Y = data['Y']
@@ -357,153 +323,142 @@ def olivetti_faces():
     Yn /= Yn.std()
 
     m = GPy.models.GPLVM(Yn, Q)
-    m.optimize('scg', messages=1, max_iters=1000)
-
-    ax = m.plot_latent(which_indices=(0, 1))
-    y = m.likelihood.Y[0, :]
-    data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
-    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
-    raw_input('Press enter to finish')
+    if optimize: m.optimize('scg', messages=verbose, max_iters=1000)
+    if plot:
+        ax = m.plot_latent(which_indices=(0, 1))
+        y = m.likelihood.Y[0, :]
+        data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        raw_input('Press enter to finish')
 
     return m
 
-def stick_play(range=None, frame_rate=15):
-
+def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=True):
+    import GPy
     data = GPy.util.datasets.osu_run1()
     # optimize
     if range == None:
         Y = data['Y'].copy()
     else:
         Y = data['Y'][range[0]:range[1], :].copy()
-    y = Y[0, :]
-    data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-    GPy.util.visualize.data_play(Y, data_show, frame_rate)
+    if plot:
+        y = Y[0, :]
+        data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
+        GPy.util.visualize.data_play(Y, data_show, frame_rate)
     return Y
 
-def stick(kernel=None):
+def stick(kernel=None, optimize=True, verbose=True, plot=True):
+    from matplotlib import pyplot as plt
+    import GPy
+    
     data = GPy.util.datasets.osu_run1()
     # optimize
     m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
-    m.optimize(messages=1, max_f_eval=10000)
-    if GPy.util.visualize.visual_available:
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if plot and GPy.util.visualize.visual_available:
         plt.clf
         ax = m.plot_latent()
         y = m.likelihood.Y[0, :]
         data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-        lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
         raw_input('Press enter to finish')
-
+        
     return m
 
-def bcgplvm_linear_stick(kernel=None):
+def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
+    from matplotlib import pyplot as plt
+    import GPy
+    
     data = GPy.util.datasets.osu_run1()
     # optimize
     mapping = GPy.mappings.Linear(data['Y'].shape[1], 2)
     m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
-    m.optimize(messages=1, max_f_eval=10000)
-    if GPy.util.visualize.visual_available:
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if plot and GPy.util.visualize.visual_available:
         plt.clf
         ax = m.plot_latent()
         y = m.likelihood.Y[0, :]
         data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-        lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
         raw_input('Press enter to finish')
 
     return m
 
-def bcgplvm_stick(kernel=None):
+def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
+    from matplotlib import pyplot as plt
+    import GPy
+    
     data = GPy.util.datasets.osu_run1()
     # optimize
     back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.)
     mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel)
     m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
-    m.optimize(messages=1, max_f_eval=10000)
-    if GPy.util.visualize.visual_available:
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if plot and GPy.util.visualize.visual_available:
         plt.clf
         ax = m.plot_latent()
         y = m.likelihood.Y[0, :]
         data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-        lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
         raw_input('Press enter to finish')
 
     return m
 
-def robot_wireless():
+def robot_wireless(optimize=True, verbose=True, plot=True):
+    from matplotlib import pyplot as plt
+    import GPy
+    
     data = GPy.util.datasets.robot_wireless()
     # optimize
     m = GPy.models.GPLVM(data['Y'], 2)
-    m.optimize(messages=1, max_f_eval=10000)
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
     m._set_params(m._get_params())
-    plt.clf
-    ax = m.plot_latent()
+    if plot:
+        m.plot_latent()
 
     return m
 
-def stick_bgplvm(model=None):
+def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
+    from GPy.models import BayesianGPLVM
+    from matplotlib import pyplot as plt
+    import GPy
+    
     data = GPy.util.datasets.osu_run1()
     Q = 6
-    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))
+    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2))
     m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
     # optimize
     m.ensure_default_constraints()
-    m.optimize('scg', messages=1, max_iters=200, xtol=1e-300, ftol=1e-300)
+    if optimize: m.optimize('scg', messages=verbose, max_iters=200, xtol=1e-300, ftol=1e-300)
     m._set_params(m._get_params())
-    plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2)
-    plt.sca(latent_axes)
-    m.plot_latent()
-    y = m.likelihood.Y[0, :].copy()
-    data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-    lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
-    raw_input('Press enter to finish')
+    if plot:
+        plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2)
+        plt.sca(latent_axes)
+        m.plot_latent()
+        y = m.likelihood.Y[0, :].copy()
+        data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
+        GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
+        raw_input('Press enter to finish')
 
     return m
 
 
-def cmu_mocap(subject='35', motion=['01'], in_place=True):
-
+def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose=True, plot=True):
+    import GPy
+    
     data = GPy.util.datasets.cmu_mocap(subject, motion)
-    Y = data['Y']
     if in_place:
         # Make figure move in place.
         data['Y'][:, 0:3] = 0.0
     m = GPy.models.GPLVM(data['Y'], 2, normalize_Y=True)
 
-    # optimize
-    m.optimize(messages=1, max_f_eval=10000)
-
-    ax = m.plot_latent()
-    y = m.likelihood.Y[0, :]
-    data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel'])
-    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
-    raw_input('Press enter to finish')
-    lvm_visualizer.close()
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if plot:
+        ax = m.plot_latent()
+        y = m.likelihood.Y[0, :]
+        data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel'])
+        lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        raw_input('Press enter to finish')
+        lvm_visualizer.close()
 
     return m
-
-# def BGPLVM_oil():
-#     data = GPy.util.datasets.oil()
-#     Y, X = data['Y'], data['X']
-#     X -= X.mean(axis=0)
-#     X /= X.std(axis=0)
-#
-#     Q = 10
-#     num_inducing = 30
-#
-#     kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
-#     m = GPy.models.BayesianGPLVM(X, Q, kernel=kernel, num_inducing=num_inducing)
-#     # m.scale_factor = 100.0
-#     m.constrain_positive('(white|noise|bias|X_variance|rbf_variance|rbf_length)')
-#     from sklearn import cluster
-#     km = cluster.KMeans(num_inducing, verbose=10)
-#     Z = km.fit(m.X).cluster_centers_
-#     # Z = GPy.util.misc.kmm_init(m.X, num_inducing)
-#     m.set('iip', Z)
-#     m.set('bias', 1e-4)
-#     # optimize
-#
-#     import pdb; pdb.set_trace()
-#     m.optimize('tnc', messages=1)
-#     print m
-#     m.plot_latent(labels=data['Y'].argmax(axis=1))
-#     return m
-

From c86981a110f57e19b6841da89c0f1aa6e6a9d317 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Nov 2013 17:02:04 +0000
Subject: [PATCH 238/252] some tidying in the regression examples

---
 GPy/examples/regression.py | 235 +++++++++++++++++++------------------
 1 file changed, 119 insertions(+), 116 deletions(-)

diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index a37e32c3..1ddb0a69 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -1,7 +1,6 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
-
 """
 Gaussian Processes regression examples
 """
@@ -9,88 +8,107 @@ import pylab as pb
 import numpy as np
 import GPy
 
-def coregionalization_toy2(max_iters=100):
+def olympic_marathon_men(optimize=True, plot=True):
+    """Run a standard Gaussian process regression on the Olympic marathon data."""
+    data = GPy.util.datasets.olympic_marathon_men()
+
+    # create simple GP Model
+    m = GPy.models.GPRegression(data['X'], data['Y'])
+
+    # set the lengthscale to be something sensible (defaults to 1)
+    m['rbf_lengthscale'] = 10
+
+    if optimize:
+        m.optimize('bfgs', max_iters=200)
+    if plot:
+        m.plot(plot_limits=(1850, 2050))
+
+    return m
+
+def coregionalization_toy2(optimize=True, plot=True):
     """
     A simple demonstration of coregionalization on two sinusoidal functions.
     """
+    #build a design matrix with a column of integers indicating the output
     X1 = np.random.rand(50, 1) * 8
     X2 = np.random.rand(30, 1) * 5
     index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
     X = np.hstack((np.vstack((X1, X2)), index))
+
+    #build a suitable set of observed variables
     Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
     Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.
     Y = np.vstack((Y1, Y2))
 
+    #build the kernel
     k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
     k2 = GPy.kern.coregionalize(2,1)
-    k = k1**k2 #k = k1.prod(k2,tensor=True)
+    k = k1**k2
     m = GPy.models.GPRegression(X, Y, kernel=k)
     m.constrain_fixed('.*rbf_var', 1.)
-    # m.constrain_positive('.*kappa')
-    m.optimize('sim', messages=1, max_iters=max_iters)
 
-    pb.figure()
-    Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
-    Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
-    mean, var, low, up = m.predict(Xtest1)
-    GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
-    mean, var, low, up = m.predict(Xtest2)
-    GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
-    pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
-    pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
+    if optimize:
+        m.optimize('bfgs', max_iters=100)
+
+    if plot:
+        m.plot(fixed_inputs=[(1,0)])
+        m.plot(fixed_inputs=[(1,1)], ax=pb.gca())
+
     return m
 
-def coregionalization_toy(max_iters=100):
-    """
-    A simple demonstration of coregionalization on two sinusoidal functions.
-    """
-    X1 = np.random.rand(50, 1) * 8
-    X2 = np.random.rand(30, 1) * 5
-    X = np.vstack((X1, X2))
-    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
-    Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
-    Y = np.vstack((Y1, Y2))
+#FIXME: Needs recovering once likelihoods are consolidated
+#def coregionalization_toy(optimize=True, plot=True):
+#    """
+#    A simple demonstration of coregionalization on two sinusoidal functions.
+#    """
+#    X1 = np.random.rand(50, 1) * 8
+#    X2 = np.random.rand(30, 1) * 5
+#    X = np.vstack((X1, X2))
+#    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
+#    Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
+#    Y = np.vstack((Y1, Y2))
+#
+#    k1 = GPy.kern.rbf(1)
+#    m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1])
+#    m.constrain_fixed('.*rbf_var', 1.)
+#    m.optimize(max_iters=100)
+#
+#    fig, axes = pb.subplots(2,1)
+#    m.plot(fixed_inputs=[(1,0)],ax=axes[0])
+#    m.plot(fixed_inputs=[(1,1)],ax=axes[1])
+#    axes[0].set_title('Output 0')
+#    axes[1].set_title('Output 1')
+#    return m
 
-    k1 = GPy.kern.rbf(1)
-    m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1])
-    m.constrain_fixed('.*rbf_var', 1.)
-    m.optimize(max_iters=max_iters)
-
-    fig, axes = pb.subplots(2,1)
-    m.plot(fixed_inputs=[(1,0)],ax=axes[0])
-    m.plot(fixed_inputs=[(1,1)],ax=axes[1])
-    axes[0].set_title('Output 0')
-    axes[1].set_title('Output 1')
-    return m
-
-def coregionalization_sparse(max_iters=100):
+def coregionalization_sparse(optimize=True, plot=True):
     """
     A simple demonstration of coregionalization on two sinusoidal functions using sparse approximations.
     """
-    X1 = np.random.rand(500, 1) * 8
-    X2 = np.random.rand(300, 1) * 5
-    index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
-    X = np.hstack((np.vstack((X1, X2)), index))
-    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
-    Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
-    Y = np.vstack((Y1, Y2))
+    #fetch the data from the non sparse examples
+    m = coregionalization_toy2(optimize=False, plot=False)
+    X, Y = m.X, m.likelihood.Y
 
-    k1 = GPy.kern.rbf(1)
+    #construct a model
+    m = GPy.models.SparseGPRegression(X,Y)
+    m.constrain_fixed('iip_\d+_1') # don't optimize the inducing input indexes
 
-    m = GPy.models.SparseGPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1],num_inducing=5)
-    m.constrain_fixed('.*rbf_var',1.)
-    #m.optimize(messages=1)
-    m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')
+    if optimize:
+        m.optimize('bfgs', max_iters=100, messages=1)
+
+    if plot:
+        m.plot(fixed_inputs=[(1,0)])
+        m.plot(fixed_inputs=[(1,1)], ax=pb.gca())
 
-    fig, axes = pb.subplots(2,1)
-    m.plot_single_output(output=0,ax=axes[0],plot_limits=(-1,9))
-    m.plot_single_output(output=1,ax=axes[1],plot_limits=(-1,9))
-    axes[0].set_title('Output 0')
-    axes[1].set_title('Output 1')
     return m
 
-def epomeo_gpx(max_iters=100):
-    """Perform Gaussian process regression on the latitude and longitude data from the Mount Epomeo runs. Requires gpxpy to be installed on your system to load in the data."""
+
+
+def epomeo_gpx(optimize=True, plot=True):
+    """
+    Perform Gaussian process regression on the latitude and longitude data
+    from the Mount Epomeo runs. Requires gpxpy to be installed on your system
+    to load in the data.
+    """
     data = GPy.util.datasets.epomeo_gpx()
     num_data_list = []
     for Xpart in data['X']:
@@ -119,14 +137,17 @@ def epomeo_gpx(max_iters=100):
     m.constrain_fixed('.*rbf_var', 1.)
     m.constrain_fixed('iip')
     m.constrain_bounded('noise_variance', 1e-3, 1e-1)
-#     m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')
     m.optimize(max_iters=max_iters,messages=True)
 
     return m
 
 
 def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300):
-    """Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisy mode is higher."""
+    """
+    Show an example of a multimodal error surface for Gaussian process
+    regression. Gene 939 has bimodal behaviour where the noisy mode is
+    higher.
+    """
 
     # Contour over a range of length scales and signal/noise ratios.
     length_scales = np.linspace(0.1, 60., resolution)
@@ -175,12 +196,15 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
     return m # (models, lls)
 
 def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
-    """Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales.
+    """
+    Evaluate the GP objective function for a given data set for a range of
+    signal to noise ratios and a range of lengthscales.
 
     :data_set: A data set from the utils.datasets director.
     :length_scales: a list of length scales to explore for the contour plot.
     :log_SNRs: a list of base 10 logarithm signal to noise ratios to explore for the contour plot.
-    :kernel: a kernel to use for the 'signal' portion of the data."""
+    :kernel: a kernel to use for the 'signal' portion of the data.
+    """
 
     lls = []
     total_var = np.var(data['Y'])
@@ -203,79 +227,58 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
     return np.array(lls)
 
 
-def olympic_100m_men(max_iters=100, kernel=None):
+def olympic_100m_men(optimize=True, plot=True):
     """Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
     data = GPy.util.datasets.olympic_100m_men()
 
     # create simple GP Model
-    m = GPy.models.GPRegression(data['X'], data['Y'], kernel)
+    m = GPy.models.GPRegression(data['X'], data['Y'])
 
     # set the lengthscale to be something sensible (defaults to 1)
-    if kernel==None:
-        m['rbf_lengthscale'] = 10
+    m['rbf_lengthscale'] = 10
 
-    # optimize
-    m.optimize(max_iters=max_iters)
+    if optimize:
+        m.optimize('bfgs', max_iters=200)
 
-    # plot
-    m.plot(plot_limits=(1850, 2050))
-    print(m)
+    if plot:
+        m.plot(plot_limits=(1850, 2050))
     return m
 
-def olympic_marathon_men(max_iters=100, kernel=None):
-    """Run a standard Gaussian process regression on the Olympic marathon data."""
-    data = GPy.util.datasets.olympic_marathon_men()
-
-    # create simple GP Model
-    m = GPy.models.GPRegression(data['X'], data['Y'], kernel)
-
-    # set the lengthscale to be something sensible (defaults to 1)
-    if kernel==None:
-        m['rbf_lengthscale'] = 10
-
-    # optimize
-    m.optimize(max_iters=max_iters)
-
-    # plot
-    m.plot(plot_limits=(1850, 2050))
-    print(m)
-    return m
-
-def toy_rbf_1d(optimizer='tnc', max_nb_eval_optim=100):
+def toy_rbf_1d(optimize=True, plot=True):
     """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
     data = GPy.util.datasets.toy_rbf_1d()
 
     # create simple GP Model
     m = GPy.models.GPRegression(data['X'], data['Y'])
 
-    # optimize
-    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
-    # plot
-    m.plot()
-    print(m)
+    if optimize:
+        m.optimize('bfgs')
+    if plot:
+        m.plot()
+
     return m
 
-def toy_rbf_1d_50(max_iters=100):
+def toy_rbf_1d_50(optimize=True, plot=True):
     """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
     data = GPy.util.datasets.toy_rbf_1d_50()
 
     # create simple GP Model
     m = GPy.models.GPRegression(data['X'], data['Y'])
 
-    # optimize
-    m.optimize(max_iters=max_iters)
+    if optimize:
+        m.optimize('bfgs')
+    if plot:
+        m.plot()
 
-    # plot
-    m.plot()
-    print(m)
     return m
 
-def toy_poisson_rbf_1d(optimizer='bfgs', max_nb_eval_optim=100):
+
+def toy_poisson_rbf_1d(optimize=True, plot=True):
     """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
     x_len = 400
     X = np.linspace(0, 10, x_len)[:, None]
     f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
-    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
+    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true]).reshape(x_len,1)
 
     noise_model = GPy.likelihoods.poisson()
     likelihood = GPy.likelihoods.EP(Y,noise_model)
@@ -283,14 +286,14 @@ def toy_poisson_rbf_1d(optimizer='bfgs', max_nb_eval_optim=100):
     # create simple GP Model
     m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
 
-    # optimize
-    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
-    # plot
-    m.plot()
-    print(m)
+    if optimize:
+        m.optimize('bfgs')
+    if plot:
+        m.plot()
+
     return m
 
-def toy_poisson_rbf_1d_laplace(optimizer='bfgs', max_nb_eval_optim=100):
+def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
     """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
     x_len = 30
     X = np.linspace(0, 10, x_len)[:, None]
@@ -303,13 +306,13 @@ def toy_poisson_rbf_1d_laplace(optimizer='bfgs', max_nb_eval_optim=100):
     # create simple GP Model
     m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
 
-    # optimize
-    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
-    # plot
-    m.plot()
-    # plot the real underlying rate function
-    pb.plot(X, np.exp(f_true), '--k', linewidth=2)
-    print(m)
+    if optimize:
+        m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
+    if plot:
+        m.plot()
+        # plot the real underlying rate function
+        pb.plot(X, np.exp(f_true), '--k', linewidth=2)
+
     return m
 
 
@@ -459,7 +462,7 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100):
     print(m)
     return m
 
-def uncertain_inputs_sparse_regression(max_iters=100):
+def uncertain_inputs_sparse_regression(optimize=True, plot=True):
     """Run a 1D example of a sparse GP regression with uncertain inputs."""
     fig, axes = pb.subplots(1, 2, figsize=(12, 5))
 

From 5809293c98f9dfb51c07cb4842197bc6c6af7969 Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 28 Nov 2013 10:01:32 +0000
Subject: [PATCH 239/252] rename _models to models_modules to include in doc

---
 GPy/models.py                                 |  34 +-
 GPy/{_models => models_modules}/__init__.py   |   0
 .../bayesian_gplvm.py                         |  52 +++
 GPy/{_models => models_modules}/bcgplvm.py    |   0
 .../fitc_classification.py                    |   0
 .../gp_classification.py                      |   0
 .../gp_multioutput_regression.py              |   0
 .../gp_regression.py                          |   0
 GPy/{_models => models_modules}/gplvm.py      |   0
 .../gradient_checker.py                       |   0
 GPy/{_models => models_modules}/mrd.py        |   0
 .../sparse_gp_classification.py               |   0
 .../sparse_gp_multioutput_regression.py       |   0
 .../sparse_gp_regression.py                   |   0
 .../sparse_gplvm.py                           |   0
 .../svigp_regression.py                       |   0
 GPy/{_models => models_modules}/warped_gp.py  |   0
 doc/GPy.core.rst                              |  65 ++--
 doc/GPy.examples.rst                          |  45 ++-
 doc/GPy.inference.rst                         |  39 +-
 doc/GPy.kern.parts.rst                        | 161 ++++----
 doc/GPy.kern.rst                              |  55 ++-
 doc/GPy.likelihoods.noise_models.rst          |  53 ++-
 doc/GPy.likelihoods.rst                       | 135 ++++---
 doc/GPy.mappings.rst                          |  33 +-
 doc/GPy.models.rst                            | 134 -------
 doc/GPy.models_modules.rst                    | 131 +++++++
 doc/GPy.rst                                   |  27 +-
 doc/GPy.testing.rst                           |  81 ++---
 ...atent_space_visualizations.controllers.rst |  29 +-
 doc/GPy.util.latent_space_visualizations.rst  |  20 +-
 doc/GPy.util.rst                              | 343 +++++++++---------
 32 files changed, 748 insertions(+), 689 deletions(-)
 rename GPy/{_models => models_modules}/__init__.py (100%)
 rename GPy/{_models => models_modules}/bayesian_gplvm.py (85%)
 rename GPy/{_models => models_modules}/bcgplvm.py (100%)
 rename GPy/{_models => models_modules}/fitc_classification.py (100%)
 rename GPy/{_models => models_modules}/gp_classification.py (100%)
 rename GPy/{_models => models_modules}/gp_multioutput_regression.py (100%)
 rename GPy/{_models => models_modules}/gp_regression.py (100%)
 rename GPy/{_models => models_modules}/gplvm.py (100%)
 rename GPy/{_models => models_modules}/gradient_checker.py (100%)
 rename GPy/{_models => models_modules}/mrd.py (100%)
 rename GPy/{_models => models_modules}/sparse_gp_classification.py (100%)
 rename GPy/{_models => models_modules}/sparse_gp_multioutput_regression.py (100%)
 rename GPy/{_models => models_modules}/sparse_gp_regression.py (100%)
 rename GPy/{_models => models_modules}/sparse_gplvm.py (100%)
 rename GPy/{_models => models_modules}/svigp_regression.py (100%)
 rename GPy/{_models => models_modules}/warped_gp.py (100%)
 delete mode 100644 doc/GPy.models.rst
 create mode 100644 doc/GPy.models_modules.rst

diff --git a/GPy/models.py b/GPy/models.py
index a56fb305..8a1d046c 100644
--- a/GPy/models.py
+++ b/GPy/models.py
@@ -4,20 +4,20 @@ Created on 14 Nov 2013
 @author: maxz
 '''
 
-from _models.bayesian_gplvm import BayesianGPLVM
-from _models.gp_regression import GPRegression
-from _models.gp_classification import GPClassification#; _gp_classification = gp_classification ; del gp_classification 
-from _models.sparse_gp_regression import SparseGPRegression#; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression 
-from _models.svigp_regression import SVIGPRegression#; _svigp_regression = svigp_regression ; del svigp_regression 
-from _models.sparse_gp_classification import SparseGPClassification#; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification 
-from _models.fitc_classification import FITCClassification#; _fitc_classification = fitc_classification ; del fitc_classification 
-from _models.gplvm import GPLVM#; _gplvm = gplvm ; del gplvm 
-from _models.bcgplvm import BCGPLVM#; _bcgplvm = bcgplvm; del bcgplvm
-from _models.sparse_gplvm import SparseGPLVM#; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm 
-from _models.warped_gp import WarpedGP#; _warped_gp = warped_gp ; del warped_gp 
-from _models.bayesian_gplvm import BayesianGPLVM#; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm 
-from _models.mrd import MRD#; _mrd = mrd; del mrd 
-from _models.gradient_checker import GradientChecker#; _gradient_checker = gradient_checker ; del gradient_checker 
-from _models.gp_multioutput_regression import GPMultioutputRegression#; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression 
-from _models.sparse_gp_multioutput_regression import SparseGPMultioutputRegression#; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression 
-from _models.gradient_checker import GradientChecker
\ No newline at end of file
+from models_modules.bayesian_gplvm import BayesianGPLVM
+from models_modules.gp_regression import GPRegression
+from models_modules.gp_classification import GPClassification#; _gp_classification = gp_classification ; del gp_classification 
+from models_modules.sparse_gp_regression import SparseGPRegression#; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression 
+from models_modules.svigp_regression import SVIGPRegression#; _svigp_regression = svigp_regression ; del svigp_regression 
+from models_modules.sparse_gp_classification import SparseGPClassification#; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification 
+from models_modules.fitc_classification import FITCClassification#; _fitc_classification = fitc_classification ; del fitc_classification 
+from models_modules.gplvm import GPLVM#; _gplvm = gplvm ; del gplvm 
+from models_modules.bcgplvm import BCGPLVM#; _bcgplvm = bcgplvm; del bcgplvm
+from models_modules.sparse_gplvm import SparseGPLVM#; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm 
+from models_modules.warped_gp import WarpedGP#; _warped_gp = warped_gp ; del warped_gp 
+from models_modules.bayesian_gplvm import BayesianGPLVM#; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm 
+from models_modules.mrd import MRD#; _mrd = mrd; del mrd 
+from models_modules.gradient_checker import GradientChecker#; _gradient_checker = gradient_checker ; del gradient_checker 
+from models_modules.gp_multioutput_regression import GPMultioutputRegression#; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression 
+from models_modules.sparse_gp_multioutput_regression import SparseGPMultioutputRegression#; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression 
+from models_modules.gradient_checker import GradientChecker
\ No newline at end of file
diff --git a/GPy/_models/__init__.py b/GPy/models_modules/__init__.py
similarity index 100%
rename from GPy/_models/__init__.py
rename to GPy/models_modules/__init__.py
diff --git a/GPy/_models/bayesian_gplvm.py b/GPy/models_modules/bayesian_gplvm.py
similarity index 85%
rename from GPy/_models/bayesian_gplvm.py
rename to GPy/models_modules/bayesian_gplvm.py
index 2b299ad8..90e54111 100644
--- a/GPy/_models/bayesian_gplvm.py
+++ b/GPy/models_modules/bayesian_gplvm.py
@@ -12,6 +12,7 @@ from GPy.util import plot_latent, linalg
 from .gplvm import GPLVM
 from GPy.util.plot_latent import most_significant_input_dimensions
 from matplotlib import pyplot
+from GPy.core.model import Model
 
 class BayesianGPLVM(SparseGP, GPLVM):
     """
@@ -285,6 +286,57 @@ class BayesianGPLVM(SparseGP, GPLVM):
         self.init = state.pop()
         SparseGP.setstate(self, state)
 
+class BayesianGPLVMWithMissingData(Model):
+    """
+    Bayesian Gaussian Process Latent Variable Model with missing data support.
+    NOTE: Missing data is assumed to be missing at random!
+    
+    This extension comes with a large memory and computing time deficiency.
+    Use only if fraction of missing data at random is higher than 60%.
+    Otherwise, try filtering data before using this extension.
+    
+    Y can hold missing data as given by `missing`, standard is :class:`~numpy.nan`.
+    
+    If likelihood is given for Y, this likelihood will be discarded, but the parameters
+    of the likelihood will be taken. Also every effort of creating the same likelihood
+    will be done.
+     
+    :param likelihood_or_Y: observed data (np.ndarray) or GPy.likelihood
+    :type likelihood_or_Y: :class:`~numpy.ndarray` | :class:`~GPy.likelihoods.likelihood.likelihood` instance
+    :param int input_dim: latent dimensionality
+    :param init: initialisation method for the latent space
+    :type init: 'PCA' | 'random'
+    """
+    def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
+                 Z=None, kernel=None, missing=np.nan, **kwargs):
+        if type(likelihood_or_Y) is np.ndarray:
+            likelihood = Gaussian(likelihood_or_Y)
+        else:
+            likelihood = likelihood_or_Y
+
+        if X == None:
+            X = self.initialise_latent(init, input_dim, likelihood.Y)
+        self.init = init
+
+        if X_variance is None:
+            X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1)
+
+        if Z is None:
+            Z = np.random.permutation(X.copy())[:num_inducing]
+        assert Z.shape[1] == X.shape[1]
+
+        if kernel is None:
+            kernel = kern.rbf(input_dim) # + kern.white(input_dim)
+
+        SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
+        self.ensure_default_constraints()
+
+    def _get_param_names(self):
+        X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
+        S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
+        return (X_names + S_names + SparseGP._get_param_names(self))
+
+    pass
 
 def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
     """
diff --git a/GPy/_models/bcgplvm.py b/GPy/models_modules/bcgplvm.py
similarity index 100%
rename from GPy/_models/bcgplvm.py
rename to GPy/models_modules/bcgplvm.py
diff --git a/GPy/_models/fitc_classification.py b/GPy/models_modules/fitc_classification.py
similarity index 100%
rename from GPy/_models/fitc_classification.py
rename to GPy/models_modules/fitc_classification.py
diff --git a/GPy/_models/gp_classification.py b/GPy/models_modules/gp_classification.py
similarity index 100%
rename from GPy/_models/gp_classification.py
rename to GPy/models_modules/gp_classification.py
diff --git a/GPy/_models/gp_multioutput_regression.py b/GPy/models_modules/gp_multioutput_regression.py
similarity index 100%
rename from GPy/_models/gp_multioutput_regression.py
rename to GPy/models_modules/gp_multioutput_regression.py
diff --git a/GPy/_models/gp_regression.py b/GPy/models_modules/gp_regression.py
similarity index 100%
rename from GPy/_models/gp_regression.py
rename to GPy/models_modules/gp_regression.py
diff --git a/GPy/_models/gplvm.py b/GPy/models_modules/gplvm.py
similarity index 100%
rename from GPy/_models/gplvm.py
rename to GPy/models_modules/gplvm.py
diff --git a/GPy/_models/gradient_checker.py b/GPy/models_modules/gradient_checker.py
similarity index 100%
rename from GPy/_models/gradient_checker.py
rename to GPy/models_modules/gradient_checker.py
diff --git a/GPy/_models/mrd.py b/GPy/models_modules/mrd.py
similarity index 100%
rename from GPy/_models/mrd.py
rename to GPy/models_modules/mrd.py
diff --git a/GPy/_models/sparse_gp_classification.py b/GPy/models_modules/sparse_gp_classification.py
similarity index 100%
rename from GPy/_models/sparse_gp_classification.py
rename to GPy/models_modules/sparse_gp_classification.py
diff --git a/GPy/_models/sparse_gp_multioutput_regression.py b/GPy/models_modules/sparse_gp_multioutput_regression.py
similarity index 100%
rename from GPy/_models/sparse_gp_multioutput_regression.py
rename to GPy/models_modules/sparse_gp_multioutput_regression.py
diff --git a/GPy/_models/sparse_gp_regression.py b/GPy/models_modules/sparse_gp_regression.py
similarity index 100%
rename from GPy/_models/sparse_gp_regression.py
rename to GPy/models_modules/sparse_gp_regression.py
diff --git a/GPy/_models/sparse_gplvm.py b/GPy/models_modules/sparse_gplvm.py
similarity index 100%
rename from GPy/_models/sparse_gplvm.py
rename to GPy/models_modules/sparse_gplvm.py
diff --git a/GPy/_models/svigp_regression.py b/GPy/models_modules/svigp_regression.py
similarity index 100%
rename from GPy/_models/svigp_regression.py
rename to GPy/models_modules/svigp_regression.py
diff --git a/GPy/_models/warped_gp.py b/GPy/models_modules/warped_gp.py
similarity index 100%
rename from GPy/_models/warped_gp.py
rename to GPy/models_modules/warped_gp.py
diff --git a/doc/GPy.core.rst b/doc/GPy.core.rst
index c4f1849d..d7f18192 100644
--- a/doc/GPy.core.rst
+++ b/doc/GPy.core.rst
@@ -1,102 +1,107 @@
-GPy.core package
-================
+core Package
+============
 
-Submodules
-----------
+:mod:`core` Package
+-------------------
 
-GPy.core.domains module
------------------------
+.. automodule:: GPy.core
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`domains` Module
+---------------------
 
 .. automodule:: GPy.core.domains
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.core.fitc module
---------------------
+:mod:`fitc` Module
+------------------
 
 .. automodule:: GPy.core.fitc
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.core.gp module
-------------------
+:mod:`gp` Module
+----------------
 
 .. automodule:: GPy.core.gp
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.core.gp_base module
------------------------
+:mod:`gp_base` Module
+---------------------
 
 .. automodule:: GPy.core.gp_base
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.core.mapping module
------------------------
+:mod:`mapping` Module
+---------------------
 
 .. automodule:: GPy.core.mapping
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.core.model module
----------------------
+:mod:`model` Module
+-------------------
 
 .. automodule:: GPy.core.model
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.core.parameterized module
------------------------------
+:mod:`parameterized` Module
+---------------------------
 
 .. automodule:: GPy.core.parameterized
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.core.priors module
-----------------------
+:mod:`priors` Module
+--------------------
 
 .. automodule:: GPy.core.priors
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.core.sparse_gp module
--------------------------
+:mod:`sparse_gp` Module
+-----------------------
 
 .. automodule:: GPy.core.sparse_gp
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.core.svigp module
----------------------
+:mod:`svigp` Module
+-------------------
 
 .. automodule:: GPy.core.svigp
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.core.transformations module
--------------------------------
+:mod:`transformations` Module
+-----------------------------
 
 .. automodule:: GPy.core.transformations
     :members:
     :undoc-members:
     :show-inheritance:
 
+:mod:`variational` Module
+-------------------------
 
-Module contents
----------------
-
-.. automodule:: GPy.core
+.. automodule:: GPy.core.variational
     :members:
     :undoc-members:
     :show-inheritance:
+
diff --git a/doc/GPy.examples.rst b/doc/GPy.examples.rst
index 288ff631..176ae396 100644
--- a/doc/GPy.examples.rst
+++ b/doc/GPy.examples.rst
@@ -1,62 +1,59 @@
-GPy.examples package
-====================
+examples Package
+================
 
-Submodules
-----------
+:mod:`examples` Package
+-----------------------
 
-GPy.examples.classification module
-----------------------------------
+.. automodule:: GPy.examples
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`classification` Module
+----------------------------
 
 .. automodule:: GPy.examples.classification
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.examples.dimensionality_reduction module
---------------------------------------------
+:mod:`dimensionality_reduction` Module
+--------------------------------------
 
 .. automodule:: GPy.examples.dimensionality_reduction
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.examples.laplace_approximations module
-------------------------------------------
+:mod:`laplace_approximations` Module
+------------------------------------
 
 .. automodule:: GPy.examples.laplace_approximations
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.examples.regression module
-------------------------------
+:mod:`regression` Module
+------------------------
 
 .. automodule:: GPy.examples.regression
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.examples.stochastic module
-------------------------------
+:mod:`stochastic` Module
+------------------------
 
 .. automodule:: GPy.examples.stochastic
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.examples.tutorials module
------------------------------
+:mod:`tutorials` Module
+-----------------------
 
 .. automodule:: GPy.examples.tutorials
     :members:
     :undoc-members:
     :show-inheritance:
 
-
-Module contents
----------------
-
-.. automodule:: GPy.examples
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/GPy.inference.rst b/doc/GPy.inference.rst
index 28f42994..6a1bef4a 100644
--- a/doc/GPy.inference.rst
+++ b/doc/GPy.inference.rst
@@ -1,62 +1,51 @@
-GPy.inference package
-=====================
+inference Package
+=================
 
-Submodules
-----------
-
-GPy.inference.conjugate_gradient_descent module
------------------------------------------------
+:mod:`conjugate_gradient_descent` Module
+----------------------------------------
 
 .. automodule:: GPy.inference.conjugate_gradient_descent
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.inference.gradient_descent_update_rules module
---------------------------------------------------
+:mod:`gradient_descent_update_rules` Module
+-------------------------------------------
 
 .. automodule:: GPy.inference.gradient_descent_update_rules
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.inference.optimization module
----------------------------------
+:mod:`optimization` Module
+--------------------------
 
 .. automodule:: GPy.inference.optimization
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.inference.samplers module
------------------------------
+:mod:`samplers` Module
+----------------------
 
 .. automodule:: GPy.inference.samplers
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.inference.scg module
-------------------------
+:mod:`scg` Module
+-----------------
 
 .. automodule:: GPy.inference.scg
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.inference.sgd module
-------------------------
+:mod:`sgd` Module
+-----------------
 
 .. automodule:: GPy.inference.sgd
     :members:
     :undoc-members:
     :show-inheritance:
 
-
-Module contents
----------------
-
-.. automodule:: GPy.inference
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/GPy.kern.parts.rst b/doc/GPy.kern.parts.rst
index 650fe5cb..45d3e235 100644
--- a/doc/GPy.kern.parts.rst
+++ b/doc/GPy.kern.parts.rst
@@ -1,262 +1,275 @@
-GPy.kern.parts package
-======================
+parts Package
+=============
 
-Submodules
-----------
+:mod:`parts` Package
+--------------------
 
-GPy.kern.parts.Brownian module
-------------------------------
+.. automodule:: GPy.kern.parts
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`Brownian` Module
+----------------------
 
 .. automodule:: GPy.kern.parts.Brownian
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.Matern32 module
-------------------------------
+:mod:`Matern32` Module
+----------------------
 
 .. automodule:: GPy.kern.parts.Matern32
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.Matern52 module
-------------------------------
+:mod:`Matern52` Module
+----------------------
 
 .. automodule:: GPy.kern.parts.Matern52
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.ODE_1 module
----------------------------
+:mod:`ODE_1` Module
+-------------------
 
 .. automodule:: GPy.kern.parts.ODE_1
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.bias module
---------------------------
+:mod:`ODE_UY` Module
+--------------------
+
+.. automodule:: GPy.kern.parts.ODE_UY
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`bias` Module
+------------------
 
 .. automodule:: GPy.kern.parts.bias
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.coregionalize module
------------------------------------
+:mod:`coregionalize` Module
+---------------------------
 
 .. automodule:: GPy.kern.parts.coregionalize
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.eq_ode1 module
------------------------------
+:mod:`eq_ode1` Module
+---------------------
 
 .. automodule:: GPy.kern.parts.eq_ode1
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.exponential module
----------------------------------
+:mod:`exponential` Module
+-------------------------
 
 .. automodule:: GPy.kern.parts.exponential
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.finite_dimensional module
-----------------------------------------
+:mod:`finite_dimensional` Module
+--------------------------------
 
 .. automodule:: GPy.kern.parts.finite_dimensional
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.fixed module
----------------------------
+:mod:`fixed` Module
+-------------------
 
 .. automodule:: GPy.kern.parts.fixed
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.gibbs module
----------------------------
+:mod:`gibbs` Module
+-------------------
 
 .. automodule:: GPy.kern.parts.gibbs
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.hetero module
-----------------------------
+:mod:`hetero` Module
+--------------------
 
 .. automodule:: GPy.kern.parts.hetero
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.hierarchical module
-----------------------------------
+:mod:`hierarchical` Module
+--------------------------
 
 .. automodule:: GPy.kern.parts.hierarchical
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.independent_outputs module
------------------------------------------
+:mod:`independent_outputs` Module
+---------------------------------
 
 .. automodule:: GPy.kern.parts.independent_outputs
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.kernpart module
-------------------------------
+:mod:`kernpart` Module
+----------------------
 
 .. automodule:: GPy.kern.parts.kernpart
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.linear module
-----------------------------
+:mod:`linear` Module
+--------------------
 
 .. automodule:: GPy.kern.parts.linear
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.mlp module
--------------------------
+:mod:`mlp` Module
+-----------------
 
 .. automodule:: GPy.kern.parts.mlp
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.periodic_Matern32 module
----------------------------------------
+:mod:`periodic_Matern32` Module
+-------------------------------
 
 .. automodule:: GPy.kern.parts.periodic_Matern32
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.periodic_Matern52 module
----------------------------------------
+:mod:`periodic_Matern52` Module
+-------------------------------
 
 .. automodule:: GPy.kern.parts.periodic_Matern52
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.periodic_exponential module
-------------------------------------------
+:mod:`periodic_exponential` Module
+----------------------------------
 
 .. automodule:: GPy.kern.parts.periodic_exponential
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.poly module
---------------------------
+:mod:`poly` Module
+------------------
 
 .. automodule:: GPy.kern.parts.poly
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.prod module
---------------------------
+:mod:`prod` Module
+------------------
 
 .. automodule:: GPy.kern.parts.prod
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.prod_orthogonal module
--------------------------------------
+:mod:`prod_orthogonal` Module
+-----------------------------
 
 .. automodule:: GPy.kern.parts.prod_orthogonal
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.rational_quadratic module
-----------------------------------------
+:mod:`rational_quadratic` Module
+--------------------------------
 
 .. automodule:: GPy.kern.parts.rational_quadratic
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.rbf module
--------------------------
+:mod:`rbf` Module
+-----------------
 
 .. automodule:: GPy.kern.parts.rbf
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.rbf_inv module
------------------------------
+:mod:`rbf_inv` Module
+---------------------
 
 .. automodule:: GPy.kern.parts.rbf_inv
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.rbfcos module
-----------------------------
+:mod:`rbfcos` Module
+--------------------
 
 .. automodule:: GPy.kern.parts.rbfcos
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.spline module
-----------------------------
+:mod:`spline` Module
+--------------------
 
 .. automodule:: GPy.kern.parts.spline
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.symmetric module
--------------------------------
+:mod:`symmetric` Module
+-----------------------
 
 .. automodule:: GPy.kern.parts.symmetric
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.sympykern module
--------------------------------
+:mod:`sympy_helpers` Module
+---------------------------
+
+.. automodule:: GPy.kern.parts.sympy_helpers
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`sympykern` Module
+-----------------------
 
 .. automodule:: GPy.kern.parts.sympykern
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.kern.parts.white module
----------------------------
+:mod:`white` Module
+-------------------
 
 .. automodule:: GPy.kern.parts.white
     :members:
     :undoc-members:
     :show-inheritance:
 
-
-Module contents
----------------
-
-.. automodule:: GPy.kern.parts
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/GPy.kern.rst b/doc/GPy.kern.rst
index b4b9d9aa..35d9ec00 100644
--- a/doc/GPy.kern.rst
+++ b/doc/GPy.kern.rst
@@ -1,5 +1,29 @@
-GPy.kern package
-================
+kern Package
+============
+
+:mod:`kern` Package
+-------------------
+
+.. automodule:: GPy.kern
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`constructors` Module
+--------------------------
+
+.. automodule:: GPy.kern.constructors
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`kern` Module
+------------------
+
+.. automodule:: GPy.kern.kern
+    :members:
+    :undoc-members:
+    :show-inheritance:
 
 Subpackages
 -----------
@@ -8,30 +32,3 @@ Subpackages
 
     GPy.kern.parts
 
-Submodules
-----------
-
-GPy.kern.constructors module
-----------------------------
-
-.. automodule:: GPy.kern.constructors
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.kern.kern module
---------------------
-
-.. automodule:: GPy.kern.kern
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-
-Module contents
----------------
-
-.. automodule:: GPy.kern
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/GPy.likelihoods.noise_models.rst b/doc/GPy.likelihoods.noise_models.rst
index 6fec5aff..19e5e9fe 100644
--- a/doc/GPy.likelihoods.noise_models.rst
+++ b/doc/GPy.likelihoods.noise_models.rst
@@ -1,78 +1,75 @@
-GPy.likelihoods.noise_models package
-====================================
+noise_models Package
+====================
 
-Submodules
-----------
+:mod:`noise_models` Package
+---------------------------
 
-GPy.likelihoods.noise_models.bernoulli_noise module
----------------------------------------------------
+.. automodule:: GPy.likelihoods.noise_models
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`bernoulli_noise` Module
+-----------------------------
 
 .. automodule:: GPy.likelihoods.noise_models.bernoulli_noise
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.likelihoods.noise_models.exponential_noise module
------------------------------------------------------
+:mod:`exponential_noise` Module
+-------------------------------
 
 .. automodule:: GPy.likelihoods.noise_models.exponential_noise
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.likelihoods.noise_models.gamma_noise module
------------------------------------------------
+:mod:`gamma_noise` Module
+-------------------------
 
 .. automodule:: GPy.likelihoods.noise_models.gamma_noise
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.likelihoods.noise_models.gaussian_noise module
---------------------------------------------------
+:mod:`gaussian_noise` Module
+----------------------------
 
 .. automodule:: GPy.likelihoods.noise_models.gaussian_noise
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.likelihoods.noise_models.gp_transformations module
-------------------------------------------------------
+:mod:`gp_transformations` Module
+--------------------------------
 
 .. automodule:: GPy.likelihoods.noise_models.gp_transformations
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.likelihoods.noise_models.noise_distributions module
--------------------------------------------------------
+:mod:`noise_distributions` Module
+---------------------------------
 
 .. automodule:: GPy.likelihoods.noise_models.noise_distributions
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.likelihoods.noise_models.poisson_noise module
--------------------------------------------------
+:mod:`poisson_noise` Module
+---------------------------
 
 .. automodule:: GPy.likelihoods.noise_models.poisson_noise
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.likelihoods.noise_models.student_t_noise module
----------------------------------------------------
+:mod:`student_t_noise` Module
+-----------------------------
 
 .. automodule:: GPy.likelihoods.noise_models.student_t_noise
     :members:
     :undoc-members:
     :show-inheritance:
 
-
-Module contents
----------------
-
-.. automodule:: GPy.likelihoods.noise_models
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/GPy.likelihoods.rst b/doc/GPy.likelihoods.rst
index 34d98739..5dcabbd1 100644
--- a/doc/GPy.likelihoods.rst
+++ b/doc/GPy.likelihoods.rst
@@ -1,5 +1,69 @@
-GPy.likelihoods package
-=======================
+likelihoods Package
+===================
+
+:mod:`likelihoods` Package
+--------------------------
+
+.. automodule:: GPy.likelihoods
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`ep` Module
+----------------
+
+.. automodule:: GPy.likelihoods.ep
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`ep_mixed_noise` Module
+----------------------------
+
+.. automodule:: GPy.likelihoods.ep_mixed_noise
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`gaussian` Module
+----------------------
+
+.. automodule:: GPy.likelihoods.gaussian
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`gaussian_mixed_noise` Module
+----------------------------------
+
+.. automodule:: GPy.likelihoods.gaussian_mixed_noise
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`laplace` Module
+---------------------
+
+.. automodule:: GPy.likelihoods.laplace
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`likelihood` Module
+------------------------
+
+.. automodule:: GPy.likelihoods.likelihood
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`noise_model_constructors` Module
+--------------------------------------
+
+.. automodule:: GPy.likelihoods.noise_model_constructors
+    :members:
+    :undoc-members:
+    :show-inheritance:
 
 Subpackages
 -----------
@@ -8,70 +72,3 @@ Subpackages
 
     GPy.likelihoods.noise_models
 
-Submodules
-----------
-
-GPy.likelihoods.ep module
--------------------------
-
-.. automodule:: GPy.likelihoods.ep
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.likelihoods.ep_mixed_noise module
--------------------------------------
-
-.. automodule:: GPy.likelihoods.ep_mixed_noise
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.likelihoods.gaussian module
--------------------------------
-
-.. automodule:: GPy.likelihoods.gaussian
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.likelihoods.gaussian_mixed_noise module
--------------------------------------------
-
-.. automodule:: GPy.likelihoods.gaussian_mixed_noise
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.likelihoods.laplace module
-------------------------------
-
-.. automodule:: GPy.likelihoods.laplace
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.likelihoods.likelihood module
----------------------------------
-
-.. automodule:: GPy.likelihoods.likelihood
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.likelihoods.noise_model_constructors module
------------------------------------------------
-
-.. automodule:: GPy.likelihoods.noise_model_constructors
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-
-Module contents
----------------
-
-.. automodule:: GPy.likelihoods
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/GPy.mappings.rst b/doc/GPy.mappings.rst
index c48cb06e..b7444808 100644
--- a/doc/GPy.mappings.rst
+++ b/doc/GPy.mappings.rst
@@ -1,38 +1,35 @@
-GPy.mappings package
-====================
+mappings Package
+================
 
-Submodules
-----------
+:mod:`mappings` Package
+-----------------------
 
-GPy.mappings.kernel module
---------------------------
+.. automodule:: GPy.mappings
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`kernel` Module
+--------------------
 
 .. automodule:: GPy.mappings.kernel
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.mappings.linear module
---------------------------
+:mod:`linear` Module
+--------------------
 
 .. automodule:: GPy.mappings.linear
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.mappings.mlp module
------------------------
+:mod:`mlp` Module
+-----------------
 
 .. automodule:: GPy.mappings.mlp
     :members:
     :undoc-members:
     :show-inheritance:
 
-
-Module contents
----------------
-
-.. automodule:: GPy.mappings
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/GPy.models.rst b/doc/GPy.models.rst
deleted file mode 100644
index 4440513e..00000000
--- a/doc/GPy.models.rst
+++ /dev/null
@@ -1,134 +0,0 @@
-GPy.models package
-==================
-
-Submodules
-----------
-
-GPy.models.bayesian_gplvm module
---------------------------------
-
-.. automodule:: GPy.models.bayesian_gplvm
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.bcgplvm module
--------------------------
-
-.. automodule:: GPy.models.bcgplvm
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.fitc_classification module
--------------------------------------
-
-.. automodule:: GPy.models.fitc_classification
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.gp_classification module
------------------------------------
-
-.. automodule:: GPy.models.gp_classification
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.gp_multioutput_regression module
--------------------------------------------
-
-.. automodule:: GPy.models.gp_multioutput_regression
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.gp_regression module
--------------------------------
-
-.. automodule:: GPy.models.gp_regression
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.gplvm module
------------------------
-
-.. automodule:: GPy.models.gplvm
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.gradient_checker module
-----------------------------------
-
-.. automodule:: GPy.models.gradient_checker
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.mrd module
----------------------
-
-.. automodule:: GPy.models.mrd
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.sparse_gp_classification module
-------------------------------------------
-
-.. automodule:: GPy.models.sparse_gp_classification
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.sparse_gp_multioutput_regression module
---------------------------------------------------
-
-.. automodule:: GPy.models.sparse_gp_multioutput_regression
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.sparse_gp_regression module
---------------------------------------
-
-.. automodule:: GPy.models.sparse_gp_regression
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.sparse_gplvm module
-------------------------------
-
-.. automodule:: GPy.models.sparse_gplvm
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.svigp_regression module
-----------------------------------
-
-.. automodule:: GPy.models.svigp_regression
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.models.warped_gp module
----------------------------
-
-.. automodule:: GPy.models.warped_gp
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-
-Module contents
----------------
-
-.. automodule:: GPy.models
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/GPy.models_modules.rst b/doc/GPy.models_modules.rst
new file mode 100644
index 00000000..4169ec3a
--- /dev/null
+++ b/doc/GPy.models_modules.rst
@@ -0,0 +1,131 @@
+models_modules Package
+======================
+
+:mod:`models_modules` Package
+-----------------------------
+
+.. automodule:: GPy.models_modules
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`bayesian_gplvm` Module
+----------------------------
+
+.. automodule:: GPy.models_modules.bayesian_gplvm
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`bcgplvm` Module
+---------------------
+
+.. automodule:: GPy.models_modules.bcgplvm
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`fitc_classification` Module
+---------------------------------
+
+.. automodule:: GPy.models_modules.fitc_classification
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`gp_classification` Module
+-------------------------------
+
+.. automodule:: GPy.models_modules.gp_classification
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`gp_multioutput_regression` Module
+---------------------------------------
+
+.. automodule:: GPy.models_modules.gp_multioutput_regression
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`gp_regression` Module
+---------------------------
+
+.. automodule:: GPy.models_modules.gp_regression
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`gplvm` Module
+-------------------
+
+.. automodule:: GPy.models_modules.gplvm
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`gradient_checker` Module
+------------------------------
+
+.. automodule:: GPy.models_modules.gradient_checker
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`mrd` Module
+-----------------
+
+.. automodule:: GPy.models_modules.mrd
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`sparse_gp_classification` Module
+--------------------------------------
+
+.. automodule:: GPy.models_modules.sparse_gp_classification
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`sparse_gp_multioutput_regression` Module
+----------------------------------------------
+
+.. automodule:: GPy.models_modules.sparse_gp_multioutput_regression
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`sparse_gp_regression` Module
+----------------------------------
+
+.. automodule:: GPy.models_modules.sparse_gp_regression
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`sparse_gplvm` Module
+--------------------------
+
+.. automodule:: GPy.models_modules.sparse_gplvm
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`svigp_regression` Module
+------------------------------
+
+.. automodule:: GPy.models_modules.svigp_regression
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`warped_gp` Module
+-----------------------
+
+.. automodule:: GPy.models_modules.warped_gp
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
diff --git a/doc/GPy.rst b/doc/GPy.rst
index 60092e91..31ec3562 100644
--- a/doc/GPy.rst
+++ b/doc/GPy.rst
@@ -1,6 +1,22 @@
-GPy package
+GPy Package
 ===========
 
+:mod:`GPy` Package
+------------------
+
+.. automodule:: GPy.__init__
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`models` Module
+--------------------
+
+.. automodule:: GPy.models
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 Subpackages
 -----------
 
@@ -12,14 +28,7 @@ Subpackages
     GPy.kern
     GPy.likelihoods
     GPy.mappings
-    GPy.models
+    GPy.models_modules
     GPy.testing
     GPy.util
 
-Module contents
----------------
-
-.. automodule:: GPy
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/GPy.testing.rst b/doc/GPy.testing.rst
index 98b001c0..15b0cc79 100644
--- a/doc/GPy.testing.rst
+++ b/doc/GPy.testing.rst
@@ -1,134 +1,131 @@
-GPy.testing package
-===================
+testing Package
+===============
 
-Submodules
-----------
+:mod:`testing` Package
+----------------------
 
-GPy.testing.bcgplvm_tests module
---------------------------------
+.. automodule:: GPy.testing
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`bcgplvm_tests` Module
+---------------------------
 
 .. automodule:: GPy.testing.bcgplvm_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.bgplvm_tests module
--------------------------------
+:mod:`bgplvm_tests` Module
+--------------------------
 
 .. automodule:: GPy.testing.bgplvm_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.cgd_tests module
-----------------------------
+:mod:`cgd_tests` Module
+-----------------------
 
 .. automodule:: GPy.testing.cgd_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.examples_tests module
----------------------------------
+:mod:`examples_tests` Module
+----------------------------
 
 .. automodule:: GPy.testing.examples_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.gp_transformation_tests module
-------------------------------------------
+:mod:`gp_transformation_tests` Module
+-------------------------------------
 
 .. automodule:: GPy.testing.gp_transformation_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.gplvm_tests module
-------------------------------
+:mod:`gplvm_tests` Module
+-------------------------
 
 .. automodule:: GPy.testing.gplvm_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.kernel_tests module
--------------------------------
+:mod:`kernel_tests` Module
+--------------------------
 
 .. automodule:: GPy.testing.kernel_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.likelihoods_tests module
-------------------------------------
+:mod:`likelihoods_tests` Module
+-------------------------------
 
 .. automodule:: GPy.testing.likelihoods_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.mapping_tests module
---------------------------------
+:mod:`mapping_tests` Module
+---------------------------
 
 .. automodule:: GPy.testing.mapping_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.mrd_tests module
-----------------------------
+:mod:`mrd_tests` Module
+-----------------------
 
 .. automodule:: GPy.testing.mrd_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.prior_tests module
-------------------------------
+:mod:`prior_tests` Module
+-------------------------
 
 .. automodule:: GPy.testing.prior_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.psi_stat_expectation_tests module
----------------------------------------------
+:mod:`psi_stat_expectation_tests` Module
+----------------------------------------
 
 .. automodule:: GPy.testing.psi_stat_expectation_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.psi_stat_gradient_tests module
-------------------------------------------
+:mod:`psi_stat_gradient_tests` Module
+-------------------------------------
 
 .. automodule:: GPy.testing.psi_stat_gradient_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.sparse_gplvm_tests module
--------------------------------------
+:mod:`sparse_gplvm_tests` Module
+--------------------------------
 
 .. automodule:: GPy.testing.sparse_gplvm_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.testing.unit_tests module
------------------------------
+:mod:`unit_tests` Module
+------------------------
 
 .. automodule:: GPy.testing.unit_tests
     :members:
     :undoc-members:
     :show-inheritance:
 
-
-Module contents
----------------
-
-.. automodule:: GPy.testing
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/GPy.util.latent_space_visualizations.controllers.rst b/doc/GPy.util.latent_space_visualizations.controllers.rst
index a88c1f5c..e78ade7b 100644
--- a/doc/GPy.util.latent_space_visualizations.controllers.rst
+++ b/doc/GPy.util.latent_space_visualizations.controllers.rst
@@ -1,30 +1,27 @@
-GPy.util.latent_space_visualizations.controllers package
-========================================================
+controllers Package
+===================
 
-Submodules
-----------
+:mod:`controllers` Package
+--------------------------
 
-GPy.util.latent_space_visualizations.controllers.axis_event_controller module
------------------------------------------------------------------------------
+.. automodule:: GPy.util.latent_space_visualizations.controllers
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`axis_event_controller` Module
+-----------------------------------
 
 .. automodule:: GPy.util.latent_space_visualizations.controllers.axis_event_controller
     :members:
     :undoc-members:
     :show-inheritance:
 
-GPy.util.latent_space_visualizations.controllers.imshow_controller module
--------------------------------------------------------------------------
+:mod:`imshow_controller` Module
+-------------------------------
 
 .. automodule:: GPy.util.latent_space_visualizations.controllers.imshow_controller
     :members:
     :undoc-members:
     :show-inheritance:
 
-
-Module contents
----------------
-
-.. automodule:: GPy.util.latent_space_visualizations.controllers
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/GPy.util.latent_space_visualizations.rst b/doc/GPy.util.latent_space_visualizations.rst
index d8cbd843..4b440f61 100644
--- a/doc/GPy.util.latent_space_visualizations.rst
+++ b/doc/GPy.util.latent_space_visualizations.rst
@@ -1,5 +1,13 @@
-GPy.util.latent_space_visualizations package
-============================================
+latent_space_visualizations Package
+===================================
+
+:mod:`latent_space_visualizations` Package
+------------------------------------------
+
+.. automodule:: GPy.util.latent_space_visualizations
+    :members:
+    :undoc-members:
+    :show-inheritance:
 
 Subpackages
 -----------
@@ -7,11 +15,5 @@ Subpackages
 .. toctree::
 
     GPy.util.latent_space_visualizations.controllers
+    GPy.util.latent_space_visualizations.views
 
-Module contents
----------------
-
-.. automodule:: GPy.util.latent_space_visualizations
-    :members:
-    :undoc-members:
-    :show-inheritance:
diff --git a/doc/GPy.util.rst b/doc/GPy.util.rst
index f2aaed7f..2e20c006 100644
--- a/doc/GPy.util.rst
+++ b/doc/GPy.util.rst
@@ -1,5 +1,181 @@
-GPy.util package
-================
+util Package
+============
+
+:mod:`util` Package
+-------------------
+
+.. automodule:: GPy.util
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`Tango` Module
+-------------------
+
+.. automodule:: GPy.util.Tango
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`block_matrices` Module
+----------------------------
+
+.. automodule:: GPy.util.block_matrices
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`classification` Module
+----------------------------
+
+.. automodule:: GPy.util.classification
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`config` Module
+--------------------
+
+.. automodule:: GPy.util.config
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`datasets` Module
+----------------------
+
+.. automodule:: GPy.util.datasets
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`decorators` Module
+------------------------
+
+.. automodule:: GPy.util.decorators
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`erfcx` Module
+-------------------
+
+.. automodule:: GPy.util.erfcx
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`linalg` Module
+--------------------
+
+.. automodule:: GPy.util.linalg
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`ln_diff_erfs` Module
+--------------------------
+
+.. automodule:: GPy.util.ln_diff_erfs
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`misc` Module
+------------------
+
+.. automodule:: GPy.util.misc
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`mocap` Module
+-------------------
+
+.. automodule:: GPy.util.mocap
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`multioutput` Module
+-------------------------
+
+.. automodule:: GPy.util.multioutput
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`netpbmfile` Module
+------------------------
+
+.. automodule:: GPy.util.netpbmfile
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`pca` Module
+-----------------
+
+.. automodule:: GPy.util.pca
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`plot` Module
+------------------
+
+.. automodule:: GPy.util.plot
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`plot_latent` Module
+-------------------------
+
+.. automodule:: GPy.util.plot_latent
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`squashers` Module
+-----------------------
+
+.. automodule:: GPy.util.squashers
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`symbolic` Module
+----------------------
+
+.. automodule:: GPy.util.symbolic
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`univariate_Gaussian` Module
+---------------------------------
+
+.. automodule:: GPy.util.univariate_Gaussian
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`visualize` Module
+-----------------------
+
+.. automodule:: GPy.util.visualize
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`warping_functions` Module
+-------------------------------
+
+.. automodule:: GPy.util.warping_functions
+    :members:
+    :undoc-members:
+    :show-inheritance:
 
 Subpackages
 -----------
@@ -8,166 +184,3 @@ Subpackages
 
     GPy.util.latent_space_visualizations
 
-Submodules
-----------
-
-GPy.util.Tango module
----------------------
-
-.. automodule:: GPy.util.Tango
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.classification module
-------------------------------
-
-.. automodule:: GPy.util.classification
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.config module
-----------------------
-
-.. automodule:: GPy.util.config
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.datasets module
-------------------------
-
-.. automodule:: GPy.util.datasets
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.decorators module
---------------------------
-
-.. automodule:: GPy.util.decorators
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.erfcx module
----------------------
-
-.. automodule:: GPy.util.erfcx
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.linalg module
-----------------------
-
-.. automodule:: GPy.util.linalg
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.ln_diff_erfs module
-----------------------------
-
-.. automodule:: GPy.util.ln_diff_erfs
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.misc module
---------------------
-
-.. automodule:: GPy.util.misc
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.mocap module
----------------------
-
-.. automodule:: GPy.util.mocap
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.multioutput module
----------------------------
-
-.. automodule:: GPy.util.multioutput
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.netpbmfile module
---------------------------
-
-.. automodule:: GPy.util.netpbmfile
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.plot module
---------------------
-
-.. automodule:: GPy.util.plot
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.plot_latent module
----------------------------
-
-.. automodule:: GPy.util.plot_latent
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.squashers module
--------------------------
-
-.. automodule:: GPy.util.squashers
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.symbolic module
-------------------------
-
-.. automodule:: GPy.util.symbolic
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.univariate_Gaussian module
------------------------------------
-
-.. automodule:: GPy.util.univariate_Gaussian
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.visualize module
--------------------------
-
-.. automodule:: GPy.util.visualize
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-GPy.util.warping_functions module
----------------------------------
-
-.. automodule:: GPy.util.warping_functions
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-
-Module contents
----------------
-
-.. automodule:: GPy.util
-    :members:
-    :undoc-members:
-    :show-inheritance:

From 25635571afe8517d97c23196cd309db8f9d5fc9d Mon Sep 17 00:00:00 2001
From: Max Zwiessele <ibinbei@gmail.com>
Date: Thu, 28 Nov 2013 10:31:17 +0000
Subject: [PATCH 240/252] added comments for models module and adjusted setup

---
 GPy/models.py | 12 ++++++++++--
 doc/index.rst |  3 +++
 setup.py      |  2 +-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/GPy/models.py b/GPy/models.py
index 8a1d046c..3b2683ea 100644
--- a/GPy/models.py
+++ b/GPy/models.py
@@ -1,9 +1,17 @@
 '''
-Created on 14 Nov 2013
+GPy Models
+==========
 
-@author: maxz
+Implementations for common models used in GP regression and classification.
+The different models can be viewed in :mod:`GPy.models_modules`, which holds
+detailed explanations for the different models.
+
+:warning: This module is a convienince module for endusers to use. For developers 
+see :mod:`GPy.models_modules`, which holds the implementions for each model. 
 '''
 
+__updated__ = '2013-11-28'
+
 from models_modules.bayesian_gplvm import BayesianGPLVM
 from models_modules.gp_regression import GPRegression
 from models_modules.gp_classification import GPClassification#; _gp_classification = gp_classification ; del gp_classification 
diff --git a/doc/index.rst b/doc/index.rst
index 29b4cf43..f6207963 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -15,6 +15,9 @@ For a quick start, you can have a look at one of the tutorials:
 
 You may also be interested by some examples in the GPy/examples folder.
 
+The detailed Developers Documentation is listed below
+=====================================================
+
 Contents:
 
 .. toctree::
diff --git a/setup.py b/setup.py
index 88ee6257..3b493022 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@ setup(name = 'GPy',
       license = "BSD 3-clause",
       keywords = "machine-learning gaussian-processes kernels",
       url = "http://sheffieldml.github.com/GPy/",
-      packages = ['GPy', 'GPy.core', 'GPy.kern', 'GPy.util', 'GPy._models', 'GPy.inference', 'GPy.examples', 'GPy.likelihoods', 'GPy.testing', 'GPy.util.latent_space_visualizations', 'GPy.util.latent_space_visualizations.controllers', 'GPy.likelihoods.noise_models', 'GPy.kern.parts', 'GPy.mappings'],
+      packages = ['GPy', 'GPy.core', 'GPy.kern', 'GPy.util', 'GPy.models_modules', 'GPy.inference', 'GPy.examples', 'GPy.likelihoods', 'GPy.testing', 'GPy.util.latent_space_visualizations', 'GPy.util.latent_space_visualizations.controllers', 'GPy.likelihoods.noise_models', 'GPy.kern.parts', 'GPy.mappings'],
       package_dir={'GPy': 'GPy'},
       package_data = {'GPy': ['GPy/examples', 'gpy_config.cfg']},
       py_modules = ['GPy.__init__'],

From 0a4332915006d038bdc336fdfffb38b3aa0c4057 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Thu, 28 Nov 2013 15:23:39 +0000
Subject: [PATCH 241/252] Changed some parameters of the laplace, tidied up
 examples

---
 ...lace_approximations.py => non_gaussian.py} | 153 +++++++++---------
 GPy/likelihoods/laplace.py                    |  48 +++---
 2 files changed, 105 insertions(+), 96 deletions(-)
 rename GPy/examples/{laplace_approximations.py => non_gaussian.py} (77%)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/non_gaussian.py
similarity index 77%
rename from GPy/examples/laplace_approximations.py
rename to GPy/examples/non_gaussian.py
index f74e4d37..622b3edd 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/non_gaussian.py
@@ -2,22 +2,21 @@ import GPy
 import numpy as np
 import matplotlib.pyplot as plt
 from GPy.util import datasets
-#np.random.seed(1)
 
-def student_t_approx():
+def student_t_approx(optimize=True, plot=True):
     """
-    Example of regressing with a student t likelihood
+    Example of regressing with a student t likelihood using Laplace
     """
     real_std = 0.1
     #Start a function, any function
     X = np.linspace(0.0, np.pi*2, 100)[:, None]
     Y = np.sin(X) + np.random.randn(*X.shape)*real_std
+    Y = Y/Y.max()
     Yc = Y.copy()
 
     X_full = np.linspace(0.0, np.pi*2, 500)[:, None]
     Y_full = np.sin(X_full)
-
-    Y = Y/Y.max()
+    Y_full = Y_full/Y_full.max()
 
     #Slightly noisy data
     Yc[75:80] += 1
@@ -34,94 +33,93 @@ def student_t_approx():
     deg_free = 5
     print "Real noise: ", real_std
     initial_var_guess = 0.5
+    edited_real_sd = initial_var_guess
 
-    #t_rv = t(deg_free, loc=0, scale=real_var)
-    #noise = t_rvrvs(size=Y.shape)
-    #Y += noise
-
-    plt.figure(1)
-    plt.suptitle('Gaussian likelihood')
     # Kernel object
     kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
     kernel2 = kernel1.copy()
     kernel3 = kernel1.copy()
     kernel4 = kernel1.copy()
-    kernel5 = kernel1.copy()
-    kernel6 = kernel1.copy()
 
-    print "Clean Gaussian"
-    #A GP should completely break down due to the points as they get a lot of weight
-    # create simple GP model
-    m = GPy.models.GPRegression(X, Y, kernel=kernel1)
+    #Gaussian GP model on clean data
+    m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
     # optimize
-    m.ensure_default_constraints()
-    m.constrain_fixed('white', 1e-4)
-    m.randomize()
-    m.optimize()
-    # plot
-    ax = plt.subplot(211)
-    m.plot(ax=ax)
-    plt.plot(X_full, Y_full)
-    plt.ylim(-1.5, 1.5)
-    plt.title('Gaussian clean')
-    print m
+    m1.ensure_default_constraints()
+    m1.constrain_fixed('white', 1e-5)
+    m1.randomize()
 
-    #Corrupt
-    print "Corrupt Gaussian"
-    m = GPy.models.GPRegression(X, Yc, kernel=kernel2)
-    m.ensure_default_constraints()
-    m.constrain_fixed('white', 1e-4)
-    m.randomize()
-    m.optimize()
-    ax = plt.subplot(212)
-    m.plot(ax=ax)
-    plt.plot(X_full, Y_full)
-    plt.ylim(-1.5, 1.5)
-    plt.title('Gaussian corrupt')
-    print m
+    #Gaussian GP model on corrupt data
+    m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2)
+    m2.ensure_default_constraints()
+    m2.constrain_fixed('white', 1e-5)
+    m2.randomize()
 
-    plt.figure(2)
-    plt.suptitle('Student-t likelihood')
-    edited_real_sd = initial_var_guess
-
-    print "Clean student t, rasm"
+    #Student t GP model on clean data
     t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
-    m = GPy.models.GPRegression(X, Y.copy(), kernel6, likelihood=stu_t_likelihood)
-    m.ensure_default_constraints()
-    m.constrain_positive('t_noise')
-    m.constrain_fixed('white', 1e-4)
-    m.randomize()
-    #m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    ax = plt.subplot(211)
-    m.plot(ax=ax)
-    plt.plot(X_full, Y_full)
-    plt.ylim(-1.5, 1.5)
-    plt.title('Student-t rasm clean')
+    m3 = GPy.models.GPRegression(X, Y.copy(), kernel3, likelihood=stu_t_likelihood)
+    m3.ensure_default_constraints()
+    m3.constrain_bounded('t_noise', 1e-6, 10.)
+    m3.constrain_fixed('white', 1e-5)
+    m3.randomize()
 
-    print "Corrupt student t, rasm"
+    #Student t GP model on corrupt data
     t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
     corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
-    m = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
-    m.ensure_default_constraints()
-    m.constrain_bounded('t_noise', 1e-6, 10.)
-    m.constrain_fixed('white', 1e-4)
-    m.randomize()
-    for a in range(1):
-        m.randomize()
-        m_start = m.copy()
-        print m
-        m.optimize('scg', messages=1)
-    print(m)
-    ax = plt.subplot(212)
-    m.plot(ax=ax)
-    plt.plot(X_full, Y_full)
-    plt.ylim(-1.5, 1.5)
-    plt.title('Student-t rasm corrupt')
+    m4 = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
+    m4.ensure_default_constraints()
+    m4.constrain_bounded('t_noise', 1e-6, 10.)
+    m4.constrain_fixed('white', 1e-5)
+    m4.randomize()
 
-    return m
+    if optimize:
+        optimizer='scg'
+        print "Clean Gaussian"
+        m1.optimize(optimizer, messages=1)
+        print "Corrupt Gaussian"
+        m2.optimize(optimizer, messages=1)
+        print "Clean student t"
+        m3.optimize(optimizer, messages=1)
+        print "Corrupt student t"
+        m4.optimize(optimizer, messages=1)
+
+    if False:
+        print m1
+        print m3
+        plt.figure(3)
+        plt.scatter(X, m1.likelihood.Y, c='g')
+        plt.scatter(X, m3.likelihood.Y, c='r')
+
+    if plot:
+        plt.figure(1)
+        plt.suptitle('Gaussian likelihood')
+        ax = plt.subplot(211)
+        m1.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Gaussian clean')
+
+        ax = plt.subplot(212)
+        m2.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Gaussian corrupt')
+
+        plt.figure(2)
+        plt.suptitle('Student-t likelihood')
+        ax = plt.subplot(211)
+        m3.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Student-t rasm clean')
+
+        ax = plt.subplot(212)
+        m4.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Student-t rasm corrupt')
+
+    return m1, m2, m3, m4
 
 def boston_example():
     import sklearn
@@ -294,3 +292,4 @@ def precipitation_example():
     for n, (train, test) in enumerate(kf):
         X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
         print "Fold {}".format(n)
+
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index 57160d64..e5dcdd19 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -15,6 +15,7 @@ import scipy as sp
 from likelihood import likelihood
 from ..util.linalg import mdot, jitchol, pddet, dpotrs
 from functools import partial as partial_func
+import warnings
 
 class Laplace(likelihood):
     """Laplace approximation to a posterior"""
@@ -64,6 +65,7 @@ class Laplace(likelihood):
         self.YYT = None
 
         self.old_Ki_f = None
+        self.bad_fhat = False
 
     def predictive_values(self,mu,var,full_cov,**noise_args):
         if full_cov:
@@ -198,18 +200,16 @@ class Laplace(likelihood):
         Y_tilde = Wi*self.Ki_f + self.f_hat
 
         self.Wi_K_i = self.W12BiW12
-        self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        self.lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
-        self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
+        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
+        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
 
-        Z_tilde = (+ self.lik
+        Z_tilde = (+ lik
                    - 0.5*self.ln_B_det
-                   + 0.5*self.ln_det_Wi_K
+                   + 0.5*ln_det_Wi_K
                    - 0.5*self.f_Ki_f
-                   + 0.5*self.y_Wi_Ki_i_y
+                   + 0.5*y_Wi_K_i_y
                   )
-        #print "Term, {}, {}, {}, {}, {}".format(self.lik, - 0.5*self.ln_B_det, + 0.5*self.ln_det_Wi_K, - 0.5*self.f_Ki_f, + 0.5*self.y_Wi_Ki_i_y)
-
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
@@ -247,7 +247,10 @@ class Laplace(likelihood):
         #At this point get the hessian matrix (or vector as W is diagonal)
         self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)
 
-        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
+        if not self.noise_model.log_concave:
+            #print "Under 1e-10: {}".format(np.sum(self.W < 1e-6))
+            self.W[self.W < 1e-6] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+
         self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))
 
         self.Ki_f = self.Ki_f
@@ -283,11 +286,11 @@ class Laplace(likelihood):
         except:
             import ipdb; ipdb.set_trace()
 
-        W12BiW12 = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
+        W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
         ln_B_det = 2*np.sum(np.log(np.diag(L)))
-        return W12BiW12, ln_B_det
+        return W12BiW12a, ln_B_det
 
-    def rasm_mode(self, K, MAX_ITER=30):
+    def rasm_mode(self, K, MAX_ITER=40):
         """
         Rasmussen's numerically stable mode finding
         For nomenclature see Rasmussen & Williams 2006
@@ -302,9 +305,10 @@ class Laplace(likelihood):
         """
         #old_Ki_f = np.zeros((self.N, 1))
 
-        #Start f's at zero originally
-        if self.old_Ki_f is None:
-            old_Ki_f = np.zeros((self.N, 1))
+        #Start f's at zero originally of if we have gone off track, try restarting
+        if self.old_Ki_f is None or self.bad_fhat:
+            old_Ki_f = np.random.rand(self.N, 1)/50.0
+            #old_Ki_f = self.Y
             f = np.dot(K, old_Ki_f)
         else:
             #Start at the old best point
@@ -318,7 +322,7 @@ class Laplace(likelihood):
             return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data)
 
         difference = np.inf
-        epsilon = 1e-5
+        epsilon = 1e-7
         #step_size = 1
         #rs = 0
         i = 0
@@ -381,14 +385,20 @@ class Laplace(likelihood):
 
             #difference = abs(new_obj - old_obj)
             #old_obj = new_obj.copy()
-            difference = np.abs(np.sum(f - f_old))
-            #difference = np.abs(np.sum(Ki_f - old_Ki_f))
+            difference = np.abs(np.sum(f - f_old)) + np.abs(np.sum(Ki_f - old_Ki_f))
+            #difference = np.abs(np.sum(Ki_f - old_Ki_f))/np.float(self.N)
             old_Ki_f = Ki_f.copy()
             i += 1
 
         self.old_Ki_f = old_Ki_f.copy()
+
+        #Warn of bad fits
         if difference > epsilon:
-            print "Not perfect f_hat fit difference: {}".format(difference)
+            self.bad_fhat = True
+            warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
+        elif self.bad_fhat:
+            self.bad_fhat = False
+            warnings.warn("f_hat now perfect again")
 
         self.Ki_f = Ki_f
         return f

From b26c62f6af4c9267025a9066b58419cc7943a88f Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 29 Nov 2013 12:00:37 +0000
Subject: [PATCH 242/252] Added constant to Z_tilde, now log likelihoods are
 equal!

---
 GPy/examples/non_gaussian.py     |  7 ---
 GPy/examples/stochastic.py       |  7 ---
 GPy/likelihoods/laplace.py       |  9 ++--
 GPy/testing/likelihoods_tests.py | 89 ++++++++++++++++++++++++++++++++
 4 files changed, 93 insertions(+), 19 deletions(-)

diff --git a/GPy/examples/non_gaussian.py b/GPy/examples/non_gaussian.py
index 622b3edd..620efc5f 100644
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@@ -83,13 +83,6 @@ def student_t_approx(optimize=True, plot=True):
         print "Corrupt student t"
         m4.optimize(optimizer, messages=1)
 
-    if False:
-        print m1
-        print m3
-        plt.figure(3)
-        plt.scatter(X, m1.likelihood.Y, c='g')
-        plt.scatter(X, m3.likelihood.Y, c='r')
-
     if plot:
         plt.figure(1)
         plt.suptitle('Gaussian likelihood')
diff --git a/GPy/examples/stochastic.py b/GPy/examples/stochastic.py
index 21011901..73daef36 100644
--- a/GPy/examples/stochastic.py
+++ b/GPy/examples/stochastic.py
@@ -32,10 +32,3 @@ def toy_1d():
 
     m.plot_traces()
     return m
-
-
-
-
-
-
-
diff --git a/GPy/likelihoods/laplace.py b/GPy/likelihoods/laplace.py
index e5dcdd19..0def0c8b 100644
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@@ -209,7 +209,9 @@ class Laplace(likelihood):
                    + 0.5*ln_det_Wi_K
                    - 0.5*self.f_Ki_f
                    + 0.5*y_Wi_K_i_y
+                   + self.NORMAL_CONST
                   )
+
         #Convert to float as its (1, 1) and Z must be a scalar
         self.Z = np.float64(Z_tilde)
         self.Y = Y_tilde
@@ -271,7 +273,7 @@ class Laplace(likelihood):
         :returns: (W12BiW12, ln_B_det)
         """
         if not self.noise_model.log_concave:
-            #print "Under 1e-10: {}".format(np.sum(W < 1e-10))
+            #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
             W[W < 1e-6] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                 # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                 # To cause the posterior to become less certain than the prior and likelihood,
@@ -281,10 +283,7 @@ class Laplace(likelihood):
         #W is diagonal so its sqrt is just the sqrt of the diagonal elements
         W_12 = np.sqrt(W)
         B = np.eye(self.N) + W_12*K*W_12.T
-        try:
-            L = jitchol(B)
-        except:
-            import ipdb; ipdb.set_trace()
+        L = jitchol(B)
 
         W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
         ln_B_det = 2*np.sum(np.log(np.diag(L)))
diff --git a/GPy/testing/likelihoods_tests.py b/GPy/testing/likelihoods_tests.py
index 9b7b7eb6..58c9a64b 100644
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@@ -593,6 +593,95 @@ class LaplaceTests(unittest.TestCase):
         grad.checkgrad(verbose=1)
         self.assertTrue(grad.checkgrad())
 
+    #@unittest.skip('Not working yet, needs to be checked')
+    def test_laplace_log_likelihood(self):
+        debug = False
+        real_std = 0.1
+        initial_var_guess = 0.5
+
+        #Start a function, any function
+        X = np.linspace(0.0, np.pi*2, 100)[:, None]
+        Y = np.sin(X) + np.random.randn(*X.shape)*real_std
+        Y = Y/Y.max()
+        #Yc = Y.copy()
+        #Yc[75:80] += 1
+        kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        kernel2 = kernel1.copy()
+
+        m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
+        m1.constrain_fixed('white', 1e-6)
+        m1['noise'] = initial_var_guess
+        m1.constrain_bounded('noise', 1e-4, 10)
+        m1.constrain_bounded('rbf', 1e-4, 10)
+        m1.ensure_default_constraints()
+        m1.randomize()
+
+        gauss_distr = GPy.likelihoods.gaussian(variance=initial_var_guess, D=1, N=Y.shape[0])
+        laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), gauss_distr)
+        m2 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel2, likelihood=laplace_likelihood)
+        m2.ensure_default_constraints()
+        m2.constrain_fixed('white', 1e-6)
+        m2.constrain_bounded('rbf', 1e-4, 10)
+        m2.constrain_bounded('noise', 1e-4, 10)
+        m2.randomize()
+
+        if debug:
+            print m1
+            print m2
+        optimizer = 'scg'
+        print "Gaussian"
+        m1.optimize(optimizer, messages=debug)
+        print "Laplace Gaussian"
+        m2.optimize(optimizer, messages=debug)
+        if debug:
+            print m1
+            print m2
+
+        m2._set_params(m1._get_params())
+
+        #Predict for training points to get posterior mean and variance
+        post_mean, post_var, _, _ = m1.predict(X)
+        post_mean_approx, post_var_approx, _, _ = m2.predict(X)
+
+        if debug:
+            import pylab as pb
+            pb.figure(5)
+            pb.title('posterior means')
+            pb.scatter(X, post_mean, c='g')
+            pb.scatter(X, post_mean_approx, c='r', marker='x')
+
+            pb.figure(6)
+            pb.title('plot_f')
+            m1.plot_f(fignum=6)
+            m2.plot_f(fignum=6)
+            fig, axes = pb.subplots(2, 1)
+            fig.suptitle('Covariance matricies')
+            a1 = pb.subplot(121)
+            a1.matshow(m1.likelihood.covariance_matrix)
+            a2 = pb.subplot(122)
+            a2.matshow(m2.likelihood.covariance_matrix)
+
+            pb.figure(8)
+            pb.scatter(X, m1.likelihood.Y, c='g')
+            pb.scatter(X, m2.likelihood.Y, c='r', marker='x')
+
+
+
+        #Check Y's are the same
+        np.testing.assert_almost_equal(Y, m2.likelihood.Y, decimal=5)
+        #Check marginals are the same
+        np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), decimal=2)
+        #Check marginals are the same with random
+        m1.randomize()
+        m2._set_params(m1._get_params())
+        np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), decimal=2)
+
+        #Check they are checkgradding
+        #m1.checkgrad(verbose=1)
+        #m2.checkgrad(verbose=1)
+        self.assertTrue(m1.checkgrad())
+        self.assertTrue(m2.checkgrad())
+
 if __name__ == "__main__":
     print "Running unit tests"
     unittest.main()

From 68ece192118deb816c1513cd59f712909db37af7 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 29 Nov 2013 14:20:33 +0000
Subject: [PATCH 243/252] Fixed gp_base and svigp for sampling (doesn't use it
 but needs the arguments)

---
 GPy/core/gp_base.py | 12 ++++++------
 GPy/core/svigp.py   |  5 ++---
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index 548e2924..2577e06c 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -16,7 +16,7 @@ class GPBase(Model):
     def __init__(self, X, likelihood, kernel, normalize_X=False):
         if len(X.shape)==1:
             X = X.reshape(-1,1)
-            warning.warn("One dimension output (N,) being reshaped to (N,1)")
+            warnings.warn("One dimension output (N,) being reshaped to (N,1)")
         self.X = X
         assert len(self.X.shape) == 2, "too many dimensions for X input"
         self.num_data, self.input_dim = self.X.shape
@@ -76,7 +76,7 @@ class GPBase(Model):
         :type noise_model: integer.
         :returns: Ysim: set of simulations, a Numpy array (N x samples).
         """
-        Ysim = self.posterior_samples_f(X, size, which_parts=which_parts, full_cov=True)
+        Ysim = self.posterior_samples_f(X, size, which_parts=which_parts)
         if isinstance(self.likelihood,Gaussian):
             noise_std = np.sqrt(self.likelihood._get_params())
             Ysim += np.random.normal(0,noise_std,Ysim.shape)
@@ -107,7 +107,7 @@ class GPBase(Model):
             levels=20, samples=0, fignum=None, ax=None, resolution=None,
             plot_raw=False,
             linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
-        """ 
+        """
         Plot the posterior of the GP.
           - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
           - In two dimsensions, a contour-plot shows the mean predicted function
@@ -176,8 +176,8 @@ class GPBase(Model):
                 upper = m + 2*np.sqrt(v)
                 Y = self.likelihood.Y
             else:
-                m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts,sampling=False) #Compute the exact mean
-                m_, v_, lower, upper = self.predict(Xgrid, which_parts=which_parts,sampling=True,num_samples=15000) #Apporximate the percentiles
+                m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts, sampling=False) #Compute the exact mean
+                m_, v_, lower, upper = self.predict(Xgrid, which_parts=which_parts, sampling=True, num_samples=15000) #Apporximate the percentiles
                 Y = self.likelihood.data
             for d in which_data_ycols:
                 gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
@@ -185,7 +185,7 @@ class GPBase(Model):
 
             #optionally plot some samples
             if samples: #NOTE not tested with fixed_inputs
-                Ysim = self.posterior_samples(Xgrid, samples, which_parts=which_parts, full_cov=True)
+                Ysim = self.posterior_samples(Xgrid, samples, which_parts=which_parts)
                 for yi in Ysim.T:
                     ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
                     #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py
index 9f27f465..fdd95aa8 100644
--- a/GPy/core/svigp.py
+++ b/GPy/core/svigp.py
@@ -31,7 +31,6 @@ class SVIGP(GPBase):
 
     """
 
-
     def __init__(self, X, likelihood, kernel, Z, q_u=None, batchsize=10, X_variance=None):
         GPBase.__init__(self, X, likelihood, kernel, normalize_X=False)
         self.batchsize=batchsize
@@ -433,7 +432,7 @@ class SVIGP(GPBase):
             else:
                 return mu, diag_var[:,None]
 
-    def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
+    def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False, sampling=False, num_samples=15000):
         # normalize X values
         Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
         if X_variance_new is not None:
@@ -443,7 +442,7 @@ class SVIGP(GPBase):
         mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
 
         # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
+        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, sampling=sampling, num_samples=num_samples)
 
         return mean, var, _025pm, _975pm
 

From 3cd808ccccd32166779abe52837a741dbbb49c24 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 29 Nov 2013 14:20:59 +0000
Subject: [PATCH 244/252] Added optimize and plot for classification,
 non_gaussian and stochastic examples

---
 GPy/examples/classification.py | 114 +++++++++++++++++---------------
 GPy/examples/non_gaussian.py   | 116 ++++++++++++++++-----------------
 GPy/examples/stochastic.py     |  23 ++++---
 3 files changed, 132 insertions(+), 121 deletions(-)

diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index 05b6af74..f9aaddd1 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -6,12 +6,11 @@
 Gaussian Processes classification
 """
 import pylab as pb
-import numpy as np
 import GPy
 
 default_seed = 10000
 
-def oil(num_inducing=50, max_iters=100, kernel=None):
+def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
     """
     Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
 
@@ -25,7 +24,7 @@ def oil(num_inducing=50, max_iters=100, kernel=None):
     Ytest[Ytest.flatten()==-1] = 0
 
     # Create GP model
-    m = GPy.models.SparseGPClassification(X, Y,kernel=kernel,num_inducing=num_inducing)
+    m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, num_inducing=num_inducing)
 
     # Contrain all parameters to be positive
     m.tie_params('.*len')
@@ -33,15 +32,16 @@ def oil(num_inducing=50, max_iters=100, kernel=None):
     m.update_likelihood_approximation()
 
     # Optimize
-    m.optimize(max_iters=max_iters)
+    if optimize:
+        m.optimize(max_iters=max_iters)
     print(m)
 
     #Test
     probs = m.predict(Xtest)[0]
-    GPy.util.classification.conf_matrix(probs,Ytest)
+    GPy.util.classification.conf_matrix(probs, Ytest)
     return m
 
-def toy_linear_1d_classification(seed=default_seed):
+def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
     """
     Simple 1D classification example using EP approximation
 
@@ -58,21 +58,23 @@ def toy_linear_1d_classification(seed=default_seed):
     m = GPy.models.GPClassification(data['X'], Y)
 
     # Optimize
-    #m.update_likelihood_approximation()
-    # Parameters optimization:
-    #m.optimize()
-    #m.update_likelihood_approximation()
-    m.pseudo_EM()
+    if optimize:
+        #m.update_likelihood_approximation()
+        # Parameters optimization:
+        #m.optimize()
+        #m.update_likelihood_approximation()
+        m.pseudo_EM()
 
     # Plot
-    fig, axes = pb.subplots(2,1)
-    m.plot_f(ax=axes[0])
-    m.plot(ax=axes[1])
-    print(m)
+    if plot:
+        fig, axes = pb.subplots(2, 1)
+        m.plot_f(ax=axes[0])
+        m.plot(ax=axes[1])
 
+    print m
     return m
 
-def toy_linear_1d_classification_laplace(seed=default_seed):
+def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=True):
     """
     Simple 1D classification example using Laplace approximation
 
@@ -90,24 +92,25 @@ def toy_linear_1d_classification_laplace(seed=default_seed):
 
     # Model definition
     m = GPy.models.GPClassification(data['X'], Y, likelihood=laplace_likelihood)
-
     print m
+
     # Optimize
-    #m.update_likelihood_approximation()
-    # Parameters optimization:
-    m.optimize('bfgs', messages=1)
-    #m.pseudo_EM()
+    if optimize:
+        #m.update_likelihood_approximation()
+        # Parameters optimization:
+        m.optimize('bfgs', messages=1)
+        #m.pseudo_EM()
 
     # Plot
-    fig, axes = pb.subplots(2,1)
-    m.plot_f(ax=axes[0])
-    m.plot(ax=axes[1])
-    print(m)
+    if plot:
+        fig, axes = pb.subplots(2, 1)
+        m.plot_f(ax=axes[0])
+        m.plot(ax=axes[1])
 
+    print m
     return m
 
-
-def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
+def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, optimize=True, plot=True):
     """
     Sparse 1D classification example
 
@@ -121,24 +124,26 @@ def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
     Y[Y.flatten() == -1] = 0
 
     # Model definition
-    m = GPy.models.SparseGPClassification(data['X'], Y,num_inducing=num_inducing)
-    m['.*len']= 4.
+    m = GPy.models.SparseGPClassification(data['X'], Y, num_inducing=num_inducing)
+    m['.*len'] = 4.
 
     # Optimize
-    #m.update_likelihood_approximation()
-    # Parameters optimization:
-    #m.optimize()
-    m.pseudo_EM()
+    if optimize:
+        #m.update_likelihood_approximation()
+        # Parameters optimization:
+        #m.optimize()
+        m.pseudo_EM()
 
     # Plot
-    fig, axes = pb.subplots(2,1)
-    m.plot_f(ax=axes[0])
-    m.plot(ax=axes[1])
-    print(m)
+    if plot:
+        fig, axes = pb.subplots(2, 1)
+        m.plot_f(ax=axes[0])
+        m.plot(ax=axes[1])
 
+    print m
     return m
 
-def toy_heaviside(seed=default_seed):
+def toy_heaviside(seed=default_seed, optimize=True, plot=True):
     """
     Simple 1D classification example using a heavy side gp transformation
 
@@ -153,24 +158,26 @@ def toy_heaviside(seed=default_seed):
 
     # Model definition
     noise_model = GPy.likelihoods.bernoulli(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
-    likelihood = GPy.likelihoods.EP(Y,noise_model)
+    likelihood = GPy.likelihoods.EP(Y, noise_model)
     m = GPy.models.GPClassification(data['X'], likelihood=likelihood)
 
     # Optimize
-    m.update_likelihood_approximation()
-    # Parameters optimization:
-    m.optimize()
-    #m.pseudo_EM()
+    if optimize:
+        m.update_likelihood_approximation()
+        # Parameters optimization:
+        m.optimize()
+        #m.pseudo_EM()
 
     # Plot
-    fig, axes = pb.subplots(2,1)
-    m.plot_f(ax=axes[0])
-    m.plot(ax=axes[1])
-    print(m)
+    if plot:
+        fig, axes = pb.subplots(2, 1)
+        m.plot_f(ax=axes[0])
+        m.plot(ax=axes[1])
 
+    print m
     return m
 
-def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=None):
+def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=None, optimize=True, plot=True):
     """
     Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
 
@@ -187,7 +194,7 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
     Y[Y.flatten()==-1] = 0
 
     if model_type == 'Full':
-        m = GPy.models.GPClassification(data['X'], Y,kernel=kernel)
+        m = GPy.models.GPClassification(data['X'], Y, kernel=kernel)
 
     elif model_type == 'DTC':
         m = GPy.models.SparseGPClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
@@ -197,8 +204,11 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
         m = GPy.models.FITCClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
         m['.*len'] = 3.
 
-    m.pseudo_EM()
-    print(m)
-    m.plot()
+    if optimize:
+        m.pseudo_EM()
 
+    if plot:
+        m.plot()
+
+    print m
     return m
diff --git a/GPy/examples/non_gaussian.py b/GPy/examples/non_gaussian.py
index 620efc5f..46849e01 100644
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@@ -114,7 +114,7 @@ def student_t_approx(optimize=True, plot=True):
 
     return m1, m2, m3, m4
 
-def boston_example():
+def boston_example(optimize=True, plot=True):
     import sklearn
     from sklearn.cross_validation import KFold
     optimizer='bfgs'
@@ -143,7 +143,6 @@ def boston_example():
         noise = 1e-1 #np.exp(-2)
         rbf_len = 0.5
         data_axis_plot = 4
-        plot = False
         kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
         kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
 
@@ -158,17 +157,13 @@ def boston_example():
         mgp['rbf_len'] = rbf_len
         mgp['noise'] = noise
         print mgp
-        mgp.optimize(optimizer=optimizer, messages=messages)
+        if optimize:
+            mgp.optimize(optimizer=optimizer, messages=messages)
         Y_test_pred = mgp.predict(X_test)
         score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
         pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test))
         print mgp
         print pred_density
-        if plot:
-            plt.figure()
-            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-            plt.title('GP gauss')
 
         print "Gaussian Laplace GP"
         N, D = Y_train.shape
@@ -181,20 +176,13 @@ def boston_example():
         mg['rbf_len'] = rbf_len
         mg['noise'] = noise
         print mg
-        try:
+        if optimize:
             mg.optimize(optimizer=optimizer, messages=messages)
-        except Exception:
-            print "Blew up"
         Y_test_pred = mg.predict(X_test)
         score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
         pred_density[2, n] = np.mean(mg.log_predictive_density(X_test, Y_test))
         print pred_density
         print mg
-        if plot:
-            plt.figure()
-            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-            plt.title('Lap gauss')
 
         for stu_num, df in enumerate(degrees_freedoms):
             #Student T
@@ -208,61 +196,71 @@ def boston_example():
             mstu_t['rbf_len'] = rbf_len
             mstu_t['t_noise'] = noise
             print mstu_t
-            try:
+            if optimize:
                 mstu_t.optimize(optimizer=optimizer, messages=messages)
-            except Exception:
-                print "Blew up"
             Y_test_pred = mstu_t.predict(X_test)
             score_folds[3+stu_num, n] = rmse(Y_test, Y_test_pred[0])
             pred_density[3+stu_num, n] = np.mean(mstu_t.log_predictive_density(X_test, Y_test))
             print pred_density
             print mstu_t
-            if plot:
-                plt.figure()
-                plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-                plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-                plt.title('Stu t {}df'.format(df))
+
+    if plot:
+        plt.figure()
+        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+        plt.title('GP gauss')
+
+        plt.figure()
+        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+        plt.title('Lap gauss')
+
+        plt.figure()
+        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+        plt.title('Stu t {}df'.format(df))
 
     print "Average scores: {}".format(np.mean(score_folds, 1))
     print "Average pred density: {}".format(np.mean(pred_density, 1))
 
-    #Plotting
-    stu_t_legends = ['Student T, df={}'.format(df) for df in degrees_freedoms]
-    legends = ['Baseline', 'Gaussian', 'Laplace Approx Gaussian'] + stu_t_legends
+    if plot:
+        #Plotting
+        stu_t_legends = ['Student T, df={}'.format(df) for df in degrees_freedoms]
+        legends = ['Baseline', 'Gaussian', 'Laplace Approx Gaussian'] + stu_t_legends
 
-    #Plot boxplots for RMSE density
-    fig = plt.figure()
-    ax=fig.add_subplot(111)
-    plt.title('RMSE')
-    bp = ax.boxplot(score_folds.T, notch=0, sym='+', vert=1, whis=1.5)
-    plt.setp(bp['boxes'], color='black')
-    plt.setp(bp['whiskers'], color='black')
-    plt.setp(bp['fliers'], color='red', marker='+')
-    xtickNames = plt.setp(ax, xticklabels=legends)
-    plt.setp(xtickNames, rotation=45, fontsize=8)
-    ax.set_ylabel('RMSE')
-    ax.set_xlabel('Distribution')
-    #Make grid and put it below boxes
-    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
-              alpha=0.5)
-    ax.set_axisbelow(True)
+        #Plot boxplots for RMSE density
+        fig = plt.figure()
+        ax=fig.add_subplot(111)
+        plt.title('RMSE')
+        bp = ax.boxplot(score_folds.T, notch=0, sym='+', vert=1, whis=1.5)
+        plt.setp(bp['boxes'], color='black')
+        plt.setp(bp['whiskers'], color='black')
+        plt.setp(bp['fliers'], color='red', marker='+')
+        xtickNames = plt.setp(ax, xticklabels=legends)
+        plt.setp(xtickNames, rotation=45, fontsize=8)
+        ax.set_ylabel('RMSE')
+        ax.set_xlabel('Distribution')
+        #Make grid and put it below boxes
+        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
+                alpha=0.5)
+        ax.set_axisbelow(True)
 
-    #Plot boxplots for predictive density
-    fig = plt.figure()
-    ax=fig.add_subplot(111)
-    plt.title('Predictive density')
-    bp = ax.boxplot(pred_density[1:,:].T, notch=0, sym='+', vert=1, whis=1.5)
-    plt.setp(bp['boxes'], color='black')
-    plt.setp(bp['whiskers'], color='black')
-    plt.setp(bp['fliers'], color='red', marker='+')
-    xtickNames = plt.setp(ax, xticklabels=legends[1:])
-    plt.setp(xtickNames, rotation=45, fontsize=8)
-    ax.set_ylabel('Mean Log probability P(Y*|Y)')
-    ax.set_xlabel('Distribution')
-    #Make grid and put it below boxes
-    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
-              alpha=0.5)
-    ax.set_axisbelow(True)
+        #Plot boxplots for predictive density
+        fig = plt.figure()
+        ax=fig.add_subplot(111)
+        plt.title('Predictive density')
+        bp = ax.boxplot(pred_density[1:,:].T, notch=0, sym='+', vert=1, whis=1.5)
+        plt.setp(bp['boxes'], color='black')
+        plt.setp(bp['whiskers'], color='black')
+        plt.setp(bp['fliers'], color='red', marker='+')
+        xtickNames = plt.setp(ax, xticklabels=legends[1:])
+        plt.setp(xtickNames, rotation=45, fontsize=8)
+        ax.set_ylabel('Mean Log probability P(Y*|Y)')
+        ax.set_xlabel('Distribution')
+        #Make grid and put it below boxes
+        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
+                alpha=0.5)
+        ax.set_axisbelow(True)
     return mstu_t
 
 def precipitation_example():
diff --git a/GPy/examples/stochastic.py b/GPy/examples/stochastic.py
index 73daef36..c302ec7d 100644
--- a/GPy/examples/stochastic.py
+++ b/GPy/examples/stochastic.py
@@ -5,7 +5,7 @@ import pylab as pb
 import numpy as np
 import GPy
 
-def toy_1d():
+def toy_1d(optimize=True, plot=True):
     N = 2000
     M = 20
 
@@ -20,15 +20,18 @@ def toy_1d():
 
     m.param_steplength = 1e-4
 
-    fig = pb.figure()
-    ax = fig.add_subplot(111)
-    def cb():
-        ax.cla()
-        m.plot(ax=ax,Z_height=-3)
-        ax.set_ylim(-3,3)
-        fig.canvas.draw()
+    if plot:
+        fig = pb.figure()
+        ax = fig.add_subplot(111)
+        def cb(foo):
+            ax.cla()
+            m.plot(ax=ax,Z_height=-3)
+            ax.set_ylim(-3,3)
+            fig.canvas.draw()
 
-    m.optimize(500, callback=cb, callback_interval=1)
+    if optimize:
+        m.optimize(500, callback=cb, callback_interval=1)
 
-    m.plot_traces()
+    if plot:
+        m.plot_traces()
     return m

From 98074e1e6c16427c4f7c93034c2dd3fd2c8dacb6 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 29 Nov 2013 14:40:31 +0000
Subject: [PATCH 245/252] Changed more examples to accept optimize and plot

---
 GPy/examples/non_gaussian.py |  40 +++++-----
 GPy/examples/regression.py   | 138 ++++++++++++++++++++---------------
 GPy/examples/tutorials.py    |  79 +++++++++++---------
 3 files changed, 144 insertions(+), 113 deletions(-)

diff --git a/GPy/examples/non_gaussian.py b/GPy/examples/non_gaussian.py
index 46849e01..bda80137 100644
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@@ -263,24 +263,24 @@ def boston_example(optimize=True, plot=True):
         ax.set_axisbelow(True)
     return mstu_t
 
-def precipitation_example():
-    import sklearn
-    from sklearn.cross_validation import KFold
-    data = datasets.boston_housing()
-    X = data['X'].copy()
-    Y = data['Y'].copy()
-    X = X-X.mean(axis=0)
-    X = X/X.std(axis=0)
-    Y = Y-Y.mean()
-    Y = Y/Y.std()
-    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
-    num_folds = 10
-    kf = KFold(len(Y), n_folds=num_folds, indices=True)
-    score_folds = np.zeros((4, num_folds))
-    def rmse(Y, Ystar):
-        return np.sqrt(np.mean((Y-Ystar)**2))
-    #for train, test in kf:
-    for n, (train, test) in enumerate(kf):
-        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
-        print "Fold {}".format(n)
+#def precipitation_example():
+    #import sklearn
+    #from sklearn.cross_validation import KFold
+    #data = datasets.boston_housing()
+    #X = data['X'].copy()
+    #Y = data['Y'].copy()
+    #X = X-X.mean(axis=0)
+    #X = X/X.std(axis=0)
+    #Y = Y-Y.mean()
+    #Y = Y/Y.std()
+    #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+    #num_folds = 10
+    #kf = KFold(len(Y), n_folds=num_folds, indices=True)
+    #score_folds = np.zeros((4, num_folds))
+    #def rmse(Y, Ystar):
+        #return np.sqrt(np.mean((Y-Ystar)**2))
+    ##for train, test in kf:
+    #for n, (train, test) in enumerate(kf):
+        #X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
+        #print "Fold {}".format(n)
 
diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index 1ddb0a69..9b910005 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -101,9 +101,7 @@ def coregionalization_sparse(optimize=True, plot=True):
 
     return m
 
-
-
-def epomeo_gpx(optimize=True, plot=True):
+def epomeo_gpx(max_iters=200, optimize=True, plot=True):
     """
     Perform Gaussian process regression on the latitude and longitude data
     from the Mount Epomeo runs. Requires gpxpy to be installed on your system
@@ -141,8 +139,7 @@ def epomeo_gpx(optimize=True, plot=True):
 
     return m
 
-
-def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300):
+def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300, optimize=True, plot=True):
     """
     Show an example of a multimodal error surface for Gaussian process
     regression. Gene 939 has bimodal behaviour where the noisy mode is
@@ -160,13 +157,14 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
     data['Y'] = data['Y'] - np.mean(data['Y'])
 
     lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf)
-    pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
-    ax = pb.gca()
-    pb.xlabel('length scale')
-    pb.ylabel('log_10 SNR')
+    if plot:
+        pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
+        ax = pb.gca()
+        pb.xlabel('length scale')
+        pb.ylabel('log_10 SNR')
 
-    xlim = ax.get_xlim()
-    ylim = ax.get_ylim()
+        xlim = ax.get_xlim()
+        ylim = ax.get_ylim()
 
     # Now run a few optimizations
     models = []
@@ -183,16 +181,19 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
         optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
 
         # optimize
-        m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)
+        if optimize:
+            m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)
 
         optim_point_x[1] = m['rbf_lengthscale']
         optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
 
-        pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
+        if plot:
+            pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
         models.append(m)
 
-    ax.set_xlim(xlim)
-    ax.set_ylim(ylim)
+    if plot:
+        ax.set_xlim(xlim)
+        ax.set_ylim(ylim)
     return m # (models, lls)
 
 def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
@@ -295,6 +296,7 @@ def toy_poisson_rbf_1d(optimize=True, plot=True):
 
 def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
     """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
+    optimizer='scg'
     x_len = 30
     X = np.linspace(0, 10, x_len)[:, None]
     f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
@@ -307,7 +309,7 @@ def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
     m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
 
     if optimize:
-        m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
+        m.optimize(optimizer)
     if plot:
         m.plot()
         # plot the real underlying rate function
@@ -315,9 +317,7 @@ def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
 
     return m
 
-
-
-def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
+def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
     # Create an artificial dataset where the values in the targets (Y)
     # only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
     # see if this dependency can be recovered
@@ -347,13 +347,16 @@ def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
     # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
     # m.set_prior('.*lengthscale',len_prior)
 
-    m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
+    if optimize:
+        m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
 
-    m.kern.plot_ARD()
-    print(m)
+    if plot:
+        m.kern.plot_ARD()
+
+    print m
     return m
 
-def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
+def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
     # Create an artificial dataset where the values in the targets (Y)
     # only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
     # see if this dependency can be recovered
@@ -384,13 +387,16 @@ def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
     # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
     # m.set_prior('.*lengthscale',len_prior)
 
-    m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
+    if optimize:
+        m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
 
-    m.kern.plot_ARD()
-    print(m)
+    if plot:
+        m.kern.plot_ARD()
+
+    print m
     return m
 
-def robot_wireless(max_iters=100, kernel=None):
+def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
     """Predict the location of a robot given wirelss signal strength readings."""
     data = GPy.util.datasets.robot_wireless()
 
@@ -398,20 +404,24 @@ def robot_wireless(max_iters=100, kernel=None):
     m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel)
 
     # optimize
-    m.optimize(messages=True, max_iters=max_iters)
+    if optimize:
+        m.optimize(messages=True, max_iters=max_iters)
+
     Xpredict = m.predict(data['Ytest'])[0]
-    pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-')
-    pb.plot(Xpredict[:, 0], Xpredict[:, 1], 'b-')
-    pb.axis('equal')
-    pb.title('WiFi Localization with Gaussian Processes')
-    pb.legend(('True Location', 'Predicted Location'))
+    if plot:
+        pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-')
+        pb.plot(Xpredict[:, 0], Xpredict[:, 1], 'b-')
+        pb.axis('equal')
+        pb.title('WiFi Localization with Gaussian Processes')
+        pb.legend(('True Location', 'Predicted Location'))
 
     sse = ((data['Xtest'] - Xpredict)**2).sum()
-    print(m)
+
+    print m
     print('Sum of squares error on test data: ' + str(sse))
     return m
 
-def silhouette(max_iters=100):
+def silhouette(max_iters=100, optimize=True, plot=True):
     """Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
     data = GPy.util.datasets.silhouette()
 
@@ -419,12 +429,13 @@ def silhouette(max_iters=100):
     m = GPy.models.GPRegression(data['X'], data['Y'])
 
     # optimize
-    m.optimize(messages=True, max_iters=max_iters)
+    if optimize:
+        m.optimize(messages=True, max_iters=max_iters)
 
-    print(m)
+    print m
     return m
 
-def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100):
+def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True):
     """Run a 1D example of a sparse GP regression."""
     # sample inputs and outputs
     X = np.random.uniform(-3., 3., (num_samples, 1))
@@ -433,14 +444,17 @@ def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100):
     rbf = GPy.kern.rbf(1)
     # create simple GP Model
     m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
-
-
     m.checkgrad(verbose=1)
-    m.optimize('tnc', messages=1, max_iters=max_iters)
-    m.plot()
+
+    if optimize:
+        m.optimize('tnc', messages=1, max_iters=max_iters)
+
+    if plot:
+        m.plot()
+
     return m
 
-def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100):
+def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, optimize=True, plot=True):
     """Run a 2D example of a sparse GP regression."""
     X = np.random.uniform(-3., 3., (num_samples, 2))
     Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05
@@ -456,13 +470,18 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100):
 
     m.checkgrad()
 
-    # optimize and plot
-    m.optimize('tnc', messages=1, max_iters=max_iters)
-    m.plot()
-    print(m)
+    # optimize
+    if optimize:
+        m.optimize('tnc', messages=1, max_iters=max_iters)
+
+    # plot
+    if plot:
+        m.plot()
+
+    print m
     return m
 
-def uncertain_inputs_sparse_regression(optimize=True, plot=True):
+def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
     """Run a 1D example of a sparse GP regression with uncertain inputs."""
     fig, axes = pb.subplots(1, 2, figsize=(12, 5))
 
@@ -477,18 +496,23 @@ def uncertain_inputs_sparse_regression(optimize=True, plot=True):
 
     # create simple GP Model - no input uncertainty on this one
     m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
-    m.optimize('scg', messages=1, max_iters=max_iters)
-    m.plot(ax=axes[0])
-    axes[0].set_title('no input uncertainty')
 
+    if optimize:
+        m.optimize('scg', messages=1, max_iters=max_iters)
+
+    if plot:
+        m.plot(ax=axes[0])
+        axes[0].set_title('no input uncertainty')
+    print m
 
     # the same Model with uncertainty
     m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S)
-    m.optimize('scg', messages=1, max_iters=max_iters)
-    m.plot(ax=axes[1])
-    axes[1].set_title('with input uncertainty')
-    print(m)
-
-    fig.canvas.draw()
+    if optimize:
+        m.optimize('scg', messages=1, max_iters=max_iters)
+    if plot:
+        m.plot(ax=axes[1])
+        axes[1].set_title('with input uncertainty')
+        fig.canvas.draw()
 
+    print m
     return m
diff --git a/GPy/examples/tutorials.py b/GPy/examples/tutorials.py
index 69fc2aaf..7825992d 100644
--- a/GPy/examples/tutorials.py
+++ b/GPy/examples/tutorials.py
@@ -11,7 +11,7 @@ pb.ion()
 import numpy as np
 import GPy
 
-def tuto_GP_regression():
+def tuto_GP_regression(optimize=True, plot=True):
     """The detailed explanations of the commands used in this file can be found in the tutorial section"""
 
     X = np.random.uniform(-3.,3.,(20,1))
@@ -22,7 +22,8 @@ def tuto_GP_regression():
     m = GPy.models.GPRegression(X, Y, kernel)
 
     print m
-    m.plot()
+    if plot:
+        m.plot()
 
     m.constrain_positive('')
 
@@ -31,9 +32,9 @@ def tuto_GP_regression():
     m.constrain_bounded('.*lengthscale',1.,10. )
     m.constrain_fixed('.*noise',0.0025)
 
-    m.optimize()
-
-    m.optimize_restarts(num_restarts = 10)
+    if optimize:
+        m.optimize()
+        m.optimize_restarts(num_restarts = 10)
 
     #######################################################
     #######################################################
@@ -51,22 +52,26 @@ def tuto_GP_regression():
     m.constrain_positive('')
 
     # optimize and plot
-    m.optimize('tnc', max_f_eval = 1000)
-    m.plot()
-    print(m)
+    if optimize:
+        m.optimize('tnc', max_f_eval = 1000)
+    if plot:
+        m.plot()
+
+    print m
     return(m)
 
-def tuto_kernel_overview():
+def tuto_kernel_overview(optimize=True, plot=True):
     """The detailed explanations of the commands used in this file can be found in the tutorial section"""
     ker1 = GPy.kern.rbf(1)  # Equivalent to ker1 = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
     ker2 = GPy.kern.rbf(input_dim=1, variance = .75, lengthscale=2.)
     ker3 = GPy.kern.rbf(1, .5, .5)
-    
+
     print ker2
 
-    ker1.plot()
-    ker2.plot()
-    ker3.plot()
+    if plot:
+        ker1.plot()
+        ker2.plot()
+        ker3.plot()
 
     k1 = GPy.kern.rbf(1,1.,2.)
     k2 = GPy.kern.Matern32(1, 0.5, 0.2)
@@ -77,8 +82,8 @@ def tuto_kernel_overview():
 
     # Sum of kernels
     k_add = k1.add(k2)                          # By default, tensor=False
-    k_addtens = k1.add(k2,tensor=True)    
-    
+    k_addtens = k1.add(k2,tensor=True)
+
     k1 = GPy.kern.rbf(1,1.,2)
     k2 = GPy.kern.periodic_Matern52(1,variance=1e3, lengthscale=1, period = 1.5, lower=-5., upper = 5)
 
@@ -102,7 +107,7 @@ def tuto_kernel_overview():
     k.unconstrain('white')
     k.constrain_bounded('white',lower=1e-5,upper=.5)
     print k
-    
+
     k_cst = GPy.kern.bias(1,variance=1.)
     k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3)
     Kanova = (k_cst + k_mat).prod(k_cst + k_mat,tensor=True)
@@ -114,30 +119,32 @@ def tuto_kernel_overview():
 
     # Create GP regression model
     m = GPy.models.GPRegression(X, Y, Kanova)
-    fig = pb.figure(figsize=(5,5))
-    ax = fig.add_subplot(111)
-    m.plot(ax=ax)
-   
-    pb.figure(figsize=(20,3))
-    pb.subplots_adjust(wspace=0.5)
-    axs = pb.subplot(1,5,1)
-    m.plot(ax=axs)
-    pb.subplot(1,5,2)
-    pb.ylabel("=   ",rotation='horizontal',fontsize='30')
-    axs = pb.subplot(1,5,3)
-    m.plot(ax=axs, which_parts=[False,True,False,False])
-    pb.ylabel("cst          +",rotation='horizontal',fontsize='30')
-    axs = pb.subplot(1,5,4)
-    m.plot(ax=axs, which_parts=[False,False,True,False])
-    pb.ylabel("+   ",rotation='horizontal',fontsize='30')
-    axs = pb.subplot(1,5,5)
-    pb.ylabel("+   ",rotation='horizontal',fontsize='30')
-    m.plot(ax=axs, which_parts=[False,False,False,True])
+
+    if plot:
+        fig = pb.figure(figsize=(5,5))
+        ax = fig.add_subplot(111)
+        m.plot(ax=ax)
+
+        pb.figure(figsize=(20,3))
+        pb.subplots_adjust(wspace=0.5)
+        axs = pb.subplot(1,5,1)
+        m.plot(ax=axs)
+        pb.subplot(1,5,2)
+        pb.ylabel("=   ",rotation='horizontal',fontsize='30')
+        axs = pb.subplot(1,5,3)
+        m.plot(ax=axs, which_parts=[False,True,False,False])
+        pb.ylabel("cst          +",rotation='horizontal',fontsize='30')
+        axs = pb.subplot(1,5,4)
+        m.plot(ax=axs, which_parts=[False,False,True,False])
+        pb.ylabel("+   ",rotation='horizontal',fontsize='30')
+        axs = pb.subplot(1,5,5)
+        pb.ylabel("+   ",rotation='horizontal',fontsize='30')
+        m.plot(ax=axs, which_parts=[False,False,False,True])
 
     return(m)
 
 
-def model_interaction():
+def model_interaction(optimize=True, plot=True):
     X = np.random.randn(20,1)
     Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5.
     k = GPy.kern.rbf(1) + GPy.kern.bias(1)

From 9e6cc7ea6eef37ba0f03c9aeb660e31d02f949d8 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 29 Nov 2013 14:45:44 +0000
Subject: [PATCH 246/252] Minor changes to naming of signitures

---
 GPy/examples/dimensionality_reduction.py | 58 ++++++++++++------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 9120805c..65881573 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -3,23 +3,23 @@
 import numpy as _np
 default_seed = _np.random.seed(123344)
 
-def bgplvm_test_model(seed=default_seed, optimize=0, verbose=1, plot=0):
+def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False):
     """
-    model for testing purposes. Samples from a GP with rbf kernel and learns 
+    model for testing purposes. Samples from a GP with rbf kernel and learns
     the samples with a new kernel. Normally not for optimization, just model cheking
     """
     from GPy.likelihoods.gaussian import Gaussian
     import GPy
-    
+
     num_inputs = 13
     num_inducing = 5
-    if plot: 
+    if plot:
         output_dim = 1
         input_dim = 2
-    else: 
+    else:
         input_dim = 2
         output_dim = 25
-    
+
     # generate GPLVM-like data
     X = _np.random.rand(num_inputs, input_dim)
     lengthscales = _np.random.rand(input_dim)
@@ -43,7 +43,7 @@ def bgplvm_test_model(seed=default_seed, optimize=0, verbose=1, plot=0):
         import matplotlib.pyplot as pb
         m.plot()
         pb.title('PCA initialisation')
-   
+
     if optimize:
         m.optimize('scg', messages=verbose)
         if plot:
@@ -52,7 +52,7 @@ def bgplvm_test_model(seed=default_seed, optimize=0, verbose=1, plot=0):
 
     return m
 
-def gplvm_oil_100(optimize=1, verbose=1, plot=1):
+def gplvm_oil_100(optimize=True, verbose=1, plot=True):
     import GPy
     data = GPy.util.datasets.oil_100()
     Y = data['X']
@@ -64,7 +64,7 @@ def gplvm_oil_100(optimize=1, verbose=1, plot=1):
     if plot: m.plot_latent(labels=m.data_labels)
     return m
 
-def sparse_gplvm_oil(optimize=1, verbose=0, plot=1, N=100, Q=6, num_inducing=15, max_iters=50):
+def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_inducing=15, max_iters=50):
     import GPy
     _np.random.seed(0)
     data = GPy.util.datasets.oil()
@@ -77,12 +77,12 @@ def sparse_gplvm_oil(optimize=1, verbose=0, plot=1, N=100, Q=6, num_inducing=15,
     m.data_labels = data['Y'][:N].argmax(axis=1)
 
     if optimize: m.optimize('scg', messages=verbose, max_iters=max_iters)
-    if plot: 
+    if plot:
         m.plot_latent(labels=m.data_labels)
         m.kern.plot_ARD()
     return m
 
-def swiss_roll(optimize=1, verbose=1, plot=1, N=1000, num_inducing=15, Q=4, sigma=.2):
+def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4, sigma=.2):
     import GPy
     from GPy.util.datasets import swiss_roll_generated
     from GPy.models import BayesianGPLVM
@@ -131,16 +131,16 @@ def swiss_roll(optimize=1, verbose=1, plot=1, N=1000, num_inducing=15, Q=4, sigm
 
     if optimize:
         m.optimize('scg', messages=verbose, max_iters=2e3)
-    
+
     if plot:
         fig = plt.figure('fitted')
         ax = fig.add_subplot(111)
         s = m.input_sensitivity().argsort()[::-1][:2]
         ax.scatter(*m.X.T[s], c=c)
-        
+
     return m
 
-def bgplvm_oil(optimize=1, verbose=1, plot=1, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
+def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
     import GPy
     from GPy.likelihoods import Gaussian
     from matplotlib import pyplot as plt
@@ -164,7 +164,7 @@ def bgplvm_oil(optimize=1, verbose=1, plot=1, N=200, Q=7, num_inducing=40, max_i
         m.plot_latent(ax=latent_axes)
         data_show = GPy.util.visualize.vector_show(y)
         lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], # @UnusedVariable
-            m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)  
+            m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
         raw_input('Press enter to finish')
         plt.close(fig)
     return m
@@ -227,12 +227,12 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
 #     from GPy.util.datasets import simulation_BGPLVM
 #     from GPy import kern
 #     from GPy.models import BayesianGPLVM
-# 
+#
 #     sim_data = simulation_BGPLVM()
 #     Y = sim_data['Y']
 #     mu = sim_data['mu']
 #     num_inducing, [_, Q] = 3, mu.shape
-# 
+#
 #     k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2))
 #     m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k,
 #                        _debug=False)
@@ -241,8 +241,8 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
 #     m['linear_variance'] = .01
 #     return m
 
-def bgplvm_simulation(optimize=1, verbose=1, 
-                      plot=1, plot_sim=False,
+def bgplvm_simulation(optimize=True, verbose=1,
+                      plot=True, plot_sim=False,
                       max_iters=2e4,
                       ):
     from GPy import kern
@@ -268,7 +268,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
     from GPy import kern
     from GPy.models import MRD
     from GPy.likelihoods import Gaussian
-    
+
     D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
     _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
     likelihood_list = [Gaussian(x, normalize=True) for x in Ylist]
@@ -290,7 +290,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
 
 def brendan_faces(optimize=True, verbose=True, plot=True):
     import GPy
-    
+
     data = GPy.util.datasets.brendan_faces()
     Q = 2
     Y = data['Y']
@@ -315,7 +315,7 @@ def brendan_faces(optimize=True, verbose=True, plot=True):
 
 def olivetti_faces(optimize=True, verbose=True, plot=True):
     import GPy
-    
+
     data = GPy.util.datasets.olivetti_faces()
     Q = 2
     Y = data['Y']
@@ -350,7 +350,7 @@ def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=Tru
 def stick(kernel=None, optimize=True, verbose=True, plot=True):
     from matplotlib import pyplot as plt
     import GPy
-    
+
     data = GPy.util.datasets.osu_run1()
     # optimize
     m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
@@ -362,13 +362,13 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True):
         data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
         GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
         raw_input('Press enter to finish')
-        
+
     return m
 
 def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
     from matplotlib import pyplot as plt
     import GPy
-    
+
     data = GPy.util.datasets.osu_run1()
     # optimize
     mapping = GPy.mappings.Linear(data['Y'].shape[1], 2)
@@ -387,7 +387,7 @@ def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
 def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
     from matplotlib import pyplot as plt
     import GPy
-    
+
     data = GPy.util.datasets.osu_run1()
     # optimize
     back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.)
@@ -407,7 +407,7 @@ def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
 def robot_wireless(optimize=True, verbose=True, plot=True):
     from matplotlib import pyplot as plt
     import GPy
-    
+
     data = GPy.util.datasets.robot_wireless()
     # optimize
     m = GPy.models.GPLVM(data['Y'], 2)
@@ -422,7 +422,7 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
     from GPy.models import BayesianGPLVM
     from matplotlib import pyplot as plt
     import GPy
-    
+
     data = GPy.util.datasets.osu_run1()
     Q = 6
     kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2))
@@ -445,7 +445,7 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
 
 def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose=True, plot=True):
     import GPy
-    
+
     data = GPy.util.datasets.cmu_mocap(subject, motion)
     if in_place:
         # Make figure move in place.

From f26455f2b255e0f812248f37dc19ab911e80c18f Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Fri, 29 Nov 2013 15:45:18 +0000
Subject: [PATCH 247/252] Fixed examples tests, started changing datasets code
 which has a few bugs

---
 GPy/examples/dimensionality_reduction.py |  8 +++--
 GPy/testing/examples_tests.py            | 37 +++++++++++++++++-------
 GPy/util/datasets.py                     | 12 ++++----
 3 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 65881573..94bb4955 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -206,6 +206,7 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
 
     if plot_sim:
         import pylab
+        import matplotlib.cm as cm
         import itertools
         fig = pylab.figure("MRD Simulation Data", figsize=(8, 6))
         fig.clf()
@@ -216,7 +217,7 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
         ax.legend()
         for i, Y in enumerate(Ylist):
             ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i)
-            ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable
+            ax.imshow(Y, aspect='auto', cmap=cm.gray)
             ax.set_title("Y{}".format(i + 1))
         pylab.draw()
         pylab.tight_layout()
@@ -450,9 +451,12 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose
     if in_place:
         # Make figure move in place.
         data['Y'][:, 0:3] = 0.0
+
     m = GPy.models.GPLVM(data['Y'], 2, normalize_Y=True)
 
-    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if optimize:
+        m.optimize(messages=verbose, max_f_eval=10000)
+
     if plot:
         ax = m.plot_latent()
         y = m.likelihood.Y[0, :]
diff --git a/GPy/testing/examples_tests.py b/GPy/testing/examples_tests.py
index a525b1c9..9998590a 100644
--- a/GPy/testing/examples_tests.py
+++ b/GPy/testing/examples_tests.py
@@ -10,6 +10,7 @@ import os
 import random
 from nose.tools import nottest
 import sys
+import itertools
 
 class ExamplesTests(unittest.TestCase):
     def _checkgrad(self, Model):
@@ -39,8 +40,19 @@ def model_instance(model):
     #assert isinstance(model, GPy.core.model)
     return isinstance(model, GPy.core.model.Model)
 
-@nottest
+def flatten_nested(lst):
+    result = []
+    for element in lst:
+        if hasattr(element, '__iter__'):
+            result.extend(flatten_nested(element))
+        else:
+            result.append(element)
+    return result
+
+#@nottest
 def test_models():
+    optimize=False
+    plot=True
     examples_path = os.path.dirname(GPy.examples.__file__)
     # Load modules
     failing_models = {}
@@ -54,29 +66,34 @@ def test_models():
         print "After"
         print functions
         for example in functions:
-            if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100', 'brendan_faces']:
-                print "SKIPPING"
-                continue
+            #if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100', 'brendan_faces']:
+                #print "SKIPPING"
+                #continue
 
             print "Testing example: ", example[0]
             # Generate model
+
             try:
-                model = example[1]()
+                models = [ example[1](optimize=optimize, plot=plot) ]
+                #If more than one model returned, flatten them
+                models = flatten_nested(models)
             except Exception as e:
                 failing_models[example[0]] = "Cannot make model: \n{e}".format(e=e)
             else:
-                print model
+                print models
                 model_checkgrads.description = 'test_checkgrads_%s' % example[0]
                 try:
-                    if not model_checkgrads(model):
-                        failing_models[model_checkgrads.description] = False
+                    for model in models:
+                        if not model_checkgrads(model):
+                            failing_models[model_checkgrads.description] = False
                 except Exception as e:
                     failing_models[model_checkgrads.description] = e
 
                 model_instance.description = 'test_instance_%s' % example[0]
                 try:
-                    if not model_instance(model):
-                        failing_models[model_instance.description] = False
+                    for model in models:
+                        if not model_instance(model):
+                            failing_models[model_instance.description] = False
                 except Exception as e:
                     failing_models[model_instance.description] = e
 
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index 732e2a1b..c95998a7 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -435,7 +435,7 @@ def simulation_BGPLVM():
     Y = np.array(mat_data['Y'], dtype=float)
     S = np.array(mat_data['initS'], dtype=float)
     mu = np.array(mat_data['initMu'], dtype=float)
-    return data_details_return({'S': S, 'Y': Y, 'mu': mu}, data_set)
+    #return data_details_return({'S': S, 'Y': Y, 'mu': mu}, data_set)
     return {'Y': Y, 'S': S,
             'mu' : mu,
             'info': "Simulated test dataset generated in MATLAB to compare BGPLVM between python and MATLAB"}
@@ -594,11 +594,11 @@ def olympic_sprints(data_set='rogers_girolami_data'):
         'Y': Y,
         'info': "Olympics sprint event winning for men and women to 2008. Data is from Rogers and Girolami's First Course in Machine Learning.",
         'output_info': {
-          0:'100m Men', 
-          1:'100m Women', 
-          2:'200m Men', 
-          3:'200m Women', 
-          4:'400m Men', 
+          0:'100m Men',
+          1:'100m Women',
+          2:'200m Men',
+          3:'200m Women',
+          4:'400m Men',
           5:'400m Women'}
         }, data_set)
 

From 7c1c50cf559068225054d84ec4e9e837c8b846d2 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Fri, 29 Nov 2013 17:32:08 +0000
Subject: [PATCH 248/252] Fixed bugs in cmu_mocap loader where cmu_url was
 missing and loading in mocap data twice in same session led to incorrect url
 through copy error.

---
 GPy/util/data_resources.json |  2 +-
 GPy/util/datasets.py         | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json
index 2b36b0c1..d86d9088 100644
--- a/GPy/util/data_resources.json
+++ b/GPy/util/data_resources.json
@@ -102,7 +102,7 @@
       "citation":"Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.\nThe database was created with funding from NSF EIA-0196217.",
       "details":"CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.",
       "urls":[
-         "http://mocap.cs.cmu.edu"
+         "http://mocap.cs.cmu.edu/subjects"
       ],
       "size":null
    },
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index c95998a7..fdba0ac5 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -142,6 +142,8 @@ def cmu_urls_files(subj_motions, messages = True):
     '''
     Find which resources are missing on the local disk for the requested CMU motion capture motions.
     '''
+    dr = data_resources['cmu_mocap_full']
+    cmu_url = dr['urls'][0]
 
     subjects_num = subj_motions[0]
     motions_num = subj_motions[1]
@@ -187,7 +189,7 @@ def cmu_urls_files(subj_motions, messages = True):
                 url_required = True
                 file_download.append(subjects[i] + '_' + motions[i][j] + '.amc')
         if url_required:
-            resource['urls'].append(cmu_url + subjects[i] + '/')
+            resource['urls'].append(cmu_url + '/' + subjects[i] + '/')
             resource['files'].append(file_download)
     return resource
 
@@ -693,15 +695,15 @@ def creep_data(data_set='creep_rupture'):
     X = all_data[:, features].copy()
     return data_details_return({'X': X, 'y': y}, data_set)
 
-def cmu_mocap_49_balance():
+def cmu_mocap_49_balance(data_set='cmu_mocap'):
     """Load CMU subject 49's one legged balancing motion that was used by Alvarez, Luengo and Lawrence at AISTATS 2009."""
     train_motions = ['18', '19']
     test_motions = ['20']
-    data = cmu_mocap('49', train_motions, test_motions, sample_every=4)
+    data = cmu_mocap('49', train_motions, test_motions, sample_every=4, data_set=data_set)
     data['info'] = "One legged balancing motions from CMU data base subject 49. As used in Alvarez, Luengo and Lawrence at AISTATS 2009. It consists of " + data['info']
     return data
 
-def cmu_mocap_35_walk_jog():
+def cmu_mocap_35_walk_jog(data_set='cmu_mocap'):
     """Load CMU subject 35's walking and jogging motions, the same data that was used by Taylor, Roweis and Hinton at NIPS 2007. but without their preprocessing. Also used by Lawrence at AISTATS 2007."""
     train_motions = ['01', '02', '03', '04', '05', '06',
                 '07', '08', '09', '10', '11', '12',
@@ -709,7 +711,7 @@ def cmu_mocap_35_walk_jog():
                 '20', '21', '22', '23', '24', '25',
                 '26', '28', '30', '31', '32', '33', '34']
     test_motions = ['18', '29']
-    data = cmu_mocap('35', train_motions, test_motions, sample_every=4)
+    data = cmu_mocap('35', train_motions, test_motions, sample_every=4, data_set=data_set)
     data['info'] = "Walk and jog data from CMU data base subject 35. As used in Tayor, Roweis and Hinton at NIPS 2007, but without their pre-processing (i.e. as used by Lawrence at AISTATS 2007). It consists of " + data['info']
     return data
 
@@ -721,7 +723,7 @@ def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4, data_set=
     # Make sure the data is downloaded.
     all_motions = train_motions + test_motions
     resource = cmu_urls_files(([subject], [all_motions]))
-    data_resources[data_set] = data_resources['cmu_mocap_full']
+    data_resources[data_set] = data_resources['cmu_mocap_full'].copy()
     data_resources[data_set]['files'] = resource['files']
     data_resources[data_set]['urls'] = resource['urls']
     if resource['urls']:

From e349c12cf0dd830f2b46269d2bad988e8aae60c8 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Fri, 29 Nov 2013 18:39:14 +0000
Subject: [PATCH 249/252] Fixed some bugs in mocap.py where errors weren't
 being raised when file format was incorrect and made datasets.py check for
 404 errors which previously were occuring silently ... shhhhh

---
 GPy/util/datasets.py | 16 +++++++++++++---
 GPy/util/mocap.py    | 12 +++++++-----
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index fdba0ac5..b4a26636 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -3,7 +3,6 @@ import numpy as np
 import GPy
 import scipy.io
 import cPickle as pickle
-import urllib as url
 import zipfile
 import tarfile
 import datetime
@@ -15,7 +14,7 @@ except ImportError:
     ipython_available=False
 
 
-import sys, urllib
+import sys, urllib2
 
 def reporthook(a,b,c):
     # ',' at the end of the line is important!
@@ -82,7 +81,18 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix
     print "Downloading ", url, "->", os.path.join(store_directory, file)
     if not os.path.exists(dir_name):
         os.makedirs(dir_name)
-    urllib.urlretrieve(url+suffix, save_name, reporthook)
+    try:
+        response = urllib2.urlopen(url+suffix)
+    except urllib2.URLError, e:
+        if not hasattr(e, "code"):
+            raise
+        response = e
+        if response.code == 404:
+            raise ValueError('Url ' + url + suffix + ' 404 not found.')
+    with open(save_name, 'wb') as f:
+        f.write(response.read())
+    
+    #urllib.urlretrieve(url+suffix, save_name, reporthook)
 
 def authorize_download(dataset_name=None):
     """Check with the user that the are happy with terms and conditions for the data set."""
diff --git a/GPy/util/mocap.py b/GPy/util/mocap.py
index 78f00955..58662cf9 100644
--- a/GPy/util/mocap.py
+++ b/GPy/util/mocap.py
@@ -67,14 +67,14 @@ class tree:
         for i in range(len(self.vertices)):
             if self.vertices[i].id == id:
                 return i
-        raise Error, 'Reverse look up of id failed.'
+        raise ValueError('Reverse look up of id failed.')
 
     def get_index_by_name(self, name):
         """Give the index associated with a given vertex name."""
         for i in range(len(self.vertices)):
             if self.vertices[i].name == name:
                 return i
-        raise Error, 'Reverse look up of name failed.'
+        raise ValueError('Reverse look up of name failed.')
 
     def order_vertices(self):
         """Order vertices in the graph such that parents always have a lower index than children."""
@@ -433,6 +433,8 @@ class acclaim_skeleton(skeleton):
         lin = self.read_line(fid)
         while lin != ':DEGREES':
             lin = self.read_line(fid)
+            if lin == '':
+                raise ValueError('Could not find :DEGREES in ' + fid.name)
 
         counter = 0
         lin = self.read_line(fid)
@@ -443,9 +445,9 @@ class acclaim_skeleton(skeleton):
                 if frame_no:
                     counter += 1
                     if counter != frame_no:
-                        raise Error, 'Unexpected frame number.'
+                        raise ValueError('Unexpected frame number.')
                 else:
-                    raise Error, 'Single bone name  ...'
+                    raise ValueError('Single bone name  ...')
             else:
                 ind = self.get_index_by_name(parts[0])
                 bones[ind].append(np.array([float(channel) for channel in parts[1:]]))
@@ -573,7 +575,7 @@ class acclaim_skeleton(skeleton):
                         return
                     lin = self.read_line(fid)
             else:
-                raise Error, 'Unrecognised file format'
+                raise ValueError('Unrecognised file format')
             self.finalize()
             
     def read_units(self, fid):

From 4a751fd2da352bcb94d5040c6795277835ac1a58 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <lawrennd@gmail.com>
Date: Sat, 30 Nov 2013 11:02:42 +0000
Subject: [PATCH 250/252] Added some more error checking for downloading
 datasets.

---
 GPy/util/datasets.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index b4a26636..7fd1b6c5 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -87,8 +87,11 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix
         if not hasattr(e, "code"):
             raise
         response = e
-        if response.code == 404:
-            raise ValueError('Url ' + url + suffix + ' 404 not found.')
+        if response.code > 399 and response.code<500:
+            raise ValueError('Tried url ' + url + suffix + ' and received client error ' + str(response.code))
+        elif response.code > 499:
+            raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code))
+    # if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
     with open(save_name, 'wb') as f:
         f.write(response.read())
     

From cb36368d134be6560512873800a45f2787027c58 Mon Sep 17 00:00:00 2001
From: mu <m.niu@sheffield.ac.uk>
Date: Tue, 10 Dec 2013 12:38:34 +0000
Subject: [PATCH 251/252] dk dparameter

---
 GPy/kern/parts/ODE_1.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/GPy/kern/parts/ODE_1.py b/GPy/kern/parts/ODE_1.py
index 416278e3..8c5f123f 100644
--- a/GPy/kern/parts/ODE_1.py
+++ b/GPy/kern/parts/ODE_1.py
@@ -137,7 +137,11 @@ class ODE_1(Kernpart):
         k2 = (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2 
         k3 = np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
         dkdvar = k1+k2+k3
-
+        
+        #target[0] dk dvarU
+        #target[1] dk dvarY
+        #target[2] dk d theta1
+        #target[3] dk d theta2 
         target[0] += np.sum(self.varianceY*dkdvar * dL_dK)
         target[1] += np.sum(self.varianceU*dkdvar * dL_dK)
         target[2] += np.sum(dktheta1*(-np.sqrt(3)*self.lengthscaleU**(-2)) * dL_dK)

From bab477f149808d14faaf4127895af184feab5793 Mon Sep 17 00:00:00 2001
From: mu <m.niu@sheffield.ac.uk>
Date: Tue, 10 Dec 2013 17:07:37 +0000
Subject: [PATCH 252/252] ode UY

---
 GPy/kern/parts/ODE_UY.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/GPy/kern/parts/ODE_UY.py b/GPy/kern/parts/ODE_UY.py
index f6c5e9d9..bb736cc5 100644
--- a/GPy/kern/parts/ODE_UY.py
+++ b/GPy/kern/parts/ODE_UY.py
@@ -189,6 +189,13 @@ class ODE_UY(Kernpart):
         if X2 is None: X2 = X
         dist = np.abs(X - X2.T)
 
+        X,slices = X[:,:-1],index_to_slices(X[:,-1])
+        if X2 is None:
+            X2,slices2 = X,slices
+        else:
+            X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
+
+
         ly=1/self.lengthscaleY
         lu=np.sqrt(3)/self.lengthscaleU
         #ly=self.lengthscaleY
@@ -232,6 +239,25 @@ class ODE_UY(Kernpart):
         k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
         dkdvar = k1+k2+k3
 
+
+        for i, s1 in enumerate(slices):
+            for j, s2 in enumerate(slices2):
+                for ss1 in s1:
+                    for ss2 in s2:
+                        if i==0 and j==0:
+                            #target[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
+                        elif i==0 and j==1:
+                            #target[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) )   )
+                        elif i==1 and j==1:
+                            #target[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
+                        else:
+                            #target[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[s1[0],s2[0]]) )   )
+
+
+
+
+
+
         target[0] += np.sum(self.varianceY*dkdvar * dL_dK)
         target[1] += np.sum(self.varianceU*dkdvar * dL_dK)
         target[2] += np.sum(dktheta1*(-np.sqrt(3)*self.lengthscaleU**(-2)) * dL_dK)