From 1dd83291fef489e2c44d6ccb0d4a1ba8a6776bc6 Mon Sep 17 00:00:00 2001
From: Alan Saul <alan.daniel.saul@gmail.com>
Date: Wed, 11 Sep 2013 11:54:15 +0100
Subject: [PATCH] Renamed some things, made some small (incorrect) gradient
 changes, generalised the gp regression for any likelihood, and added a place
 holder link function waiting for Richardos changes

---
 GPy/examples/laplace_approximations.py     | 75 +++++++++++-----------
 GPy/likelihoods/__init__.py                |  1 +
 GPy/likelihoods/{Laplace.py => laplace.py} |  0
 GPy/likelihoods/likelihood_functions.py    | 32 +++++----
 GPy/likelihoods/link_functions.py          | 13 ++++
 GPy/models/gp_regression.py                |  7 +-
 GPy/util/linalg.py                         |  8 +++
 7 files changed, 83 insertions(+), 53 deletions(-)
 rename GPy/likelihoods/{Laplace.py => laplace.py} (100%)

diff --git a/GPy/examples/laplace_approximations.py b/GPy/examples/laplace_approximations.py
index b6443664..c0bc3aef 100644
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@@ -25,9 +25,9 @@ def timing():
         edited_real_sd = real_sd
         kernel1 = GPy.kern.rbf(X.shape[1])
 
-        t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+        t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
         corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
-        m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel1)
+        m = GPy.models.GPRegression(X, corrupt_stu_t_likelihood, kernel1)
         m.ensure_default_constraints()
         m.update_likelihood_approximation()
         m.optimize()
@@ -54,9 +54,9 @@ def v_fail_test():
     edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GP(X, stu_t_likelihood, kernel1)
+    m = GPy.models.GPRegression(X, stu_t_likelihood, kernel1)
     m.constrain_positive('')
     vs = 25
     noises = 30
@@ -94,16 +94,16 @@ def student_t_obj_plane():
     deg_free = 1000
 
     kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
     mgp.ensure_default_constraints()
     mgp['noise'] = real_std**2
     print "Gaussian"
     print mgp
 
     kernelst = kernelgp.copy()
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=(real_std**2))
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=(real_std**2))
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m = GPy.models.GPRegression(X, stu_t_likelihood, kernelst)
     m.ensure_default_constraints()
     m.constrain_fixed('t_no', real_std**2)
     vs = 10
@@ -144,7 +144,7 @@ def student_t_f_check():
     deg_free = 1000
 
     kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
     mgp.ensure_default_constraints()
     mgp.randomize()
     mgp.optimize()
@@ -154,9 +154,9 @@ def student_t_f_check():
 
     kernelst = kernelgp.copy()
     #kernelst += GPy.kern.bias(X.shape[1])
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=0.05)
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=0.05)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m = GPy.models.GPRegression(X, stu_t_likelihood, kernelst)
     #m['rbf_v'] = mgp._get_params()[0]
     #m['rbf_l'] = mgp._get_params()[1] + 1
     m.ensure_default_constraints()
@@ -198,7 +198,7 @@ def student_t_fix_optimise_check():
 
     #GP
     kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
     mgp.ensure_default_constraints()
     mgp.randomize()
     mgp.optimize()
@@ -206,12 +206,12 @@ def student_t_fix_optimise_check():
     kernelst = kernelgp.copy()
     real_stu_t_std2 = (real_std**2)*((deg_free - 2)/float(deg_free))
 
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=real_stu_t_std2)
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=real_stu_t_std2)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
 
     plt.figure(1)
     plt.suptitle('Student likelihood')
-    m = GPy.models.GP(X, stu_t_likelihood, kernelst)
+    m = GPy.models.GPRegression(X, stu_t_likelihood, kernelst)
     m.constrain_fixed('rbf_var', mgp._get_params()[0])
     m.constrain_fixed('rbf_len', mgp._get_params()[1])
     m.constrain_positive('t_noise')
@@ -331,7 +331,7 @@ def debug_student_t_noise_approx():
     print "Clean Gaussian"
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
-    #m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    #m = GPy.models.GPRegression(X, Y, kernel=kernel1)
     ## optimize
     #m.ensure_default_constraints()
     #m.optimize()
@@ -349,10 +349,10 @@ def debug_student_t_noise_approx():
     #edited_real_sd = real_sd
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
 
-    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m = GPy.models.GPRegression(X, stu_t_likelihood, kernel6)
     #m['rbf_len'] = 1.5
     #m.constrain_fixed('rbf_v', 1.0898)
     #m.constrain_fixed('rbf_l', 0.2651)
@@ -384,9 +384,9 @@ def debug_student_t_noise_approx():
     return m
 
     #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    #t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
-    #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    #m = GPy.models.GPRegression(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
     #m.optimize()
@@ -453,7 +453,7 @@ def student_t_approx():
     print "Clean Gaussian"
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
-    m = GPy.models.GP_regression(X, Y, kernel=kernel1)
+    m = GPy.models.GPRegression(X, Y, kernel=kernel1)
     # optimize
     m.ensure_default_constraints()
     m.optimize()
@@ -466,7 +466,7 @@ def student_t_approx():
 
     #Corrupt
     print "Corrupt Gaussian"
-    m = GPy.models.GP_regression(X, Yc, kernel=kernel2)
+    m = GPy.models.GPRegression(X, Yc, kernel=kernel2)
     m.ensure_default_constraints()
     #m.optimize()
     plt.subplot(212)
@@ -480,9 +480,9 @@ def student_t_approx():
     edited_real_sd = real_std #initial_var_guess
 
     print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GP(X, stu_t_likelihood, kernel6)
+    m = GPy.models.GPRegression(X, Y.copy(), kernel6, stu_t_likelihood)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
     m.randomize()
@@ -496,9 +496,9 @@ def student_t_approx():
     plt.title('Student-t rasm clean')
 
     print "Corrupt student t, rasm"
-    t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='rasm')
-    m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel4)
+    m = GPy.models.GPRegression(X, Yc.copy(), kernel4, corrupt_stu_t_likelihood)
     m.ensure_default_constraints()
     m.constrain_positive('t_noise')
     m.randomize()
@@ -514,9 +514,9 @@ def student_t_approx():
     return m
 
     #print "Clean student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    #t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     #stu_t_likelihood = GPy.likelihoods.Laplace(Y, t_distribution, opt='ncg')
-    #m = GPy.models.GP(X, stu_t_likelihood, kernel3)
+    #m = GPy.models.GPRegression(X, stu_t_likelihood, kernel3)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
     #m.optimize()
@@ -528,9 +528,9 @@ def student_t_approx():
     #plt.title('Student-t ncg clean')
 
     #print "Corrupt student t, ncg"
-    #t_distribution = GPy.likelihoods.likelihood_functions.Student_t(deg_free, sigma2=edited_real_sd)
+    #t_distribution = GPy.likelihoods.functions.StudentT(deg_free, sigma2=edited_real_sd)
     #corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution, opt='ncg')
-    #m = GPy.models.GP(X, corrupt_stu_t_likelihood, kernel5)
+    #m = GPy.models.GPRegression(X, corrupt_stu_t_likelihood, kernel5)
     #m.ensure_default_constraints()
     #m.update_likelihood_approximation()
     #m.optimize()
@@ -582,7 +582,7 @@ def noisy_laplace_approx():
 
     #A GP should completely break down due to the points as they get a lot of weight
     # create simple GP model
-    m = GPy.models.GP_regression(X, Y)
+    m = GPy.models.GPRegression(X, Y)
 
     # optimize
     m.ensure_default_constraints()
@@ -601,7 +601,7 @@ def gaussian_f_check():
     Y = np.sin(X*2*np.pi) + noise
 
     kernelgp = GPy.kern.rbf(X.shape[1]) # + GPy.kern.white(X.shape[1])
-    mgp = GPy.models.GP_regression(X, Y, kernel=kernelgp)
+    mgp = GPy.models.GPRegression(X, Y, kernel=kernelgp)
     mgp.ensure_default_constraints()
     mgp.randomize()
     mgp.optimize()
@@ -612,9 +612,9 @@ def gaussian_f_check():
     kernelg = kernelgp.copy()
     #kernelst += GPy.kern.bias(X.shape[1])
     N, D = X.shape
-    g_distribution = GPy.likelihoods.likelihood_functions.Gaussian(variance=0.1, N=N, D=D)
+    g_distribution = GPy.likelihoods.functions.Gaussian(variance=0.1, N=N, D=D)
     g_likelihood = GPy.likelihoods.Laplace(Y.copy(), g_distribution, opt='rasm')
-    m = GPy.models.GP(X, g_likelihood, kernelg)
+    m = GPy.models.GPRegression(X, Y, kernelg, likelihood=g_likelihood)
     #m['rbf_v'] = mgp._get_params()[0]
     #m['rbf_l'] = mgp._get_params()[1] + 1
     m.ensure_default_constraints()
@@ -624,14 +624,15 @@ def gaussian_f_check():
     #m.constrain_positive('bias')
     m.constrain_positive('noise_var')
     m.randomize()
+    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
     m['noise_variance'] = 0.1
-    m.likelihood.X = X
+    #m.likelihood.X = X
     plt.figure()
-    plt.subplot(211)
-    m.plot()
-    plt.subplot(212)
+    ax = plt.subplot(211)
+    m.plot(ax=ax)
+    ax = plt.subplot(212)
     m.optimize()
-    m.plot()
+    m.plot(ax=ax)
     print "final optimised gaussian"
     print m
     print "real GP"
diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py
index 99e88b6d..5d4e31f7 100644
--- a/GPy/likelihoods/__init__.py
+++ b/GPy/likelihoods/__init__.py
@@ -1,4 +1,5 @@
 from ep import EP
+from laplace import Laplace
 from gaussian import Gaussian
 # TODO: from Laplace import Laplace
 import likelihood_functions as functions
diff --git a/GPy/likelihoods/Laplace.py b/GPy/likelihoods/laplace.py
similarity index 100%
rename from GPy/likelihoods/Laplace.py
rename to GPy/likelihoods/laplace.py
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 5d270b2b..06735a9c 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -167,7 +167,7 @@ class Poisson(LikelihoodFunction):
         p_975 = tmp[:,1]
         return mean,np.nan*mean,p_025,p_975 # better variance here TODO
 
-class Student_t(LikelihoodFunction):
+class StudentT(LikelihoodFunction):
     """Student t likelihood distribution
     For nomanclature see Bayesian Data Analysis 2003 p576
 
@@ -180,7 +180,11 @@ class Student_t(LikelihoodFunction):
     d2ln p(yi|fi)_d2fifj
     """
     def __init__(self, deg_free=5, sigma2=2, link=None):
-        super(Student_t, self).__init__(link)
+        self._analytical = None
+        if not link:
+            link = link_functions.Nothing()
+
+        super(StudentT, self).__init__(link)
         self.v = deg_free
         self.sigma2 = sigma2
 
@@ -413,6 +417,10 @@ class Gaussian(LikelihoodFunction):
     Gaussian likelihood - this is a test class for approximation schemes
     """
     def __init__(self, variance, D, N, link=None):
+        self._analytical = None
+        if not link:
+            link = link_functions.Nothing()
+
         super(Gaussian, self).__init__(link)
         self.D = D
         self.N = N
@@ -454,7 +462,7 @@ class Gaussian(LikelihoodFunction):
                      #- 0.5*np.sum(np.multiply(self.Ki, eeT))
                      - 0.5*np.dot(np.dot(e.T, self.Ki), e)
                      )
-        return np.sum(objective)
+        return np.sum(objective) # FIXME: put this back!
 
     def dlik_df(self, y, f, extra_data=None):
         """
@@ -468,7 +476,7 @@ class Gaussian(LikelihoodFunction):
         """
         assert y.shape == f.shape
         s2_i = (1.0/self._variance)*self.I
-        grad = np.dot(s2_i, y) - 0.5*np.dot(s2_i, f)
+        grad = np.dot(s2_i, y) - np.dot(s2_i, f)
         return grad
 
     def d2lik_d2f(self, y, f, extra_data=None):
@@ -486,7 +494,7 @@ class Gaussian(LikelihoodFunction):
         """
         assert y.shape == f.shape
         s2_i = (1.0/self._variance)*self.I
-        hess = 0.5*np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
+        hess = np.diag(-s2_i)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
         return hess
 
     def d3lik_d3f(self, y, f, extra_data=None):
@@ -499,17 +507,17 @@ class Gaussian(LikelihoodFunction):
         d3lik_d3f = np.diagonal(0*self.I)[:, None] # FIXME: CAREFUL THIS MAY NOT WORK WITH MULTIDIMENSIONS?
         return d3lik_d3f
 
-    def lik_dstd(self, y, f, extra_data=None):
+    def lik_dvar(self, y, f, extra_data=None):
         """
         Gradient of the likelihood (lik) w.r.t sigma parameter (standard deviation)
         """
         assert y.shape == f.shape
         e = y - f
         s_4 = 1.0/(self._variance**2)
-        dlik_dsigma = -0.5*self.N*1/self._variance + 0.5*s_4*np.trace(np.dot(e.T, np.dot(self.I, e)))
+        dlik_dsigma = -0.5*self.N/self._variance + 0.5*s_4*np.trace(np.dot(e.T, np.dot(self.I, e)))
         return dlik_dsigma
 
-    def dlik_df_dstd(self, y, f, extra_data=None):
+    def dlik_df_dvar(self, y, f, extra_data=None):
         """
         Gradient of the dlik_df w.r.t sigma parameter (standard deviation)
         """
@@ -518,7 +526,7 @@ class Gaussian(LikelihoodFunction):
         dlik_grad_dsigma = -np.dot(s_4, np.dot(self.I, y)) + 0.5*np.dot(s_4, np.dot(self.I, f))
         return dlik_grad_dsigma
 
-    def d2lik_d2f_dstd(self, y, f, extra_data=None):
+    def d2lik_d2f_dvar(self, y, f, extra_data=None):
         """
         Gradient of the hessian (d2lik_d2f) w.r.t sigma parameter (standard deviation)
 
@@ -530,9 +538,9 @@ class Gaussian(LikelihoodFunction):
 
     def _gradients(self, y, f, extra_data=None):
         #must be listed in same order as 'get_param_names'
-        derivs = ([self.lik_dstd(y, f, extra_data=extra_data)],
-                  [self.dlik_df_dstd(y, f, extra_data=extra_data)],
-                  [self.d2lik_d2f_dstd(y, f, extra_data=extra_data)]
+        derivs = ([self.lik_dvar(y, f, extra_data=extra_data)],
+                  [self.dlik_df_dvar(y, f, extra_data=extra_data)],
+                  [self.d2lik_d2f_dvar(y, f, extra_data=extra_data)]
                  ) # lists as we might learn many parameters
         # ensure we have gradients for every parameter we want to optimize
         assert len(derivs[0]) == len(self._get_param_names())
diff --git a/GPy/likelihoods/link_functions.py b/GPy/likelihoods/link_functions.py
index 3b9a55b2..826983a9 100644
--- a/GPy/likelihoods/link_functions.py
+++ b/GPy/likelihoods/link_functions.py
@@ -31,3 +31,16 @@ class Probit(LinkFunction):
 
     def log_inv_transf(self,f):
         pass
+
+class Nothing(LinkFunction):
+    """
+    Probit link function: Squashes a likelihood between 0 and 1
+    """
+    def transf(self,mu):
+        return mu
+
+    def inv_transf(self,f):
+        return f
+
+    def log_inv_transf(self,f):
+        return np.log(f)
diff --git a/GPy/models/gp_regression.py b/GPy/models/gp_regression.py
index 86e1f7de..633fc1c8 100644
--- a/GPy/models/gp_regression.py
+++ b/GPy/models/gp_regression.py
@@ -25,11 +25,12 @@ class GPRegression(GP):
 
     """
 
-    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False):
+    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, likelihood=None):
         if kernel is None:
             kernel = kern.rbf(X.shape[1])
 
-        likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
+        if likelihood is None:
+            likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
 
         GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
         self.ensure_default_constraints()
@@ -39,5 +40,3 @@ class GPRegression(GP):
 
     def setstate(self, state):
         return GP.setstate(self, state)
-
-    pass
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 19cf6545..8331933d 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -55,6 +55,14 @@ def dpotri(A, lower=0):
     """
     return lapack.dpotri(A, lower=lower)
 
+def pddet(A):
+    """
+    Determinant of a positive definite matrix, only symmetric matricies though
+    """
+    L = jitchol(A)
+    logdetA = 2*sum(np.log(np.diag(L)))
+    return logdetA
+
 def trace_dot(a, b):
     """
     efficiently compute the trace of the matrix product of a and b