From d84ae52d415abfa92f8f78b2b9954ef4e331edc6 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 4 Jun 2013 16:21:00 +0100
Subject: [PATCH 1/4] fixed Alan's dependency nightmare.

---
 GPy/__init__.py            | 5 +++--
 GPy/core/gp_base.py        | 2 +-
 GPy/models/sparse_GPLVM.py | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/GPy/__init__.py b/GPy/__init__.py
index fa69dac3..3b656626 100644
--- a/GPy/__init__.py
+++ b/GPy/__init__.py
@@ -2,16 +2,17 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
-import kern
+import core
 import models
 import inference
 import util
 import examples
-from core import priors
 import likelihoods
 import testing
 from numpy.testing import Tester
 from nose.tools import nottest
+import kern
+from core import priors
 
 @nottest
 def tests():
diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index 8df1a5c1..3eb977c6 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -4,7 +4,7 @@ from .. import kern
 from ..util.plot import gpplot, Tango, x_frame1D, x_frame2D
 import pylab as pb
 
-class GPBase(model):
+class GPBase(model.model):
     """
     Gaussian Process model for holding shared behaviour between
     sprase_GP and GP models
diff --git a/GPy/models/sparse_GPLVM.py b/GPy/models/sparse_GPLVM.py
index af388f1c..591c49b2 100644
--- a/GPy/models/sparse_GPLVM.py
+++ b/GPy/models/sparse_GPLVM.py
@@ -8,7 +8,7 @@ import sys, pdb
 # from .. import kern
 # from ..core import model
 # from ..util.linalg import pdinv, PCA
-from ..core import GPLVM
+from GPLVM import GPLVM
 from sparse_GP_regression import sparse_GP_regression
 
 class sparse_GPLVM(sparse_GP_regression, GPLVM):

From 775ec15543dc1105c3b19cb7b92b19b376ba3093 Mon Sep 17 00:00:00 2001
From: Ricardo <acq11ra@sheffield.ac.uk>
Date: Tue, 4 Jun 2013 16:23:04 +0100
Subject: [PATCH 2/4] Link functions defined

---
 GPy/examples/classification.py          |   6 +-
 GPy/likelihoods/EP.py                   |   1 +
 GPy/likelihoods/likelihood_functions.py | 155 ++++++++++++++----------
 GPy/likelihoods/link_functions.py       |  58 +++++++++
 4 files changed, 156 insertions(+), 64 deletions(-)
 create mode 100644 GPy/likelihoods/link_functions.py

diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index 9168db7c..a96911f4 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -79,12 +79,16 @@ def toy_linear_1d_classification(seed=default_seed):
 
     data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
     Y = data['Y'][:, 0:1]
+    Y[Y.flatten() == -1] = 0
 
     # Kernel object
     kernel = GPy.kern.rbf(1)
 
     # Likelihood object
-    distribution = GPy.likelihoods.likelihood_functions.probit()
+    link = GPy.likelihoods.link_functions.probit
+    distribution = GPy.likelihoods.likelihood_functions.binomial(link)
+    #distribution = GPy.likelihoods.likelihood_functions.binomial()
+    #distribution = GPy.likelihoods.likelihood_functions.probit()
     likelihood = GPy.likelihoods.EP(Y, distribution)
 
     # Model definition
diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py
index efca0649..5b538b92 100644
--- a/GPy/likelihoods/EP.py
+++ b/GPy/likelihoods/EP.py
@@ -20,6 +20,7 @@ class EP(likelihood):
         self.N, self.D = self.data.shape
         self.is_heteroscedastic = True
         self.Nparams = 0
+        self._transf_data = self.likelihood_function._preprocess_values(data)
 
         #Initial values - Likelihood approximation parameters:
         #p(y|f) = t(f|tau_tilde,v_tilde)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 66c663dc..00100d17 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -8,19 +8,68 @@ import scipy as sp
 import pylab as pb
 from ..util.plot import gpplot
 from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
+import link_functions
 
-class likelihood_function:
+class likelihood_function(object):
     """
     Likelihood class for doing Expectation propagation
 
     :param Y: observed output (Nx1 numpy.darray)
     ..Note:: Y values allowed depend on the likelihood_function used
     """
-    def __init__(self,location=0,scale=1):
-        self.location = location
-        self.scale = scale
+    def __init__(self,link):
+        if link == self._analytical:
+            self.moments_match = self._moments_match_analytical
+        else:
+            assert isinstance(link,link_functions.link_function)
+            self.link = link
+            self.moments_match = self._moments_match_numerical
 
-class probit(likelihood_function):
+    def _preprocess_values(self,Y):
+        return Y
+
+    def _product(self,gp,obs,mu,sigma):
+        return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._distribution(gp,obs)
+
+    def _nlog_product(self,gp,obs,mu,sigma):
+        return -(-.5*(gp-mu)**2/sigma**2 + self._log_distribution(gp,obs))
+
+    def _locate(self,obs,mu,sigma):
+        """
+        Golden Search to find the mode in the _product function (cavity x exact likelihood) and define a grid around it for numerical integration
+        """
+        golden_A = -1 if obs == 0 else np.array([np.log(obs),mu]).min() #Lower limit
+        golden_B = np.array([np.log(obs),mu]).max() #Upper limit
+        return sp.optimize.golden(self._nlog_product, args=(obs,mu,sigma), brack=(golden_A,golden_B)) #Better to work with _nlog_product than with _product
+
+    def _moments_match_numerical(self,obs,tau,v):
+        """
+        Simpson's Rule is used to calculate the moments mumerically, it needs a grid of points as input.
+        """
+        mu = v/tau
+        sigma = np.sqrt(1./tau)
+        opt = self._locate(obs,mu,sigma)
+        width = 3./np.log(max(obs,2))
+        A = opt - width #Grid's lower limit
+        B = opt + width #Grid's Upper limit
+        K =  10*int(np.log(max(obs,150))) #Number of points in the grid
+        h = (B-A)/K # length of the intervals
+        grid_x = np.hstack([np.linspace(opt-width,opt,K/2+1)[1:-1], np.linspace(opt,opt+width,K/2+1)]) # grid of points (X axis)
+        x = np.hstack([A,B,grid_x[range(1,K,2)],grid_x[range(2,K-1,2)]]) # grid_x rearranged, just to make Simpson's algorithm easier
+        _aux1 = self._product(A,obs,mu,sigma)
+        _aux2 = self._product(B,obs,mu,sigma)
+        _aux3 = 4*self._product(grid_x[range(1,K,2)],obs,mu,sigma)
+        _aux4 = 2*self._product(grid_x[range(2,K-1,2)],obs,mu,sigma)
+        zeroth = np.hstack((_aux1,_aux2,_aux3,_aux4)) # grid of points (Y axis) rearranged
+        first = zeroth*x
+        second = first*x
+        Z_hat = sum(zeroth)*h/3 # Zero-th moment
+        mu_hat = sum(first)*h/(3*Z_hat) # First moment
+        m2 = sum(second)*h/(3*Z_hat) # Second moment
+        sigma2_hat = m2 - mu_hat**2 # Second central moment
+        return float(Z_hat), float(mu_hat), float(sigma2_hat)
+
+class binomial(likelihood_function):
     """
     Probit likelihood
     Y is expected to take values in {-1,1}
@@ -29,8 +78,33 @@ class probit(likelihood_function):
     L(x) = \\Phi (Y_i*f_i)
     $$
     """
+    def __init__(self,link=None):
+        self._analytical = link_functions.probit
+        if not link:
+            link = self._analytical
+        super(binomial, self).__init__(link)
 
-    def moments_match(self,data_i,tau_i,v_i):
+    def _distribution(self,gp,obs):
+        pass
+
+    def _log_distribution(self,gp,obs):
+        pass
+
+    def _preprocess_values(self,Y):
+        """
+        Check if the values of the observations correspond to the values
+        assumed by the likelihood function.
+
+        ..Note:: Binary classification algorithm works better with classes {-1,1}
+        """
+        Y_prep = Y.copy()
+        Y1 = Y[Y.flatten()==1].size
+        Y2 = Y[Y.flatten()==0].size
+        assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.'
+        Y_prep[Y.flatten() == 0] = -1
+        return Y_prep
+
+    def _moments_match_analytical(self,data_i,tau_i,v_i):
         """
         Moments match of the marginal approximation in EP algorithm
 
@@ -38,8 +112,6 @@ class probit(likelihood_function):
         :param tau_i: precision of the cavity distribution (float)
         :param v_i: mean/variance of the cavity distribution (float)
         """
-        #if data_i == 0: data_i = -1 #NOTE Binary classification algorithm works better with classes {-1,1}, 1D-plotting works better with classes {0,1}.
-        # TODO: some version of assert
         z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
         Z_hat = std_norm_cdf(z)
         phi = std_norm_pdf(z)
@@ -50,6 +122,8 @@ class probit(likelihood_function):
     def predictive_values(self,mu,var):
         """
         Compute  mean, variance and conficence interval (percentiles 5 and 95) of the  prediction
+        :param mu: mean of the latent variable
+        :param var: variance of the latent variable
         """
         mu = mu.flatten()
         var = var.flatten()
@@ -69,68 +143,23 @@ class Poisson(likelihood_function):
     L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
     $$
     """
-    def moments_match(self,data_i,tau_i,v_i):
-        """
-        Moments match of the marginal approximation in EP algorithm
+    def __init__(self,link=None):
+        self._analytical = None
+        if not link:
+            link = link_functions.log()
+        super(Poisson, self).__init__(link)
 
-        :param i: number of observation (int)
-        :param tau_i: precision of the cavity distribution (float)
-        :param v_i: mean/variance of the cavity distribution (float)
-        """
-        mu = v_i/tau_i
-        sigma = np.sqrt(1./tau_i)
-        def poisson_norm(f):
-            """
-            Product of the likelihood and the cavity distribution
-            """
-            pdf_norm_f = stats.norm.pdf(f,loc=mu,scale=sigma)
-            rate = np.exp( (f*self.scale)+self.location)
-            poisson = stats.poisson.pmf(float(data_i),rate)
-            return pdf_norm_f*poisson
+    def _distribution(self,gp,obs):
+        return stats.poisson.pmf(obs,self.link.inv_transf(gp))
 
-        def log_pnm(f):
-            """
-            Log of poisson_norm
-            """
-            return -(-.5*(f-mu)**2/sigma**2 - np.exp( (f*self.scale)+self.location) + ( (f*self.scale)+self.location)*data_i)
-
-        """
-        Golden Search and Simpson's Rule
-        --------------------------------
-        Simpson's Rule is used to calculate the moments mumerically, it needs a grid of points as input.
-        Golden Search is used to find the mode in the poisson_norm distribution and define around it the grid for Simpson's Rule
-        """
-        #TODO golden search & simpson's rule can be defined in the general likelihood class, rather than in each specific case.
-
-        #Golden search
-        golden_A = -1 if data_i == 0 else np.array([np.log(data_i),mu]).min() #Lower limit
-        golden_B = np.array([np.log(data_i),mu]).max() #Upper limit
-        golden_A = (golden_A - self.location)/self.scale
-        golden_B = (golden_B - self.location)/self.scale
-        opt = sp.optimize.golden(log_pnm,brack=(golden_A,golden_B)) #Better to work with log_pnm than with poisson_norm
-
-        # Simpson's approximation
-        width = 3./np.log(max(data_i,2))
-        A = opt - width #Lower limit
-        B = opt + width #Upper limit
-        K =  10*int(np.log(max(data_i,150))) #Number of points in the grid, we DON'T want K to be the same number for every case
-        h = (B-A)/K # length of the intervals
-        grid_x = np.hstack([np.linspace(opt-width,opt,K/2+1)[1:-1], np.linspace(opt,opt+width,K/2+1)]) # grid of points (X axis)
-        x = np.hstack([A,B,grid_x[range(1,K,2)],grid_x[range(2,K-1,2)]]) # grid_x rearranged, just to make Simpson's algorithm easier
-        zeroth = np.hstack([poisson_norm(A),poisson_norm(B),[4*poisson_norm(f) for f in grid_x[range(1,K,2)]],[2*poisson_norm(f) for f in grid_x[range(2,K-1,2)]]]) # grid of points (Y axis) rearranged like x
-        first = zeroth*x
-        second = first*x
-        Z_hat = sum(zeroth)*h/3 # Zero-th moment
-        mu_hat = sum(first)*h/(3*Z_hat) # First moment
-        m2 = sum(second)*h/(3*Z_hat) # Second moment
-        sigma2_hat = m2 - mu_hat**2 # Second central moment
-        return float(Z_hat), float(mu_hat), float(sigma2_hat)
+    def _log_distribution(self,gp,obs):
+        return - self.link.inv_transf(gp) + obs * self.link.log_inv_transf(gp)
 
     def predictive_values(self,mu,var):
         """
         Compute  mean, and conficence interval (percentiles 5 and 95) of the  prediction
         """
-        mean = np.exp(mu*self.scale + self.location)
+        mean = self.link.transf(mu)#np.exp(mu*self.scale + self.location)
         tmp = stats.poisson.ppf(np.array([.025,.975]),mean)
         p_025 = tmp[:,0]
         p_975 = tmp[:,1]
diff --git a/GPy/likelihoods/link_functions.py b/GPy/likelihoods/link_functions.py
new file mode 100644
index 00000000..28beac71
--- /dev/null
+++ b/GPy/likelihoods/link_functions.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2012, 2013 Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from scipy import stats
+import scipy as sp
+import pylab as pb
+from ..util.plot import gpplot
+from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
+
+class link_function(object):
+    """
+    Link function class for doing non-Gaussian likelihoods approximation
+
+    :param Y: observed output (Nx1 numpy.darray)
+    ..Note:: Y values allowed depend on the likelihood_function used
+    """
+    def __init__(self):
+        pass
+
+
+
+class identity(link_function):
+    def transf(self,mu):
+        return mu
+
+    def inv_transf(self,f):
+        return f
+
+    def log_inv_transf(self,f):
+        return np.log(f)
+
+class log(link_function):
+
+    def transf(self,mu):
+        return np.log(mu)
+
+    def inv_transf(self,f):
+        return np.exp(f)
+
+    def log_inv_transf(self,f):
+        return f
+
+class log_ex_1(link_function):
+    def transf(self,mu):
+        return np.log(np.exp(mu) - 1)
+
+    def inv_transf(self,f):
+        return np.log(np.exp(f)+1)
+
+    def log_inv_tranf(self,f):
+        return np.log(np.log(np.exp(f)+1))
+
+class probit(link_function):
+    pass
+
+

From 362389781b87a067c6cd2350b95f09f689c06d98 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 4 Jun 2013 16:28:20 +0100
Subject: [PATCH 3/4] fixing some examples

---
 GPy/core/gp_base.py         | 3 +++
 GPy/models/GP_regression.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index 3eb977c6..29234c4d 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -24,6 +24,9 @@ class GPBase(model.model):
             self._Xmean = X.mean(0)[None, :]
             self._Xstd = X.std(0)[None, :]
             self.X = (X.copy() - self._Xmean) / self._Xstd
+        else:
+            self._Xmean = np.zeros((1,self.Q))
+            self._Xstd = np.ones((1,self.Q))
 
         super(GPBase, self).__init__()
 
diff --git a/GPy/models/GP_regression.py b/GPy/models/GP_regression.py
index 2979456b..e5acb4f7 100644
--- a/GPy/models/GP_regression.py
+++ b/GPy/models/GP_regression.py
@@ -31,5 +31,5 @@ class GP_regression(GP):
 
         likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y)
 
-        super(GP_regression, self).__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
+        super(GP_regression, self).__init__(X, likelihood, kernel, normalize_X=normalize_X)
         self._set_params(self._get_params())

From 78aedd84af66d5ed6909e5f58bd4200903e307a3 Mon Sep 17 00:00:00 2001
From: Ricardo <acq11ra@sheffield.ac.uk>
Date: Tue, 4 Jun 2013 16:32:12 +0100
Subject: [PATCH 4/4] Examples changed to use new link_functions

---
 GPy/examples/classification.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index a96911f4..f3adebaa 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -21,13 +21,15 @@ def crescent_data(seed=default_seed): # FIXME
     """
 
     data = GPy.util.datasets.crescent_data(seed=seed)
+    Y = data['Y']
+    Y[Y.flatten()==-1] = 0
 
     # Kernel object
     kernel = GPy.kern.rbf(data['X'].shape[1])
 
     # Likelihood object
-    distribution = GPy.likelihoods.likelihood_functions.probit()
-    likelihood = GPy.likelihoods.EP(data['Y'], distribution)
+    distribution = GPy.likelihoods.likelihood_functions.binomial()
+    likelihood = GPy.likelihoods.EP(Y, distribution)
 
 
     m = GPy.models.GP(data['X'], likelihood, kernel)
@@ -49,12 +51,15 @@ def oil():
     Run a Gaussian process classification on the oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
     """
     data = GPy.util.datasets.oil()
+    Y = data['Y'][:, 0:1]
+    Y[Y.flatten()==-1] = 0
+
     # Kernel object
     kernel = GPy.kern.rbf(12)
 
     # Likelihood object
-    distribution = GPy.likelihoods.likelihood_functions.probit()
-    likelihood = GPy.likelihoods.EP(data['Y'][:, 0:1], distribution)
+    distribution = GPy.likelihoods.likelihood_functions.binomial()
+    likelihood = GPy.likelihoods.EP(Y, distribution)
 
     # Create GP model
     m = GPy.models.GP(data['X'], likelihood=likelihood, kernel=kernel)
@@ -87,8 +92,6 @@ def toy_linear_1d_classification(seed=default_seed):
     # Likelihood object
     link = GPy.likelihoods.link_functions.probit
     distribution = GPy.likelihoods.likelihood_functions.binomial(link)
-    #distribution = GPy.likelihoods.likelihood_functions.binomial()
-    #distribution = GPy.likelihoods.likelihood_functions.probit()
     likelihood = GPy.likelihoods.EP(Y, distribution)
 
     # Model definition
@@ -119,12 +122,13 @@ def sparse_toy_linear_1d_classification(seed=default_seed):
 
     data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
     Y = data['Y'][:, 0:1]
+    Y[Y.flatten() == -1] = 0
 
     # Kernel object
     kernel = GPy.kern.rbf(1) + GPy.kern.white(1)
 
     # Likelihood object
-    distribution = GPy.likelihoods.likelihood_functions.probit()
+    distribution = GPy.likelihoods.likelihood_functions.binomial()
     likelihood = GPy.likelihoods.EP(Y, distribution)
 
     Z = np.random.uniform(data['X'].min(), data['X'].max(), (10, 1))
@@ -160,13 +164,15 @@ def sparse_crescent_data(inducing=10, seed=default_seed):
     """
 
     data = GPy.util.datasets.crescent_data(seed=seed)
+    Y = data['Y']
+    Y[Y.flatten()==-1]=0
 
     # Kernel object
     kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1])
 
     # Likelihood object
-    distribution = GPy.likelihoods.likelihood_functions.probit()
-    likelihood = GPy.likelihoods.EP(data['Y'], distribution)
+    distribution = GPy.likelihoods.likelihood_functions.binomial()
+    likelihood = GPy.likelihoods.EP(Y, distribution)
 
     sample = np.random.randint(0, data['X'].shape[0], inducing)
     Z = data['X'][sample, :]