Merge branch 'linK_functions2' into devel

Conflicts: GPy/core/gp.py GPy/core/gp_base.py GPy/core/sparse_gp.py GPy/examples/regression.py GPy/kern/constructors.py GPy/kern/parts/coregionalise.py GPy/models/__init__.py GPy/models/sparse_gp_classification.py GPy/util/__init__.py
2026-05-09 12:02:38 +02:00 · 2013-09-13 15:57:34 +01:00 · 2013-09-13 15:57:34 +01:00 · f8c9e6b982
commit f8c9e6b982
parent 304dbfad7f d653921bf3
36 changed files with 2144 additions and 404 deletions
--- a/GPy/core/fitc.py
+++ b/GPy/core/fitc.py
@ -140,7 +140,6 @@ class FITC(SparseGP):
            dA_dnoise = 0.5 * self.input_dim * (dbstar_dnoise/self.beta_star).sum() - 0.5 * self.input_dim * np.sum(self.likelihood.Y**2 * dbstar_dnoise)
            dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) *  Lmi_psi1 * dbstar_dnoise.T)
            dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) *  Lmi_psi1 * dbstar_dnoise.T)
            dD_dnoise_1 =  mdot(self.V_star*LBiLmipsi1.T,LBiLmipsi1*dbstar_dnoise.T*self.likelihood.Y.T)
            alpha = mdot(LBiLmipsi1,self.V_star)
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@ -19,9 +19,6 @@ class GP(GPBase):
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :rtype: model object
    :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
    :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
    :type powerep: list
    .. Note:: Multiple independent outputs are allowed using columns of Y
@ -105,7 +102,12 @@ class GP(GPBase):
        Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
        """
-        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+        #return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
        if not isinstance(self.likelihood,EP):
            tmp = np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
        else:
            tmp = np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
        return tmp
    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
        """
@ -136,7 +138,7 @@ class GP(GPBase):
        :type Xnew: np.ndarray, Nnew x self.input_dim
        :param which_parts:  specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the folll covariance matrix, or just the diagonal
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: bool
        :rtype: posterior mean,  a Numpy array, Nnew x self.input_dim
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
@ -153,5 +155,71 @@ class GP(GPBase):
        # now push through likelihood
        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
        return mean, var, _025pm, _975pm
    def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
        """
        For a specific output, predict the function at the new point(s) Xnew.
        Arguments
        ---------
        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.input_dim
        :param output: output to predict
        :type output: integer in {0,..., num_outputs-1}
        :param which_parts:  specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: bool
        :rtype: posterior mean,  a Numpy array, Nnew x self.input_dim
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.input_dim
        .. Note:: For multiple output models only
        """
        assert hasattr(self,'multioutput')
        index = np.ones_like(Xnew)*output
        Xnew = np.hstack((Xnew,index))
        # normalize X values
        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
        # now push through likelihood
        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, noise_model = output)
        return mean, var, _025pm, _975pm
    def _raw_predict_single_output(self, _Xnew, output=0, which_parts='all', full_cov=False,stop=False):
        """
        Internal helper function for making predictions for a specific output,
        does not account for normalization or likelihood
        ---------
        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.input_dim
        :param output: output to predict
        :type output: integer in {0,..., num_outputs-1}
        :param which_parts:  specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        .. Note:: For multiple output models only
        """
        assert hasattr(self,'multioutput')
        # creates an index column and appends it to _Xnew
        index = np.ones_like(_Xnew)*output
        _Xnew = np.hstack((_Xnew,index))
        Kx = self.kern.K(_Xnew,self.X,which_parts=which_parts).T
        KiKx, _ = dpotrs(self.L, np.asfortranarray(Kx), lower=1)
        mu = np.dot(KiKx.T, self.likelihood.Y)
        if full_cov:
            Kxx = self.kern.K(_Xnew, which_parts=which_parts)
            var = Kxx - np.dot(KiKx.T, Kx)
        else:
            Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
            var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
            var = var[:, None]
        if stop:
            debug_this # @UndefinedVariable
        return mu, var
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@ -57,18 +57,12 @@ class GPBase(Model):
        self.X = state.pop()
        Model.setstate(self, state)
-    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
+    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None,output=None):
-        """
+    """
        Plot the GP's view of the world, where the data is normalized and the
        likelihood is Gaussian.
        Plot the posterior of the GP.
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
          - In two dimsensions, a contour-plot shows the mean predicted function
-          - In higher dimensions, we've no implemented this yet !TODO!
+          - Not implemented in higher dimensions
        Can plot only part of the data and part of the posterior functions
        using which_data and which_functions
        :param samples: the number of a posteriori samples to plot
        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
@ -85,7 +79,9 @@ class GPBase(Model):
        :param ax: axes to plot on.
        :type ax: axes handle
-        """
+        :param output: which output to plot (for multiple output models only)
        :type output: integer (first output is 0)
    """
        if which_data == 'all':
            which_data = slice(None)
@ -93,7 +89,7 @@ class GPBase(Model):
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)
-        if self.X.shape[1] == 1:
+        if self.X.shape[1] == 1 and not hasattr(self,'multioutput'):
            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
            if samples == 0:
                m, v = self._raw_predict(Xnew, which_parts=which_parts)
@ -111,7 +107,7 @@ class GPBase(Model):
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            ax.set_ylim(ymin, ymax)
-        elif self.X.shape[1] == 2:
+        elif self.X.shape[1] == 2 and not hasattr(self,'multioutput'):
            resolution = resolution or 50
            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
            m, v = self._raw_predict(Xnew, which_parts=which_parts)
@ -120,17 +116,51 @@ class GPBase(Model):
            ax.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max()) # @UndefinedVariable
            ax.set_xlim(xmin[0], xmax[0])
            ax.set_ylim(xmin[1], xmax[1])
        elif self.X.shape[1] == 2 and hasattr(self,'multioutput'):
            output -= 1
            assert self.num_outputs >= output, 'The model has only %s outputs.' %self.num_outputs
            Xu = self.X[self.X[:,-1]==output ,0:1]
            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
            if samples == 0:
                m, v = self._raw_predict_single_output(Xnew, output=output, which_parts=which_parts)
                gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax)
                ax.plot(Xu[which_data], self.likelihood.Y[self.likelihood.index==output][:,None], 'kx', mew=1.5)
            else:
                m, v = self._raw_predict_single_output(Xnew, output=output, which_parts=which_parts, full_cov=True)
                Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None, ], axes=ax)
                for i in range(samples):
                    ax.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
            ax.set_xlim(xmin, xmax)
            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            ax.set_ylim(ymin, ymax)
            if hasattr(self,'Z'):
                Zu = self.Z[self.Z[:,-1]==output,:]
                Zu = self.Z * self._Xscale + self._Xoffset
                Zu = self.Z[self.Z[:,-1]==output ,0:1] #??
                ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
        elif self.X.shape[1] == 3 and hasattr(self,'multioutput'):
            raise NotImplementedError, "Plots not implemented for multioutput models with 2D inputs...yet"
            output -= 1
            assert self.num_outputs >= output, 'The model has only %s outputs.' %self.num_outputs
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
-    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
+    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, output=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
        """
        Plot the GP with noise where the likelihood is Gaussian.
        Plot the posterior of the GP.
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
          - In two dimsensions, a contour-plot shows the mean predicted function
-          - In higher dimensions, we've no implemented this yet !TODO!
+          - Not implemented in higher dimensions
        Can plot only part of the data and part of the posterior functions
        using which_data and which_functions
@ -151,15 +181,13 @@ class GPBase(Model):
        :type fignum: figure number
        :param ax: axes to plot on.
        :type ax: axes handle
        :type output: integer (first output is 0)
        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
        :type fixed_inputs: a list of tuples
        :param linecol: color of line to plot.
        :type linecol:
        :param fillcol: color of fill
-        :type fillcol: 
+        :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
        :param levels: for 2D plotting, the number of contour levels to use
        is ax is None, create a new figure
        """
        # TODO include samples
        if which_data == 'all':
@ -169,41 +197,81 @@ class GPBase(Model):
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)
-        plotdims = self.input_dim - len(fixed_inputs)
+        if not hasattr(self,'multioutput'):
-        if plotdims == 1:
+            plotdims = self.input_dim - len(fixed_inputs)
            if plotdims == 1:
                resolution = resolution or 200
-            Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
+                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-            fixed_dims = np.array([i for i,v in fixed_inputs])
+                fixed_dims = np.array([i for i,v in fixed_inputs])
-            freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
+                freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
-            Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
+                Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
-            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
+                Xgrid = np.empty((Xnew.shape[0],self.input_dim))
-            Xgrid[:,freedim] = Xnew
+                Xgrid[:,freedim] = Xnew
-            for i,v in fixed_inputs:
+                for i,v in fixed_inputs:
-                Xgrid[:,i] = v
+                    Xgrid[:,i] = v
-            m, _, lower, upper = self.predict(Xgrid, which_parts=which_parts)
+                m, _, lower, upper = self.predict(Xgrid, which_parts=which_parts)
-            for d in range(m.shape[1]):
+                for d in range(m.shape[1]):
-                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
+                    gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
-                ax.plot(Xu[which_data,freedim], self.likelihood.data[which_data, d], 'kx', mew=1.5)
+                    ax.plot(Xu[which_data,freedim], self.likelihood.data[which_data, d], 'kx', mew=1.5)
-            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
+                ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+                ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            ax.set_xlim(xmin, xmax)
+                ax.set_xlim(xmin, xmax)
-            ax.set_ylim(ymin, ymax)
+                ax.set_ylim(ymin, ymax)
-        elif self.X.shape[1] == 2: # FIXME
+
-            resolution = resolution or 50
+
-            Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
+                Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits,resolution=resolution)
-            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
+                m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
-            m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
+                for d in range(m.shape[1]):
-            m = m.reshape(resolution, resolution).T
+                    gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax)
-            ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
+                    ax.plot(Xu[which_data], self.likelihood.data[which_data, d], 'kx', mew=1.5)
-            Yf = self.likelihood.data.flatten()
+                ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
-            ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable
+                ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            ax.set_xlim(xmin[0], xmax[0])
+                ax.set_xlim(xmin, xmax)
-            ax.set_ylim(xmin[1], xmax[1])
+                ax.set_ylim(ymin, ymax)
            elif self.X.shape[1] == 2:
                resolution = resolution or 50
                Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
                x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
                m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
                m = m.reshape(resolution, resolution).T
                ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
                Yf = self.likelihood.Y.flatten()
                ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable
                ax.set_xlim(xmin[0], xmax[0])
                ax.set_ylim(xmin[1], xmax[1])
            else:
                raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+            assert self.num_outputs > output, 'The model has only %s outputs.' %self.num_outputs
            if self.X.shape[1] == 2:
                resolution = resolution or 200
                Xu = self.X[self.X[:,-1]==output,:] #keep the output of interest
                Xu = self.X * self._Xscale + self._Xoffset
                Xu = self.X[self.X[:,-1]==output ,0:1] #get rid of the index column
                Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
                m, _, lower, upper = self.predict_single_output(Xnew, which_parts=which_parts,output=output)
                for d in range(m.shape[1]):
                    gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax)
                    ax.plot(Xu[which_data], self.likelihood.noise_model_list[output].data, 'kx', mew=1.5)
                ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
                ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
                ax.set_xlim(xmin, xmax)
                ax.set_ylim(ymin, ymax)
            elif self.X.shape[1] == 3:
                raise NotImplementedError, "Plots not yet implemented for multioutput models with 2D inputs"
                resolution = resolution or 50
            else:
                raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -549,7 +549,7 @@ class Model(Parameterized):
        :type optimzer: string TODO: valid strings?
        """
-        assert isinstance(self.likelihood, likelihoods.EP), "pseudo_EM is only available for EP likelihoods"
+        assert isinstance(self.likelihood, likelihoods.EP) or isinstance(self.likelihood, likelihoods.EP_Mixed_Noise), "pseudo_EM is only available for EP likelihoods"
        ll_change = epsilon + 1.
        iteration = 0
        last_ll = -np.inf
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -5,7 +5,7 @@ import numpy as np
 import pylab as pb
 from ..util.linalg import mdot, jitchol, tdot, symmetrify, backsub_both_sides, chol_inv, dtrtrs, dpotrs, dpotri
 from scipy import linalg
-from ..likelihoods import Gaussian
+from ..likelihoods import Gaussian, EP,EP_Mixed_Noise
 from gp_base import GPBase
 class SparseGP(GPBase):
@ -109,7 +109,6 @@ class SparseGP(GPBase):
        tmp, _ = dtrtrs(self._Lm, np.asfortranarray(tmp.T), lower=1)
        self._A = tdot(tmp)
        # factor B
        self.B = np.eye(self.num_inducing) + self._A
        self.LB = jitchol(self.B)
@ -139,6 +138,7 @@ class SparseGP(GPBase):
        dL_dpsi2_beta = 0.5 * backsub_both_sides(self._Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi)
        if self.likelihood.is_heteroscedastic:
            if self.has_uncertain_inputs:
                self.dL_dpsi2 = self.likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :]
            else:
@ -160,9 +160,23 @@ class SparseGP(GPBase):
            # save computation here.
            self.partial_for_likelihood = None
        elif self.likelihood.is_heteroscedastic:
-            raise NotImplementedError, "heteroscedatic derivates not implemented"
+
            if self.has_uncertain_inputs:
                raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
            else:
                Lmi_psi1, nil = dtrtrs(self.Lm, np.asfortranarray(self.psi1.T), lower=1, trans=0)
                _LBi_Lmi_psi1, _ = dtrtrs(self.LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0)
                _Bi_Lmi_psi1, _ = dtrtrs(self.LB.T, np.asfortranarray(_LBi_Lmi_psi1), lower=1, trans=0)
                self.partial_for_likelihood = -0.5 * self.likelihood.precision + 0.5 * self.likelihood.V**2
                self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0 - np.sum(Lmi_psi1**2,0))[:,None] * self.likelihood.precision**2
                self.partial_for_likelihood += 0.5*np.sum(_Bi_Lmi_psi1*Lmi_psi1,0)[:,None]*self.likelihood.precision**2 #NOTE this term has numerical issues
                self.partial_for_likelihood += -np.dot(self._LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * self.likelihood.Y * self.likelihood.precision**2
                self.partial_for_likelihood += 0.5*np.dot(self._LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * self.likelihood.precision**2
        else:
-            # likelihood is not heterscedatic
+            # likelihood is not heteroscedatic
            self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2
            self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self._A) * self.likelihood.precision)
            self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self._A * self.DBi_plus_BiPBi) - self.data_fit)
@ -298,7 +312,7 @@ class SparseGP(GPBase):
        :type X_variance_new: np.ndarray, Nnew x self.input_dim
        :param which_parts:  specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the folll covariance matrix, or just the diagonal
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: bool
        :rtype: posterior mean,  a Numpy array, Nnew x self.input_dim
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
@ -322,18 +336,15 @@ class SparseGP(GPBase):
        return mean, var, _025pm, _975pm
-    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
+    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None, output=None):
        if ax is None:
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)
        if which_data is 'all':
            which_data = slice(None)
-        GPBase.plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, ax=ax)
+        GPBase.plot(self, samples=0, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=None, levels=20, ax=ax, output=output)
-
+        if self.X.shape[1] == 1 and not hasattr(self,'multioutput'):
        # add the inducing inputs and some errorbars
        if self.X.shape[1] == 1:
            if self.has_uncertain_inputs:
                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
@ -342,6 +353,109 @@ class SparseGP(GPBase):
            Zu = self.Z * self._Xscale + self._Xoffset
            ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
-        elif self.X.shape[1] == 2:
+        elif self.X.shape[1] == 2 and not hasattr(self,'multioutput'):
            Zu = self.Z * self._Xscale + self._Xoffset
            ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
        elif self.X.shape[1] == 2 and hasattr(self,'multioutput'):
            Xu = self.X[self.X[:,-1]==output,:]
            if self.has_uncertain_inputs:
                Xu = self.X * self._Xscale + self._Xoffset  # NOTE self.X are the normalized values now
                Xu = self.X[self.X[:,-1]==output ,0:1] #??
                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
            Zu = self.Z[self.Z[:,-1]==output,:]
            Zu = self.Z * self._Xscale + self._Xoffset
            Zu = self.Z[self.Z[:,-1]==output ,0:1] #??
            ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
            #ax.set_ylim(ax.get_ylim()[0],)
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
    def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
        """
        For a specific output, predict the function at the new point(s) Xnew.
        Arguments
        ---------
        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.input_dim
        :param output: output to predict
        :type output: integer in {0,..., num_outputs-1}
        :param which_parts:  specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: bool
        :rtype: posterior mean,  a Numpy array, Nnew x self.input_dim
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.input_dim
        .. Note:: For multiple output models only
        """
        assert hasattr(self,'multioutput')
        index = np.ones_like(Xnew)*output
        Xnew = np.hstack((Xnew,index))
        # normalize X values
        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
        # now push through likelihood
        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, noise_model = output)
        return mean, var, _025pm, _975pm
    def _raw_predict_single_output(self, _Xnew, output=0, X_variance_new=None, which_parts='all', full_cov=False,stop=False):
        """
        Internal helper function for making predictions for a specific output,
        does not account for normalization or likelihood
        ---------
        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.input_dim
        :param output: output to predict
        :type output: integer in {0,..., num_outputs-1}
        :param which_parts:  specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        .. Note:: For multiple output models only
        """
        Bi, _ = dpotri(self.LB, lower=0)  # WTH? this lower switch should be 1, but that doesn't work!
        symmetrify(Bi)
        Kmmi_LmiBLmi = backsub_both_sides(self.Lm, np.eye(self.num_inducing) - Bi)
        if self.Cpsi1V is None:
            psi1V = np.dot(self.psi1.T,self.likelihood.V)
            tmp, _ = dtrtrs(self.Lm, np.asfortranarray(psi1V), lower=1, trans=0)
            tmp, _ = dpotrs(self.LB, tmp, lower=1)
            self.Cpsi1V, _ = dtrtrs(self.Lm, tmp, lower=1, trans=1)
        assert hasattr(self,'multioutput')
        index = np.ones_like(_Xnew)*output
        _Xnew = np.hstack((_Xnew,index))
        if X_variance_new is None:
            Kx = self.kern.K(self.Z, _Xnew, which_parts=which_parts)
            mu = np.dot(Kx.T, self.Cpsi1V)
            if full_cov:
                Kxx = self.kern.K(_Xnew, which_parts=which_parts)
                var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting
            else:
                Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
                var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0)
        else:
            Kx = self.kern.psi1(self.Z, _Xnew, X_variance_new)
            mu = np.dot(Kx, self.Cpsi1V)
            if full_cov:
                raise NotImplementedError, "TODO"
            else:
                Kxx = self.kern.psi0(self.Z, _Xnew, X_variance_new)
                psi2 = self.kern.psi2(self.Z, _Xnew, X_variance_new)
                var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
        return mu, var[:, None]
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -22,8 +22,8 @@ def coregionalisation_toy2(max_iters=100):
    Y = np.vstack((Y1, Y2))
    k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
-    k2 = GPy.kern.coregionalise(2, 1)
+    k2 = GPy.kern.coregionalise(2,1)
-    k = k1**k2
+    k = k1**k2 #k = k1.prod(k2,tensor=True)
    m = GPy.models.GPRegression(X, Y, kernel=k)
    m.constrain_fixed('.*rbf_var', 1.)
    # m.constrain_positive('.*kappa')
@ -90,7 +90,6 @@ def coregionalisation_sparse(max_iters=100):
    k1 = GPy.kern.rbf(1)
    k2 = GPy.kern.coregionalise(2, 2)
    k = k1**k2 #.prod(k2, tensor=True) # + GPy.kern.white(2,0.001)
    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
    m.constrain_fixed('.*rbf_var', 1.)
    m.constrain_fixed('iip')
@ -401,8 +400,6 @@ def silhouette(max_iters=100):
    print(m)
    return m
 def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100):
    """Run a 1D example of a sparse GP regression."""
    # sample inputs and outputs
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@ -74,9 +74,9 @@ def gibbs(input_dim,variance=1., mapping=None):
    Gibbs and MacKay non-stationary covariance function.
    .. math::
-       
+
       r = sqrt((x_i - x_j)'*(x_i - x_j))
-       
+
       k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
       Z = \sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')}
@ -90,7 +90,7 @@ def gibbs(input_dim,variance=1., mapping=None):
        The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x) which is a function that can be specified by the user, by default an multi-layer peceptron is used is used.
        :param input_dim: the number of input dimensions
-        :type input_dim: int 
+        :type input_dim: int
        :param variance: the variance :math:`\sigma^2`
        :type variance: float
        :param mapping: the mapping that gives the lengthscale across the input space.
@ -340,29 +340,30 @@ def symmetric(k):
    k_.parts = [symmetric.Symmetric(p) for p in k.parts]
    return k_
-def coregionalise(output_dim, rank=1, W=None, kappa=None):
+def coregionalise(num_outpus,W_columns=1, W=None, kappa=None):
    """
-        Coregionalisation kernel. 
+    Coregionlization matrix B, of the form:
    Used for computing covariance functions of the form
    .. math::
       k_2(x, y)=\mathbf{B} k(x, y)
    where
    .. math::
       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I}
-    :param output_dim: the number of output dimensions
+    An intrinsic/linear coregionalization kernel of the form
-    :type output_dim: int
+    .. math::
-    :param rank: the rank of the coregionalisation matrix.
+       k_2(x, y)=\mathbf{B} k(x, y)
-    :type rank: int
+
-    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B.
+    it is obtainded as the tensor product between a kernel k(x,y) and B.
-    :type W: ndarray
+
-    :param kappa: a diagonal term which allows the outputs to behave independently.
+    :param num_outputs: the number of outputs to corregionalise
    :type num_outputs: int
    :param W_columns: number of columns of the W matrix (this parameter is ignored if parameter W is not None)
    :type W_colunns: int
    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B
    :type W: numpy array of dimensionality (num_outpus, W_columns)
    :param kappa: a vector which allows the outputs to behave independently
    :type kappa: numpy array of dimensionality  (num_outputs,)
    :rtype: kernel object
    .. Note: see coregionalisation examples in GPy.examples.regression for some usage.
    """
-    p = parts.coregionalise.Coregionalise(output_dim,rank,W,kappa)
+    p = parts.coregionalise.Coregionalise(num_outputs,W_columns,W,kappa)
    return kern(1,[p])
--- a/GPy/kern/parts/coregionalise.py
+++ b/GPy/kern/parts/coregionalise.py
@ -9,31 +9,34 @@ from scipy import weave
 class Coregionalise(Kernpart):
    """
-    Coregionalisation kernel. 
+    Kernel for intrinsic/linear coregionalization models
-    Used for computing covariance functions of the form
+    This kernel has the form
    .. math::
-       k_2(x, y)=B k(x, y)
+       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I}
    where
    .. math::
       B = WW^\top + diag(kappa)
-    :param output_dim: the number of output dimensions
+    An intrinsic/linear coregionalization kernel of the form
-    :type output_dim: int
+    .. math::
-    :param rank: the rank of the coregionalisation matrix.
+       k_2(x, y)=\mathbf{B} k(x, y)
-    :type rank: int
+
-    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B.
+    it is obtainded as the tensor product between a kernel k(x,y) and B.
-    :type W: ndarray
+
-    :param kappa: a diagonal term which allows the outputs to behave independently.
+    :param num_outputs: number of outputs to coregionalize
-    :rtype: kernel object
+    :type num_outputs: int
    :param W_columns: number of columns of the W matrix (this parameter is ignored if parameter W is not None)
    :type W_colunns: int
    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B
    :type W: numpy array of dimensionality (num_outpus, W_columns)
    :param kappa: a vector which allows the outputs to behave independently
    :type kappa: numpy array of dimensionality  (num_outputs,)
    .. Note: see coregionalisation examples in GPy.examples.regression for some usage.
    """
-    def __init__(self,output_dim,rank=1, W=None, kappa=None):
+    def __init__(self,num_outputs,W_columns=1, W=None, kappa=None):
        self.input_dim = 1
        self.name = 'coregion'
-        self.output_dim = output_dim
+        self.num_outputs = num_outputs
-        self.rank = rank
+        self.W_columns = W_columns
        if W is None:
            self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank)
        else:
@ -52,12 +55,12 @@ class Coregionalise(Kernpart):
    def _set_params(self,x):
        assert x.size == self.num_params
-        self.kappa = x[-self.output_dim:]
+        self.kappa = x[-self.num_outputs:]
-        self.W = x[:-self.output_dim].reshape(self.output_dim,self.rank)
+        self.W = x[:-self.num_outputs].reshape(self.num_outputs,self.W_columns)
        self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa)
    def _get_param_names(self):
-        return sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[]) + ['kappa_%i'%i for i in range(self.output_dim)]
+        return sum([['W%i_%i'%(i,j) for j in range(self.W_columns)] for i in range(self.num_outputs)],[]) + ['kappa_%i'%i for i in range(self.num_outputs)]
    def K(self,index,index2,target):
        index = np.asarray(index,dtype=np.int)
@ -75,26 +78,26 @@ class Coregionalise(Kernpart):
        if index2 is None:
            code="""
            for(int i=0;i<N; i++){
-              target[i+i*N] += B[index[i]+output_dim*index[i]];
+              target[i+i*N] += B[index[i]+num_outputs*index[i]];
              for(int j=0; j<i; j++){
-                  target[j+i*N] += B[index[i]+output_dim*index[j]];
+                  target[j+i*N] += B[index[i]+num_outputs*index[j]];
                  target[i+j*N] += target[j+i*N];
                }
              }
            """
-            N,B,output_dim = index.size, self.B, self.output_dim
+            N,B,num_outputs = index.size, self.B, self.num_outputs
-            weave.inline(code,['target','index','N','B','output_dim'])
+            weave.inline(code,['target','index','N','B','num_outputs'])
        else:
            index2 = np.asarray(index2,dtype=np.int)
            code="""
            for(int i=0;i<num_inducing; i++){
              for(int j=0; j<N; j++){
-                  target[i+j*num_inducing] += B[output_dim*index[j]+index2[i]];
+                  target[i+j*num_inducing] += B[num_outputs*index[j]+index2[i]];
                }
              }
            """
-            N,num_inducing,B,output_dim = index.size,index2.size, self.B, self.output_dim
+            N,num_inducing,B,num_outputs = index.size,index2.size, self.B, self.num_outputs
-            weave.inline(code,['target','index','index2','N','num_inducing','B','output_dim'])
+            weave.inline(code,['target','index','index2','N','num_inducing','B','num_outputs'])
    def Kdiag(self,index,target):
@ -111,12 +114,12 @@ class Coregionalise(Kernpart):
        code="""
        for(int i=0; i<num_inducing; i++){
          for(int j=0; j<N; j++){
-            dL_dK_small[index[j] + output_dim*index2[i]] += dL_dK[i+j*num_inducing];
+            dL_dK_small[index[j] + num_outputs*index2[i]] += dL_dK[i+j*num_inducing];
          }
        }
        """
-        N, num_inducing, output_dim = index.size, index2.size, self.output_dim
+        N, num_inducing, num_outputs = index.size, index2.size, self.num_outputs
-        weave.inline(code, ['N','num_inducing','output_dim','dL_dK','dL_dK_small','index','index2'])
+        weave.inline(code, ['N','num_inducing','num_outputs','dL_dK','dL_dK_small','index','index2'])
        dkappa = np.diag(dL_dK_small)
        dL_dK_small += dL_dK_small.T
@ -133,8 +136,8 @@ class Coregionalise(Kernpart):
        ii,jj = ii.T, jj.T
        dL_dK_small = np.zeros_like(self.B)
-        for i in range(self.output_dim):
+        for i in range(self.num_outputs):
-            for j in range(self.output_dim):
+            for j in range(self.num_outputs):
                tmp = np.sum(dL_dK[(ii==i)*(jj==j)])
                dL_dK_small[i,j] = tmp
@ -146,8 +149,8 @@ class Coregionalise(Kernpart):
    def dKdiag_dtheta(self,dL_dKdiag,index,target):
        index = np.asarray(index,dtype=np.int).flatten()
-        dL_dKdiag_small = np.zeros(self.output_dim)
+        dL_dKdiag_small = np.zeros(self.num_outputs)
-        for i in range(self.output_dim):
+        for i in range(self.num_outputs):
            dL_dKdiag_small[i] += np.sum(dL_dKdiag[index==i])
        dW = 2.*self.W*dL_dKdiag_small[:,None]
        dkappa = dL_dKdiag_small
@ -155,6 +158,3 @@ class Coregionalise(Kernpart):
    def dK_dX(self,dL_dK,X,X2,target):
        pass
--- a/GPy/kern/parts/prod.py
+++ b/GPy/kern/parts/prod.py
@ -18,7 +18,7 @@ class Prod(Kernpart):
    """
    def __init__(self,k1,k2,tensor=False):
        self.num_params = k1.num_params + k2.num_params
-        self.name = k1.name + '<times>' + k2.name
+        self.name = '['+k1.name + '(x)' + k2.name +']'
        self.k1 = k1
        self.k2 = k2
        if tensor:
--- a/GPy/likelihoods/init.py
+++ b/GPy/likelihoods/init.py
@ -1,4 +1,7 @@
 from ep import EP
 from ep_mixed_noise import EP_Mixed_Noise
 from gaussian import Gaussian
 from gaussian_mixed_noise import Gaussian_Mixed_Noise
 from noise_model_constructors import *
 # TODO: from Laplace import Laplace
-import likelihood_functions as functions
+
--- a/GPy/likelihoods/ep.py
+++ b/GPy/likelihoods/ep.py
@ -4,23 +4,23 @@ from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
 from likelihood import likelihood
 class EP(likelihood):
-    def __init__(self,data,LikelihoodFunction,epsilon=1e-3,power_ep=[1.,1.]):
+    def __init__(self,data,noise_model,epsilon=1e-3,power_ep=[1.,1.]):
        """
        Expectation Propagation
        Arguments
        ---------
        epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
-        LikelihoodFunction : a likelihood function (see likelihood_functions.py)
+        noise_model : a likelihood function (see likelihood_functions.py)
        """
-        self.LikelihoodFunction = LikelihoodFunction
+        self.noise_model = noise_model
        self.epsilon = epsilon
        self.eta, self.delta = power_ep
        self.data = data
        self.N, self.output_dim = self.data.shape
        self.is_heteroscedastic = True
        self.Nparams = 0
-        self._transf_data = self.LikelihoodFunction._preprocess_values(data)
+        self._transf_data = self.noise_model._preprocess_values(data)
        #Initial values - Likelihood approximation parameters:
        #p(y|f) = t(f|tau_tilde,v_tilde)
@ -52,16 +52,23 @@ class EP(likelihood):
    def predictive_values(self,mu,var,full_cov):
        if full_cov:
            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        return self.LikelihoodFunction.predictive_values(mu,var)
+        return self.noise_model.predictive_values(mu,var)
    def _get_params(self):
-        return np.zeros(0)
+        #return np.zeros(0)
        return self.noise_model._get_params()
    def _get_param_names(self):
-        return []
+        #return []
        return self.noise_model._get_param_names()
    def _set_params(self,p):
-        pass # TODO: the EP likelihood might want to take some parameters...
+        #pass # TODO: the EP likelihood might want to take some parameters...
        self.noise_model._set_params(p)
    def _gradients(self,partial):
-        return np.zeros(0) # TODO: the EP likelihood might want to take some parameters...
+        #return np.zeros(0) # TODO: the EP likelihood might want to take some parameters...
        return self.noise_model._gradients(partial)
    def _compute_GP_variables(self):
        #Variables to be called from GP
@ -116,7 +123,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
@ -206,7 +213,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
@ -301,7 +308,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
--- a/GPy/likelihoods/ep_mixed_noise.py
+++ b/GPy/likelihoods/ep_mixed_noise.py
@ -0,0 +1,385 @@
 # Copyright (c) 2013, Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import stats
 from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
 from likelihood import likelihood
 class EP_Mixed_Noise(likelihood):
    def __init__(self,data_list,noise_model_list,epsilon=1e-3,power_ep=[1.,1.]):
        """
        Expectation Propagation
        Arguments
        ---------
        :param data_list: list of outputs
        :param noise_model_list: a list of noise models
        :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations
        :type epsilon: float
        :param power_ep: list of power ep parameters
        """
        assert len(data_list) == len(noise_model_list)
        self.noise_model_list = noise_model_list
        n_list = [data.size for data in data_list]
        self.n_models = len(data_list)
        self.n_params = [noise_model._get_params().size for noise_model in noise_model_list]
        self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.n_models),n_list)])
        self.epsilon = epsilon
        self.eta, self.delta = power_ep
        self.data = np.vstack(data_list)
        self.N, self.output_dim = self.data.shape
        self.is_heteroscedastic = True
        self.Nparams = 0#FIXME
        self._transf_data = np.vstack([noise_model._preprocess_values(data) for noise_model,data in zip(noise_model_list,data_list)])
        #TODO non-gaussian index
        #Initial values - Likelihood approximation parameters:
        #p(y|f) = t(f|tau_tilde,v_tilde)
        self.tau_tilde = np.zeros(self.N)
        self.v_tilde = np.zeros(self.N)
        #initial values for the GP variables
        self.Y = np.zeros((self.N,1))
        self.covariance_matrix = np.eye(self.N)
        self.precision = np.ones(self.N)[:,None]
        self.Z = 0
        self.YYT = None
        self.V = self.precision * self.Y
        self.VVT_factor = self.V
        self.trYYT = 0.
    def restart(self):
        self.tau_tilde = np.zeros(self.N)
        self.v_tilde = np.zeros(self.N)
        self.Y = np.zeros((self.N,1))
        self.covariance_matrix = np.eye(self.N)
        self.precision = np.ones(self.N)[:,None]
        self.Z = 0
        self.YYT = None
        self.V = self.precision * self.Y
        self.VVT_factor = self.V
        self.trYYT = 0.
    def predictive_values(self,mu,var,full_cov,noise_model):
        """
        Predicts the output given the GP
        :param mu: GP's mean
        :param var: GP's variance
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: False|True
        :param noise_model: noise model to use
        :type noise_model: integer
        """
        if full_cov:
            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
        #_mu = []
        #_var = []
        #_q1 = []
        #_q2 = []
        #for m,v,o in zip(mu,var,output.flatten()):
        #    a,b,c,d = self.noise_model_list[int(o)].predictive_values(m,v)
        #    _mu.append(a)
        #    _var.append(b)
        #    _q1.append(c)
        #    _q2.append(d)
        #return np.vstack(_mu),np.vstack(_var),np.vstack(_q1),np.vstack(_q2)
        return self.noise_model_list[noise_model].predictive_values(mu,var)
    def _get_params(self):
        return np.hstack([noise_model._get_params().flatten() for noise_model in self.noise_model_list])
    def _get_param_names(self):
        names = []
        for noise_model in self.noise_model_list:
           names += noise_model._get_param_names()
        return names
    def _set_params(self,p):
        cs_params = np.cumsum([0]+self.n_params)
        for i in range(len(self.n_params)):
            self.noise_model_list[i]._set_params(p[cs_params[i]:cs_params[i+1]])
    def _gradients(self,partial):
        #NOTE this is not tested
        return np.hstack([noise_model._gradients(partial) for noise_model in self.noise_model_list])
    def _compute_GP_variables(self):
        #Variables to be called from GP
        mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
        sigma_sum = 1./self.tau_ + 1./self.tau_tilde
        mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
        self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
        self.Y =  mu_tilde[:,None]
        self.YYT = np.dot(self.Y,self.Y.T)
        self.covariance_matrix = np.diag(1./self.tau_tilde)
        self.precision = self.tau_tilde[:,None]
        self.V = self.precision * self.Y
        self.VVT_factor = self.V
        self.trYYT = np.trace(self.YYT)
    def fit_full(self,K):
        """
        The expectation-propagation algorithm.
        For nomenclature see Rasmussen & Williams 2006.
        """
        #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
        mu = np.zeros(self.N)
        Sigma = K.copy()
        """
        Initial values - Cavity distribution parameters:
        q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)}
        sigma_ = 1./tau_
        mu_ = v_/tau_
        """
        self.tau_ = np.empty(self.N,dtype=float)
        self.v_ = np.empty(self.N,dtype=float)
        #Initial values - Marginal moments
        z = np.empty(self.N,dtype=float)
        self.Z_hat = np.empty(self.N,dtype=float)
        phi = np.empty(self.N,dtype=float)
        mu_hat = np.empty(self.N,dtype=float)
        sigma2_hat = np.empty(self.N,dtype=float)
        #Approximation
        epsilon_np1 = self.epsilon + 1.
        epsilon_np2 = self.epsilon + 1.
       	self.iterations = 0
        self.np1 = [self.tau_tilde.copy()]
        self.np2 = [self.v_tilde.copy()]
        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
            update_order = np.random.permutation(self.N)
            for i in update_order:
                #Cavity distribution parameters
                self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
                #Marginal moments
                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
                self.tau_tilde[i] += Delta_tau
                self.v_tilde[i] += Delta_v
                #Posterior distribution parameters update
                DSYR(Sigma,Sigma[:,i].copy(), -float(Delta_tau/(1.+ Delta_tau*Sigma[i,i])))
                mu = np.dot(Sigma,self.v_tilde)
                self.iterations += 1
            #Sigma recomptutation with Cholesky decompositon
            Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K
            B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
            L = jitchol(B)
            V,info = dtrtrs(L,Sroot_tilde_K,lower=1)
            Sigma = K - np.dot(V.T,V)
            mu = np.dot(Sigma,self.v_tilde)
            epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
            epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
            self.np1.append(self.tau_tilde.copy())
            self.np2.append(self.v_tilde.copy())
        return self._compute_GP_variables()
    def fit_DTC(self, Kmm, Kmn):
        """
        The expectation-propagation algorithm with sparse pseudo-input.
        For nomenclature see ... 2013.
        """
        num_inducing = Kmm.shape[0]
        #TODO: this doesn't work with uncertain inputs!
        """
        Prior approximation parameters:
        q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
        Sigma0 = Qnn = Knm*Kmmi*Kmn
        """
        KmnKnm = np.dot(Kmn,Kmn.T)
        Lm = jitchol(Kmm)
        Lmi = chol_inv(Lm)
        Kmmi = np.dot(Lmi.T,Lmi)
        KmmiKmn = np.dot(Kmmi,Kmn)
        Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
        LLT0 = Kmm.copy()
        #Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm)
        #KmnKnm = np.dot(Kmn, Kmn.T)
        #KmmiKmn = np.dot(Kmmi,Kmn)
        #Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
        #LLT0 = Kmm.copy()
        """
        Posterior approximation: q(f|y) = N(f| mu, Sigma)
        Sigma = Diag + P*R.T*R*P.T + K
        mu = w + P*Gamma
        """
        mu = np.zeros(self.N)
        LLT = Kmm.copy()
        Sigma_diag = Qnn_diag.copy()
        """
        Initial values - Cavity distribution parameters:
        q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
        sigma_ = 1./tau_
        mu_ = v_/tau_
        """
        self.tau_ = np.empty(self.N,dtype=float)
        self.v_ = np.empty(self.N,dtype=float)
        #Initial values - Marginal moments
        z = np.empty(self.N,dtype=float)
        self.Z_hat = np.empty(self.N,dtype=float)
        phi = np.empty(self.N,dtype=float)
        mu_hat = np.empty(self.N,dtype=float)
        sigma2_hat = np.empty(self.N,dtype=float)
        #Approximation
        epsilon_np1 = 1
        epsilon_np2 = 1
       	self.iterations = 0
        np1 = [self.tau_tilde.copy()]
        np2 = [self.v_tilde.copy()]
        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
            update_order = np.random.permutation(self.N)
            for i in update_order:
                #Cavity distribution parameters
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
                self.tau_tilde[i] += Delta_tau
                self.v_tilde[i] += Delta_v
                #Posterior distribution parameters update
                DSYR(LLT,Kmn[:,i].copy(),Delta_tau) #LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau
                L = jitchol(LLT)
                #cholUpdate(L,Kmn[:,i]*np.sqrt(Delta_tau))
                V,info = dtrtrs(L,Kmn,lower=1)
                Sigma_diag = np.sum(V*V,-2)
                si = np.sum(V.T*V[:,i],-1)
                mu += (Delta_v-Delta_tau*mu[i])*si
                self.iterations += 1
            #Sigma recomputation with Cholesky decompositon
            LLT = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T)
            L = jitchol(LLT)
            V,info = dtrtrs(L,Kmn,lower=1)
            V2,info = dtrtrs(L.T,V,lower=0)
            Sigma_diag = np.sum(V*V,-2)
            Knmv_tilde = np.dot(Kmn,self.v_tilde)
            mu = np.dot(V2.T,Knmv_tilde)
            epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N
            epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N
            np1.append(self.tau_tilde.copy())
            np2.append(self.v_tilde.copy())
        self._compute_GP_variables()
    def fit_FITC(self, Kmm, Kmn, Knn_diag):
        """
        The expectation-propagation algorithm with sparse pseudo-input.
        For nomenclature see Naish-Guzman and Holden, 2008.
        """
        num_inducing = Kmm.shape[0]
        """
        Prior approximation parameters:
        q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
        Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn
        """
        Lm = jitchol(Kmm)
        Lmi = chol_inv(Lm)
        Kmmi = np.dot(Lmi.T,Lmi)
        P0 = Kmn.T
        KmnKnm = np.dot(P0.T, P0)
        KmmiKmn = np.dot(Kmmi,P0.T)
        Qnn_diag = np.sum(P0.T*KmmiKmn,-2)
        Diag0 = Knn_diag - Qnn_diag
        R0 = jitchol(Kmmi).T
        """
        Posterior approximation: q(f|y) = N(f| mu, Sigma)
        Sigma = Diag + P*R.T*R*P.T + K
        mu = w + P*Gamma
        """
        self.w = np.zeros(self.N)
        self.Gamma = np.zeros(num_inducing)
        mu = np.zeros(self.N)
        P = P0.copy()
        R = R0.copy()
        Diag = Diag0.copy()
        Sigma_diag = Knn_diag
        RPT0 = np.dot(R0,P0.T)
        """
        Initial values - Cavity distribution parameters:
        q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
        sigma_ = 1./tau_
        mu_ = v_/tau_
        """
        self.tau_ = np.empty(self.N,dtype=float)
        self.v_ = np.empty(self.N,dtype=float)
        #Initial values - Marginal moments
        z = np.empty(self.N,dtype=float)
        self.Z_hat = np.empty(self.N,dtype=float)
        phi = np.empty(self.N,dtype=float)
        mu_hat = np.empty(self.N,dtype=float)
        sigma2_hat = np.empty(self.N,dtype=float)
        #Approximation
        epsilon_np1 = 1
        epsilon_np2 = 1
       	self.iterations = 0
        self.np1 = [self.tau_tilde.copy()]
        self.np2 = [self.v_tilde.copy()]
        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
            update_order = np.random.permutation(self.N)
            for i in update_order:
                #Cavity distribution parameters
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
                self.tau_tilde[i] += Delta_tau
                self.v_tilde[i] += Delta_v
                #Posterior distribution parameters update
                dtd1 = Delta_tau*Diag[i] + 1.
                dii = Diag[i]
                Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
                pi_ = P[i,:].reshape(1,num_inducing)
                P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
                Rp_i = np.dot(R,pi_.T)
                RTR = np.dot(R.T,np.dot(np.eye(num_inducing) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
                R = jitchol(RTR).T
                self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1
                self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T)
                RPT = np.dot(R,P.T)
                Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
                mu = self.w + np.dot(P,self.Gamma)
                self.iterations += 1
            #Sigma recomptutation with Cholesky decompositon
            Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde)
            Diag = Diag0 * Iplus_Dprod_i
            P = Iplus_Dprod_i[:,None] * P0
            safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. - Iplus_Dprod_i)/Diag0)
            L = jitchol(np.eye(num_inducing) + np.dot(RPT0,safe_diag[:,None]*RPT0.T))
            R,info = dtrtrs(L,R0,lower=1)
            RPT = np.dot(R,P.T)
            Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
            self.w = Diag * self.v_tilde
            self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde))
            mu = self.w + np.dot(P,self.Gamma)
            epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
            epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
            self.np1.append(self.tau_tilde.copy())
            self.np2.append(self.v_tilde.copy())
        return self._compute_GP_variables()
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@ -7,9 +7,9 @@ class Gaussian(likelihood):
    """
    Likelihood class for doing Expectation propagation
-    :param Y: observed output (Nx1 numpy.darray)
+    :param data: observed output
-    ..Note:: Y values allowed depend on the likelihood_function used
+    :type data: Nx1 numpy.darray
-    :param variance :
+    :param variance: noise parameter
    :param normalize:  whether to normalize the data before computing (predictions will be in original scales)
    :type normalize: False|True
    """
--- a/GPy/likelihoods/gaussian_mixed_noise.py
+++ b/GPy/likelihoods/gaussian_mixed_noise.py
@ -0,0 +1,106 @@
 # Copyright (c) 2013, Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import stats
 from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
 from likelihood import likelihood
 from . import Gaussian
 class Gaussian_Mixed_Noise(likelihood):
    """
    Gaussian Likelihood for multiple outputs
    This is a wrapper around likelihood.Gaussian class
    :param data_list: data observations
    :type data_list: list of numpy arrays (num_data_output_i x 1), one array per output
    :param noise_params: noise parameters of each output
    :type noise_params: list of floats, one per output
    :param normalize:  whether to normalize the data before computing (predictions will be in original scales)
    :type normalize: False|True
    """
    def __init__(self, data_list, noise_params=None, normalize=True):
        self.Nparams = len(data_list)
        self.n_list = [data.size for data in data_list]
        self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.Nparams),self.n_list)])
        if noise_params is None:
            noise_params = [1.] * self.Nparams
        else:
            assert self.Nparams == len(noise_params), 'Number of noise parameters does not match the number of noise models.'
        self.noise_model_list = [Gaussian(Y,variance=v,normalize = normalize) for Y,v in zip(data_list,noise_params)]
        self.n_params = [noise_model._get_params().size for noise_model in self.noise_model_list]
        self.data = np.vstack(data_list)
        self.N, self.output_dim = self.data.shape
        self._offset = np.zeros((1, self.output_dim))
        self._scale = np.ones((1, self.output_dim))
        self.is_heteroscedastic = True
        self.Z = 0. # a correction factor which accounts for the approximation made
        self.set_data(data_list)
        self._set_params(np.asarray(noise_params))
    def set_data(self, data_list):
        self.data = np.vstack(data_list)
        self.N, D = self.data.shape
        assert D == self.output_dim
        self.Y = (self.data - self._offset) / self._scale
        if D > self.N:
            raise NotImplementedError
            #self.YYT = np.dot(self.Y, self.Y.T)
            #self.trYYT = np.trace(self.YYT)
            #self.YYT_factor = jitchol(self.YYT)
        else:
            self.YYT = None
            self.trYYT = np.sum(np.square(self.Y))
            self.YYT_factor = self.Y
    def predictive_values(self,mu,var,full_cov,noise_model):
        """
        Predicts the output given the GP
        :param mu: GP's mean
        :param var: GP's variance
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: False|True
        :param noise_model: noise model to use
        :type noise_model: integer
        """
        if full_cov:
            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
        return self.noise_model_list[noise_model].predictive_values(mu,var,full_cov)
    def _get_params(self):
        return np.hstack([noise_model._get_params().flatten() for noise_model in self.noise_model_list])
    def _get_param_names(self):
        if len(self.noise_model_list) == 1:
            names = self.noise_model_list[0]._get_param_names()
        else:
            names = []
            for noise_model,i in zip(self.noise_model_list,range(len(self.n_list))):
                names.append(''.join(noise_model._get_param_names() + ['_%s' %i]))
        return names
    def _set_params(self,p):
        cs_params = np.cumsum([0]+self.n_params)
        for i in range(len(self.n_params)):
            self.noise_model_list[i]._set_params(p[cs_params[i]:cs_params[i+1]])
        self.precision = np.hstack([np.repeat(noise_model.precision,n) for noise_model,n in zip(self.noise_model_list,self.n_list)])[:,None]
        self.V = self.precision * self.Y
        self.VVT_factor = self.precision * self.YYT_factor
        self.covariance_matrix = np.eye(self.N) * 1./self.precision
    def _gradients(self,partial):
        gradients = []
        aux = np.cumsum([0]+self.n_list)
        for ai,af,noise_model in zip(aux[:-1],aux[1:],self.noise_model_list):
            gradients += [noise_model._gradients(partial[ai:af])]
        return np.hstack(gradients)
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@ -1,180 +0,0 @@
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import stats
 import scipy as sp
 import pylab as pb
 from ..util.plot import gpplot
 from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import link_functions
 class LikelihoodFunction(object):
    """
    Likelihood class for doing Expectation propagation
    :param Y: observed output (Nx1 numpy.darray)
    ..Note:: Y values allowed depend on the LikelihoodFunction used
    """
    def __init__(self,link):
        if link == self._analytical:
            self.moments_match = self._moments_match_analytical
        else:
            assert isinstance(link,link_functions.LinkFunction)
            self.link = link
            self.moments_match = self._moments_match_numerical
    def _preprocess_values(self,Y):
        return Y
    def _product(self,gp,obs,mu,sigma):
        return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._distribution(gp,obs)
    def _log_product_scaled(self,gp,obs,mu,sigma):
        return -.5*(gp-mu)**2/sigma**2 + self._log_distribution_scaled(gp,obs)
    def _log_product_scaled_dgp(self,gp,obs,mu,sigma):
        return -(gp -mu)/sigma**2 + self._log_distribution_scaled_dgp(gp,obs)
    def _locate(self,obs,mu,sigma):
        """
        Golden Search to find the mode in the _product function (cavity x exact likelihood) and define a grid around it for numerical integration
        """
        lower = -1 if obs == 0 else np.array([np.log(obs),mu]).min() #Lower limit #FIXME
        upper = np.array([np.log(obs),mu]).max() #Upper limit #FIXME
        #return sp.optimize.golden(self._nlog_product, args=(obs,mu,sigma), brack=(golden_A,golden_B)) #Better to work with _nlog_product than with _product
        return sp.optimize.brent(self._nlog_product, args=(obs,mu,sigma), brack=(lower,upper)) #Better to work with _nlog_product than with _product
    def _moments_match_numerical(self,obs,tau,v):
        """
        Simpson's Rule is used to calculate the moments mumerically, it needs a grid of points as input.
        """
        mu = v/tau
        sigma = np.sqrt(1./tau)
        opt = self._locate(obs,mu,sigma)
        width = 3./np.log(max(obs,2))
        A = opt - width #Grid's lower limit
        B = opt + width #Grid's Upper limit
        K =  10*int(np.log(max(obs,150))) #Number of points in the grid
        h = (B-A)/K # length of the intervals
        grid_x = np.hstack([np.linspace(opt-width,opt,K/2+1)[1:-1], np.linspace(opt,opt+width,K/2+1)]) # grid of points (X axis)
        x = np.hstack([A,B,grid_x[range(1,K,2)],grid_x[range(2,K-1,2)]]) # grid_x rearranged, just to make Simpson's algorithm easier
        _aux1 = self._product(A,obs,mu,sigma)
        _aux2 = self._product(B,obs,mu,sigma)
        _aux3 = 4*self._product(grid_x[range(1,K,2)],obs,mu,sigma)
        _aux4 = 2*self._product(grid_x[range(2,K-1,2)],obs,mu,sigma)
        zeroth = np.hstack((_aux1,_aux2,_aux3,_aux4)) # grid of points (Y axis) rearranged
        first = zeroth*x
        second = first*x
        Z_hat = sum(zeroth)*h/3 # Zero-th moment
        mu_hat = sum(first)*h/(3*Z_hat) # First moment
        m2 = sum(second)*h/(3*Z_hat) # Second moment
        sigma2_hat = m2 - mu_hat**2 # Second central moment
        return float(Z_hat), float(mu_hat), float(sigma2_hat)
 class Binomial(LikelihoodFunction):
    """
    Probit likelihood
    Y is expected to take values in {-1,1}
    -----
    $$
    L(x) = \\Phi (Y_i*f_i)
    $$
    """
    def __init__(self,link=None):
        self._analytical = link_functions.Probit
        if not link:
            link = self._analytical
        super(Binomial, self).__init__(link)
    def _distribution(self,gp,obs):
        pass
    def _log_distribution_scaled(self,gp,obs):
        pass
    def _preprocess_values(self,Y):
        """
        Check if the values of the observations correspond to the values
        assumed by the likelihood function.
        ..Note:: Binary classification algorithm works better with classes {-1,1}
        """
        Y_prep = Y.copy()
        Y1 = Y[Y.flatten()==1].size
        Y2 = Y[Y.flatten()==0].size
        assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.'
        Y_prep[Y.flatten() == 0] = -1
        return Y_prep
    def _moments_match_analytical(self,data_i,tau_i,v_i):
        """
        Moments match of the marginal approximation in EP algorithm
        :param i: number of observation (int)
        :param tau_i: precision of the cavity distribution (float)
        :param v_i: mean/variance of the cavity distribution (float)
        """
        z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
        Z_hat = std_norm_cdf(z)
        phi = std_norm_pdf(z)
        mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
        sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
        return Z_hat, mu_hat, sigma2_hat
    def predictive_values(self,mu,var):
        """
        Compute  mean, variance and conficence interval (percentiles 5 and 95) of the  prediction
        :param mu: mean of the latent variable
        :param var: variance of the latent variable
        """
        mu = mu.flatten()
        var = var.flatten()
        mean = stats.norm.cdf(mu/np.sqrt(1+var))
        norm_025 = [stats.norm.ppf(.025,m,v) for m,v in zip(mu,var)]
        norm_975 = [stats.norm.ppf(.975,m,v) for m,v in zip(mu,var)]
        p_025 = stats.norm.cdf(norm_025/np.sqrt(1+var))
        p_975 = stats.norm.cdf(norm_975/np.sqrt(1+var))
        return mean[:,None], np.nan*var, p_025[:,None], p_975[:,None] # TODO: var
 class Poisson(LikelihoodFunction):
    """
    Poisson likelihood
    Y is expected to take values in {0,1,2,...}
    -----
    $$
    L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
    $$
    """
    def __init__(self,link=None):
        self._analytical = None
        if not link:
            link = link_functions.Log()
        super(Poisson, self).__init__(link)
    def _distribution(self,gp,obs):
        return stats.poisson.pmf(obs,self.link.inv_transf(gp))
    def _log_distribution_scaled(self,gp,obs):
        """
        Logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
        """
        return -self.link.inv_transf(gp) + obs * self.link.log_inv_transf(gp)
    def _log_distribution_scaled_dgp(self,gp,obs):
        return -self.link.inv_transf_df(gp) + obs * self.link.log_inv_transf_df(gp)
    def _log_distribution_scaled_d2gp2(self,gp,obs):
        return -self.link.inv_transf_df(gp) + obs * self.link.log_inv_transf_df(gp)
    def predictive_values(self,mu,var):
        """
        Compute  mean, and conficence interval (percentiles 5 and 95) of the  prediction
        """
        mean = self.link.transf(mu)#np.exp(mu*self.scale + self.location)
        tmp = stats.poisson.ppf(np.array([.025,.975]),mean)
        p_025 = tmp[:,0]
        p_975 = tmp[:,1]
        return mean,np.nan*mean,p_025,p_975 # better variance here TODO
--- a/GPy/likelihoods/link_functions.py
+++ b/GPy/likelihoods/link_functions.py
@ -1,62 +0,0 @@
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import stats
 import scipy as sp
 import pylab as pb
 from ..util.plot import gpplot
 from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 class LinkFunction(object):
    """
    Link function class for doing non-Gaussian likelihoods approximation
    :param Y: observed output (Nx1 numpy.darray)
    ..Note:: Y values allowed depend on the likelihood_function used
    """
    def __init__(self):
        pass
 class Probit(LinkFunction):
    """
    Probit link function
    """
    def transf(self,mu):
        pass
    def inv_transf(self,f):
        pass
    def log_inv_transf(self,f):
        pass
 class Log(LinkFunction):
    """
    Logarithm link function
    """
    def transf(self,mu):
        return np.log(mu)
    def inv_transf(self,f):
        return np.exp(f)
    def log_inv_transf(self,f):
        return f
    def inv_transf_df(sefl,f):
        return np.exp(f)
    def log_inv_transf_df(self,f):
        return 1
    def inv_transf_df(sefl,f):
        return np.exp(f)
    def log_inv_transf_df(self,f):
        return 1
--- a/GPy/likelihoods/noise_model_constructors.py
+++ b/GPy/likelihoods/noise_model_constructors.py
@ -0,0 +1,88 @@
 # Copyright (c) 2013, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 import noise_models
 def binomial(gp_link=None):
    """
    Construct a binomial likelihood
    :param gp_link: a GPy gp_link function
    """
    if gp_link is None:
        gp_link = noise_models.gp_transformations.Probit()
    #else:
    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
    if isinstance(gp_link,noise_models.gp_transformations.Probit):
        analytical_mean = True
        analytical_variance = False
    elif isinstance(gp_link,noise_models.gp_transformations.Step):
        analytical_mean = True
        analytical_variance = True
    else:
        analytical_mean = False
        analytical_variance = False
    return noise_models.binomial_noise.Binomial(gp_link,analytical_mean,analytical_variance)
 def exponential(gp_link=None):
    """
    Construct a binomial likelihood
    :param gp_link: a GPy gp_link function
    """
    if gp_link is None:
        gp_link = noise_models.gp_transformations.Identity()
    analytical_mean = False
    analytical_variance = False
    return noise_models.exponential_noise.Exponential(gp_link,analytical_mean,analytical_variance)
 def gaussian(gp_link=None,variance=1.):
    """
    Construct a gaussian likelihood
    :param gp_link: a GPy gp_link function
    :param variance: scalar
    """
    if gp_link is None:
        gp_link = noise_models.gp_transformations.Identity()
    #else:
    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
    analytical_mean = False
    analytical_variance = False
    return noise_models.gaussian_noise.Gaussian(gp_link,analytical_mean,analytical_variance,variance)
 def poisson(gp_link=None):
    """
    Construct a Poisson likelihood
    :param gp_link: a GPy gp_link function
    """
    if gp_link is None:
        gp_link = noise_models.gp_transformations.Log_ex_1()
    #else:
    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
    analytical_mean = False
    analytical_variance = False
    return noise_models.poisson_noise.Poisson(gp_link,analytical_mean,analytical_variance)
 def gamma(gp_link=None,beta=1.):
    """
    Construct a Gamma likelihood
    :param gp_link: a GPy gp_link function
    :param beta: scalar
    """
    if gp_link is None:
        gp_link = noise_models.gp_transformations.Log_ex_1()
    analytical_mean = False
    analytical_variance = False
    return noise_models.gamma_noise.Gamma(gp_link,analytical_mean,analytical_variance,beta)
--- a/GPy/likelihoods/noise_models/init.py
+++ b/GPy/likelihoods/noise_models/init.py
@ -0,0 +1,7 @@
 import noise_distributions
 import binomial_noise
 import exponential_noise
 import gaussian_noise
 import gamma_noise
 import poisson_noise
 import gp_transformations
--- a/GPy/likelihoods/noise_models/binomial_noise.py
+++ b/GPy/likelihoods/noise_models/binomial_noise.py
@ -0,0 +1,118 @@
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import stats,special
 import scipy as sp
 from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import gp_transformations
 from noise_distributions import NoiseDistribution
 class Binomial(NoiseDistribution):
    """
    Probit likelihood
    Y is expected to take values in {-1,1}
    -----
    $$
    L(x) = \\Phi (Y_i*f_i)
    $$
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
        super(Binomial, self).__init__(gp_link,analytical_mean,analytical_variance)
    def _preprocess_values(self,Y):
        """
        Check if the values of the observations correspond to the values
        assumed by the likelihood function.
        ..Note:: Binary classification algorithm works better with classes {-1,1}
        """
        Y_prep = Y.copy()
        Y1 = Y[Y.flatten()==1].size
        Y2 = Y[Y.flatten()==0].size
        assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.'
        Y_prep[Y.flatten() == 0] = -1
        return Y_prep
    def _moments_match_analytical(self,data_i,tau_i,v_i):
        """
        Moments match of the marginal approximation in EP algorithm
        :param i: number of observation (int)
        :param tau_i: precision of the cavity distribution (float)
        :param v_i: mean/variance of the cavity distribution (float)
        """
        if isinstance(self.gp_link,gp_transformations.Probit):
            z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
            Z_hat = std_norm_cdf(z)
            phi = std_norm_pdf(z)
            mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
            sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
        elif isinstance(self.gp_link,gp_transformations.Step):
            Z_hat = None
            mu_hat = None
            sigma2_hat = None
        return Z_hat, mu_hat, sigma2_hat
    def _predictive_mean_analytical(self,mu,sigma):
        return stats.norm.cdf(mu/np.sqrt(1+sigma**2))
    def _mass(self,gp,obs):
        #NOTE obs must be in {0,1}
        p = self.gp_link.transf(gp)
        return p**obs * (1.-p)**(1.-obs)
    def _nlog_mass(self,gp,obs):
        p = self.gp_link.transf(gp)
        return obs*np.log(p) + (1.-obs)*np.log(1-p)
    def _dnlog_mass_dgp(self,gp,obs):
        p = self.gp_link.transf(gp)
        dp = self.gp_link.dtransf_df(gp)
        return obs/p * dp - (1.-obs)/(1.-p) * dp
    def _d2nlog_mass_dgp2(self,gp,obs):
        p = self.gp_link.transf(gp)
        return (obs/p + (1.-obs)/(1.-p))*self.gp_link.d2transf_df2(gp) + ((1.-obs)/(1.-p)**2-obs/p**2)*self.gp_link.dtransf_df(gp)
    def _mean(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)
    def _dmean_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)
    def _d2mean_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)
    def _variance(self,gp):
        """
        Mass (or density) function
        """
        p = self.gp_link.transf(gp)
        return p*(1.-p)
    def _dvariance_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)*(1. - 2.*self.gp_link.transf(gp))
    def _d2variance_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)*(1. - 2.*self.gp_link.transf(gp)) - 2*self.gp_link.dtransf_df(gp)**2
    """
    def predictive_values(self,mu,var): #TODO remove
        mu = mu.flatten()
        var = var.flatten()
        #mean = stats.norm.cdf(mu/np.sqrt(1+var))
        mean = self._predictive_mean_analytical(mu,np.sqrt(var))
        norm_025 = [stats.norm.ppf(.025,m,v) for m,v in zip(mu,var)]
        norm_975 = [stats.norm.ppf(.975,m,v) for m,v in zip(mu,var)]
        #p_025 = stats.norm.cdf(norm_025/np.sqrt(1+var))
        #p_975 = stats.norm.cdf(norm_975/np.sqrt(1+var))
        p_025 = self._predictive_mean_analytical(norm_025,np.sqrt(var))
        p_975 = self._predictive_mean_analytical(norm_975,np.sqrt(var))
        return mean[:,None], np.nan*var, p_025[:,None], p_975[:,None] # TODO: var
    """
--- a/GPy/likelihoods/noise_models/exponential_noise.py
+++ b/GPy/likelihoods/noise_models/exponential_noise.py
@ -0,0 +1,68 @@
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import stats,special
 import scipy as sp
 from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import gp_transformations
 from noise_distributions import NoiseDistribution
 class Exponential(NoiseDistribution):
    """
    Gamma likelihood
    Y is expected to take values in {0,1,2,...}
    -----
    $$
    L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
    $$
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
        super(Exponential, self).__init__(gp_link,analytical_mean,analytical_variance)
    def _preprocess_values(self,Y):
        return Y
    def _mass(self,gp,obs):
        """
        Mass (or density) function
        """
        return np.exp(-obs/self.gp_link.transf(gp))/self.gp_link.transf(gp)
    def _nlog_mass(self,gp,obs):
        """
        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
        """
        return obs/self.gp_link.transf(gp) + np.log(self.gp_link.transf(gp))
    def _dnlog_mass_dgp(self,gp,obs):
        return ( 1./self.gp_link.transf(gp) - obs/self.gp_link.transf(gp)**2) * self.gp_link.dtransf_df(gp)
    def _d2nlog_mass_dgp2(self,gp,obs):
        fgp = self.gp_link.transf(gp)
        return (2*obs/fgp**3 - 1./fgp**2) * self.gp_link.dtransf_df(gp)**2 + ( 1./fgp - obs/fgp**2) * self.gp_link.d2transf_df2(gp)
    def _mean(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)
    def _dmean_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)
    def _d2mean_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)
    def _variance(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)**2
    def _dvariance_dgp(self,gp):
        return 2*self.gp_link.transf(gp)*self.gp_link.dtransf_df(gp)
    def _d2variance_dgp2(self,gp):
        return 2 * (self.gp_link.dtransf_df(gp)**2 + self.gp_link.transf(gp)*self.gp_link.d2transf_df2(gp))
--- a/GPy/likelihoods/noise_models/gamma_noise.py
+++ b/GPy/likelihoods/noise_models/gamma_noise.py
@ -0,0 +1,71 @@
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import stats,special
 import scipy as sp
 from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import gp_transformations
 from noise_distributions import NoiseDistribution
 class Gamma(NoiseDistribution):
    """
    Gamma likelihood
    Y is expected to take values in {0,1,2,...}
    -----
    $$
    L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
    $$
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,beta=1.):
        self.beta = beta
        super(Gamma, self).__init__(gp_link,analytical_mean,analytical_variance)
    def _preprocess_values(self,Y):
        return Y
    def _mass(self,gp,obs):
        """
        Mass (or density) function
        """
        #return stats.gamma.pdf(obs,a = self.gp_link.transf(gp)/self.variance,scale=self.variance)
        alpha = self.gp_link.transf(gp)*self.beta
        return obs**(alpha - 1.) * np.exp(-self.beta*obs) * self.beta**alpha / special.gamma(alpha)
    def _nlog_mass(self,gp,obs):
        """
        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
        """
        alpha = self.gp_link.transf(gp)*self.beta
        return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
    def _dnlog_mass_dgp(self,gp,obs):
        return -self.gp_link.dtransf_df(gp)*self.beta*np.log(obs) + special.psi(self.gp_link.transf(gp)*self.beta) * self.gp_link.dtransf_df(gp)*self.beta
    def _d2nlog_mass_dgp2(self,gp,obs):
        return -self.gp_link.d2transf_df2(gp)*self.beta*np.log(obs) + special.polygamma(1,self.gp_link.transf(gp)*self.beta)*(self.gp_link.dtransf_df(gp)*self.beta)**2 + special.psi(self.gp_link.transf(gp)*self.beta)*self.gp_link.d2transf_df2(gp)*self.beta
    def _mean(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)
    def _dmean_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)
    def _d2mean_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)
    def _variance(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)/self.beta
    def _dvariance_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)/self.beta
    def _d2variance_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)/self.beta
--- a/GPy/likelihoods/noise_models/gaussian_noise.py
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@ -0,0 +1,98 @@
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import stats,special
 import scipy as sp
 from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import gp_transformations
 from noise_distributions import NoiseDistribution
 class Gaussian(NoiseDistribution):
    """
    Gaussian likelihood
    :param mean: mean value of the Gaussian distribution
    :param variance: mean value of the Gaussian distribution
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,variance=1.):
        self.variance = variance
        super(Gaussian, self).__init__(gp_link,analytical_mean,analytical_variance)
    def _get_params(self):
        return np.array([self.variance])
    def _get_param_names(self):
        return ['noise_model_variance']
    def _set_params(self,p):
        self.variance = p
    def _gradients(self,partial):
        return np.zeros(1)
        #return np.sum(partial)
    def _preprocess_values(self,Y):
        """
        Check if the values of the observations correspond to the values
        assumed by the likelihood function.
        """
        return Y
    def _moments_match_analytical(self,data_i,tau_i,v_i):
        """
        Moments match of the marginal approximation in EP algorithm
        :param i: number of observation (int)
        :param tau_i: precision of the cavity distribution (float)
        :param v_i: mean/variance of the cavity distribution (float)
        """
        sigma2_hat = 1./(1./self.variance + tau_i)
        mu_hat = sigma2_hat*(data_i/self.variance + v_i)
        sum_var = self.variance + 1./tau_i
        Z_hat = 1./np.sqrt(2.*np.pi*sum_var)*np.exp(-.5*(data_i - v_i/tau_i)**2./sum_var)
        return Z_hat, mu_hat, sigma2_hat
    def _predictive_mean_analytical(self,mu,sigma):
        new_sigma2 = self.predictive_variance(mu,sigma)
        return new_sigma2*(mu/sigma**2 + self.gp_link.transf(mu)/self.variance)
    def _predictive_variance_analytical(self,mu,sigma,*args): #TODO *args?
        return 1./(1./self.variance + 1./sigma**2)
    def _mass(self,gp,obs):
        #return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) )
        return stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance)) #FIXME
    def _nlog_mass(self,gp,obs):
        return .5*((self.gp_link.transf(gp)-obs)**2/self.variance + np.log(2.*np.pi*self.variance))
    def _dnlog_mass_dgp(self,gp,obs):
        return (self.gp_link.transf(gp)-obs)/self.variance * self.gp_link.dtransf_df(gp)
    def _d2nlog_mass_dgp2(self,gp,obs):
        return ((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)/self.variance
    def _mean(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)
    def _dmean_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)
    def _d2mean_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)
    def _variance(self,gp):
        """
        Mass (or density) function
        """
        return self.variance
    def _dvariance_dgp(self,gp):
        return 0
    def _d2variance_dgp2(self,gp):
        return 0
--- a/GPy/likelihoods/noise_models/gp_transformations.py
+++ b/GPy/likelihoods/noise_models/gp_transformations.py
@ -0,0 +1,125 @@
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import stats
 import scipy as sp
 import pylab as pb
 from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf,inv_std_norm_cdf
 class GPTransformation(object):
    """
    Link function class for doing non-Gaussian likelihoods approximation
    :param Y: observed output (Nx1 numpy.darray)
    ..Note:: Y values allowed depend on the likelihood_function used
    """
    def __init__(self):
        pass
 class Identity(GPTransformation):
    """
    $$
    g(f) = f
    $$
    """
    #def transf(self,mu):
    #    return mu
    def transf(self,f):
        return f
    def dtransf_df(self,f):
        return 1.
    def d2transf_df2(self,f):
        return 0
 class Probit(GPTransformation):
    """
    $$
    g(f) = \\Phi^{-1} (mu)
    $$
    """
    #def transf(self,mu):
    #    return inv_std_norm_cdf(mu)
    def transf(self,f):
        return std_norm_cdf(f)
    def dtransf_df(self,f):
        return std_norm_pdf(f)
    def d2transf_df2(self,f):
        return -f * std_norm_pdf(f)
 class Log(GPTransformation):
    """
    $$
    g(f) = \log(\mu)
    $$
    """
    #def transf(self,mu):
    #    return np.log(mu)
    def transf(self,f):
        return np.exp(f)
    def dtransf_df(self,f):
        return np.exp(f)
    def d2transf_df2(self,f):
        return np.exp(f)
 class Log_ex_1(GPTransformation):
    """
    $$
    g(f) = \log(\exp(\mu) - 1)
    $$
    """
    #def transf(self,mu):
    #    """
    #    function: output space -> latent space
    #    """
    #    return np.log(np.exp(mu) - 1)
    def transf(self,f):
        """
        function: latent space -> output space
        """
        return np.log(1.+np.exp(f))
    def dtransf_df(self,f):
        return np.exp(f)/(1.+np.exp(f))
    def d2transf_df2(self,f):
        aux = np.exp(f)/(1.+np.exp(f))
        return aux*(1.-aux)
 class Reciprocal(GPTransformation):
    def transf(sefl,f):
        return 1./f
    def dtransf_df(self,f):
        return -1./f**2
    def d2transf_df2(self,f):
        return 2./f**3
 class Step(GPTransformation):
    """
    $$
    g(f) = I_{x \in A}
    $$
    """
    def transf(self,f):
        #transformation goes here
        return np.where(f>0, 1, -1)
    def dtransf_df(self,f):
        pass
    def d2transf_df2(self,f):
        pass
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@ -0,0 +1,384 @@
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import stats,special
 import scipy as sp
 import pylab as pb
 from GPy.util.plot import gpplot
 from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import gp_transformations
 class NoiseDistribution(object):
    """
    Likelihood class for doing Expectation propagation
    :param Y: observed output (Nx1 numpy.darray)
    ..Note:: Y values allowed depend on the LikelihoodFunction used
    """
    def __init__(self,gp_link,analytical_mean=False,analytical_variance=False):
        #assert isinstance(gp_link,gp_transformations.GPTransformation), "gp_link is not a valid GPTransformation."#FIXME
        self.gp_link = gp_link
        self.analytical_mean = analytical_mean
        self.analytical_variance = analytical_variance
        if self.analytical_mean:
            self.moments_match = self._moments_match_analytical
            self.predictive_mean = self._predictive_mean_analytical
        else:
            self.moments_match = self._moments_match_numerical
            self.predictive_mean = self._predictive_mean_numerical
        if self.analytical_variance:
            self.predictive_variance = self._predictive_variance_analytical
        else:
            self.predictive_variance = self._predictive_variance_numerical
    def _get_params(self):
        return np.zeros(0)
    def _get_param_names(self):
        return []
    def _set_params(self,p):
        pass
    def _gradients(self,partial):
        return np.zeros(0)
    def _preprocess_values(self,Y):
        """
        In case it is needed, this function assess the output values or makes any pertinent transformation on them.
        :param Y: observed output (Nx1 numpy.darray)
        """
        return Y
    def _product(self,gp,obs,mu,sigma):
        """
        Product between the cavity distribution and a likelihood factor.
        :param gp: latent variable
        :param obs: observed output
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._mass(gp,obs)
    def _nlog_product_scaled(self,gp,obs,mu,sigma):
        """
        Negative log-product between the cavity distribution and a likelihood factor.
        ..Note:: The constant term in the Gaussian distribution is ignored.
        :param gp: latent variable
        :param obs: observed output
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return .5*((gp-mu)/sigma)**2 + self._nlog_mass(gp,obs)
    def _dnlog_product_dgp(self,gp,obs,mu,sigma):
        """
        Derivative wrt latent variable of the log-product between the cavity distribution and a likelihood factor.
        :param gp: latent variable
        :param obs: observed output
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return (gp - mu)/sigma**2 + self._dnlog_mass_dgp(gp,obs)
    def _d2nlog_product_dgp2(self,gp,obs,mu,sigma):
        """
        Second derivative wrt latent variable of the log-product between the cavity distribution and a likelihood factor.
        :param gp: latent variable
        :param obs: observed output
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return 1./sigma**2 + self._d2nlog_mass_dgp2(gp,obs)
    def _product_mode(self,obs,mu,sigma):
        """
        Newton's CG method to find the mode in _product (cavity x likelihood factor).
        :param obs: observed output
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return sp.optimize.fmin_ncg(self._nlog_product_scaled,x0=mu,fprime=self._dnlog_product_dgp,fhess=self._d2nlog_product_dgp2,args=(obs,mu,sigma))
    def _moments_match_analytical(self,obs,tau,v):
        """
        If available, this function computes the moments analytically.
        """
        pass
    def _moments_match_numerical(self,obs,tau,v):
        """
        Lapace approximation to calculate the moments.
        :param obs: observed output
        :param tau: cavity distribution 1st natural parameter (precision)
        :param v: cavity distribution 2nd natural paramenter (mu*precision)
        """
        mu = v/tau
        mu_hat = self._product_mode(obs,mu,np.sqrt(1./tau))
        sigma2_hat = 1./(tau + self._d2nlog_mass_dgp2(mu_hat,obs))
        Z_hat = np.exp(-.5*tau*(mu_hat-mu)**2) * self._mass(mu_hat,obs)*np.sqrt(tau*sigma2_hat)
        return Z_hat,mu_hat,sigma2_hat
    def _nlog_conditional_mean_scaled(self,gp,mu,sigma):
        """
        Negative logarithm of the l.v.'s predictive distribution times the output's mean given the l.v.
        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        ..Note:: This function helps computing E(Y_star) = E(E(Y_star|f_star))
        """
        return .5*((gp - mu)/sigma)**2 - np.log(self._mean(gp))
    def _dnlog_conditional_mean_dgp(self,gp,mu,sigma):
        """
        Derivative of _nlog_conditional_mean_scaled wrt. l.v.
        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return (gp - mu)/sigma**2 - self._dmean_dgp(gp)/self._mean(gp)
    def _d2nlog_conditional_mean_dgp2(self,gp,mu,sigma):
        """
        Second derivative of _nlog_conditional_mean_scaled wrt. l.v.
        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return 1./sigma**2 - self._d2mean_dgp2(gp)/self._mean(gp) + (self._dmean_dgp(gp)/self._mean(gp))**2
    def _nlog_exp_conditional_variance_scaled(self,gp,mu,sigma):
        """
        Negative logarithm of the l.v.'s predictive distribution times the output's variance given the l.v.
        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        ..Note:: This function helps computing E(V(Y_star|f_star))
        """
        return .5*((gp - mu)/sigma)**2 - np.log(self._variance(gp))
    def _dnlog_exp_conditional_variance_dgp(self,gp,mu,sigma):
        """
        Derivative of _nlog_exp_conditional_variance_scaled wrt. l.v.
        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return (gp - mu)/sigma**2 - self._dvariance_dgp(gp)/self._variance(gp)
    def _d2nlog_exp_conditional_variance_dgp2(self,gp,mu,sigma):
        """
        Second derivative of _nlog_exp_conditional_variance_scaled wrt. l.v.
        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return 1./sigma**2 - self._d2variance_dgp2(gp)/self._variance(gp) + (self._dvariance_dgp(gp)/self._variance(gp))**2
    def _nlog_exp_conditional_mean_sq_scaled(self,gp,mu,sigma):
        """
        Negative logarithm of the l.v.'s predictive distribution times the output's mean squared given the l.v.
        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        ..Note:: This function helps computing E( E(Y_star|f_star)**2 )
        """
        return .5*((gp - mu)/sigma)**2 - 2*np.log(self._mean(gp))
    def _dnlog_exp_conditional_mean_sq_dgp(self,gp,mu,sigma):
        """
        Derivative of _nlog_exp_conditional_mean_sq_scaled wrt. l.v.
        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return (gp - mu)/sigma**2 - 2*self._dmean_dgp(gp)/self._mean(gp)
    def _d2nlog_exp_conditional_mean_sq_dgp2(self,gp,mu,sigma):
        """
        Second derivative of _nlog_exp_conditional_mean_sq_scaled wrt. l.v.
        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        return 1./sigma**2 - 2*( self._d2mean_dgp2(gp)/self._mean(gp) - (self._dmean_dgp(gp)/self._mean(gp))**2 )
    def _predictive_mean_analytical(self,mu,sigma):
        """
        If available, this function computes the predictive mean analytically.
        """
        pass
    def _predictive_variance_analytical(self,mu,sigma):
        """
        If available, this function computes the predictive variance analytically.
        """
        pass
    def _predictive_mean_numerical(self,mu,sigma):
        """
        Laplace approximation to the predictive mean: E(Y_star) = E( E(Y_star|f_star) )
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        maximum = sp.optimize.fmin_ncg(self._nlog_conditional_mean_scaled,x0=self._mean(mu),fprime=self._dnlog_conditional_mean_dgp,fhess=self._d2nlog_conditional_mean_dgp2,args=(mu,sigma))
        mean = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma))*sigma)
        """
        pb.figure()
        x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)])
        f = np.array([np.exp(-self._nlog_conditional_mean_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x])
        pb.plot(x,f,'b-')
        sigma2 = 1./self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma)
        f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2)
        k = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2)
        pb.plot(x,f2*mean,'r-')
        pb.vlines(maximum,0,f.max())
        """
        return mean
    def _predictive_mean_sq(self,mu,sigma):
        """
        Laplace approximation to the predictive mean squared: E(Y_star**2) = E( E(Y_star|f_star)**2 )
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        """
        maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_mean_sq_scaled,x0=self._mean(mu),fprime=self._dnlog_exp_conditional_mean_sq_dgp,fhess=self._d2nlog_exp_conditional_mean_sq_dgp2,args=(mu,sigma))
        mean_squared = np.exp(-self._nlog_exp_conditional_mean_sq_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_mean_sq_dgp2(maximum,mu,sigma))*sigma)
        return mean_squared
    def _predictive_variance_numerical(self,mu,sigma,predictive_mean=None):
        """
        Laplace approximation to the predictive variance: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
        """
        # E( V(Y_star|f_star) )
        maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_variance_scaled,x0=self._variance(mu),fprime=self._dnlog_exp_conditional_variance_dgp,fhess=self._d2nlog_exp_conditional_variance_dgp2,args=(mu,sigma))
        exp_var = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma))*sigma)
        """
        pb.figure()
        x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)])
        f = np.array([np.exp(-self._nlog_exp_conditional_variance_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x])
        pb.plot(x,f,'b-')
        sigma2 = 1./self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma)
        f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2)
        k = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2)
        pb.plot(x,f2*exp_var,'r--')
        pb.vlines(maximum,0,f.max())
        """
        #V( E(Y_star|f_star) ) =  E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star)**2 )
        exp_exp2 = self._predictive_mean_sq(mu,sigma)
        if predictive_mean is None:
            predictive_mean = self.predictive_mean(mu,sigma)
        var_exp = exp_exp2 - predictive_mean**2
        return exp_var + var_exp
    def _predictive_percentiles(self,p,mu,sigma):
        """
        Percentiles of the predictive distribution
        :parm p: lower tail probability
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
        """
        qf = stats.norm.ppf(p,mu,sigma)
        return self.gp_link.transf(qf)
    def _nlog_joint_predictive_scaled(self,x,mu,sigma):
        """
        Negative logarithm of the joint predictive distribution (latent variable and output).
        :param x: tuple (latent variable,output)
        :param mu: latent variable's predictive mean
        :param sigma: latent variable's predictive standard deviation
        """
        return self._nlog_product_scaled(x[0],x[1],mu,sigma)
    def _gradient_nlog_joint_predictive(self,x,mu,sigma):
        """
        Gradient of _nlog_joint_predictive_scaled.
        :param x: tuple (latent variable,output)
        :param mu: latent variable's predictive mean
        :param sigma: latent variable's predictive standard deviation
        ..Note: Only avilable when the output is continuous
        """
        assert not self.discrete, "Gradient not available for discrete outputs."
        return np.array((self._dnlog_product_dgp(gp=x[0],obs=x[1],mu=mu,sigma=sigma),self._dnlog_mass_dobs(obs=x[1],gp=x[0])))
    def _hessian_nlog_joint_predictive(self,x,mu,sigma):
        """
        Hessian of _nlog_joint_predictive_scaled.
        :param x: tuple (latent variable,output)
        :param mu: latent variable's predictive mean
        :param sigma: latent variable's predictive standard deviation
        ..Note: Only avilable when the output is continuous
        """
        assert not self.discrete, "Hessian not available for discrete outputs."
        cross_derivative = self._d2nlog_mass_dcross(gp=x[0],obs=x[1])
        return np.array((self._d2nlog_product_dgp2(gp=x[0],obs=x[1],mu=mu,sigma=sigma),cross_derivative,cross_derivative,self._d2nlog_mass_dobs2(obs=x[1],gp=x[0]))).reshape(2,2)
    def _joint_predictive_mode(self,mu,sigma):
        """
        Negative logarithm of the joint predictive distribution (latent variable and output).
        :param x: tuple (latent variable,output)
        :param mu: latent variable's predictive mean
        :param sigma: latent variable's predictive standard deviation
        """
        return sp.optimize.fmin_ncg(self._nlog_joint_predictive_scaled,x0=(mu,self.gp_link.transf(mu)),fprime=self._gradient_nlog_joint_predictive,fhess=self._hessian_nlog_joint_predictive,args=(mu,sigma))
    def predictive_values(self,mu,var):
        """
        Compute  mean, variance and conficence interval (percentiles 5 and 95) of the  prediction
        :param mu: mean of the latent variable
        :param var: variance of the latent variable
        """
        if isinstance(mu,float) or isinstance(mu,int):
            mu = [mu]
            var = [var]
        pred_mean = []
        pred_var = []
        q1 = []
        q3 = []
        for m,s in zip(mu,np.sqrt(var)):
            pred_mean.append(self.predictive_mean(m,s))
            pred_var.append(self.predictive_variance(m,s,pred_mean[-1]))
            q1.append(self._predictive_percentiles(.025,m,s))
            q3.append(self._predictive_percentiles(.975,m,s))
        pred_mean = np.vstack(pred_mean)
        pred_var = np.vstack(pred_var)
        q1 = np.vstack(q1)
        q3 = np.vstack(q3)
        return pred_mean, pred_var, q1, q3
--- a/GPy/likelihoods/noise_models/poisson_noise.py
+++ b/GPy/likelihoods/noise_models/poisson_noise.py
@ -0,0 +1,85 @@
 # Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import stats,special
 import scipy as sp
 from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import gp_transformations
 from noise_distributions import NoiseDistribution
 class Poisson(NoiseDistribution):
    """
    Poisson likelihood
    Y is expected to take values in {0,1,2,...}
    -----
    $$
    L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
    $$
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
        #self.discrete = True
        #self.support_limits = (0,np.inf)
        #self.analytical_mean = False
        super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance)
    def _preprocess_values(self,Y): #TODO
        #self.scale = .5*Y.max()
        #self.shift = Y.mean()
        return Y #(Y - self.shift)/self.scale
    def _mass(self,gp,obs):
        """
        Mass (or density) function
        """
        #obs = obs*self.scale + self.shift
        return stats.poisson.pmf(obs,self.gp_link.transf(gp))
    def _nlog_mass(self,gp,obs):
        """
        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
        """
        return self.gp_link.transf(gp) - obs * np.log(self.gp_link.transf(gp)) + np.log(special.gamma(obs+1))
    def _dnlog_mass_dgp(self,gp,obs):
        return self.gp_link.dtransf_df(gp) * (1. - obs/self.gp_link.transf(gp))
    def _d2nlog_mass_dgp2(self,gp,obs):
        d2_df = self.gp_link.d2transf_df2(gp)
        transf = self.gp_link.transf(gp)
        return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
    def _dnlog_mass_dobs(self,obs,gp): #TODO not needed
        return special.psi(obs+1) -  np.log(self.gp_link.transf(gp))
    def _d2nlog_mass_dobs2(self,obs,gp=None): #TODO not needed
        return special.polygamma(1,obs)
    def _d2nlog_mass_dcross(self,obs,gp): #TODO not needed
        return -self.gp_link.dtransf_df(gp)/self.gp_link.transf(gp)
    def _mean(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)
    def _dmean_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)
    def _d2mean_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)
    def _variance(self,gp):
        """
        Mass (or density) function
        """
        return self.gp_link.transf(gp)
    def _dvariance_dgp(self,gp):
        return self.gp_link.dtransf_df(gp)
    def _d2variance_dgp2(self,gp):
        return self.gp_link.d2transf_df2(gp)
--- a/GPy/models/init.py
+++ b/GPy/models/init.py
@ -14,3 +14,5 @@ from warped_gp import WarpedGP
 from bayesian_gplvm import BayesianGPLVM
 from mrd import MRD
 from gradient_checker import GradientChecker
 from gp_multioutput_regression import GPMultioutputRegression
 from sparse_gp_multioutput_regression import SparseGPMultioutputRegression
--- a/GPy/models/fitc_classification.py
+++ b/GPy/models/fitc_classification.py
@ -31,8 +31,8 @@ class FITCClassification(FITC):
            kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3)
        if likelihood is None:
-            distribution = likelihoods.likelihood_functions.Binomial()
+            noise_model = likelihoods.binomial()
-            likelihood = likelihoods.EP(Y, distribution)
+            likelihood = likelihoods.EP(Y, noise_model)
        elif Y is not None:
            if not all(Y.flatten() == likelihood.data.flatten()):
                raise Warning, 'likelihood.data and Y are different.'
--- a/GPy/models/gp_classification.py
+++ b/GPy/models/gp_classification.py
@ -31,8 +31,8 @@ class GPClassification(GP):
            kernel = kern.rbf(X.shape[1])
        if likelihood is None:
-            distribution = likelihoods.likelihood_functions.Binomial()
+            noise_model = likelihoods.binomial()
-            likelihood = likelihoods.EP(Y, distribution)
+            likelihood = likelihoods.EP(Y, noise_model)
        elif Y is not None:
            if not all(Y.flatten() == likelihood.data.flatten()):
                raise Warning, 'likelihood.data and Y are different.'
--- a/GPy/models/gp_multioutput_regression.py
+++ b/GPy/models/gp_multioutput_regression.py
@ -0,0 +1,59 @@
 # Copyright (c) 2013, Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from ..core import GP
 from .. import likelihoods
 from .. import kern
 from ..util import multioutput
 class GPMultioutputRegression(GP):
    """
    Multiple output Gaussian process with Gaussian noise
    This is a wrapper around the models.GP class, with a set of sensible defaults
    :param X_list: input observations
    :type X_list: list of numpy arrays (num_data_output_i x input_dim), one array per output
    :param Y_list: observed values
    :type Y_list: list of numpy arrays (num_data_output_i x 1), one array per output
    :param kernel_list: GPy kernels, defaults to rbf
    :type kernel_list: list of GPy kernels
    :param noise_variance_list: noise parameters per output, defaults to 1.0 for every output
    :type noise_variance_list: list of floats
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :param normalize_Y:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_Y: False|True
    :param W_columns: number tuples of the corregionalization parameters 'coregion_W' (see coregionalize kernel documentation)
    :type W_columns: integer
    """
    def __init__(self,X_list,Y_list,kernel_list=None,noise_variance_list=None,normalize_X=False,normalize_Y=False,W_columns=1):
        self.num_outputs = len(Y_list)
        assert len(X_list) == self.num_outputs, 'Number of outputs do not match length of inputs list.'
        #Inputs indexing
        i = 0
        index = []
        for x,y in zip(X_list,Y_list):
            assert x.shape[0] == y.shape[0]
            index.append(np.repeat(i,x.size)[:,None])
            i += 1
        index = np.vstack(index)
        X = np.hstack([np.vstack(X_list),index])
        original_dim = X.shape[1] - 1
        #Mixed noise likelihood definition
        likelihood = likelihoods.Gaussian_Mixed_Noise(Y_list,noise_params=noise_variance_list,normalize=normalize_Y)
        #Coregionalization kernel definition
        if kernel_list is None:
            kernel_list = [[kern.rbf(original_dim)],[]]
        mkernel = multioutput.build_lcm(input_dim=original_dim, num_outputs=self.num_outputs, CK = kernel_list[0], NC = kernel_list[1], W_columns=W_columns)
        self.multioutput = True
        GP.__init__(self, X, likelihood, mkernel, normalize_X=normalize_X)
        self.ensure_default_constraints()
--- a/GPy/models/sparse_gp_classification.py
+++ b/GPy/models/sparse_gp_classification.py
@ -28,11 +28,11 @@ class SparseGPClassification(SparseGP):
    def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10):
        if kernel is None:
-            kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1], 1e-3)
+            kernel = kern.rbf(X.shape[1])# + kern.white(X.shape[1],1e-3)
        if likelihood is None:
-            distribution = likelihoods.likelihood_functions.Binomial()
+            noise_model = likelihoods.binomial()
-            likelihood = likelihoods.EP(Y, distribution)
+            likelihood = likelihoods.EP(Y, noise_model)
        elif Y is not None:
            if not all(Y.flatten() == likelihood.data.flatten()):
                raise Warning, 'likelihood.data and Y are different.'
--- a/GPy/models/sparse_gp_multioutput_regression.py
+++ b/GPy/models/sparse_gp_multioutput_regression.py
@ -0,0 +1,80 @@
 # Copyright (c) 2013, Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from ..core import SparseGP
 from .. import likelihoods
 from .. import kern
 from ..util import multioutput
 class SparseGPMultioutputRegression(SparseGP):
    """
    Sparse multiple output Gaussian process with Gaussian noise
    This is a wrapper around the models.SparseGP class, with a set of sensible defaults
    :param X_list: input observations
    :type X_list: list of numpy arrays (num_data_output_i x input_dim), one array per output
    :param Y_list: observed values
    :type Y_list: list of numpy arrays (num_data_output_i x 1), one array per output
    :param kernel_list: GPy kernels, defaults to rbf
    :type kernel_list: list of GPy kernels
    :param noise_variance_list: noise parameters per output, defaults to 1.0 for every output
    :type noise_variance_list: list of floats
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :param normalize_Y:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_Y: False|True
    :param Z_list: inducing inputs (optional)
    :type Z_list: list of numpy arrays (num_inducing_output_i x input_dim), one array per output | empty list
    :param num_inducing: number of inducing inputs per output, defaults to 10 (ignored if Z_list is not empty)
    :type num_inducing: integer
    :param W_columns: number tuples of the corregionalization parameters 'coregion_W' (see coregionalize kernel documentation)
    :type W_columns: integer
    """
    #NOTE not tested with uncertain inputs
    def __init__(self,X_list,Y_list,kernel_list=None,noise_variance_list=None,normalize_X=False,normalize_Y=False,Z_list=[],num_inducing=10,W_columns=1):
        self.num_outputs = len(Y_list)
        assert len(X_list) == self.num_outputs, 'Number of outputs do not match length of inputs list.'
        #Inducing inputs list
        if len(Z_list):
            assert len(Z_list) == self.num_outputs, 'Number of outputs do not match length of inducing inputs list.'
        else:
            if isinstance(num_inducing,np.int):
                num_inducing = [num_inducing] * self.num_outputs
            num_inducing = np.asarray(num_inducing)
            assert num_inducing.size == self.num_outputs, 'Number of outputs do not match length of inducing inputs list.'
            for ni,X in zip(num_inducing,X_list):
                i = np.random.permutation(X.shape[0])[:ni]
                Z_list.append(X[i].copy())
        #Inputs and inducing inputs indexing
        i = 0
        index = []
        index_z = []
        for x,y,z in zip(X_list,Y_list,Z_list):
            assert x.shape[0] == y.shape[0]
            index.append(np.repeat(i,x.size)[:,None])
            index_z.append(np.repeat(i,z.size)[:,None])
            i += 1
        index = np.vstack(index)
        index_z = np.vstack(index_z)
        X = np.hstack([np.vstack(X_list),index])
        Z = np.hstack([np.vstack(Z_list),index_z])
        original_dim = X.shape[1] - 1
        #Mixed noise likelihood definition
        likelihood = likelihoods.Gaussian_Mixed_Noise(Y_list,noise_params=noise_variance_list,normalize=normalize_Y)
        #Coregionalization kernel definition
        if kernel_list is None:
            kernel_list = [[kern.rbf(original_dim)],[]]
        mkernel = multioutput.build_lcm(input_dim=original_dim, num_outputs=self.num_outputs, CK = kernel_list[0], NC = kernel_list[1], W_columns=W_columns)
        self.multioutput = True
        SparseGP.__init__(self, X, likelihood, mkernel, Z=Z, normalize_X=normalize_X)
        self.constrain_fixed('.*iip_\d+_1')
        self.ensure_default_constraints()
--- a/GPy/models/sparse_gp_regression.py
+++ b/GPy/models/sparse_gp_regression.py
@ -20,7 +20,11 @@ class SparseGPRegression(SparseGP):
    :type normalize_X: False|True
    :param normalize_Y:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_Y: False|True
    :param Z: inducing inputs (optional, see note)
    :type Z: np.ndarray (num_inducing x input_dim) | None
    :rtype: model object
    :param X_variance: The uncertainty in the measurements of X (Gaussian variance)
    :type X_variance: np.ndarray (num_data x input_dim) | None
    .. Note:: Multiple independent outputs are allowed using columns of Y
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@ -199,10 +199,7 @@ class GradientTests(unittest.TestCase):
        X = np.hstack([np.random.normal(5, 2, N / 2), np.random.normal(10, 2, N / 2)])[:, None]
        Y = np.hstack([np.ones(N / 2), np.zeros(N / 2)])[:, None]
        kernel = GPy.kern.rbf(1)
-        distribution = GPy.likelihoods.likelihood_functions.Binomial()
+        m = GPy.models.GPClassification(X,Y,kernel=kernel)
        likelihood = GPy.likelihoods.EP(Y, distribution)
        m = GPy.core.GP(X, likelihood, kernel)
        m.ensure_default_constraints()
        m.update_likelihood_approximation()
        self.assertTrue(m.checkgrad())
@ -212,10 +209,11 @@ class GradientTests(unittest.TestCase):
        Y = np.hstack([np.ones(N / 2), np.zeros(N / 2)])[:, None]
        Z = np.linspace(0, 15, 4)[:, None]
        kernel = GPy.kern.rbf(1)
-        distribution = GPy.likelihoods.likelihood_functions.Binomial()
+        m = GPy.models.SparseGPClassification(X,Y,kernel=kernel,Z=Z)
-        likelihood = GPy.likelihoods.EP(Y, distribution)
+        #distribution = GPy.likelihoods.likelihood_functions.Binomial()
-        m = GPy.core.SparseGP(X, likelihood, kernel, Z)
+        #likelihood = GPy.likelihoods.EP(Y, distribution)
-        m.ensure_default_constraints()
+        #m = GPy.core.SparseGP(X, likelihood, kernel, Z)
        #m.ensure_default_constraints()
        m.update_likelihood_approximation()
        self.assertTrue(m.checkgrad())
@ -224,7 +222,7 @@ class GradientTests(unittest.TestCase):
        X = np.hstack([np.random.rand(N / 2) + 1, np.random.rand(N / 2) - 1])[:, None]
        k = GPy.kern.rbf(1) + GPy.kern.white(1)
        Y = np.hstack([np.ones(N/2),np.zeros(N/2)])[:,None]
-        m = GPy.models.FITCClassification(X, Y=Y)
+        m = GPy.models.FITCClassification(X, Y, kernel = k)
        m.update_likelihood_approximation()
        self.assertTrue(m.checkgrad())
--- a/GPy/util/init.py
+++ b/GPy/util/init.py
@ -14,3 +14,4 @@ import visualize
 import decorators
 import classification
 import latent_space_visualizations
 import multioutput
--- a/GPy/util/multioutput.py
+++ b/GPy/util/multioutput.py
@ -0,0 +1,35 @@
 import numpy as np
 import warnings
 from .. import kern
 def build_lcm(input_dim, num_outputs, CK = [], NC = [], W_columns=1,W=None,kappa=None):
    #TODO build_icm or build_lcm
    """
    Builds a kernel for a linear coregionalization model
    :input_dim: Input dimensionality
    :num_outputs: Number of outputs
    :param CK: List of coregionalized kernels (i.e., this will be multiplied by a coregionalise kernel).
    :param K: List of kernels that will be added up together with CK, but won't be multiplied by a coregionalise kernel
    :param W_columns: number tuples of the corregionalization parameters 'coregion_W'
    :type W_columns: integer
    """
    for k in CK:
        if k.input_dim <> input_dim:
            k.input_dim = input_dim
            warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
    for k in NC:
        if k.input_dim <> input_dim + 1:
            k.input_dim = input_dim + 1
            warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
    kernel = CK[0].prod(kern.coregionalise(num_outputs,W_columns,W,kappa),tensor=True)
    for k in CK[1:]:
        k_coreg = kern.coregionalise(num_outputs,W_columns,W,kappa)
        kernel += k.prod(k_coreg,tensor=True)
    for k in NC:
        kernel += k
    return kernel
--- a/GPy/util/univariate_Gaussian.py
+++ b/GPy/util/univariate_Gaussian.py
@ -32,4 +32,15 @@ def std_norm_cdf(x):
    x = float(x)
    return weave.inline(code,arg_names=['x'],support_code=support_code)
 def inv_std_norm_cdf(x):
    """
    Inverse cumulative standard Gaussian distribution
    Based on Winitzki, S. (2008)
    """
    z = 2*x -1
    ln1z2 = np.log(1-z**2)
    a = 8*(np.pi -3)/(3*np.pi*(4-np.pi))
    b = 2/(np.pi * a) + ln1z2/2
    inv_erf = np.sign(z) * np.sqrt( np.sqrt(b**2 - ln1z2/a) - b )
    return np.sqrt(2) * inv_erf