diff --git a/GPy/core/fitc.py b/GPy/core/fitc.py
index 7a87b98d..eac00fec 100644
--- a/GPy/core/fitc.py
+++ b/GPy/core/fitc.py
@@ -140,7 +140,6 @@ class FITC(SparseGP):
         dA_dnoise = 0.5 * self.input_dim * (dbstar_dnoise/self.beta_star).sum() - 0.5 * self.input_dim * np.sum(self.likelihood.Y**2 * dbstar_dnoise)
         dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T)
-        dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T)
         dD_dnoise_1 = mdot(self.V_star*LBiLmipsi1.T,LBiLmipsi1*dbstar_dnoise.T*self.likelihood.Y.T)
         alpha = mdot(LBiLmipsi1,self.V_star)
diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 278ddc74..68d4376e 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -19,9 +19,6 @@ class GP(GPBase):
     :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
     :type normalize_X: False|True
     :rtype: model object
-    :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
-    :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
-    :type powerep: list

     .. Note:: Multiple independent outputs are allowed using columns of Y

@@ -105,7 +102,12 @@ class GP(GPBase):

         Note, we use the chain rule: dL_dtheta = dL_dK * dK_dtheta
         """
-        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+        # EP likelihoods now expose their noise-model gradients through the same
+        # _gradients interface, so a single code path covers all likelihoods.
+        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X),
+                          self.likelihood._gradients(partial=np.diag(self.dL_dK))))
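The chain rule in that docstring reduces, for each kernel parameter, to an elementwise product followed by a sum. A minimal numpy sketch of the same computation, with toy matrices standing in for GPy's dL_dK and dK_dtheta:

import numpy as np

N = 4
rng = np.random.RandomState(0)
dL_dK = rng.randn(N, N)      # gradient of the log marginal w.r.t. the kernel matrix
dK_dtheta = rng.randn(N, N)  # gradient of the kernel matrix w.r.t. one parameter theta

# dL/dtheta = sum_ij (dL/dK)_ij * (dK/dtheta)_ij
dL_dtheta = np.sum(dL_dK * dK_dtheta)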
    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
        """
@@ -136,7 +138,7 @@ class GP(GPBase):
        :type Xnew: np.ndarray, Nnew x self.input_dim
        :param which_parts: specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the folll covariance matrix, or just the diagonal
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: bool
        :rtype: posterior mean, a Numpy array, Nnew x self.input_dim
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
@@ -153,5 +155,71 @@ class GP(GPBase):

        # now push through likelihood
        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
-        return mean, var, _025pm, _975pm
+        return mean, var, _025pm, _975pm
+
+    def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
+        """
+        For a specific output, predict the function at the new point(s) Xnew.
+
+        Arguments
+        ---------
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.input_dim
+        :param output: output to predict
+        :type output: integer in {0,..., num_outputs-1}
+        :param which_parts: specifies which outputs kernel(s) to use in prediction
+        :type which_parts: ('all', list of bools)
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
+        :type full_cov: bool
+        :rtype: posterior mean, a Numpy array, Nnew x self.input_dim
+        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
+        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
+
+        .. Note:: For multiple output models only
+        """
+        assert hasattr(self,'multioutput')
+        # append an index column identifying the requested output
+        index = np.ones_like(Xnew)*output
+        Xnew = np.hstack((Xnew,index))
+
+        # normalize X values
+        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
+        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
+
+        # now push through likelihood
+        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, noise_model=output)
+        return mean, var, _025pm, _975pm
+
+    def _raw_predict_single_output(self, _Xnew, output=0, which_parts='all', full_cov=False, stop=False):
+        """
+        Internal helper for making predictions for a specific output;
+        does not account for normalization or the likelihood.
+
+        :param _Xnew: The points at which to make a prediction
+        :type _Xnew: np.ndarray, Nnew x self.input_dim
+        :param output: output to predict
+        :type output: integer in {0,..., num_outputs-1}
+        :param which_parts: specifies which outputs kernel(s) to use in prediction
+        :type which_parts: ('all', list of bools)
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
+
+        .. Note:: For multiple output models only
+        """
+        assert hasattr(self,'multioutput')
+
+        # creates an index column and appends it to _Xnew
+        index = np.ones_like(_Xnew)*output
+        _Xnew = np.hstack((_Xnew,index))
+
+        Kx = self.kern.K(_Xnew,self.X,which_parts=which_parts).T
+        KiKx, _ = dpotrs(self.L, np.asfortranarray(Kx), lower=1)
+        mu = np.dot(KiKx.T, self.likelihood.Y)
+        if full_cov:
+            Kxx = self.kern.K(_Xnew, which_parts=which_parts)
+            var = Kxx - np.dot(KiKx.T, Kx)
+        else:
+            Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
+            var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
+            var = var[:, None]
+        if stop:
+            raise RuntimeError("prediction stopped for debugging")
+        return mu, var
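The Cholesky-based prediction in _raw_predict_single_output is the standard GP posterior; as a self-contained reference, the same computation in plain numpy with a toy RBF kernel (in place of GPy's dpotrs wrapper):

import numpy as np

def rbf(A, B, ell=1.0, sf2=1.0):
    # squared-exponential kernel between the rows of A and B
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return sf2 * np.exp(-0.5 * d2 / ell ** 2)

X = np.linspace(0, 5, 20)[:, None]
Y = np.sin(X)
K = rbf(X, X) + 1e-2 * np.eye(len(X))   # kernel matrix plus noise
L = np.linalg.cholesky(K)

Xnew = np.linspace(0, 5, 50)[:, None]
Kx = rbf(X, Xnew)
KiKx = np.linalg.solve(L.T, np.linalg.solve(L, Kx))  # K^{-1} Kx

mu = np.dot(KiKx.T, Y)  # posterior mean
var = rbf(Xnew, Xnew).diagonal()[:, None] - (KiKx * Kx).sum(0)[:, None]  # posterior variance (diagonal)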
diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index 6935fc6f..4513ddac 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -57,34 +57,30 @@ class GPBase(Model):
        self.X = state.pop()
        Model.setstate(self, state)

-    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
+    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None, output=None):
        """
-        Plot the GP's view of the world, where the data is normalized and the
-        likelihood is Gaussian.
+        Plot the GP's view of the world, where the data is normalized and the
+        likelihood is Gaussian.
+
+        - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
+        - In two dimensions, a contour-plot shows the mean predicted function.
+        - Not implemented in higher dimensions.

-        Plot the posterior of the GP.
-        - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
-        - In two dimsensions, a contour-plot shows the mean predicted function
-        - In higher dimensions, we've no implemented this yet !TODO!
-
-        Can plot only part of the data and part of the posterior functions
-        using which_data and which_functions
-
-        :param samples: the number of a posteriori samples to plot
-        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
-        :param which_data: which if the training data to plot (default all)
-        :type which_data: 'all' or a slice object to slice self.X, self.Y
-        :param which_parts: which of the kernel functions to plot (additively)
-        :type which_parts: 'all', or list of bools
-        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
-        :type resolution: int
-        :param full_cov:
-        :type full_cov: bool
-        :param fignum: figure to plot on.
-        :type fignum: figure number
-        :param ax: axes to plot on.
-        :type ax: axes handle
+        :param samples: the number of a posteriori samples to plot
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
+        :param which_data: which of the training data to plot (default all)
+        :type which_data: 'all' or a slice object to slice self.X, self.Y
+        :param which_parts: which of the kernel functions to plot (additively)
+        :type which_parts: 'all', or list of bools
+        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+        :type resolution: int
+        :param full_cov: whether posterior samples should use the full covariance
+        :type full_cov: bool
+        :param fignum: figure to plot on.
+        :type fignum: figure number
+        :param ax: axes to plot on.
+        :type ax: axes handle
+        :param output: which output to plot (for multiple output models only)
+        :type output: integer (first output is 0)
        """
        if which_data == 'all':
            which_data = slice(None)
@@ -93,7 +89,7 @@ class GPBase(Model):
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)

-        if self.X.shape[1] == 1:
+        if self.X.shape[1] == 1 and not hasattr(self,'multioutput'):
            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
            if samples == 0:
                m, v = self._raw_predict(Xnew, which_parts=which_parts)
@@ -111,7 +107,7 @@ class GPBase(Model):
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            ax.set_ylim(ymin, ymax)

-        elif self.X.shape[1] == 2:
+        elif self.X.shape[1] == 2 and not hasattr(self,'multioutput'):
            resolution = resolution or 50
            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
            m, v = self._raw_predict(Xnew, which_parts=which_parts)
@@ -120,17 +116,51 @@ class GPBase(Model):
            ax.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max()) # @UndefinedVariable
            ax.set_xlim(xmin[0], xmax[0])
            ax.set_ylim(xmin[1], xmax[1])
+
+        elif self.X.shape[1] == 2 and hasattr(self,'multioutput'):
+            # the last column of X is the output index; the first output is 0
+            assert self.num_outputs > output, 'The model has only %s outputs.' % self.num_outputs
+            Xu = self.X[self.X[:,-1]==output, 0:1]
+            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
+
+            if samples == 0:
+                m, v = self._raw_predict_single_output(Xnew, output=output, which_parts=which_parts)
+                gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax)
+                ax.plot(Xu[which_data], self.likelihood.Y[self.likelihood.index==output][:,None], 'kx', mew=1.5)
+            else:
+                m, v = self._raw_predict_single_output(Xnew, output=output, which_parts=which_parts, full_cov=True)
+                Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
+                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None], axes=ax)
+                for i in range(samples):
+                    ax.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
+            ax.set_xlim(xmin, xmax)
+            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
+            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+            ax.set_ylim(ymin, ymax)
+
+            if hasattr(self,'Z'):
+                # NOTE: assumes the index column is unaffected by normalization
+                Zu = self.Z[self.Z[:,-1]==output, :]  # inducing inputs for this output
+                Zu = Zu[:, 0:1]                       # drop the index column
+                ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
+
+        elif self.X.shape[1] == 3 and hasattr(self,'multioutput'):
+            raise NotImplementedError, "Plots not implemented for multioutput models with 2D inputs...yet"
+
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

-    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
+    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, output=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'], fillcol=Tango.colorsHex['lightBlue']):
        """
        Plot the GP with noise where the likelihood is Gaussian.

        Plot the posterior of the GP.
        - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
        - In two dimensions, a contour-plot shows the mean predicted function
-        - In higher dimensions, we've no implemented this yet !TODO!
+        - Not implemented in higher dimensions

        Can plot only part of the data and part of the posterior functions
        using which_data and which_functions
@@ -151,15 +181,13 @@ class GPBase(Model):
        :type fignum: figure number
        :param ax: axes to plot on.
        :type ax: axes handle
+        :param output: which output to plot (for multiple output models only)
+        :type output: integer (first output is 0)
        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
        :type fixed_inputs: a list of tuples
        :param linecol: color of line to plot.
        :type linecol: color specification
        :param fillcol: color of fill
-        :type fillcol:
-        :param levels: for 2D plotting, the number of contour levels to use
-        is ax is None, create a new figure
-
+        :type fillcol: color specification
+        :param levels: for 2D plotting, the number of contour levels to use
+
+        If ax is None, a new figure is created.
        """
        # TODO include samples
        if which_data == 'all':
            which_data = slice(None)

        if ax is None:
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)

-        plotdims = self.input_dim - len(fixed_inputs)
+        if not hasattr(self,'multioutput'):

-        if plotdims == 1:
+            plotdims = self.input_dim - len(fixed_inputs)
+            if plotdims == 1:
+                resolution = resolution or 200

-            Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
+                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now

-            fixed_dims = np.array([i for i,v in fixed_inputs])
-            freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
+                fixed_dims = np.array([i for i,v in fixed_inputs])
+                freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)

-            Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
-            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
-            Xgrid[:,freedim] = Xnew
-            for i,v in fixed_inputs:
-                Xgrid[:,i] = v
+                Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
+                Xgrid = np.empty((Xnew.shape[0],self.input_dim))
+                Xgrid[:,freedim] = Xnew
+                for i,v in fixed_inputs:
+                    Xgrid[:,i] = v

-            m, _, lower, upper = self.predict(Xgrid, which_parts=which_parts)
-            for d in range(m.shape[1]):
-                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
-                ax.plot(Xu[which_data,freedim], self.likelihood.data[which_data, d], 'kx', mew=1.5)
-            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            ax.set_xlim(xmin, xmax)
-            ax.set_ylim(ymin, ymax)
+                m, _, lower, upper = self.predict(Xgrid, which_parts=which_parts)
+                for d in range(m.shape[1]):
+                    gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
+                    ax.plot(Xu[which_data,freedim], self.likelihood.data[which_data, d], 'kx', mew=1.5)
+                ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
+                ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+                ax.set_xlim(xmin, xmax)
+                ax.set_ylim(ymin, ymax)

-        elif self.X.shape[1] == 2: # FIXME
-            resolution = resolution or 50
-            Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
-            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
-            m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
-            m = m.reshape(resolution, resolution).T
-            ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
-            Yf = self.likelihood.data.flatten()
-            ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable
-            ax.set_xlim(xmin[0], xmax[0])
-            ax.set_ylim(xmin[1], xmax[1])
+            elif self.X.shape[1] == 2:
+                resolution = resolution or 50
+                Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
+                x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
+                m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
+                m = m.reshape(resolution, resolution).T
+                ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
+                Yf = self.likelihood.Y.flatten()
+                ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable
+                ax.set_xlim(xmin[0], xmax[0])
+                ax.set_ylim(xmin[1], xmax[1])
+
+            else:
+                raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+            assert self.num_outputs > output, 'The model has only %s outputs.' % self.num_outputs
+            if self.X.shape[1] == 2:
+                resolution = resolution or 200
+                # NOTE: assumes the index column is unaffected by normalization
+                Xu = self.X[self.X[:,-1]==output, :]    # keep the output of interest
+                Xu = Xu * self._Xscale + self._Xoffset  # denormalize
+                Xu = Xu[:, 0:1]                         # get rid of the index column
+
+                Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
+                m, _, lower, upper = self.predict_single_output(Xnew, which_parts=which_parts, output=output)
+
+                for d in range(m.shape[1]):
+                    gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax)
+                    ax.plot(Xu[which_data], self.likelihood.noise_model_list[output].data, 'kx', mew=1.5)
+                ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
+                ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+                ax.set_xlim(xmin, xmax)
+                ax.set_ylim(ymin, ymax)
+
+            elif self.X.shape[1] == 3:
+                raise NotImplementedError, "Plots not yet implemented for multioutput models with 2D inputs"
+
+            else:
+                raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
diff --git a/GPy/core/model.py b/GPy/core/model.py
index 2965cf94..6832d4e0 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -458,7 +458,7 @@ class Model(Parameterized):
            numerical_gradient = (f1 - f2) / (2 * dx)
            global_ratio = (f1 - f2) / (2 * np.dot(dx, np.where(gradient==0, 1e-32, gradient)))

-            return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() - 1) < tolerance
+            return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance)
        else:
            # check the gradient of each parameter individually, and do some pretty printing
            try:
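The ratio test fixed above compares analytic gradients against central differences; the same check in isolation, on a toy objective:

import numpy as np

def f(x):
    return np.sum(x ** 3)

def grad_f(x):
    return 3 * x ** 2

x = np.random.randn(5)
step, tolerance = 1e-6, 1e-4

# central differences, one parameter at a time
numerical = np.array([(f(x + step * e) - f(x - step * e)) / (2 * step)
                      for e in np.eye(len(x))])
assert np.abs(grad_f(x) - numerical).mean() < tolerance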
""" - assert isinstance(self.likelihood, likelihoods.EP), "pseudo_EM is only available for EP likelihoods" + assert isinstance(self.likelihood, likelihoods.EP) or isinstance(self.likelihood, likelihoods.EP_Mixed_Noise), "pseudo_EM is only available for EP likelihoods" ll_change = epsilon + 1. iteration = 0 last_ll = -np.inf diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index abcdf72c..7085c2fc 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -5,7 +5,7 @@ import numpy as np import pylab as pb from ..util.linalg import mdot, jitchol, tdot, symmetrify, backsub_both_sides, chol_inv, dtrtrs, dpotrs, dpotri from scipy import linalg -from ..likelihoods import Gaussian +from ..likelihoods import Gaussian, EP,EP_Mixed_Noise from gp_base import GPBase class SparseGP(GPBase): @@ -109,7 +109,6 @@ class SparseGP(GPBase): tmp, _ = dtrtrs(self._Lm, np.asfortranarray(tmp.T), lower=1) self._A = tdot(tmp) - # factor B self.B = np.eye(self.num_inducing) + self._A self.LB = jitchol(self.B) @@ -139,6 +138,7 @@ class SparseGP(GPBase): dL_dpsi2_beta = 0.5 * backsub_both_sides(self._Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi) if self.likelihood.is_heteroscedastic: + if self.has_uncertain_inputs: self.dL_dpsi2 = self.likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :] else: @@ -160,9 +160,23 @@ class SparseGP(GPBase): # save computation here. self.partial_for_likelihood = None elif self.likelihood.is_heteroscedastic: - raise NotImplementedError, "heteroscedatic derivates not implemented" + + if self.has_uncertain_inputs: + raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented" + + else: + Lmi_psi1, nil = dtrtrs(self._Lm, np.asfortranarray(self.psi1.T), lower=1, trans=0) + _LBi_Lmi_psi1, _ = dtrtrs(self.LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0) + _Bi_Lmi_psi1, _ = dtrtrs(self.LB.T, np.asfortranarray(_LBi_Lmi_psi1), lower=1, trans=0) + + self.partial_for_likelihood = -0.5 * self.likelihood.precision + 0.5 * self.likelihood.V**2 + self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0 - np.sum(Lmi_psi1**2,0))[:,None] * self.likelihood.precision**2 + self.partial_for_likelihood += 0.5*np.sum(_Bi_Lmi_psi1*Lmi_psi1,0)[:,None]*self.likelihood.precision**2 #NOTE this term has numerical issues + self.partial_for_likelihood += -np.dot(self._LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * self.likelihood.Y * self.likelihood.precision**2 + self.partial_for_likelihood += 0.5*np.dot(self._LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * self.likelihood.precision**2 + else: - # likelihood is not heterscedatic + # likelihood is not heteroscedatic self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2 self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self._A) * self.likelihood.precision) self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self._A * self.DBi_plus_BiPBi) - self.data_fit) @@ -298,7 +312,7 @@ class SparseGP(GPBase): :type X_variance_new: np.ndarray, Nnew x self.input_dim :param which_parts: specifies which outputs kernel(s) to use in prediction :type which_parts: ('all', list of bools) - :param full_cov: whether to return the folll covariance matrix, or just the diagonal + :param full_cov: whether to return the full covariance matrix, or just the diagonal :type full_cov: bool :rtype: posterior mean, a Numpy array, 
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
@@ -322,18 +336,15 @@ class SparseGP(GPBase):

        return mean, var, _025pm, _975pm

-    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
-
+    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None, output=None):
        if ax is None:
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)
        if which_data is 'all':
            which_data = slice(None)

-        GPBase.plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, ax=ax)
-
-        # add the inducing inputs and some errorbars
-        if self.X.shape[1] == 1:
+        # pass the user's arguments through instead of resetting them to the defaults
+        GPBase.plot(self, samples=samples, plot_limits=plot_limits, which_data=which_data, which_parts=which_parts, resolution=resolution, levels=levels, ax=ax, output=output)
+
+        # add the inducing inputs and some errorbars
+        if self.X.shape[1] == 1 and not hasattr(self,'multioutput'):
            if self.has_uncertain_inputs:
                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
@@ -342,6 +353,109 @@ class SparseGP(GPBase):
            Zu = self.Z * self._Xscale + self._Xoffset
            ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)

-        elif self.X.shape[1] == 2:
+        elif self.X.shape[1] == 2 and not hasattr(self,'multioutput'):
            Zu = self.Z * self._Xscale + self._Xoffset
            ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
+
+        elif self.X.shape[1] == 2 and hasattr(self,'multioutput'):
+            # NOTE: assumes the index column is unaffected by normalization
+            if self.has_uncertain_inputs:
+                Xu = self.X[self.X[:,-1]==output, :]    # data for this output
+                Xu = Xu * self._Xscale + self._Xoffset  # denormalize
+                Xu = Xu[:, 0:1]                         # drop the index column
+                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
+                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
+                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
+
+            Zu = self.Z[self.Z[:,-1]==output, :]        # inducing inputs for this output
+            Zu = Zu * self._Xscale + self._Xoffset
+            Zu = Zu[:, 0:1]
+            ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
+
+        else:
+            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+
+    def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
+        """
+        For a specific output, predict the function at the new point(s) Xnew.
+
+        Arguments
+        ---------
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.input_dim
+        :param output: output to predict
+        :type output: integer in {0,..., num_outputs-1}
+        :param which_parts: specifies which outputs kernel(s) to use in prediction
+        :type which_parts: ('all', list of bools)
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
+        :type full_cov: bool
+        :rtype: posterior mean, a Numpy array, Nnew x self.input_dim
+        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
+        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
+
+        ..
Note:: For multiple output models only + """ + + assert hasattr(self,'multioutput') + index = np.ones_like(Xnew)*output + Xnew = np.hstack((Xnew,index)) + + # normalize X values + Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale + mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts) + + # now push through likelihood + mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, noise_model = output) + return mean, var, _025pm, _975pm + + def _raw_predict_single_output(self, _Xnew, output=0, X_variance_new=None, which_parts='all', full_cov=False,stop=False): + """ + Internal helper function for making predictions for a specific output, + does not account for normalization or likelihood + --------- + + :param Xnew: The points at which to make a prediction + :type Xnew: np.ndarray, Nnew x self.input_dim + :param output: output to predict + :type output: integer in {0,..., num_outputs-1} + :param which_parts: specifies which outputs kernel(s) to use in prediction + :type which_parts: ('all', list of bools) + :param full_cov: whether to return the full covariance matrix, or just the diagonal + + .. Note:: For multiple output models only + """ + Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work! + symmetrify(Bi) + Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi) + + if self.Cpsi1V is None: + psi1V = np.dot(self.psi1.T,self.likelihood.V) + tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0) + tmp, _ = dpotrs(self.LB, tmp, lower=1) + self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1) + + assert hasattr(self,'multioutput') + index = np.ones_like(_Xnew)*output + _Xnew = np.hstack((_Xnew,index)) + + if X_variance_new is None: + Kx = self.kern.K(self.Z, _Xnew, which_parts=which_parts) + mu = np.dot(Kx.T, self.Cpsi1V) + if full_cov: + Kxx = self.kern.K(_Xnew, which_parts=which_parts) + var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting + else: + Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts) + var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0) + else: + Kx = self.kern.psi1(self.Z, _Xnew, X_variance_new) + mu = np.dot(Kx, self.Cpsi1V) + if full_cov: + raise NotImplementedError, "TODO" + else: + Kxx = self.kern.psi0(self.Z, _Xnew, X_variance_new) + psi2 = self.kern.psi2(self.Z, _Xnew, X_variance_new) + var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1) + + return mu, var[:, None] diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 994e3b48..005b131f 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -298,7 +298,7 @@ def bgplvm_simulation(optimize='scg', return m def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw): - D1, D2, D3, N, num_inducing, Q = 30, 10, 15, 60, 3, 10 + D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5 slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) likelihood_list = [Gaussian(x, normalize=True) for x in Ylist] @@ -321,7 +321,7 @@ def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw): if optimize: print "Optimizing Model:" - m.optimize(messages=1, max_iters=8e3, max_f_eval=8e3, gtol=.1) + m.optimize(messages=1, max_iters=8e3, gtol=.1) if plot: m.plot_X_1d("MRD Latent Space 1D") m.plot_scales("MRD Scales") diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py index c7849a46..df7b92b8 100644 --- 
a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -22,8 +22,8 @@ def coregionalisation_toy2(max_iters=100):
     Y = np.vstack((Y1, Y2))

     k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
-    k2 = GPy.kern.coregionalise(2, 1)
-    k = k1**k2
+    k2 = GPy.kern.coregionalise(2, 1)
+    k = k1**k2 # equivalently: k = k1.prod(k2, tensor=True)
     m = GPy.models.GPRegression(X, Y, kernel=k)
     m.constrain_fixed('.*rbf_var', 1.) # m.constrain_positive('.*kappa')
@@ -46,32 +46,23 @@ def coregionalisation_toy(max_iters=100):
     """
     X1 = np.random.rand(50, 1) * 8
     X2 = np.random.rand(30, 1) * 5
-    index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
-    X = np.hstack((np.vstack((X1, X2)), index))
+    X = np.vstack((X1, X2))
     Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
     Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
     Y = np.vstack((Y1, Y2))

     k1 = GPy.kern.rbf(1)
-    k2 = GPy.kern.coregionalise(2, 2)
-    k = k1**k2 #k1.prod(k2, tensor=True)
-    m = GPy.models.GPRegression(X, Y, kernel=k)
+    m = GPy.models.GPMultioutputRegression(X_list=[X1,X2], Y_list=[Y1,Y2], kernel_list=[k1])
     m.constrain_fixed('.*rbf_var', 1.)
-    # m.constrain_positive('kappa')
     m.optimize(max_iters=max_iters)

-    pb.figure()
-    Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
-    Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
-    mean, var, low, up = m.predict(Xtest1)
-    GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
-    mean, var, low, up = m.predict(Xtest2)
-    GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
-    pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
-    pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
+    fig, axes = pb.subplots(2, 1)
+    m.plot(output=0, ax=axes[0])
+    m.plot(output=1, ax=axes[1])
+    axes[0].set_title('Output 0')
+    axes[1].set_title('Output 1')
     return m
-
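Internally the multioutput models stack the per-output data and append an index column, exactly the layout the removed lines built by hand:

import numpy as np

X1 = np.random.rand(50, 1) * 8   # inputs for output 0
X2 = np.random.rand(30, 1) * 5   # inputs for output 1

index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
X = np.hstack((np.vstack((X1, X2)), index))
# X[:, 0] holds the input value, X[:, 1] the output index (0 or 1);
# the coregionalise kernel part operates on that index column.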
 def coregionalisation_sparse(max_iters=100):
     """
     A simple demonstration of coregionalisation on two sinusoidal functions using sparse approximations.
     """
@@ -86,31 +77,39 @@ def coregionalisation_sparse(max_iters=100):
     num_inducing = 40
     Z = np.hstack((np.random.rand(num_inducing, 1) * 8, np.random.randint(0, 2, num_inducing)[:, None]))

     k1 = GPy.kern.rbf(1)
-    k2 = GPy.kern.coregionalise(2, 2)
-    k = k1**k2 #.prod(k2, tensor=True) # + GPy.kern.white(2,0.001)
-    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
+    m = GPy.models.SparseGPMultioutputRegression(X_list=[X1,X2], Y_list=[Y1,Y2], kernel_list=[k1], num_inducing=20)
     m.constrain_fixed('.*rbf_var', 1.)
-    m.constrain_fixed('iip')
-    m.constrain_bounded('noise_variance', 1e-3, 1e-1)
     # m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')
     m.optimize(max_iters=max_iters)

+    fig, axes = pb.subplots(2, 1)
+    m.plot(output=0, ax=axes[0])
+    m.plot(output=1, ax=axes[1])
+    axes[0].set_title('Output 0')
+    axes[1].set_title('Output 1')
     # plotting:
-    pb.figure()
-    Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
-    Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
-    mean, var, low, up = m.predict(Xtest1)
-    GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
-    mean, var, low, up = m.predict(Xtest2)
-    GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
-    pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
-    pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
-    y = pb.ylim()[0]
-    pb.plot(Z[:, 0][Z[:, 1] == 0], np.zeros(np.sum(Z[:, 1] == 0)) + y, 'r|', mew=2)
-    pb.plot(Z[:, 0][Z[:, 1] == 1], np.zeros(np.sum(Z[:, 1] == 1)) + y, 'g|', mew=2)
     return m

 def epomeo_gpx(max_iters=100):
@@ -401,8 +400,6 @@ def silhouette(max_iters=100):
     print(m)
     return m

-
-
 def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100):
     """Run a 1D example of a sparse GP regression."""
     # sample inputs and outputs
diff --git a/GPy/inference/optimization.py b/GPy/inference/optimization.py
index 0ef487af..589ec4c7 100644
--- a/GPy/inference/optimization.py
+++ b/GPy/inference/optimization.py
@@ -130,7 +130,7 @@ class opt_lbfgsb(Optimizer):
             opt_dict['pgtol'] = self.gtol

         opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
-                                            maxfun=self.max_f_eval, **opt_dict)
+                                            maxfun=self.max_iters, **opt_dict)
         self.x_opt = opt_result[0]
         self.f_opt = f_fp(self.x_opt)[0]
         self.funct_eval = opt_result[2]['funcalls']
diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index d788b4f5..943965ed 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -73,9 +73,9 @@ def gibbs(input_dim,variance=1., mapping=None):
     Gibbs and MacKay non-stationary covariance function.

     .. math::

        r = \sqrt{(x - x')^\top (x - x')}

        k(x, x') = \sigma^2 Z \exp\left(-\frac{r^2}{l(x)^2 + l(x')^2}\right)

        Z = \sqrt{\frac{2\,l(x)\,l(x')}{l(x)^2 + l(x')^2}}

     The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x),
     which is a function that can be specified by the user; by default a multi-layer perceptron is used.

@@ -89,7 +89,7 @@ def gibbs(input_dim,variance=1., mapping=None):
     :param input_dim: the number of input dimensions
     :type input_dim: int
     :param variance: the variance :math:`\sigma^2`
     :type variance: float
     :param mapping: the mapping that gives the lengthscale across the input space.
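A direct transcription of the Gibbs covariance above, for scalar inputs and an illustrative, hand-picked lengthscale function l(x) (GPy instead learns l through the supplied mapping):

import numpy as np

def l(x):
    # stand-in lengthscale function; any positive function works
    return 1.0 + 0.5 * np.abs(x)

def gibbs_k(x, y, sigma2=1.0):
    denom = l(x) ** 2 + l(y) ** 2
    Z = np.sqrt(2.0 * l(x) * l(y) / denom)
    return sigma2 * Z * np.exp(-(x - y) ** 2 / denom)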
@@ -346,29 +346,30 @@ def symmetric(k):
     k_.parts = [symmetric.Symmetric(p) for p in k.parts]
     return k_

-def coregionalise(output_dim, rank=1, W=None, kappa=None):
+def coregionalise(num_outputs, W_columns=1, W=None, kappa=None):
     """
-    Coregionalisation kernel.
-
-    Used for computing covariance functions of the form
-    .. math::
-       k_2(x, y)=\mathbf{B} k(x, y)
-    where
-    .. math::
-       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + diag(kappa)
-
-    :param output_dim: the number of output dimensions
-    :type output_dim: int
-    :param rank: the rank of the coregionalisation matrix.
-    :type rank: int
-    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B.
-    :type W: ndarray
-    :param kappa: a diagonal term which allows the outputs to behave independently.
+    Coregionalization matrix B, of the form:
+
+    .. math::
+       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + \text{diag}(\kappa)
+
+    An intrinsic/linear coregionalization kernel of the form
+
+    .. math::
+       k_2(x, y) = \mathbf{B} k(x, y)
+
+    is obtained as the tensor product between a kernel k(x, y) and B.
+
+    :param num_outputs: the number of outputs to coregionalise
+    :type num_outputs: int
+    :param W_columns: number of columns of the W matrix (this parameter is ignored if W is not None)
+    :type W_columns: int
+    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B
+    :type W: numpy array of dimensionality (num_outputs, W_columns)
+    :param kappa: a vector which allows the outputs to behave independently
+    :type kappa: numpy array of dimensionality (num_outputs,)
     :rtype: kernel object

     .. Note: see coregionalisation examples in GPy.examples.regression for some usage.
     """
-    p = parts.coregionalise.Coregionalise(output_dim,rank,W,kappa)
+    p = parts.coregionalise.Coregionalise(num_outputs, W_columns, W, kappa)
     return kern(1,[p])

@@ -427,3 +428,31 @@ def hierarchical(k):
#     assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
(TODO)" _parts = [parts.hierarchical.Hierarchical(k.parts)] return kern(k.input_dim+len(k.parts),_parts) + +def build_lcm(input_dim, num_outputs, kernel_list = [], W_columns=1,W=None,kappa=None): + """ + Builds a kernel of a linear coregionalization model + + :input_dim: Input dimensionality + :num_outputs: Number of outputs + :kernel_list: List of coregionalized kernels, each element in the list will be multiplied by a different corregionalization matrix + :type kernel_list: list of GPy kernels + :param W_columns: number tuples of the corregionalization parameters 'coregion_W' + :type W_columns: integer + + ..Note the kernels dimensionality is overwritten to fit input_dim + """ + + for k in kernel_list: + if k.input_dim <> input_dim: + k.input_dim = input_dim + warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.") + + k_coreg = coregionalise(num_outputs,W_columns,W,kappa) + kernel = kernel_list[0]**k_coreg.copy() + + for k in kernel_list[1:]: + k_coreg = coregionalise(num_outputs,W_columns,W,kappa) + kernel += k**k_coreg.copy() + + return kernel diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py index 9143854e..e7b3e30a 100644 --- a/GPy/kern/kern.py +++ b/GPy/kern/kern.py @@ -421,19 +421,19 @@ class kern(Parameterized): # TODO: input_slices needed crossterms = 0 - for p1, p2 in itertools.combinations(self.parts, 2): + for [p1, i_s1], [p2, i_s2] in itertools.combinations(zip(self.parts, self.input_slices), 2): + if i_s1 == i_s2: + # TODO psi1 this must be faster/better/precached/more nice + tmp1 = np.zeros((mu.shape[0], Z.shape[0])) + p1.psi1(Z[:, i_s1], mu[:, i_s1], S[:, i_s1], tmp1) + tmp2 = np.zeros((mu.shape[0], Z.shape[0])) + p2.psi1(Z[:, i_s2], mu[:, i_s2], S[:, i_s2], tmp2) + + prod = np.multiply(tmp1, tmp2) + crossterms += prod[:, :, None] + prod[:, None, :] - # TODO psi1 this must be faster/better/precached/more nice - tmp1 = np.zeros((mu.shape[0], Z.shape[0])) - p1.psi1(Z, mu, S, tmp1) - tmp2 = np.zeros((mu.shape[0], Z.shape[0])) - p2.psi1(Z, mu, S, tmp2) - - prod = np.multiply(tmp1, tmp2) - crossterms += prod[:, :, None] + prod[:, None, :] - - target += crossterms - return target + # target += crossterms + return target + crossterms def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S): """Gradient of the psi2 statistics with respect to the parameters.""" diff --git a/GPy/kern/parts/coregionalise.py b/GPy/kern/parts/coregionalise.py index 94179088..66e14052 100644 --- a/GPy/kern/parts/coregionalise.py +++ b/GPy/kern/parts/coregionalise.py @@ -9,42 +9,45 @@ from scipy import weave class Coregionalise(Kernpart): """ - Coregionalisation kernel. + Kernel for intrinsic/linear coregionalization models - Used for computing covariance functions of the form + This kernel has the form .. math:: - k_2(x, y)=B k(x, y) - where - .. math:: - B = WW^\top + diag(kappa) + \mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I} - :param output_dim: the number of output dimensions - :type output_dim: int - :param rank: the rank of the coregionalisation matrix. - :type rank: int - :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B. - :type W: ndarray - :param kappa: a diagonal term which allows the outputs to behave independently. - :rtype: kernel object + An intrinsic/linear coregionalization kernel of the form + .. math:: + k_2(x, y)=\mathbf{B} k(x, y) + + it is obtainded as the tensor product between a kernel k(x,y) and B. 
diff --git a/GPy/kern/parts/coregionalise.py b/GPy/kern/parts/coregionalise.py
index 94179088..66e14052 100644
--- a/GPy/kern/parts/coregionalise.py
+++ b/GPy/kern/parts/coregionalise.py
@@ -9,42 +9,45 @@ from scipy import weave

class Coregionalise(Kernpart):
    """
-    Coregionalisation kernel.
+    Kernel for intrinsic/linear coregionalization models.

-    Used for computing covariance functions of the form
+    This kernel builds the coregionalization matrix
    .. math::
-       k_2(x, y)=B k(x, y)
-    where
-    .. math::
-       B = WW^\top + diag(kappa)
+       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + \text{diag}(\kappa)

-    :param output_dim: the number of output dimensions
-    :type output_dim: int
-    :param rank: the rank of the coregionalisation matrix.
-    :type rank: int
-    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B.
-    :type W: ndarray
-    :param kappa: a diagonal term which allows the outputs to behave independently.
-    :rtype: kernel object
+    An intrinsic/linear coregionalization kernel of the form
+    .. math::
+       k_2(x, y) = \mathbf{B} k(x, y)
+
+    is obtained as the tensor product between a kernel k(x, y) and B.
+
+    :param num_outputs: number of outputs to coregionalize
+    :type num_outputs: int
+    :param W_columns: number of columns of the W matrix (this parameter is ignored if W is not None)
+    :type W_columns: int
+    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B
+    :type W: numpy array of dimensionality (num_outputs, W_columns)
+    :param kappa: a vector which allows the outputs to behave independently
+    :type kappa: numpy array of dimensionality (num_outputs,)

    .. Note: see coregionalisation examples in GPy.examples.regression for some usage.
    """
-    def __init__(self,output_dim,rank=1, W=None, kappa=None):
+    def __init__(self, num_outputs, W_columns=1, W=None, kappa=None):
        self.input_dim = 1
        self.name = 'coregion'
-        self.output_dim = output_dim
-        self.rank = rank
+        self.num_outputs = num_outputs
+        self.W_columns = W_columns
        if W is None:
-            self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank)
+            self.W = 0.5*np.random.randn(self.num_outputs,self.W_columns)/np.sqrt(self.W_columns)
        else:
-            assert W.shape==(self.output_dim,self.rank)
+            assert W.shape==(self.num_outputs,self.W_columns)
            self.W = W
        if kappa is None:
-            kappa = 0.5*np.ones(self.output_dim)
+            kappa = 0.5*np.ones(self.num_outputs)
        else:
-            assert kappa.shape==(self.output_dim,)
+            assert kappa.shape==(self.num_outputs,)
        self.kappa = kappa
-        self.num_params = self.output_dim*(self.rank + 1)
+        self.num_params = self.num_outputs*(self.W_columns + 1)
        self._set_params(np.hstack([self.W.flatten(),self.kappa]))

    def _get_params(self):
@@ -52,12 +55,12 @@ class Coregionalise(Kernpart):

    def _set_params(self,x):
        assert x.size == self.num_params
-        self.kappa = x[-self.output_dim:]
-        self.W = x[:-self.output_dim].reshape(self.output_dim,self.rank)
+        self.kappa = x[-self.num_outputs:]
+        self.W = x[:-self.num_outputs].reshape(self.num_outputs,self.W_columns)
        self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa)

    def _get_param_names(self):
-        return sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[]) + ['kappa_%i'%i for i in range(self.output_dim)]
+        return sum([['W%i_%i'%(i,j) for j in range(self.W_columns)] for i in range(self.num_outputs)],[]) + ['kappa_%i'%i for i in range(self.num_outputs)]

    def K(self,index,index2,target):
        index = np.asarray(index,dtype=np.int)
@@ -75,26 +78,26 @@ class Coregionalise(Kernpart):
        if index2 is None:
            code="""
            for(int i=0;i' + k2.name
+        self.name = '['+k1.name + '(x)' + k2.name +']'
        self.k1 = k1
        self.k2 = k2
        if tensor:
diff --git a/GPy/kern/parts/rbf.py b/GPy/kern/parts/rbf.py
index 1b65133f..855e2b71 100644
--- a/GPy/kern/parts/rbf.py
+++ b/GPy/kern/parts/rbf.py
@@ -242,13 +242,14 @@ class RBF(Kernpart):

    def _psi_computations(self, Z, mu, S):
        # here are the "statistics" for psi1 and psi2
-        if not fast_array_equal(Z, self._Z):
+        Z_changed = not fast_array_equal(Z, self._Z)
+        if Z_changed:
            # Z has changed, compute Z specific stuff
            self._psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
            self._psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
            self._psi2_Zdist_sq = np.square(self._psi2_Zdist / self.lengthscale) # M,M,Q

-        if not fast_array_equal(Z, self._Z) or not fast_array_equal(mu, self._mu) or not fast_array_equal(S, self._S):
+        if Z_changed or not fast_array_equal(mu, self._mu) or not fast_array_equal(S, self._S):
            # something's changed.
recompute EVERYTHING # psi1 diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py index 99e88b6d..0cb62eb0 100644 --- a/GPy/likelihoods/__init__.py +++ b/GPy/likelihoods/__init__.py @@ -1,4 +1,7 @@ from ep import EP +from ep_mixed_noise import EP_Mixed_Noise from gaussian import Gaussian +from gaussian_mixed_noise import Gaussian_Mixed_Noise +from noise_model_constructors import * # TODO: from Laplace import Laplace -import likelihood_functions as functions + diff --git a/GPy/likelihoods/ep.py b/GPy/likelihoods/ep.py index 94f760e9..c9f23839 100644 --- a/GPy/likelihoods/ep.py +++ b/GPy/likelihoods/ep.py @@ -4,23 +4,23 @@ from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs from likelihood import likelihood class EP(likelihood): - def __init__(self,data,LikelihoodFunction,epsilon=1e-3,power_ep=[1.,1.]): + def __init__(self,data,noise_model,epsilon=1e-3,power_ep=[1.,1.]): """ Expectation Propagation Arguments --------- epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) - LikelihoodFunction : a likelihood function (see likelihood_functions.py) + noise_model : a likelihood function (see likelihood_functions.py) """ - self.LikelihoodFunction = LikelihoodFunction + self.noise_model = noise_model self.epsilon = epsilon self.eta, self.delta = power_ep self.data = data self.N, self.output_dim = self.data.shape self.is_heteroscedastic = True self.Nparams = 0 - self._transf_data = self.LikelihoodFunction._preprocess_values(data) + self._transf_data = self.noise_model._preprocess_values(data) #Initial values - Likelihood approximation parameters: #p(y|f) = t(f|tau_tilde,v_tilde) @@ -52,16 +52,23 @@ class EP(likelihood): def predictive_values(self,mu,var,full_cov): if full_cov: raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood" - return self.LikelihoodFunction.predictive_values(mu,var) + return self.noise_model.predictive_values(mu,var) def _get_params(self): - return np.zeros(0) + #return np.zeros(0) + return self.noise_model._get_params() + def _get_param_names(self): - return [] + #return [] + return self.noise_model._get_param_names() + def _set_params(self,p): - pass # TODO: the EP likelihood might want to take some parameters... + #pass # TODO: the EP likelihood might want to take some parameters... + self.noise_model._set_params(p) + def _gradients(self,partial): - return np.zeros(0) # TODO: the EP likelihood might want to take some parameters... + #return np.zeros(0) # TODO: the EP likelihood might want to take some parameters... 
+ return self.noise_model._gradients(partial) def _compute_GP_variables(self): #Variables to be called from GP @@ -116,7 +123,7 @@ class EP(likelihood): self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i] self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i] #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) #Site parameters update Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i]) Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i]) @@ -206,7 +213,7 @@ class EP(likelihood): self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) #Site parameters update Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) @@ -301,7 +308,7 @@ class EP(likelihood): self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) #Site parameters update Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) diff --git a/GPy/likelihoods/ep_mixed_noise.py b/GPy/likelihoods/ep_mixed_noise.py new file mode 100644 index 00000000..ffc8cb51 --- /dev/null +++ b/GPy/likelihoods/ep_mixed_noise.py @@ -0,0 +1,385 @@ +# Copyright (c) 2013, Ricardo Andrade +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from scipy import stats +from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs +from likelihood import likelihood + +class EP_Mixed_Noise(likelihood): + def __init__(self,data_list,noise_model_list,epsilon=1e-3,power_ep=[1.,1.]): + """ + Expectation Propagation + + Arguments + --------- + :param data_list: list of outputs + :param noise_model_list: a list of noise models + :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations + :type epsilon: float + :param power_ep: list of power ep parameters + """ + assert len(data_list) == len(noise_model_list) + self.noise_model_list = noise_model_list + n_list = [data.size for data in data_list] + self.n_models = len(data_list) + self.n_params = [noise_model._get_params().size for noise_model in noise_model_list] + self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.n_models),n_list)]) + self.epsilon = epsilon + self.eta, self.delta = power_ep + self.data = np.vstack(data_list) + self.N, self.output_dim = self.data.shape + self.is_heteroscedastic = True + self.Nparams = 0#FIXME + self._transf_data = np.vstack([noise_model._preprocess_values(data) for noise_model,data in zip(noise_model_list,data_list)]) + #TODO non-gaussian index + + #Initial values - Likelihood 
approximation parameters: + #p(y|f) = t(f|tau_tilde,v_tilde) + self.tau_tilde = np.zeros(self.N) + self.v_tilde = np.zeros(self.N) + + #initial values for the GP variables + self.Y = np.zeros((self.N,1)) + self.covariance_matrix = np.eye(self.N) + self.precision = np.ones(self.N)[:,None] + self.Z = 0 + self.YYT = None + self.V = self.precision * self.Y + self.VVT_factor = self.V + self.trYYT = 0. + + def restart(self): + self.tau_tilde = np.zeros(self.N) + self.v_tilde = np.zeros(self.N) + self.Y = np.zeros((self.N,1)) + self.covariance_matrix = np.eye(self.N) + self.precision = np.ones(self.N)[:,None] + self.Z = 0 + self.YYT = None + self.V = self.precision * self.Y + self.VVT_factor = self.V + self.trYYT = 0. + + def predictive_values(self,mu,var,full_cov,noise_model): + """ + Predicts the output given the GP + + :param mu: GP's mean + :param var: GP's variance + :param full_cov: whether to return the full covariance matrix, or just the diagonal + :type full_cov: False|True + :param noise_model: noise model to use + :type noise_model: integer + """ + if full_cov: + raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood" + #_mu = [] + #_var = [] + #_q1 = [] + #_q2 = [] + #for m,v,o in zip(mu,var,output.flatten()): + # a,b,c,d = self.noise_model_list[int(o)].predictive_values(m,v) + # _mu.append(a) + # _var.append(b) + # _q1.append(c) + # _q2.append(d) + #return np.vstack(_mu),np.vstack(_var),np.vstack(_q1),np.vstack(_q2) + return self.noise_model_list[noise_model].predictive_values(mu,var) + + def _get_params(self): + return np.hstack([noise_model._get_params().flatten() for noise_model in self.noise_model_list]) + + def _get_param_names(self): + names = [] + for noise_model in self.noise_model_list: + names += noise_model._get_param_names() + return names + + def _set_params(self,p): + cs_params = np.cumsum([0]+self.n_params) + for i in range(len(self.n_params)): + self.noise_model_list[i]._set_params(p[cs_params[i]:cs_params[i+1]]) + + def _gradients(self,partial): + #NOTE this is not tested + return np.hstack([noise_model._gradients(partial) for noise_model in self.noise_model_list]) + + def _compute_GP_variables(self): + #Variables to be called from GP + mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model + sigma_sum = 1./self.tau_ + 1./self.tau_tilde + mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2 + self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep + + self.Y = mu_tilde[:,None] + self.YYT = np.dot(self.Y,self.Y.T) + self.covariance_matrix = np.diag(1./self.tau_tilde) + self.precision = self.tau_tilde[:,None] + self.V = self.precision * self.Y + self.VVT_factor = self.V + self.trYYT = np.trace(self.YYT) + + def fit_full(self,K): + """ + The expectation-propagation algorithm. + For nomenclature see Rasmussen & Williams 2006. 
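One site update of the loop below, in isolation and with eta = delta = 1 (scalar toy numbers; moments_match is replaced by fixed stand-in moments):

# cavity: remove site i from the current posterior marginal
Sigma_ii, mu_i = 1.0, 0.3
tau_tilde_i, v_tilde_i = 0.5, 0.1
tau_cav = 1.0 / Sigma_ii - tau_tilde_i
v_cav = mu_i / Sigma_ii - v_tilde_i

# moments of cavity * exact likelihood term (normally from moments_match)
mu_hat, sigma2_hat = 0.25, 0.8

# site update so the approximate marginal matches those moments
Delta_tau = 1.0 / sigma2_hat - 1.0 / Sigma_ii
Delta_v = mu_hat / sigma2_hat - mu_i / Sigma_ii
tau_tilde_i += Delta_tau
v_tilde_i += Delta_v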
+ """ + #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) + mu = np.zeros(self.N) + Sigma = K.copy() + + """ + Initial values - Cavity distribution parameters: + q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)} + sigma_ = 1./tau_ + mu_ = v_/tau_ + """ + self.tau_ = np.empty(self.N,dtype=float) + self.v_ = np.empty(self.N,dtype=float) + + #Initial values - Marginal moments + z = np.empty(self.N,dtype=float) + self.Z_hat = np.empty(self.N,dtype=float) + phi = np.empty(self.N,dtype=float) + mu_hat = np.empty(self.N,dtype=float) + sigma2_hat = np.empty(self.N,dtype=float) + + #Approximation + epsilon_np1 = self.epsilon + 1. + epsilon_np2 = self.epsilon + 1. + self.iterations = 0 + self.np1 = [self.tau_tilde.copy()] + self.np2 = [self.v_tilde.copy()] + while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: + update_order = np.random.permutation(self.N) + for i in update_order: + #Cavity distribution parameters + self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i] + self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i] + #Marginal moments + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) + #Site parameters update + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i]) + self.tau_tilde[i] += Delta_tau + self.v_tilde[i] += Delta_v + #Posterior distribution parameters update + DSYR(Sigma,Sigma[:,i].copy(), -float(Delta_tau/(1.+ Delta_tau*Sigma[i,i]))) + mu = np.dot(Sigma,self.v_tilde) + self.iterations += 1 + #Sigma recomptutation with Cholesky decompositon + Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K + B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K + L = jitchol(B) + V,info = dtrtrs(L,Sroot_tilde_K,lower=1) + Sigma = K - np.dot(V.T,V) + mu = np.dot(Sigma,self.v_tilde) + epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N + epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N + self.np1.append(self.tau_tilde.copy()) + self.np2.append(self.v_tilde.copy()) + + return self._compute_GP_variables() + + def fit_DTC(self, Kmm, Kmn): + """ + The expectation-propagation algorithm with sparse pseudo-input. + For nomenclature see ... 2013. + """ + num_inducing = Kmm.shape[0] + + #TODO: this doesn't work with uncertain inputs! 
+ + """ + Prior approximation parameters: + q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) + Sigma0 = Qnn = Knm*Kmmi*Kmn + """ + KmnKnm = np.dot(Kmn,Kmn.T) + Lm = jitchol(Kmm) + Lmi = chol_inv(Lm) + Kmmi = np.dot(Lmi.T,Lmi) + KmmiKmn = np.dot(Kmmi,Kmn) + Qnn_diag = np.sum(Kmn*KmmiKmn,-2) + LLT0 = Kmm.copy() + + #Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm) + #KmnKnm = np.dot(Kmn, Kmn.T) + #KmmiKmn = np.dot(Kmmi,Kmn) + #Qnn_diag = np.sum(Kmn*KmmiKmn,-2) + #LLT0 = Kmm.copy() + + """ + Posterior approximation: q(f|y) = N(f| mu, Sigma) + Sigma = Diag + P*R.T*R*P.T + K + mu = w + P*Gamma + """ + mu = np.zeros(self.N) + LLT = Kmm.copy() + Sigma_diag = Qnn_diag.copy() + + """ + Initial values - Cavity distribution parameters: + q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} + sigma_ = 1./tau_ + mu_ = v_/tau_ + """ + self.tau_ = np.empty(self.N,dtype=float) + self.v_ = np.empty(self.N,dtype=float) + + #Initial values - Marginal moments + z = np.empty(self.N,dtype=float) + self.Z_hat = np.empty(self.N,dtype=float) + phi = np.empty(self.N,dtype=float) + mu_hat = np.empty(self.N,dtype=float) + sigma2_hat = np.empty(self.N,dtype=float) + + #Approximation + epsilon_np1 = 1 + epsilon_np2 = 1 + self.iterations = 0 + np1 = [self.tau_tilde.copy()] + np2 = [self.v_tilde.copy()] + while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: + update_order = np.random.permutation(self.N) + for i in update_order: + #Cavity distribution parameters + self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] + self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] + #Marginal moments + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) + #Site parameters update + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) + self.tau_tilde[i] += Delta_tau + self.v_tilde[i] += Delta_v + #Posterior distribution parameters update + DSYR(LLT,Kmn[:,i].copy(),Delta_tau) #LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau + L = jitchol(LLT) + #cholUpdate(L,Kmn[:,i]*np.sqrt(Delta_tau)) + V,info = dtrtrs(L,Kmn,lower=1) + Sigma_diag = np.sum(V*V,-2) + si = np.sum(V.T*V[:,i],-1) + mu += (Delta_v-Delta_tau*mu[i])*si + self.iterations += 1 + #Sigma recomputation with Cholesky decompositon + LLT = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T) + L = jitchol(LLT) + V,info = dtrtrs(L,Kmn,lower=1) + V2,info = dtrtrs(L.T,V,lower=0) + Sigma_diag = np.sum(V*V,-2) + Knmv_tilde = np.dot(Kmn,self.v_tilde) + mu = np.dot(V2.T,Knmv_tilde) + epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N + epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N + np1.append(self.tau_tilde.copy()) + np2.append(self.v_tilde.copy()) + + self._compute_GP_variables() + + def fit_FITC(self, Kmm, Kmn, Knn_diag): + """ + The expectation-propagation algorithm with sparse pseudo-input. + For nomenclature see Naish-Guzman and Holden, 2008. 
+ """ + num_inducing = Kmm.shape[0] + + """ + Prior approximation parameters: + q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) + Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn + """ + Lm = jitchol(Kmm) + Lmi = chol_inv(Lm) + Kmmi = np.dot(Lmi.T,Lmi) + P0 = Kmn.T + KmnKnm = np.dot(P0.T, P0) + KmmiKmn = np.dot(Kmmi,P0.T) + Qnn_diag = np.sum(P0.T*KmmiKmn,-2) + Diag0 = Knn_diag - Qnn_diag + R0 = jitchol(Kmmi).T + + """ + Posterior approximation: q(f|y) = N(f| mu, Sigma) + Sigma = Diag + P*R.T*R*P.T + K + mu = w + P*Gamma + """ + self.w = np.zeros(self.N) + self.Gamma = np.zeros(num_inducing) + mu = np.zeros(self.N) + P = P0.copy() + R = R0.copy() + Diag = Diag0.copy() + Sigma_diag = Knn_diag + RPT0 = np.dot(R0,P0.T) + + """ + Initial values - Cavity distribution parameters: + q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} + sigma_ = 1./tau_ + mu_ = v_/tau_ + """ + self.tau_ = np.empty(self.N,dtype=float) + self.v_ = np.empty(self.N,dtype=float) + + #Initial values - Marginal moments + z = np.empty(self.N,dtype=float) + self.Z_hat = np.empty(self.N,dtype=float) + phi = np.empty(self.N,dtype=float) + mu_hat = np.empty(self.N,dtype=float) + sigma2_hat = np.empty(self.N,dtype=float) + + #Approximation + epsilon_np1 = 1 + epsilon_np2 = 1 + self.iterations = 0 + self.np1 = [self.tau_tilde.copy()] + self.np2 = [self.v_tilde.copy()] + while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: + update_order = np.random.permutation(self.N) + for i in update_order: + #Cavity distribution parameters + self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] + self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] + #Marginal moments + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model_list[self.index[i]].moments_match(self._transf_data[i],self.tau_[i],self.v_[i]) + #Site parameters update + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) + self.tau_tilde[i] += Delta_tau + self.v_tilde[i] += Delta_v + #Posterior distribution parameters update + dtd1 = Delta_tau*Diag[i] + 1. + dii = Diag[i] + Diag[i] = dii - (Delta_tau * dii**2.)/dtd1 + pi_ = P[i,:].reshape(1,num_inducing) + P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_ + Rp_i = np.dot(R,pi_.T) + RTR = np.dot(R.T,np.dot(np.eye(num_inducing) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R)) + R = jitchol(RTR).T + self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1 + self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T) + RPT = np.dot(R,P.T) + Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) + mu = self.w + np.dot(P,self.Gamma) + self.iterations += 1 + #Sigma recomptutation with Cholesky decompositon + Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde) + Diag = Diag0 * Iplus_Dprod_i + P = Iplus_Dprod_i[:,None] * P0 + safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. 
- Iplus_Dprod_i)/Diag0) + L = jitchol(np.eye(num_inducing) + np.dot(RPT0,safe_diag[:,None]*RPT0.T)) + R,info = dtrtrs(L,R0,lower=1) + RPT = np.dot(R,P.T) + Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) + self.w = Diag * self.v_tilde + self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde)) + mu = self.w + np.dot(P,self.Gamma) + epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N + epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N + self.np1.append(self.tau_tilde.copy()) + self.np2.append(self.v_tilde.copy()) + + return self._compute_GP_variables() diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py index 7665e608..cdf54dd6 100644 --- a/GPy/likelihoods/gaussian.py +++ b/GPy/likelihoods/gaussian.py @@ -7,9 +7,9 @@ class Gaussian(likelihood): """ Likelihood class for doing Expectation propagation - :param Y: observed output (Nx1 numpy.darray) - ..Note:: Y values allowed depend on the likelihood_function used - :param variance : + :param data: observed output + :type data: Nx1 numpy.darray + :param variance: noise parameter :param normalize: whether to normalize the data before computing (predictions will be in original scales) :type normalize: False|True """ diff --git a/GPy/likelihoods/gaussian_mixed_noise.py b/GPy/likelihoods/gaussian_mixed_noise.py new file mode 100644 index 00000000..e7df2584 --- /dev/null +++ b/GPy/likelihoods/gaussian_mixed_noise.py @@ -0,0 +1,106 @@ +# Copyright (c) 2013, Ricardo Andrade +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from scipy import stats +from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs +from likelihood import likelihood +from . import Gaussian + + +class Gaussian_Mixed_Noise(likelihood): + """ + Gaussian Likelihood for multiple outputs + + This is a wrapper around likelihood.Gaussian class + + :param data_list: data observations + :type data_list: list of numpy arrays (num_data_output_i x 1), one array per output + :param noise_params: noise parameters of each output + :type noise_params: list of floats, one per output + :param normalize: whether to normalize the data before computing (predictions will be in original scales) + :type normalize: False|True + """ + def __init__(self, data_list, noise_params=None, normalize=True): + self.Nparams = len(data_list) + self.n_list = [data.size for data in data_list] + self.index = np.vstack([np.repeat(i,n)[:,None] for i,n in zip(range(self.Nparams),self.n_list)]) + + if noise_params is None: + noise_params = [1.] * self.Nparams + else: + assert self.Nparams == len(noise_params), 'Number of noise parameters does not match the number of noise models.' + + self.noise_model_list = [Gaussian(Y,variance=v,normalize = normalize) for Y,v in zip(data_list,noise_params)] + self.n_params = [noise_model._get_params().size for noise_model in self.noise_model_list] + self.data = np.vstack(data_list) + self.N, self.output_dim = self.data.shape + self._offset = np.zeros((1, self.output_dim)) + self._scale = np.ones((1, self.output_dim)) + + self.is_heteroscedastic = True + self.Z = 0. 
# a correction factor which accounts for the approximation made + + self.set_data(data_list) + self._set_params(np.asarray(noise_params)) + + def set_data(self, data_list): + self.data = np.vstack(data_list) + self.N, D = self.data.shape + assert D == self.output_dim + self.Y = (self.data - self._offset) / self._scale + if D > self.N: + raise NotImplementedError + #self.YYT = np.dot(self.Y, self.Y.T) + #self.trYYT = np.trace(self.YYT) + #self.YYT_factor = jitchol(self.YYT) + else: + self.YYT = None + self.trYYT = np.sum(np.square(self.Y)) + self.YYT_factor = self.Y + + def predictive_values(self,mu,var,full_cov,noise_model): + """ + Predicts the output given the GP + + :param mu: GP's mean + :param var: GP's variance + :param full_cov: whether to return the full covariance matrix, or just the diagonal + :type full_cov: False|True + :param noise_model: noise model to use + :type noise_model: integer + """ + if full_cov: + raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood" + return self.noise_model_list[noise_model].predictive_values(mu,var,full_cov) + + def _get_params(self): + return np.hstack([noise_model._get_params().flatten() for noise_model in self.noise_model_list]) + + def _get_param_names(self): + if len(self.noise_model_list) == 1: + names = self.noise_model_list[0]._get_param_names() + else: + names = [] + for noise_model,i in zip(self.noise_model_list,range(len(self.n_list))): + names.append(''.join(noise_model._get_param_names() + ['_%s' %i])) + return names + + def _set_params(self,p): + cs_params = np.cumsum([0]+self.n_params) + + for i in range(len(self.n_params)): + self.noise_model_list[i]._set_params(p[cs_params[i]:cs_params[i+1]]) + self.precision = np.hstack([np.repeat(noise_model.precision,n) for noise_model,n in zip(self.noise_model_list,self.n_list)])[:,None] + + self.V = self.precision * self.Y + self.VVT_factor = self.precision * self.YYT_factor + self.covariance_matrix = np.eye(self.N) * 1./self.precision + + def _gradients(self,partial): + gradients = [] + aux = np.cumsum([0]+self.n_list) + for ai,af,noise_model in zip(aux[:-1],aux[1:],self.noise_model_list): + gradients += [noise_model._gradients(partial[ai:af])] + return np.hstack(gradients) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py deleted file mode 100644 index 7b9b8982..00000000 --- a/GPy/likelihoods/likelihood_functions.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2012, 2013 Ricardo Andrade -# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -import numpy as np -from scipy import stats -import scipy as sp -import pylab as pb -from ..util.plot import gpplot -from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf -import link_functions - -class LikelihoodFunction(object): - """ - Likelihood class for doing Expectation propagation - - :param Y: observed output (Nx1 numpy.darray) - ..Note:: Y values allowed depend on the LikelihoodFunction used - """ - def __init__(self,link): - if link == self._analytical: - self.moments_match = self._moments_match_analytical - else: - assert isinstance(link,link_functions.LinkFunction) - self.link = link - self.moments_match = self._moments_match_numerical - - def _preprocess_values(self,Y): - return Y - - def _product(self,gp,obs,mu,sigma): - return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._distribution(gp,obs) - - def _nlog_product(self,gp,obs,mu,sigma): - return -(-.5*(gp-mu)**2/sigma**2 + self._log_distribution(gp,obs)) - - def 
_locate(self,obs,mu,sigma): - """ - Golden Search to find the mode in the _product function (cavity x exact likelihood) and define a grid around it for numerical integration - """ - golden_A = -1 if obs == 0 else np.array([np.log(obs),mu]).min() #Lower limit - golden_B = np.array([np.log(obs),mu]).max() #Upper limit - return sp.optimize.golden(self._nlog_product, args=(obs,mu,sigma), brack=(golden_A,golden_B)) #Better to work with _nlog_product than with _product - - def _moments_match_numerical(self,obs,tau,v): - """ - Simpson's Rule is used to calculate the moments mumerically, it needs a grid of points as input. - """ - mu = v/tau - sigma = np.sqrt(1./tau) - opt = self._locate(obs,mu,sigma) - width = 3./np.log(max(obs,2)) - A = opt - width #Grid's lower limit - B = opt + width #Grid's Upper limit - K = 10*int(np.log(max(obs,150))) #Number of points in the grid - h = (B-A)/K # length of the intervals - grid_x = np.hstack([np.linspace(opt-width,opt,K/2+1)[1:-1], np.linspace(opt,opt+width,K/2+1)]) # grid of points (X axis) - x = np.hstack([A,B,grid_x[range(1,K,2)],grid_x[range(2,K-1,2)]]) # grid_x rearranged, just to make Simpson's algorithm easier - _aux1 = self._product(A,obs,mu,sigma) - _aux2 = self._product(B,obs,mu,sigma) - _aux3 = 4*self._product(grid_x[range(1,K,2)],obs,mu,sigma) - _aux4 = 2*self._product(grid_x[range(2,K-1,2)],obs,mu,sigma) - zeroth = np.hstack((_aux1,_aux2,_aux3,_aux4)) # grid of points (Y axis) rearranged - first = zeroth*x - second = first*x - Z_hat = sum(zeroth)*h/3 # Zero-th moment - mu_hat = sum(first)*h/(3*Z_hat) # First moment - m2 = sum(second)*h/(3*Z_hat) # Second moment - sigma2_hat = m2 - mu_hat**2 # Second central moment - return float(Z_hat), float(mu_hat), float(sigma2_hat) - -class Binomial(LikelihoodFunction): - """ - Probit likelihood - Y is expected to take values in {-1,1} - ----- - $$ - L(x) = \\Phi (Y_i*f_i) - $$ - """ - def __init__(self,link=None): - self._analytical = link_functions.Probit - if not link: - link = self._analytical - super(Binomial, self).__init__(link) - - def _distribution(self,gp,obs): - pass - - def _log_distribution(self,gp,obs): - pass - - def _preprocess_values(self,Y): - """ - Check if the values of the observations correspond to the values - assumed by the likelihood function. - - ..Note:: Binary classification algorithm works better with classes {-1,1} - """ - Y_prep = Y.copy() - Y1 = Y[Y.flatten()==1].size - Y2 = Y[Y.flatten()==0].size - assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.' 
- Y_prep[Y.flatten() == 0] = -1 - return Y_prep - - def _moments_match_analytical(self,data_i,tau_i,v_i): - """ - Moments match of the marginal approximation in EP algorithm - - :param i: number of observation (int) - :param tau_i: precision of the cavity distribution (float) - :param v_i: mean/variance of the cavity distribution (float) - """ - z = data_i*v_i/np.sqrt(tau_i**2 + tau_i) - Z_hat = std_norm_cdf(z) - phi = std_norm_pdf(z) - mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i)) - sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat) - return Z_hat, mu_hat, sigma2_hat - - def predictive_values(self,mu,var): - """ - Compute mean, variance and conficence interval (percentiles 5 and 95) of the prediction - :param mu: mean of the latent variable - :param var: variance of the latent variable - """ - mu = mu.flatten() - var = var.flatten() - mean = stats.norm.cdf(mu/np.sqrt(1+var)) - norm_025 = [stats.norm.ppf(.025,m,v) for m,v in zip(mu,var)] - norm_975 = [stats.norm.ppf(.975,m,v) for m,v in zip(mu,var)] - p_025 = stats.norm.cdf(norm_025/np.sqrt(1+var)) - p_975 = stats.norm.cdf(norm_975/np.sqrt(1+var)) - return mean[:,None], np.nan*var, p_025[:,None], p_975[:,None] # TODO: var - -class Poisson(LikelihoodFunction): - """ - Poisson likelihood - Y is expected to take values in {0,1,2,...} - ----- - $$ - L(x) = \exp(\lambda) * \lambda**Y_i / Y_i! - $$ - """ - def __init__(self,link=None): - self._analytical = None - if not link: - link = link_functions.Log() - super(Poisson, self).__init__(link) - - def _distribution(self,gp,obs): - return stats.poisson.pmf(obs,self.link.inv_transf(gp)) - - def _log_distribution(self,gp,obs): - return - self.link.inv_transf(gp) + obs * self.link.log_inv_transf(gp) - - def predictive_values(self,mu,var): - """ - Compute mean, and conficence interval (percentiles 5 and 95) of the prediction - """ - mean = self.link.transf(mu)#np.exp(mu*self.scale + self.location) - tmp = stats.poisson.ppf(np.array([.025,.975]),mean) - p_025 = tmp[:,0] - p_975 = tmp[:,1] - return mean,np.nan*mean,p_025,p_975 # better variance here TODO diff --git a/GPy/likelihoods/link_functions.py b/GPy/likelihoods/link_functions.py deleted file mode 100644 index 3b9a55b2..00000000 --- a/GPy/likelihoods/link_functions.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2012, 2013 Ricardo Andrade -# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -import numpy as np -from scipy import stats -import scipy as sp -import pylab as pb -from ..util.plot import gpplot -from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf - -class LinkFunction(object): - """ - Link function class for doing non-Gaussian likelihoods approximation - - :param Y: observed output (Nx1 numpy.darray) - ..Note:: Y values allowed depend on the likelihood_function used - """ - def __init__(self): - pass - -class Probit(LinkFunction): - """ - Probit link function: Squashes a likelihood between 0 and 1 - """ - def transf(self,mu): - pass - - def inv_transf(self,f): - pass - - def log_inv_transf(self,f): - pass diff --git a/GPy/likelihoods/noise_model_constructors.py b/GPy/likelihoods/noise_model_constructors.py new file mode 100644 index 00000000..158f1674 --- /dev/null +++ b/GPy/likelihoods/noise_model_constructors.py @@ -0,0 +1,88 @@ +# Copyright (c) 2013, GPy authors (see AUTHORS.txt). 
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import noise_models
+
+def binomial(gp_link=None):
+    """
+    Construct a binomial likelihood
+
+    :param gp_link: a GPy gp_link function
+    """
+    if gp_link is None:
+        gp_link = noise_models.gp_transformations.Probit()
+    #else:
+    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
+
+    if isinstance(gp_link,noise_models.gp_transformations.Probit):
+        analytical_mean = True
+        analytical_variance = False
+
+    elif isinstance(gp_link,noise_models.gp_transformations.Step):
+        analytical_mean = True
+        analytical_variance = True
+
+    else:
+        analytical_mean = False
+        analytical_variance = False
+
+    return noise_models.binomial_noise.Binomial(gp_link,analytical_mean,analytical_variance)
+
+def exponential(gp_link=None):
+    """
+    Construct an exponential likelihood
+
+    :param gp_link: a GPy gp_link function
+    """
+    if gp_link is None:
+        gp_link = noise_models.gp_transformations.Identity()
+
+    analytical_mean = False
+    analytical_variance = False
+    return noise_models.exponential_noise.Exponential(gp_link,analytical_mean,analytical_variance)
+
+def gaussian(gp_link=None,variance=1.):
+    """
+    Construct a Gaussian likelihood
+
+    :param gp_link: a GPy gp_link function
+    :param variance: scalar
+    """
+    if gp_link is None:
+        gp_link = noise_models.gp_transformations.Identity()
+    #else:
+    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
+
+    analytical_mean = False
+    analytical_variance = False
+    return noise_models.gaussian_noise.Gaussian(gp_link,analytical_mean,analytical_variance,variance)
+
+def poisson(gp_link=None):
+    """
+    Construct a Poisson likelihood
+
+    :param gp_link: a GPy gp_link function
+    """
+    if gp_link is None:
+        gp_link = noise_models.gp_transformations.Log_ex_1()
+    #else:
+    #    assert isinstance(gp_link,noise_models.gp_transformations.GPTransformation), 'gp_link function is not valid.'
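+    # Log_ex_1 maps f -> log(1 + exp(f)) (softplus), keeping the Poisson rate positive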
+ analytical_mean = False + analytical_variance = False + return noise_models.poisson_noise.Poisson(gp_link,analytical_mean,analytical_variance) + +def gamma(gp_link=None,beta=1.): + """ + Construct a Gamma likelihood + + :param gp_link: a GPy gp_link function + :param beta: scalar + """ + if gp_link is None: + gp_link = noise_models.gp_transformations.Log_ex_1() + analytical_mean = False + analytical_variance = False + return noise_models.gamma_noise.Gamma(gp_link,analytical_mean,analytical_variance,beta) + + diff --git a/GPy/likelihoods/noise_models/__init__.py b/GPy/likelihoods/noise_models/__init__.py new file mode 100644 index 00000000..b47702a7 --- /dev/null +++ b/GPy/likelihoods/noise_models/__init__.py @@ -0,0 +1,7 @@ +import noise_distributions +import binomial_noise +import exponential_noise +import gaussian_noise +import gamma_noise +import poisson_noise +import gp_transformations diff --git a/GPy/likelihoods/noise_models/binomial_noise.py b/GPy/likelihoods/noise_models/binomial_noise.py new file mode 100644 index 00000000..9ceacf79 --- /dev/null +++ b/GPy/likelihoods/noise_models/binomial_noise.py @@ -0,0 +1,118 @@ +# Copyright (c) 2012, 2013 Ricardo Andrade +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +from scipy import stats,special +import scipy as sp +from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf +import gp_transformations +from noise_distributions import NoiseDistribution + +class Binomial(NoiseDistribution): + """ + Probit likelihood + Y is expected to take values in {-1,1} + ----- + $$ + L(x) = \\Phi (Y_i*f_i) + $$ + """ + def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False): + super(Binomial, self).__init__(gp_link,analytical_mean,analytical_variance) + + def _preprocess_values(self,Y): + """ + Check if the values of the observations correspond to the values + assumed by the likelihood function. + + ..Note:: Binary classification algorithm works better with classes {-1,1} + """ + Y_prep = Y.copy() + Y1 = Y[Y.flatten()==1].size + Y2 = Y[Y.flatten()==0].size + assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.' 
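+        # remap class 0 to -1: the probit moment expressions below assume labels in {-1,1}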
+        Y_prep[Y.flatten() == 0] = -1
+        return Y_prep
+
+    def _moments_match_analytical(self,data_i,tau_i,v_i):
+        """
+        Moments match of the marginal approximation in EP algorithm
+
+        :param data_i: ith observation (float)
+        :param tau_i: precision of the cavity distribution (float)
+        :param v_i: mean/variance of the cavity distribution (float)
+        """
+        if isinstance(self.gp_link,gp_transformations.Probit):
+            z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
+            Z_hat = std_norm_cdf(z)
+            phi = std_norm_pdf(z)
+            mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
+            sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
+
+        elif isinstance(self.gp_link,gp_transformations.Step):
+            Z_hat = None
+            mu_hat = None
+            sigma2_hat = None
+
+        return Z_hat, mu_hat, sigma2_hat
+
+    def _predictive_mean_analytical(self,mu,sigma):
+        return stats.norm.cdf(mu/np.sqrt(1+sigma**2))
+
+    def _mass(self,gp,obs):
+        #NOTE obs must be in {0,1}
+        p = self.gp_link.transf(gp)
+        return p**obs * (1.-p)**(1.-obs)
+
+    def _nlog_mass(self,gp,obs):
+        p = self.gp_link.transf(gp)
+        return -(obs*np.log(p) + (1.-obs)*np.log(1.-p))
+
+    def _dnlog_mass_dgp(self,gp,obs):
+        p = self.gp_link.transf(gp)
+        dp = self.gp_link.dtransf_df(gp)
+        return ((1.-obs)/(1.-p) - obs/p) * dp
+
+    def _d2nlog_mass_dgp2(self,gp,obs):
+        p = self.gp_link.transf(gp)
+        return ((1.-obs)/(1.-p) - obs/p)*self.gp_link.d2transf_df2(gp) + (obs/p**2 + (1.-obs)/(1.-p)**2)*self.gp_link.dtransf_df(gp)**2
+
+    def _mean(self,gp):
+        """
+        Expected value of the output: E(Y|f) = g(f)
+        """
+        return self.gp_link.transf(gp)
+
+    def _dmean_dgp(self,gp):
+        return self.gp_link.dtransf_df(gp)
+
+    def _d2mean_dgp2(self,gp):
+        return self.gp_link.d2transf_df2(gp)
+
+    def _variance(self,gp):
+        """
+        Variance of the output: V(Y|f) = g(f)*(1-g(f))
+        """
+        p = self.gp_link.transf(gp)
+        return p*(1.-p)
+
+    def _dvariance_dgp(self,gp):
+        return self.gp_link.dtransf_df(gp)*(1. - 2.*self.gp_link.transf(gp))
+
+    def _d2variance_dgp2(self,gp):
+        return self.gp_link.d2transf_df2(gp)*(1. - 2.*self.gp_link.transf(gp)) - 2*self.gp_link.dtransf_df(gp)**2
+
+    """
+    def predictive_values(self,mu,var): #TODO remove
+        mu = mu.flatten()
+        var = var.flatten()
+        #mean = stats.norm.cdf(mu/np.sqrt(1+var))
+        mean = self._predictive_mean_analytical(mu,np.sqrt(var))
+        norm_025 = [stats.norm.ppf(.025,m,v) for m,v in zip(mu,var)]
+        norm_975 = [stats.norm.ppf(.975,m,v) for m,v in zip(mu,var)]
+        #p_025 = stats.norm.cdf(norm_025/np.sqrt(1+var))
+        #p_975 = stats.norm.cdf(norm_975/np.sqrt(1+var))
+        p_025 = self._predictive_mean_analytical(norm_025,np.sqrt(var))
+        p_975 = self._predictive_mean_analytical(norm_975,np.sqrt(var))
+        return mean[:,None], np.nan*var, p_025[:,None], p_975[:,None] # TODO: var
+    """
diff --git a/GPy/likelihoods/noise_models/exponential_noise.py b/GPy/likelihoods/noise_models/exponential_noise.py
new file mode 100644
index 00000000..e72b8c22
--- /dev/null
+++ b/GPy/likelihoods/noise_models/exponential_noise.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2012, 2013 Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from scipy import stats,special
+import scipy as sp
+from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
+import gp_transformations
+from noise_distributions import NoiseDistribution
+
+class Exponential(NoiseDistribution):
+    """
+    Exponential likelihood
+    Y is expected to take values in (0,+inf)
+    -----
+    $$
+    L(x) = \exp(-Y_i/\theta) / \theta,  with  \theta = g(f_i)
+    $$
+    """
+    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
+        super(Exponential, self).__init__(gp_link,analytical_mean,analytical_variance)
+
+    def _preprocess_values(self,Y):
+        return Y
+
+    def _mass(self,gp,obs):
+        """
+        Mass (or density) function
+        """
+        return np.exp(-obs/self.gp_link.transf(gp))/self.gp_link.transf(gp)
+
+    def _nlog_mass(self,gp,obs):
+        """
+        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
+        """
+        return obs/self.gp_link.transf(gp) + np.log(self.gp_link.transf(gp))
+
+    def _dnlog_mass_dgp(self,gp,obs):
+        return ( 1./self.gp_link.transf(gp) - obs/self.gp_link.transf(gp)**2) * self.gp_link.dtransf_df(gp)
+
+    def _d2nlog_mass_dgp2(self,gp,obs):
+        fgp = self.gp_link.transf(gp)
+        return (2*obs/fgp**3 - 1./fgp**2) * self.gp_link.dtransf_df(gp)**2 + ( 1./fgp - obs/fgp**2) * self.gp_link.d2transf_df2(gp)
+
+    def _mean(self,gp):
+        """
+        Expected value of the output: E(Y|f) = g(f)
+        """
+        return self.gp_link.transf(gp)
+
+    def _dmean_dgp(self,gp):
+        return self.gp_link.dtransf_df(gp)
+
+    def _d2mean_dgp2(self,gp):
+        return self.gp_link.d2transf_df2(gp)
+
+    def _variance(self,gp):
+        """
+        Variance of the output: V(Y|f) = g(f)**2
+        """
+        return self.gp_link.transf(gp)**2
+
+    def _dvariance_dgp(self,gp):
+        return 2*self.gp_link.transf(gp)*self.gp_link.dtransf_df(gp)
+
+    def _d2variance_dgp2(self,gp):
+        return 2 * (self.gp_link.dtransf_df(gp)**2 + self.gp_link.transf(gp)*self.gp_link.d2transf_df2(gp))
diff --git a/GPy/likelihoods/noise_models/gamma_noise.py b/GPy/likelihoods/noise_models/gamma_noise.py
new file mode 100644
index 00000000..6bf0dd7b
--- /dev/null
+++ b/GPy/likelihoods/noise_models/gamma_noise.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2012, 2013 Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from scipy import stats,special
+import scipy as sp
+from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
+import gp_transformations
+from noise_distributions import NoiseDistribution
+
+class Gamma(NoiseDistribution):
+    """
+    Gamma likelihood
+    Y is expected to take values in (0,+inf)
+    -----
+    $$
+    L(x) = \beta^\alpha * Y_i^{\alpha-1} * \exp(-\beta*Y_i) / \Gamma(\alpha),  with  \alpha = \beta*g(f_i)
+    $$
+    """
+    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,beta=1.):
+        self.beta = beta
+        super(Gamma, self).__init__(gp_link,analytical_mean,analytical_variance)
+
+    def _preprocess_values(self,Y):
+        return Y
+
+    def _mass(self,gp,obs):
+        """
+        Mass (or density) function
+        """
+        #return stats.gamma.pdf(obs,a = self.gp_link.transf(gp)/self.variance,scale=self.variance)
+        alpha = self.gp_link.transf(gp)*self.beta
+        return obs**(alpha - 1.) * np.exp(-self.beta*obs) * self.beta**alpha / special.gamma(alpha)
+
+    def _nlog_mass(self,gp,obs):
+        """
+        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
+        """
+        alpha = self.gp_link.transf(gp)*self.beta
+        return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
+
+    def _dnlog_mass_dgp(self,gp,obs):
+        alpha = self.gp_link.transf(gp)*self.beta
+        return self.gp_link.dtransf_df(gp)*self.beta*(special.psi(alpha) - np.log(obs) - np.log(self.beta))
+
+    def _d2nlog_mass_dgp2(self,gp,obs):
+        alpha = self.gp_link.transf(gp)*self.beta
+        return self.gp_link.d2transf_df2(gp)*self.beta*(special.psi(alpha) - np.log(obs) - np.log(self.beta)) + special.polygamma(1,alpha)*(self.gp_link.dtransf_df(gp)*self.beta)**2
+
+    def _mean(self,gp):
+        """
+        Expected value of the output: E(Y|f) = g(f)
+        """
+        return self.gp_link.transf(gp)
+
+    def _dmean_dgp(self,gp):
+        return self.gp_link.dtransf_df(gp)
+
+    def _d2mean_dgp2(self,gp):
+        return self.gp_link.d2transf_df2(gp)
+
+    def _variance(self,gp):
+        """
+        Variance of the output: V(Y|f) = g(f)/beta
+        """
+        return self.gp_link.transf(gp)/self.beta
+
+    def _dvariance_dgp(self,gp):
+        return self.gp_link.dtransf_df(gp)/self.beta
+
+    def _d2variance_dgp2(self,gp):
+        return self.gp_link.d2transf_df2(gp)/self.beta
diff --git a/GPy/likelihoods/noise_models/gaussian_noise.py b/GPy/likelihoods/noise_models/gaussian_noise.py
new file mode 100644
index 00000000..398ed32a
--- /dev/null
+++ b/GPy/likelihoods/noise_models/gaussian_noise.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2012, 2013 Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from scipy import stats,special
+import scipy as sp
+from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
+import gp_transformations
+from noise_distributions import NoiseDistribution
+
+class Gaussian(NoiseDistribution):
+    """
+    Gaussian likelihood
+
+    :param gp_link: transformation function from latent to output space
+    :param variance: variance of the Gaussian distribution
+    """
+    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False,variance=1.):
+        self.variance = variance
+        super(Gaussian, self).__init__(gp_link,analytical_mean,analytical_variance)
+
+    def _get_params(self):
+        return np.array([self.variance])
+
+    def _get_param_names(self):
+        return ['noise_model_variance']
+
+    def _set_params(self,p):
+        self.variance = p
+
+    def _gradients(self,partial):
+        return np.zeros(1)
+        #return np.sum(partial)
+
+    def _preprocess_values(self,Y):
+        """
+        Check if the values of the observations correspond to the values
+        assumed by the likelihood function.
+        """
+        return Y
+
+    def _moments_match_analytical(self,data_i,tau_i,v_i):
+        """
+        Moments match of the marginal approximation in EP algorithm
+
+        :param data_i: ith observation (float)
+        :param tau_i: precision of the cavity distribution (float)
+        :param v_i: mean/variance of the cavity distribution (float)
+        """
+        sigma2_hat = 1./(1./self.variance + tau_i)
+        mu_hat = sigma2_hat*(data_i/self.variance + v_i)
+        sum_var = self.variance + 1./tau_i
+        Z_hat = 1./np.sqrt(2.*np.pi*sum_var)*np.exp(-.5*(data_i - v_i/tau_i)**2./sum_var)
+        return Z_hat, mu_hat, sigma2_hat
+
+    def _predictive_mean_analytical(self,mu,sigma):
+        new_sigma2 = self.predictive_variance(mu,sigma)
+        return new_sigma2*(mu/sigma**2 + self.gp_link.transf(mu)/self.variance)
+
+    def _predictive_variance_analytical(self,mu,sigma,*args): # *args absorbs the predictive_mean that predictive_values passes in
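+        # harmonic sum of the noise variance and the latent variance, i.e. the
+        # variance of the product of the two Gaussian densities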
+ return 1./(1./self.variance + 1./sigma**2) + + def _mass(self,gp,obs): + #return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) ) + return stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance)) #FIXME + + def _nlog_mass(self,gp,obs): + return .5*((self.gp_link.transf(gp)-obs)**2/self.variance + np.log(2.*np.pi*self.variance)) + + def _dnlog_mass_dgp(self,gp,obs): + return (self.gp_link.transf(gp)-obs)/self.variance * self.gp_link.dtransf_df(gp) + + def _d2nlog_mass_dgp2(self,gp,obs): + return ((self.gp_link.transf(gp)-obs)*self.gp_link.d2transf_df2(gp) + self.gp_link.dtransf_df(gp)**2)/self.variance + + def _mean(self,gp): + """ + Mass (or density) function + """ + return self.gp_link.transf(gp) + + def _dmean_dgp(self,gp): + return self.gp_link.dtransf_df(gp) + + def _d2mean_dgp2(self,gp): + return self.gp_link.d2transf_df2(gp) + + def _variance(self,gp): + """ + Mass (or density) function + """ + return self.variance + + def _dvariance_dgp(self,gp): + return 0 + + def _d2variance_dgp2(self,gp): + return 0 diff --git a/GPy/likelihoods/noise_models/gp_transformations.py b/GPy/likelihoods/noise_models/gp_transformations.py new file mode 100644 index 00000000..5f337aa4 --- /dev/null +++ b/GPy/likelihoods/noise_models/gp_transformations.py @@ -0,0 +1,125 @@ +# Copyright (c) 2012, 2013 Ricardo Andrade +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from scipy import stats +import scipy as sp +import pylab as pb +from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf,inv_std_norm_cdf + +class GPTransformation(object): + """ + Link function class for doing non-Gaussian likelihoods approximation + + :param Y: observed output (Nx1 numpy.darray) + ..Note:: Y values allowed depend on the likelihood_function used + """ + def __init__(self): + pass + +class Identity(GPTransformation): + """ + $$ + g(f) = f + $$ + """ + #def transf(self,mu): + # return mu + + def transf(self,f): + return f + + def dtransf_df(self,f): + return 1. 
+
+    def d2transf_df2(self,f):
+        return 0
+
+
+class Probit(GPTransformation):
+    """
+    $$
+    g(f) = \Phi(f)
+    $$
+    """
+    #def transf(self,mu):
+    #    return inv_std_norm_cdf(mu)
+
+    def transf(self,f):
+        return std_norm_cdf(f)
+
+    def dtransf_df(self,f):
+        return std_norm_pdf(f)
+
+    def d2transf_df2(self,f):
+        return -f * std_norm_pdf(f)
+
+class Log(GPTransformation):
+    """
+    $$
+    g(f) = \exp(f)
+    $$
+    """
+    #def transf(self,mu):
+    #    return np.log(mu)
+
+    def transf(self,f):
+        return np.exp(f)
+
+    def dtransf_df(self,f):
+        return np.exp(f)
+
+    def d2transf_df2(self,f):
+        return np.exp(f)
+
+class Log_ex_1(GPTransformation):
+    """
+    $$
+    g(f) = \log(1 + \exp(f))
+    $$
+    """
+    #def transf(self,mu):
+    #    """
+    #    function: output space -> latent space
+    #    """
+    #    return np.log(np.exp(mu) - 1)
+
+    def transf(self,f):
+        """
+        function: latent space -> output space
+        """
+        return np.log(1.+np.exp(f))
+
+    def dtransf_df(self,f):
+        return np.exp(f)/(1.+np.exp(f))
+
+    def d2transf_df2(self,f):
+        aux = np.exp(f)/(1.+np.exp(f))
+        return aux*(1.-aux)
+
+class Reciprocal(GPTransformation):
+    def transf(self,f):
+        return 1./f
+
+    def dtransf_df(self,f):
+        return -1./f**2
+
+    def d2transf_df2(self,f):
+        return 2./f**3
+
+class Step(GPTransformation):
+    """
+    $$
+    g(f) = 1 if f > 0, else -1
+    $$
+    """
+    def transf(self,f):
+        #transformation goes here
+        return np.where(f>0, 1, -1)
+
+    def dtransf_df(self,f):
+        pass
+
+    def d2transf_df2(self,f):
+        pass
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
new file mode 100644
index 00000000..9d4eedfb
--- /dev/null
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -0,0 +1,387 @@
+# Copyright (c) 2012, 2013 Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from scipy import stats,special
+import scipy as sp
+import pylab as pb
+from GPy.util.plot import gpplot
+from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
+import gp_transformations
+
+
+class NoiseDistribution(object):
+    """
+    Base class for the noise models used with Expectation Propagation
+
+    :param gp_link: transformation function from latent to output space
+    ..Note:: subclasses provide the likelihood p(y|f) and its derivatives
+    """
+    def __init__(self,gp_link,analytical_mean=False,analytical_variance=False):
+        #assert isinstance(gp_link,gp_transformations.GPTransformation), "gp_link is not a valid GPTransformation."#FIXME
+        self.gp_link = gp_link
+        self.analytical_mean = analytical_mean
+        self.analytical_variance = analytical_variance
+        if self.analytical_mean:
+            self.moments_match = self._moments_match_analytical
+            self.predictive_mean = self._predictive_mean_analytical
+        else:
+            self.moments_match = self._moments_match_numerical
+            self.predictive_mean = self._predictive_mean_numerical
+        if self.analytical_variance:
+            self.predictive_variance = self._predictive_variance_analytical
+        else:
+            self.predictive_variance = self._predictive_variance_numerical
+
+    def _get_params(self):
+        return np.zeros(0)
+
+    def _get_param_names(self):
+        return []
+
+    def _set_params(self,p):
+        pass
+
+    def _gradients(self,partial):
+        return np.zeros(0)
+
+    def _preprocess_values(self,Y):
+        """
+        In case it is needed, this function assesses the output values or makes any pertinent transformation on them.
+
+        :param Y: observed output (Nx1 numpy.darray)
+        """
+        return Y
+
+    def _product(self,gp,obs,mu,sigma):
+        """
+        Product between the cavity distribution and a likelihood factor.
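+        This is the integrand of the EP tilted distribution,
+        q_(f_i)*p(y_i|f_i), whose moments are matched at each site update.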
+
+        :param gp: latent variable
+        :param obs: observed output
+        :param mu: cavity distribution mean
+        :param sigma: cavity distribution standard deviation
+        """
+        return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._mass(gp,obs)
+
+    def _nlog_product_scaled(self,gp,obs,mu,sigma):
+        """
+        Negative log-product between the cavity distribution and a likelihood factor.
+        ..Note:: The constant term in the Gaussian distribution is ignored.
+
+        :param gp: latent variable
+        :param obs: observed output
+        :param mu: cavity distribution mean
+        :param sigma: cavity distribution standard deviation
+        """
+        return .5*((gp-mu)/sigma)**2 + self._nlog_mass(gp,obs)
+
+    def _dnlog_product_dgp(self,gp,obs,mu,sigma):
+        """
+        Derivative wrt the latent variable of the negative log-product between the cavity distribution and a likelihood factor.
+
+        :param gp: latent variable
+        :param obs: observed output
+        :param mu: cavity distribution mean
+        :param sigma: cavity distribution standard deviation
+        """
+        return (gp - mu)/sigma**2 + self._dnlog_mass_dgp(gp,obs)
+
+    def _d2nlog_product_dgp2(self,gp,obs,mu,sigma):
+        """
+        Second derivative wrt the latent variable of the negative log-product between the cavity distribution and a likelihood factor.
+
+        :param gp: latent variable
+        :param obs: observed output
+        :param mu: cavity distribution mean
+        :param sigma: cavity distribution standard deviation
+        """
+        return 1./sigma**2 + self._d2nlog_mass_dgp2(gp,obs)
+
+    def _product_mode(self,obs,mu,sigma):
+        """
+        Newton's CG method to find the mode in _product (cavity x likelihood factor).
+
+        :param obs: observed output
+        :param mu: cavity distribution mean
+        :param sigma: cavity distribution standard deviation
+        """
+        return sp.optimize.fmin_ncg(self._nlog_product_scaled,x0=mu,fprime=self._dnlog_product_dgp,fhess=self._d2nlog_product_dgp2,args=(obs,mu,sigma))
+
+    def _moments_match_analytical(self,obs,tau,v):
+        """
+        If available, this function computes the moments analytically.
+        """
+        pass
+
+    def _moments_match_numerical(self,obs,tau,v):
+        """
+        Laplace approximation to calculate the moments.
+
+        :param obs: observed output
+        :param tau: cavity distribution 1st natural parameter (precision)
+        :param v: cavity distribution 2nd natural parameter (mu*precision)
+        """
+        mu = v/tau
+        mu_hat = self._product_mode(obs,mu,np.sqrt(1./tau))
+        sigma2_hat = 1./(tau + self._d2nlog_mass_dgp2(mu_hat,obs))
+        Z_hat = np.exp(-.5*tau*(mu_hat-mu)**2) * self._mass(mu_hat,obs)*np.sqrt(tau*sigma2_hat)
+        return Z_hat,mu_hat,sigma2_hat
+
+    def _nlog_conditional_mean_scaled(self,gp,mu,sigma):
+        """
+        Negative logarithm of the l.v.'s predictive distribution times the output's mean given the l.v.
+
+        :param gp: latent variable
+        :param mu: cavity distribution mean
+        :param sigma: cavity distribution standard deviation
+
+        ..Note:: This function helps computing E(Y_star) = E(E(Y_star|f_star))
+        """
+        return .5*((gp - mu)/sigma)**2 - np.log(self._mean(gp))
+
+    def _dnlog_conditional_mean_dgp(self,gp,mu,sigma):
+        """
+        Derivative of _nlog_conditional_mean_scaled wrt. l.v.
+
+        :param gp: latent variable
+        :param mu: cavity distribution mean
+        :param sigma: cavity distribution standard deviation
+        """
+        return (gp - mu)/sigma**2 - self._dmean_dgp(gp)/self._mean(gp)
+
+    def _d2nlog_conditional_mean_dgp2(self,gp,mu,sigma):
+        """
+        Second derivative of _nlog_conditional_mean_scaled wrt. l.v.
+ + :param gp: latent variable + :param mu: cavity distribution mean + :param sigma: cavity distribution standard deviation + """ + return 1./sigma**2 - self._d2mean_dgp2(gp)/self._mean(gp) + (self._dmean_dgp(gp)/self._mean(gp))**2 + + def _nlog_exp_conditional_variance_scaled(self,gp,mu,sigma): + """ + Negative logarithm of the l.v.'s predictive distribution times the output's variance given the l.v. + + :param gp: latent variable + :param mu: cavity distribution mean + :param sigma: cavity distribution standard deviation + + ..Note:: This function helps computing E(V(Y_star|f_star)) + """ + return .5*((gp - mu)/sigma)**2 - np.log(self._variance(gp)) + + def _dnlog_exp_conditional_variance_dgp(self,gp,mu,sigma): + """ + Derivative of _nlog_exp_conditional_variance_scaled wrt. l.v. + + :param gp: latent variable + :param mu: cavity distribution mean + :param sigma: cavity distribution standard deviation + """ + return (gp - mu)/sigma**2 - self._dvariance_dgp(gp)/self._variance(gp) + + def _d2nlog_exp_conditional_variance_dgp2(self,gp,mu,sigma): + """ + Second derivative of _nlog_exp_conditional_variance_scaled wrt. l.v. + + :param gp: latent variable + :param mu: cavity distribution mean + :param sigma: cavity distribution standard deviation + """ + return 1./sigma**2 - self._d2variance_dgp2(gp)/self._variance(gp) + (self._dvariance_dgp(gp)/self._variance(gp))**2 + + def _nlog_exp_conditional_mean_sq_scaled(self,gp,mu,sigma): + """ + Negative logarithm of the l.v.'s predictive distribution times the output's mean squared given the l.v. + + :param gp: latent variable + :param mu: cavity distribution mean + :param sigma: cavity distribution standard deviation + + ..Note:: This function helps computing E( E(Y_star|f_star)**2 ) + """ + return .5*((gp - mu)/sigma)**2 - 2*np.log(self._mean(gp)) + + def _dnlog_exp_conditional_mean_sq_dgp(self,gp,mu,sigma): + """ + Derivative of _nlog_exp_conditional_mean_sq_scaled wrt. l.v. + + :param gp: latent variable + :param mu: cavity distribution mean + :param sigma: cavity distribution standard deviation + """ + return (gp - mu)/sigma**2 - 2*self._dmean_dgp(gp)/self._mean(gp) + + def _d2nlog_exp_conditional_mean_sq_dgp2(self,gp,mu,sigma): + """ + Second derivative of _nlog_exp_conditional_mean_sq_scaled wrt. l.v. + + :param gp: latent variable + :param mu: cavity distribution mean + :param sigma: cavity distribution standard deviation + """ + return 1./sigma**2 - 2*( self._d2mean_dgp2(gp)/self._mean(gp) - (self._dmean_dgp(gp)/self._mean(gp))**2 ) + + def _predictive_mean_analytical(self,mu,sigma): + """ + If available, this function computes the predictive mean analytically. + """ + pass + + def _predictive_variance_analytical(self,mu,sigma): + """ + If available, this function computes the predictive variance analytically. 
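+        Noise models that provide a closed form advertise it through the
+        analytical_variance flag passed to __init__.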
+ """ + pass + + def _predictive_mean_numerical(self,mu,sigma): + """ + Laplace approximation to the predictive mean: E(Y_star) = E( E(Y_star|f_star) ) + + :param mu: cavity distribution mean + :param sigma: cavity distribution standard deviation + """ + maximum = sp.optimize.fmin_ncg(self._nlog_conditional_mean_scaled,x0=self._mean(mu),fprime=self._dnlog_conditional_mean_dgp,fhess=self._d2nlog_conditional_mean_dgp2,args=(mu,sigma)) + mean = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma))*sigma) + """ + + pb.figure() + x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)]) + f = np.array([np.exp(-self._nlog_conditional_mean_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x]) + pb.plot(x,f,'b-') + sigma2 = 1./self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma) + f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2) + k = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2) + pb.plot(x,f2*mean,'r-') + pb.vlines(maximum,0,f.max()) + """ + return mean + + def _predictive_mean_sq(self,mu,sigma): + """ + Laplace approximation to the predictive mean squared: E(Y_star**2) = E( E(Y_star|f_star)**2 ) + + :param mu: cavity distribution mean + :param sigma: cavity distribution standard deviation + """ + maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_mean_sq_scaled,x0=self._mean(mu),fprime=self._dnlog_exp_conditional_mean_sq_dgp,fhess=self._d2nlog_exp_conditional_mean_sq_dgp2,args=(mu,sigma)) + mean_squared = np.exp(-self._nlog_exp_conditional_mean_sq_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_mean_sq_dgp2(maximum,mu,sigma))*sigma) + return mean_squared + + def _predictive_variance_numerical(self,mu,sigma,predictive_mean=None): + """ + Laplace approximation to the predictive variance: V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) ) + + :param mu: cavity distribution mean + :param sigma: cavity distribution standard deviation + :predictive_mean: output's predictive mean, if None _predictive_mean function will be called. 
+ """ + # E( V(Y_star|f_star) ) + maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_variance_scaled,x0=self._variance(mu),fprime=self._dnlog_exp_conditional_variance_dgp,fhess=self._d2nlog_exp_conditional_variance_dgp2,args=(mu,sigma)) + exp_var = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma))*sigma) + + """ + pb.figure() + x = np.array([mu + step*sigma for step in np.linspace(-7,7,100)]) + f = np.array([np.exp(-self._nlog_exp_conditional_variance_scaled(xi,mu,sigma))/np.sqrt(2*np.pi*sigma**2) for xi in x]) + pb.plot(x,f,'b-') + sigma2 = 1./self._d2nlog_exp_conditional_variance_dgp2(maximum,mu,sigma) + f2 = np.exp(-.5*(x-maximum)**2/sigma2)/np.sqrt(2*np.pi*sigma2) + k = np.exp(-self._nlog_exp_conditional_variance_scaled(maximum,mu,sigma))*np.sqrt(sigma2)/np.sqrt(sigma**2) + pb.plot(x,f2*exp_var,'r--') + pb.vlines(maximum,0,f.max()) + """ + + #V( E(Y_star|f_star) ) = E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star)**2 ) + exp_exp2 = self._predictive_mean_sq(mu,sigma) + if predictive_mean is None: + predictive_mean = self.predictive_mean(mu,sigma) + var_exp = exp_exp2 - predictive_mean**2 + return exp_var + var_exp + + def _predictive_percentiles(self,p,mu,sigma): + """ + Percentiles of the predictive distribution + + :parm p: lower tail probability + :param mu: cavity distribution mean + :param sigma: cavity distribution standard deviation + :predictive_mean: output's predictive mean, if None _predictive_mean function will be called. + """ + qf = stats.norm.ppf(p,mu,sigma) + return self.gp_link.transf(qf) + + def _nlog_joint_predictive_scaled(self,x,mu,sigma): + """ + Negative logarithm of the joint predictive distribution (latent variable and output). + + :param x: tuple (latent variable,output) + :param mu: latent variable's predictive mean + :param sigma: latent variable's predictive standard deviation + """ + return self._nlog_product_scaled(x[0],x[1],mu,sigma) + + def _gradient_nlog_joint_predictive(self,x,mu,sigma): + """ + Gradient of _nlog_joint_predictive_scaled. + + :param x: tuple (latent variable,output) + :param mu: latent variable's predictive mean + :param sigma: latent variable's predictive standard deviation + ..Note: Only avilable when the output is continuous + """ + assert not self.discrete, "Gradient not available for discrete outputs." + return np.array((self._dnlog_product_dgp(gp=x[0],obs=x[1],mu=mu,sigma=sigma),self._dnlog_mass_dobs(obs=x[1],gp=x[0]))) + + def _hessian_nlog_joint_predictive(self,x,mu,sigma): + """ + Hessian of _nlog_joint_predictive_scaled. + + :param x: tuple (latent variable,output) + :param mu: latent variable's predictive mean + :param sigma: latent variable's predictive standard deviation + ..Note: Only avilable when the output is continuous + """ + assert not self.discrete, "Hessian not available for discrete outputs." + cross_derivative = self._d2nlog_mass_dcross(gp=x[0],obs=x[1]) + return np.array((self._d2nlog_product_dgp2(gp=x[0],obs=x[1],mu=mu,sigma=sigma),cross_derivative,cross_derivative,self._d2nlog_mass_dobs2(obs=x[1],gp=x[0]))).reshape(2,2) + + def _joint_predictive_mode(self,mu,sigma): + """ + Negative logarithm of the joint predictive distribution (latent variable and output). 
+
+        :param x: tuple (latent variable,output)
+        :param mu: latent variable's predictive mean
+        :param sigma: latent variable's predictive standard deviation
+        """
+        return sp.optimize.fmin_ncg(self._nlog_joint_predictive_scaled,x0=(mu,self.gp_link.transf(mu)),fprime=self._gradient_nlog_joint_predictive,fhess=self._hessian_nlog_joint_predictive,args=(mu,sigma))
+
+    def predictive_values(self,mu,var):
+        """
+        Compute mean, variance and confidence interval (percentiles 2.5 and 97.5) of the prediction
+        :param mu: mean of the latent variable
+        :param var: variance of the latent variable
+        """
+        if isinstance(mu,float) or isinstance(mu,int):
+            mu = [mu]
+            var = [var]
+        pred_mean = []
+        pred_var = []
+        q1 = []
+        q3 = []
+        for m,s in zip(mu,np.sqrt(var)):
+            pred_mean.append(self.predictive_mean(m,s))
+            pred_var.append(self.predictive_variance(m,s,pred_mean[-1]))
+            q1.append(self._predictive_percentiles(.025,m,s))
+            q3.append(self._predictive_percentiles(.975,m,s))
+        pred_mean = np.vstack(pred_mean)
+        pred_var = np.vstack(pred_var)
+        q1 = np.vstack(q1)
+        q3 = np.vstack(q3)
+        return pred_mean, pred_var, q1, q3
diff --git a/GPy/likelihoods/noise_models/poisson_noise.py b/GPy/likelihoods/noise_models/poisson_noise.py
new file mode 100644
index 00000000..e4ce90d3
--- /dev/null
+++ b/GPy/likelihoods/noise_models/poisson_noise.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2012, 2013 Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from scipy import stats,special
+import scipy as sp
+from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
+import gp_transformations
+from noise_distributions import NoiseDistribution
+
+class Poisson(NoiseDistribution):
+    """
+    Poisson likelihood
+    Y is expected to take values in {0,1,2,...}
+    -----
+    $$
+    L(x) = \exp(-\lambda) * \lambda**Y_i / Y_i!
+    $$
+    """
+    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
+        #self.discrete = True
+        #self.support_limits = (0,np.inf)
+
+        #self.analytical_mean = False
+        super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance)
+
+    def _preprocess_values(self,Y): #TODO
+        #self.scale = .5*Y.max()
+        #self.shift = Y.mean()
+        return Y #(Y - self.shift)/self.scale
+
+    def _mass(self,gp,obs):
+        """
+        Mass (or density) function
+        """
+        #obs = obs*self.scale + self.shift
+        return stats.poisson.pmf(obs,self.gp_link.transf(gp))
+
+    def _nlog_mass(self,gp,obs):
+        """
+        Negative logarithm of the un-normalized distribution: factors that are not a function of gp are omitted
+        """
+        return self.gp_link.transf(gp) - obs * np.log(self.gp_link.transf(gp)) + np.log(special.gamma(obs+1))
+
+    def _dnlog_mass_dgp(self,gp,obs):
+        return self.gp_link.dtransf_df(gp) * (1.
- obs/self.gp_link.transf(gp)) + + def _d2nlog_mass_dgp2(self,gp,obs): + d2_df = self.gp_link.d2transf_df2(gp) + transf = self.gp_link.transf(gp) + return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df + + def _dnlog_mass_dobs(self,obs,gp): #TODO not needed + return special.psi(obs+1) - np.log(self.gp_link.transf(gp)) + + def _d2nlog_mass_dobs2(self,obs,gp=None): #TODO not needed + return special.polygamma(1,obs) + + def _d2nlog_mass_dcross(self,obs,gp): #TODO not needed + return -self.gp_link.dtransf_df(gp)/self.gp_link.transf(gp) + + def _mean(self,gp): + """ + Mass (or density) function + """ + return self.gp_link.transf(gp) + + def _dmean_dgp(self,gp): + return self.gp_link.dtransf_df(gp) + + def _d2mean_dgp2(self,gp): + return self.gp_link.d2transf_df2(gp) + + def _variance(self,gp): + """ + Mass (or density) function + """ + return self.gp_link.transf(gp) + + def _dvariance_dgp(self,gp): + return self.gp_link.dtransf_df(gp) + + def _d2variance_dgp2(self,gp): + return self.gp_link.d2transf_df2(gp) diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index e535ed62..10ce577b 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -14,3 +14,5 @@ from warped_gp import WarpedGP from bayesian_gplvm import BayesianGPLVM from mrd import MRD from gradient_checker import GradientChecker +from gp_multioutput_regression import GPMultioutputRegression +from sparse_gp_multioutput_regression import SparseGPMultioutputRegression diff --git a/GPy/models/fitc_classification.py b/GPy/models/fitc_classification.py index f4cf4e8d..ee92a1b4 100644 --- a/GPy/models/fitc_classification.py +++ b/GPy/models/fitc_classification.py @@ -31,8 +31,8 @@ class FITCClassification(FITC): kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3) if likelihood is None: - distribution = likelihoods.likelihood_functions.Binomial() - likelihood = likelihoods.EP(Y, distribution) + noise_model = likelihoods.binomial() + likelihood = likelihoods.EP(Y, noise_model) elif Y is not None: if not all(Y.flatten() == likelihood.data.flatten()): raise Warning, 'likelihood.data and Y are different.' diff --git a/GPy/models/gp_classification.py b/GPy/models/gp_classification.py index c6012988..73d492fe 100644 --- a/GPy/models/gp_classification.py +++ b/GPy/models/gp_classification.py @@ -31,8 +31,8 @@ class GPClassification(GP): kernel = kern.rbf(X.shape[1]) if likelihood is None: - distribution = likelihoods.likelihood_functions.Binomial() - likelihood = likelihoods.EP(Y, distribution) + noise_model = likelihoods.binomial() + likelihood = likelihoods.EP(Y, noise_model) elif Y is not None: if not all(Y.flatten() == likelihood.data.flatten()): raise Warning, 'likelihood.data and Y are different.' diff --git a/GPy/models/gp_multioutput_regression.py b/GPy/models/gp_multioutput_regression.py new file mode 100644 index 00000000..d51f3bae --- /dev/null +++ b/GPy/models/gp_multioutput_regression.py @@ -0,0 +1,59 @@ +# Copyright (c) 2013, Ricardo Andrade +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from ..core import GP +from .. import likelihoods +from .. 
import kern
+#from ..util import multioutput
+
+class GPMultioutputRegression(GP):
+    """
+    Multiple output Gaussian process with Gaussian noise
+
+    This is a wrapper around the models.GP class, with a set of sensible defaults
+
+    :param X_list: input observations
+    :type X_list: list of numpy arrays (num_data_output_i x input_dim), one array per output
+    :param Y_list: observed values
+    :type Y_list: list of numpy arrays (num_data_output_i x 1), one array per output
+    :param kernel_list: GPy kernels, defaults to rbf
+    :type kernel_list: list of GPy kernels
+    :param noise_variance_list: noise parameters per output, defaults to 1.0 for every output
+    :type noise_variance_list: list of floats
+    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
+    :type normalize_X: False|True
+    :param normalize_Y: whether to normalize the output data before computing (predictions will be in original scales)
+    :type normalize_Y: False|True
+    :param W_columns: number of columns of the coregionalization matrix 'coregion_W' (see coregionalize kernel documentation)
+    :type W_columns: integer
+    """
+
+    def __init__(self,X_list,Y_list,kernel_list=None,noise_variance_list=None,normalize_X=False,normalize_Y=False,W_columns=1):
+
+        self.num_outputs = len(Y_list)
+        assert len(X_list) == self.num_outputs, 'Number of outputs does not match the length of the inputs list.'
+
+        #Inputs indexing
+        i = 0
+        index = []
+        for x,y in zip(X_list,Y_list):
+            assert x.shape[0] == y.shape[0]
+            index.append(np.repeat(i,x.size)[:,None])
+            i += 1
+        index = np.vstack(index)
+        X = np.hstack([np.vstack(X_list),index])
+        original_dim = X.shape[1] - 1
+
+        #Mixed noise likelihood definition
+        likelihood = likelihoods.Gaussian_Mixed_Noise(Y_list,noise_params=noise_variance_list,normalize=normalize_Y)
+
+        #Coregionalization kernel definition
+        if kernel_list is None:
+            kernel_list = [kern.rbf(original_dim)]
+        mkernel = kern.build_lcm(input_dim=original_dim, num_outputs=self.num_outputs, kernel_list = kernel_list, W_columns=W_columns)
+
+        self.multioutput = True
+        GP.__init__(self, X, likelihood, mkernel, normalize_X=normalize_X)
+        self.ensure_default_constraints()
diff --git a/GPy/models/sparse_gp_classification.py b/GPy/models/sparse_gp_classification.py
index 5f36ebe1..50c2f935 100644
--- a/GPy/models/sparse_gp_classification.py
+++ b/GPy/models/sparse_gp_classification.py
@@ -28,11 +28,11 @@ class SparseGPClassification(SparseGP):
     def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10):
         if kernel is None:
-            kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1], 1e-3)
+            kernel = kern.rbf(X.shape[1])# + kern.white(X.shape[1],1e-3)

         if likelihood is None:
-            distribution = likelihoods.likelihood_functions.Binomial()
-            likelihood = likelihoods.EP(Y, distribution)
+            noise_model = likelihoods.binomial()
+            likelihood = likelihoods.EP(Y, noise_model)
         elif Y is not None:
             if not all(Y.flatten() == likelihood.data.flatten()):
                 raise Warning, 'likelihood.data and Y are different.'
diff --git a/GPy/models/sparse_gp_multioutput_regression.py b/GPy/models/sparse_gp_multioutput_regression.py
new file mode 100644
index 00000000..041204b6
--- /dev/null
+++ b/GPy/models/sparse_gp_multioutput_regression.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2013, Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from ..core import SparseGP
+from .. import likelihoods
+from ..
import kern
+from ..util import multioutput
+
+class SparseGPMultioutputRegression(SparseGP):
+    """
+    Sparse multiple output Gaussian process with Gaussian noise
+
+    This is a wrapper around the models.SparseGP class, with a set of sensible defaults
+
+    :param X_list: input observations
+    :type X_list: list of numpy arrays (num_data_output_i x input_dim), one array per output
+    :param Y_list: observed values
+    :type Y_list: list of numpy arrays (num_data_output_i x 1), one array per output
+    :param kernel_list: GPy kernels, defaults to rbf
+    :type kernel_list: list of GPy kernels
+    :param noise_variance_list: noise parameters per output, defaults to 1.0 for every output
+    :type noise_variance_list: list of floats
+    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
+    :type normalize_X: False|True
+    :param normalize_Y: whether to normalize the output data before computing (predictions will be in original scales)
+    :type normalize_Y: False|True
+    :param Z_list: inducing inputs (optional)
+    :type Z_list: list of numpy arrays (num_inducing_output_i x input_dim), one array per output | None
+    :param num_inducing: number of inducing inputs per output, defaults to 10 (ignored if Z_list is given)
+    :type num_inducing: integer
+    :param W_columns: number of columns of the coregionalization matrix 'coregion_W' (see coregionalize kernel documentation)
+    :type W_columns: integer
+    """
+    #NOTE not tested with uncertain inputs
+    def __init__(self,X_list,Y_list,kernel_list=None,noise_variance_list=None,normalize_X=False,normalize_Y=False,Z_list=None,num_inducing=10,W_columns=1):
+
+        self.num_outputs = len(Y_list)
+        assert len(X_list) == self.num_outputs, 'Number of outputs does not match the length of the inputs list.'
+
+        #Inducing inputs list (a mutable default argument is avoided by defaulting Z_list to None)
+        if Z_list:
+            assert len(Z_list) == self.num_outputs, 'Number of outputs does not match the length of the inducing inputs list.'
+        else:
+            Z_list = []
+            if isinstance(num_inducing,np.int):
+                num_inducing = [num_inducing] * self.num_outputs
+            num_inducing = np.asarray(num_inducing)
+            assert num_inducing.size == self.num_outputs, 'Number of outputs does not match the length of the inducing inputs list.'
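+            # default inducing inputs: a random subset of each output's observed inputs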
diff --git a/GPy/models/sparse_gp_multioutput_regression.py b/GPy/models/sparse_gp_multioutput_regression.py
new file mode 100644
index 00000000..041204b6
--- /dev/null
+++ b/GPy/models/sparse_gp_multioutput_regression.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2013, Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from ..core import SparseGP
+from .. import likelihoods
+from .. import kern
+from ..util import multioutput
+
+class SparseGPMultioutputRegression(SparseGP):
+    """
+    Sparse multiple output Gaussian process with Gaussian noise
+
+    This is a wrapper around the models.SparseGP class, with a set of sensible defaults
+
+    :param X_list: input observations
+    :type X_list: list of numpy arrays (num_data_output_i x input_dim), one array per output
+    :param Y_list: observed values
+    :type Y_list: list of numpy arrays (num_data_output_i x 1), one array per output
+    :param kernel_list: GPy kernels, defaults to rbf
+    :type kernel_list: list of GPy kernels
+    :param noise_variance_list: noise parameters per output, defaults to 1.0 for every output
+    :type noise_variance_list: list of floats
+    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
+    :type normalize_X: False|True
+    :param normalize_Y: whether to normalize the output data before computing (predictions will be in original scales)
+    :type normalize_Y: False|True
+    :param Z_list: inducing inputs (optional)
+    :type Z_list: list of numpy arrays (num_inducing_output_i x input_dim), one array per output | None
+    :param num_inducing: number of inducing inputs per output, defaults to 10 (ignored if Z_list is given)
+    :type num_inducing: integer
+    :param W_columns: number of columns of the coregionalization matrix 'coregion_W' (see coregionalize kernel documentation)
+    :type W_columns: integer
+    """
+    #NOTE not tested with uncertain inputs
+    def __init__(self,X_list,Y_list,kernel_list=None,noise_variance_list=None,normalize_X=False,normalize_Y=False,Z_list=None,num_inducing=10,W_columns=1):
+
+        self.num_outputs = len(Y_list)
+        assert len(X_list) == self.num_outputs, 'Number of outputs does not match length of inputs list.'
+
+        #Inducing inputs list (default is None rather than [], so a shared default list is never mutated below)
+        if Z_list is None:
+            Z_list = []
+        if len(Z_list):
+            assert len(Z_list) == self.num_outputs, 'Number of outputs does not match length of inducing inputs list.'
+        else:
+            if isinstance(num_inducing,np.int):
+                num_inducing = [num_inducing] * self.num_outputs
+            num_inducing = np.asarray(num_inducing)
+            assert num_inducing.size == self.num_outputs, 'Number of outputs does not match length of num_inducing list.'
+            for ni,X in zip(num_inducing,X_list):
+                i = np.random.permutation(X.shape[0])[:ni]
+                Z_list.append(X[i].copy())
+
+        #Inputs and inducing inputs indexing: one index entry per data row
+        i = 0
+        index = []
+        index_z = []
+        for x,y,z in zip(X_list,Y_list,Z_list):
+            assert x.shape[0] == y.shape[0]
+            index.append(np.repeat(i,x.shape[0])[:,None]) # x.size over-counts when input_dim > 1
+            index_z.append(np.repeat(i,z.shape[0])[:,None])
+            i += 1
+        index = np.vstack(index)
+        index_z = np.vstack(index_z)
+        X = np.hstack([np.vstack(X_list),index])
+        Z = np.hstack([np.vstack(Z_list),index_z])
+        original_dim = X.shape[1] - 1
+
+        #Mixed noise likelihood definition
+        likelihood = likelihoods.Gaussian_Mixed_Noise(Y_list,noise_params=noise_variance_list,normalize=normalize_Y)
+
+        #Coregionalization kernel definition
+        if kernel_list is None:
+            kernel_list = [kern.rbf(original_dim)]
+        mkernel = kern.build_lcm(input_dim=original_dim, num_outputs=self.num_outputs, kernel_list = kernel_list, W_columns=W_columns)
+
+        self.multioutput = True
+        SparseGP.__init__(self, X, likelihood, mkernel, Z=Z, normalize_X=normalize_X)
+        self.constrain_fixed('.*iip_\d+_1')
+        self.ensure_default_constraints()
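A minimal usage sketch for the sparse variant, analogous to the dense example above (toy data; inducing inputs are drawn at random from each X_i when Z_list is omitted):

    import numpy as np
    import GPy

    X1 = np.random.rand(100, 1) * 8
    X2 = np.random.rand(80, 1) * 5
    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
    Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05

    # 10 inducing inputs per output, selected automatically
    m = GPy.models.SparseGPMultioutputRegression(X_list=[X1, X2], Y_list=[Y1, Y2],
                                                 kernel_list=[GPy.kern.rbf(1)],
                                                 num_inducing=10)
    m.optimize()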
unittest.main()
diff --git a/GPy/models/sparse_gp_regression.py b/GPy/models/sparse_gp_regression.py
index 64674f4a..d2e23887 100644
--- a/GPy/models/sparse_gp_regression.py
+++ b/GPy/models/sparse_gp_regression.py
@@ -20,7 +20,11 @@ class SparseGPRegression(SparseGP):
     :type normalize_X: False|True
     :param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
     :type normalize_Y: False|True
+    :param Z: inducing inputs (optional, see note)
+    :type Z: np.ndarray (num_inducing x input_dim) | None
+    :param X_variance: the uncertainty in the measurements of X (Gaussian variance)
+    :type X_variance: np.ndarray (num_data x input_dim) | None
     :rtype: model object
 
     .. Note:: Multiple independent outputs are allowed using columns of Y
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index 3f81f472..9329aba0 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -4,15 +4,14 @@ import unittest
 import numpy as np
 import GPy
 
 verbose = False
 
 class KernelTests(unittest.TestCase):
     def test_kerneltie(self):
         K = GPy.kern.rbf(5, ARD=True)
         K.tie_params('.*[01]')
         K.constrain_fixed('2')
-
         X = np.random.rand(5,5)
         Y = np.ones((5,1))
         m = GPy.models.GPRegression(X,Y,K)
@@ -95,7 +94,6 @@ class KernelTests(unittest.TestCase):
         self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
 
-
 if __name__ == "__main__":
     print "Running unit tests, please be (very) patient..."
     unittest.main()
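The kernel gradient checks these tests rely on can be reproduced interactively; a minimal sketch using the same calls (the tie/constrain patterns are regular expressions over parameter names):

    import numpy as np
    import GPy

    K = GPy.kern.rbf(5, ARD=True)
    K.tie_params('.*[01]')   # tie the parameters matching lengthscales 0 and 1
    K.constrain_fixed('2')   # fix parameter 2

    X = np.random.rand(5, 5)
    Y = np.ones((5, 1))
    m = GPy.models.GPRegression(X, Y, K)
    assert m.checkgrad()     # finite-difference check of the analytic gradients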
diff --git a/GPy/testing/psi_stat_expectation_tests.py b/GPy/testing/psi_stat_expectation_tests.py
index da71754b..30ca14d6 100644
--- a/GPy/testing/psi_stat_expectation_tests.py
+++ b/GPy/testing/psi_stat_expectation_tests.py
@@ -7,9 +7,14 @@ import unittest
 import GPy
 import numpy as np
 from GPy import testing
+import sys
+import numpy
+from GPy.kern.parts.rbf import RBF
+from GPy.kern.parts.linear import Linear
+from copy import deepcopy
 
-__test__ = False
-np.random.seed(0)
+__test__ = lambda: 'deep' in sys.argv
+# np.random.seed(0)
 
 def ard(p):
     try:
@@ -19,24 +24,37 @@ def ard(p):
         pass
     return ""
 
-@testing.deepTest(__test__)
+@testing.deepTest(__test__())
 class Test(unittest.TestCase):
     input_dim = 9
     num_inducing = 4
     N = 3
-    Nsamples = 6e6
+    Nsamples = 5e6
 
     def setUp(self):
+        i_s_dim_list = [2,4,3]
+        indices = numpy.cumsum(i_s_dim_list).tolist()
+        input_slices = [slice(a,b) for a,b in zip([None]+indices, indices)]
+        #input_slices[2] = deepcopy(input_slices[1])
+        input_slice_kern = GPy.kern.kern(9,
+                                         [
+                                          RBF(i_s_dim_list[0], np.random.rand(), np.random.rand(i_s_dim_list[0]), ARD=True),
+                                          RBF(i_s_dim_list[1], np.random.rand(), np.random.rand(i_s_dim_list[1]), ARD=True),
+                                          Linear(i_s_dim_list[2], np.random.rand(i_s_dim_list[2]), ARD=True)
+                                         ],
+                                         input_slices=input_slices
+                                         )
         self.kerns = (
+                      input_slice_kern,
                       # (GPy.kern.rbf(self.input_dim, ARD=True) +
                       # GPy.kern.linear(self.input_dim, ARD=True) +
                       # GPy.kern.bias(self.input_dim) +
                       # GPy.kern.white(self.input_dim)),
-                      (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
-                       GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
-                       GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
-                       GPy.kern.bias(self.input_dim) +
-                       GPy.kern.white(self.input_dim)),
+#                      (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
+#                       GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
+#                       GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
+#                       GPy.kern.bias(self.input_dim) +
+#                       GPy.kern.white(self.input_dim)),
                       # GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
                       # GPy.kern.linear(self.input_dim, ARD=False), GPy.kern.linear(self.input_dim, ARD=True),
                       # GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim),
@@ -61,22 +79,22 @@ class Test(unittest.TestCase):
 
     def test_psi1(self):
         for kern in self.kerns:
-            Nsamples = 100
+            Nsamples = int(self.Nsamples // 300) # keep integral: used as an array shape and slice bound below
             psi1 = kern.psi1(self.Z, self.q_x_mean, self.q_x_variance)
             K_ = np.zeros((Nsamples, self.num_inducing))
             diffs = []
             for i, q_x_sample_stripe in enumerate(np.array_split(self.q_x_samples, self.Nsamples / Nsamples)):
-                K = kern.K(q_x_sample_stripe, self.Z)
+                K = kern.K(q_x_sample_stripe[:Nsamples], self.Z)
                 K_ += K
-                diffs.append(((psi1 - (K_ / (i + 1)))).mean())
+                diffs.append((np.abs(psi1 - (K_ / (i + 1)))**2).mean())
             K_ /= self.Nsamples / Nsamples
             msg = "psi1: " + "+".join([p.name + ard(p) for p in kern.parts])
             try:
                 import pylab
                 pylab.figure(msg)
                 pylab.plot(diffs)
-                self.assertTrue(np.allclose(psi1.squeeze(), K_,
-                                            rtol=1e-1, atol=.1),
+#                print msg, ((psi1.squeeze() - K_)**2).mean() < .01
+                self.assertTrue(((psi1.squeeze() - K_)**2).mean() < .01,
                                 msg=msg + ": not matching")
                 # sys.stdout.write(".")
             except:
@@ -87,7 +105,7 @@ class Test(unittest.TestCase):
 
     def test_psi2(self):
         for kern in self.kerns:
-            Nsamples = 100
+            Nsamples = int(self.Nsamples // 300)
             psi2 = kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
             K_ = np.zeros((self.num_inducing, self.num_inducing))
             diffs = []
@@ -95,13 +113,14 @@ class Test(unittest.TestCase):
                 K = kern.K(q_x_sample_stripe, self.Z)
                 K = (K[:, :, None] * K[:, None, :]).mean(0)
                 K_ += K
-                diffs.append(((psi2 - (K_ / (i + 1)))).mean())
+                diffs.append(((psi2 - (K_ / (i + 1)))**2).mean())
             K_ /= self.Nsamples / Nsamples
             msg = "psi2: {}".format("+".join([p.name + ard(p) for p in kern.parts]))
             try:
                 import pylab
                 pylab.figure(msg)
                 pylab.plot(diffs)
+#                print msg, np.allclose(psi2.squeeze(), K_, rtol=1e-1, atol=.1)
                 self.assertTrue(np.allclose(psi2.squeeze(), K_,
                                             rtol=1e-1, atol=.1),
                                 msg=msg + ": not matching")
@@ -114,10 +133,8 @@ class Test(unittest.TestCase):
                 pass
 
 if __name__ == "__main__":
-    import sys
-    __test__ = 'deep' in sys.argv
     sys.argv = ['',
-               'Test.test_psi0',
+               #'Test.test_psi0',
                'Test.test_psi1',
                'Test.test_psi2',
                ]
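In essence these tests check that psi1 is the expectation of the kernel under a Gaussian density over the inputs, psi1(Z, mu, S) ~ E_{N(x|mu,S)}[K(x, Z)]; a standalone Monte Carlo version of the check (a sketch with toy sizes and a diagonal variance; names follow the test above):

    import numpy as np
    import GPy

    input_dim, num_inducing = 2, 4
    kern = GPy.kern.rbf(input_dim, ARD=True)
    Z = np.random.randn(num_inducing, input_dim)
    q_x_mean = np.random.randn(1, input_dim)
    q_x_variance = np.random.rand(1, input_dim)

    psi1 = kern.psi1(Z, q_x_mean, q_x_variance)

    # Monte Carlo estimate of E[K(x, Z)] for x ~ N(q_x_mean, diag(q_x_variance))
    samples = q_x_mean + np.sqrt(q_x_variance) * np.random.randn(100000, input_dim)
    K_mc = kern.K(samples, Z).mean(0)
    print np.abs(psi1.squeeze() - K_mc).max()  # shrinks as the sample count grows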
unittest.main()
diff --git a/GPy/testing/unit_tests.py b/GPy/testing/unit_tests.py
index 246d40d1..6bb624df 100644
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@@ -5,7 +5,6 @@
 import unittest
 import numpy as np
 import GPy
-from GPy.likelihoods.likelihood_functions import Binomial
 
 class GradientTests(unittest.TestCase):
     def setUp(self):
@@ -199,10 +198,7 @@ class GradientTests(unittest.TestCase):
         X = np.hstack([np.random.normal(5, 2, N / 2), np.random.normal(10, 2, N / 2)])[:, None]
         Y = np.hstack([np.ones(N / 2), np.zeros(N / 2)])[:, None]
         kernel = GPy.kern.rbf(1)
-        distribution = GPy.likelihoods.likelihood_functions.Binomial()
-        likelihood = GPy.likelihoods.EP(Y, distribution)
-        m = GPy.core.GP(X, likelihood, kernel)
-        m.ensure_default_constraints()
+        m = GPy.models.GPClassification(X,Y,kernel=kernel)
         m.update_likelihood_approximation()
         self.assertTrue(m.checkgrad())
 
@@ -212,10 +208,11 @@ class GradientTests(unittest.TestCase):
         Y = np.hstack([np.ones(N / 2), np.zeros(N / 2)])[:, None]
         Z = np.linspace(0, 15, 4)[:, None]
         kernel = GPy.kern.rbf(1)
-        distribution = GPy.likelihoods.likelihood_functions.Binomial()
-        likelihood = GPy.likelihoods.EP(Y, distribution)
-        m = GPy.core.SparseGP(X, likelihood, kernel, Z)
-        m.ensure_default_constraints()
+        m = GPy.models.SparseGPClassification(X,Y,kernel=kernel,Z=Z)
+        #distribution = GPy.likelihoods.likelihood_functions.Binomial()
+        #likelihood = GPy.likelihoods.EP(Y, distribution)
+        #m = GPy.core.SparseGP(X, likelihood, kernel, Z)
+        #m.ensure_default_constraints()
         m.update_likelihood_approximation()
         self.assertTrue(m.checkgrad())
 
@@ -224,10 +221,24 @@ class GradientTests(unittest.TestCase):
         X = np.hstack([np.random.rand(N / 2) + 1, np.random.rand(N / 2) - 1])[:, None]
         k = GPy.kern.rbf(1) + GPy.kern.white(1)
         Y = np.hstack([np.ones(N/2),np.zeros(N/2)])[:,None]
-        m = GPy.models.FITCClassification(X, Y=Y)
+        m = GPy.models.FITCClassification(X, Y, kernel = k)
         m.update_likelihood_approximation()
         self.assertTrue(m.checkgrad())
 
+    def multioutput_regression_1D(self):
+        X1 = np.random.rand(50, 1) * 8
+        X2 = np.random.rand(30, 1) * 5
+        X = np.vstack((X1, X2))
+        Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
+        Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
+        Y = np.vstack((Y1, Y2))
+
+        k1 = GPy.kern.rbf(1)
+        m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1])
+        m.constrain_fixed('.*rbf_var', 1.)
+        self.assertTrue(m.checkgrad())
+
+
 if __name__ == "__main__":
     print "Running unit tests, please be (very) patient..."
     unittest.main()
diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index 99548268..c205548b 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -14,3 +14,4 @@ import visualize
 import decorators
 import classification
 import latent_space_visualizations
+#import multioutput
diff --git a/GPy/util/multioutput.py b/GPy/util/multioutput.py
new file mode 100644
index 00000000..2b06ba95
--- /dev/null
+++ b/GPy/util/multioutput.py
@@ -0,0 +1,37 @@
+import numpy as np
+import warnings
+from .. import kern
+
+def build_lcm(input_dim, num_outputs, CK=[], NC=[], W_columns=1, W=None, kappa=None):
+    #TODO build_icm or build_lcm
+    """
+    Builds a kernel for a linear coregionalization model
+
+    :param input_dim: input dimensionality
+    :param num_outputs: number of outputs
+    :param CK: list of coregionalized kernels (i.e., each will be multiplied by a coregionalise kernel)
+    :param NC: list of kernels that will be added up together with CK, but won't be multiplied by a coregionalise kernel
+    :param W_columns: number of columns of the coregionalization matrix 'coregion_W'
+    :type W_columns: integer
+    :param W: initial value of the coregionalization matrix (optional, passed to the coregionalise kernel)
+    :param kappa: initial value of the coregionalise kernel's kappa parameter (optional)
+    """
+
+    for k in CK:
+        if k.input_dim != input_dim:
+            k.input_dim = input_dim
+            warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
+
+    for k in NC:
+        if k.input_dim != input_dim + 1:
+            k.input_dim = input_dim + 1
+            warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
+
+    kernel = CK[0].prod(kern.coregionalise(num_outputs,W_columns,W,kappa),tensor=True)
+    for k in CK[1:]:
+        k_coreg = kern.coregionalise(num_outputs,W_columns,W,kappa)
+        kernel += k.prod(k_coreg,tensor=True)
+    for k in NC:
+        kernel += k
+
+    return kernel
diff --git a/GPy/util/univariate_Gaussian.py b/GPy/util/univariate_Gaussian.py
index 28946894..5a5880d5 100644
--- a/GPy/util/univariate_Gaussian.py
+++ b/GPy/util/univariate_Gaussian.py
@@ -32,4 +32,15 @@ def std_norm_cdf(x):
     x = float(x)
     return weave.inline(code,arg_names=['x'],support_code=support_code)
 
+def inv_std_norm_cdf(x):
+    """
+    Inverse cumulative standard Gaussian distribution (quantile function),
+    based on Winitzki's (2008) approximation of the inverse error function
+    """
+    z = 2*x - 1
+    ln1z2 = np.log(1-z**2)
+    a = 8*(np.pi - 3)/(3*np.pi*(4 - np.pi))
+    b = 2/(np.pi*a) + ln1z2/2
+    inv_erf = np.sign(z) * np.sqrt( np.sqrt(b**2 - ln1z2/a) - b )
+    return np.sqrt(2) * inv_erf
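A quick sanity check of the Winitzki approximation against SciPy's exact quantile function (assuming SciPy is available; the approximation trades a few decimal places of accuracy for a cheap closed form):

    import numpy as np
    from scipy.stats import norm
    from GPy.util.univariate_Gaussian import inv_std_norm_cdf

    p = np.linspace(0.01, 0.99, 99)
    err = np.abs(inv_std_norm_cdf(p) - norm.ppf(p))
    print err.max()  # small, but not machine precision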