diff --git a/GPy/core/fitc.py b/GPy/core/fitc.py
index ef171459..eac00fec 100644
--- a/GPy/core/fitc.py
+++ b/GPy/core/fitc.py
@@ -140,7 +140,6 @@ class FITC(SparseGP):
         dA_dnoise = 0.5 * self.input_dim * (dbstar_dnoise/self.beta_star).sum() - 0.5 * self.input_dim * np.sum(self.likelihood.Y**2 * dbstar_dnoise)
         dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T)
-        dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T)
         dD_dnoise_1 = mdot(self.V_star*LBiLmipsi1.T,LBiLmipsi1*dbstar_dnoise.T*self.likelihood.Y.T)
         alpha = mdot(LBiLmipsi1,self.V_star)
@@ -174,7 +173,7 @@ class FITC(SparseGP):
     def dL_dZ(self):
         dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T,self.Z,self.X)
-        dL_dZ += 2. * self.kern.dK_dX(self._dL_dKmm,X=self.Z)
+        dL_dZ += self.kern.dK_dX(self._dL_dKmm,X=self.Z)
         dL_dZ += self._dpsi1_dX
         dL_dZ += self._dKmm_dX
         return dL_dZ
diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 278ddc74..63903242 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -19,9 +19,6 @@ class GP(GPBase):
     :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
     :type normalize_X: False|True
     :rtype: model object
-    :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
-    :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
-    :type powerep: list

     .. Note:: Multiple independent outputs are allowed using columns of Y

@@ -105,7 +102,12 @@ class GP(GPBase):

         Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
         """
-        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+        # EP and non-EP likelihoods currently share the same gradient expression,
+        # so no isinstance(self.likelihood, EP) branching is needed here.
+        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))

     def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
         """
@@ -136,7 +138,7 @@ class GP(GPBase):
         :type Xnew: np.ndarray, Nnew x self.input_dim
         :param which_parts: specifies which outputs kernel(s) to use in prediction
         :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the folll covariance matrix, or just the diagonal
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
         :type full_cov: bool
         :rtype: posterior mean, a Numpy array, Nnew x self.input_dim
         :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
@@ -153,5 +155,70 @@ class GP(GPBase):

         # now push through likelihood
         mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
         return mean, var, _025pm, _975pm
+
+    def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
+        """
+        For a specific output, predict the function at the new point(s) Xnew.
+
+        Arguments
+        ---------
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.input_dim
+        :param output: output to predict
+        :type output: integer in {0,..., num_outputs-1}
+        :param which_parts: specifies which outputs kernel(s) to use in prediction
+        :type which_parts: ('all', list of bools)
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
+        :type full_cov: bool
+        :rtype: posterior mean, a Numpy array, Nnew x self.input_dim
+        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
+        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
+
+        .. Note:: For multiple output models only
+        """
+        assert hasattr(self, 'multioutput'), 'This function is for multiple output models only.'
+        index = np.ones_like(Xnew) * output
+        Xnew = np.hstack((Xnew, index))
+
+        # normalize X values
+        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
+        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
+
+        # now push through likelihood
+        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, noise_model=output)
+        return mean, var, _025pm, _975pm
+
+    def _raw_predict_single_output(self, _Xnew, output=0, which_parts='all', full_cov=False, stop=False):
+        """
+        Internal helper function for making predictions for a specific output,
+        does not account for normalization or likelihood
+        ---------
+
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.input_dim
+        :param output: output to predict
+        :type output: integer in {0,..., num_outputs-1}
+        :param which_parts: specifies which outputs kernel(s) to use in prediction
+        :type which_parts: ('all', list of bools)
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
+
+        .. Note:: For multiple output models only
+        """
+        assert hasattr(self, 'multioutput'), 'This function is for multiple output models only.'
+        # creates an index column and appends it to _Xnew
+        index = np.ones_like(_Xnew) * output
+        _Xnew = np.hstack((_Xnew, index))
+
+        Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T
+        KiKx, _ = dpotrs(self.L, np.asfortranarray(Kx), lower=1)
+        mu = np.dot(KiKx.T, self.likelihood.Y)
+        if full_cov:
+            Kxx = self.kern.K(_Xnew, which_parts=which_parts)
+            var = Kxx - np.dot(KiKx.T, Kx)
+        else:
+            Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
+            var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
+            var = var[:, None]
+        if stop:
+            debug_this # @UndefinedVariable
+        return mu, var
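A minimal usage sketch of the new per-output prediction path. The data and model construction are hypothetical, assuming the list-based GPMultioutputRegression model used in the examples further down this patch:

    import numpy as np
    import GPy

    X1, X2 = np.random.rand(50, 1) * 8, np.random.rand(30, 1) * 5
    Y1, Y2 = np.sin(X1), -np.sin(X2)
    m = GPy.models.GPMultioutputRegression(X_list=[X1, X2], Y_list=[Y1, Y2],
                                           kernel_list=[GPy.kern.rbf(1)])
    Xnew = np.linspace(0, 8, 100)[:, None]   # raw inputs; no index column needed
    mean, var, lower, upper = m.predict_single_output(Xnew, output=1)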
diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py
index 6935fc6f..e26deb0f 100644
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -57,18 +57,12 @@ class GPBase(Model):
         self.X = state.pop()
         Model.setstate(self, state)

-    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
+    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None, output=None):
         """
         Plot the GP's view of the world, where the data is normalized and the
         likelihood is Gaussian.

         Plot the posterior of the GP.
         - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
-        - In two dimsensions, a contour-plot shows the mean predicted function
-        - In higher dimensions, we've no implemented this yet !TODO!
-
-        Can plot only part of the data and part of the posterior functions
-        using which_data and which_functions
+        - In two dimensions, a contour-plot shows the mean predicted function
+        - Not implemented in higher dimensions

         :param samples: the number of a posteriori samples to plot
         :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
@@ -85,6 +79,8 @@ class GPBase(Model):
         :param ax: axes to plot on.
         :type ax: axes handle
+        :param output: which output to plot (for multiple output models only)
+        :type output: integer (first output is 0)
         """
         if which_data == 'all':
             which_data = slice(None)

@@ -93,44 +89,90 @@ class GPBase(Model):
             fig = pb.figure(num=fignum)
             ax = fig.add_subplot(111)

-        if self.X.shape[1] == 1:
-            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
-            if samples == 0:
-                m, v = self._raw_predict(Xnew, which_parts=which_parts)
-                gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax)
-                ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
-            else:
-                m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
-                Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
-                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None, ], axes=ax)
-                for i in range(samples):
-                    ax.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
-                ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
-            ax.set_xlim(xmin, xmax)
-            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            ax.set_ylim(ymin, ymax)
-
-        elif self.X.shape[1] == 2:
-            resolution = resolution or 50
-            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
-            m, v = self._raw_predict(Xnew, which_parts=which_parts)
-            m = m.reshape(resolution, resolution).T
-            ax.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
-            ax.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max()) # @UndefinedVariable
-            ax.set_xlim(xmin[0], xmax[0])
-            ax.set_ylim(xmin[1], xmax[1])
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+        if not hasattr(self, 'multioutput'):
+
+            if self.X.shape[1] == 1:
+                Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
+                if samples == 0:
+                    m, v = self._raw_predict(Xnew, which_parts=which_parts)
+                    gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax)
+                    ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
+                else:
+                    m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
+                    v = v.reshape(m.size, -1) if len(v.shape) == 3 else v
+                    Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
+                    gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None, ], axes=ax)
+                    for i in range(samples):
+                        ax.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
+                    ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
+                ax.set_xlim(xmin, xmax)
+                ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
+                ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+                ax.set_ylim(ymin, ymax)
+
+                if hasattr(self, 'Z'):
+                    Zu = self.Z * self._Xscale + self._Xoffset
+                    ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
+
+            elif self.X.shape[1] == 2:
+                resolution = resolution or 50
+                Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
+                m, v = self._raw_predict(Xnew, which_parts=which_parts)
+                m = m.reshape(resolution, resolution).T
+                ax.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
+                ax.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max()) # @UndefinedVariable
+                ax.set_xlim(xmin[0], xmax[0])
+                ax.set_ylim(xmin[1], xmax[1])
+
+            else:
+                raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+
+        else:
+            assert self.num_outputs > output, 'The model has only %s outputs.' % self.num_outputs
+
+            if self.X.shape[1] == 2:
+                Xu = self.X[self.X[:, -1] == output, 0:1]
+                Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
+
+                if samples == 0:
+                    m, v = self._raw_predict_single_output(Xnew, output=output, which_parts=which_parts)
+                    gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax)
+                    ax.plot(Xu[which_data], self.likelihood.Y[self.likelihood.index == output][:, None], 'kx', mew=1.5)
+                else:
+                    m, v = self._raw_predict_single_output(Xnew, output=output, which_parts=which_parts, full_cov=True)
+                    v = v.reshape(m.size, -1) if len(v.shape) == 3 else v
+                    Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
+                    gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None, ], axes=ax)
+                    for i in range(samples):
+                        ax.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
+                ax.set_xlim(xmin, xmax)
+                ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
+                ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+                ax.set_ylim(ymin, ymax)
+
+            elif self.X.shape[1] == 3:
+                raise NotImplementedError, "Plots not implemented for multioutput models with 2D inputs...yet"
+
+            else:
+                raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+
+            if hasattr(self, 'Z'):
+                Zu = self.Z[self.Z[:, -1] == output, :]   # keep the output of interest
+                Zu = Zu * self._Xscale + self._Xoffset
+                Zu = Zu[:, 0:1]                           # drop the index column
+                ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)

-    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
+    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, output=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'], fillcol=Tango.colorsHex['lightBlue']):
         """
         Plot the GP with noise where the likelihood is Gaussian.

         Plot the posterior of the GP.
         - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
-        - In two dimsensions, a contour-plot shows the mean predicted function
-        - In higher dimensions, we've no implemented this yet !TODO!
+        - In two dimensions, a contour-plot shows the mean predicted function
+        - Not implemented in higher dimensions

         Can plot only part of the data and part of the posterior functions
         using which_data and which_functions
@@ -151,15 +193,13 @@ class GPBase(Model):
         :type fignum: figure number
         :param ax: axes to plot on.
         :type ax: axes handle
+        :param output: which output to plot (for multiple output models only)
+        :type output: integer (first output is 0)
         :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
         :type fixed_inputs: a list of tuples
         :param linecol: color of line to plot.
         :type linecol:
         :param fillcol: color of fill
-        :type fillcol:
-        :param levels: for 2D plotting, the number of contour levels to use
-        is ax is None, create a new figure
-
+        :type fillcol:
+        :param levels: for 2D plotting, the number of contour levels to use; if ax is None, create a new figure
         """
         # TODO include samples
         if which_data == 'all':
@@ -169,41 +209,81 @@ class GPBase(Model):
             fig = pb.figure(num=fignum)
             ax = fig.add_subplot(111)

-        plotdims = self.input_dim - len(fixed_inputs)
-
-        if plotdims == 1:
-
-            Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-
-            fixed_dims = np.array([i for i,v in fixed_inputs])
-            freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
-
-            Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
-            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
-            Xgrid[:,freedim] = Xnew
-            for i,v in fixed_inputs:
-                Xgrid[:,i] = v
-
-            m, _, lower, upper = self.predict(Xgrid, which_parts=which_parts)
-            for d in range(m.shape[1]):
-                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
-                ax.plot(Xu[which_data,freedim], self.likelihood.data[which_data, d], 'kx', mew=1.5)
-            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            ax.set_xlim(xmin, xmax)
-            ax.set_ylim(ymin, ymax)
-
-        elif self.X.shape[1] == 2: # FIXME
-            resolution = resolution or 50
-            Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
-            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
-            m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
-            m = m.reshape(resolution, resolution).T
-            ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
-            Yf = self.likelihood.data.flatten()
-            ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable
-            ax.set_xlim(xmin[0], xmax[0])
-            ax.set_ylim(xmin[1], xmax[1])
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+        if not hasattr(self, 'multioutput'):
+
+            plotdims = self.input_dim - len(fixed_inputs)
+            if plotdims == 1:
+                resolution = resolution or 200
+
+                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
+
+                fixed_dims = np.array([i for i,v in fixed_inputs])
+                freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
+
+                Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
+                Xgrid = np.empty((Xnew.shape[0],self.input_dim))
+                Xgrid[:,freedim] = Xnew
+                for i,v in fixed_inputs:
+                    Xgrid[:,i] = v
+
+                m, _, lower, upper = self.predict(Xgrid, which_parts=which_parts)
+                for d in range(m.shape[1]):
+                    gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
+                    ax.plot(Xu[which_data,freedim], self.likelihood.data[which_data, d], 'kx', mew=1.5)
+                ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
+                ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+                ax.set_xlim(xmin, xmax)
+                ax.set_ylim(ymin, ymax)
+
+            elif self.X.shape[1] == 2:
+                resolution = resolution or 50
+                Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
+                x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
+                m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
+                m = m.reshape(resolution, resolution).T
+                ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
+                Yf = self.likelihood.Y.flatten()
+                ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable
+                ax.set_xlim(xmin[0], xmax[0])
+                ax.set_ylim(xmin[1], xmax[1])
+
+            else:
+                raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+
+        else:
+            assert self.num_outputs > output, 'The model has only %s outputs.' % self.num_outputs
+            if self.X.shape[1] == 2:
+                resolution = resolution or 200
+                Xu = self.X[self.X[:, -1] == output, :]   # keep the output of interest
+                Xu = Xu * self._Xscale + self._Xoffset
+                Xu = Xu[:, 0:1]                           # get rid of the index column
+
+                Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
+                m, _, lower, upper = self.predict_single_output(Xnew, which_parts=which_parts, output=output)
+
+                for d in range(m.shape[1]):
+                    gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax)
+                    ax.plot(Xu[which_data], self.likelihood.noise_model_list[output].data, 'kx', mew=1.5)
+                ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
+                ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+                ax.set_xlim(xmin, xmax)
+                ax.set_ylim(ymin, ymax)
+
+            elif self.X.shape[1] == 3:
+                raise NotImplementedError, "Plots not yet implemented for multioutput models with 2D inputs"
+
+            else:
+                raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
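The per-output plotting added above is exercised by the examples later in this patch; the intended call pattern is simply to pass the output index, for instance:

    fig, axes = pb.subplots(2, 1)
    m.plot(output=0, ax=axes[0])   # first output of a multioutput model m
    m.plot(output=1, ax=axes[1])   # second output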
""" self.preferred_optimizer = state.pop() self.sampling_runs = state.pop() @@ -397,17 +397,20 @@ class Model(Parameterized): return np.nan return 0.5 * self._get_params().size * np.log(2 * np.pi) + self.log_likelihood() - hld - def __str__(self, names=None): - if names is None: - names = self._get_print_names() - s = Parameterized.__str__(self, names=names).split('\n') + def __str__(self): + s = Parameterized.__str__(self).split('\n') + #def __str__(self, names=None): + # if names is None: + # names = self._get_print_names() + #s = Parameterized.__str__(self, names=names).split('\n') # add priors to the string if self.priors is not None: strs = [str(p) if p is not None else '' for p in self.priors] else: - strs = [''] * len(self._get_param_names()) - name_indices = self.grep_param_names("|".join(names)) - strs = np.array(strs)[name_indices] + strs = [''] * len(self._get_params()) + # strs = [''] * len(self._get_param_names()) + # name_indices = self.grep_param_names("|".join(names)) + # strs = np.array(strs)[name_indices] width = np.array(max([len(p) for p in strs] + [5])) + 4 log_like = self.log_likelihood() @@ -456,9 +459,9 @@ class Model(Parameterized): gradient = self.objective_function_gradients(x) numerical_gradient = (f1 - f2) / (2 * dx) - global_ratio = (f1 - f2) / (2 * np.dot(dx, gradient)) - - return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() - 1) < tolerance + global_ratio = (f1 - f2) / (2 * np.dot(dx, np.where(gradient==0, 1e-32, gradient))) + + return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance) else: # check the gradient of each parameter individually, and do some pretty printing try: @@ -496,7 +499,7 @@ class Model(Parameterized): gradient = self.objective_function_gradients(x)[i] numerical_gradient = (f1 - f2) / (2 * step) - ratio = (f1 - f2) / (2 * step * gradient) + ratio = (f1 - f2) / (2 * step * np.where(gradient==0, 1e-312, gradient)) difference = np.abs((f1 - f2) / 2 / step - gradient) if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance: @@ -549,7 +552,7 @@ class Model(Parameterized): :type optimzer: string TODO: valid strings? """ - assert isinstance(self.likelihood, likelihoods.EP), "pseudo_EM is only available for EP likelihoods" + assert isinstance(self.likelihood, likelihoods.EP) or isinstance(self.likelihood, likelihoods.EP_Mixed_Noise), "pseudo_EM is only available for EP likelihoods" ll_change = epsilon + 1. iteration = 0 last_ll = -np.inf diff --git a/GPy/core/parameterized.py b/GPy/core/parameterized.py index fe6eba62..45aa70d5 100644 --- a/GPy/core/parameterized.py +++ b/GPy/core/parameterized.py @@ -27,9 +27,9 @@ class Parameterized(object): def _get_param_names(self): raise NotImplementedError, "this needs to be implemented to use the Parameterized class" - def _get_print_names(self): - """ Override for which names to print out, when using print m """ - return self._get_param_names() + #def _get_print_names(self): + # """ Override for which names to print out, when using print m """ + # return self._get_param_names() def pickle(self, filename, protocol=None): if protocol is None: @@ -63,10 +63,10 @@ class Parameterized(object): """ Get the current state of the class, here just all the indices, rest can get recomputed - For inheriting from Parameterized: - Allways append the state of the inherited object - and call down to the inherited object in setstate!! 
diff --git a/GPy/core/parameterized.py b/GPy/core/parameterized.py
index fe6eba62..45aa70d5 100644
--- a/GPy/core/parameterized.py
+++ b/GPy/core/parameterized.py
@@ -27,9 +27,9 @@ class Parameterized(object):
     def _get_param_names(self):
         raise NotImplementedError, "this needs to be implemented to use the Parameterized class"

-    def _get_print_names(self):
-        """ Override for which names to print out, when using print m """
-        return self._get_param_names()

     def pickle(self, filename, protocol=None):
         if protocol is None:
@@ -63,10 +63,10 @@ class Parameterized(object):
         """
         Get the current state of the class,
         here just all the indices, rest can get recomputed

-        For inheriting from Parameterized:
-        Allways append the state of the inherited object
-        and call down to the inherited object in setstate!!
+
+        Always append the state of the inherited object
+        and call down to the inherited object in setstate!!
         """
         return [self.tied_indices,
                 self.fixed_indices,
@@ -336,26 +336,30 @@ class Parameterized(object):
             n = [nn for i, nn in enumerate(n) if not i in remove]
         return n

-    @property
-    def all(self):
-        return self.__str__(self._get_param_names())
-
-    def __str__(self, names=None, nw=30):
+    def __str__(self, nw=30):
         """
         Return a string describing the parameter names and their ties and constraints
         """
-        if names is None:
-            names = self._get_print_names()
-        name_indices = self.grep_param_names("|".join(names))
+        names = self._get_param_names()
         N = len(names)

         if not N:
             return "This object has no free parameters."
         header = ['Name', 'Value', 'Constraints', 'Ties']
-        values = self._get_params()[name_indices] # map(str,self._get_params())
+        values = self._get_params() # map(str,self._get_params())
         # sort out the constraints
-        constraints = [''] * len(self._get_param_names())
+        constraints = [''] * len(names)
         for i, t in zip(self.constrained_indices, self.constraints):
             for ii in i:
                 constraints[ii] = t.__str__()
@@ -368,7 +372,10 @@ class Parameterized(object):
             for j in tie:
                 ties[j] = '(' + str(i) + ')'

-        values = ['%.4f' % float(v) for v in values]
+        if values.size == 1:
+            values = ['%.4f' % float(values)]
+        else:
+            values = ['%.4f' % float(v) for v in values]
         max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
         max_values = max([len(values[i]) for i in range(len(values))] + [len(header[1])])
         max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])])
@@ -383,3 +390,77 @@ class Parameterized(object):

         return ('\n'.join([header_string[0], separator] + param_string)) + '\n'
+
+    def grep_model(self, regexp):
+        regexp_indices = self.grep_param_names(regexp)
+        all_names = self._get_param_names()
+
+        names = [all_names[pj] for pj in regexp_indices]
+        N = len(names)
+
+        if not N:
+            return "Match not found."
+
+        header = ['Name', 'Value', 'Constraints', 'Ties']
+        all_values = self._get_params()
+        values = np.array([all_values[pj] for pj in regexp_indices])
+        constraints = [''] * len(names)
+
+        _constrained_indices, aux = self._pick_elements(regexp_indices, self.constrained_indices)
+        _constraints = [self.constraints[pj] for pj in aux]
+
+        for i, t in zip(_constrained_indices, _constraints):
+            for ii in i:
+                iii = regexp_indices.tolist().index(ii)
+                constraints[iii] = t.__str__()
+
+        _fixed_indices, aux = self._pick_elements(regexp_indices, self.fixed_indices)
+        for i in _fixed_indices:
+            for ii in i:
+                iii = regexp_indices.tolist().index(ii)
+                constraints[iii] = 'Fixed'
+
+        _tied_indices, aux = self._pick_elements(regexp_indices, self.tied_indices)
+        ties = [''] * len(names)
+        for i, ti in zip(_tied_indices, aux):
+            for ii in i:
+                iii = regexp_indices.tolist().index(ii)
+                ties[iii] = '(' + str(ti) + ')'
+
+        if values.size == 1:
+            values = ['%.4f' % float(values)]
+        else:
+            values = ['%.4f' % float(v) for v in values]
+
+        max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
+        max_values = max([len(values[i]) for i in range(len(values))] + [len(header[1])])
+        max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])])
+        max_ties = max([len(ties[i]) for i in range(len(ties))] + [len(header[3])])
+        cols = np.array([max_names, max_values, max_constraint, max_ties]) + 4
+
+        header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
+        header_string = map(lambda x: '|'.join(x), [header_string])
+        separator = '-' * len(header_string[0])
+        param_string = ["{n:^{c0}}|{v:^{c1}}|{c:^{c2}}|{t:^{c3}}".format(n=names[i], v=values[i], c=constraints[i], t=ties[i], c0=cols[0], c1=cols[1], c2=cols[2], c3=cols[3]) for i in range(len(values))]
+
+        print header_string[0]
+        print separator
+        for string in param_string:
+            print string
+
+    def _pick_elements(self, regexp_ind, array_list):
+        """Removes from array_list the elements different from regexp_ind"""
+        new_array_list = [] # new list with elements matching regexp_ind
+        array_indices = [] # indices that match the arrays in new_array_list and array_list
+
+        array_index = 0
+        for array in array_list:
+            _new = []
+            for ai in array:
+                if ai in regexp_ind:
+                    _new.append(ai)
+            if len(_new):
+                new_array_list.append(np.array(_new))
+                array_indices.append(array_index)
+            array_index += 1
+        return new_array_list, array_indices
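A short interactive sketch of the new grep_model helper (model and parameter names are hypothetical; it prints a Name/Value/Constraints/Ties table for the matching subset):

    m = GPy.models.GPRegression(X, Y)   # any Parameterized model
    m.grep_model('rbf')                 # show only parameters whose names match 'rbf'
    m.grep_model('.*variance')          # regular expressions, as in grep_param_names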
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 1b5fd814..32ceea62 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -5,7 +5,7 @@ import numpy as np
 import pylab as pb
 from ..util.linalg import mdot, jitchol, tdot, symmetrify, backsub_both_sides, chol_inv, dtrtrs, dpotrs, dpotri
 from scipy import linalg
-from ..likelihoods import Gaussian
+from ..likelihoods import Gaussian, EP, EP_Mixed_Noise
 from gp_base import GPBase

 class SparseGP(GPBase):
@@ -109,7 +109,6 @@ class SparseGP(GPBase):
             tmp, _ = dtrtrs(self._Lm, np.asfortranarray(tmp.T), lower=1)
             self._A = tdot(tmp)

-        # factor B
         self.B = np.eye(self.num_inducing) + self._A
         self.LB = jitchol(self.B)
@@ -139,6 +138,7 @@ class SparseGP(GPBase):
             dL_dpsi2_beta = 0.5 * backsub_both_sides(self._Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi)

         if self.likelihood.is_heteroscedastic:
+
             if self.has_uncertain_inputs:
                 self.dL_dpsi2 = self.likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :]
             else:
@@ -160,9 +160,27 @@ class SparseGP(GPBase):
                 # save computation here.
                 self.partial_for_likelihood = None
             elif self.likelihood.is_heteroscedastic:
-                raise NotImplementedError, "heteroscedatic derivates not implemented"
+
+                if self.has_uncertain_inputs:
+                    raise NotImplementedError, "heteroscedastic derivatives with uncertain inputs not implemented"
+
+                else:
+
+                    LBi = chol_inv(self.LB)
+                    Lmi_psi1, nil = dtrtrs(self._Lm, np.asfortranarray(self.psi1.T), lower=1, trans=0)
+                    _LBi_Lmi_psi1, _ = dtrtrs(self.LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0)
+
+                    self.partial_for_likelihood = -0.5 * self.likelihood.precision + 0.5 * self.likelihood.V**2
+                    self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0 - np.sum(Lmi_psi1**2,0))[:,None] * self.likelihood.precision**2
+
+                    self.partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*self.likelihood.precision**2
+
+                    self.partial_for_likelihood += -np.dot(self._LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * self.likelihood.Y * self.likelihood.precision**2
+                    self.partial_for_likelihood += 0.5*np.dot(self._LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * self.likelihood.precision**2
+
             else:
-                # likelihood is not heterscedatic
+                # likelihood is not heteroscedastic
                 self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2
                 self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self._A) * self.likelihood.precision)
                 self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self._A * self.DBi_plus_BiPBi) - self.data_fit)
@@ -194,8 +212,8 @@ class SparseGP(GPBase):
         return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])], [])\
             + self.kern._get_param_names_transformed() + self.likelihood._get_param_names()

-    def _get_print_names(self):
-        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()

     def update_likelihood_approximation(self):
         """
@@ -240,7 +258,7 @@ class SparseGP(GPBase):
         """
         The derivative of the bound wrt the inducing inputs Z
         """
-        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm, self.Z) # factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
+        dL_dZ = self.kern.dK_dX(self.dL_dKmm, self.Z)
         if self.has_uncertain_inputs:
             dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance)
             dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance)
@@ -298,7 +316,7 @@ class SparseGP(GPBase):
         :type X_variance_new: np.ndarray, Nnew x self.input_dim
         :param which_parts: specifies which outputs kernel(s) to use in prediction
         :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the folll covariance matrix, or just the diagonal
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
         :type full_cov: bool
         :rtype: posterior mean, a Numpy array, Nnew x self.input_dim
         :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
@@ -322,26 +340,133 @@ class SparseGP(GPBase):

         return mean, var, _025pm, _975pm

-    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
-
+    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None, output=None):
         if ax is None:
             fig = pb.figure(num=fignum)
             ax = fig.add_subplot(111)

         if which_data is 'all':
             which_data = slice(None)

-        GPBase.plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, ax=ax)
+        GPBase.plot(self, samples=0, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=None, levels=20, ax=ax, output=output)

-        # add the inducing inputs and some errorbars
-        if self.X.shape[1] == 1:
-            if self.has_uncertain_inputs:
-                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
-                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
-                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
-            Zu = self.Z * self._Xscale + self._Xoffset
-            ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
+        if not hasattr(self, 'multioutput'):

-        elif self.X.shape[1] == 2:
-            Zu = self.Z * self._Xscale + self._Xoffset
-            ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
+            if self.X.shape[1] == 1:
+                if self.has_uncertain_inputs:
+                    Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
+                    ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
+                                xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
+                                ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
+                Zu = self.Z * self._Xscale + self._Xoffset
+                ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
+
+            elif self.X.shape[1] == 2:
+                Zu = self.Z * self._Xscale + self._Xoffset
+                ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
+
+        else:
+            pass
+            """
+            if self.X.shape[1] == 2 and hasattr(self,'multioutput'):
+                Xu = self.X[self.X[:,-1]==output,:]
+                if self.has_uncertain_inputs:
+                    Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
+
+                    Xu = self.X[self.X[:,-1]==output ,0:1] #??
+
+                    ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
+                                xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
+                                ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
+
+                Zu = self.Z[self.Z[:,-1]==output,:]
+                Zu = self.Z * self._Xscale + self._Xoffset
+                Zu = self.Z[self.Z[:,-1]==output ,0:1] #??
+                ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
+                #ax.set_ylim(ax.get_ylim()[0],)
+
+            else:
+                raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+            """
+
+    def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
+        """
+        For a specific output, predict the function at the new point(s) Xnew.
+
+        Arguments
+        ---------
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.input_dim
+        :param output: output to predict
+        :type output: integer in {0,..., num_outputs-1}
+        :param which_parts: specifies which outputs kernel(s) to use in prediction
+        :type which_parts: ('all', list of bools)
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
+        :type full_cov: bool
+        :rtype: posterior mean, a Numpy array, Nnew x self.input_dim
+        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
+        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
+
+        .. Note:: For multiple output models only
+        """
+
+        assert hasattr(self, 'multioutput')
+        index = np.ones_like(Xnew) * output
+        Xnew = np.hstack((Xnew, index))
+
+        # normalize X values
+        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
+        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
+
+        # now push through likelihood
+        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, noise_model=output)
+        return mean, var, _025pm, _975pm
+
+    def _raw_predict_single_output(self, _Xnew, output=0, X_variance_new=None, which_parts='all', full_cov=False, stop=False):
+        """
+        Internal helper function for making predictions for a specific output,
+        does not account for normalization or likelihood
+        ---------
+
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.input_dim
+        :param output: output to predict
+        :type output: integer in {0,..., num_outputs-1}
+        :param which_parts: specifies which outputs kernel(s) to use in prediction
+        :type which_parts: ('all', list of bools)
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
+
+        .. Note:: For multiple output models only
+        """
+        Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work!
+        symmetrify(Bi)
+        Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi)
+
+        if self.Cpsi1V is None:
+            psi1V = np.dot(self.psi1.T, self.likelihood.V)
+            tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
+            tmp, _ = dpotrs(self.LB, tmp, lower=1)
+            self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1)
+
+        assert hasattr(self, 'multioutput')
+        index = np.ones_like(_Xnew) * output
+        _Xnew = np.hstack((_Xnew, index))
+
+        if X_variance_new is None:
+            Kx = self.kern.K(self.Z, _Xnew, which_parts=which_parts)
+            mu = np.dot(Kx.T, self.Cpsi1V)
+            if full_cov:
+                Kxx = self.kern.K(_Xnew, which_parts=which_parts)
+                var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting
+            else:
+                Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
+                var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0)
+        else:
+            Kx = self.kern.psi1(self.Z, _Xnew, X_variance_new)
+            mu = np.dot(Kx, self.Cpsi1V)
+            if full_cov:
+                raise NotImplementedError, "TODO"
+            else:
+                Kxx = self.kern.psi0(self.Z, _Xnew, X_variance_new)
+                psi2 = self.kern.psi2(self.Z, _Xnew, X_variance_new)
+                var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
+
+        return mu, var[:, None]
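Both single-output helpers rely on the same trick: the requested output index is appended to the raw inputs as an extra column, which the coregionalize kernel part later consumes. A standalone NumPy illustration with made-up values:

    import numpy as np

    Xnew = np.linspace(0, 5, 4)[:, None]    # Nnew x 1 raw inputs
    output = 1
    index = np.ones_like(Xnew) * output     # constant index column
    Xaug = np.hstack((Xnew, index))         # Nnew x 2; the last column selects the output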
diff --git a/GPy/core/transformations.py b/GPy/core/transformations.py
index eeba3e51..d1964440 100644
--- a/GPy/core/transformations.py
+++ b/GPy/core/transformations.py
@@ -18,9 +18,11 @@ class Transformation(object):
     def gradfactor(self, f):
         """ df_dx evaluated at self.f(x)=f"""
         raise NotImplementedError
+
     def initialize(self, f):
         """ produce a sensible initial value for f(x)"""
         raise NotImplementedError
+
     def __str__(self):
         raise NotImplementedError

@@ -47,8 +49,6 @@ class Negative_logexp(Transformation):
         return Logexp.finv(-f) # np.log(np.exp(-f) - 1.)
     def gradfactor(self, f):
         return -Logexp.gradfactor(-f)
-        #ef = np.exp(-f)
-        #return -(ef - 1.) / ef
     def initialize(self, f):
         return -Logexp.initialize(f) # np.abs(f)
     def __str__(self):
diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index 77d1982c..88582351 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -166,3 +166,35 @@ def FITC_crescent_data(num_inducing=10, seed=default_seed):
     print(m)
     m.plot()
     return m
+
+
+def toy_heaviside(seed=default_seed):
+    """
+    Simple 1D classification example using a Heaviside GP transformation.
+
+    :param seed: seed value for data generation (default is 4).
+    :type seed: int
+    """
+
+    data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
+    Y = data['Y'][:, 0:1]
+    Y[Y.flatten() == -1] = 0
+
+    # Model definition
+    noise_model = GPy.likelihoods.binomial(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
+    likelihood = GPy.likelihoods.EP(Y, noise_model)
+    m = GPy.models.GPClassification(data['X'], likelihood=likelihood)
+
+    # Optimize
+    m.update_likelihood_approximation()
+    # Parameters optimization:
+    m.optimize()
+    #m.pseudo_EM()
+
+    # Plot
+    fig, axes = pb.subplots(2, 1)
+    m.plot_f(ax=axes[0])
+    m.plot(ax=axes[1])
+    print(m)
+
+    return m
diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index f969bbe4..3e46b566 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -9,9 +9,9 @@ import pylab as pb
 import numpy as np
 import GPy

-def coregionalisation_toy2(max_iters=100):
+def coregionalization_toy2(max_iters=100):
     """
-    A simple demonstration of coregionalisation on two sinusoidal functions.
+    A simple demonstration of coregionalization on two sinusoidal functions.
     """
     X1 = np.random.rand(50, 1) * 8
     X2 = np.random.rand(30, 1) * 5
@@ -22,8 +22,8 @@
     Y = np.vstack((Y1, Y2))

     k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
-    k2 = GPy.kern.coregionalise(2, 1)
-    k = k1**k2
+    k2 = GPy.kern.coregionalize(2, 1)
+    k = k1**k2 #k = k1.prod(k2,tensor=True)
     m = GPy.models.GPRegression(X, Y, kernel=k)
     m.constrain_fixed('.*rbf_var', 1.) # m.constrain_positive('.*kappa')
     m.optimize(max_iters=max_iters)
     pb.figure()
     Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
     Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
     mean, var, low, up = m.predict(Xtest1)
     GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
     mean, var, low, up = m.predict(Xtest2)
     GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
     pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
     pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
     return m

-def coregionalisation_toy(max_iters=100):
+def coregionalization_toy(max_iters=100):
     """
-    A simple demonstration of coregionalisation on two sinusoidal functions.
+    A simple demonstration of coregionalization on two sinusoidal functions.
     """
     X1 = np.random.rand(50, 1) * 8
     X2 = np.random.rand(30, 1) * 5
-    index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
-    X = np.hstack((np.vstack((X1, X2)), index))
+    X = np.vstack((X1, X2))
     Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
     Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
     Y = np.vstack((Y1, Y2))

     k1 = GPy.kern.rbf(1)
-    k2 = GPy.kern.coregionalise(2, 2)
-    k = k1**k2 #k1.prod(k2, tensor=True)
-    m = GPy.models.GPRegression(X, Y, kernel=k)
+    m = GPy.models.GPMultioutputRegression(X_list=[X1,X2], Y_list=[Y1,Y2], kernel_list=[k1])
     m.constrain_fixed('.*rbf_var', 1.)
-    # m.constrain_positive('kappa')
     m.optimize(max_iters=max_iters)

-    pb.figure()
-    Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
-    Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
-    mean, var, low, up = m.predict(Xtest1)
-    GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
-    mean, var, low, up = m.predict(Xtest2)
-    GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
-    pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
-    pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
+    fig, axes = pb.subplots(2, 1)
+    m.plot(output=0, ax=axes[0])
+    m.plot(output=1, ax=axes[1])
+    axes[0].set_title('Output 0')
+    axes[1].set_title('Output 1')
     return m
-

-def coregionalisation_sparse(max_iters=100):
+def coregionalization_sparse(max_iters=100):
     """
-    A simple demonstration of coregionalisation on two sinusoidal functions using sparse approximations.
+    A simple demonstration of coregionalization on two sinusoidal functions using sparse approximations.
     """
     X1 = np.random.rand(500, 1) * 8
     X2 = np.random.rand(300, 1) * 5
@@ -84,33 +75,18 @@
     Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
     Y = np.vstack((Y1, Y2))

-    num_inducing = 40
-    Z = np.hstack((np.random.rand(num_inducing, 1) * 8, np.random.randint(0, 2, num_inducing)[:, None]))
     k1 = GPy.kern.rbf(1)
-    k2 = GPy.kern.coregionalise(2, 2)
-    k = k1**k2 #.prod(k2, tensor=True) # + GPy.kern.white(2,0.001)
-    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
-    m.constrain_fixed('.*rbf_var', 1.)
-    m.constrain_fixed('iip')
-    m.constrain_bounded('noise_variance', 1e-3, 1e-1)
-#    m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')
-    m.optimize(max_iters=max_iters)
+    m = GPy.models.SparseGPMultioutputRegression(X_list=[X1,X2], Y_list=[Y1,Y2], kernel_list=[k1], num_inducing=20)
+    m.constrain_fixed('.*rbf_var', 1.)
+    m.optimize(messages=1)
+    #m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')

-    # plotting:
-    pb.figure()
-    Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
-    Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
-    mean, var, low, up = m.predict(Xtest1)
-    GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
-    mean, var, low, up = m.predict(Xtest2)
-    GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
-    pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
-    pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
-    y = pb.ylim()[0]
-    pb.plot(Z[:, 0][Z[:, 1] == 0], np.zeros(np.sum(Z[:, 1] == 0)) + y, 'r|', mew=2)
-    pb.plot(Z[:, 0][Z[:, 1] == 1], np.zeros(np.sum(Z[:, 1] == 1)) + y, 'g|', mew=2)
+    fig, axes = pb.subplots(2, 1)
+    m.plot(output=0, ax=axes[0])
+    m.plot(output=1, ax=axes[1])
+    axes[0].set_title('Output 0')
+    axes[1].set_title('Output 1')
     return m

 def epomeo_gpx(max_iters=100):
@@ -136,8 +112,8 @@
                    np.random.randint(0, 4, num_inducing)[:, None]))

     k1 = GPy.kern.rbf(1)
-    k2 = GPy.kern.coregionalise(output_dim=5, rank=5)
-    k = k1**k2
+    k2 = GPy.kern.coregionalize(num_outputs=5, W_columns=5)
+    k = k1**k2
     m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z, normalize_Y=True)
     m.constrain_fixed('.*rbf_var', 1.)
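For reference, the rename maps the old constructor arguments onto the new names; a minimal sketch assuming the signature introduced in GPy/kern/constructors.py below:

    # old: GPy.kern.coregionalise(output_dim=5, rank=5)
    k2 = GPy.kern.coregionalize(num_outputs=5, W_columns=5)
    k = GPy.kern.rbf(1)**k2   # tensor product, as in the examples above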
@@ -401,8 +377,6 @@ def silhouette(max_iters=100):
     print(m)
     return m

-
-
 def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100):
     """Run a 1D example of a sparse GP regression."""
     # sample inputs and outputs
diff --git a/GPy/inference/optimization.py b/GPy/inference/optimization.py
index 0ef487af..589ec4c7 100644
--- a/GPy/inference/optimization.py
+++ b/GPy/inference/optimization.py
@@ -130,7 +130,7 @@ class opt_lbfgsb(Optimizer):
             opt_dict['pgtol'] = self.gtol

         opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
-                                            maxfun=self.max_f_eval, **opt_dict)
+                                            maxfun=self.max_iters, **opt_dict)
         self.x_opt = opt_result[0]
         self.f_opt = f_fp(self.x_opt)[0]
         self.funct_eval = opt_result[2]['funcalls']
diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index f7c0fd67..046b0205 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -5,7 +5,6 @@ import numpy as np
 from kern import kern

 import parts
-
+import warnings
 def rbf_inv(input_dim, variance=1., inv_lengthscale=None, ARD=False):
     """
     Construct an RBF kernel
@@ -74,9 +73,9 @@ def gibbs(input_dim, variance=1., mapping=None):
     Gibbs and MacKay non-stationary covariance function.

     .. math::
-
+
       r = sqrt((x_i - x_j)'*(x_i - x_j))
-
+
       k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))

       Z = \sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')}
@@ -90,7 +89,7 @@
     The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x) which is a function that can be specified by the user, by default an multi-layer peceptron is used is used.

     :param input_dim: the number of input dimensions
-    :type input_dim: int
+    :type input_dim: int
     :param variance: the variance :math:`\sigma^2`
     :type variance: float
     :param mapping: the mapping that gives the lengthscale across the input space.
@@ -103,6 +102,12 @@
     part = parts.gibbs.Gibbs(input_dim, variance, mapping)
     return kern(input_dim, [part])

+def hetero(input_dim, mapping=None, transform=None):
+    """
+    """
+    part = parts.hetero.Hetero(input_dim, mapping, transform)
+    return kern(input_dim, [part])
+
 def poly(input_dim, variance=1., weight_variance=None, bias_variance=1., degree=2, ARD=False):
     """
     Construct a polynomial kernel
@@ -135,6 +140,7 @@
     part = parts.white.White(input_dim, variance)
     return kern(input_dim, [part])

+
 def exponential(input_dim, variance=1., lengthscale=None, ARD=False):
     """
     Construct an exponential kernel
@@ -340,29 +346,30 @@ def symmetric(k):
     k_.parts = [symmetric.Symmetric(p) for p in k.parts]
     return k_

-def coregionalise(output_dim, rank=1, W=None, kappa=None):
+def coregionalize(num_outputs, W_columns=1, W=None, kappa=None):
     """
-    Coregionalisation kernel.
-
-    Used for computing covariance functions of the form
-    .. math::
-       k_2(x, y)=\mathbf{B} k(x, y)
-    where
+    Coregionalization matrix B, of the form:
+
     .. math::
+
       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + \text{diag}(\kappa)

-    :param output_dim: the number of output dimensions
-    :type output_dim: int
-    :param rank: the rank of the coregionalisation matrix.
-    :type rank: int
-    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B.
-    :type W: ndarray
-    :param kappa: a diagonal term which allows the outputs to behave independently.
+    An intrinsic/linear coregionalization kernel of the form
+
+    .. math::
+
+       k_2(x, y)=\mathbf{B} k(x, y)
+
+    It is obtained as the tensor product between a kernel k(x, y) and B.
+
+    :param num_outputs: the number of outputs to coregionalize
+    :type num_outputs: int
+    :param W_columns: number of columns of the W matrix (this parameter is ignored if parameter W is not None)
+    :type W_columns: int
+    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B
+    :type W: numpy array of dimensionality (num_outputs, W_columns)
+    :param kappa: a vector which allows the outputs to behave independently
+    :type kappa: numpy array of dimensionality (num_outputs,)
     :rtype: kernel object

-    .. Note: see coregionalisation examples in GPy.examples.regression for some usage.
     """
-    p = parts.coregionalise.Coregionalise(output_dim,rank,W,kappa)
+    p = parts.coregionalize.Coregionalize(num_outputs, W_columns, W, kappa)
     return kern(1,[p])
@@ -421,3 +428,31 @@ def hierarchical(k):
     # assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
     _parts = [parts.hierarchical.Hierarchical(k.parts)]
     return kern(k.input_dim+len(k.parts),_parts)
+
+def build_lcm(input_dim, num_outputs, kernel_list=[], W_columns=1, W=None, kappa=None):
+    """
+    Builds a kernel for a linear coregionalization model.
+
+    :param input_dim: input dimensionality
+    :param num_outputs: number of outputs
+    :param kernel_list: list of coregionalized kernels; each element in the list will be multiplied by a different coregionalization matrix
+    :type kernel_list: list of GPy kernels
+    :param W_columns: number of columns of each coregionalization matrix's W factor
+    :type W_columns: integer
+
+    .. Note:: each kernel's input dimensionality is overwritten to match input_dim.
+    """
+
+    for k in kernel_list:
+        if k.input_dim != input_dim:
+            k.input_dim = input_dim
+            warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
+
+    k_coreg = coregionalize(num_outputs, W_columns, W, kappa)
+    kernel = kernel_list[0]**k_coreg.copy()
+
+    for k in kernel_list[1:]:
+        k_coreg = coregionalize(num_outputs, W_columns, W, kappa)
+        kernel += k**k_coreg.copy()
+
+    return kernel
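To make the coregionalization matrix concrete, a small NumPy-only sketch of B = WW^T + diag(kappa) with made-up numbers:

    import numpy as np

    W = np.array([[1.0], [0.5]])     # num_outputs=2, W_columns=1
    kappa = np.array([0.1, 0.2])
    B = np.dot(W, W.T) + np.diag(kappa)
    # B[i, j] scales the base kernel k(x, y) for the pair of outputs (i, j)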
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 2dc943bf..4a822758 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -13,7 +13,9 @@ import GPy
 class kern(Parameterized):
     def __init__(self, input_dim, parts=[], input_slices=None):
         """
-        This is the main kernel class for GPy. It handles multiple (additive) kernel functions, and keeps track of variaous things like which parameters live where.
+        This is the main kernel class for GPy. It handles multiple
+        (additive) kernel functions, and keeps track of various things
+        like which parameters live where.

         The technical code for kernels is divided into _parts_ (see e.g. rbf.py).
         This object contains a list of parts, which are
@@ -34,6 +36,11 @@ class kern(Parameterized):

         self.input_dim = input_dim

+        part_names = [k.name for k in self.parts]
+        self.name = ''
+        for name in part_names:
+            self.name += name + '+'
+        self.name = self.name[:-1]
         # deal with input_slices
         if input_slices is None:
             self.input_slices = [slice(None) for p in self.parts]
@@ -334,10 +341,8 @@ class kern(Parameterized):
         :type X: np.ndarray (num_samples x input_dim)
         :param X2: Observed data inputs (optional, defaults to X)
         :type X2: np.ndarray (num_inducing x input_dim)"""
-        if X2 is None:
-            X2 = X
         target = np.zeros_like(X)
         if X2 is None:
             [p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
         else:
             [p.dK_dX(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
@@ -654,17 +659,85 @@ def kern_test(kern, X=None, X2=None, verbose=False):
     :param X2: X2 input values to test the covariance function.
     :type X2: ndarray
     """
+    pass_checks = True
     if X==None:
         X = np.random.randn(10, kern.input_dim)
     if X2==None:
         X2 = np.random.randn(20, kern.input_dim)

-    result = [Kern_check_model(kern, X=X).is_positive_definite(),
-              Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose),
-              Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose),
-              Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose),
-              Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose),
-              Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)]
-    # Need to check
-    #Kern_check_dK_dX(kern, X, X2=None).checkgrad(verbose=verbose)]
-    # but currently I think these aren't implemented.
-    return np.all(result)
+    if verbose:
+        print("Checking covariance function is positive definite.")
+    result = Kern_check_model(kern, X=X).is_positive_definite()
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Positive definite check failed for " + kern.name + " covariance function.")
+        pass_checks = False
+        return False
+
+    if verbose:
+        print("Checking gradients of K(X, X) wrt theta.")
+    result = Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose)
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
+        Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True)
+        pass_checks = False
+        return False
+
+    if verbose:
+        print("Checking gradients of K(X, X2) wrt theta.")
+    result = Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose)
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Gradient of K(X, X2) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
+        Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True)
+        pass_checks = False
+        return False
+
+    if verbose:
+        print("Checking gradients of Kdiag(X) wrt theta.")
+    result = Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose)
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Gradient of Kdiag(X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
+        Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True)
+        pass_checks = False
+        return False
+
+    if verbose:
+        print("Checking gradients of K(X, X) wrt X.")
+    result = Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=verbose)
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
+        Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=True)
+        pass_checks = False
+        return False
+
+    if verbose:
+        print("Checking gradients of K(X, X2) wrt X.")
+    result = Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose)
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Gradient of K(X, X2) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
+        Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=True)
+        pass_checks = False
+        return False
+
+    if verbose:
+        print("Checking gradients of Kdiag(X) wrt X.")
+    result = Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Gradient of Kdiag(X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
+        Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True)
+        pass_checks = False
+        return False
+
+    return pass_checks
diff --git a/GPy/kern/parts/Matern32.py b/GPy/kern/parts/Matern32.py
index 60f0b6e9..40da79f0 100644
--- a/GPy/kern/parts/Matern32.py
+++ b/GPy/kern/parts/Matern32.py
@@ -98,9 +98,13 @@ class Matern32(Kernpart):

     def dK_dX(self, dL_dK, X, X2, target):
         """derivative of the covariance matrix with respect to X."""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
-        ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
+        if X2 is None:
+            dist = np.sqrt(np.sum(np.square((X[:, None, :] - X[None, :, :]) / self.lengthscale), -1))[:, :, None]
+            # factor of two: when X2 is X, each point appears in both arguments of K
+            ddist_dX = 2*(X[:, None, :] - X[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
+        else:
+            dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
+            ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
         dK_dX = -np.transpose(3 * self.variance * dist * np.exp(-np.sqrt(3) * dist) * ddist_dX, (1, 0, 2))
         target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
diff --git a/GPy/kern/parts/Matern52.py b/GPy/kern/parts/Matern52.py
index e02cb9bf..4bf4a1a8 100644
--- a/GPy/kern/parts/Matern52.py
+++ b/GPy/kern/parts/Matern52.py
@@ -98,9 +98,12 @@ class Matern52(Kernpart):

     def dK_dX(self,dL_dK,X,X2,target):
         """derivative of the covariance matrix with respect to X."""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
+        if X2 is None:
+            dist = np.sqrt(np.sum(np.square((X[:,None,:]-X[None,:,:])/self.lengthscale),-1))[:,:,None]
+            ddist_dX = 2*(X[:,None,:]-X[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
+        else:
+            dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
+            ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
         dK_dX = -np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
         target += np.sum(dK_dX*dL_dK.T[:,:,None],0)
diff --git a/GPy/kern/parts/__init__.py b/GPy/kern/parts/__init__.py
index 053e280f..643483f5 100644
--- a/GPy/kern/parts/__init__.py
+++ b/GPy/kern/parts/__init__.py
@@ -1,10 +1,12 @@
 import bias
 import Brownian
-import coregionalise
+import coregionalize
 import exponential
 import finite_dimensional
 import fixed
 import gibbs
+#import hetero #hetero.py is not committed: omitting for now. JH.
+import hierarchical
 import independent_outputs
 import linear
 import Matern32
@@ -19,8 +21,7 @@
 import prod
 import rational_quadratic
 import rbfcos
 import rbf
+import rbf_inv
 import spline
 import symmetric
 import white
-import hierarchical
-import rbf_inv
diff --git a/GPy/kern/parts/coregionalise.py b/GPy/kern/parts/coregionalize.py
similarity index 56%
rename from GPy/kern/parts/coregionalise.py
rename to GPy/kern/parts/coregionalize.py
index 94179088..363d98c3 100644
--- a/GPy/kern/parts/coregionalise.py
+++ b/GPy/kern/parts/coregionalize.py
@@ -7,44 +7,48 @@ from GPy.util.linalg import mdot, pdinv
 import pdb
 from scipy import weave

-class Coregionalise(Kernpart):
+class Coregionalize(Kernpart):
     """
-    Coregionalisation kernel.
+    Covariance function for intrinsic/linear coregionalization models.

-    Used for computing covariance functions of the form
+    This covariance has the form
     .. math::
-       k_2(x, y)=B k(x, y)
-    where
+       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + \text{diag}(\kappa)
+
+    An intrinsic/linear coregionalization covariance function of the form
     .. math::
-       B = WW^\top + diag(kappa)
+       k_2(x, y)=\mathbf{B} k(x, y)

-    :param output_dim: the number of output dimensions
-    :type output_dim: int
-    :param rank: the rank of the coregionalisation matrix.
-    :type rank: int
-    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B.
-    :type W: ndarray
-    :param kappa: a diagonal term which allows the outputs to behave independently.
-    :rtype: kernel object
+    It is obtained as the tensor product between a covariance function
+    k(x, y) and B.

-    .. Note: see coregionalisation examples in GPy.examples.regression for some usage.
+    :param num_outputs: number of outputs to coregionalize
+    :type num_outputs: int
+    :param W_columns: number of columns of the W matrix (this parameter is ignored if parameter W is not None)
+    :type W_columns: int
+    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B
+    :type W: numpy array of dimensionality (num_outputs, W_columns)
+    :param kappa: a vector which allows the outputs to behave independently
+    :type kappa: numpy array of dimensionality (num_outputs,)
+
+    .. Note: see coregionalization examples in GPy.examples.regression for some usage.
     """
-    def __init__(self,output_dim,rank=1, W=None, kappa=None):
+    def __init__(self, num_outputs, W_columns=1, W=None, kappa=None):
         self.input_dim = 1
         self.name = 'coregion'
-        self.output_dim = output_dim
-        self.rank = rank
+        self.num_outputs = num_outputs
+        self.W_columns = W_columns
         if W is None:
-            self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank)
+            self.W = 0.5*np.random.randn(self.num_outputs,self.W_columns)/np.sqrt(self.W_columns)
         else:
-            assert W.shape==(self.output_dim,self.rank)
+            assert W.shape==(self.num_outputs,self.W_columns)
             self.W = W
         if kappa is None:
-            kappa = 0.5*np.ones(self.output_dim)
+            kappa = 0.5*np.ones(self.num_outputs)
         else:
-            assert kappa.shape==(self.output_dim,)
+            assert kappa.shape==(self.num_outputs,)
         self.kappa = kappa
-        self.num_params = self.output_dim*(self.rank + 1)
+        self.num_params = self.num_outputs*(self.W_columns + 1)
         self._set_params(np.hstack([self.W.flatten(),self.kappa]))

     def _get_params(self):
@@ -52,12 +56,12 @@ class Coregionalize(Kernpart):

     def _set_params(self,x):
         assert x.size == self.num_params
-        self.kappa = x[-self.output_dim:]
-        self.W = x[:-self.output_dim].reshape(self.output_dim,self.rank)
+        self.kappa = x[-self.num_outputs:]
+        self.W = x[:-self.num_outputs].reshape(self.num_outputs,self.W_columns)
         self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa)

     def _get_param_names(self):
-        return sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[]) + ['kappa_%i'%i for i in range(self.output_dim)]
+        return sum([['W%i_%i'%(i,j) for j in range(self.W_columns)] for i in range(self.num_outputs)],[]) + ['kappa_%i'%i for i in range(self.num_outputs)]

     def K(self,index,index2,target):
         index = np.asarray(index,dtype=np.int)
@@ -75,26 +79,26 @@ class Coregionalize(Kernpart):
         if index2 is None:
             code="""
             for(int i=0;i