diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 3552c37e..73680900 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -120,8 +120,12 @@ class GP(Model): mu, var = self._raw_predict(Xnew, full_cov=full_cov) # now push through likelihood - mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata) - return mean, var, _025pm, _975pm + mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata) + return mean, var + + def predict_quantiles(self, X, quantiles=(0.025, 0.975), Y_metadata=None): + m, v = self._raw_predict(X, full_cov=False) + return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata) def posterior_samples_f(self,X,size=10, full_cov=True): """ diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 16b66676..23f8e690 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -54,13 +54,13 @@ class SparseGP(GP): def parameters_changed(self): self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y) - self.likelihood.update_gradients(self.grad_dict.pop('partial_for_likelihood')) + self.likelihood.update_gradients(self.grad_dict['dL_dthetaL']) if isinstance(self.X, VariationalPosterior): #gradients wrt kernel dL_dKmm = self.grad_dict.pop('dL_dKmm') self.kern.update_gradients_full(dL_dKmm, self.Z, None) target = self.kern.gradient.copy() - self.kern.update_gradients_expectations(variational_posterior=self.X, Z=self.Z, **self.grad_dict) + self.kern.update_gradients_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2']) self.kern.gradient += target #gradients wrt Z diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py index a633c381..58d77c03 100644 --- a/GPy/inference/latent_function_inference/__init__.py +++ b/GPy/inference/latent_function_inference/__init__.py @@ -16,8 +16,8 @@ If the likelihood object is something other than Gaussian, then exact inference is not tractable. We then resort to a Laplace approximation (laplace.py) or expectation propagation (ep.py). -The inference methods return a -:class:`~GPy.inference.latent_function_inference.posterior.Posterior` +The inference methods return a +:class:`~GPy.inference.latent_function_inference.posterior.Posterior` instance, which is a simple structure which contains a summary of the posterior. The model classes can then use this posterior object for making predictions, optimizing hyper-parameters, @@ -33,13 +33,13 @@ from dtc import DTC from fitc import FITC # class FullLatentFunctionData(object): -# -# +# +# # class LatentFunctionInference(object): # def inference(self, kern, X, likelihood, Y, Y_metadata=None): # """ # Do inference on the latent functions given a covariance function `kern`, -# inputs and outputs `X` and `Y`, and a likelihood `likelihood`. +# inputs and outputs `X` and `Y`, and a likelihood `likelihood`. # Additional metadata for the outputs `Y` can be given in `Y_metadata`. 
# """ -# raise NotImplementedError, "Abstract base class for full inference" \ No newline at end of file +# raise NotImplementedError, "Abstract base class for full inference" diff --git a/GPy/inference/latent_function_inference/dtc.py b/GPy/inference/latent_function_inference/dtc.py index df2d5a03..5ebc5e53 100644 --- a/GPy/inference/latent_function_inference/dtc.py +++ b/GPy/inference/latent_function_inference/dtc.py @@ -40,7 +40,7 @@ class DTC(object): U = Knm Uy = np.dot(U.T,Y) - #factor Kmm + #factor Kmm Kmmi, L, Li, _ = pdinv(Kmm) # Compute A @@ -78,7 +78,9 @@ class DTC(object): Uv = np.dot(U, v) dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - 1./beta + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1))*beta**2 - grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':np.zeros_like(Knn), 'dL_dKnm':dL_dU.T} + dL_dthetaL = likelihood.exact_inference_gradients(dL_dR) + + grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':np.zeros_like(Knn), 'dL_dKnm':dL_dU.T, 'dL_dthetaL':dL_dthetaL} #construct a posterior object post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L) @@ -154,11 +156,8 @@ class vDTC(object): dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - 1./beta + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1) )*beta**2 dL_dR -=beta*trace_term/num_data - grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':np.zeros_like(Knn) + -0.5*beta, 'dL_dKnm':dL_dU.T} - - #update gradients - kern.update_gradients_sparse(X=X, Z=Z, **grad_dict) - likelihood.update_gradients(dL_dR) + dL_dthetaL = likelihood.exact_inference_gradients(dL_dR) + grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':np.zeros_like(Knn) + -0.5*beta, 'dL_dKnm':dL_dU.T, 'dL_dthetaL':dL_dthetaL} #construct a posterior object post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L) diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py index b063b64d..e76575c6 100644 --- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py +++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py @@ -33,7 +33,7 @@ class ExactGaussianInference(object): #if Y in self.cache, return self.Cache[Y], else store Y in cache and return L. 
raise NotImplementedError, 'TODO' #TODO - def inference(self, kern, X, likelihood, Y, **Y_metadata): + def inference(self, kern, X, likelihood, Y, Y_metadata=None): """ Returns a Posterior class containing essential quantities of the posterior """ @@ -41,12 +41,14 @@ class ExactGaussianInference(object): K = kern.K(X) - Wi, LW, LWi, W_logdet = pdinv(K + likelihood.covariance_matrix(Y, **Y_metadata)) + Wi, LW, LWi, W_logdet = pdinv(K + likelihood.covariance_matrix(Y, Y_metadata)) alpha, _ = dpotrs(LW, YYT_factor, lower=1) log_marginal = 0.5*(-Y.size * log_2_pi - Y.shape[1] * W_logdet - np.sum(alpha * YYT_factor)) - + dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi) - return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK} + dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK)) + + return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL} diff --git a/GPy/inference/latent_function_inference/fitc.py b/GPy/inference/latent_function_inference/fitc.py index 9e9c14e2..9294e25d 100644 --- a/GPy/inference/latent_function_inference/fitc.py +++ b/GPy/inference/latent_function_inference/fitc.py @@ -18,10 +18,6 @@ class FITC(object): self.const_jitter = 1e-6 def inference(self, kern, X, Z, likelihood, Y): - - #TODO: MAX! fix this! - from ...util.misc import param_to_array - Y = param_to_array(Y) num_inducing, _ = Z.shape num_data, output_dim = Y.shape @@ -36,7 +32,7 @@ class FITC(object): Knm = kern.K(X, Z) U = Knm - #factor Kmm + #factor Kmm Kmmi, L, Li, _ = pdinv(Kmm) #compute beta_star, the effective noise precision @@ -72,7 +68,7 @@ class FITC(object): vvT_P = tdot(v.reshape(-1,1)) + P dL_dK = 0.5*(Kmmi - vvT_P) KiU = np.dot(Kmmi, U.T) - dL_dK += np.dot(KiU*dL_dR, KiU.T) + dL_dK += np.dot(KiU*dL_dR, KiU.T) # Compute dL_dU vY = np.dot(v.reshape(-1,1),Y.T) @@ -80,7 +76,8 @@ class FITC(object): dL_dU *= beta_star dL_dU -= 2.*KiU*dL_dR - grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':dL_dR, 'dL_dKnm':dL_dU.T, 'partial_for_likelihood':dL_dR} + dL_dthetaL = likelihood.exact_inference_gradients(dL_dR) + grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':dL_dR, 'dL_dKnm':dL_dU.T, 'dL_dthetaL':dL_dthetaL} #construct a posterior object post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L) diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index 6239e5a4..7a0c14e8 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -137,25 +137,25 @@ class VarDTC(object): psi0, A, LB, trYYT, data_fit) #put the gradients in the right places - partial_for_likelihood = _compute_partial_for_likelihood(likelihood, + dL_dR = _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT) - #likelihood.update_gradients(partial_for_likelihood) + dL_dthetaL = likelihood.exact_inference_gradients(dL_dR) if uncertain_inputs: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2, - 'partial_for_likelihood':partial_for_likelihood} + 'dL_dthetaL':dL_dthetaL} else: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1, - 'partial_for_likelihood':partial_for_likelihood} + 'dL_dthetaL':dL_dthetaL} #get sufficient things for posterior prediction #TODO: do we really want to do this in the loop? 
@@ -232,7 +232,7 @@ class VarDTCMissingData(object): if uncertain_inputs: dL_dpsi2_all = np.zeros((Y.shape[0], num_inducing, num_inducing)) - partial_for_likelihood = 0 + dL_dR = 0 woodbury_vector = np.zeros((num_inducing, Y.shape[1])) woodbury_inv_all = np.zeros((num_inducing, num_inducing, Y.shape[1])) dL_dKmm = 0 @@ -308,7 +308,7 @@ class VarDTCMissingData(object): psi0, A, LB, trYYT, data_fit) #put the gradients in the right places - partial_for_likelihood += _compute_partial_for_likelihood(likelihood, + dL_dR += _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, @@ -334,12 +334,12 @@ class VarDTCMissingData(object): 'dL_dpsi0':dL_dpsi0_all, 'dL_dpsi1':dL_dpsi1_all, 'dL_dpsi2':dL_dpsi2_all, - 'partial_for_likelihood':partial_for_likelihood} + 'dL_dR':dL_dR} else: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0_all, 'dL_dKnm':dL_dpsi1_all, - 'partial_for_likelihood':partial_for_likelihood} + 'dL_dR':dL_dR} #get sufficient things for posterior prediction #TODO: do we really want to do this in the loop? @@ -385,11 +385,11 @@ def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, C return dL_dpsi0, dL_dpsi1, dL_dpsi2 -def _compute_partial_for_likelihood(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT): +def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT): # the partial derivative vector for the likelihood if likelihood.size == 0: # save computation here. - partial_for_likelihood = None + dL_dR = None elif het_noise: if uncertain_inputs: raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented" @@ -399,20 +399,20 @@ def _compute_partial_for_likelihood(likelihood, het_noise, uncertain_inputs, LB, Lmi_psi1, nil = dtrtrs(Lm, psi1.T, lower=1, trans=0) _LBi_Lmi_psi1, _ = dtrtrs(LB, Lmi_psi1, lower=1, trans=0) - partial_for_likelihood = -0.5 * beta + 0.5 * likelihood.V**2 - partial_for_likelihood += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2 + dL_dR = -0.5 * beta + 0.5 * likelihood.V**2 + dL_dR += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2 - partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2 + dL_dR += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2 - partial_for_likelihood += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * beta**2 - partial_for_likelihood += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2 + dL_dR += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * likelihood.Y * beta**2 + dL_dR += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2 else: # likelihood is not heteroscedatic - partial_for_likelihood = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2 - partial_for_likelihood += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta) - partial_for_likelihood += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit) - return partial_for_likelihood + dL_dR = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2 + dL_dR += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta) + dL_dR += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit) + return dL_dR def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit): #compute log marginal 
likelihood diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py index 8e34f6b9..032136a7 100644 --- a/GPy/likelihoods/gaussian.py +++ b/GPy/likelihoods/gaussian.py @@ -18,6 +18,7 @@ import link_functions from likelihood import Likelihood from ..core.parameterization import Param from ..core.parameterization.transformations import Logexp +from scipy import stats class Gaussian(Likelihood): """ @@ -49,11 +50,14 @@ class Gaussian(Likelihood): if isinstance(gp_link, link_functions.Identity): self.log_concave = True - def covariance_matrix(self, Y, **Y_metadata): + def covariance_matrix(self, Y, Y_metadata=None): return np.eye(Y.shape[0]) * self.variance - def update_gradients(self, partial): - self.variance.gradient = np.sum(partial) + def update_gradients(self, grad): + self.variance.gradient = grad + + def exact_inference_gradients(self, dL_dKdiag): + return dL_dKdiag.sum() def _preprocess_values(self, Y): """ @@ -76,16 +80,12 @@ class Gaussian(Likelihood): Z_hat = 1./np.sqrt(2.*np.pi*sum_var)*np.exp(-.5*(data_i - v_i/tau_i)**2./sum_var) return Z_hat, mu_hat, sigma2_hat - def predictive_values(self, mu, var, full_cov=False): + def predictive_values(self, mu, var, full_cov=False, Y_metadata=None): if full_cov: var += np.eye(var.shape[0])*self.variance - d = 2*np.sqrt(np.diag(var)) - low, up = mu - d, mu + d else: var += self.variance - d = 2*np.sqrt(var) - low, up = mu - d, mu + d - return mu, var, low, up + return mu, var def predictive_mean(self, mu, sigma): return mu @@ -93,6 +93,9 @@ def predictive_variance(self, mu, sigma, predictive_mean=None): return self.variance + sigma**2 + def predictive_quantiles(self, mu, var, quantiles, Y_metadata): + return [stats.norm.ppf(q)*np.sqrt(var + self.variance) + mu for q in quantiles] + def pdf_link(self, link_f, y, extra_data=None): """ Likelihood function given link(f) diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py index aff55533..331bcf8d 100644 --- a/GPy/likelihoods/likelihood.py +++ b/GPy/likelihoods/likelihood.py @@ -135,7 +135,7 @@ class Likelihood(Parameterized): return mean - def _predictive_variance(self,mu,variance,predictive_mean=None): + def _predictive_variance(self, mu, variance, predictive_mean=None): """ Numerical approximation to the predictive variance: V(Y_star) @@ -358,7 +358,7 @@ class Likelihood(Parameterized): return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta - def predictive_values(self, mu, var, full_cov=False, sampling=True, num_samples=10000): + def predictive_values(self, mu, var, full_cov=False, Y_metadata=None): """ Compute mean, variance and conficence interval (percentiles 5 and 95) of the prediction. @@ -366,14 +366,21 @@ :param var: variance of the latent variable, f, of posterior :param full_cov: whether to use the full covariance or just the diagonal :type full_cov: Boolean - :param num_samples: number of samples to use in computing quantiles and - possibly mean variance - :type num_samples: integer - :param sampling: Whether to use samples for mean and variances anyway - :type sampling: Boolean - """ + pred_mean = self.predictive_mean(mu, var, Y_metadata) + pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata) + + return pred_mean, pred_var + + + def samples(self, gp): + """ + Returns a set of samples of observations based on a given value of the latent variable.
+ + :param gp: latent variable + """ + raise NotImplementedError if sampling: #Get gp_samples f* using posterior mean and variance if not full_cov: @@ -393,20 +400,4 @@ q1 = np.percentile(samples, 2.5, axis=axis)[:,None] q3 = np.percentile(samples, 97.5, axis=axis)[:,None] - else: - pred_mean = self.predictive_mean(mu, var) - pred_var = self.predictive_variance(mu, var, pred_mean) - print "WARNING: Predictive quantiles are only computed when sampling." - q1 = np.repeat(np.nan,pred_mean.size)[:,None] - q3 = q1.copy() - - return pred_mean, pred_var, q1, q3 - - def samples(self, gp): - """ - Returns a set of samples of observations based on a given value of the latent variable. - - :param gp: latent variable - """ - raise NotImplementedError diff --git a/GPy/likelihoods/mixed_noise.py b/GPy/likelihoods/mixed_noise.py index b60f3adf..946cbaf6 100644 --- a/GPy/likelihoods/mixed_noise.py +++ b/GPy/likelihoods/mixed_noise.py @@ -3,56 +3,57 @@ from scipy import stats, special from GPy.util.univariate_Gaussian import std_norm_pdf, std_norm_cdf import link_functions from likelihood import Likelihood +from gaussian import Gaussian from ..core.parameterization import Param from ..core.parameterization.transformations import Logexp from ..core.parameterization import Parameterized import itertools class MixedNoise(Likelihood): - def __init__(self, likelihoods_list, noise_index, variance = None, name='mixed_noise'): - - Nlike = len(likelihoods_list) - self.order = np.unique(noise_index) - - assert self.order.size == Nlike - - if variance is None: - variance = np.ones(Nlike) - else: - assert variance.size == Nlike + def __init__(self, likelihoods_list, name='mixed_noise'): super(Likelihood, self).__init__(name=name) self.add_parameters(*likelihoods_list) self.likelihoods_list = likelihoods_list - self.noise_index = noise_index self.log_concave = False - self.likelihoods_indices = [noise_index.flatten()==j for j in self.order] - def covariance_matrix(self, Y, noise_index, **Y_metadata): - variance = np.zeros(Y.shape[0]) - for lik, ind in itertools.izip(self.likelihoods_list, self.likelihoods_indices): - variance[ind] = lik.variance - return np.diag(variance) + def update_gradients(self, gradients): + self.gradient = gradients - def update_gradients(self, partial, noise_index, **Y_metadata): - [lik.update_gradients(partial[ind]) for lik,ind in itertools.izip(self.likelihoods_list, self.likelihoods_indices)] + def exact_inference_gradients(self, dL_dKdiag, Y_metadata): + assert all([isinstance(l, Gaussian) for l in self.likelihoods_list]) + ind = Y_metadata['output_index'] + return np.array([dL_dKdiag[ind==i].sum() for i in range(len(self.likelihoods_list))]) - def predictive_values(self, mu, var, full_cov=False, noise_index=None, **Y_metadata): - _variance = np.array([ self.likelihoods_list[j].variance for j in noise_index ]) - if full_cov: - var += np.eye(var.shape[0])*_variance - d = 2*np.sqrt(np.diag(var)) - low, up = mu - d, mu + d + def predictive_values(self, mu, var, full_cov=False, Y_metadata=None): + if all([isinstance(l, Gaussian) for l in self.likelihoods_list]): + ind = Y_metadata['output_index'] + _variance = np.array([self.likelihoods_list[j].variance for j in ind ]) + if full_cov: + var += np.eye(var.shape[0])*_variance + else: + var += _variance + return mu, var else: - var += _variance - d = 2*np.sqrt(var) - low, up = mu - d, mu + d - return mu, var, low, up
+ raise NotImplementedError - def predictive_variance(self, mu, sigma, noise_index, predictive_mean=None, **Y_metadata): - if isinstance(noise_index,int): - _variance = self.variance[noise_index] - else: - _variance = np.array([ self.variance[j] for j in noise_index ])[:,None] - return _variance + sigma**2 + def predictive_variance(self, mu, sigma, predictive_mean=None, Y_metadata=None): + ind = Y_metadata['output_index'] + _variance = np.array([ self.likelihoods_list[j].variance for j in ind ])[:,None] + return _variance + sigma**2 + + + def covariance_matrix(self, Y, Y_metadata): + assert all([isinstance(l, Gaussian) for l in self.likelihoods_list]) + ind = Y_metadata['output_index'].flatten() + variance = np.zeros(Y.shape[0]) + for i, lik in enumerate(self.likelihoods_list): + variance[ind==i] = lik.variance + return np.diag(variance) + diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py index faf2cf84..c87eb694 100644 --- a/GPy/plotting/matplot_dep/models_plots.py +++ b/GPy/plotting/matplot_dep/models_plots.py @@ -12,7 +12,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', which_data_ycols='all', fixed_inputs=[], levels=20, samples=0, fignum=None, ax=None, resolution=None, plot_raw=False, - linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']): + linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None): """ Plot the posterior of the GP. - In one dimension, the function is plotted with a shaded region identifying two standard deviations. @@ -84,17 +84,12 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', m, v = model._raw_predict(Xgrid) lower = m - 2*np.sqrt(v) upper = m + 2*np.sqrt(v) - Y = Y else: - if 'noise_index' in model.Y_metadata.keys(): - if np.unique(model.Y_metadata['noise_index'][which_data_rows]).size > 1: - print "Data slices choosen have different noise models. Just one will be used." - noise_index = np.repeat(model.Y_metadata['noise_index'][which_data_rows][0], Xgrid.shape[0])[:,None] - m, v, lower, upper = model.predict(Xgrid,full_cov=False,noise_index=noise_index) - else: - noise_index = None - m, v, lower, upper = model.predict(Xgrid,full_cov=False) - Y = Y + m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata) + + lower, upper = model.predict_quantiles(Xgrid, Y_metadata=Y_metadata) + + + for d in which_data_ycols: plots['gpplot'] = gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], ax=ax, edgecol=linecol, fillcol=fillcol) plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5) @@ -144,10 +139,8 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', #predict on the frame and plot if plot_raw: m, _ = model._raw_predict(Xgrid) - Y = Y else: - m, _, _, _ = model.predict(Xgrid) - Y = Y + m, _ = model.predict(Xgrid) for d in which_data_ycols: m_d = m[:,d].reshape(resolution, resolution).T plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) diff --git a/GPy/testing/likelihood_tests.py b/GPy/testing/likelihood_tests.py index d55b0190..b1db94a7 100644 --- a/GPy/testing/likelihood_tests.py +++ b/GPy/testing/likelihood_tests.py @@ -667,8 +667,8 @@ class LaplaceTests(unittest.TestCase): m2[:] = m1[:] #Predict for training points to get posterior mean and variance - post_mean, post_var, _, _ = m1.predict(X) - post_mean_approx, post_var_approx, _, _ = m2.predict(X) + post_mean, post_var = m1.predict(X) + post_mean_approx, post_var_approx = m2.predict(X) if debug: import pylab as pb diff --git a/GPy/util/caching.py b/GPy/util/caching.py index 5de03059..96b4cee9 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -48,7 +48,7 @@
class Cacher(object): if k in kw and kw[k] is not None: return self.operation(*args, **kw) # TODO: WARNING !!! Cache OFFSWITCH !!! WARNING - # return self.operation(*args) + return self.operation(*args) #if the result is cached, return the cached computation state = [all(a is b for a, b in itertools.izip_longest(args, cached_i)) for cached_i in self.cached_inputs]
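
For reference, a minimal usage sketch of the reworked prediction API introduced above. The toy data, the reliance on GPRegression's default kernel, and all variable names here are illustrative assumptions, not part of the patch: predict() now returns only the predictive mean and variance, while the confidence bounds that used to be its third and fourth return values come from the new predict_quantiles() method.

    import numpy as np
    import GPy

    # Toy 1-D regression problem (illustrative only).
    np.random.seed(0)
    X = np.random.uniform(-3., 3., (40, 1))
    Y = np.sin(X) + 0.05*np.random.randn(40, 1)

    m = GPy.models.GPRegression(X, Y)   # default kernel and Gaussian likelihood
    m.optimize()

    Xnew = np.linspace(-3., 3., 200)[:, None]

    # predict() now returns just the predictive mean and variance ...
    mean, var = m.predict(Xnew)

    # ... and the 2.5%/97.5% bounds come from predict_quantiles().
    lower, upper = m.predict_quantiles(Xnew, quantiles=(0.025, 0.975))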