diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index bbd3939b..75e5d49a 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -6,14 +6,13 @@ import sys
 from .. import kern
 from .model import Model
 from .parameterization import ObsAr
-from .model import Model
 from .mapping import Mapping
-from .parameterization import ObsAr
 from .. import likelihoods
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
 from .parameterization.variational import VariationalPosterior
 import logging
+import warnings
 from GPy.util.normalizer import MeanNorm
 logger = logging.getLogger("GP")
@@ -65,10 +64,14 @@ class GP(Model):
         self.Y = ObsAr(Y)
         self.Y_normalized = self.Y
-        assert Y.shape[0] == self.num_data
+        if Y.shape[0] != self.num_data:
+            #There can be cases where we want more inputs than outputs, for example if we have multiple latent
+            #function values
+            warnings.warn("There are more rows in your input data X, \
+                           than in your output data Y, be VERY sure this is what you want")
         _, self.output_dim = self.Y.shape
-        #TODO: check the type of this is okay?
+        assert ((Y_metadata is None) or isinstance(Y_metadata, dict))
         self.Y_metadata = Y_metadata
         assert isinstance(kernel, kern.Kern)
@@ -296,7 +299,7 @@ class GP(Model):
         :type size: int.
         :param full_cov: whether to return the full covariance matrix, or just the diagonal.
         :type full_cov: bool.
-        :returns: Ysim: set of simulations
+        :returns: fsim: set of simulations
         :rtype: np.ndarray (N x samples)
         """
         m, v = self._raw_predict(X, full_cov=full_cov)
@@ -304,11 +307,11 @@ class GP(Model):
             m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
         v = v.reshape(m.size,-1) if len(v.shape)==3 else v
         if not full_cov:
-            Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
+            fsim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
         else:
-            Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
+            fsim = np.random.multivariate_normal(m.flatten(), v, size).T
-        return Ysim
+        return fsim

     def posterior_samples(self, X, size=10, full_cov=False, Y_metadata=None):
         """
@@ -324,16 +327,16 @@ class GP(Model):
         :type noise_model: integer.
         :returns: Ysim: set of simulations, a Numpy array (N x samples).
         """
-        Ysim = self.posterior_samples_f(X, size, full_cov=full_cov)
-        Ysim = self.likelihood.samples(Ysim, Y_metadata)
-
+        fsim = self.posterior_samples_f(X, size, full_cov=full_cov)
+        Ysim = self.likelihood.samples(fsim, Y_metadata)
         return Ysim

     def plot_f(self, plot_limits=None, which_data_rows='all',
             which_data_ycols='all', fixed_inputs=[],
             levels=20, samples=0, fignum=None, ax=None, resolution=None,
             plot_raw=True,
-            linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
+            linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx',
+            apply_link=False):
         """
         Plot the GP's view of the world, where the data is normalized and before applying a likelihood.
         This is a call to plot with plot_raw=True.
@@ -370,6 +373,8 @@ class GP(Model):
         :type Y_metadata: dict
         :param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
         :type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
+        :param apply_link: if the likelihood has a link function, plot link(f*) rather than f*
+        :type apply_link: boolean
         """
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
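A minimal usage sketch of the sampling API renamed above (not part of the patch; the toy data, the model `m`, and the grid `Xnew` are illustrative assumptions): `posterior_samples_f` draws latent-function samples from p(f*|y), while `posterior_samples` pushes those draws through the likelihood to give samples of p(y*|y).

import numpy as np
import GPy

X = np.random.uniform(-3., 3., (20, 1))
Y = np.sin(X) + np.random.randn(20, 1)*0.05
m = GPy.models.GPRegression(X, Y)   # illustrative model choice
m.optimize()

Xnew = np.linspace(-3., 3., 100)[:, None]
fsim = m.posterior_samples_f(Xnew, size=10)  # draws from p(f*|y), shape (100, 10)
Ysim = m.posterior_samples(Xnew, size=10)    # draws from p(y*|y), likelihood noise included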
         from ..plotting.matplot_dep import models_plots
@@ -382,7 +387,7 @@ class GP(Model):
                 which_data_ycols, fixed_inputs, levels, samples, fignum,
                 ax, resolution, plot_raw=plot_raw, Y_metadata=Y_metadata,
-                data_symbol=data_symbol, **kw)
+                data_symbol=data_symbol, apply_link=apply_link, **kw)

     def plot(self, plot_limits=None, which_data_rows='all',
             which_data_ycols='all', fixed_inputs=[],
diff --git a/GPy/core/model.py b/GPy/core/model.py
index 4108e72c..937d30e5 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -256,7 +256,7 @@ class Model(Parameterized):
         else:
             optimizer = optimization.get_optimizer(optimizer)
             opt = optimizer(start, model=self, max_iters=max_iters, **kwargs)
-
+
         with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook) as vo:
             opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
             vo.finish(opt)
@@ -371,7 +371,12 @@ class Model(Parameterized):
             f1 = self._objective(xx)
             xx[xind] -= 2.*step
             f2 = self._objective(xx)
-            df_ratio = np.abs((f1 - f2) / min(f1, f2))
+            #Avoid dividing by zero: only compute the ratio if either value is above 1e-15 in
+            #magnitude, otherwise both values are essentially the same
+            if f1 > 1e-15 or f1 < -1e-15 or f2 > 1e-15 or f2 < -1e-15:
+                df_ratio = np.abs((f1 - f2) / min(f1, f2))
+            else:
+                df_ratio = 1.0
             df_unstable = df_ratio < df_tolerance
             numerical_gradient = (f1 - f2) / (2 * step)
             if np.all(gradient[xind] == 0):
                 ratio = (f1 - f2) == gradient[xind]
diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
index 2a0a2592..76b10f08 100644
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@@ -64,3 +64,17 @@ class ExactGaussianInference(LatentFunctionInference):
         dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK),Y_metadata)

         return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
+
+    def LOO(self, kern, X, Y, likelihood, posterior, Y_metadata=None, K=None):
+        """
+        Leave one out error as found in
+        "Bayesian leave-one-out cross-validation approximations for Gaussian latent variable models"
+        Vehtari et al. 2014.
+        """
+        g = posterior.woodbury_vector
+        c = posterior.woodbury_inv
+        c_diag = np.diag(c)[:, None]
+        neg_log_marginal_LOO = 0.5*np.log(2*np.pi) - 0.5*np.log(c_diag) + 0.5*(g**2)/c_diag
+        #We believe, following "Predictive Approaches for Choosing Hyperparameters in Gaussian
+        #Processes", that this is the negative log marginal LOO
+        return -neg_log_marginal_LOO
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index c6921f57..aefc82ac 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -19,6 +19,7 @@ def warning_on_one_line(message, category, filename, lineno, file=None, line=Non
 warnings.formatwarning = warning_on_one_line
 from scipy import optimize
 from . import LatentFunctionInference
+from scipy.integrate import quad

 class Laplace(LatentFunctionInference):
@@ -39,6 +40,85 @@ class Laplace(LatentFunctionInference):
         self.first_run = True
         self._previous_Ki_fhat = None

+    def LOO(self, kern, X, Y, likelihood, posterior, Y_metadata=None, K=None, f_hat=None, W=None, Ki_W_i=None):
+        """
+        Leave one out log predictive density as found in
+        "Bayesian leave-one-out cross-validation approximations for Gaussian latent variable models"
+        Vehtari et al. 2014.
+        """
+        Ki_f_init = np.zeros_like(Y)
+
+        if K is None:
+            K = kern.K(X)
+
+        if f_hat is None:
+            f_hat, _ = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
+
+        if W is None:
+            W = -likelihood.d2logpdf_df2(f_hat, Y, Y_metadata=Y_metadata)
+
+        if Ki_W_i is None:
+            _, _, _, Ki_W_i = self._compute_B_statistics(K, W, likelihood.log_concave)
+
+        logpdf_dfhat = likelihood.dlogpdf_df(f_hat, Y, Y_metadata=Y_metadata)
+
+        if W.shape[1] == 1:
+            W = np.diagflat(W)
+
+        #Eq 14 and 16
+        var_site = 1./np.diag(W)[:, None]
+        mu_site = f_hat + var_site*logpdf_dfhat
+        prec_site = 1./var_site
+        #Eq 19
+        marginal_cov = Ki_W_i
+        marginal_mu = marginal_cov.dot(np.diagflat(prec_site)).dot(mu_site)
+        marginal_var = np.diag(marginal_cov)[:, None]
+        #Eq 30, using the site parameters instead of the Gaussian site parameters
+        #(var_site instead of sigma^{2})
+        posterior_cav_var = 1./(1./marginal_var - 1./var_site)
+        posterior_cav_mean = posterior_cav_var*((1./marginal_var)*marginal_mu - (1./var_site)*Y)
+
+        flat_y = Y.flatten()
+        flat_mu = posterior_cav_mean.flatten()
+        flat_var = posterior_cav_var.flatten()
+
+        if Y_metadata is not None:
+            #Need to zip individual elements of Y_metadata as well
+            Y_metadata_flat = {}
+            for key, val in Y_metadata.items():
+                Y_metadata_flat[key] = np.atleast_1d(val).reshape(-1, 1)
+
+            zipped_values = []
+
+            for i in range(Y.shape[0]):
+                y_m = {}
+                for key, val in Y_metadata_flat.items():
+                    if np.isscalar(val) or val.shape[0] == 1:
+                        y_m[key] = val
+                    else:
+                        #Won't broadcast yet
+                        y_m[key] = val[i]
+                zipped_values.append((flat_y[i], flat_mu[i], flat_var[i], y_m))
+        else:
+            #Otherwise just pass along None's
+            zipped_values = zip(flat_y, flat_mu, flat_var, [None]*Y.shape[0])
+
+        def integral_generator(yi, mi, vi, yi_m):
+            def f(fi_star):
+                #More stable in the log space
+                p_fi = np.exp(likelihood.logpdf(fi_star, yi, yi_m)
+                              - 0.5*np.log(2*np.pi*vi)
+                              - 0.5*np.square(mi-fi_star)/vi)
+                return p_fi
+            return f
+
+        #Eq 30
+        p_ystar, _ = zip(*[quad(integral_generator(y, m, v, yi_m), -np.inf, np.inf)
+                           for y, m, v, yi_m in zipped_values])
+        p_ystar = np.array(p_ystar).reshape(-1, 1)
+        return np.log(p_ystar)
+
     def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
         """
         Returns a Posterior class containing essential quantities of the posterior
diff --git a/GPy/kern/_src/independent_outputs.py b/GPy/kern/_src/independent_outputs.py
index aa9dca80..6f8b7be1 100644
--- a/GPy/kern/_src/independent_outputs.py
+++ b/GPy/kern/_src/independent_outputs.py
@@ -8,7 +8,7 @@ import itertools

 def index_to_slices(index):
     """
-    take a numpy array of integers (index) and return a nested list of slices such that the slices describe the start, stop points for each integer in the index. 
+    take a numpy array of integers (index) and return a nested list of slices such that the slices describe the start, stop points for each integer in the index.

     e.g.
     >>> index = np.asarray([0,0,0,1,1,1,2,2,2])
@@ -79,10 +79,10 @@ class IndependentOutputs(CombinationKernel):
     def update_gradients_full(self,dL_dK,X,X2=None):
         slices = index_to_slices(X[:,self.index_dim])
-        if self.single_kern: 
+        if self.single_kern:
             target = np.zeros(self.kern.size)
             kerns = itertools.repeat(self.kern)
-        else: 
+        else:
             kerns = self.kern
             target = [np.zeros(kern.size) for kern, _ in zip(kerns, slices)]
         def collate_grads(kern, i, dL, X, X2):
@@ -94,7 +94,7 @@ class IndependentOutputs(CombinationKernel):
         else:
             slices2 = index_to_slices(X2[:,self.index_dim])
             [[[collate_grads(kern, i, dL_dK[s,s2],X[s],X2[s2]) for s in slices_i] for s2 in slices_j] for i,(kern,slices_i,slices_j) in enumerate(zip(kerns,slices,slices2))]
-        if self.single_kern: 
+        if self.single_kern:
             self.kern.gradient = target
         else:
             [kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))]
@@ -104,12 +104,14 @@ class IndependentOutputs(CombinationKernel):
         kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
         if X2 is None:
             # TODO: make use of index_to_slices
+            # FIXME: Broken as X is already sliced out
+            print "Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!"
             values = np.unique(X[:,self.index_dim])
             slices = [X[:,self.index_dim]==i for i in values]
             [target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None)) for kern, s in zip(kerns, slices)]
             #slices = index_to_slices(X[:,self.index_dim])
-            #[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s]) 
+            #[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s])
             # for s in slices_i] for kern, slices_i in zip(kerns, slices)]
             #import ipdb;ipdb.set_trace()
             #[[(np.add(target[s ], kern.gradients_X(dL_dK[s ,ss],X[s ], X[ss]), out=target[s ]),
diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index 9ecf7dbf..9abb8cde 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -132,10 +132,8 @@ class Gaussian(Likelihood):
         :returns: log likelihood evaluated for this point
         :rtype: float
         """
-        N = y.shape[0]
         ln_det_cov = np.log(self.variance)
-
-        return -0.5*((y-link_f)**2/self.variance + ln_det_cov + np.log(2.*np.pi))
+        return -(1.0/(2*self.variance))*((y-link_f)**2) - 0.5*ln_det_cov - 0.5*np.log(2.*np.pi)

     def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
         """
@@ -220,7 +218,6 @@ class Gaussian(Likelihood):
         """
         e = y - link_f
         s_4 = 1.0/(self.variance**2)
-        N = y.shape[0]
         dlik_dsigma = -0.5/self.variance + 0.5*s_4*np.square(e)
         return dlik_dsigma
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index c876100d..5388526e 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -114,21 +114,24 @@ class Likelihood(Parameterized):
             #Otherwise just pass along None's
             zipped_values = zip(flat_y_test, flat_mu_star, flat_var_star, [None]*y_test.shape[0])

-        def integral_generator(y, m, v, y_m):
-            """Generate a function which can be integrated to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
-            def f(f_star):
+        def integral_generator(yi, mi, vi, yi_m):
+            """Generate a function which can be integrated
+            to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
+            def f(fi_star):
                 #exponent = np.exp(-(1./(2*v))*np.square(m-f_star))
                 #from GPy.util.misc import safe_exp
                 #exponent = safe_exp(exponent)
                 #return self.pdf(f_star, y, y_m)*exponent

                 #More stable in the log space
-                return np.exp(self.logpdf(f_star, y, y_m) -(1./(2*v))*np.square(m-f_star))
+                return np.exp(self.logpdf(fi_star, yi, yi_m)
+                              - 0.5*np.log(2*np.pi*vi)
+                              - 0.5*np.square(mi-fi_star)/vi)
             return f

-        scaled_p_ystar, accuracy = zip(*[quad(integral_generator(y, m, v, y_m), -np.inf, np.inf) for y, m, v, y_m in zipped_values])
-        scaled_p_ystar = np.array(scaled_p_ystar).reshape(-1,1)
-        p_ystar = scaled_p_ystar/np.sqrt(2*np.pi*var_star)
+        p_ystar, _ = zip(*[quad(integral_generator(yi, mi, vi, yi_m), -np.inf, np.inf)
+                           for yi, mi, vi, yi_m in zipped_values])
+        p_ystar = np.array(p_ystar).reshape(-1, 1)
         return np.log(p_ystar)

     def _moments_match_ep(self,obs,tau,v):
@@ -545,9 +548,9 @@ class Likelihood(Parameterized):
         #Parameters are stacked vertically. Must be listed in same order as 'get_param_names'

         # ensure we have gradients for every parameter we want to optimize
-        assert dlogpdf_dtheta.shape[0] == self.size #f, d x num_param array
-        assert dlogpdf_df_dtheta.shape[0] == self.size #f x d x num_param matrix or just f x num_param
-        assert d2logpdf_df2_dtheta.shape[0] == self.size #f x num_param matrix or f x d x num_param matrix, f x f x num_param or f x f x d x num_param
+        assert dlogpdf_dtheta.shape[0] == self.size #num_param x f, d array
+        assert dlogpdf_df_dtheta.shape[0] == self.size #num_param x f x d matrix or just num_param x f
+        assert d2logpdf_df2_dtheta.shape[0] == self.size #num_param x f matrix or num_param x f x d matrix, num_param x f x f or num_param x f x f x d

         return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
diff --git a/GPy/likelihoods/link_functions.py b/GPy/likelihoods/link_functions.py
index 41bff1f8..3d753395 100644
--- a/GPy/likelihoods/link_functions.py
+++ b/GPy/likelihoods/link_functions.py
@@ -4,9 +4,7 @@ import numpy as np
 from ..util.univariate_Gaussian import std_norm_cdf, std_norm_pdf
 import scipy as sp
-
-_exp_lim_val = np.finfo(np.float64).max
-_lim_val = np.log(_exp_lim_val)
+from ..util.misc import safe_exp, safe_square, safe_cube, safe_quad, safe_three_times

 class GPTransformation(object):
     """
@@ -63,12 +61,13 @@ class Identity(GPTransformation):
     def d3transf_df3(self,f):
         return np.zeros_like(f)

+
 class Probit(GPTransformation):
     """
     .. math::
         g(f) = \\Phi^{-1} (mu)
-
+
     """
     def transf(self,f):
         return std_norm_cdf(f)
@@ -80,7 +79,7 @@ class Probit(GPTransformation):
         return -f * std_norm_pdf(f)

     def d3transf_df3(self,f):
-        return (np.square(f)-1.)*std_norm_pdf(f)
+        return (safe_square(f)-1.)*std_norm_pdf(f)

 class Cloglog(GPTransformation):
     """
@@ -96,19 +95,23 @@ class Cloglog(GPTransformation):
     """
     def transf(self,f):
-        return 1-np.exp(-np.exp(f))
+        ef = safe_exp(f)
+        return 1-np.exp(-ef)

     def dtransf_df(self,f):
-        return np.exp(f-np.exp(f))
+        ef = safe_exp(f)
+        return np.exp(f-ef)

     def d2transf_df2(self,f):
-        ef = np.exp(f)
+        ef = safe_exp(f)
         return -np.exp(f-ef)*(ef-1.)
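The safe_* helpers that these rewritten link functions rely on all follow the same pattern: clip the input just below the point where the following operation would overflow float64. A minimal sketch of the idea, assuming only numpy (the actual definitions appear in GPy/util/misc.py further down this patch):

import numpy as np

_lim_val = np.finfo(np.float64).max  # largest representable float64
_lim_val_exp = np.log(_lim_val)      # np.exp overflows above this (~709.78)

def safe_exp(f):
    # clip first, so np.exp can never overflow to inf
    return np.exp(np.clip(f, -np.inf, _lim_val_exp))

print(np.exp(1000.0))    # inf, with a RuntimeWarning
print(safe_exp(1000.0))  # ~1.8e308, finite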
     def d3transf_df3(self,f):
-        ef = np.exp(f)
-        return np.exp(f-ef)*(1.-3*ef + ef**2)
-
+        ef = safe_exp(f)
+        ef2 = safe_square(ef)
+        three_times_ef = safe_three_times(ef)
+        r_val = np.exp(f-ef)*(1.-three_times_ef + ef2)
+        return r_val

 class Log(GPTransformation):
     """
@@ -118,16 +121,16 @@ class Log(GPTransformation):
     """
     def transf(self,f):
-        return np.exp(np.clip(f, -np.inf, _lim_val))
+        return safe_exp(f)

     def dtransf_df(self,f):
-        return np.exp(np.clip(f, -np.inf, _lim_val))
+        return safe_exp(f)

     def d2transf_df2(self,f):
-        return np.exp(np.clip(f, -np.inf, _lim_val))
+        return safe_exp(f)

     def d3transf_df3(self,f):
-        return np.exp(np.clip(f, -np.inf, _lim_val))
+        return safe_exp(f)

 class Log_ex_1(GPTransformation):
     """
@@ -137,17 +140,20 @@ class Log_ex_1(GPTransformation):
     """
     def transf(self,f):
-        return np.log(1.+np.exp(f))
+        return np.log1p(safe_exp(f))

     def dtransf_df(self,f):
-        return np.exp(f)/(1.+np.exp(f))
+        ef = safe_exp(f)
+        return ef/(1.+ef)

     def d2transf_df2(self,f):
-        aux = np.exp(f)/(1.+np.exp(f))
+        ef = safe_exp(f)
+        aux = ef/(1.+ef)
         return aux*(1.-aux)

     def d3transf_df3(self,f):
-        aux = np.exp(f)/(1.+np.exp(f))
+        ef = safe_exp(f)
+        aux = ef/(1.+ef)
         daux_df = aux*(1.-aux)
         return daux_df - (2.*aux*daux_df)
@@ -155,14 +161,17 @@ class Reciprocal(GPTransformation):
     def transf(self,f):
         return 1./f

-    def dtransf_df(self,f):
-        return -1./(f**2)
+    def dtransf_df(self, f):
+        f2 = safe_square(f)
+        return -1./f2

-    def d2transf_df2(self,f):
-        return 2./(f**3)
+    def d2transf_df2(self, f):
+        f3 = safe_cube(f)
+        return 2./f3

     def d3transf_df3(self,f):
-        return -6./(f**4)
+        f4 = safe_quad(f)
+        return -6./f4

 class Heaviside(GPTransformation):
     """
diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index 8f8fd838..0d18eb47 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -11,7 +11,7 @@ from .sparse_gplvm import SparseGPLVM
 from .warped_gp import WarpedGP
 from .bayesian_gplvm import BayesianGPLVM
 from .mrd import MRD
-from .gradient_checker import GradientChecker
+from .gradient_checker import GradientChecker, HessianChecker, SkewChecker
 from .ss_gplvm import SSGPLVM
 from .gp_coregionalized_regression import GPCoregionalizedRegression
 from .sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index 7cbd69eb..3ac703fe 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -24,7 +24,7 @@ class BayesianGPLVM(SparseGP_MPI):
     def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
                  Z=None, kernel=None, inference_method=None, likelihood=None,
                  name='bayesian gplvm', mpi_comm=None, normalizer=None,
-                 missing_data=False, stochastic=False, batchsize=1):
+                 missing_data=False, stochastic=False, batchsize=1, Y_metadata=None):
         self.logger = logging.getLogger(self.__class__.__name__)
         if X is None:
@@ -69,6 +69,7 @@ class BayesianGPLVM(SparseGP_MPI):
                                             name=name, inference_method=inference_method,
                                             normalizer=normalizer,
                                             mpi_comm=mpi_comm, variational_prior=self.variational_prior,
+                                            Y_metadata=Y_metadata
                                             )
         self.link_parameter(self.X, index=0)
@@ -83,7 +84,7 @@ class BayesianGPLVM(SparseGP_MPI):
     def parameters_changed(self):
         super(BayesianGPLVM,self).parameters_changed()
         if isinstance(self.inference_method, VarDTC_minibatch):
-            return 
+            return

         kl_fctr = 1.
         self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)
diff --git a/GPy/models/gradient_checker.py b/GPy/models/gradient_checker.py
index 74026f8e..c2cde834 100644
--- a/GPy/models/gradient_checker.py
+++ b/GPy/models/gradient_checker.py
@@ -5,6 +5,8 @@
 from ..core.model import Model
 import itertools
 import numpy
 from ..core.parameterization import Param
+np = numpy
+from ..util.block_matrices import get_blocks, get_block_shapes, unblock, get_blocks_3d, get_block_shapes_3d

 def get_shape(x):
     if isinstance(x, numpy.ndarray):
@@ -111,3 +113,261 @@ class GradientChecker(Model):
         #for name, shape in zip(self.names, self.shapes):
             #_param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
         #return _param_names
+
+
+class HessianChecker(GradientChecker):
+
+    def __init__(self, f, df, ddf, x0, names=None, *args, **kwargs):
+        """
+        :param f: Function (only used for the numerical hessian gradient)
+        :param df: Gradient of the function (used to compute the numerical hessian)
+        :param ddf: Analytical hessian function to check
+        :param x0:
+            Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
+            Can be a list of arrays, if takes a list of arrays. This list will be passed
+            to f and df in the same order as given here.
+            If only one argument, make sure not to pass a list!!!
+
+        :type x0: [array-like] | array-like | float | int
+        :param names:
+            Names to print, when performing gradcheck. If a list was passed to x0
+            a list of names with the same length is expected.
+        :param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
+
+        """
+        super(HessianChecker, self).__init__(df, ddf, x0, names=names, *args, **kwargs)
+        self._f = f
+        self._df = df
+        self._ddf = ddf
+
+    def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3, block_indices=None, plot=False):
+        """
+        Overwrite checkgrad method to check the whole block instead of looping through
+
+        Shows diagnostics using matshow instead
+
+        :param verbose: If True, print a "full" checking of each parameter
+        :type verbose: bool
+        :param step: The size of the step around which to linearise the objective
+        :type step: float (default 1e-6)
+        :param tolerance: the tolerance allowed (see note)
+        :type tolerance: float (default 1e-3)
+
+        Note:-
+           The gradient is considered correct if the ratio of the analytical
+           and numerical gradients is within tolerance of unity.
+        """
+        try:
+            import numdifftools as nd
+        except:
+            raise ImportError("Don't have numdifftools package installed, it is not a GPy dependency as of yet, it is only used for hessian tests")
+
+        if target_param:
+            raise NotImplementedError('Only basic functionality is provided with this gradchecker')
+
+        #Repeat for each parameter; not the nicest, but there shouldn't be many cases
+        #where there are many variables
+        current_index = 0
+        for name, shape in zip(self.names, self.shapes):
+            current_size = numpy.prod(shape)
+            x = self.optimizer_array.copy()
+            #x = self._get_params_transformed().copy()
+            x = x[current_index:current_index + current_size].reshape(shape)
+
+            # Check gradients
+            analytic_hess = self._ddf(x)
+            if analytic_hess.shape[1] == 1:
+                analytic_hess = numpy.diagflat(analytic_hess)
+
+            #From the docs:
+            #x0 : vector location
+            #at which to differentiate fun
+            #If x0 is an N x M array, then fun is assumed to be a function
+            #of N*M variables., thus we must have it flat, not (N,1), but just (N,)
+            #numeric_hess_partial = nd.Hessian(self._f, vectorized=False)
+            numeric_hess_partial = nd.Jacobian(self._df, vectorized=False)
+            #numeric_hess_partial = nd.Derivative(self._df, vectorized=True)
+            numeric_hess = numeric_hess_partial(x)
+
+            check_passed = self.checkgrad_block(analytic_hess, numeric_hess, verbose=verbose, step=step, tolerance=tolerance, block_indices=block_indices, plot=plot)
+            current_index += current_size
+        return check_passed
+
+    def checkgrad_block(self, analytic_hess, numeric_hess, verbose=False, step=1e-6, tolerance=1e-3, block_indices=None, plot=False):
+        """
+        Checkgrad a block matrix
+        """
+        if analytic_hess.dtype is np.dtype('object'):
+            #Make the numeric hessian also into a block matrix
+            real_size = get_block_shapes(analytic_hess)
+            num_elements = np.sum(real_size)
+            if (num_elements, num_elements) == numeric_hess.shape:
+                #If the sizes match we assume the numeric hessian is the whole hessian
+                #(we have not fixed any values)
+                numeric_hess = get_blocks(numeric_hess, real_size)
+            else:
+                #Make a fake empty matrix and fill out the correct block
+                tmp_numeric_hess = get_blocks(np.zeros((num_elements, num_elements)), real_size)
+                tmp_numeric_hess[block_indices] = numeric_hess.copy()
+                numeric_hess = tmp_numeric_hess
+
+        if block_indices is not None:
+            #Extract the right block
+            analytic_hess = analytic_hess[block_indices]
+            numeric_hess = numeric_hess[block_indices]
+        else:
+            #Unblock them if they are in blocks and you aren't checking a single block (checking the whole hessian)
+            if analytic_hess.dtype is np.dtype('object'):
+                analytic_hess = unblock(analytic_hess)
+                numeric_hess = unblock(numeric_hess)
+
+        ratio = numeric_hess / (numpy.where(analytic_hess==0, 1e-10, analytic_hess))
+        difference = numpy.abs(analytic_hess - numeric_hess)
+
+        check_passed = numpy.all((numpy.abs(1 - ratio)) < tolerance) or numpy.allclose(numeric_hess, analytic_hess, atol = tolerance)
+
+        if verbose:
+            if block_indices:
+                print "\nBlock {}".format(block_indices)
+            else:
+                print "\nAll blocks"
+
+            header = ['Checked', 'Max-Ratio', 'Min-Ratio', 'Min-Difference', 'Max-Difference']
+            header_string = map(lambda x: ' | '.join(header), [header])
+            separator = '-' * len(header_string[0])
+            print '\n'.join([header_string[0], separator])
+            min_r = '%.6f' % float(numpy.min(ratio))
+            max_r = '%.6f' % float(numpy.max(ratio))
+            max_d = '%.6f' % float(numpy.max(difference))
+            min_d = '%.6f' % float(numpy.min(difference))
+            cols = [max_r, min_r, min_d, max_d]
+
+            if check_passed:
+                checked = "\033[92m True \033[0m"
+            else:
+                checked = "\033[91m False \033[0m"
+
+            grad_string = "{} | {} | {} | {} | {} ".format(checked, cols[0], cols[1], cols[2], cols[3])
+            print grad_string
+
+        if plot:
+            import pylab as pb
+            fig, axes = pb.subplots(2, 2)
+            max_lim = numpy.max(numpy.vstack((analytic_hess, numeric_hess)))
+            min_lim = numpy.min(numpy.vstack((analytic_hess, numeric_hess)))
+            msa = axes[0,0].matshow(analytic_hess, vmin=min_lim, vmax=max_lim)
+            axes[0,0].set_title('Analytic hessian')
+            axes[0,0].xaxis.set_ticklabels([None])
+            axes[0,0].yaxis.set_ticklabels([None])
+            axes[0,0].xaxis.set_ticks([None])
+            axes[0,0].yaxis.set_ticks([None])
+            msn = axes[0,1].matshow(numeric_hess, vmin=min_lim, vmax=max_lim)
+            pb.colorbar(msn, ax=axes[0,1])
+            axes[0,1].set_title('Numeric hessian')
+            axes[0,1].xaxis.set_ticklabels([None])
+            axes[0,1].yaxis.set_ticklabels([None])
+            axes[0,1].xaxis.set_ticks([None])
+            axes[0,1].yaxis.set_ticks([None])
+            msr = axes[1,0].matshow(ratio)
+            pb.colorbar(msr, ax=axes[1,0])
+            axes[1,0].set_title('Ratio')
+            axes[1,0].xaxis.set_ticklabels([None])
+            axes[1,0].yaxis.set_ticklabels([None])
+            axes[1,0].xaxis.set_ticks([None])
+            axes[1,0].yaxis.set_ticks([None])
+            msd = axes[1,1].matshow(difference)
+            pb.colorbar(msd, ax=axes[1,1])
+            axes[1,1].set_title('difference')
+            axes[1,1].xaxis.set_ticklabels([None])
+            axes[1,1].yaxis.set_ticklabels([None])
+            axes[1,1].xaxis.set_ticks([None])
+            axes[1,1].yaxis.set_ticks([None])
+            if block_indices:
+                fig.suptitle("Block: {}".format(block_indices))
+            pb.show()
+
+        return check_passed
+
+class SkewChecker(HessianChecker):
+
+    def __init__(self, df, ddf, dddf, x0, names=None, *args, **kwargs):
+        """
+        :param df: Gradient of the function
+        :param ddf: Hessian of the function (used to compute the numerical third derivative)
+        :param dddf: Analytical third derivative function to check
+        :param x0:
+            Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
+            Can be a list of arrays, if takes a list of arrays. This list will be passed
+            to f and df in the same order as given here.
+            If only one argument, make sure not to pass a list!!!
+
+        :type x0: [array-like] | array-like | float | int
+        :param names:
+            Names to print, when performing gradcheck. If a list was passed to x0
+            a list of names with the same length is expected.
+        :param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
+
+        """
+        super(SkewChecker, self).__init__(df, ddf, dddf, x0, names=names, *args, **kwargs)
+
+    def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3, block_indices=None, plot=False, super_plot=False):
+        """
+        Gradient checker that just checks each hessian individually
+
+        super_plot will plot the hessian wrt every parameter, plot will just do the first one
+        """
+        try:
+            import numdifftools as nd
+        except:
+            raise ImportError("Don't have numdifftools package installed, it is not a GPy dependency as of yet, it is only used for hessian tests")
+
+        if target_param:
+            raise NotImplementedError('Only basic functionality is provided with this gradchecker')
+
+        #Repeat for each parameter; not the nicest, but there shouldn't be many cases
+        #where there are many variables
+        current_index = 0
+        for name, n_shape in zip(self.names, self.shapes):
+            current_size = numpy.prod(n_shape)
+            x = self.optimizer_array.copy()
+            #x = self._get_params_transformed().copy()
+            x = x[current_index:current_index + current_size].reshape(n_shape)
+
+            # Check gradients
+            #Actually the third derivative
+            analytic_hess = self._ddf(x)
+
+            #Can only calculate the jacobian for one variable at a time
+            #From the docs:
+            #x0 : vector location
+            #at which to differentiate fun
+            #If x0 is an N x M array, then fun is assumed to be a function
+            #of N*M variables., thus we must have it flat, not (N,1), but just (N,)
+            #numeric_hess_partial = nd.Hessian(self._f, vectorized=False)
+            #Actually _df is already the hessian
+            numeric_hess_partial = nd.Jacobian(self._df, vectorized=True)
+            numeric_hess = numeric_hess_partial(x)
+
+            print "Done making numerical hessian"
+            if analytic_hess.dtype is np.dtype('object'):
+                #Blockify numeric_hess as well
+                blocksizes, pagesizes = get_block_shapes_3d(analytic_hess)
+                #HACK
+                real_block_size = np.sum(blocksizes)
+                numeric_hess = numeric_hess.reshape(real_block_size, real_block_size, pagesizes)
+                #numeric_hess = get_blocks_3d(numeric_hess, blocksizes)#, pagesizes)
+            else:
+                numeric_hess = numeric_hess.reshape(*analytic_hess.shape)
+
+            #Check every block individually (for ease)
+            check_passed = [False]*numeric_hess.shape[2]
+            for block_ind in xrange(numeric_hess.shape[2]):
+                #Unless super_plot is set, just plot the first one
+                p = True if (plot and block_ind == numeric_hess.shape[2]-1) or super_plot else False
+                if verbose:
+                    print "Checking derivative of hessian wrt parameter number {}".format(block_ind)
+                check_passed[block_ind] = self.checkgrad_block(analytic_hess[:,:,block_ind], numeric_hess[:,:,block_ind], verbose=verbose, step=step, tolerance=tolerance, block_indices=block_indices, plot=p)
+
+            current_index += current_size
+        return np.all(check_passed)
+
diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py
index 5cdf69fc..0cda12f1 100644
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2015, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 try:
@@ -16,7 +16,8 @@
 def plot_fit(model, plot_limits=None, which_data_rows='all',
         which_data_ycols='all', fixed_inputs=[],
         levels=20, samples=0, fignum=None, ax=None, resolution=None, plot_raw=False,
-        linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None, data_symbol='kx'):
+        linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None, data_symbol='kx',
+        apply_link=False, samples_f=0, plot_uncertain_inputs=True):
     """
     Plot the posterior of the GP.
       - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
@@ -38,7 +39,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
     :type resolution: int
     :param levels: number of levels to plot in a contour plot.
     :type levels: int
-    :param samples: the number of a posteriori samples to plot
+    :param samples: the number of posterior samples to plot, drawn from p(y*|y)
     :type samples: int
     :param fignum: figure to plot on.
     :type fignum: figure number
@@ -49,6 +50,10 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
     :type linecol:
     :param fillcol: color of fill
     :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
+    :param apply_link: apply the link function if plotting f (default false)
+    :type apply_link: boolean
+    :param samples_f: the number of posterior samples of f to plot, drawn from p(f*|y)
+    :type samples_f: int
     """
     #deal with optional arguments
     if which_data_rows == 'all':
@@ -88,8 +93,14 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
     #make a prediction on the frame and plot it
     if plot_raw:
         m, v = model._raw_predict(Xgrid)
-        lower = m - 2*np.sqrt(v)
-        upper = m + 2*np.sqrt(v)
+        if apply_link:
+            lower = model.likelihood.gp_link.transf(m - 2*np.sqrt(v))
+            upper = model.likelihood.gp_link.transf(m + 2*np.sqrt(v))
+            #Once transformed this is now the median of the function
+            m = model.likelihood.gp_link.transf(m)
+        else:
+            lower = m - 2*np.sqrt(v)
+            upper = m + 2*np.sqrt(v)
     else:
         if isinstance(model,GPCoregionalizedRegression) or isinstance(model,SparseGPCoregionalizedRegression):
             meta = {'output_index': Xgrid[:,-1:].astype(np.int)}
@@ -110,13 +121,31 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
             plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
             #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
+    if samples_f: #NOTE not tested with fixed_inputs
+        Fsim = model.posterior_samples_f(Xgrid, samples_f)
+        for fi in Fsim.T:
+            plots['posterior_samples_f'] = ax.plot(Xnew, fi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
+            #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
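A usage sketch of the two new plotting keywords above (illustrative only; `m` is assumed to be a fitted 1-D model with a non-Gaussian likelihood, e.g. a GPy.models.GPClassification instance):

from GPy.plotting.matplot_dep.models_plots import plot_fit

plot_fit(m, plot_raw=True, apply_link=True)  # latent f* pushed through the link function
plot_fit(m, samples_f=5)                     # overlay 5 samples drawn from p(f*|y)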
+
     #add error bars for uncertain (if input uncertainty is being modelled)
-    if hasattr(model,"has_uncertain_inputs") and model.has_uncertain_inputs():
-        plots['xerrorbar'] = ax.errorbar(X[which_data_rows, free_dims].flatten(), Y[which_data_rows, which_data_ycols].flatten(),
-                    xerr=2 * np.sqrt(X_variance[which_data_rows, free_dims].flatten()),
-                    ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
-
+    if hasattr(model,"has_uncertain_inputs") and model.has_uncertain_inputs() and plot_uncertain_inputs:
+        if plot_raw:
+            #add error bars for uncertain inputs (if input uncertainty is being modelled), for plot_f
+            #Hack to plot error bars on the latent function, rather than on the data
+            vs = model.X.mean.values.copy()
+            for i,v in fixed_inputs:
+                vs[:,i] = v
+            m_X, _ = model._raw_predict(vs)
+            if apply_link:
+                m_X = model.likelihood.gp_link.transf(m_X)
+            plots['xerrorbar'] = ax.errorbar(X[which_data_rows, free_dims].flatten(), m_X[which_data_rows, which_data_ycols].flatten(),
+                        xerr=2 * np.sqrt(X_variance[which_data_rows, free_dims].flatten()),
+                        ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
+        else:
+            plots['xerrorbar'] = ax.errorbar(X[which_data_rows, free_dims].flatten(), Y[which_data_rows, which_data_ycols].flatten(),
+                        xerr=2 * np.sqrt(X_variance[which_data_rows, free_dims].flatten()),
+                        ecolor='k', fmt=None, elinewidth=.5, alpha=.5)

     #set the limits of the plot to some sensible values
     ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
@@ -186,3 +215,29 @@ def plot_fit_f(model, *args, **kwargs):
     """
     kwargs['plot_raw'] = True
     plot_fit(model,*args, **kwargs)
+
+def fixed_inputs(model, non_fixed_inputs, fix_routine='median'):
+    """
+    Convenience function for returning fixed_inputs where the other inputs
+    are fixed using fix_routine.
+    :param model: model
+    :type model: Model
+    :param non_fixed_inputs: dimensions of non fixed inputs
+    :type non_fixed_inputs: list
+    :param fix_routine: fixing routine to use, 'mean', 'median', 'zero'
+    :type fix_routine: string
+    """
+    f_inputs = []
+    if hasattr(model, 'has_uncertain_inputs') and model.has_uncertain_inputs():
+        X = model.X.mean.values.copy()
+    else:
+        X = model.X.values.copy()
+    for i in range(X.shape[1]):
+        if i not in non_fixed_inputs:
+            if fix_routine == 'mean':
+                f_inputs.append( (i, np.mean(X[:,i])) )
+            elif fix_routine == 'median':
+                f_inputs.append( (i, np.median(X[:,i])) )
+            elif fix_routine == 'zero':
+                f_inputs.append( (i, 0) )
+    return f_inputs
diff --git a/GPy/testing/link_function_tests.py b/GPy/testing/link_function_tests.py
new file mode 100644
index 00000000..fb8fba99
--- /dev/null
+++ b/GPy/testing/link_function_tests.py
@@ -0,0 +1,143 @@
+import numpy as np
+import scipy as sp
+from scipy.special import cbrt
+from GPy.models import GradientChecker
+_lim_val = np.finfo(np.float64).max
+_lim_val_exp = np.log(_lim_val)
+_lim_val_square = np.sqrt(_lim_val)
+_lim_val_cube = cbrt(_lim_val)
+from GPy.likelihoods.link_functions import Identity, Probit, Cloglog, Log, Log_ex_1, Reciprocal, Heaviside
+
+class LinkFunctionTests(np.testing.TestCase):
+    def setUp(self):
+        self.small_f = np.array([[-1e-4]])
+        self.zero_f = np.array([[1e-4]])
+        self.mid_f = np.array([[5.0]])
+        self.large_f = np.array([[1e4]])
+        self.f_lower_lim = np.array(-np.inf)
+        self.f_upper_lim = np.array(np.inf)
+
+    def check_gradient(self, link_func, lim_of_inf, test_lim=False):
+        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.mid_f)
+        self.assertTrue(grad.checkgrad(verbose=True))
+        grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=self.mid_f)
+        self.assertTrue(grad2.checkgrad(verbose=True))
+        grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=self.mid_f)
+        self.assertTrue(grad3.checkgrad(verbose=True))
+
+        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.small_f)
+        self.assertTrue(grad.checkgrad(verbose=True))
+        grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=self.small_f)
+        self.assertTrue(grad2.checkgrad(verbose=True))
+        grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=self.small_f)
+        self.assertTrue(grad3.checkgrad(verbose=True))
+
+        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.zero_f)
+        self.assertTrue(grad.checkgrad(verbose=True))
+        grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=self.zero_f)
+        self.assertTrue(grad2.checkgrad(verbose=True))
+        grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=self.zero_f)
+        self.assertTrue(grad3.checkgrad(verbose=True))
+
+        #Clip the large f value so it stays just below the limit of infinity
+        large_f = np.clip(self.large_f, -np.inf, lim_of_inf-1e-3)
+        grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=large_f)
+        self.assertTrue(grad.checkgrad(verbose=True))
+        grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=large_f)
+        self.assertTrue(grad2.checkgrad(verbose=True))
+        grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=large_f)
+        self.assertTrue(grad3.checkgrad(verbose=True))
+
+        if test_lim:
+            print "Testing limits"
+            #Back off a little, otherwise we are too close to the limit for gradcheck to work effectively
+            lim_of_inf = lim_of_inf - 1e-4
+            grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=lim_of_inf)
+            self.assertTrue(grad.checkgrad(verbose=True))
+            grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=lim_of_inf)
+            self.assertTrue(grad2.checkgrad(verbose=True))
+            grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=lim_of_inf)
+            self.assertTrue(grad3.checkgrad(verbose=True))
+
+    def check_overflow(self, link_func, lim_of_inf):
+        #Check that it does something sensible beyond this limit,
+        #note this is not checking the value is correct, just that it isn't nan
+        beyond_lim_of_inf = lim_of_inf + 100.0
+        self.assertFalse(np.isinf(link_func.transf(beyond_lim_of_inf)))
+        self.assertFalse(np.isinf(link_func.dtransf_df(beyond_lim_of_inf)))
+        self.assertFalse(np.isinf(link_func.d2transf_df2(beyond_lim_of_inf)))
+
+        self.assertFalse(np.isnan(link_func.transf(beyond_lim_of_inf)))
+        self.assertFalse(np.isnan(link_func.dtransf_df(beyond_lim_of_inf)))
+        self.assertFalse(np.isnan(link_func.d2transf_df2(beyond_lim_of_inf)))
+
+    def test_log_overflow(self):
+        link = Log()
+        lim_of_inf = _lim_val_exp
+
+        np.testing.assert_almost_equal(np.exp(self.mid_f), link.transf(self.mid_f))
+        assert np.isinf(np.exp(np.log(self.f_upper_lim)))
+        #Check the clipping works
+        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
+        #Need to look at most significant figures here rather than the decimals
+        np.testing.assert_approx_equal(link.transf(self.f_upper_lim), _lim_val, significant=5)
+        self.check_overflow(link, lim_of_inf)
+
+        #Check that it would otherwise fail
+        beyond_lim_of_inf = lim_of_inf + 10.0
+        old_err_state = np.seterr(over='ignore')
+        self.assertTrue(np.isinf(np.exp(beyond_lim_of_inf)))
+        np.seterr(**old_err_state)
+
+    def test_log_ex_1_overflow(self):
+        link = Log_ex_1()
+        lim_of_inf = _lim_val_exp
+
+        np.testing.assert_almost_equal(np.log1p(np.exp(self.mid_f)), link.transf(self.mid_f))
+        assert np.isinf(np.log1p(np.exp(np.log(self.f_upper_lim))))
+        #Check the clipping works
+        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
+        #Need to look at most significant figures here rather than the decimals
+        np.testing.assert_approx_equal(link.transf(self.f_upper_lim), np.log1p(_lim_val), significant=5)
+        self.check_overflow(link, lim_of_inf)
+
+        #Check that it would otherwise fail
+        beyond_lim_of_inf = lim_of_inf + 10.0
+        old_err_state = np.seterr(over='ignore')
+        self.assertTrue(np.isinf(np.log1p(np.exp(beyond_lim_of_inf))))
+        np.seterr(**old_err_state)
+
+
+    def test_log_gradients(self):
+        # transf dtransf_df d2transf_df2 d3transf_df3
+        link = Log()
+        lim_of_inf = _lim_val_exp
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+
+    def test_identity_gradients(self):
+        link = Identity()
+        lim_of_inf = _lim_val
+        #FIXME: Should be able to think of a way to test the limits of this
+        self.check_gradient(link, lim_of_inf, test_lim=False)
+
+    def test_probit_gradients(self):
+        link = Probit()
+        lim_of_inf = _lim_val
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+
+    def test_Cloglog_gradients(self):
+        link = Cloglog()
+        lim_of_inf = _lim_val_exp
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+
+    def test_Log_ex_1_gradients(self):
+        link = Log_ex_1()
+        lim_of_inf = _lim_val_exp
+        self.check_gradient(link, lim_of_inf, test_lim=True)
+        self.check_overflow(link, lim_of_inf)
+
+    def test_reciprocal_gradients(self):
+        link = Reciprocal()
+        lim_of_inf = _lim_val
+        #Does not work with much smaller values, and values closer to zero than 1e-5
+        self.check_gradient(link, lim_of_inf, test_lim=True)
diff --git a/GPy/util/block_matrices.py b/GPy/util/block_matrices.py
index a047abc6..e1e04aaa 100644
--- a/GPy/util/block_matrices.py
+++ b/GPy/util/block_matrices.py
@@ -1,9 +1,37 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2014-2015, Alan Saul
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np

+def get_blocks_3d(A, blocksizes, pagesizes=None):
+    """
+    Given a 3d matrix, make a block matrix, where the first and second dimensions are blocked according
+    to blocksizes, and the pages are blocked using pagesizes
+    """
+    assert (A.shape[0]==A.shape[1]) and len(A.shape)==3, "can't blockify this non-square matrix, may need to use 2d version"
+    N = np.sum(blocksizes)
+    assert A.shape[0] == N, "bad blocksizes"
+    num_blocks = len(blocksizes)
+    if pagesizes is None:
+        #Assume each page of A should be its own dimension
+        pagesizes = range(A.shape[2])#[0]*A.shape[2]
+    num_pages = len(pagesizes)
+    B = np.empty(shape=(num_blocks, num_blocks, num_pages), dtype=np.object)
+    count_k = 0
+    #for Bk, k in enumerate(pagesizes):
+    for Bk in pagesizes:
+        count_i = 0
+        for Bi, i in enumerate(blocksizes):
+            count_j = 0
+            for Bj, j in enumerate(blocksizes):
+                #We would like count_k:count_k + k here, but it's awkward as it makes an NxNx1 array if page sizes are set to 1
+                B[Bi, Bj, Bk] = A[count_i:count_i + i, count_j:count_j + j, Bk]
+                count_j += j
+            count_i += i
+        #count_k += k
+    return B
+
 def get_blocks(A, blocksizes):
-    assert (A.shape[0]==A.shape[1]) and len(A.shape)==2, "can;t blockify this non-square matrix"
+    assert (A.shape[0]==A.shape[1]) and len(A.shape)==2, "can't blockify this non-square matrix"
     N = np.sum(blocksizes)
     assert A.shape[0] == N, "bad blocksizes"
     num_blocks = len(blocksizes)
@@ -17,6 +45,11 @@ def get_blocks(A, blocksizes):
         count_i += i
     return B

+def get_block_shapes_3d(B):
+    assert B.dtype is np.dtype('object'), "Must be a block matrix"
+    #FIXME: This isn't general AT ALL...
+    return get_block_shapes(B[:,:,0]), B.shape[2]
+
 def get_block_shapes(B):
     assert B.dtype is np.dtype('object'), "Must be a block matrix"
     return [B[b,b].shape[0] for b in range(0, B.shape[0])]
@@ -35,7 +68,7 @@ def unblock(B):
         count_i += i
     return A

-def block_dot(A, B):
+def block_dot(A, B, diagonal=False):
     """
     Element wise dot product on block matricies

     +-------------+      +------+------+      +-------+-------+

     ..Note
+        If any block of either (A or B) is stored as a 1d vector then we assume
+        that it denotes a diagonal matrix, and an efficient dot product using numpy
+        broadcasting will be used, i.e. A11*B11
+
         If either (A or B) of the diagonal matrices are stored as vectors then a more
         efficient dot product using numpy broadcasting will be used, i.e. A11*B11
     """
     #Must have same number of blocks and be a block matrix
     assert A.dtype is np.dtype('object'), "Must be a block matrix"
     assert B.dtype is np.dtype('object'), "Must be a block matrix"
-    Ashape = A.shape
-    Bshape = B.shape
-    assert Ashape == Bshape
-    def f(A,B):
-        if Ashape[0] == Ashape[1] or Bshape[0] == Bshape[1]:
-            #FIXME: Careful if one is transpose of other, would make a matrix
-            return A*B
+    assert A.shape == B.shape
+    def f(C,D):
+        """
+        C is an element of A, D is the associated element of B
+        """
+        Cshape = C.shape
+        Dshape = D.shape
+        if diagonal and (len(Cshape) == 1 or len(Dshape) == 1\
+           or C.shape[0] != C.shape[1] or D.shape[0] != D.shape[1]):
+            print "Broadcasting, C: {} D:{}".format(C.shape, D.shape)
+            return C*D
         else:
-            return np.dot(A,B)
+            print "Dotting, C: {} D:{}".format(C.shape, D.shape)
+            return np.dot(C,D)
     dot = np.vectorize(f, otypes = [np.object])
     return dot(A,B)
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 8ac5418f..26c4b774 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -20,7 +20,7 @@ try:
     from scipy import weave
 except ImportError:
     config.set('weave', 'working', 'False')
-
+
 _scipyversion = np.float64((scipy.__version__).split('.')[:2])
 _fix_dpotri_scipy_bug = True
@@ -102,7 +102,6 @@ def jitchol(A, maxtries=5):
         num_tries = 1
         while num_tries <= maxtries and np.isfinite(jitter):
             try:
-                print(jitter)
                 L = linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
                 return L
             except:
@@ -115,7 +114,6 @@ def jitchol(A, maxtries=5):
     except:
         logging.warning('\n'.join(['Added jitter of {:.10e}'.format(jitter),
             '  in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
-        import ipdb;ipdb.set_trace()
     return L

 # def dtrtri(L, lower=1):
diff --git a/GPy/util/misc.py b/GPy/util/misc.py
index 431d6f8f..ddc29fa0 100644
--- a/GPy/util/misc.py
+++ b/GPy/util/misc.py
@@ -2,18 +2,37 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
+from scipy.special import cbrt
 from .config import *

 _lim_val = np.finfo(np.float64).max
-
 _lim_val_exp = np.log(_lim_val)
 _lim_val_square = np.sqrt(_lim_val)
-_lim_val_cube = np.power(_lim_val, -3)
+#_lim_val_cube = cbrt(_lim_val)
+_lim_val_cube = np.nextafter(_lim_val**(1/3.0), -np.inf)
+_lim_val_quad = np.nextafter(_lim_val**(1/4.0), -np.inf)
+_lim_val_three_times = np.nextafter(_lim_val/3.0, -np.inf)

 def safe_exp(f):
     clip_f = np.clip(f, -np.inf, _lim_val_exp)
     return np.exp(clip_f)

+def safe_square(f):
+    f = np.clip(f, -np.inf, _lim_val_square)
+    return f**2
+
+def safe_cube(f):
+    f = np.clip(f, -np.inf, _lim_val_cube)
+    return f**3
+
+def safe_quad(f):
+    f = np.clip(f, -np.inf, _lim_val_quad)
+    return f**4
+
+def safe_three_times(f):
+    f = np.clip(f, -np.inf, _lim_val_three_times)
+    return 3*f
+
 def chain_1(df_dg, dg_dx):
     """
     Generic chaining function for first derivative
@@ -39,8 +58,8 @@ def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
         return d2f_dg2
     if len(d2f_dg2) > 1 and len(d2f_dg2.shape)>1 and d2f_dg2.shape[-1] > 1:
         raise NotImplementedError('Not implemented for matricies yet')
-    #dg_dx_2 = np.clip(dg_dx, 1e-12, _lim_val_square)**2
-    dg_dx_2 = dg_dx**2
+    dg_dx_2 = np.clip(dg_dx, -np.inf, _lim_val_square)**2
+    #dg_dx_2 = dg_dx**2
     return d2f_dg2*(dg_dx_2) + df_dg*d2g_dx2

 def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
@@ -55,8 +74,8 @@ def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
     if ( (len(d2f_dg2) > 1 and d2f_dg2.shape[-1] > 1)
          or (len(d3f_dg3) > 1 and d3f_dg3.shape[-1] > 1)):
         raise NotImplementedError('Not implemented for matricies yet')
-    #dg_dx_3 = np.clip(dg_dx, 1e-12, _lim_val_cube)**3
-    dg_dx_3 = dg_dx**3
+    dg_dx_3 = np.clip(dg_dx, -np.inf, _lim_val_cube)**3
+    #dg_dx_3 = dg_dx**3
     return d3f_dg3*(dg_dx_3) + 3*d2f_dg2*dg_dx*d2g_dx2 + df_dg*d3g_dx3

 def opt_wrapper(m, **kwargs):
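A closing sanity check on the chain_* helpers edited above: chain_2 computes d2f(g(x))/dx2 = f''(g)*g'**2 + f'(g)*g'', now with g' clipped before squaring. A quick numerical check (illustrative, not part of the patch) with f(g) = g**2 and g(x) = exp(x), whose closed form is 4*exp(2x):

import numpy as np
from GPy.util.misc import chain_2

x = np.array([0.5])
g = dg = d2g = np.exp(x)          # g(x)=exp(x) equals all of its derivatives
df, d2f = 2*g, 2*np.ones_like(g)  # f(g)=g**2: f'(g)=2g, f''(g)=2
print(chain_2(d2f, dg, df, d2g))  # -> [ 10.873...]
print(4*np.exp(2*x))              # closed form, same value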