diff --git a/GPy/__init__.py b/GPy/__init__.py
index 5e091170..26713406 100644
--- a/GPy/__init__.py
+++ b/GPy/__init__.py
@@ -3,23 +3,23 @@
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
-import core
-from core.parameterization import transformations, priors
+from . import core
+from .core.parameterization import transformations, priors
constraints = transformations
-import models
-import mappings
-import inference
-import util
-import examples
-import likelihoods
-import testing
+from . import models
+from . import mappings
+from . import inference
+from . import util
+from . import examples
+from . import likelihoods
+from . import testing
from numpy.testing import Tester
-import kern
-import plotting
+from . import kern
+from . import plotting
# Direct imports for convenience:
-from core import Model
-from core.parameterization import Param, Parameterized, ObsAr
+from .core import Model
+from .core.parameterization import Param, Parameterized, ObsAr
#@nottest
try:
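
Python 3 removed implicit relative imports (PEP 328): inside a package, "import core" no longer resolves to a sibling module, so every such import above becomes an explicit relative import. A minimal sketch of the pattern, with illustrative package and module names:

    # pkg/__init__.py
    # Python 2 only -- implicitly resolved to pkg/core.py:
    #   import core
    # works on Python 2 and 3 -- explicit relative imports:
    from . import core
    from .core import Model  # re-export a convenience name at package level
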
diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py
index ebed29bb..142eccbf 100644
--- a/GPy/core/__init__.py
+++ b/GPy/core/__init__.py
@@ -1,12 +1,12 @@
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from model import *
-from parameterization.parameterized import adjust_name_for_printing, Parameterizable
-from parameterization.param import Param, ParamConcatenation
-from parameterization.observable_array import ObsAr
+from .model import *
+from .parameterization.parameterized import adjust_name_for_printing, Parameterizable
+from .parameterization.param import Param, ParamConcatenation
+from .parameterization.observable_array import ObsAr
-from gp import GP
-from svgp import SVGP
-from sparse_gp import SparseGP
-from mapping import *
+from .gp import GP
+from .svgp import SVGP
+from .sparse_gp import SparseGP
+from .mapping import *
diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 3252ac08..8100cfcc 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -4,13 +4,15 @@
import numpy as np
import sys
from .. import kern
-from model import Model
-from parameterization import ObsAr
+from .model import Model
+from .parameterization import ObsAr
+from .mapping import Mapping
from .. import likelihoods
from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
-from parameterization.variational import VariationalPosterior
+from .parameterization.variational import VariationalPosterior
import logging
+import warnings
from GPy.util.normalizer import MeanNorm
logger = logging.getLogger("GP")
@@ -34,7 +36,7 @@ class GP(Model):
"""
- def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
+ def __init__(self, X, Y, kernel, likelihood, mean_function=None, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
super(GP, self).__init__(name)
assert X.ndim == 2
@@ -62,10 +64,14 @@ class GP(Model):
self.Y = ObsAr(Y)
self.Y_normalized = self.Y
- assert Y.shape[0] == self.num_data
+ if Y.shape[0] != self.num_data:
+ #There can be cases where we want more inputs than outputs, for example
+ #if we have multiple latent function values
+ warnings.warn("There are more rows in your input data X "
+ "than in your output data Y; be VERY sure this is what you want")
_, self.output_dim = self.Y.shape
- #TODO: check the type of this is okay?
+ assert ((Y_metadata is None) or isinstance(Y_metadata, dict))
self.Y_metadata = Y_metadata
assert isinstance(kernel, kern.Kern)
@@ -75,6 +81,15 @@ class GP(Model):
assert isinstance(likelihood, likelihoods.Likelihood)
self.likelihood = likelihood
+ #handle the mean function
+ self.mean_function = mean_function
+ if mean_function is not None:
+ assert isinstance(self.mean_function, Mapping)
+ assert mean_function.input_dim == self.input_dim
+ assert mean_function.output_dim == self.output_dim
+ self.link_parameter(mean_function)
+
+
#find a sensible inference method
logger.info("initializing inference method")
if inference_method is None:
@@ -82,14 +97,16 @@ class GP(Model):
inference_method = exact_gaussian_inference.ExactGaussianInference()
else:
inference_method = expectation_propagation.EP()
- print "defaulting to ", inference_method, "for latent function inference"
+ print("defaulting to ", inference_method, "for latent function inference")
self.inference_method = inference_method
logger.info("adding kernel and likelihood as parameters")
self.link_parameter(self.kern)
self.link_parameter(self.likelihood)
+ self.posterior = None
- def set_XY(self, X=None, Y=None):
+
+ def set_XY(self, X=None, Y=None, trigger_update=True):
"""
Set the input / output data of the model
This is useful if we wish to change our existing data but maintain the same model
@@ -99,7 +116,7 @@ class GP(Model):
:param Y: output observations
:type Y: np.ndarray
"""
- self.update_model(False)
+ if trigger_update: self.update_model(False)
if Y is not None:
if self.normalizer is not None:
self.normalizer.scale_by(Y)
@@ -123,26 +140,26 @@ class GP(Model):
self.link_parameters(self.X)
else:
self.X = ObsAr(X)
- self.update_model(True)
- self._trigger_params_changed()
+ if trigger_update: self.update_model(True)
+ if trigger_update: self._trigger_params_changed()
- def set_X(self,X):
+ def set_X(self,X, trigger_update=True):
"""
Set the input data of the model
:param X: input observations
:type X: np.ndarray
"""
- self.set_XY(X=X)
+ self.set_XY(X=X, trigger_update=trigger_update)
- def set_Y(self,Y):
+ def set_Y(self,Y, trigger_update=True):
"""
Set the output data of the model
:param Y: output observations
:type Y: np.ndarray
"""
- self.set_XY(Y=Y)
+ self.set_XY(Y=Y, trigger_update=trigger_update)
def parameters_changed(self):
"""
@@ -153,9 +170,11 @@ class GP(Model):
This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call
this method yourself, there may be unexpected consequences.
"""
- self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata)
+ self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.mean_function, self.Y_metadata)
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X)
+ if self.mean_function is not None:
+ self.mean_function.update_gradients(self.grad_dict['dL_dm'], self.X)
def log_likelihood(self):
"""
@@ -192,6 +211,10 @@ class GP(Model):
#force mu to be a column vector
if len(mu.shape)==1: mu = mu[:,None]
+
+ #add the mean function in
+ if self.mean_function is not None:
+ mu += self.mean_function.f(_Xnew)
return mu, var
def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
@@ -241,12 +264,14 @@ class GP(Model):
def predictive_gradients(self, Xnew):
"""
- Compute the derivatives of the latent function with respect to X*
+ Compute the derivatives of the predicted latent function with respect to X*
Given a set of points at which to predict X* (size [N*,Q]), compute the
derivatives of the mean and variance. Resulting arrays are sized:
dmu_dX* -- [N*, Q, D], where D is the number of outputs in this GP (usually one).
+ Note that this is not the same as computing the mean and variance of the derivative of the function!
+
dv_dX* -- [N*, Q], (since all outputs have the same variance)
:param X: The points at which to get the predictive gradients
:type X: np.ndarray (Xnew x self.input_dim)
@@ -276,7 +301,7 @@ class GP(Model):
:type size: int.
:param full_cov: whether to return the full covariance matrix, or just the diagonal.
:type full_cov: bool.
- :returns: Ysim: set of simulations
+ :returns: fsim: set of simulations
:rtype: np.ndarray (N x samples)
"""
m, v = self._raw_predict(X, full_cov=full_cov)
@@ -284,11 +309,11 @@ class GP(Model):
m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
v = v.reshape(m.size,-1) if len(v.shape)==3 else v
if not full_cov:
- Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
+ fsim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
else:
- Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
+ fsim = np.random.multivariate_normal(m.flatten(), v, size).T
- return Ysim
+ return fsim
def posterior_samples(self, X, size=10, full_cov=False, Y_metadata=None):
"""
@@ -304,16 +329,16 @@ class GP(Model):
:type noise_model: integer.
:returns: Ysim: set of simulations, a Numpy array (N x samples).
"""
- Ysim = self.posterior_samples_f(X, size, full_cov=full_cov)
- Ysim = self.likelihood.samples(Ysim, Y_metadata)
-
+ fsim = self.posterior_samples_f(X, size, full_cov=full_cov)
+ Ysim = self.likelihood.samples(fsim, Y_metadata)
return Ysim
def plot_f(self, plot_limits=None, which_data_rows='all',
which_data_ycols='all', fixed_inputs=[],
levels=20, samples=0, fignum=None, ax=None, resolution=None,
plot_raw=True,
- linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
+ linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx',
+ apply_link=False):
"""
Plot the GP's view of the world, where the data is normalized and before applying a likelihood.
This is a call to plot with plot_raw=True.
@@ -350,6 +375,8 @@ class GP(Model):
:type Y_metadata: dict
:param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
:type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
+ :param apply_link: if there is a link function of the likelihood, plot the link(f*) rather than f*
+ :type apply_link: boolean
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import models_plots
@@ -362,7 +389,7 @@ class GP(Model):
which_data_ycols, fixed_inputs,
levels, samples, fignum, ax, resolution,
plot_raw=plot_raw, Y_metadata=Y_metadata,
- data_symbol=data_symbol, **kw)
+ data_symbol=data_symbol, apply_link=apply_link, **kw)
def plot(self, plot_limits=None, which_data_rows='all',
which_data_ycols='all', fixed_inputs=[],
@@ -441,7 +468,7 @@ class GP(Model):
try:
super(GP, self).optimize(optimizer, start, **kwargs)
except KeyboardInterrupt:
- print "KeyboardInterrupt caught, calling on_optimization_end() to round things up"
+ print("KeyboardInterrupt caught, calling on_optimization_end() to round things up")
self.inference_method.on_optimization_end()
raise
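
The gp.py hunks above introduce an optional parameterised mean function: it must be a Mapping whose input/output dimensions match the model's, it is linked as a parameter, its value is added to the predictive mean, and its gradients are driven by the new dL_dm term returned from inference. A minimal usage sketch; it assumes the stock GPy.kern.RBF, GPy.likelihoods.Gaussian and GPy.mappings.Linear constructors, and the data is made up:

    import numpy as np
    import GPy

    X = np.random.rand(20, 1)
    Y = np.sin(X) + 0.05 * np.random.randn(20, 1)

    # a linear mean function m(x) = x.dot(A); being a Mapping, GP links its
    # parameters and calls update_gradients() on it during optimization
    mf = GPy.mappings.Linear(input_dim=1, output_dim=1)

    m = GPy.core.GP(X, Y, kernel=GPy.kern.RBF(1),
                    likelihood=GPy.likelihoods.Gaussian(),
                    mean_function=mf)
    m.optimize()
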
diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py
index 111fec6f..30614384 100644
--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@@ -1,13 +1,14 @@
# Copyright (c) 2013,2014, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2015, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import sys
-from parameterization import Parameterized
+from .parameterization import Parameterized
import numpy as np
class Mapping(Parameterized):
"""
- Base model for shared behavior between models that can act like a mapping.
+ Base model for shared mapping behaviours
"""
def __init__(self, input_dim, output_dim, name='mapping'):
@@ -18,49 +19,12 @@ class Mapping(Parameterized):
def f(self, X):
raise NotImplementedError
- def df_dX(self, dL_df, X):
- """Evaluate derivatives of mapping outputs with respect to inputs.
-
- :param dL_df: gradient of the objective with respect to the function.
- :type dL_df: ndarray (num_data x output_dim)
- :param X: the input locations where derivatives are to be evaluated.
- :type X: ndarray (num_data x input_dim)
- :returns: matrix containing gradients of the function with respect to the inputs.
- """
+ def gradients_X(self, dL_dF, X):
raise NotImplementedError
- def df_dtheta(self, dL_df, X):
- """The gradient of the outputs of the mapping with respect to each of the parameters.
-
- :param dL_df: gradient of the objective with respect to the function.
- :type dL_df: ndarray (num_data x output_dim)
- :param X: input locations where the function is evaluated.
- :type X: ndarray (num_data x input_dim)
- :returns: Matrix containing gradients with respect to parameters of each output for each input data.
- :rtype: ndarray (num_params length)
- """
-
+ def update_gradients(self, dL_dF, X):
raise NotImplementedError
- def plot(self, *args):
- """
- Plots the mapping associated with the model.
- - In one dimension, the function is plotted.
- - In two dimensions, a contour-plot shows the function
- - In higher dimensions, we've not implemented this yet !TODO!
-
- Can plot only part of the data and part of the posterior functions
- using which_data and which_functions
-
- This is a convenience function: arguments are passed to
- GPy.plotting.matplot_dep.models_plots.plot_mapping
- """
-
- if "matplotlib" in sys.modules:
- from ..plotting.matplot_dep import models_plots
- mapping_plots.plot_mapping(self,*args)
- else:
- raise NameError, "matplotlib package has not been imported."
class Bijective_mapping(Mapping):
"""
@@ -74,72 +38,4 @@ class Bijective_mapping(Mapping):
"""Inverse mapping from output domain of the function to the inputs."""
raise NotImplementedError
-from model import Model
-
-class Mapping_check_model(Model):
- """
- This is a dummy model class used as a base class for checking that the
- gradients of a given mapping are implemented correctly. It enables
- checkgradient() to be called independently on each mapping.
- """
- def __init__(self, mapping=None, dL_df=None, X=None):
- num_samples = 20
- if mapping==None:
- mapping = GPy.mapping.linear(1, 1)
- if X==None:
- X = np.random.randn(num_samples, mapping.input_dim)
- if dL_df==None:
- dL_df = np.ones((num_samples, mapping.output_dim))
-
- self.mapping=mapping
- self.X = X
- self.dL_df = dL_df
- self.num_params = self.mapping.num_params
- Model.__init__(self)
-
-
- def _get_params(self):
- return self.mapping._get_params()
-
- def _get_param_names(self):
- return self.mapping._get_param_names()
-
- def _set_params(self, x):
- self.mapping._set_params(x)
-
- def log_likelihood(self):
- return (self.dL_df*self.mapping.f(self.X)).sum()
-
- def _log_likelihood_gradients(self):
- raise NotImplementedError, "This needs to be implemented to use the Mapping_check_model class."
-
-class Mapping_check_df_dtheta(Mapping_check_model):
- """This class allows gradient checks for the gradient of a mapping with respect to parameters. """
- def __init__(self, mapping=None, dL_df=None, X=None):
- Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X)
-
- def _log_likelihood_gradients(self):
- return self.mapping.df_dtheta(self.dL_df, self.X)
-
-
-class Mapping_check_df_dX(Mapping_check_model):
- """This class allows gradient checks for the gradient of a mapping with respect to X. """
- def __init__(self, mapping=None, dL_df=None, X=None):
- Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X)
-
- if dL_df==None:
- dL_df = np.ones((self.X.shape[0],self.mapping.output_dim))
- self.num_params = self.X.shape[0]*self.mapping.input_dim
-
- def _log_likelihood_gradients(self):
- return self.mapping.df_dX(self.dL_df, self.X).flatten()
-
- def _get_param_names(self):
- return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
-
- def _get_params(self):
- return self.X.flatten()
-
- def _set_params(self, x):
- self.X=x.reshape(self.X.shape)
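
mapping.py replaces the old df_dtheta/df_dX hooks with update_gradients/gradients_X, mirroring the kernel gradient API, and drops the built-in gradient-check helper classes, since a mapping linked into a model is covered by that model's own checkgrad(). A minimal sketch of a mean function written against the new interface; the Offset class is hypothetical, not part of GPy:

    import numpy as np
    from GPy.core.mapping import Mapping
    from GPy.core.parameterization import Param

    class Offset(Mapping):
        """Hypothetical constant mean function f(X) = c."""
        def __init__(self, input_dim, output_dim, name='offset'):
            super(Offset, self).__init__(input_dim, output_dim, name)
            self.c = Param('c', np.zeros(output_dim))
            self.link_parameter(self.c)

        def f(self, X):
            # one constant per output dimension, repeated over the data
            return np.tile(self.c.values, (X.shape[0], 1))

        def update_gradients(self, dL_dF, X):
            # df/dc is 1 everywhere, so accumulate the incoming gradient
            self.c.gradient = dL_dF.sum(axis=0)

        def gradients_X(self, dL_dF, X):
            # a constant mapping does not depend on X
            return np.zeros(X.shape)
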
diff --git a/GPy/core/model.py b/GPy/core/model.py
index c5d318e7..937d30e5 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -5,7 +5,7 @@
from .. import likelihoods
from ..inference import optimization
from ..util.misc import opt_wrapper
-from parameterization import Parameterized
+from .parameterization import Parameterized
import multiprocessing as mp
import numpy as np
from numpy.linalg.linalg import LinAlgError
@@ -13,6 +13,7 @@ import itertools
import sys
from .verbose_optimization import VerboseOptimization
# import numdifftools as ndt
+from functools import reduce
class Model(Parameterized):
_fail_count = 0 # Count of failed optimization steps (see objective)
@@ -30,7 +31,7 @@ class Model(Parameterized):
self.add_observer(self.tie, self.tie._parameters_changed_notification, priority=-500)
def log_likelihood(self):
- raise NotImplementedError, "this needs to be implemented to use the model class"
+ raise NotImplementedError("this needs to be implemented to use the model class")
def _log_likelihood_gradients(self):
return self.gradient.copy()
@@ -82,7 +83,7 @@ class Model(Parameterized):
pool.close() # signal that no more data coming in
pool.join() # wait for all the tasks to complete
except KeyboardInterrupt:
- print "Ctrl+c received, terminating and joining pool."
+ print("Ctrl+c received, terminating and joining pool.")
pool.terminate()
pool.join()
@@ -95,10 +96,10 @@ class Model(Parameterized):
self.optimization_runs.append(jobs[i].get())
if verbose:
- print("Optimization restart {0}/{1}, f = {2}".format(i + 1, num_restarts, self.optimization_runs[-1].f_opt))
+ print(("Optimization restart {0}/{1}, f = {2}".format(i + 1, num_restarts, self.optimization_runs[-1].f_opt)))
except Exception as e:
if robust:
- print("Warning - optimization restart {0}/{1} failed".format(i + 1, num_restarts))
+ print(("Warning - optimization restart {0}/{1} failed".format(i + 1, num_restarts)))
else:
raise e
@@ -119,7 +120,7 @@ class Model(Parameterized):
DEPRECATED.
"""
- raise DeprecationWarning, 'parameters now have default constraints'
+ raise DeprecationWarning('parameters now have default constraints')
def objective_function(self):
"""
@@ -213,14 +214,14 @@ class Model(Parameterized):
self.obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e10, 1e10)
return obj_f, self.obj_grads
- def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, **kwargs):
+ def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, clear_after_finish=False, **kwargs):
"""
Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
kwargs are passed to the optimizer. They can be:
- :param max_f_eval: maximum number of function evaluations
- :type max_f_eval: int
+ :param max_iters: maximum number of iterations
+ :type max_iters: int
:param messages: True: display messages during optimisation, or "ipython_notebook" for a notebook-friendly display
:type messages: bool or string
:param optimizer: which optimizer to use (defaults to self.preferred optimizer)
@@ -237,10 +238,10 @@ class Model(Parameterized):
"""
if self.is_fixed or self.size == 0:
- print 'nothing to optimize'
+ print('nothing to optimize')
if not self.update_model():
- print "updates were off, setting updates on again"
+ print("updates were off, setting updates on again")
self.update_model(True)
if start == None:
@@ -255,7 +256,7 @@ class Model(Parameterized):
else:
optimizer = optimization.get_optimizer(optimizer)
opt = optimizer(start, model=self, max_iters=max_iters, **kwargs)
-
+
with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook) as vo:
opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
vo.finish(opt)
@@ -305,7 +306,7 @@ class Model(Parameterized):
transformed_index = (indices - (~self._fixes_).cumsum())[transformed_index[which[0]]]
if transformed_index.size == 0:
- print "No free parameters to check"
+ print("No free parameters to check")
return
# just check the global ratio
@@ -340,9 +341,9 @@ class Model(Parameterized):
cols.extend([max(float_len, len(header[i])) for i in range(1, len(header))])
cols = np.array(cols) + 5
header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
- header_string = map(lambda x: '|'.join(x), [header_string])
+ header_string = list(map(lambda x: '|'.join(x), [header_string]))
separator = '-' * len(header_string[0])
- print '\n'.join([header_string[0], separator])
+ print('\n'.join([header_string[0], separator]))
if target_param is None:
param_index = range(len(x))
transformed_index = param_index
@@ -358,19 +359,24 @@ class Model(Parameterized):
transformed_index = param_index
if param_index.size == 0:
- print "No free parameters to check"
+ print("No free parameters to check")
return
gradient = self._grads(x).copy()
np.where(gradient == 0, 1e-312, gradient)
ret = True
- for nind, xind in itertools.izip(param_index, transformed_index):
+ for nind, xind in zip(param_index, transformed_index):
xx = x.copy()
xx[xind] += step
f1 = self._objective(xx)
xx[xind] -= 2.*step
f2 = self._objective(xx)
- df_ratio = np.abs((f1 - f2) / min(f1, f2))
+ #Avoid division by zero: only form the ratio if at least one of the values
+ #exceeds 1e-15 in magnitude, otherwise both values are essentially zero
+ if f1 > 1e-15 or f1 < -1e-15 or f2 > 1e-15 or f2 < -1e-15:
+ df_ratio = np.abs((f1 - f2) / min(f1, f2))
+ else:
+ df_ratio = 1.0
df_unstable = df_ratio < df_tolerance
numerical_gradient = (f1 - f2) / (2 * step)
if np.all(gradient[xind] == 0): ratio = (f1 - f2) == gradient[xind]
@@ -392,7 +398,7 @@ class Model(Parameterized):
ng = '%.6f' % float(numerical_gradient)
df = '%1.e' % float(df_ratio)
grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}|{5:^{c5}}".format(formatted_name, r, d, g, ng, df, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4], c5=cols[5])
- print grad_string
+ print(grad_string)
self.optimizer_array = x
return ret
@@ -402,6 +408,7 @@ class Model(Parameterized):
model_details = [['<b>Model</b>', self.name + '<br>'],
['<b>Log-likelihood</b>', '{}<br>'.format(float(self.log_likelihood()))],
["<b>Number of Parameters</b>", '{}<br>'.format(self.size)],
+ ["<b>Number of Optimization Parameters</b>", '{}<br>'.format(self._size_transformed())],
["<b>Updates</b>", '{}<br>'.format(self._update_on)],
]
from operator import itemgetter
@@ -419,6 +426,7 @@ class Model(Parameterized):
model_details = [['Name', self.name],
['Log-likelihood', '{}'.format(float(self.log_likelihood()))],
["Number of Parameters", '{}'.format(self.size)],
+ ["Number of Optimization Parameters", '{}'.format(self._size_transformed())],
["Updates", '{}'.format(self._update_on)],
]
from operator import itemgetter
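
The checkgrad() change above guards the relative-difference diagnostic against division by zero when both perturbed objective values are numerically zero; the check itself is an ordinary central difference compared against the analytic gradient. A standalone sketch of that comparison (illustrative helper, not GPy code):

    import numpy as np

    def numerical_grad(f, x, step=1e-6):
        # central difference in each coordinate: (f(x+h) - f(x-h)) / 2h
        g = np.zeros_like(x)
        for i in range(x.size):
            xx = x.copy(); xx[i] += step
            f1 = f(xx)
            xx[i] -= 2. * step
            f2 = f(xx)
            g[i] = (f1 - f2) / (2. * step)
        return g

    x = np.random.randn(3)
    analytic = 2 * x                      # gradient of sum(x**2)
    numeric = numerical_grad(lambda v: (v ** 2).sum(), x)
    assert np.allclose(analytic, numeric, atol=1e-4)
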
diff --git a/GPy/core/parameterization/__init__.py b/GPy/core/parameterization/__init__.py
index 8e9aa094..de736671 100644
--- a/GPy/core/parameterization/__init__.py
+++ b/GPy/core/parameterization/__init__.py
@@ -1,5 +1,5 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from param import Param, ObsAr
-from parameterized import Parameterized
+from .param import Param, ObsAr
+from .parameterized import Parameterized
diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py
index 61c82da1..e4803f37 100644
--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@@ -3,7 +3,8 @@
import numpy
from numpy.lib.function_base import vectorize
-from lists_and_dicts import IntArrayDict
+from .lists_and_dicts import IntArrayDict
+from functools import reduce
def extract_properties_to_index(index, props):
prop_index = dict()
@@ -62,12 +63,15 @@ class ParameterIndexOperations(object):
def __init__(self, constraints=None):
self._properties = IntArrayDict()
if constraints is not None:
- for t, i in constraints.iteritems():
+ #python 3 fix
+ #for t, i in constraints.iteritems():
+ for t, i in constraints.items():
self.add(t, i)
- def iteritems(self):
- return self._properties.iteritems()
-
+ #iteritems is gone in Python 3
+ #def iteritems(self):
+ # return self._properties.iteritems()
+
def items(self):
return self._properties.items()
@@ -75,7 +79,7 @@ class ParameterIndexOperations(object):
return self._properties.keys()
def iterproperties(self):
- return self._properties.iterkeys()
+ return iter(self._properties)
def shift_right(self, start, size):
for ind in self.iterindices():
@@ -83,7 +87,7 @@ class ParameterIndexOperations(object):
ind[toshift] += size
def shift_left(self, start, size):
- for v, ind in self.items():
+ for v, ind in list(self.items()):
todelete = (ind>=start) * (ind<start+size)

return ind[(ind >= self._offset) * (ind < (self._offset + self._size))] - self._offset
-
- def iteritems(self):
- for i, ind in self._param_index_ops.iteritems():
+ #iteritems is gone in Python 3; it is replaced by items()
+ def items(self):
+ _items_list = list(self._param_index_ops.items())
+ for i, ind in _items_list:
ind2 = self._filter_index(ind)
if ind2.size > 0:
yield i, ind2
-
- def items(self):
- return [[i,v] for i,v in self.iteritems()]
+
+ #items() above now behaves like the old py2 iteritems (it is a generator)
+ #def items(self):
+ # return [[i,v] for i,v in self.iteritems()]
def properties(self):
return [i for i in self.iterproperties()]
def iterproperties(self):
- for i, _ in self.iteritems():
+ #py3 fix
+ #for i, _ in self.iteritems():
+ for i, _ in self.items():
yield i
@@ -230,7 +246,9 @@ class ParameterIndexOperationsView(object):
def iterindices(self):
- for _, ind in self.iteritems():
+ #py3 fix
+ #for _, ind in self.iteritems():
+ for _, ind in self.items():
yield ind
@@ -286,10 +304,14 @@ class ParameterIndexOperationsView(object):
def __str__(self, *args, **kwargs):
import pprint
- return pprint.pformat(dict(self.iteritems()))
+ #py3 fixes
+ #return pprint.pformat(dict(self.iteritems()))
+ return pprint.pformat(dict(self.items()))
def update(self, parameter_index_view, offset=0):
- for i, v in parameter_index_view.iteritems():
+ #py3 fixes
+ #for i, v in parameter_index_view.iteritems():
+ for i, v in parameter_index_view.items():
self.add(i, v+offset)
@@ -297,6 +319,8 @@ class ParameterIndexOperationsView(object):
return self.__deepcopy__(None)
def __deepcopy__(self, memo):
- return ParameterIndexOperations(dict(self.iteritems()))
+ #py3 fix
+ #return ParameterIndexOperations(dict(self.iteritems()))
+ return ParameterIndexOperations(dict(self.items()))
pass
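
The index_operations.py edits are the recurring Python 3 dictionary fix: iteritems()/iterkeys() no longer exist, and items()/keys() return lazy views, so items() is the portable spelling. The one subtlety, handled in shift_left() above, is that the view must be materialised before the loop mutates the dict. A small sketch of both cases:

    d = {'a': 1, 'b': 2}

    # portable iteration (py2 builds a list, py3 yields a view):
    for k, v in d.items():
        print(k, v)

    # snapshot first when deleting while iterating, as list(self.items()) does:
    for k, v in list(d.items()):
        if v > 1:
            del d[k]
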
diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py
index 5afbb8ed..2d774a76 100644
--- a/GPy/core/parameterization/lists_and_dicts.py
+++ b/GPy/core/parameterization/lists_and_dicts.py
@@ -32,7 +32,7 @@ class ArrayList(list):
if el is item:
return index
index += 1
- raise ValueError, "{} is not in list".format(item)
+ raise ValueError("{} is not in list".format(item))
pass
class ObserverList(object):
@@ -75,7 +75,7 @@ class ObserverList(object):
def __str__(self):
from . import ObsAr, Param
- from parameter_core import Parameterizable
+ from .parameter_core import Parameterizable
ret = []
curr_p = None
diff --git a/GPy/core/parameterization/observable.py b/GPy/core/parameterization/observable.py
index 8a85c6ca..0836b5d6 100644
--- a/GPy/core/parameterization/observable.py
+++ b/GPy/core/parameterization/observable.py
@@ -12,7 +12,7 @@ class Observable(object):
"""
def __init__(self, *args, **kwargs):
super(Observable, self).__init__()
- from lists_and_dicts import ObserverList
+ from .lists_and_dicts import ObserverList
self.observers = ObserverList()
self._update_on = True
diff --git a/GPy/core/parameterization/observable_array.py b/GPy/core/parameterization/observable_array.py
index 271fe7b9..c6fea497 100644
--- a/GPy/core/parameterization/observable_array.py
+++ b/GPy/core/parameterization/observable_array.py
@@ -3,8 +3,8 @@
import numpy as np
-from parameter_core import Pickleable
-from observable import Observable
+from .parameter_core import Pickleable
+from .observable import Observable
class ObsAr(np.ndarray, Pickleable, Observable):
"""
@@ -39,7 +39,7 @@ class ObsAr(np.ndarray, Pickleable, Observable):
return self.view(np.ndarray)
def copy(self):
- from lists_and_dicts import ObserverList
+ from .lists_and_dicts import ObserverList
memo = {}
memo[id(self)] = self
memo[id(self.observers)] = ObserverList()
diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 1246bc18..1838f2bf 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -4,8 +4,9 @@
import itertools
import numpy
np = numpy
-from parameter_core import Parameterizable, adjust_name_for_printing, Pickleable
-from observable_array import ObsAr
+from .parameter_core import Parameterizable, adjust_name_for_printing, Pickleable
+from .observable_array import ObsAr
+from functools import reduce
###### printing
__constraints_name__ = "Constraint"
@@ -156,7 +157,7 @@ class Param(Parameterizable, ObsAr):
#===========================================================================
@property
def is_fixed(self):
- from transformations import __fixed__
+ from .transformations import __fixed__
return self.constraints[__fixed__].size == self.size
def _get_original(self, param):
@@ -207,10 +208,14 @@ class Param(Parameterizable, ObsAr):
return 0
@property
def _constraints_str(self):
- return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.iteritems()))]
+ #py3 fix
+ #return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.iteritems()))]
+ return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.items()))]
@property
def _priors_str(self):
- return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.iteritems()))]
+ #py3 fix
+ #return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.iteritems()))]
+ return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.items()))]
@property
def _ties_str(self):
return ['']
@@ -279,7 +284,7 @@ class Param(Parameterizable, ObsAr):
.tg th{font-family:"Courier New", Courier, monospace !important;font-weight:normal;color:#fff;background-color:#26ADE4;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
.tg .tg-left{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:left;}
.tg .tg-right{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:right;}
-"""] + [''] + [header] + ["| {i} | {x} | {c} | {p} | {t} |
".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)] + ["
"])
+"""] + [''] + [header] + ["| {i} | {x} | {c} | {p} | {t} |
".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in zip(indices, vals, constr_matrix, ties, prirs)] + ["
"])
def __str__(self, constr_matrix=None, indices=None, prirs=None, ties=None, lc=None, lx=None, li=None, lp=None, lt=None, only_name=False):
filter_ = self._current_slice_
@@ -300,7 +305,7 @@ class Param(Parameterizable, ObsAr):
if only_name: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hierarchy_name(), c=sep*lc, i=sep*li, t=sep*lt, p=sep*lp) # nice header for printing
else: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hierarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__) # nice header for printing
if not ties: ties = itertools.cycle([''])
- return "\n".join([header] + [" {i!s:^{3}s} | {x: >{1}.{2}g} | {c:^{0}s} | {p:^{5}s} | {t:^{4}s} ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)]) # return all the constraints with right indices
+ return "\n".join([header] + [" {i!s:^{3}s} | {x: >{1}.{2}g} | {c:^{0}s} | {p:^{5}s} | {t:^{4}s} ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in zip(indices, vals, constr_matrix, ties, prirs)]) # return all the constraints with right indices
# except: return super(Param, self).__str__()
class ParamConcatenation(object):
@@ -313,7 +318,7 @@ class ParamConcatenation(object):
See :py:class:`GPy.core.parameter.Param` for more details on constraining.
"""
# self.params = params
- from lists_and_dicts import ArrayList
+ from .lists_and_dicts import ArrayList
self.params = ArrayList([])
for p in params:
for p in p.flattened_parameters:
@@ -336,7 +341,9 @@ class ParamConcatenation(object):
level += 1
parent = parent._parent_
import operator
- self.parents = map(lambda x: x[0], sorted(parents.iteritems(), key=operator.itemgetter(1)))
+ #py3 fix
+ #self.parents = map(lambda x: x[0], sorted(parents.iteritems(), key=operator.itemgetter(1)))
+ self.parents = list(map(lambda x: x[0], sorted(parents.items(), key=operator.itemgetter(1))))
#===========================================================================
# Get/set items, enable broadcasting
#===========================================================================
@@ -429,14 +436,14 @@ class ParamConcatenation(object):
params = self.params
constr_matrices, ties_matrices, prior_matrices = zip(*map(f, params))
indices = [p._indices() for p in params]
- lc = max([p._max_len_names(cm, __constraints_name__) for p, cm in itertools.izip(params, constr_matrices)])
+ lc = max([p._max_len_names(cm, __constraints_name__) for p, cm in zip(params, constr_matrices)])
lx = max([p._max_len_values() for p in params])
- li = max([p._max_len_index(i) for p, i in itertools.izip(params, indices)])
- lt = max([p._max_len_names(tm, __tie_name__) for p, tm in itertools.izip(params, ties_matrices)])
- lp = max([p._max_len_names(pm, __constraints_name__) for p, pm in itertools.izip(params, prior_matrices)])
+ li = max([p._max_len_index(i) for p, i in zip(params, indices)])
+ lt = max([p._max_len_names(tm, __tie_name__) for p, tm in zip(params, ties_matrices)])
+ lp = max([p._max_len_names(pm, __constraints_name__) for p, pm in zip(params, prior_matrices)])
strings = []
start = True
- for p, cm, i, tm, pm in itertools.izip(params,constr_matrices,indices,ties_matrices,prior_matrices):
+ for p, cm, i, tm, pm in zip(params,constr_matrices,indices,ties_matrices,prior_matrices):
strings.append(p.__str__(constr_matrix=cm, indices=i, prirs=pm, ties=tm, lc=lc, lx=lx, li=li, lp=lp, lt=lt, only_name=(1-start)))
start = False
return "\n".join(strings)
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index bee160b2..1bc6a29e 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -13,11 +13,12 @@ Observable Pattern for patameterization
"""
-from transformations import Transformation,Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED
+from .transformations import Transformation,Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED
import numpy as np
import re
import logging
-from updateable import Updateable
+from .updateable import Updateable
+from functools import reduce
class HierarchyError(Exception):
"""
@@ -36,7 +37,7 @@ def adjust_name_for_printing(name):
name = name.replace("/", "_l_").replace("@", '_at_')
name = name.replace("(", "_of_").replace(")", "")
if re.match(r'^[a-zA-Z_][a-zA-Z0-9-_]*$', name) is None:
- raise NameError, "name {} converted to {} cannot be further converted to valid python variable name!".format(name2, name)
+ raise NameError("name {} converted to {} cannot be further converted to valid python variable name!".format(name2, name))
return name
return ''
@@ -65,13 +66,13 @@ class Parentable(object):
Gets called, when the parent changed, so we can adjust our
inner attributes according to the new parent.
"""
- raise NotImplementedError, "shouldnt happen, Parentable objects need to be able to change their parent"
+ raise NotImplementedError("shouldnt happen, Parentable objects need to be able to change their parent")
def _disconnect_parent(self, *args, **kw):
"""
Disconnect this object from its parent
"""
- raise NotImplementedError, "Abstract superclass"
+ raise NotImplementedError("Abstract superclass")
@property
def _highest_parent_(self):
@@ -109,7 +110,10 @@ class Pickleable(object):
it properly.
:param protocol: pickling protocol to use, python-pickle for details.
"""
- import cPickle as pickle
+ try: #Py2
+ import cPickle as pickle
+ except ImportError: #Py3
+ import pickle
if isinstance(f, str):
with open(f, 'wb') as f:
pickle.dump(self, f, protocol)
@@ -138,9 +142,9 @@ class Pickleable(object):
which = self
which.traverse_parents(parents.append) # collect parents
for p in parents:
- if not memo.has_key(id(p)):memo[id(p)] = None # set all parents to be None, so they will not be copied
- if not memo.has_key(id(self.gradient)):memo[id(self.gradient)] = None # reset the gradient
- if not memo.has_key(id(self._fixes_)):memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent
+ if id(p) not in memo: memo[id(p)] = None # set all parents to be None, so they will not be copied
+ if id(self.gradient) not in memo: memo[id(self.gradient)] = None # reset the gradient
+ if id(self._fixes_) not in memo: memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent
copy = copy.deepcopy(self, memo) # and start the copy
copy._parent_index_ = None
copy._trigger_params_changed()
@@ -163,14 +167,16 @@ class Pickleable(object):
'_Cacher_wrap__cachers', # never pickle cachers
]
dc = dict()
- for k,v in self.__dict__.iteritems():
+ #py3 fix
+ #for k,v in self.__dict__.iteritems():
+ for k,v in self.__dict__.items():
if k not in ignore_list:
dc[k] = v
return dc
def __setstate__(self, state):
self.__dict__.update(state)
- from lists_and_dicts import ObserverList
+ from .lists_and_dicts import ObserverList
self.observers = ObserverList()
self._setup_observers()
self._optimizer_copy_transformed = False
@@ -214,7 +220,7 @@ class Gradcheckable(Pickleable, Parentable):
Perform the checkgrad on the model.
TODO: this can be done more efficiently, when doing it inside here
"""
- raise HierarchyError, "This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!"
+ raise HierarchyError("This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!")
class Nameable(Gradcheckable):
"""
@@ -268,7 +274,7 @@ class Indexable(Nameable, Updateable):
def __init__(self, name, default_constraint=None, *a, **kw):
super(Indexable, self).__init__(name=name, *a, **kw)
self._default_constraint_ = default_constraint
- from index_operations import ParameterIndexOperations
+ from .index_operations import ParameterIndexOperations
self.constraints = ParameterIndexOperations()
self.priors = ParameterIndexOperations()
if self._default_constraint_ is not None:
@@ -310,7 +316,7 @@ class Indexable(Nameable, Updateable):
that is an int array, containing the indexes for the flattened
param inside this parameterized logic.
"""
- from param import ParamConcatenation
+ from .param import ParamConcatenation
if isinstance(param, ParamConcatenation):
return np.hstack((self._raveled_index_for(p) for p in param.params))
return param._raveled_index() + self._offset_for(param)
@@ -407,7 +413,7 @@ class Indexable(Nameable, Updateable):
repriorized = self.unset_priors()
self._add_to_index_operations(self.priors, repriorized, prior, warning)
- from domains import _REAL, _POSITIVE, _NEGATIVE
+ from .domains import _REAL, _POSITIVE, _NEGATIVE
if prior.domain is _POSITIVE:
self.constrain_positive(warning)
elif prior.domain is _NEGATIVE:
@@ -426,7 +432,9 @@ class Indexable(Nameable, Updateable):
"""evaluate the prior"""
if self.priors.size > 0:
x = self.param_array
- return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()), 0)
+ #py3 fix
+ #return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()), 0)
+ return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.items()), 0)
return 0.
def _log_prior_gradients(self):
@@ -434,7 +442,9 @@ class Indexable(Nameable, Updateable):
if self.priors.size > 0:
x = self.param_array
ret = np.zeros(x.size)
- [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.iteritems()]
+ #py3 fix
+ #[np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.iteritems()]
+ [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.items()]
return ret
return 0.
@@ -536,7 +546,7 @@ class Indexable(Nameable, Updateable):
update the constraints and priors view, so that
constraining is automized for the parent.
"""
- from index_operations import ParameterIndexOperationsView
+ from .index_operations import ParameterIndexOperationsView
#if getattr(self, "_in_init_"):
#import ipdb;ipdb.set_trace()
#self.constraints.update(param.constraints, start)
@@ -558,7 +568,7 @@ class Indexable(Nameable, Updateable):
"""
if warning and reconstrained.size > 0:
# TODO: figure out which parameters have changed and only print those
- print "WARNING: reconstraining parameters {}".format(self.hierarchy_name() or self.name)
+ print("WARNING: reconstraining parameters {}".format(self.hierarchy_name() or self.name))
index = self._raveled_index()
which.add(what, index)
return index
@@ -571,7 +581,7 @@ class Indexable(Nameable, Updateable):
if len(transforms) == 0:
transforms = which.properties()
removed = np.empty((0,), dtype=int)
- for t in transforms:
+ for t in list(transforms):
unconstrained = which.remove(t, self._raveled_index())
removed = np.union1d(removed, unconstrained)
if t is __fixed__:
@@ -612,7 +622,9 @@ class OptimizationHandlable(Indexable):
if not self._optimizer_copy_transformed:
self._optimizer_copy_.flat = self.param_array.flat
- [np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
+ #py3 fix
+ #[np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
+ [np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.items() if c != __fixed__]
if self.has_parent() and (self.constraints[__fixed__].size != 0 or self._has_ties()):
fixes = np.ones(self.size).astype(bool)
fixes[self.constraints[__fixed__]] = FIXED
@@ -641,21 +653,25 @@ class OptimizationHandlable(Indexable):
if f is None:
self.param_array.flat = p
[np.put(self.param_array, ind, c.f(self.param_array.flat[ind]))
- for c, ind in self.constraints.iteritems() if c != __fixed__]
+ #py3 fix
+ #for c, ind in self.constraints.iteritems() if c != __fixed__]
+ for c, ind in self.constraints.items() if c != __fixed__]
else:
self.param_array.flat[f] = p
[np.put(self.param_array, ind[f[ind]], c.f(self.param_array.flat[ind[f[ind]]]))
- for c, ind in self.constraints.iteritems() if c != __fixed__]
+ #py3 fix
+ #for c, ind in self.constraints.iteritems() if c != __fixed__]
+ for c, ind in self.constraints.items() if c != __fixed__]
#self._highest_parent_.tie.propagate_val()
self._optimizer_copy_transformed = False
self.trigger_update()
def _get_params_transformed(self):
- raise DeprecationWarning, "_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!"
+ raise DeprecationWarning("_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer_array instead!")
#
def _set_params_transformed(self, p):
- raise DeprecationWarning, "_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!"
+ raise DeprecationWarning("_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer_array instead!")
def _trigger_params_changed(self, trigger_parent=True):
"""
@@ -680,7 +696,9 @@ class OptimizationHandlable(Indexable):
constraint to it.
"""
self._highest_parent_.tie.collate_gradient()
- [np.put(g, i, c.gradfactor(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
+ #py3 fix
+ #[np.put(g, i, c.gradfactor(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
+ [np.put(g, i, c.gradfactor(self.param_array[i], g[i])) for c, i in self.constraints.items() if c != __fixed__]
if self._has_fixes(): return g[self._fixes_]
return g
@@ -690,7 +708,9 @@ class OptimizationHandlable(Indexable):
constraint to it.
"""
self._highest_parent_.tie.collate_gradient()
- [np.put(g, i, c.gradfactor_non_natural(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
+ #py3 fix
+ #[np.put(g, i, c.gradfactor_non_natural(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
+ [np.put(g, i, c.gradfactor_non_natural(self.param_array[i], g[i])) for c, i in self.constraints.items() if c != __fixed__]
if self._has_fixes(): return g[self._fixes_]
return g
@@ -701,7 +721,7 @@ class OptimizationHandlable(Indexable):
Return the number of parameters of this parameter_handle.
Param objects will always return 0.
"""
- raise NotImplemented, "Abstract, please implement in respective classes"
+ raise NotImplementedError("Abstract, please implement in respective classes")
def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True):
"""
@@ -750,7 +770,9 @@ class OptimizationHandlable(Indexable):
self.optimizer_array = x # makes sure all of the tied parameters get the same init (since there's only one prior object...)
# now draw from prior where possible
x = self.param_array.copy()
- [np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.iteritems() if not p is None]
+ #Py3 fix
+ #[np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.iteritems() if not p is None]
+ [np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.items() if not p is None]
unfixlist = np.ones((self.size,),dtype=np.bool)
unfixlist[self.constraints[__fixed__]] = False
self.param_array.flat[unfixlist] = x.view(np.ndarray).ravel()[unfixlist]
@@ -947,7 +969,7 @@ class Parameterizable(OptimizationHandlable):
self._add_parameter_name(param, ignore_added_names)
# and makes sure to not delete programmatically added parameters
for other in self.parameters[::-1]:
- if other is not param and other.name.startswith(param.name):
+ if other is not param and other.name == param.name:
warn_and_retry(param, _name_digit.match(other.name))
return
if pname not in dir(self):
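
parameter_core.py collects several of the same compatibility moves: raise statements get call syntax, dict.has_key() becomes the in operator, and cPickle (a Python 2 C extension) falls back to plain pickle, which is C-accelerated by default on Python 3. A sketch of the import shim together with a matching loader (the load_model name is illustrative):

    try:  # Python 2: the fast pickler is a separate module
        import cPickle as pickle
    except ImportError:  # Python 3: pickle already uses the C implementation
        import pickle

    def load_model(path):
        with open(path, 'rb') as f:  # binary mode matters on both versions
            return pickle.load(f)
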
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index 44173f58..691bf4a7 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -1,12 +1,12 @@
# Copyright (c) 2014, Max Zwiessele, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
+import six # For metaclass support in Python 2 and 3 simultaneously
import numpy; np = numpy
import itertools
from re import compile, _pattern_type
-from param import ParamConcatenation
-from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
+from .param import ParamConcatenation
+from .parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
import logging
from GPy.core.parameterization.index_operations import ParameterIndexOperationsView
@@ -27,6 +27,7 @@ class ParametersChangedMeta(type):
self.parameters_changed()
return self
+@six.add_metaclass(ParametersChangedMeta)
class Parameterized(Parameterizable):
"""
Parameterized class
@@ -73,7 +74,9 @@ class Parameterized(Parameterizable):
# Metaclass for parameters changed after init.
# This makes sure, that parameters changed will always be called after __init__
# **Never** call parameters_changed() yourself
- __metaclass__ = ParametersChangedMeta
+ #Setting __metaclass__ here is ignored in Python 3 -- the metaclass has to be declared in the class definition instead.
+ #__metaclass__ = ParametersChangedMeta
+ #The six.add_metaclass decorator above supports Python 2 and 3 simultaneously
#===========================================================================
def __init__(self, name=None, parameters=[], *a, **kw):
super(Parameterized, self).__init__(name=name, *a, **kw)
@@ -131,7 +134,7 @@ class Parameterized(Parameterizable):
if param.has_parent():
def visit(parent, self):
if parent is self:
- raise HierarchyError, "You cannot add a parameter twice into the hierarchy"
+ raise HierarchyError("You cannot add a parameter twice into the hierarchy")
param.traverse_parents(visit, self)
param._parent_.unlink_parameter(param)
# make sure the size is set
@@ -173,7 +176,7 @@ class Parameterized(Parameterizable):
self._highest_parent_._connect_fixes()
else:
- raise HierarchyError, """Parameter exists already, try making a copy"""
+ raise HierarchyError("""Parameter exists already, try making a copy""")
def link_parameters(self, *parameters):
@@ -189,9 +192,9 @@ class Parameterized(Parameterizable):
"""
if not param in self.parameters:
try:
- raise RuntimeError, "{} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name)
+ raise RuntimeError("{} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name))
except AttributeError:
- raise RuntimeError, "{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param))
+ raise RuntimeError("{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param)))
start = sum([p.size for p in self.parameters[:param._parent_index_]])
self._remove_parameter_name(param)
@@ -215,9 +218,9 @@ class Parameterized(Parameterizable):
self._highest_parent_._notify_parent_change()
def add_parameter(self, *args, **kwargs):
- raise DeprecationWarning, "add_parameter was renamed to link_parameter to avoid confusion of setting variables, use link_parameter instead"
+ raise DeprecationWarning("add_parameter was renamed to link_parameter to avoid confusion of setting variables, use link_parameter instead")
def remove_parameter(self, *args, **kwargs):
- raise DeprecationWarning, "remove_parameter was renamed to unlink_parameter to avoid confusion of setting variables, use unlink_parameter instead"
+ raise DeprecationWarning("remove_parameter was renamed to unlink_parameter to avoid confusion of setting variables, use unlink_parameter instead")
def _connect_parameters(self, ignore_added_names=False):
# connect parameterlist to this parameterized object
@@ -237,7 +240,7 @@ class Parameterized(Parameterizable):
self._param_slices_ = []
for i, p in enumerate(self.parameters):
if not p.param_array.flags['C_CONTIGUOUS']:
- raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS"
+ raise ValueError("This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS")
p._parent_ = self
p._parent_index_ = i
@@ -268,7 +271,7 @@ class Parameterized(Parameterizable):
"""
if not isinstance(regexp, _pattern_type): regexp = compile(regexp)
found_params = []
- for n, p in itertools.izip(self.parameter_names(False, False, True), self.flattened_parameters):
+ for n, p in zip(self.parameter_names(False, False, True), self.flattened_parameters):
if regexp.match(n) is not None:
found_params.append(p)
return found_params
@@ -279,7 +282,7 @@ class Parameterized(Parameterizable):
else:
if paramlist is None:
paramlist = self.grep_param_names(name)
- if len(paramlist) < 1: raise AttributeError, name
+ if len(paramlist) < 1: raise AttributeError(name)
if len(paramlist) == 1:
if isinstance(paramlist[-1], Parameterized):
paramlist = paramlist[-1].flattened_parameters
@@ -295,7 +298,7 @@ class Parameterized(Parameterizable):
try:
self.param_array[name] = value
except:
- raise ValueError, "Setting by slice or index only allowed with array-like"
+ raise ValueError("Setting by slice or index only allowed with array-like")
self.trigger_update()
else:
try: param = self.__getitem__(name, paramlist)
@@ -325,7 +328,7 @@ class Parameterized(Parameterizable):
self._notify_parent_change()
self.parameters_changed()
except Exception as e:
- print "WARNING: caught exception {!s}, trying to continue".format(e)
+ print("WARNING: caught exception {!s}, trying to continue".format(e))
def copy(self, memo=None):
if memo is None:
@@ -379,7 +382,7 @@ class Parameterized(Parameterizable):
pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]])
format_spec = "| {{name:<{0}s}} | {{desc:>{1}s}} | {{const:^{2}s}} | {{pri:^{3}s}} | {{t:^{4}s}} |
".format(nl, sl, cl, pl, tl)
to_print = []
- for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
+ for n, d, c, t, p in zip(names, desc, constrs, ts, prirs):
to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3)
if header:
@@ -414,7 +417,7 @@ class Parameterized(Parameterizable):
pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]])
format_spec = " \033[1m{{name:<{0}s}}\033[0;0m | {{desc:>{1}s}} | {{const:^{2}s}} | {{pri:^{3}s}} | {{t:^{4}s}}".format(nl, sl, cl, pl, tl)
to_print = []
- for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
+ for n, d, c, t, p in zip(names, desc, constrs, ts, prirs):
to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3)
if header:
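
Assigning __metaclass__ inside the class body only works on Python 2; Python 3 expects class C(Base, metaclass=Meta), which is a syntax error under Python 2, so parameterized.py uses the six.add_metaclass class decorator to cover both. A minimal sketch with a toy metaclass:

    import six

    class Meta(type):
        def __call__(cls, *args, **kw):
            obj = super(Meta, cls).__call__(*args, **kw)
            # runs after every construction, like ParametersChangedMeta above
            print("built a {}".format(cls.__name__))
            return obj

    @six.add_metaclass(Meta)  # equivalent to metaclass=Meta on Python 3
    class Widget(object):
        pass

    Widget()  # prints: built a Widget
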
diff --git a/GPy/core/parameterization/priors.py b/GPy/core/parameterization/priors.py
index 4a6b93e3..3c474438 100644
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@@ -5,7 +5,7 @@
import numpy as np
from scipy.special import gammaln, digamma
from ...util.linalg import pdinv
-from domains import _REAL, _POSITIVE
+from .domains import _REAL, _POSITIVE
import warnings
import weakref
@@ -15,8 +15,12 @@ class Prior(object):
_instance = None
def __new__(cls, *args, **kwargs):
if not cls._instance or cls._instance.__class__ is not cls:
- cls._instance = super(Prior, cls).__new__(cls, *args, **kwargs)
- return cls._instance
+ newfunc = super(Prior, cls).__new__
+ if newfunc is object.__new__:
+ cls._instance = newfunc(cls)
+ else:
+ cls._instance = newfunc(cls, *args, **kwargs)
+ return cls._instance
def pdf(self, x):
return np.exp(self.lnpdf(x))
@@ -52,7 +56,11 @@ class Gaussian(Prior):
for instance in cls._instances:
if instance().mu == mu and instance().sigma == sigma:
return instance()
- o = super(Prior, cls).__new__(cls, mu, sigma)
+ newfunc = super(Prior, cls).__new__
+ if newfunc is object.__new__:
+ o = newfunc(cls)
+ else:
+ o = newfunc(cls, mu, sigma)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
@@ -140,7 +148,11 @@ class LogGaussian(Gaussian):
for instance in cls._instances:
if instance().mu == mu and instance().sigma == sigma:
return instance()
- o = super(Prior, cls).__new__(cls, mu, sigma)
+ newfunc = super(Prior, cls).__new__
+ if newfunc is object.__new__:
+ o = newfunc(cls)
+ else:
+ o = newfunc(cls, mu, sigma)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
@@ -258,7 +270,11 @@ class Gamma(Prior):
for instance in cls._instances:
if instance().a == a and instance().b == b:
return instance()
- o = super(Prior, cls).__new__(cls, a, b)
+ newfunc = super(Prior, cls).__new__
+ if newfunc is object.__new__:
+ o = newfunc(cls)
+ else:
+ o = newfunc(cls, a, b)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
@@ -398,7 +414,7 @@ class DGPLVM_KFDA(Prior):
def compute_cls(self, x):
cls = {}
# Appending each data point to its proper class
- for j in xrange(self.datanum):
+ for j in range(self.datanum):
class_label = self.get_class_label(self.lbl[j])
if class_label not in cls:
cls[class_label] = []
@@ -537,7 +553,7 @@ class DGPLVM(Prior):
def compute_cls(self, x):
cls = {}
# Appending each data point to its proper class
- for j in xrange(self.datanum):
+ for j in range(self.datanum):
class_label = self.get_class_label(self.lbl[j])
if class_label not in cls:
cls[class_label] = []
@@ -549,14 +565,14 @@ class DGPLVM(Prior):
M_i = np.zeros((self.classnum, self.dim))
for i in cls:
# Mean of each class
- class_i = cls[i]
+ class_i = cls[i]
M_i[i] = np.mean(class_i, axis=0)
return M_i
# Adding data points as tuple to the dictionary so that we can access indices
def compute_indices(self, x):
data_idx = {}
- for j in xrange(self.datanum):
+ for j in range(self.datanum):
class_label = self.get_class_label(self.lbl[j])
if class_label not in data_idx:
data_idx[class_label] = []
@@ -575,7 +591,7 @@ class DGPLVM(Prior):
else:
lst_idx = []
# Here we put indices of each class in to the list called lst_idx_all
- for m in xrange(len(data_idx[i])):
+ for m in range(len(data_idx[i])):
lst_idx.append(data_idx[i][m][0])
lst_idx_all.append(lst_idx)
return lst_idx_all
@@ -611,7 +627,7 @@ class DGPLVM(Prior):
# pdb.set_trace()
# Calculating Bi
B_i[i] = (M_i[i] - M_0).reshape(1, self.dim)
- for k in xrange(self.datanum):
+ for k in range(self.datanum):
for i in data_idx:
N_i = float(len(data_idx[i]))
if k in lst_idx_all[i]:
@@ -663,7 +679,7 @@ class DGPLVM(Prior):
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
- Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+ Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
# This function calculates derivative of the log of prior function
@@ -684,7 +700,7 @@ class DGPLVM(Prior):
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
- Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+ Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
Sb_inv_N_trans = np.transpose(Sb_inv_N)
Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
Sw_trans = np.transpose(Sw)
@@ -990,7 +1006,7 @@ class DGPLVM_T(Prior):
self.datanum = lbl.shape[0]
self.x_shape = x_shape
self.dim = x_shape[1]
- self.vec = vec
+ self.vec = vec
def get_class_label(self, y):
@@ -1004,7 +1020,7 @@ class DGPLVM_T(Prior):
def compute_cls(self, x):
cls = {}
# Appending each data point to its proper class
- for j in xrange(self.datanum):
+ for j in range(self.datanum):
class_label = self.get_class_label(self.lbl[j])
if class_label not in cls:
cls[class_label] = []
@@ -1024,7 +1040,7 @@ class DGPLVM_T(Prior):
# Adding data points as tuple to the dictionary so that we can access indices
def compute_indices(self, x):
data_idx = {}
- for j in xrange(self.datanum):
+ for j in range(self.datanum):
class_label = self.get_class_label(self.lbl[j])
if class_label not in data_idx:
data_idx[class_label] = []
@@ -1043,7 +1059,7 @@ class DGPLVM_T(Prior):
else:
lst_idx = []
# Here we put indices of each class in to the list called lst_idx_all
- for m in xrange(len(data_idx[i])):
+ for m in range(len(data_idx[i])):
lst_idx.append(data_idx[i][m][0])
lst_idx_all.append(lst_idx)
return lst_idx_all
@@ -1079,7 +1095,7 @@ class DGPLVM_T(Prior):
# pdb.set_trace()
# Calculating Bi
B_i[i] = (M_i[i] - M_0).reshape(1, self.dim)
- for k in xrange(self.datanum):
+ for k in range(self.datanum):
for i in data_idx:
N_i = float(len(data_idx[i]))
if k in lst_idx_all[i]:
@@ -1135,7 +1151,7 @@ class DGPLVM_T(Prior):
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
#print 'SB_inv: ', Sb_inv_N
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
- Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+ Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
# This function calculates derivative of the log of prior function
@@ -1160,7 +1176,7 @@ class DGPLVM_T(Prior):
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
#print 'SB_inv: ',Sb_inv_N
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
- Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+ Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
Sb_inv_N_trans = np.transpose(Sb_inv_N)
Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
Sw_trans = np.transpose(Sw)
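Note on the `__new__` changes above: under Python 3, `object.__new__` raises a `TypeError` when handed extra constructor arguments, so they may only be forwarded when a class in the MRO actually overrides `__new__`. A minimal standalone sketch of the same weakref-cached instance pattern (hypothetical `CachedPrior` class, not GPy code):

    import weakref

    class CachedPrior(object):
        _instances = []

        def __new__(cls, a, b):
            # reuse a live instance with identical parameters, as the priors above do
            for ref in cls._instances:
                inst = ref()
                if inst is not None and inst.a == a and inst.b == b:
                    return inst
            newfunc = super(CachedPrior, cls).__new__
            # object.__new__ accepts no extra arguments in Python 3
            o = newfunc(cls) if newfunc is object.__new__ else newfunc(cls, a, b)
            cls._instances.append(weakref.ref(o))
            return o

        def __init__(self, a, b):
            self.a, self.b = a, b

    p1 = CachedPrior(1., 2.)
    p2 = CachedPrior(1., 2.)
    assert p1 is p2  # same parameters -> same cached instance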
diff --git a/GPy/core/parameterization/ties_and_remappings.py b/GPy/core/parameterization/ties_and_remappings.py
index a81b8d61..527bc47c 100644
--- a/GPy/core/parameterization/ties_and_remappings.py
+++ b/GPy/core/parameterization/ties_and_remappings.py
@@ -2,8 +2,8 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
-from parameterized import Parameterized
-from param import Param
+from .parameterized import Parameterized
+from .param import Param
class Remapping(Parameterized):
def mapping(self):
@@ -98,7 +98,7 @@ class Tie(Parameterized):
if np.all(self.label_buf[idx]==0):
# None of p has been tied before.
tie_idx = self._expandTieParam(1)
- print tie_idx
+ print(tie_idx)
tie_id = self.label_buf.max()+1
self.label_buf[tie_idx] = tie_id
else:
@@ -185,18 +185,18 @@ class Tie(Parameterized):
def _check_change(self):
changed = False
if self.tied_param is not None:
- for i in xrange(self.tied_param.size):
+ for i in range(self.tied_param.size):
b0 = self.label_buf==self.label_buf[self.buf_idx[i]]
b = self._highest_parent_.param_array[b0]!=self.tied_param[i]
if b.sum()==0:
- print 'XXX'
+ print('XXX')
continue
elif b.sum()==1:
- print '!!!'
+ print('!!!')
val = self._highest_parent_.param_array[b0][b][0]
self._highest_parent_.param_array[b0] = val
else:
- print '@@@'
+ print('@@@')
self._highest_parent_.param_array[b0] = self.tied_param[i]
changed = True
return changed
@@ -212,11 +212,11 @@ class Tie(Parameterized):
if self.tied_param is not None:
self.tied_param.gradient = 0.
[np.put(self.tied_param.gradient, i, self._highest_parent_.gradient[self.label_buf==self.label_buf[self.buf_idx[i]]].sum())
- for i in xrange(self.tied_param.size)]
+ for i in range(self.tied_param.size)]
def propagate_val(self):
if self.tied_param is not None:
- for i in xrange(self.tied_param.size):
+ for i in range(self.tied_param.size):
self._highest_parent_.param_array[self.label_buf==self.label_buf[self.buf_idx[i]]] = self.tied_param[i]
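For context on the loops above: `propagate_gradients` sums, for each tied parameter, the gradients of every entry sharing its label, while `propagate_val` broadcasts the tied value back out. The accumulation step in miniature (made-up labels and gradients, not GPy code):

    import numpy as np

    label_buf = np.array([1, 2, 1, 1, 2])          # stand-in for self.label_buf
    grads = np.array([0.1, 0.2, 0.3, 0.4, 0.5])    # per-parameter gradients

    # each tied parameter's gradient is the sum over all entries carrying its label
    tied_grad = np.array([grads[label_buf == l].sum() for l in np.unique(label_buf)])
    print(tied_grad)   # [0.8 0.7]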
diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py
index d929b1d9..7e15cee9 100644
--- a/GPy/core/parameterization/transformations.py
+++ b/GPy/core/parameterization/transformations.py
@@ -3,7 +3,7 @@
import numpy as np
-from domains import _POSITIVE,_NEGATIVE, _BOUNDED
+from .domains import _POSITIVE,_NEGATIVE, _BOUNDED
import weakref
import sys
@@ -72,7 +72,7 @@ class Logexp(Transformation):
return np.einsum('i,i->i', df, np.where(f>_lim_val, 1., 1. - np.exp(-f)))
def initialize(self, f):
if np.any(f < 0.):
- print "Warning: changing parameters to satisfy constraints"
+ print("Warning: changing parameters to satisfy constraints")
return np.abs(f)
def __str__(self):
return '+ve'
@@ -130,7 +130,7 @@ class NormalTheta(Transformation):
def initialize(self, f):
if np.any(f[self.var_indices] < 0.):
- print "Warning: changing parameters to satisfy constraints"
+ print("Warning: changing parameters to satisfy constraints")
f[self.var_indices] = np.abs(f[self.var_indices])
return f
@@ -177,7 +177,7 @@ class NormalNaturalAntti(NormalTheta):
def initialize(self, f):
if np.any(f[self.var_indices] < 0.):
- print "Warning: changing parameters to satisfy constraints"
+ print("Warning: changing parameters to satisfy constraints")
f[self.var_indices] = np.abs(f[self.var_indices])
return f
@@ -220,7 +220,7 @@ class NormalEta(Transformation):
def initialize(self, f):
if np.any(f[self.var_indices] < 0.):
- print "Warning: changing parameters to satisfy constraints"
+ print("Warning: changing parameters to satisfy constraints")
f[self.var_indices] = np.abs(f[self.var_indices])
return f
@@ -360,7 +360,7 @@ class LogexpNeg(Transformation):
return np.einsum('i,i->i', df, np.where(f>_lim_val, -1, -1 + np.exp(-f)))
def initialize(self, f):
if np.any(f < 0.):
- print "Warning: changing parameters to satisfy constraints"
+ print("Warning: changing parameters to satisfy constraints")
return np.abs(f)
def __str__(self):
return '+ve'
@@ -412,7 +412,7 @@ class LogexpClipped(Logexp):
return np.einsum('i,i->i', df, gf) # np.where(f < self.lower, 0, gf)
def initialize(self, f):
if np.any(f < 0.):
- print "Warning: changing parameters to satisfy constraints"
+ print("Warning: changing parameters to satisfy constraints")
return np.abs(f)
def __str__(self):
return '+ve_c'
@@ -428,7 +428,7 @@ class Exponent(Transformation):
return np.einsum('i,i->i', df, f)
def initialize(self, f):
if np.any(f < 0.):
- print "Warning: changing parameters to satisfy constraints"
+ print("Warning: changing parameters to satisfy constraints")
return np.abs(f)
def __str__(self):
return '+ve'
@@ -468,7 +468,11 @@ class Logistic(Transformation):
for instance in cls._instances:
if instance().lower == lower and instance().upper == upper:
return instance()
- o = super(Transformation, cls).__new__(cls, lower, upper, *args, **kwargs)
+ newfunc = super(Transformation, cls).__new__
+ if newfunc is object.__new__:
+ o = newfunc(cls)
+ else:
+ o = newfunc(cls, lower, upper, *args, **kwargs)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, lower, upper):
@@ -486,7 +490,7 @@ class Logistic(Transformation):
return np.einsum('i,i->i', df, (f - self.lower) * (self.upper - f) / self.difference)
def initialize(self, f):
if np.any(np.logical_or(f < self.lower, f > self.upper)):
- print "Warning: changing parameters to satisfy constraints"
+ print("Warning: changing parameters to satisfy constraints")
#return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f)
#FIXME: Max, zeros_like right?
return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(np.zeros_like(f)), f)
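These transformations map the optimizer's unconstrained space onto the constrained parameter domain: `Logexp` is the softplus map onto the positive reals and `Logistic` squashes into a bounded interval. A rough sketch of the two forward maps (the plain formulas, without GPy's overflow guards):

    import numpy as np

    def logexp_f(x):
        # softplus: any real x -> positive value, f(x) = log(1 + exp(x))
        return np.log1p(np.exp(x))

    def logistic_f(x, lower, upper):
        # sigmoid rescaled into (lower, upper)
        return lower + (upper - lower) / (1. + np.exp(-x))

    x = np.linspace(-3., 3., 7)
    assert np.all(logexp_f(x) > 0.)
    y = logistic_f(x, 0., 1.)
    assert np.all((y > 0.) & (y < 1.))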
diff --git a/GPy/core/parameterization/updateable.py b/GPy/core/parameterization/updateable.py
index 379e92e1..07083ce0 100644
--- a/GPy/core/parameterization/updateable.py
+++ b/GPy/core/parameterization/updateable.py
@@ -3,7 +3,7 @@ Created on 11 Nov 2014
@author: maxz
'''
-from observable import Observable
+from .observable import Observable
class Updateable(Observable):
@@ -35,7 +35,7 @@ class Updateable(Observable):
self.trigger_update()
def toggle_update(self):
- print "deprecated: toggle_update was renamed to update_toggle for easier access"
+ print("deprecated: toggle_update was renamed to update_toggle for easier access")
self.update_toggle()
def update_toggle(self):
self.update_model(not self.update_model())
diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py
index 7cc5c99a..ab196b98 100644
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@@ -5,9 +5,9 @@ Created on 6 Nov 2013
'''
import numpy as np
-from parameterized import Parameterized
-from param import Param
-from transformations import Logexp, Logistic,__fixed__
+from .parameterized import Parameterized
+from .param import Param
+from .transformations import Logexp, Logistic,__fixed__
from GPy.util.misc import param_to_array
from GPy.util.caching import Cache_this
@@ -16,13 +16,13 @@ class VariationalPrior(Parameterized):
super(VariationalPrior, self).__init__(name=name, **kw)
def KL_divergence(self, variational_posterior):
- raise NotImplementedError, "override this for variational inference of latent space"
+ raise NotImplementedError("override this for variational inference of latent space")
def update_gradients_KL(self, variational_posterior):
"""
updates the gradients for mean and variance **in place**
"""
- raise NotImplementedError, "override this for variational inference of latent space"
+ raise NotImplementedError("override this for variational inference of latent space")
class NormalPrior(VariationalPrior):
def KL_divergence(self, variational_posterior):
@@ -50,31 +50,29 @@ class SpikeAndSlabPrior(VariationalPrior):
def KL_divergence(self, variational_posterior):
mu = variational_posterior.mean
S = variational_posterior.variance
- gamma,gamma1 = variational_posterior.gamma_probabilities()
- log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+ gamma = variational_posterior.gamma.values
if len(self.pi.shape)==2:
- idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+ idx = np.unique(variational_posterior.gamma._raveled_index()//gamma.shape[-1])
pi = self.pi[idx]
else:
pi = self.pi
var_mean = np.square(mu)/self.variance
var_S = (S/self.variance - np.log(S))
- var_gamma = (gamma*(log_gamma-np.log(pi))).sum()+(gamma1*(log_gamma1-np.log(1-pi))).sum()
+ var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
return var_gamma+ (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
def update_gradients_KL(self, variational_posterior):
mu = variational_posterior.mean
S = variational_posterior.variance
- gamma,gamma1 = variational_posterior.gamma_probabilities()
- log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+ gamma = variational_posterior.gamma.values
if len(self.pi.shape)==2:
- idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+ idx = np.unique(variational_posterior.gamma._raveled_index()//gamma.shape[-1])
pi = self.pi[idx]
else:
pi = self.pi
- variational_posterior.binary_prob.gradient -= (np.log((1-pi)/pi)+log_gamma-log_gamma1+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.)*gamma*gamma1
+ variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
mu.gradient -= gamma*mu/self.variance
S.gradient -= (1./self.variance - 1./S) * gamma /2.
if self.learnPi:
@@ -141,7 +139,7 @@ class NormalPosterior(VariationalPosterior):
holds the means and variances for a factorizing multivariate normal distribution
'''
- def plot(self, *args):
+ def plot(self, *args, **kwargs):
"""
Plot latent space X in 1D:
@@ -150,8 +148,7 @@ class NormalPosterior(VariationalPosterior):
import sys
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ...plotting.matplot_dep import variational_plots
- import matplotlib
- return variational_plots.plot(self,*args)
+ return variational_plots.plot(self, *args, **kwargs)
class SpikeAndSlabPosterior(VariationalPosterior):
'''
@@ -162,24 +159,8 @@ class SpikeAndSlabPosterior(VariationalPosterior):
binary_prob : the probability of the distribution on the slab part.
"""
super(SpikeAndSlabPosterior, self).__init__(means, variances, name)
- self.gamma = Param("binary_prob",binary_prob)
+ self.gamma = Param("binary_prob",binary_prob,Logistic(0.,1.))
self.link_parameter(self.gamma)
-
- @Cache_this(limit=5)
- def gamma_probabilities(self):
- prob = np.zeros_like(param_to_array(self.gamma))
- prob[self.gamma>-710] = 1./(1.+np.exp(-self.gamma[self.gamma>-710]))
- prob1 = -np.zeros_like(param_to_array(self.gamma))
- prob1[self.gamma<710] = 1./(1.+np.exp(self.gamma[self.gamma<710]))
- return prob, prob1
-
- @Cache_this(limit=5)
- def gamma_log_prob(self):
- loggamma = param_to_array(self.gamma).copy()
- loggamma[loggamma>-40] = -np.log1p(np.exp(-loggamma[loggamma>-40]))
- loggamma1 = -param_to_array(self.gamma).copy()
- loggamma1[loggamma1>-40] = -np.log1p(np.exp(-loggamma1[loggamma1>-40]))
- return loggamma,loggamma1
def set_gradients(self, grad):
self.mean.gradient, self.variance.gradient, self.gamma.gradient = grad
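With this change `gamma` holds the slab probability itself, constrained to (0,1) by `Logistic`, so the Bernoulli part of the spike-and-slab KL is computed directly rather than through the removed logit-space helpers. That term in an isolated sketch (gamma the posterior slab probability, pi the prior; not GPy code):

    import numpy as np

    def bernoulli_kl(gamma, pi):
        # KL(Bern(gamma) || Bern(pi)), summed over latent dimensions,
        # matching the var_gamma term in SpikeAndSlabPrior.KL_divergence above
        return (gamma * np.log(gamma / pi)).sum() + \
               ((1. - gamma) * np.log((1. - gamma) / (1. - pi))).sum()

    print(bernoulli_kl(np.array([0.2, 0.9]), pi=0.5))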
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 005ef2ac..624a8f9c 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -2,19 +2,15 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
-from gp import GP
-from parameterization.param import Param
+from .gp import GP
+from .parameterization.param import Param
from ..inference.latent_function_inference import var_dtc
from .. import likelihoods
-from parameterization.variational import VariationalPosterior, NormalPosterior
+from .parameterization.variational import VariationalPosterior, NormalPosterior
from ..util.linalg import mdot
import logging
-from GPy.inference.latent_function_inference.posterior import Posterior
-from GPy.inference.optimization.stochastics import SparseGPStochastics,\
- SparseGPMissing
-#no stochastics.py file added! from GPy.inference.optimization.stochastics import SparseGPStochastics,\
- #SparseGPMissing
+import itertools
logger = logging.getLogger("sparse gp")
class SparseGP(GP):
@@ -25,6 +21,10 @@ class SparseGP(GP):
(Gaussian likelihoods) as well as non-conjugate sparse methods based on
these.
+ This implementation does not handle missing data: supporting it here would force
+ some inefficient optimization-routine decisions.
+ See the missing-data SparseGP implementation in :py:class:`~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch`.
+
:param X: inputs
:type X: np.ndarray (num_data x input_dim)
:param likelihood: a likelihood instance, containing the observed data
@@ -40,7 +40,7 @@ class SparseGP(GP):
"""
- def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
+ def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, inference_method=None,
name='sparse gp', Y_metadata=None, normalizer=False):
#pick a sensible inference method
if inference_method is None:
@@ -48,13 +48,13 @@ class SparseGP(GP):
inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
else:
#inference_method = ??
- raise NotImplementedError, "what to do what to do?"
- print "defaulting to ", inference_method, "for latent function inference"
+ raise NotImplementedError("what to do what to do?")
+ print("defaulting to ", inference_method, "for latent function inference")
self.Z = Param('inducing inputs', Z)
self.num_inducing = Z.shape[0]
- GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
+ GP.__init__(self, X, Y, kernel, likelihood, mean_function, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
logger.info("Adding Z as parameter")
self.link_parameter(self.Z, index=0)
@@ -63,6 +63,14 @@ class SparseGP(GP):
def has_uncertain_inputs(self):
return isinstance(self.X, VariationalPosterior)
+ def set_Z(self, Z, trigger_update=True):
+ if trigger_update: self.update_model(False)
+ self.unlink_parameter(self.Z)
+ self.Z = Param('inducing inputs',Z)
+ self.link_parameter(self.Z, index=0)
+ if trigger_update: self.update_model(True)
+ if trigger_update: self._trigger_params_changed()
+
def parameters_changed(self):
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)
@@ -103,15 +111,15 @@ class SparseGP(GP):
def _raw_predict(self, Xnew, full_cov=False, kern=None):
"""
- Make a prediction for the latent function values.
-
+ Make a prediction for the latent function values.
+
For certain inputs we give back a full_cov of shape NxN,
if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is off,
we take only the diagonal elements across N.
For uncertain inputs, the SparseGP bound produces a full covariance structure across D, so for full_cov we
return a NxDxD matrix and in the not full_cov case, we return the diagonal elements across D (NxD).
- This is for both with and without missing data.
+ This is for both with and without missing data. See the missing-data SparseGP implementation in :py:class:`~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch`.
"""
if kern is None: kern = self.kern
@@ -128,7 +136,16 @@ class SparseGP(GP):
var = var
else:
Kxx = kern.Kdiag(Xnew)
- var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
+ if self.posterior.woodbury_inv.ndim == 2:
+ var = Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0)
+ elif self.posterior.woodbury_inv.ndim == 3:
+ var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
+ for i in range(var.shape[1]):
+ var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
+ var = var
+ #add in the mean function
+ if self.mean_function is not None:
+ mu += self.mean_function.f(Xnew)
else:
psi0_star = self.kern.psi0(self.Z, Xnew)
psi1_star = self.kern.psi1(self.Z, Xnew)
@@ -158,4 +175,5 @@ class SparseGP(GP):
var[i] = var_
else:
var[i] = np.diag(var_)+p0-t2
+
return mu, var
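The new `ndim` branches above distinguish a shared Woodbury inverse (2-d, one set of outputs) from a per-output one (3-d). Either way the diagonal predictive variance is diag(Kxx) - diag(Kx^T W Kx), computed column-wise without forming the full NxN matrix. A NumPy check of the 2-d expression (made-up shapes, M inducing points and N test points; stand-in arrays, not the GPy objects):

    import numpy as np

    M, N = 4, 6
    rng = np.random.default_rng(0)
    A = rng.standard_normal((M, M))
    W = A @ A.T                          # stand-in for posterior.woodbury_inv (MxM)
    Kx = rng.standard_normal((M, N))     # stand-in for kern.K(Z, Xnew)
    Kxx = np.ones(N)                     # stand-in for kern.Kdiag(Xnew)

    var = Kxx - np.sum(np.dot(W.T, Kx) * Kx, 0)   # the expression used above
    assert np.allclose(var, Kxx - np.diag(Kx.T @ W @ Kx))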
diff --git a/GPy/core/sparse_gp_mpi.py b/GPy/core/sparse_gp_mpi.py
index 15d3ad76..28de3124 100644
--- a/GPy/core/sparse_gp_mpi.py
+++ b/GPy/core/sparse_gp_mpi.py
@@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
-from sparse_gp import SparseGP
+from .sparse_gp import SparseGP
from numpy.linalg.linalg import LinAlgError
from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
@@ -56,7 +56,7 @@ class SparseGP_MPI(SparseGP):
self.N_range = (N_start, N_end)
self.N_list = np.array(N_list)
self.Y_local = self.Y[N_start:N_end]
- print 'MPI RANK '+str(self.mpi_comm.rank)+' with the data range '+str(self.N_range)
+ print('MPI RANK '+str(self.mpi_comm.rank)+' with the data range '+str(self.N_range))
mpi_comm.Bcast(self.param_array, root=0)
self.update_model(True)
diff --git a/GPy/core/svgp.py b/GPy/core/svgp.py
index 42044b1b..06a9749c 100644
--- a/GPy/core/svgp.py
+++ b/GPy/core/svgp.py
@@ -3,13 +3,13 @@
import numpy as np
from ..util import choleskies
-from sparse_gp import SparseGP
-from parameterization.param import Param
+from .sparse_gp import SparseGP
+from .parameterization.param import Param
from ..inference.latent_function_inference import SVGP as svgp_inf
class SVGP(SparseGP):
- def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None, batchsize=None):
+ def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None):
"""
Stochastic Variational GP.
@@ -25,25 +25,20 @@ class SVGP(SparseGP):
Hensman, Matthews and Ghahramani, Scalable Variational GP Classification, ArXiv 1411.2005
"""
- if batchsize is None:
- batchsize = X.shape[0]
-
- self.X_all, self.Y_all = X, Y
- # how to rescale the batch likelihood in case of minibatches
self.batchsize = batchsize
- batch_scale = float(self.X_all.shape[0])/float(self.batchsize)
- #KL_scale = 1./np.float64(self.mpi_comm.size)
- KL_scale = 1.0
-
- import climin.util
- #Make a climin slicer to make drawing minibatches much quicker. Annoyingly, this doesn;t pickle.
- self.slicer = climin.util.draw_mini_slices(self.X_all.shape[0], self.batchsize)
- X_batch, Y_batch = self.new_batch()
+ self.X_all, self.Y_all = X, Y
+ if batchsize is None:
+ X_batch, Y_batch = X, Y
+ else:
+ import climin.util
+ #Make a climin slicer to make drawing minibatches much quicker
+ self.slicer = climin.util.draw_mini_slices(self.X_all.shape[0], self.batchsize)
+ X_batch, Y_batch = self.new_batch()
#create the SVI inference method
inf_method = svgp_inf()
- SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
+ SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method,
name=name, Y_metadata=Y_metadata, normalizer=False)
self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
@@ -53,23 +48,31 @@ class SVGP(SparseGP):
self.link_parameter(self.m)
def parameters_changed(self):
- self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
+ self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.mean_function, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
#update the kernel gradients
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
grad = self.kern.gradient.copy()
self.kern.update_gradients_full(self.grad_dict['dL_dKmn'], self.Z, self.X)
- grad += self.kern.gradient
+ grad += self.kern.gradient.copy()
self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
self.kern.gradient += grad
if not self.Z.is_fixed:# only compute these expensive gradients if we need them
self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) + self.kern.gradients_X(self.grad_dict['dL_dKmn'], self.Z, self.X)
+
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
#update the variational parameter gradients:
self.m.gradient = self.grad_dict['dL_dm']
self.chol.gradient = self.grad_dict['dL_dchol']
+ if self.mean_function is not None:
+ self.mean_function.update_gradients(self.grad_dict['dL_dmfX'], self.X)
+ g = self.mean_function.gradient[:].copy()
+ self.mean_function.update_gradients(self.grad_dict['dL_dmfZ'], self.Z)
+ self.mean_function.gradient[:] += g
+ self.Z.gradient[:] += self.mean_function.gradients_X(self.grad_dict['dL_dmfZ'], self.Z)
+
def set_data(self, X, Y):
"""
Set the data without calling parameters_changed to avoid wasted computation
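The constructor rework above sets up the minibatch machinery only when `batchsize` is given, and `parameters_changed` rescales the batch likelihood at inference time via `batch_scale = N_total / N_batch`, which keeps the stochastic bound an unbiased estimate of the full-data objective. The scaling in miniature (made-up numbers, not GPy code):

    import numpy as np

    N_total, N_batch = 1000, 50
    rng = np.random.default_rng(1)
    per_point_terms = rng.standard_normal(N_batch)   # stand-in per-datum likelihood terms

    batch_scale = float(N_total) / float(N_batch)
    estimate = batch_scale * per_point_terms.sum()
    # averaged over uniformly drawn minibatches, this estimates the full-data sum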
diff --git a/GPy/core/symbolic.py b/GPy/core/symbolic.py
index ed3a9d59..4a9fcb76 100644
--- a/GPy/core/symbolic.py
+++ b/GPy/core/symbolic.py
@@ -223,7 +223,7 @@ class Symbolic_core():
def code_gradients_cacheable(self, function, variable):
if variable not in self.cacheable:
- raise RuntimeError, variable + ' must be a cacheable.'
+ raise RuntimeError(variable + ' must be a cacheable.')
lcode = 'gradients_' + variable + ' = np.zeros_like(' + variable + ')\n'
lcode += 'self.update_cache(' + ', '.join(self.cacheable) + ')\n'
for i, theta in enumerate(self.variables[variable]):
diff --git a/GPy/core/verbose_optimization.py b/GPy/core/verbose_optimization.py
index 1a87b3da..a5fb019e 100644
--- a/GPy/core/verbose_optimization.py
+++ b/GPy/core/verbose_optimization.py
@@ -1,7 +1,7 @@
# Copyright (c) 2012-2014, Max Zwiessele.
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
+from __future__ import print_function
import numpy as np
import sys
import time
@@ -11,7 +11,7 @@ def exponents(fnow, current_grad):
return np.sign(exps) * np.log10(exps).astype(int)
class VerboseOptimization(object):
- def __init__(self, model, opt, maxiters, verbose=False, current_iteration=0, ipython_notebook=True):
+ def __init__(self, model, opt, maxiters, verbose=False, current_iteration=0, ipython_notebook=True, clear_after_finish=False):
self.verbose = verbose
if self.verbose:
self.model = model
@@ -22,55 +22,59 @@ class VerboseOptimization(object):
self.opt_name = opt.opt_name
self.model.add_observer(self, self.print_status)
self.status = 'running'
+ self.clear = clear_after_finish
self.update()
try:
from IPython.display import display
- from IPython.html.widgets import FloatProgressWidget, HTMLWidget, ContainerWidget
- self.text = HTMLWidget()
- self.progress = FloatProgressWidget()
- self.model_show = HTMLWidget()
+ from IPython.html.widgets import IntProgress, HTML, Box, VBox, HBox, FlexBox
+ self.text = HTML(width='100%')
+ self.progress = IntProgress(min=0, max=maxiters)
+ #self.progresstext = Text(width='100%', disabled=True, value='0/{}'.format(maxiters))
+ self.model_show = HTML()
self.ipython_notebook = ipython_notebook
except:
# Not in Ipython notebook
self.ipython_notebook = False
if self.ipython_notebook:
- self.text.set_css('width', '100%')
- #self.progress.set_css('width', '100%')
+ left_col = VBox(children=[self.progress, self.text], padding=2, width='40%')
+ right_col = Box(children=[self.model_show], padding=2, width='60%')
+ self.hor_align = FlexBox(children = [left_col, right_col], width='100%', orientation='horizontal')
- left_col = ContainerWidget(children = [self.progress, self.text])
- right_col = ContainerWidget(children = [self.model_show])
- hor_align = ContainerWidget(children = [left_col, right_col])
+ display(self.hor_align)
+
+ try:
+ self.text.set_css('width', '100%')
+ left_col.set_css({
+ 'padding': '2px',
+ 'width': "100%",
+ })
+
+ right_col.set_css({
+ 'padding': '2px',
+ })
+
+ self.hor_align.set_css({
+ 'width': "100%",
+ })
- display(hor_align)
+ self.hor_align.remove_class('vbox')
+ self.hor_align.add_class('hbox')
+
+ left_col.add_class("box-flex1")
+ right_col.add_class('box-flex0')
- left_col.set_css({
- 'padding': '2px',
- 'width': "100%",
- })
-
- right_col.set_css({
- 'padding': '2px',
- })
-
- hor_align.set_css({
- 'width': "100%",
- })
-
- hor_align.remove_class('vbox')
- hor_align.add_class('hbox')
-
- left_col.add_class("box-flex1")
- right_col.add_class('box-flex0')
+ except:
+ pass
#self.text.add_class('box-flex2')
#self.progress.add_class('box-flex1')
else:
self.exps = exponents(self.fnow, self.current_gradient)
- print 'Running {} Code:'.format(self.opt_name)
- print ' {3:7s} {0:{mi}s} {1:11s} {2:11s}'.format("i", "f", "|g|", "secs", mi=self.len_maxiters)
+ print('Running {} Code:'.format(self.opt_name))
+ print(' {3:7s} {0:{mi}s} {1:11s} {2:11s}'.format("i", "f", "|g|", "secs", mi=self.len_maxiters))
def __enter__(self):
self.start = time.time()
@@ -102,7 +106,8 @@ class VerboseOptimization(object):
html_body += "{} | ".format(val)
html_body += ""
self.text.value = html_begin + html_body + html_end
- self.progress.value = 100*(self.iteration+1)/self.maxiters
+ self.progress.value = (self.iteration+1)
+ #self.progresstext.value = '0/{}'.format((self.iteration+1))
self.model_show.value = self.model._repr_html_()
else:
n_exps = exponents(self.fnow, self.current_gradient)
@@ -111,11 +116,11 @@ class VerboseOptimization(object):
b = np.any(n_exps < self.exps)
if a or b:
self.p_iter = self.iteration
- print ''
+ print('')
if b:
self.exps = n_exps
- print '\r',
- print '{3:> 7.2g} {0:>0{mi}g} {1:> 12e} {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), time.time()-self.start, mi=self.len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
+ print('\r', end=' ')
+ print('{3:> 7.2g} {0:>0{mi}g} {1:> 12e} {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), time.time()-self.start, mi=self.len_maxiters), end=' ') # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
sys.stdout.flush()
def print_status(self, me, which=None):
@@ -144,7 +149,9 @@ class VerboseOptimization(object):
self.print_out()
if not self.ipython_notebook:
- print ''
- print 'Optimization finished in {0:.5g} Seconds'.format(self.stop-self.start)
- print 'Optimization status: {0:.5g}'.format(self.status)
- print
+ print()
+ print('Optimization finished in {0:.5g} Seconds'.format(self.stop-self.start))
+ print('Optimization status: {0}'.format(self.status))
+ print()
+ elif self.clear:
+ self.hor_align.close()
diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py
index 968333e0..4e9e984e 100644
--- a/GPy/examples/__init__.py
+++ b/GPy/examples/__init__.py
@@ -1,7 +1,7 @@
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-import classification
-import regression
-import dimensionality_reduction
-import non_gaussian
+from . import classification
+from . import regression
+from . import dimensionality_reduction
+from . import non_gaussian
diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index b3780073..d4518f24 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -15,7 +15,7 @@ def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
"""
try:import pods
- except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
data = pods.datasets.oil()
X = data['X']
Xtest = data['Xtest']
@@ -52,7 +52,7 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
"""
try:import pods
- except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
data = pods.datasets.toy_linear_1d_classification(seed=seed)
Y = data['Y'][:, 0:1]
Y[Y.flatten() == -1] = 0
@@ -75,7 +75,7 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
m.plot_f(ax=axes[0])
m.plot(ax=axes[1])
- print m
+ print(m)
return m
def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=True):
@@ -88,7 +88,7 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
"""
try:import pods
- except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
data = pods.datasets.toy_linear_1d_classification(seed=seed)
Y = data['Y'][:, 0:1]
Y[Y.flatten() == -1] = 0
@@ -114,7 +114,7 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
m.plot_f(ax=axes[0])
m.plot(ax=axes[1])
- print m
+ print(m)
return m
def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, optimize=True, plot=True):
@@ -127,7 +127,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
"""
try:import pods
- except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
data = pods.datasets.toy_linear_1d_classification(seed=seed)
Y = data['Y'][:, 0:1]
Y[Y.flatten() == -1] = 0
@@ -147,7 +147,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
m.plot_f(ax=axes[0])
m.plot(ax=axes[1])
- print m
+ print(m)
return m
def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
@@ -160,7 +160,7 @@ def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
"""
try:import pods
- except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
data = pods.datasets.toy_linear_1d_classification(seed=seed)
Y = data['Y'][:, 0:1]
Y[Y.flatten() == -1] = 0
@@ -177,7 +177,7 @@ def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
# Parameters optimization:
for _ in range(5):
m.optimize(max_iters=int(max_iters/5))
- print m
+ print(m)
# Plot
if plot:
@@ -186,7 +186,7 @@ def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
m.plot_f(ax=axes[0])
m.plot(ax=axes[1])
- print m
+ print(m)
return m
def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=None, optimize=True, plot=True):
@@ -202,7 +202,7 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
:type kernel: a GPy kernel
"""
try:import pods
- except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
data = pods.datasets.crescent_data(seed=seed)
Y = data['Y']
Y[Y.flatten()==-1] = 0
@@ -224,5 +224,5 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
if plot:
m.plot()
- print m
+ print(m)
return m
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index df9093a2..46107a71 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -333,7 +333,7 @@ def bgplvm_simulation(optimize=True, verbose=1,
m.likelihood.variance = .1
if optimize:
- print "Optimizing model:"
+ print("Optimizing model:")
m.optimize('bfgs', messages=verbose, max_iters=max_iters,
gtol=.05)
if plot:
@@ -358,7 +358,7 @@ def ssgplvm_simulation(optimize=True, verbose=1,
m.likelihood.variance = .1
if optimize:
- print "Optimizing model:"
+ print("Optimizing model:")
m.optimize('scg', messages=verbose, max_iters=max_iters,
gtol=.05)
if plot:
@@ -388,7 +388,7 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
m.Yreal = Y
if optimize:
- print "Optimizing model:"
+ print("Optimizing model:")
m.optimize('bfgs', messages=verbose, max_iters=max_iters,
gtol=.05)
if plot:
@@ -411,7 +411,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
m['.*noise'] = [Y.var() / 40. for Y in Ylist]
if optimize:
- print "Optimizing Model:"
+ print("Optimizing Model:")
m.optimize(messages=verbose, max_iters=8e3)
if plot:
m.X.plot("MRD Latent Space 1D")
@@ -439,7 +439,7 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim
initx="random", initz='permute', **kw)
if optimize:
- print "Optimizing Model:"
+ print("Optimizing Model:")
m.optimize('bfgs', messages=verbose, max_iters=8e3, gtol=.1)
if plot:
m.X.plot("MRD Latent Space 1D")
@@ -603,7 +603,7 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
try:
if optimize: m.optimize('bfgs', messages=verbose, max_iters=5e3, bfgs_factor=10)
except KeyboardInterrupt:
- print "Keyboard interrupt, continuing to plot and return"
+ print("Keyboard interrupt, continuing to plot and return")
if plot:
fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
@@ -653,7 +653,7 @@ def ssgplvm_simulation_linear():
def sample_X(Q, pi):
x = np.empty(Q)
dies = np.random.rand(Q)
- for q in xrange(Q):
+ for q in range(Q):
if dies[q] < pi:
x[q] = np.random.randn()
else:
@@ -663,7 +663,7 @@ def ssgplvm_simulation_linear():
Y = np.empty((N, D))
X = np.empty((N, Q))
# Generate data from random sampled weight matrices
- for n in xrange(N):
+ for n in range(N):
X[n] = sample_X(Q, pi)
w = np.random.randn(D, Q)
Y[n] = np.dot(w, X[n])
diff --git a/GPy/examples/non_gaussian.py b/GPy/examples/non_gaussian.py
index ddac8813..3652b4d3 100644
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@@ -37,7 +37,7 @@ def student_t_approx(optimize=True, plot=True):
#Add student t random noise to datapoints
deg_free = 1
- print "Real noise: ", real_std
+ print("Real noise: ", real_std)
initial_var_guess = 0.5
edited_real_sd = initial_var_guess
@@ -73,7 +73,7 @@ def student_t_approx(optimize=True, plot=True):
m4['.*t_scale2'].constrain_bounded(1e-6, 10.)
m4['.*white'].constrain_fixed(1e-5)
m4.randomize()
- print m4
+ print(m4)
debug=True
if debug:
m4.optimize(messages=1)
@@ -81,18 +81,18 @@ def student_t_approx(optimize=True, plot=True):
pb.plot(m4.X, m4.inference_method.f_hat)
pb.plot(m4.X, m4.Y, 'rx')
m4.plot()
- print m4
+ print(m4)
return m4
if optimize:
optimizer='scg'
- print "Clean Gaussian"
+ print("Clean Gaussian")
m1.optimize(optimizer, messages=1)
- print "Corrupt Gaussian"
+ print("Corrupt Gaussian")
m2.optimize(optimizer, messages=1)
- print "Clean student t"
+ print("Clean student t")
m3.optimize(optimizer, messages=1)
- print "Corrupt student t"
+ print("Corrupt student t")
m4.optimize(optimizer, messages=1)
if plot:
@@ -151,7 +151,7 @@ def boston_example(optimize=True, plot=True):
for n, (train, test) in enumerate(kf):
X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
- print "Fold {}".format(n)
+ print("Fold {}".format(n))
noise = 1e-1 #np.exp(-2)
rbf_len = 0.5
@@ -163,21 +163,21 @@ def boston_example(optimize=True, plot=True):
score_folds[0, n] = rmse(Y_test, np.mean(Y_train))
#Gaussian GP
- print "Gauss GP"
+ print("Gauss GP")
mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
mgp.constrain_fixed('.*white', 1e-5)
mgp['.*len'] = rbf_len
mgp['.*noise'] = noise
- print mgp
+ print(mgp)
if optimize:
mgp.optimize(optimizer=optimizer, messages=messages)
Y_test_pred = mgp.predict(X_test)
score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test))
- print mgp
- print pred_density
+ print(mgp)
+ print(pred_density)
- print "Gaussian Laplace GP"
+ print("Gaussian Laplace GP")
N, D = Y_train.shape
g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
@@ -186,18 +186,18 @@ def boston_example(optimize=True, plot=True):
mg.constrain_fixed('.*white', 1e-5)
mg['rbf_len'] = rbf_len
mg['noise'] = noise
- print mg
+ print(mg)
if optimize:
mg.optimize(optimizer=optimizer, messages=messages)
Y_test_pred = mg.predict(X_test)
score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
pred_density[2, n] = np.mean(mg.log_predictive_density(X_test, Y_test))
- print pred_density
- print mg
+ print(pred_density)
+ print(mg)
for stu_num, df in enumerate(degrees_freedoms):
#Student T
- print "Student-T GP {}df".format(df)
+ print("Student-T GP {}df".format(df))
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise)
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
@@ -205,14 +205,14 @@ def boston_example(optimize=True, plot=True):
mstu_t.constrain_bounded('.*t_scale2', 0.0001, 1000)
mstu_t['rbf_len'] = rbf_len
mstu_t['.*t_scale2'] = noise
- print mstu_t
+ print(mstu_t)
if optimize:
mstu_t.optimize(optimizer=optimizer, messages=messages)
Y_test_pred = mstu_t.predict(X_test)
score_folds[3+stu_num, n] = rmse(Y_test, Y_test_pred[0])
pred_density[3+stu_num, n] = np.mean(mstu_t.log_predictive_density(X_test, Y_test))
- print pred_density
- print mstu_t
+ print(pred_density)
+ print(mstu_t)
if plot:
plt.figure()
@@ -230,8 +230,8 @@ def boston_example(optimize=True, plot=True):
plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
plt.title('Stu t {}df'.format(df))
- print "Average scores: {}".format(np.mean(score_folds, 1))
- print "Average pred density: {}".format(np.mean(pred_density, 1))
+ print("Average scores: {}".format(np.mean(score_folds, 1)))
+ print("Average pred density: {}".format(np.mean(pred_density, 1)))
if plot:
#Plotting
diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index 37a18f63..267c6d1e 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -15,7 +15,7 @@ def olympic_marathon_men(optimize=True, plot=True):
"""Run a standard Gaussian process regression on the Olympic marathon data."""
try:import pods
except ImportError:
- print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ print('pods unavailable, see https://github.com/sods/ods for example datasets')
return
data = pods.datasets.olympic_marathon_men()
@@ -88,7 +88,7 @@ def epomeo_gpx(max_iters=200, optimize=True, plot=True):
"""
try:import pods
except ImportError:
- print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ print('pods unavailable, see https://github.com/sods/ods for example datasets')
return
data = pods.datasets.epomeo_gpx()
num_data_list = []
@@ -135,7 +135,7 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
try:import pods
except ImportError:
- print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ print('pods unavailable, see https://github.com/sods/ods for example datasets')
return
data = pods.datasets.della_gatta_TRP63_gene_expression(data_set='della_gatta',gene_number=gene_number)
# data['Y'] = data['Y'][0::2, :]
@@ -219,7 +219,7 @@ def olympic_100m_men(optimize=True, plot=True):
"""Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
try:import pods
except ImportError:
- print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ print('pods unavailable, see https://github.com/sods/ods for example datasets')
return
data = pods.datasets.olympic_100m_men()
@@ -240,7 +240,7 @@ def toy_rbf_1d(optimize=True, plot=True):
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
try:import pods
except ImportError:
- print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ print('pods unavailable, see https://github.com/sods/ods for example datasets')
return
data = pods.datasets.toy_rbf_1d()
@@ -258,7 +258,7 @@ def toy_rbf_1d_50(optimize=True, plot=True):
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
try:import pods
except ImportError:
- print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ print('pods unavailable, see https://github.com/sods/ods for example datasets')
return
data = pods.datasets.toy_rbf_1d_50()
@@ -377,7 +377,7 @@ def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
"""Predict the location of a robot given wirelss signal strength readings."""
try:import pods
except ImportError:
- print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ print('pods unavailable, see https://github.com/sods/ods for example datasets')
return
data = pods.datasets.robot_wireless()
@@ -398,14 +398,14 @@ def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
sse = ((data['Xtest'] - Xpredict)**2).sum()
- print('Sum of squares error on test data: ' + str(sse))
+ print('Sum of squares error on test data: ' + str(sse))
return m
def silhouette(max_iters=100, optimize=True, plot=True):
"""Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
try:import pods
except ImportError:
- print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+ print('pods unavailable, see https://github.com/sods/ods for example datasets')
return
data = pods.datasets.silhouette()
@@ -416,7 +416,7 @@ def silhouette(max_iters=100, optimize=True, plot=True):
if optimize:
m.optimize(messages=True, max_iters=max_iters)
- print m
+ print(m)
return m
def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True, checkgrad=False):
@@ -468,7 +468,7 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, opt
if plot:
m.plot()
- print m
+ print(m)
return m
def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
@@ -492,7 +492,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
if plot:
m.plot(ax=axes[0])
axes[0].set_title('no input uncertainty')
- print m
+ print(m)
# the same Model with uncertainty
m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.RBF(1), Z=Z, X_variance=S)
@@ -503,5 +503,50 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
axes[1].set_title('with input uncertainty')
fig.canvas.draw()
- print m
+ print(m)
return m
+
+def simple_mean_function(max_iters=100, optimize=True, plot=True):
+ """
+ The simplest possible mean function. No parameters, just a fixed sinusoid.
+ """
+ #create simple mean function
+ mf = GPy.core.Mapping(1,1)
+ mf.f = np.sin
+ mf.update_gradients = lambda a,b: None
+
+ X = np.linspace(0,10,50).reshape(-1,1)
+ Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
+
+ k = GPy.kern.RBF(1)
+ lik = GPy.likelihoods.Gaussian()
+ m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+ if optimize:
+ m.optimize(max_iters=max_iters)
+ if plot:
+ m.plot(plot_limits=(-10,15))
+ return m
+
+def parametric_mean_function(max_iters=100, optimize=True, plot=True):
+ """
+ A linear mean function with parameters that we'll learn alongside the kernel
+ """
+
+ X = np.linspace(0,10,50).reshape(-1,1)
+ Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X
+
+ #create a linear mean function whose parameters are learned alongside the kernel
+ mf = GPy.mappings.Linear(1,1)
+
+ k = GPy.kern.RBF(1)
+ lik = GPy.likelihoods.Gaussian()
+ m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+ if optimize:
+ m.optimize(max_iters=max_iters)
+ if plot:
+ m.plot()
+ return m
+
+
diff --git a/GPy/inference/__init__.py b/GPy/inference/__init__.py
index 7b1307e3..c5044582 100644
--- a/GPy/inference/__init__.py
+++ b/GPy/inference/__init__.py
@@ -1,3 +1,3 @@
-import latent_function_inference
-import optimization
-import mcmc
+from . import latent_function_inference
+from . import optimization
+from . import mcmc
diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py
index 67f57638..6754000d 100644
--- a/GPy/inference/latent_function_inference/__init__.py
+++ b/GPy/inference/latent_function_inference/__init__.py
@@ -50,26 +50,26 @@ class InferenceMethodList(LatentFunctionInference, list):
def on_optimization_end(self):
for inf in self:
inf.on_optimization_end()
-
+
def __getstate__(self):
state = []
for inf in self:
state.append(inf)
return state
-
+
def __setstate__(self, state):
for inf in state:
self.append(inf)
-from exact_gaussian_inference import ExactGaussianInference
-from laplace import Laplace
+from .exact_gaussian_inference import ExactGaussianInference
+from .laplace import Laplace,LaplaceBlock
from GPy.inference.latent_function_inference.var_dtc import VarDTC
-from expectation_propagation import EP
-from expectation_propagation_dtc import EPDTC
-from dtc import DTC
-from fitc import FITC
-from var_dtc_parallel import VarDTC_minibatch
-from svgp import SVGP
+from .expectation_propagation import EP
+from .expectation_propagation_dtc import EPDTC
+from .dtc import DTC
+from .fitc import FITC
+from .var_dtc_parallel import VarDTC_minibatch
+from .svgp import SVGP
# class FullLatentFunctionData(object):
#
@@ -78,9 +78,9 @@ from svgp import SVGP
# class EMLikeLatentFunctionInference(LatentFunctionInference):
# def update_approximation(self):
# """
-# This function gets called when the
+# This function gets called when the
# """
-#
+#
# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
# """
# Do inference on the latent functions given a covariance function `kern`,
@@ -88,7 +88,7 @@ from svgp import SVGP
# Additional metadata for the outputs `Y` can be given in `Y_metadata`.
# """
# raise NotImplementedError, "Abstract base class for full inference"
-#
+#
# class VariationalLatentFunctionInference(LatentFunctionInference):
# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
# """
diff --git a/GPy/inference/latent_function_inference/dtc.py b/GPy/inference/latent_function_inference/dtc.py
index 5590a079..0aa990c1 100644
--- a/GPy/inference/latent_function_inference/dtc.py
+++ b/GPy/inference/latent_function_inference/dtc.py
@@ -1,7 +1,7 @@
# Copyright (c) 2012-2014, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from posterior import Posterior
+from .posterior import Posterior
from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
import numpy as np
from . import LatentFunctionInference
@@ -20,7 +20,8 @@ class DTC(LatentFunctionInference):
def __init__(self):
self.const_jitter = 1e-6
- def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+ def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+ assert mean_function is None, "inference with a mean function not implemented"
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
num_inducing, _ = Z.shape
@@ -29,7 +30,7 @@ class DTC(LatentFunctionInference):
#make sure the noise is not hetero
beta = 1./likelihood.gaussian_variance(Y_metadata)
if beta.size > 1:
- raise NotImplementedError, "no hetero noise with this implementation of DTC"
+ raise NotImplementedError("no hetero noise with this implementation of DTC")
Kmm = kern.K(Z)
Knn = kern.Kdiag(X)
@@ -88,7 +89,8 @@ class vDTC(object):
def __init__(self):
self.const_jitter = 1e-6
- def inference(self, kern, X, X_variance, Z, likelihood, Y, Y_metadata):
+ def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+ assert mean_function is None, "inference with a mean function not implemented"
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
num_inducing, _ = Z.shape
@@ -97,7 +99,7 @@ class vDTC(object):
#make sure the noise is not hetero
beta = 1./likelihood.gaussian_variance(Y_metadata)
if beta.size > 1:
- raise NotImplementedError, "no hetero noise with this implementation of DTC"
+ raise NotImplementedError("no hetero noise with this implementation of DTC")
Kmm = kern.K(Z)
Knn = kern.Kdiag(X)
diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
index 1312d36a..76b10f08 100644
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@@ -1,7 +1,7 @@
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from posterior import Posterior
+from .posterior import Posterior
from ...util.linalg import pdinv, dpotrs, tdot
from ...util import diag
import numpy as np
@@ -36,11 +36,18 @@ class ExactGaussianInference(LatentFunctionInference):
#print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!"
return Y
- def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+ def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
"""
Returns a Posterior class containing essential quantities of the posterior
"""
- YYT_factor = self.get_YYTfactor(Y)
+
+ if mean_function is None:
+ m = 0
+ else:
+ m = mean_function.f(X)
+
+
+ YYT_factor = self.get_YYTfactor(Y-m)
K = kern.K(X)
@@ -56,4 +63,18 @@ class ExactGaussianInference(LatentFunctionInference):
dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK),Y_metadata)
- return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
+ return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
+
+ def LOO(self, kern, X, Y, likelihood, posterior, Y_metadata=None, K=None):
+ """
+ Leave one out error as found in
+ "Bayesian leave-one-out cross-validation approximations for Gaussian latent variable models"
+ Vehtari et al. 2014.
+ """
+ g = posterior.woodbury_vector
+ c = posterior.woodbury_inv
+ c_diag = np.diag(c)[:, None]
+ neg_log_marginal_LOO = 0.5*np.log(2*np.pi) - 0.5*np.log(c_diag) + 0.5*(g**2)/c_diag
+ #following "Predictive Approaches for Choosing Hyperparameters in Gaussian Processes",
+ #this quantity is the negative log LOO predictive density, so return its negation
+ return -neg_log_marginal_LOO
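The quantity above follows Vehtari et al. (2014) for the Gaussian case: with g = K^{-1} y (the Woodbury vector) and c_ii the diagonal of K^{-1}, the leave-one-out posterior for point i has mean y_i - g_i/c_ii and variance 1/c_ii, giving log p(y_i | y_-i) = -0.5*log(2*pi) + 0.5*log(c_ii) - g_i^2/(2*c_ii). A standalone sketch of the same computation (hypothetical dense K with the noise folded in; not the GPy posterior objects):

    import numpy as np

    def loo_log_pred_density(K, y, noise_var):
        # Gaussian-likelihood LOO from a single matrix inverse, as in LOO() above
        Ki = np.linalg.inv(K + noise_var * np.eye(K.shape[0]))
        g = Ki @ y                    # plays the role of posterior.woodbury_vector
        c = np.diag(Ki)[:, None]      # plays the role of diag(posterior.woodbury_inv)
        return -0.5 * np.log(2 * np.pi) + 0.5 * np.log(c) - 0.5 * g**2 / c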
diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py
index 26144974..85841a33 100644
--- a/GPy/inference/latent_function_inference/expectation_propagation.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation.py
@@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs
-from posterior import Posterior
+from .posterior import Posterior
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
@@ -33,15 +33,19 @@ class EP(LatentFunctionInference):
# TODO: update approximation in the end as well? Maybe even with a switch?
pass
- def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None):
+ def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None):
+ assert mean_function is None, "inference with a mean function not implemented"
num_data, output_dim = Y.shape
assert output_dim ==1, "ep in 1D only (for now!)"
K = kern.K(X)
if self._ep_approximation is None:
+
+ #if we don't yet have the results of running EP, run EP and store the computed factors in self._ep_approximation
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
else:
+ #if we've already run EP, just use the existing approximation stored in self._ep_approximation
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation
Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde))
diff --git a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
index 35b1b7dc..e182c9f7 100644
--- a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
@@ -6,7 +6,7 @@ from ...util import diag
from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify, DSYR
from ...core.parameterization.variational import VariationalPosterior
from . import LatentFunctionInference
-from posterior import Posterior
+from .posterior import Posterior
log_2_pi = np.log(2*np.pi)
class EPDTC(LatentFunctionInference):
@@ -64,7 +64,8 @@ class EPDTC(LatentFunctionInference):
self.old_mutilde, self.old_vtilde = None, None
self._ep_approximation = None
- def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+ def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+ assert mean_function is None, "inference with a mean function not implemented"
num_data, output_dim = Y.shape
assert output_dim ==1, "ep in 1D only (for now!)"
@@ -179,7 +180,7 @@ class EPDTC(LatentFunctionInference):
if VVT_factor.shape[1] == Y.shape[1]:
woodbury_vector = Cpsi1Vf # == Cpsi1V
else:
- print 'foobar'
+ print('foobar')
psi1V = np.dot(mu_tilde[:,None].T*beta, psi1).T
tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
tmp, _ = dpotrs(LB, tmp, lower=1)
@@ -314,7 +315,7 @@ def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf,
dL_dR = None
elif het_noise:
if uncertain_inputs:
- raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
+ raise NotImplementedError("heteroscedatic derivates with uncertain inputs not implemented")
else:
#from ...util.linalg import chol_inv
#LBi = chol_inv(LB)
diff --git a/GPy/inference/latent_function_inference/fitc.py b/GPy/inference/latent_function_inference/fitc.py
index a184c6c4..f38eb52b 100644
--- a/GPy/inference/latent_function_inference/fitc.py
+++ b/GPy/inference/latent_function_inference/fitc.py
@@ -1,7 +1,7 @@
# Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from posterior import Posterior
+from .posterior import Posterior
from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
from ...util import diag
import numpy as np
@@ -18,7 +18,8 @@ class FITC(LatentFunctionInference):
"""
const_jitter = 1e-6
- def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+ def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+ assert mean_function is None, "inference with a mean function not implemented"
num_inducing, _ = Z.shape
num_data, output_dim = Y.shape
@@ -26,7 +27,7 @@ class FITC(LatentFunctionInference):
#make sure the noise is not hetero
sigma_n = likelihood.gaussian_variance(Y_metadata)
if sigma_n.size >1:
- raise NotImplementedError, "no hetero noise with this implementation of FITC"
+ raise NotImplementedError("no hetero noise with this implementation of FITC")
Kmm = kern.K(Z)
Knn = kern.Kdiag(X)
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 05711b0b..aefc82ac 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -12,13 +12,14 @@
import numpy as np
from ...util.linalg import mdot, jitchol, dpotrs, dtrtrs, dpotri, symmetrify, pdinv
-from posterior import Posterior
+from .posterior import Posterior
import warnings
def warning_on_one_line(message, category, filename, lineno, file=None, line=None):
return ' %s:%s: %s:%s\n' % (filename, lineno, category.__name__, message)
warnings.formatwarning = warning_on_one_line
from scipy import optimize
from . import LatentFunctionInference
+from scipy.integrate import quad
class Laplace(LatentFunctionInference):
@@ -39,10 +40,90 @@ class Laplace(LatentFunctionInference):
self.first_run = True
self._previous_Ki_fhat = None
- def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+ def LOO(self, kern, X, Y, likelihood, posterior, Y_metadata=None, K=None, f_hat=None, W=None, Ki_W_i=None):
+ """
+        Leave-one-out log predictive density, as found in
+ "Bayesian leave-one-out cross-validation approximations for Gaussian latent variable models"
+ Vehtari et al. 2014.
+ """
+ Ki_f_init = np.zeros_like(Y)
+
+ if K is None:
+ K = kern.K(X)
+
+ if f_hat is None:
+ f_hat, _ = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
+
+ if W is None:
+ W = -likelihood.d2logpdf_df2(f_hat, Y, Y_metadata=Y_metadata)
+
+ if Ki_W_i is None:
+ _, _, _, Ki_W_i = self._compute_B_statistics(K, W, likelihood.log_concave)
+
+ logpdf_dfhat = likelihood.dlogpdf_df(f_hat, Y, Y_metadata=Y_metadata)
+
+ if W.shape[1] == 1:
+ W = np.diagflat(W)
+
+        #Eqs 14 and 16
+ var_site = 1./np.diag(W)[:, None]
+ mu_site = f_hat + var_site*logpdf_dfhat
+ prec_site = 1./var_site
+ #Eq 19
+ marginal_cov = Ki_W_i
+ marginal_mu = marginal_cov.dot(np.diagflat(prec_site)).dot(mu_site)
+ marginal_var = np.diag(marginal_cov)[:, None]
+        #Eq 30, using site parameters instead of Gaussian site parameters
+        #(var_site instead of sigma^{2})
+ posterior_cav_var = 1./(1./marginal_var - 1./var_site)
+ posterior_cav_mean = posterior_cav_var*((1./marginal_var)*marginal_mu - (1./var_site)*Y)
+
+ flat_y = Y.flatten()
+ flat_mu = posterior_cav_mean.flatten()
+ flat_var = posterior_cav_var.flatten()
+
+ if Y_metadata is not None:
+            #Need to zip individual elements of Y_metadata as well
+            Y_metadata_flat = {}
+            for key, val in Y_metadata.items():
+                Y_metadata_flat[key] = np.atleast_1d(val).reshape(-1, 1)
+
+ zipped_values = []
+
+ for i in range(Y.shape[0]):
+ y_m = {}
+ for key, val in Y_metadata_flat.items():
+ if np.isscalar(val) or val.shape[0] == 1:
+ y_m[key] = val
+ else:
+ #Won't broadcast yet
+ y_m[key] = val[i]
+ zipped_values.append((flat_y[i], flat_mu[i], flat_var[i], y_m))
+ else:
+ #Otherwise just pass along None's
+ zipped_values = zip(flat_y, flat_mu, flat_var, [None]*Y.shape[0])
+
+ def integral_generator(yi, mi, vi, yi_m):
+ def f(fi_star):
+ #More stable in the log space
+ p_fi = np.exp(likelihood.logpdf(fi_star, yi, yi_m)
+ - 0.5*np.log(2*np.pi*vi)
+ - 0.5*np.square(mi-fi_star)/vi)
+ return p_fi
+ return f
+
+ #Eq 30
+ p_ystar, _ = zip(*[quad(integral_generator(y, m, v, yi_m), -np.inf, np.inf)
+ for y, m, v, yi_m in zipped_values])
+ p_ystar = np.array(p_ystar).reshape(-1, 1)
+ return np.log(p_ystar)
+
+ def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
"""
Returns a Posterior class containing essential quantities of the posterior
"""
+ assert mean_function is None, "inference with a mean function not implemented"
# Compute K
K = kern.K(X)
@@ -50,21 +131,25 @@ class Laplace(LatentFunctionInference):
#Find mode
if self.bad_fhat or self.first_run:
Ki_f_init = np.zeros_like(Y)
- first_run = False
+ self.first_run = False
else:
Ki_f_init = self._previous_Ki_fhat
+ Ki_f_init = np.zeros_like(Y)# FIXME: take this out
+
f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
+
self.f_hat = f_hat
- self.Ki_fhat = Ki_fhat
- self.K = K.copy()
+ #self.Ki_fhat = Ki_fhat
+ #self.K = K.copy()
+
#Compute hessian and other variables at mode
log_marginal, woodbury_inv, dL_dK, dL_dthetaL = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, kern, Y_metadata)
self._previous_Ki_fhat = Ki_fhat.copy()
return Posterior(woodbury_vector=Ki_fhat, woodbury_inv=woodbury_inv, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
- def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
+ def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None, *args, **kwargs):
"""
Rasmussen's numerically stable mode finding
For nomenclature see Rasmussen & Williams 2006
@@ -89,7 +174,12 @@ class Laplace(LatentFunctionInference):
#define the objective function (to be maximised)
def obj(Ki_f, f):
- return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
+ ll = -0.5*np.sum(np.dot(Ki_f.T, f)) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
+ if np.isnan(ll):
+ return -np.inf
+ else:
+ return ll
+
difference = np.inf
iteration = 0
@@ -104,7 +194,7 @@ class Laplace(LatentFunctionInference):
W_f = W*f
b = W_f + grad # R+W p46 line 6.
- W12BiW12, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave)
+ W12BiW12, _, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave, *args, **kwargs)
W12BiW12Kb = np.dot(W12BiW12, np.dot(K, b))
#Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
@@ -121,7 +211,9 @@ class Laplace(LatentFunctionInference):
step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
Ki_f_new = Ki_f + step*dKi_f
f_new = np.dot(K, Ki_f_new)
-
+ #print "new {} vs old {}".format(obj(Ki_f_new, f_new), obj(Ki_f, f))
+ if obj(Ki_f_new, f_new) < obj(Ki_f, f):
+ raise ValueError("Shouldn't happen, brent optimization failing")
difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
Ki_f = Ki_f_new
f = f_new
@@ -152,14 +244,10 @@ class Laplace(LatentFunctionInference):
if np.any(np.isnan(W)):
raise ValueError('One or more element(s) of W is NaN')
- K_Wi_i, L, LiW12 = self._compute_B_statistics(K, W, likelihood.log_concave)
-
- #compute vital matrices
- C = np.dot(LiW12, K)
- Ki_W_i = K - C.T.dot(C)
+ K_Wi_i, logdet_I_KW, I_KW_i, Ki_W_i = self._compute_B_statistics(K, W, likelihood.log_concave)
#compute the log marginal
- log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + np.sum(likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata)) - np.sum(np.log(np.diag(L)))
+ log_marginal = -0.5*np.sum(np.dot(Ki_f.T, f_hat)) + np.sum(likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata)) - 0.5*logdet_I_KW
# Compute matrices for derivatives
dW_df = -likelihood.d3logpdf_df3(f_hat, Y, Y_metadata=Y_metadata) # -d3lik_d3fhat
@@ -196,23 +284,23 @@ class Laplace(LatentFunctionInference):
dL_dthetaL = np.zeros(num_params)
for thetaL_i in range(num_params):
#Explicit
- dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
+ dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i,:, :])
# The + comes from the fact that dlik_hess_dthetaL == -dW_dthetaL
- + 0.5*np.sum(np.diag(Ki_W_i).flatten()*dlik_hess_dthetaL[:, thetaL_i].flatten())
+ + 0.5*np.sum(np.diag(Ki_W_i)*np.squeeze(dlik_hess_dthetaL[thetaL_i, :, :]))
)
#Implicit
- dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[:, thetaL_i])
- #dfhat_dthetaL = mdot(Ki_W_i, dlik_grad_dthetaL[:, thetaL_i])
+ dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[thetaL_i, :, :])
+ #dfhat_dthetaL = mdot(Ki_W_i, dlik_grad_dthetaL[thetaL_i, :, :])
dL_dthetaL_imp = np.dot(dL_dfhat.T, dfhat_dthetaL)
- dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
+ dL_dthetaL[thetaL_i] = np.sum(dL_dthetaL_exp + dL_dthetaL_imp)
else:
dL_dthetaL = np.zeros(likelihood.size)
return log_marginal, K_Wi_i, dL_dK, dL_dthetaL
- def _compute_B_statistics(self, K, W, log_concave):
+ def _compute_B_statistics(self, K, W, log_concave, *args, **kwargs):
"""
Rasmussen suggests the use of a numerically stable positive definite matrix B
Which has a positive diagonal elements and can be easily inverted
@@ -225,7 +313,7 @@ class Laplace(LatentFunctionInference):
"""
if not log_concave:
#print "Under 1e-10: {}".format(np.sum(W < 1e-6))
- W[W<1e-6] = 1e-6
+ W = np.clip(W, 1e-6, 1e+30)
# NOTE: when setting a parameter inside parameters_changed it will allways come to closed update circles!!!
#W.__setitem__(W < 1e-6, 1e-6, update=False) # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
# If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@@ -247,5 +335,160 @@ class Laplace(LatentFunctionInference):
#K_Wi_i_2 , _= dpotri(L2)
#symmetrify(K_Wi_i_2)
- return K_Wi_i, L, LiW12
+ #compute vital matrices
+ C = np.dot(LiW12, K)
+ Ki_W_i = K - C.T.dot(C)
+ I_KW_i = np.eye(K.shape[0]) - np.dot(K, K_Wi_i)
+ logdet_I_KW = 2*np.sum(np.log(np.diag(L)))
+
+ return K_Wi_i, logdet_I_KW, I_KW_i, Ki_W_i
+
+class LaplaceBlock(Laplace):
+ def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None, *args, **kwargs):
+ Ki_f = Ki_f_init.copy()
+ f = np.dot(K, Ki_f)
+
+ #define the objective function (to be maximised)
+ def obj(Ki_f, f):
+ ll = -0.5*np.dot(Ki_f.T, f) + np.sum(likelihood.logpdf_sum(f, Y, Y_metadata=Y_metadata))
+ if np.isnan(ll):
+ return -np.inf
+ else:
+ return ll
+
+ difference = np.inf
+ iteration = 0
+
+ I = np.eye(K.shape[0])
+ while difference > self._mode_finding_tolerance and iteration < self._mode_finding_max_iter:
+ W = -likelihood.d2logpdf_df2(f, Y, Y_metadata=Y_metadata)
+
+ W[np.diag_indices_from(W)] = np.clip(np.diag(W), 1e-6, 1e+30)
+
+ W_f = np.dot(W, f)
+ grad = likelihood.dlogpdf_df(f, Y, Y_metadata=Y_metadata)
+
+ b = W_f + grad # R+W p46 line 6.
+ K_Wi_i, _, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave, *args, **kwargs)
+
+ #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+ #a = (I - (K+Wi)i*K)*b
+ full_step_Ki_f = np.dot(I - np.dot(K_Wi_i, K), b)
+ dKi_f = full_step_Ki_f - Ki_f
+
+ #define an objective for the line search (minimize this one)
+ def inner_obj(step_size):
+ Ki_f_trial = Ki_f + step_size*dKi_f
+ f_trial = np.dot(K, Ki_f_trial)
+ return -obj(Ki_f_trial, f_trial)
+
+ #use scipy for the line search, the compute new values of f, Ki_f
+ step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
+
+ Ki_f_new = Ki_f + step*dKi_f
+ f_new = np.dot(K, Ki_f_new)
+
+ difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
+ Ki_f = Ki_f_new
+ f = f_new
+ iteration += 1
+
+ #Warn of bad fits
+ if difference > self._mode_finding_tolerance:
+ if not self.bad_fhat:
+ warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
+ self._previous_Ki_fhat = np.zeros_like(Y)
+ self.bad_fhat = True
+ elif self.bad_fhat:
+ self.bad_fhat = False
+ warnings.warn("f_hat now fine again")
+ if iteration > self._mode_finding_max_iter:
+ warnings.warn("didn't find the best")
+
+ return f, Ki_f
+
+ def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, kern, Y_metadata):
+ #At this point get the hessian matrix (or vector as W is diagonal)
+ W = -likelihood.d2logpdf_df2(f_hat, Y, Y_metadata=Y_metadata)
+
+ W[np.diag_indices_from(W)] = np.clip(np.diag(W), 1e-6, 1e+30)
+
+ K_Wi_i, log_B_det, I_KW_i, Ki_W_i = self._compute_B_statistics(K, W, likelihood.log_concave)
+
+ #compute the log marginal
+        #FIXME: The determinant should be output_dim*0.5 I think, gradients may now no longer check
+ log_marginal = -0.5*np.dot(f_hat.T, Ki_f) + np.sum(likelihood.logpdf_sum(f_hat, Y, Y_metadata=Y_metadata)) - 0.5*log_B_det
+
+        #Compute vital matrices for derivatives
+ dW_df = -likelihood.d3logpdf_df3(f_hat, Y, Y_metadata=Y_metadata) # -d3lik_d3fhat
+
+ #dL_dfhat = np.zeros((f_hat.shape[0]))
+ #for i in range(f_hat.shape[0]):
+ #dL_dfhat[i] = -0.5*np.trace(np.dot(Ki_W_i, dW_df[:,:,i]))
+
+ dL_dfhat = -0.5*np.einsum('ij,ijk->k', Ki_W_i, dW_df)
+
+ woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, Y_metadata=Y_metadata)
+
+ ####################
+ #compute dL_dK#
+ ####################
+ if kern.size > 0 and not kern.is_fixed:
+ #Explicit
+ explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
+
+ #Implicit
+ implicit_part = woodbury_vector.dot(dL_dfhat[None,:]).dot(I_KW_i)
+ #implicit_part = Ki_f.dot(dL_dfhat[None,:]).dot(I_KW_i)
+
+ dL_dK = explicit_part + implicit_part
+ else:
+ dL_dK = np.zeros_like(K)
+
+ ####################
+ #compute dL_dthetaL#
+ ####################
+ if likelihood.size > 0 and not likelihood.is_fixed:
+ raise NotImplementedError
+ else:
+ dL_dthetaL = np.zeros(likelihood.size)
+
+ #self.K_Wi_i = K_Wi_i
+ #self.Ki_W_i = Ki_W_i
+ #self.W = W
+ #self.K = K
+ #self.dL_dfhat = dL_dfhat
+ #self.explicit_part = explicit_part
+ #self.implicit_part = implicit_part
+ return log_marginal, K_Wi_i, dL_dK, dL_dthetaL
+
+ def _compute_B_statistics(self, K, W, log_concave, *args, **kwargs):
+ """
+ Rasmussen suggests the use of a numerically stable positive definite matrix B
+        which has positive diagonal elements and can be easily inverted
+
+ :param K: Prior Covariance matrix evaluated at locations X
+ :type K: NxN matrix
+ :param W: Negative hessian at a point (diagonal matrix)
+ :type W: Vector of diagonal values of hessian (1xN)
+        :returns: (K_Wi_i, logdet_B, Bi, Ki_W_i)
+ """
+ #w = GPy.util.diag.view(W)
+ #W[:] = np.where(w<1e-6, 1e-6, w)
+
+ #B = I + KW
+ B = np.eye(K.shape[0]) + np.dot(K, W)
+ #Bi, L, Li, logdetB = pdinv(B)
+ Bi = np.linalg.inv(B)
+
+ #K_Wi_i = np.eye(K.shape[0]) - mdot(W, Bi, K)
+ K_Wi_i = np.dot(W, Bi)
+
+ #self.K_Wi_i_brute = np.linalg.inv(K + np.linalg.inv(W))
+ #self.B = B
+ #self.Bi = Bi
+ Ki_W_i = np.dot(Bi, K)
+
+ sign, logdetB = np.linalg.slogdet(B)
+ return K_Wi_i, sign*logdetB, Bi, Ki_W_i
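[Note] Both _compute_B_statistics implementations rest on the stable decomposition from Rasmussen & Williams 2006 (sec. 3.4.3): with B = I + W^{1/2} K W^{1/2}, one gets (K + W^{-1})^{-1} = W^{1/2} B^{-1} W^{1/2} and log det(I + KW) = 2 sum(log diag(chol(B))). A self-contained sketch checking both identities for a random positive definite K and a positive diagonal W:

    import numpy as np

    rng = np.random.RandomState(1)
    A = rng.randn(10, 10)
    K = A.dot(A.T) + 10*np.eye(10)   # positive definite "prior covariance"
    W = np.diag(rng.rand(10) + 0.5)  # positive diagonal negative Hessian

    W12 = np.sqrt(W)
    B = np.eye(10) + W12.dot(K).dot(W12)
    L = np.linalg.cholesky(B)
    LiW12 = np.linalg.solve(L, W12)
    K_Wi_i = LiW12.T.dot(LiW12)      # = W^{1/2} B^{-1} W^{1/2}

    assert np.allclose(K_Wi_i, np.linalg.inv(K + np.linalg.inv(W)))

    logdet_I_KW = 2*np.sum(np.log(np.diag(L)))
    assert np.isclose(logdet_I_KW, np.linalg.slogdet(np.eye(10) + K.dot(W))[1])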
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index 34f0b3bb..fbd72f57 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -15,7 +15,7 @@ class Posterior(object):
the function at any new point x_* by integrating over this posterior.
"""
- def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
+ def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None, prior_mean=0):
"""
woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
@@ -52,7 +52,7 @@ class Posterior(object):
or ((mean is not None) and (cov is not None)):
pass # we have sufficient to compute the posterior
else:
- raise ValueError, "insufficient information to compute the posterior"
+ raise ValueError("insufficient information to compute the posterior")
self._K_chol = K_chol
self._K = K
@@ -67,6 +67,7 @@ class Posterior(object):
#option 2:
self._mean = mean
self._covariance = cov
+ self._prior_mean = prior_mean
#compute this lazily
self._precision = None
@@ -107,7 +108,7 @@ class Posterior(object):
if self._precision is None:
cov = np.atleast_3d(self.covariance)
self._precision = np.zeros(cov.shape) # if one covariance per dimension
- for p in xrange(cov.shape[-1]):
+ for p in range(cov.shape[-1]):
self._precision[:,:,p] = pdinv(cov[:,:,p])[0]
return self._precision
@@ -125,7 +126,7 @@ class Posterior(object):
if self._woodbury_inv is not None:
winv = np.atleast_3d(self._woodbury_inv)
self._woodbury_chol = np.zeros(winv.shape)
- for p in xrange(winv.shape[-1]):
+ for p in range(winv.shape[-1]):
self._woodbury_chol[:,:,p] = pdinv(winv[:,:,p])[2]
#Li = jitchol(self._woodbury_inv)
#self._woodbury_chol, _ = dtrtri(Li)
@@ -134,13 +135,13 @@ class Posterior(object):
#self._woodbury_chol = jitchol(W)
#try computing woodbury chol from cov
elif self._covariance is not None:
- raise NotImplementedError, "TODO: check code here"
+ raise NotImplementedError("TODO: check code here")
B = self._K - self._covariance
tmp, _ = dpotrs(self.K_chol, B)
self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)
_, _, self._woodbury_chol, _ = pdinv(self._woodbury_inv)
else:
- raise ValueError, "insufficient information to compute posterior"
+ raise ValueError("insufficient information to compute posterior")
return self._woodbury_chol
@property
@@ -160,7 +161,7 @@ class Posterior(object):
elif self._covariance is not None:
B = np.atleast_3d(self._K) - np.atleast_3d(self._covariance)
self._woodbury_inv = np.empty_like(B)
- for i in xrange(B.shape[-1]):
+ for i in range(B.shape[-1]):
tmp, _ = dpotrs(self.K_chol, B[:,:,i])
self._woodbury_inv[:,:,i], _ = dpotrs(self.K_chol, tmp.T)
return self._woodbury_inv
@@ -175,7 +176,7 @@ class Posterior(object):
$$
"""
if self._woodbury_vector is None:
- self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean)
+ self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean - self._prior_mean)
return self._woodbury_vector
@property
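[Note] With the new prior_mean argument, the lazily computed woodbury vector M satisfies posterior_mean = K M + m0 instead of posterior_mean = K M, which is why the prior mean is subtracted before the solve above. A small numpy sketch of that convention (all values hypothetical):

    import numpy as np

    rng = np.random.RandomState(2)
    A = rng.randn(5, 5)
    K = A.dot(A.T) + 5*np.eye(5)
    prior_mean = 0.5*np.ones((5, 1))  # nonzero prior mean m0
    post_mean = rng.randn(5, 1)

    # M solves posterior_mean = K M + m0, mirroring dpotrs(K_chol, mean - prior_mean)
    M = np.linalg.solve(K, post_mean - prior_mean)
    assert np.allclose(K.dot(M) + prior_mean, post_mean)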
diff --git a/GPy/inference/latent_function_inference/svgp.py b/GPy/inference/latent_function_inference/svgp.py
index 52db242c..bad73a71 100644
--- a/GPy/inference/latent_function_inference/svgp.py
+++ b/GPy/inference/latent_function_inference/svgp.py
@@ -2,11 +2,12 @@ from . import LatentFunctionInference
from ...util import linalg
from ...util import choleskies
import numpy as np
-from posterior import Posterior
+from .posterior import Posterior
class SVGP(LatentFunctionInference):
- def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
+ def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
+
num_inducing = Z.shape[0]
num_data, num_outputs = Y.shape
@@ -22,6 +23,15 @@ class SVGP(LatentFunctionInference):
#S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
#Si, Lnew, _,_ = linalg.pdinv(S)
+ #compute mean function stuff
+ if mean_function is not None:
+ prior_mean_u = mean_function.f(Z)
+ prior_mean_f = mean_function.f(X)
+ else:
+ prior_mean_u = np.zeros((num_inducing, num_outputs))
+ prior_mean_f = np.zeros((num_data, num_outputs))
+
+
#compute kernel related stuff
Kmm = kern.K(Z)
Knm = kern.K(X, Z)
@@ -30,29 +40,43 @@ class SVGP(LatentFunctionInference):
#compute the marginal means and variances of q(f)
A = np.dot(Knm, Kmmi)
- mu = np.dot(A, q_u_mean)
+ mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1)
#compute the KL term
Kmmim = np.dot(Kmmi, q_u_mean)
KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
KL = KLs.sum()
- dKL_dm = Kmmim
+ #gradient of the KL term (assuming zero mean function)
+ dKL_dm = Kmmim.copy()
dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
+ if mean_function is not None:
+ #adjust KL term for mean function
+ Kmmi_mfZ = np.dot(Kmmi, prior_mean_u)
+ KL += -np.sum(q_u_mean*Kmmi_mfZ)
+ KL += 0.5*np.sum(Kmmi_mfZ*prior_mean_u)
+
+            #adjust gradient for mean function
+ dKL_dm -= Kmmi_mfZ
+ dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T)
+ dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T)
+
+ #compute gradients for mean_function
+ dKL_dmfZ = Kmmi_mfZ - Kmmim
#quadrature for the likelihood
- F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v)
+ F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)
#rescale the F term if working on a batch
F, dF_dmu, dF_dv = F*batch_scale, dF_dmu*batch_scale, dF_dv*batch_scale
+ if dF_dthetaL is not None:
+ dF_dthetaL = dF_dthetaL.sum(1).sum(1)*batch_scale
- #derivatives of expected likelihood
+ #derivatives of expected likelihood, assuming zero mean function
Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal
Admu = A.T.dot(dF_dmu)
- #AdvA = np.einsum('ijk,jl->ilk', Adv, A)
- #AdvA = np.dot(A.T, Adv).swapaxes(0,1)
AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
@@ -62,6 +86,14 @@ class SVGP(LatentFunctionInference):
dF_dm = Admu
dF_dS = AdvA
+ #adjust gradient to account for mean function
+ if mean_function is not None:
+ dF_dmfX = dF_dmu.copy()
+ dF_dmfZ = -Admu
+ dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
+ dF_dKmm += Admu.dot(Kmmi_mfZ.T)
+
+
#sum (gradients of) expected likelihood and KL part
log_marginal = F.sum() - KL
dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
@@ -69,4 +101,8 @@ class SVGP(LatentFunctionInference):
dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
dL_dchol = choleskies.triang_to_flat(dL_dchol)
- return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
+ grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
+ if mean_function is not None:
+ grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
+ grad_dict['dL_dmfX'] = dF_dmfX
+ return Posterior(mean=q_u_mean, cov=S, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict
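[Note] The mean-function terms above centre q(u) on the prior mean at the inducing points: the marginal mean of q(f) becomes m(X) + A(q_u_mean - m(Z)) with A = Knm Kmm^{-1}. A minimal sketch of just that predictive-mean computation, with a hypothetical fixed mean function:

    import numpy as np

    def rbf(A, B, ls=0.3):
        return np.exp(-0.5*((A - B.T)/ls)**2)

    rng = np.random.RandomState(3)
    X = rng.rand(50, 1)                # minibatch inputs
    Z = np.linspace(0, 1, 8)[:, None]  # inducing inputs
    q_u_mean = rng.randn(8, 1)         # variational mean of q(u)
    mean_function = lambda x: 2.0*x    # hypothetical mean function

    Kmm = rbf(Z, Z) + 1e-6*np.eye(8)
    A = rbf(X, Z).dot(np.linalg.inv(Kmm))

    # without a mean function mu = A q_u_mean; with one:
    mu = mean_function(X) + A.dot(q_u_mean - mean_function(Z))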
diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py
index 9c4d51bb..97d8dfe3 100644
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@@ -1,7 +1,7 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from posterior import Posterior
+from .posterior import Posterior
from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify
from ...util import diag
from ...core.parameterization.variational import VariationalPosterior
@@ -170,7 +170,7 @@ class VarDTC(LatentFunctionInference):
if VVT_factor.shape[1] == Y.shape[1]:
woodbury_vector = Cpsi1Vf # == Cpsi1V
else:
- print 'foobar'
+ print('foobar')
import ipdb; ipdb.set_trace()
psi1V = np.dot(Y.T*beta, psi1).T
tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
@@ -213,7 +213,7 @@ def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf,
dL_dR = None
elif het_noise:
if uncertain_inputs:
- raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
+ raise NotImplementedError("heteroscedatic derivates with uncertain inputs not implemented")
else:
#from ...util.linalg import chol_inv
#LBi = chol_inv(LB)
diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py
index cac69872..4b884d4c 100644
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@@ -1,7 +1,7 @@
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from posterior import Posterior
+from .posterior import Posterior
from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdinv
from ...util import diag
from ...core.parameterization.variational import VariationalPosterior
@@ -92,7 +92,7 @@ class VarDTC_minibatch(LatentFunctionInference):
psi0_full = 0.
YRY_full = 0.
- for n_start in xrange(0,num_data,batchsize):
+ for n_start in range(0,num_data,batchsize):
n_end = min(batchsize+n_start, num_data)
if batchsize==num_data:
Y_slice = Y
@@ -169,11 +169,13 @@ class VarDTC_minibatch(LatentFunctionInference):
Kmm = kern.K(Z).copy()
diag.add(Kmm, self.const_jitter)
- Lm = jitchol(Kmm, maxtries=100)
+ if not np.isfinite(Kmm).all():
+ print(Kmm)
+ Lm = jitchol(Kmm)
LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
- LL = jitchol(Lambda, maxtries=100)
+ LL = jitchol(Lambda)
logdet_L = 2.*np.sum(np.log(np.diag(LL)))
b = dtrtrs(LL,dtrtrs(Lm,psi1Y_full.T)[0])[0]
bbt = np.square(b).sum()
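[Note] The minibatch loop above builds the global sufficient statistics one slice at a time, so swapping xrange for range changes nothing but the Python version. A stripped-down sketch of the accumulation pattern (the noise precision beta is left out for simplicity):

    import numpy as np

    num_data, batchsize = 1000, 128
    Y = np.random.randn(num_data, 1)
    YRY_full = 0.
    for n_start in range(0, num_data, batchsize):
        n_end = min(batchsize + n_start, num_data)
        Y_slice = Y[n_start:n_end]
        YRY_full += np.square(Y_slice).sum()
    assert np.isclose(YRY_full, np.square(Y).sum())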
diff --git a/GPy/inference/mcmc/__init__.py b/GPy/inference/mcmc/__init__.py
index 956448d4..8f185457 100644
--- a/GPy/inference/mcmc/__init__.py
+++ b/GPy/inference/mcmc/__init__.py
@@ -1 +1 @@
-from hmc import HMC
+from .hmc import HMC
diff --git a/GPy/inference/mcmc/hmc.py b/GPy/inference/mcmc/hmc.py
index ec6399b6..fcc72591 100644
--- a/GPy/inference/mcmc/hmc.py
+++ b/GPy/inference/mcmc/hmc.py
@@ -39,7 +39,7 @@ class HMC:
:rtype: numpy.ndarray
"""
params = np.empty((num_samples,self.p.size))
- for i in xrange(num_samples):
+ for i in range(num_samples):
self.p[:] = np.random.multivariate_normal(np.zeros(self.p.size),self.M)
H_old = self._computeH()
theta_old = self.model.optimizer_array.copy()
@@ -59,7 +59,7 @@ class HMC:
return params
def _update(self, hmc_iters):
- for i in xrange(hmc_iters):
+ for i in range(hmc_iters):
self.p[:] += -self.stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients())
self.model.optimizer_array = self.model.optimizer_array + self.stepsize*np.dot(self.Minv, self.p)
self.p[:] += -self.stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients())
@@ -82,7 +82,7 @@ class HMC_shortcut:
def sample(self, m_iters=1000, hmc_iters=20):
params = np.empty((m_iters,self.p.size))
- for i in xrange(m_iters):
+ for i in range(m_iters):
# sample a stepsize from the uniform distribution
stepsize = np.exp(np.random.rand()*(self.stepsize_range[1]-self.stepsize_range[0])+self.stepsize_range[0])
self.p[:] = np.random.multivariate_normal(np.zeros(self.p.size),self.M)
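[Note] The _update method is a leapfrog integrator: half a step on the momentum, a full step on the parameters (preconditioned by Minv), then another half momentum step. A generic, self-contained sketch of the same scheme; the Gaussian target here is a stand-in, not GPy's model objective:

    import numpy as np

    def leapfrog(theta, p, grad_neg_log_post, stepsize, n_steps, Minv):
        for _ in range(n_steps):
            p = p - 0.5*stepsize*grad_neg_log_post(theta)  # half momentum step
            theta = theta + stepsize*Minv.dot(p)           # full position step
            p = p - 0.5*stepsize*grad_neg_log_post(theta)  # half momentum step
        return theta, p

    # sampling a standard Gaussian: -log p(theta) = 0.5 theta^T theta, gradient = theta
    theta, p = np.ones(2), np.random.randn(2)
    theta, p = leapfrog(theta, p, lambda t: t, 0.1, 20, np.eye(2))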
diff --git a/GPy/inference/mcmc/samplers.py b/GPy/inference/mcmc/samplers.py
index 444d99d7..6459e8af 100644
--- a/GPy/inference/mcmc/samplers.py
+++ b/GPy/inference/mcmc/samplers.py
@@ -9,7 +9,13 @@ import sys
import re
import numdifftools as ndt
import pdb
-import cPickle
+
+try:
+    #In Python 2, cPickle is faster. It does not exist in Python 3, where the
+    #equivalent C implementation is used automatically when available.
+ import cPickle as pickle
+except ImportError:
+ import pickle
class Metropolis_Hastings:
@@ -40,7 +46,7 @@ class Metropolis_Hastings:
fcurrent = self.model.log_likelihood() + self.model.log_prior()
accepted = np.zeros(Ntotal,dtype=np.bool)
for it in range(Ntotal):
- print "sample %d of %d\r"%(it,Ntotal),
+ print("sample %d of %d\r"%(it,Ntotal), end=' ')
sys.stdout.flush()
prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale)
self.model._set_params_transformed(prop)
diff --git a/GPy/inference/optimization/__init__.py b/GPy/inference/optimization/__init__.py
index 1a8f043b..909f897b 100644
--- a/GPy/inference/optimization/__init__.py
+++ b/GPy/inference/optimization/__init__.py
@@ -1,2 +1,2 @@
-from scg import SCG
-from optimization import *
+from .scg import SCG
+from .optimization import *
diff --git a/GPy/inference/optimization/conjugate_gradient_descent.py b/GPy/inference/optimization/conjugate_gradient_descent.py
index dfc4a48d..fc2d8b61 100644
--- a/GPy/inference/optimization/conjugate_gradient_descent.py
+++ b/GPy/inference/optimization/conjugate_gradient_descent.py
@@ -1,7 +1,7 @@
# Copyright (c) 2012-2014, Max Zwiessele
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from gradient_descent_update_rules import FletcherReeves, \
+from .gradient_descent_update_rules import FletcherReeves, \
PolakRibiere
from Queue import Empty
from multiprocessing import Value
@@ -74,7 +74,7 @@ class _Async_Optimization(Thread):
if self.outq is not None:
self.outq.put(self.SENTINEL)
if self.messages:
- print ""
+ print("")
self.runsignal.clear()
def run(self, *args, **kwargs):
@@ -213,7 +213,7 @@ class Async_Optimize(object):
# # print "^C"
# self.runsignal.clear()
# c.join()
- print "WARNING: callback still running, optimisation done!"
+ print("WARNING: callback still running, optimisation done!")
return p.result
class CGD(Async_Optimize):
diff --git a/GPy/inference/optimization/optimization.py b/GPy/inference/optimization/optimization.py
index aa9be793..fd140688 100644
--- a/GPy/inference/optimization/optimization.py
+++ b/GPy/inference/optimization/optimization.py
@@ -10,7 +10,7 @@ try:
rasm_available = True
except ImportError:
rasm_available = False
-from scg import SCG
+from .scg import SCG
class Optimizer():
"""
@@ -54,7 +54,7 @@ class Optimizer():
self.time = str(end - start)
def opt(self, f_fp=None, f=None, fp=None):
- raise NotImplementedError, "this needs to be implemented to use the optimizer class"
+ raise NotImplementedError("this needs to be implemented to use the optimizer class")
def plot(self):
"""
@@ -125,9 +125,9 @@ class opt_lbfgsb(Optimizer):
opt_dict = {}
if self.xtol is not None:
- print "WARNING: l-bfgs-b doesn't have an xtol arg, so I'm going to ignore it"
+ print("WARNING: l-bfgs-b doesn't have an xtol arg, so I'm going to ignore it")
if self.ftol is not None:
- print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it"
+ print("WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it")
if self.gtol is not None:
opt_dict['pgtol'] = self.gtol
if self.bfgs_factor is not None:
@@ -140,6 +140,10 @@ class opt_lbfgsb(Optimizer):
self.funct_eval = opt_result[2]['funcalls']
self.status = rcstrings[opt_result[2]['warnflag']]
+        #a more helpful error message is available in opt_result in the error case
+        if opt_result[2]['warnflag']==2:
+            self.status = 'Error: ' + str(opt_result[2]['task'])
+
class opt_simplex(Optimizer):
def __init__(self, *args, **kwargs):
Optimizer.__init__(self, *args, **kwargs)
@@ -158,7 +162,7 @@ class opt_simplex(Optimizer):
if self.ftol is not None:
opt_dict['ftol'] = self.ftol
if self.gtol is not None:
- print "WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it"
+ print("WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it")
opt_result = optimize.fmin(f, self.x_init, (), disp=self.messages,
maxfun=self.max_f_eval, full_output=True, **opt_dict)
@@ -186,11 +190,11 @@ class opt_rasm(Optimizer):
opt_dict = {}
if self.xtol is not None:
- print "WARNING: minimize doesn't have an xtol arg, so I'm going to ignore it"
+ print("WARNING: minimize doesn't have an xtol arg, so I'm going to ignore it")
if self.ftol is not None:
- print "WARNING: minimize doesn't have an ftol arg, so I'm going to ignore it"
+ print("WARNING: minimize doesn't have an ftol arg, so I'm going to ignore it")
if self.gtol is not None:
- print "WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it"
+ print("WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it")
opt_result = rasm.minimize(self.x_init, f_fp, (), messages=self.messages,
maxnumfuneval=self.max_f_eval)
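[Note] The new warnflag handling above reads scipy's diagnostic dict; under Python 3 fmin_l_bfgs_b reports 'task' as bytes, hence the str() conversion. A sketch of the pattern on a toy objective:

    import numpy as np
    from scipy.optimize import fmin_l_bfgs_b

    f = lambda x: ((x - 3.0)**2).sum()
    fp = lambda x: 2.0*(x - 3.0)

    x_opt, f_opt, info = fmin_l_bfgs_b(f, np.zeros(2), fprime=fp)
    if info['warnflag'] == 2:
        task = info['task']
        status = 'Error: ' + (task.decode() if isinstance(task, bytes) else task)
    else:
        status = 'Converged' if info['warnflag'] == 0 else 'Maxiter reached'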
diff --git a/GPy/inference/optimization/scg.py b/GPy/inference/optimization/scg.py
index 34dd181f..8960de1d 100644
--- a/GPy/inference/optimization/scg.py
+++ b/GPy/inference/optimization/scg.py
@@ -21,14 +21,13 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
-
+from __future__ import print_function
import numpy as np
import sys
-
def print_out(len_maxiters, fnow, current_grad, beta, iteration):
- print '\r',
- print '{0:>0{mi}g} {1:> 12e} {2:< 12.6e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
+ print('\r', end=' ')
+ print('{0:>0{mi}g} {1:> 12e} {2:< 12.6e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), end=' ') # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
sys.stdout.flush()
def exponents(fnow, current_grad):
@@ -80,7 +79,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
len_maxiters = len(str(maxiters))
if display:
- print ' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters)
+ print(' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters))
exps = exponents(fnow, current_grad)
p_iter = iteration
@@ -140,7 +139,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
b = np.any(n_exps < exps)
if a or b:
p_iter = iteration
- print ''
+ print('')
if b:
exps = n_exps
@@ -189,6 +188,6 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
if display:
print_out(len_maxiters, fnow, current_grad, beta, iteration)
- print ""
- print status
+ print("")
+ print(status)
return x, flog, function_eval, status
diff --git a/GPy/inference/optimization/stochastics.py b/GPy/inference/optimization/stochastics.py
index dc71d539..f1532bc5 100644
--- a/GPy/inference/optimization/stochastics.py
+++ b/GPy/inference/optimization/stochastics.py
@@ -30,7 +30,7 @@ class SparseGPMissing(StochasticStorage):
Thus, we can just make sure the loop goes over self.d every
time.
"""
- self.d = xrange(model.Y_normalized.shape[1])
+ self.d = range(model.Y_normalized.shape[1])
class SparseGPStochastics(StochasticStorage):
"""
diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index 718be74f..2bd55617 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -1,20 +1,23 @@
-from _src.kern import Kern
-from _src.rbf import RBF
-from _src.linear import Linear, LinearFull
-from _src.static import Bias, White, Fixed
-from _src.brownian import Brownian
-from _src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
-from _src.mlp import MLP
-from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
-from _src.independent_outputs import IndependentOutputs, Hierarchical
-from _src.coregionalize import Coregionalize
-from _src.ODE_UY import ODE_UY
-from _src.ODE_UYC import ODE_UYC
-from _src.ODE_st import ODE_st
-from _src.ODE_t import ODE_t
-from _src.poly import Poly
-from _src.eq_ode2 import EQ_ODE2
+from ._src.kern import Kern
+from ._src.rbf import RBF
+from ._src.linear import Linear, LinearFull
+from ._src.static import Bias, White, Fixed
+from ._src.brownian import Brownian
+from ._src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
+from ._src.mlp import MLP
+from ._src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
+from ._src.independent_outputs import IndependentOutputs, Hierarchical
+from ._src.coregionalize import Coregionalize
+from ._src.ODE_UY import ODE_UY
+from ._src.ODE_UYC import ODE_UYC
+from ._src.ODE_st import ODE_st
+from ._src.ODE_t import ODE_t
+from ._src.poly import Poly
+from ._src.eq_ode2 import EQ_ODE2
+from ._src.trunclinear import TruncLinear,TruncLinear_inf
+from ._src.splitKern import SplitKern,DEtime
+from ._src.splitKern import DEtime as DiffGenomeKern
-from _src.trunclinear import TruncLinear,TruncLinear_inf
-from _src.splitKern import SplitKern,DiffGenomeKern
+
+from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
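[Note] The newly exported basis-function kernels compose with other kernels in the usual way. A hedged usage sketch (data and hyperparameters are made up):

    import numpy as np
    import GPy

    X = np.linspace(0, 10, 100)[:, None]
    Y = np.sin(X) + np.where(X > 5, 2.0, 0.0) + 0.1*np.random.randn(100, 1)

    # smooth component plus an explicit changepoint basis function at x=5
    k = GPy.kern.RBF(1) + GPy.kern.ChangePointBasisFuncKernel(1, changepoint=5)
    m = GPy.models.GPRegression(X, Y, k)
    m.optimize()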
diff --git a/GPy/kern/_src/ODE_UY.py b/GPy/kern/_src/ODE_UY.py
index b4a2b42d..9c9b47be 100644
--- a/GPy/kern/_src/ODE_UY.py
+++ b/GPy/kern/_src/ODE_UY.py
@@ -1,11 +1,11 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from kern import Kern
+from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
-from independent_outputs import index_to_slices
+from .independent_outputs import index_to_slices
class ODE_UY(Kern):
def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., active_dims=None, name='ode_uy'):
@@ -114,7 +114,7 @@ class ODE_UY(Kern):
elif i==1:
Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
else:
- raise ValueError, "invalid input/output index"
+ raise ValueError("invalid input/output index")
#Kdiag[slices[0][0]]+= self.variance_U #matern32 diag
#Kdiag[slices[1][0]]+= self.variance_U*self.variance_Y*(k1+k2+k3) # diag
return Kdiag
diff --git a/GPy/kern/_src/ODE_UYC.py b/GPy/kern/_src/ODE_UYC.py
index 1722d2e1..ff75a328 100644
--- a/GPy/kern/_src/ODE_UYC.py
+++ b/GPy/kern/_src/ODE_UYC.py
@@ -1,11 +1,11 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from kern import Kern
+from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
-from independent_outputs import index_to_slices
+from .independent_outputs import index_to_slices
class ODE_UYC(Kern):
def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., ubias =1. ,active_dims=None, name='ode_uyc'):
@@ -115,7 +115,7 @@ class ODE_UYC(Kern):
elif i==1:
Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
else:
- raise ValueError, "invalid input/output index"
+ raise ValueError("invalid input/output index")
#Kdiag[slices[0][0]]+= self.variance_U #matern32 diag
#Kdiag[slices[1][0]]+= self.variance_U*self.variance_Y*(k1+k2+k3) # diag
return Kdiag
diff --git a/GPy/kern/_src/ODE_st.py b/GPy/kern/_src/ODE_st.py
index 665be230..afa46d09 100644
--- a/GPy/kern/_src/ODE_st.py
+++ b/GPy/kern/_src/ODE_st.py
@@ -1,10 +1,10 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from kern import Kern
+from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
-from independent_outputs import index_to_slices
+from .independent_outputs import index_to_slices
class ODE_st(Kern):
@@ -135,7 +135,7 @@ class ODE_st(Kern):
Kdiag[s1]+= b**2*k1 - 2*a*c*k2 + a**2*k3 + c**2*vyt*vyx
#Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
else:
- raise ValueError, "invalid input/output index"
+ raise ValueError("invalid input/output index")
return Kdiag
diff --git a/GPy/kern/_src/ODE_t.py b/GPy/kern/_src/ODE_t.py
index a470cbec..80625f51 100644
--- a/GPy/kern/_src/ODE_t.py
+++ b/GPy/kern/_src/ODE_t.py
@@ -1,8 +1,8 @@
-from kern import Kern
+from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
-from independent_outputs import index_to_slices
+from .independent_outputs import index_to_slices
class ODE_t(Kern):
@@ -85,7 +85,7 @@ class ODE_t(Kern):
Kdiag[s1]+= k1 + vyt+self.ubias
#Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
else:
- raise ValueError, "invalid input/output index"
+ raise ValueError("invalid input/output index")
return Kdiag
diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py
index 4c72a254..696a8b04 100644
--- a/GPy/kern/_src/add.py
+++ b/GPy/kern/_src/add.py
@@ -4,7 +4,8 @@
import numpy as np
import itertools
from ...util.caching import Cache_this
-from kern import CombinationKernel
+from .kern import CombinationKernel
+from functools import reduce
class Add(CombinationKernel):
"""
@@ -84,10 +85,10 @@ class Add(CombinationKernel):
psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
#return psi2
# compute the "cross" terms
- from static import White, Bias
- from rbf import RBF
+ from .static import White, Bias
+ from .rbf import RBF
#from rbf_inv import RBFInv
- from linear import Linear
+ from .linear import Linear
#ffrom fixed import Fixed
for p1, p2 in itertools.combinations(self.parts, 2):
@@ -111,11 +112,11 @@ class Add(CombinationKernel):
psi2 += np.einsum('nm,no->mo',tmp1,tmp2)+np.einsum('nm,no->mo',tmp2,tmp1)
#(tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :])
else:
- raise NotImplementedError, "psi2 cannot be computed for this kernel"
+ raise NotImplementedError("psi2 cannot be computed for this kernel")
return psi2
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
- from static import White, Bias
+ from .static import White, Bias
for p1 in self.parts:
#compute the effective dL_dpsi1. Extra terms appear becaue of the cross terms in psi2!
eff_dL_dpsi1 = dL_dpsi1.copy()
@@ -131,7 +132,7 @@ class Add(CombinationKernel):
p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
- from static import White, Bias
+ from .static import White, Bias
target = np.zeros(Z.shape)
for p1 in self.parts:
#compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2!
@@ -149,7 +150,7 @@ class Add(CombinationKernel):
return target
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
- from static import White, Bias
+ from .static import White, Bias
target_grads = [np.zeros(v.shape) for v in variational_posterior.parameters]
for p1 in self.parameters:
#compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2!
@@ -164,7 +165,7 @@ class Add(CombinationKernel):
else:
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
- [np.add(target_grads[i],grads[i],target_grads[i]) for i in xrange(len(grads))]
+ [np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))]
return target_grads
def add(self, other):
@@ -180,9 +181,12 @@ class Add(CombinationKernel):
def input_sensitivity(self, summarize=True):
if summarize:
- return reduce(np.add, [k.input_sensitivity(summarize) for k in self.parts])
+ i_s = np.zeros((self.input_dim))
+ for k in self.parts:
+ i_s[k.active_dims] += k.input_sensitivity(summarize)
+ return i_s
else:
i_s = np.zeros((len(self.parts), self.input_dim))
from operator import setitem
- [setitem(i_s, (i, Ellipsis), k.input_sensitivity(summarize)) for i, k in enumerate(self.parts)]
+ [setitem(i_s, (i, k.active_dims), k.input_sensitivity(summarize)) for i, k in enumerate(self.parts)]
return i_s
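[Note] The rewritten input_sensitivity accumulates each part's sensitivity into that part's own active_dims slots instead of reduce-adding arrays whose lengths may differ. A tiny numpy mimic of the accumulation:

    import numpy as np

    input_dim = 3
    parts = [([0, 1], np.array([0.7, 0.2])),  # e.g. an RBF on columns 0, 1
             ([2],    np.array([1.5]))]       # e.g. a Linear kernel on column 2
    i_s = np.zeros(input_dim)
    for active_dims, sens in parts:
        i_s[active_dims] += sens
    # i_s is now [0.7, 0.2, 1.5]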
diff --git a/GPy/kern/_src/basis_funcs.py b/GPy/kern/_src/basis_funcs.py
new file mode 100644
index 00000000..1b300661
--- /dev/null
+++ b/GPy/kern/_src/basis_funcs.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2012, Max Zwiessele (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+from .kern import Kern
+from ...core.parameterization.param import Param
+from ...core.parameterization.transformations import Logexp
+import numpy as np
+from ...util.caching import Cache_this
+from ...util.linalg import tdot
+
+class BasisFuncKernel(Kern):
+ def __init__(self, input_dim, variance=1., active_dims=None, name='basis func kernel'):
+ """
+ Abstract superclass for kernels with explicit basis functions for use in GPy.
+
+ This class does NOT automatically add an offset to the design matrix phi!
+ """
+ super(BasisFuncKernel, self).__init__(input_dim, active_dims, name)
+ self.variance = Param('variance', variance, Logexp())
+ self.link_parameter(self.variance)
+
+ def phi(self, X):
+ raise NotImplementedError('Overwrite this phi function, which maps the input X into the higher dimensional space and forms the design matrix Phi')
+
+ def K(self, X, X2=None):
+ return self.variance * self._K(X, X2)
+
+ def Kdiag(self, X, X2=None):
+ return self.variance * np.diag(self._K(X, X2))
+
+ def update_gradients_full(self, dL_dK, X, X2=None):
+ self.variance.gradient = np.einsum('ij,ij', dL_dK, self._K(X, X2))
+
+ def update_gradients_diag(self, dL_dKdiag, X):
+ self.variance.gradient = np.einsum('i,i', dL_dKdiag, self._K(X))
+
+ def concatenate_offset(self, X):
+ return np.c_[np.ones((X.shape[0], 1)), X]
+
+ def posterior_inf(self, X=None, posterior=None):
+ """
+        Do the posterior inference on the parameters given this kernel's basis functions
+        and the model posterior, which has to be a GPy posterior (usually found at m.posterior, if m is a GPy model).
+        If not given, we search for the highest parent that is a model (and so contains the posterior), and for X accordingly.
+ """
+ if X is None:
+ try:
+ X = self._highest_parent_.X
+            except AttributeError:
+ raise RuntimeError("This kernel is not part of a model and cannot be used for posterior inference")
+ if posterior is None:
+ try:
+ posterior = self._highest_parent_.posterior
+            except AttributeError:
+ raise RuntimeError("This kernel is not part of a model and cannot be used for posterior inference")
+ phi = self.phi(X)
+ return self.variance * phi.T.dot(posterior.woodbury_vector), self.variance * (1 - self.variance * phi.T.dot(posterior.woodbury_inv.dot(phi)))
+
+ @Cache_this(limit=3, ignore_args=())
+ def _K(self, X, X2):
+ if X2 is None or X is X2:
+ phi = self.phi(X)
+ if phi.ndim != 2:
+ phi = phi[:, None]
+ return tdot(phi)
+ else:
+ phi1 = self.phi(X)
+ phi2 = self.phi(X2)
+ if phi1.ndim != 2:
+ phi1 = phi1[:, None]
+ phi2 = phi2[:, None]
+ return phi1.dot(phi2.T)
+
+
+class LinearSlopeBasisFuncKernel(BasisFuncKernel):
+ def __init__(self, input_dim, start, stop, variance=1., active_dims=None, name='linear_segment'):
+ super(LinearSlopeBasisFuncKernel, self).__init__(input_dim, variance, active_dims, name)
+ self.start = np.array(start)
+ self.stop = np.array(stop)
+
+ @Cache_this(limit=3, ignore_args=())
+ def phi(self, X):
+ phi = np.where(X < self.start, self.start, X)
+ phi = np.where(phi > self.stop, self.stop, phi)
+        return phi-(self.stop+self.start)/2. #/(.5*(self.stop-self.start)))-1.
+        #alternative: return self.concatenate_offset(phi) # ((phi-self.start)/(self.stop-self.start))-.5
+
+class ChangePointBasisFuncKernel(BasisFuncKernel):
+ def __init__(self, input_dim, changepoint, variance=1., active_dims=None, name='changepoint'):
+ super(ChangePointBasisFuncKernel, self).__init__(input_dim, variance, active_dims, name)
+ self.changepoint = changepoint
+
+ @Cache_this(limit=3, ignore_args=())
+ def phi(self, X):
+ return self.concatenate_offset(np.where((X < self.changepoint), -1, 1))
+
+class DomainKernel(LinearSlopeBasisFuncKernel):
+ @Cache_this(limit=3, ignore_args=())
+ def phi(self, X):
+        phi = np.where((X>self.start)*(X<self.stop), 1., 0.)
+        return phi
diff --git a/GPy/kern/_src/independent_outputs.py b/GPy/kern/_src/independent_outputs.py
--- a/GPy/kern/_src/independent_outputs.py
+++ b/GPy/kern/_src/independent_outputs.py
@@ -79,10 +79,10 @@ class IndependentOutputs(CombinationKernel):
def update_gradients_full(self,dL_dK,X,X2=None):
slices = index_to_slices(X[:,self.index_dim])
- if self.single_kern:
+ if self.single_kern:
target = np.zeros(self.kern.size)
kerns = itertools.repeat(self.kern)
- else:
+ else:
kerns = self.kern
target = [np.zeros(kern.size) for kern, _ in zip(kerns, slices)]
def collate_grads(kern, i, dL, X, X2):
@@ -94,20 +94,24 @@ class IndependentOutputs(CombinationKernel):
else:
slices2 = index_to_slices(X2[:,self.index_dim])
[[[collate_grads(kern, i, dL_dK[s,s2],X[s],X2[s2]) for s in slices_i] for s2 in slices_j] for i,(kern,slices_i,slices_j) in enumerate(zip(kerns,slices,slices2))]
- if self.single_kern: kern.gradient = target
- else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))]
+ if self.single_kern:
+ self.kern.gradient = target
+ else:
+ [kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))]
def gradients_X(self,dL_dK, X, X2=None):
target = np.zeros(X.shape)
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
if X2 is None:
# TODO: make use of index_to_slices
+ # FIXME: Broken as X is already sliced out
+ print "Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!"
values = np.unique(X[:,self.index_dim])
slices = [X[:,self.index_dim]==i for i in values]
[target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None))
for kern, s in zip(kerns, slices)]
#slices = index_to_slices(X[:,self.index_dim])
- #[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s])
+ #[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s])
# for s in slices_i] for kern, slices_i in zip(kerns, slices)]
#import ipdb;ipdb.set_trace()
#[[(np.add(target[s ], kern.gradients_X(dL_dK[s ,ss],X[s ], X[ss]), out=target[s ]),
@@ -142,7 +146,7 @@ class IndependentOutputs(CombinationKernel):
if self.single_kern: target[:] += kern.gradient
else: target[i][:] += kern.gradient
[[collate_grads(kern, i, dL_dKdiag[s], X[s,:]) for s in slices_i] for i, (kern, slices_i) in enumerate(zip(kerns, slices))]
- if self.single_kern: kern.gradient = target
+ if self.single_kern: self.kern.gradient = target
else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))]
class Hierarchical(CombinationKernel):
diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py
index 57b2bff5..e63ddad4 100644
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@@ -4,17 +4,20 @@
import sys
import numpy as np
from ...core.parameterization.parameterized import Parameterized
-from kernel_slice_operations import KernCallsViaSlicerMeta
+from .kernel_slice_operations import KernCallsViaSlicerMeta
from ...util.caching import Cache_this
from GPy.core.parameterization.observable_array import ObsAr
+from functools import reduce
+import six
-
-
+@six.add_metaclass(KernCallsViaSlicerMeta)
class Kern(Parameterized):
#===========================================================================
# This adds input slice support. The rather ugly code for slicing can be
# found in kernel_slice_operations
- __metaclass__ = KernCallsViaSlicerMeta
+    # __metaclass__ is ignored in Python 3 - the metaclass now has to be supplied in the class definition
+ #__metaclass__ = KernCallsViaSlicerMeta
+ #Here, we use the Python module six to support Py3 and Py2 simultaneously
#===========================================================================
_support_GPU=False
def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw):
@@ -178,7 +181,7 @@ class Kern(Parameterized):
"""
assert isinstance(other, Kern), "only kernels can be added to kernels..."
- from add import Add
+ from .add import Add
return Add([self, other], name=name)
def __mul__(self, other):
@@ -210,7 +213,7 @@ class Kern(Parameterized):
"""
assert isinstance(other, Kern), "only kernels can be multiplied to kernels..."
- from prod import Prod
+ from .prod import Prod
#kernels = []
#if isinstance(self, Prod): kernels.extend(self.parameters)
#else: kernels.append(self)
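[Note] six.add_metaclass is the portable replacement for the class-body __metaclass__ attribute, which Python 3 silently ignores. A minimal illustration of the decorator:

    import six

    class Meta(type):
        def __new__(mcs, name, bases, attrs):
            attrs['created_by_meta'] = True
            return super(Meta, mcs).__new__(mcs, name, bases, attrs)

    @six.add_metaclass(Meta)   # works under both Python 2 and 3
    class A(object):
        pass

    assert A.created_by_meta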
diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py
index 9d1a956b..e3a45c67 100644
--- a/GPy/kern/_src/linear.py
+++ b/GPy/kern/_src/linear.py
@@ -3,7 +3,7 @@
import numpy as np
-from kern import Kern
+from .kern import Kern
from ...util.linalg import tdot
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
diff --git a/GPy/kern/_src/mlp.py b/GPy/kern/_src/mlp.py
index 16e84363..4488ea82 100644
--- a/GPy/kern/_src/mlp.py
+++ b/GPy/kern/_src/mlp.py
@@ -1,7 +1,7 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from kern import Kern
+from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
diff --git a/GPy/kern/_src/periodic.py b/GPy/kern/_src/periodic.py
index e8e16506..23818007 100644
--- a/GPy/kern/_src/periodic.py
+++ b/GPy/kern/_src/periodic.py
@@ -3,11 +3,12 @@
import numpy as np
-from kern import Kern
+from .kern import Kern
from ...util.linalg import mdot
from ...util.decorators import silence_errors
from ...core.parameterization.param import Param
from ...core.parameterization.transformations import Logexp
+from functools import reduce
class Periodic(Kern):
def __init__(self, input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name):
diff --git a/GPy/kern/_src/poly.py b/GPy/kern/_src/poly.py
index b90e8f8f..a5306c2a 100644
--- a/GPy/kern/_src/poly.py
+++ b/GPy/kern/_src/poly.py
@@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
-from kern import Kern
+from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
class Poly(Kern):
diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py
index bff6d841..ff7cf140 100644
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@@ -2,9 +2,24 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
-from kern import CombinationKernel
+from .kern import CombinationKernel
from ...util.caching import Cache_this
import itertools
+from functools import reduce
+
+
+def numpy_invalid_op_as_exception(func):
+ """
+ A decorator that allows catching numpy invalid operations
+ as exceptions (the default behaviour is raising warnings).
+ """
+ def func_wrapper(*args, **kwargs):
+ np.seterr(invalid='raise')
+ result = func(*args, **kwargs)
+ np.seterr(invalid='warn')
+ return result
+ return func_wrapper
+
class Prod(CombinationKernel):
"""
@@ -46,18 +61,20 @@ class Prod(CombinationKernel):
self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
self.parts[1].update_gradients_full(dL_dK*self.parts[0].K(X,X2), X, X2)
else:
- k = self.K(X,X2)*dL_dK
- for p in self.parts:
- p.update_gradients_full(k/p.K(X,X2),X,X2)
+ for combination in itertools.combinations(self.parts, len(self.parts) - 1):
+ prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
+ to_update = list(set(self.parts) - set(combination))[0]
+ to_update.update_gradients_full(dL_dK * prod, X, X2)
def update_gradients_diag(self, dL_dKdiag, X):
if len(self.parts)==2:
self.parts[0].update_gradients_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
self.parts[1].update_gradients_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
else:
- k = self.Kdiag(X)*dL_dKdiag
- for p in self.parts:
- p.update_gradients_diag(k/p.Kdiag(X),X)
+ for combination in itertools.combinations(self.parts, len(self.parts) - 1):
+ prod = reduce(np.multiply, [p.Kdiag(X) for p in combination])
+ to_update = list(set(self.parts) - set(combination))[0]
+ to_update.update_gradients_diag(dL_dKdiag * prod, X)
def gradients_X(self, dL_dK, X, X2=None):
target = np.zeros(X.shape)
@@ -65,9 +82,10 @@ class Prod(CombinationKernel):
target += self.parts[0].gradients_X(dL_dK*self.parts[1].K(X, X2), X, X2)
target += self.parts[1].gradients_X(dL_dK*self.parts[0].K(X, X2), X, X2)
else:
- k = self.K(X,X2)*dL_dK
- for p in self.parts:
- target += p.gradients_X(k/p.K(X,X2),X,X2)
+ for combination in itertools.combinations(self.parts, len(self.parts) - 1):
+ prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
+ to_update = list(set(self.parts) - set(combination))[0]
+ target += to_update.gradients_X(dL_dK * prod, X, X2)
return target
def gradients_X_diag(self, dL_dKdiag, X):
@@ -80,3 +98,5 @@ class Prod(CombinationKernel):
for p in self.parts:
target += p.gradients_X_diag(k/p.Kdiag(X),X)
return target
+
+
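[Note] The numpy_invalid_op_as_exception decorator above turns numpy's invalid-operation warnings into exceptions for the duration of a call (as written it does not restore the previous error state if the wrapped call raises; a try/finally would make it safer). A usage sketch:

    import numpy as np

    @numpy_invalid_op_as_exception
    def normalised(x):
        return x / x.sum()

    try:
        normalised(np.zeros(3))  # 0/0 would normally warn and yield nan
    except FloatingPointError as e:
        print("caught invalid operation:", e)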
diff --git a/GPy/kern/_src/psi_comp/__init__.py b/GPy/kern/_src/psi_comp/__init__.py
index a277ff02..5041da50 100644
--- a/GPy/kern/_src/psi_comp/__init__.py
+++ b/GPy/kern/_src/psi_comp/__init__.py
@@ -4,10 +4,10 @@
from ....core.parameterization.parameter_core import Pickleable
from GPy.util.caching import Cache_this
from ....core.parameterization import variational
-import rbf_psi_comp
-import ssrbf_psi_comp
-import sslinear_psi_comp
-import linear_psi_comp
+from . import rbf_psi_comp
+from . import ssrbf_psi_comp
+from . import sslinear_psi_comp
+from . import linear_psi_comp
class PSICOMP_RBF(Pickleable):
@Cache_this(limit=2, ignore_args=(0,))
@@ -17,7 +17,7 @@ class PSICOMP_RBF(Pickleable):
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
else:
- raise ValueError, "unknown distriubtion received for psi-statistics"
+ raise ValueError("unknown distriubtion received for psi-statistics")
@Cache_this(limit=2, ignore_args=(0,1,2,3))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
@@ -26,7 +26,7 @@ class PSICOMP_RBF(Pickleable):
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
else:
- raise ValueError, "unknown distriubtion received for psi-statistics"
+ raise ValueError("unknown distriubtion received for psi-statistics")
def _setup_observers(self):
pass
@@ -40,7 +40,7 @@ class PSICOMP_Linear(Pickleable):
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return sslinear_psi_comp.psicomputations(variance, Z, variational_posterior)
else:
- raise ValueError, "unknown distriubtion received for psi-statistics"
+ raise ValueError("unknown distriubtion received for psi-statistics")
@Cache_this(limit=2, ignore_args=(0,1,2,3))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
@@ -49,7 +49,7 @@ class PSICOMP_Linear(Pickleable):
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
else:
- raise ValueError, "unknown distriubtion received for psi-statistics"
+ raise ValueError("unknown distriubtion received for psi-statistics")
def _setup_observers(self):
pass
\ No newline at end of file
diff --git a/GPy/kern/_src/psi_comp/sslinear_psi_comp.py b/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
index 5f261785..d431cd61 100644
--- a/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
@@ -37,11 +37,11 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variati
# Compute for psi0 and psi1
mu2S = np.square(mu)+S
- dL_dvar += np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu)
- dL_dgamma += np.einsum('n,q,nq->nq',dL_dpsi0,variance,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,variance,Z,mu)
- dL_dmu += np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*variance,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,variance,Z)
- dL_dS += np.einsum('n,nq,q->nq',dL_dpsi0,gamma,variance)
- dL_dZ += np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, variance,mu)
+ dL_dvar += (dL_dpsi0[:,None]*gamma*mu2S).sum(axis=0) + (dL_dpsi1.T.dot(gamma*mu)*Z).sum(axis=0)
+ dL_dgamma += dL_dpsi0[:,None]*variance*mu2S+ dL_dpsi1.dot(Z)*mu*variance
+ dL_dmu += dL_dpsi0[:,None]*2.*variance*gamma*mu + dL_dpsi1.dot(Z)*gamma*variance
+ dL_dS += dL_dpsi0[:,None]*variance*gamma
+ dL_dZ += dL_dpsi1.T.dot(gamma*mu)*variance
return dL_dvar, dL_dZ, dL_dmu, dL_dS, dL_dgamma
@@ -64,29 +64,23 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma):
gamma2 = np.square(gamma)
variance2 = np.square(variance)
mu2S = mu2+S # NxQ
- gvm = np.einsum('nq,nq,q->nq',gamma,mu,variance)
- common_sum = np.einsum('nq,mq->nm',gvm,Z)
-# common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM
- Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z)
+ gvm = gamma*mu*variance
+ common_sum = gvm.dot(Z.T)
+ Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
+ Z_expect_var2 = Z_expect*variance2
dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
- tmp = np.einsum('mo,oq->mq',dL_dpsi2T,Z)
- common_expect = np.einsum('mq,nm->nq',tmp,common_sum)
-# common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum)
- Z2_expect = np.einsum('om,nm->no',dL_dpsi2T,common_sum)
- Z1_expect = np.einsum('om,mq->oq',dL_dpsi2T,Z)
+ common_expect = common_sum.dot(dL_dpsi2T).dot(Z)
+ Z2_expect = common_sum.dot(dL_dpsi2T)
+ Z1_expect = dL_dpsi2T.dot(Z)
- dL_dvar = np.einsum('nq,q,q->q',2.*(gamma*mu2S-gamma2*mu2),variance,Z_expect)+\
- np.einsum('nq,nq,nq->q',common_expect,gamma,mu)
+ dL_dvar = variance*Z_expect*2.*(gamma*mu2S-gamma2*mu2).sum(axis=0)+(common_expect*gamma*mu).sum(axis=0)
- dL_dgamma = np.einsum('q,q,nq->nq',Z_expect,variance2,(mu2S-2.*gamma*mu2))+\
- np.einsum('nq,q,nq->nq',common_expect,variance,mu)
+ dL_dgamma = Z_expect_var2*(mu2S-2.*gamma*mu2)+common_expect*mu*variance
+
+ dL_dmu = Z_expect_var2*mu*2.*(gamma-gamma2) + common_expect*gamma*variance
+
+ dL_dS = gamma*Z_expect_var2
- dL_dmu = np.einsum('q,q,nq,nq->nq',Z_expect,variance2,mu,2.*(gamma-gamma2))+\
- np.einsum('nq,nq,q->nq',common_expect,gamma,variance)
-
- dL_dS = np.einsum('q,nq,q->nq',Z_expect,gamma,variance2)
-
-# dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,(mu2S-gamma*mu2))+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum))
- dL_dZ = Z1_expect*np.einsum('nq,q,nq->q',gamma,variance2,(mu2S-gamma*mu2))+np.einsum('nq,q,nq,nm->mq',gamma,variance,mu,Z2_expect)
+ dL_dZ = (gamma*(mu2S-gamma*mu2)).sum(axis=0)*variance2*Z1_expect+ Z2_expect.T.dot(gamma*mu)*variance
return dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ
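The rewrites above replace einsum contractions with equivalent dot/elementwise expressions, which dispatch to BLAS and are typically much faster. A small standalone check of one such equivalence (arbitrary shapes, illustrative only):

import numpy as np

N, M, Q = 5, 4, 3
dL_dpsi1 = np.random.rand(N, M)
gamma, mu = np.random.rand(N, Q), np.random.rand(N, Q)
Z = np.random.rand(M, Q)

# einsum form used before the change ...
a = np.einsum('nm,nq,mq,nq->q', dL_dpsi1, gamma, Z, mu)
# ... and the BLAS-friendly form used after it
b = (dL_dpsi1.T.dot(gamma*mu)*Z).sum(axis=0)
assert np.allclose(a, b)
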
diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
index 18a4d751..f6a24c86 100644
--- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
@@ -22,12 +22,14 @@ try:
# _psi1 NxM
mu = variational_posterior.mean
S = variational_posterior.variance
+ gamma = variational_posterior.binary_prob
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
l2 = np.square(lengthscale)
log_denom1 = np.log(S/l2+1)
log_denom2 = np.log(2*S/l2+1)
- log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+ log_gamma = np.log(gamma)
+ log_gamma1 = np.log(1.-gamma)
variance = float(variance)
psi0 = np.empty(N)
psi0[:] = variance
@@ -37,6 +39,7 @@ try:
from ....util.misc import param_to_array
S = param_to_array(S)
mu = param_to_array(mu)
+ gamma = param_to_array(gamma)
Z = param_to_array(Z)
support_code = """
@@ -79,7 +82,7 @@ try:
}
}
"""
- weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
+ weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
psi2 = psi2n.sum(axis=0)
return psi0,psi1,psi2,psi2n
@@ -94,12 +97,13 @@ try:
mu = variational_posterior.mean
S = variational_posterior.variance
+ gamma = variational_posterior.binary_prob
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
l2 = np.square(lengthscale)
log_denom1 = np.log(S/l2+1)
log_denom2 = np.log(2*S/l2+1)
- log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
- gamma, gamma1 = variational_posterior.gamma_probabilities()
+ log_gamma = np.log(gamma)
+ log_gamma1 = np.log(1.-gamma)
variance = float(variance)
dvar = np.zeros(1)
@@ -113,6 +117,7 @@ try:
from ....util.misc import param_to_array
S = param_to_array(S)
mu = param_to_array(mu)
+ gamma = param_to_array(gamma)
Z = param_to_array(Z)
support_code = """
@@ -130,7 +135,6 @@ try:
double Zm1q = Z(m1,q);
double Zm2q = Z(m2,q);
double gnq = gamma(n,q);
- double g1nq = gamma1(n,q);
double mu_nq = mu(n,q);
if(m2==0) {
@@ -156,7 +160,7 @@ try:
dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum);
dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.;
- dgamma(n,q) += lpsi1*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
+ dgamma(n,q) += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
dl(q) += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zm1q*Zm1q/(lq*lq)*d_exp2)/(2.*exp_sum);
dZ(m1,q) += lpsi1*(-Zmu/denom*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
}
@@ -184,7 +188,7 @@ try:
dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum;
dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
- dgamma(n,q) += lpsi2*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
+ dgamma(n,q) += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
}
@@ -192,7 +196,7 @@ try:
}
}
"""
- weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','gamma1','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
+ weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
dl *= 2.*lengthscale
if not ARD:
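The dgamma lines above differentiate the psi-statistics directly with respect to gamma rather than through the old (gamma, gamma1) weighting. Reading the two competing terms in each exponent sum as

    d_exp1 = exp(log(gamma) + a)   and   d_exp2 = exp(log(1 - gamma) + b),

where a and b stand for the remaining gamma-free parts of the exponents (an assumption about the off-screen weave code), the gamma-derivative follows as

    d/dgamma [d_exp1 + d_exp2] = exp(a) - exp(b) = d_exp1/gamma - d_exp2/(1 - gamma),

which is the (d_exp1/gnq - d_exp2/(1.-gnq)) factor now used in both weave kernels.
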
diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py
index 0c6a4aef..c6998370 100644
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@@ -3,9 +3,9 @@
import numpy as np
-from stationary import Stationary
-from psi_comp import PSICOMP_RBF
-from psi_comp.rbf_psi_gpucomp import PSICOMP_RBF_GPU
+from .stationary import Stationary
+from .psi_comp import PSICOMP_RBF
+from .psi_comp.rbf_psi_gpucomp import PSICOMP_RBF_GPU
from ...util.config import *
class RBF(Stationary):
diff --git a/GPy/kern/_src/splitKern.py b/GPy/kern/_src/splitKern.py
index 27e4f76b..c131dcd8 100644
--- a/GPy/kern/_src/splitKern.py
+++ b/GPy/kern/_src/splitKern.py
@@ -3,11 +3,11 @@ A new kernel
"""
import numpy as np
-from kern import Kern,CombinationKernel
+from .kern import Kern,CombinationKernel
from .independent_outputs import index_to_slices
import itertools
-class DiffGenomeKern(Kern):
+class DEtime(Kern):
def __init__(self, kernel, idx_p, Xp, index_dim=-1, name='DiffGenomeKern'):
self.idx_p = idx_p
@@ -104,7 +104,7 @@ class SplitKern(CombinationKernel):
assert len(slices2)<=2, 'The Split kernel only support two different indices'
target = np.zeros((X.shape[0], X2.shape[0]))
# diagonal blocks
- [[target.__setitem__((s,s2), self.kern.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[i], slices2[i])] for i in xrange(min(len(slices),len(slices2)))]
+ [[target.__setitem__((s,s2), self.kern.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[i], slices2[i])] for i in range(min(len(slices),len(slices2)))]
if len(slices)>1:
[target.__setitem__((s,s2), self.kern_cross.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[1], slices2[0])]
if len(slices2)>1:
@@ -135,7 +135,7 @@ class SplitKern(CombinationKernel):
else:
assert dL_dK.shape==(X.shape[0],X2.shape[0])
slices2 = index_to_slices(X2[:,self.index_dim])
- [[collate_grads(dL_dK[s,s2],X[s],X2[s2]) for s,s2 in itertools.product(slices[i], slices2[i])] for i in xrange(min(len(slices),len(slices2)))]
+ [[collate_grads(dL_dK[s,s2],X[s],X2[s2]) for s,s2 in itertools.product(slices[i], slices2[i])] for i in range(min(len(slices),len(slices2)))]
if len(slices)>1:
[collate_grads(dL_dK[s,s2], X[s], X2[s2], True) for s,s2 in itertools.product(slices[1], slices2[0])]
if len(slices2)>1:
diff --git a/GPy/kern/_src/static.py b/GPy/kern/_src/static.py
index f4223bf4..64d14018 100644
--- a/GPy/kern/_src/static.py
+++ b/GPy/kern/_src/static.py
@@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from kern import Kern
+from .kern import Kern
import numpy as np
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
@@ -60,7 +60,10 @@ class White(Static):
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
def update_gradients_full(self, dL_dK, X, X2=None):
- self.variance.gradient = np.trace(dL_dK)
+ if X2 is None:
+ self.variance.gradient = np.trace(dL_dK)
+ else:
+ self.variance.gradient = 0.
def update_gradients_diag(self, dL_dKdiag, X):
self.variance.gradient = dL_dKdiag.sum()
@@ -106,7 +109,7 @@ class Fixed(Static):
return self.variance * self.fixed_K
def Kdiag(self, X):
- return self.variance * self.fixed_K.diag()
+ return self.variance * self.fixed_K.diagonal()
def update_gradients_full(self, dL_dK, X, X2=None):
self.variance.gradient = np.einsum('ij,ij', dL_dK, self.fixed_K)
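The White-kernel fix above reflects that K(X, X2) is identically zero for distinct inputs, so only the symmetric X == X2 case contributes a variance gradient; there dK/dvariance is the identity, and the general chain rule collapses to a trace. A standalone numerical check (plain numpy, illustrative only):

import numpy as np

N = 4
dL_dK = np.random.rand(N, N)
dK_dvar = np.eye(N)                        # White: K = variance * I when X == X2
grad = np.sum(dL_dK * dK_dvar)             # general chain rule
assert np.allclose(grad, np.trace(dL_dK))  # collapses to the trace
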
diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py
index 06671b23..6bc4b304 100644
--- a/GPy/kern/_src/stationary.py
+++ b/GPy/kern/_src/stationary.py
@@ -2,16 +2,21 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from kern import Kern
+from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
from ...util.linalg import tdot
from ... import util
import numpy as np
-from scipy import integrate, weave
+from scipy import integrate
from ...util.config import config # for assesing whether to use weave
from ...util.caching import Cache_this
+try:
+ from scipy import weave
+except ImportError:
+ config.set('weave', 'working', 'False')
+
class Stationary(Kern):
"""
Stationary kernels (covariance functions).
@@ -65,10 +70,10 @@ class Stationary(Kern):
self.link_parameters(self.variance, self.lengthscale)
def K_of_r(self, r):
- raise NotImplementedError, "implement the covariance function as a fn of r to use this class"
+ raise NotImplementedError("implement the covariance function as a fn of r to use this class")
def dK_dr(self, r):
- raise NotImplementedError, "implement derivative of the covariance function wrt r to use this class"
+ raise NotImplementedError("implement derivative of the covariance function wrt r to use this class")
@Cache_this(limit=5, ignore_args=())
def K(self, X, X2=None):
@@ -165,11 +170,11 @@ class Stationary(Kern):
try:
self.lengthscale.gradient = self.weave_lengthscale_grads(tmp, X, X2)
except:
- print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
+ print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
config.set('weave', 'working', 'False')
- self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
+ self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in range(self.input_dim)])
else:
- self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
+ self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in range(self.input_dim)])
else:
r = self._scaled_dist(X, X2)
self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale
@@ -214,7 +219,7 @@ class Stationary(Kern):
try:
return self.gradients_X_weave(dL_dK, X, X2)
except:
- print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
+ print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
config.set('weave', 'working', 'False')
return self.gradients_X_(dL_dK, X, X2)
else:
@@ -234,7 +239,7 @@ class Stationary(Kern):
#the lower memory way with a loop
ret = np.empty(X.shape, dtype=np.float64)
- for q in xrange(self.input_dim):
+ for q in range(self.input_dim):
np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=ret[:,q])
ret /= self.lengthscale**2
@@ -296,6 +301,8 @@ class Exponential(Stationary):
return -0.5*self.K_of_r(r)
+
+
class OU(Stationary):
"""
OU kernel:
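The lengthscale-gradient einsum above uses a trailing '...' operand to fold the per-dimension scalar into the contraction; it is equivalent to an elementwise product-sum scaled afterwards. A quick standalone check of the pattern (arbitrary shapes):

import numpy as np

tmp = np.random.rand(5, 5)
sq = np.random.rand(5, 5)
scale = np.float64(-2.0)    # stands in for -1./lengthscale[q]**3
a = np.einsum('ij,ij,...', tmp, sq, scale)
b = (tmp * sq).sum() * scale
assert np.allclose(a, b)
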
diff --git a/GPy/kern/_src/symbolic.py b/GPy/kern/_src/symbolic.py
index 006af9dc..c339893a 100644
--- a/GPy/kern/_src/symbolic.py
+++ b/GPy/kern/_src/symbolic.py
@@ -1,7 +1,7 @@
# Check Matthew Rocklin's blog post.
import sympy as sym
import numpy as np
-from kern import Kern
+from .kern import Kern
from ...core.symbolic import Symbolic_core
@@ -11,7 +11,7 @@ class Symbolic(Kern, Symbolic_core):
def __init__(self, input_dim, k=None, output_dim=1, name='symbolic', parameters=None, active_dims=None, operators=None, func_modules=[]):
if k is None:
- raise ValueError, "You must provide an argument for the covariance function."
+ raise ValueError("You must provide an argument for the covariance function.")
Kern.__init__(self, input_dim, active_dims, name=name)
kdiag = k
diff --git a/GPy/kern/_src/trunclinear.py b/GPy/kern/_src/trunclinear.py
index 4ebd51b6..8c48f134 100644
--- a/GPy/kern/_src/trunclinear.py
+++ b/GPy/kern/_src/trunclinear.py
@@ -3,7 +3,7 @@
import numpy as np
-from kern import Kern
+from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
from ...util.caching import Cache_this
diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py
index 28e44541..3157bd5b 100644
--- a/GPy/likelihoods/__init__.py
+++ b/GPy/likelihoods/__init__.py
@@ -1,8 +1,10 @@
-from bernoulli import Bernoulli
-from exponential import Exponential
-from gaussian import Gaussian
-from gamma import Gamma
-from poisson import Poisson
-from student_t import StudentT
-from likelihood import Likelihood
-from mixed_noise import MixedNoise
+from .bernoulli import Bernoulli
+from .exponential import Exponential
+from .gaussian import Gaussian
+from .gamma import Gamma
+from .poisson import Poisson
+from .student_t import StudentT
+from .likelihood import Likelihood
+from .mixed_noise import MixedNoise
+from .binomial import Binomial
+
diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py
index 26de274b..e540f016 100644
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@@ -3,9 +3,8 @@
import numpy as np
from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
-import link_functions
-from likelihood import Likelihood
-from scipy import stats
+from . import link_functions
+from .likelihood import Likelihood
class Bernoulli(Likelihood):
"""
@@ -77,23 +76,22 @@ class Bernoulli(Likelihood):
return Z_hat, mu_hat, sigma2_hat
- def variational_expectations(self, Y, m, v, gh_points=None):
+ def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
if isinstance(self.gp_link, link_functions.Probit):
if gh_points is None:
- gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
+ gh_x, gh_w = self._gh_points()
else:
gh_x, gh_w = gh_points
- from scipy import stats
shape = m.shape
m,v,Y = m.flatten(), v.flatten(), Y.flatten()
Ysign = np.where(Y==1,1,-1)
X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + (m*Ysign)[:,None]
- p = stats.norm.cdf(X)
+ p = std_norm_cdf(X)
p = np.clip(p, 1e-9, 1.-1e-9) # for numerical stability
- N = stats.norm.pdf(X)
+ N = std_norm_pdf(X)
F = np.log(p).dot(gh_w)
NoverP = N/p
dF_dm = (NoverP*Ysign[:,None]).dot(gh_w)
@@ -106,10 +104,10 @@ class Bernoulli(Likelihood):
def predictive_mean(self, mu, variance, Y_metadata=None):
if isinstance(self.gp_link, link_functions.Probit):
- return stats.norm.cdf(mu/np.sqrt(1+variance))
+ return std_norm_cdf(mu/np.sqrt(1+variance))
elif isinstance(self.gp_link, link_functions.Heaviside):
- return stats.norm.cdf(mu/np.sqrt(variance))
+ return std_norm_cdf(mu/np.sqrt(variance))
else:
raise NotImplementedError
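predictive_mean above relies on the standard probit-Gaussian identity E[Phi(f)] = Phi(mu/sqrt(1+v)) for f ~ N(mu, v); the std_norm_cdf swap is just a cheaper evaluation of the same quantity. A standalone quadrature check of the identity:

import numpy as np
from scipy.stats import norm
from scipy.integrate import quad

mu, v = 0.4, 2.0
lhs, _ = quad(lambda f: norm.cdf(f) * norm.pdf(f, mu, np.sqrt(v)),
              -np.inf, np.inf)
rhs = norm.cdf(mu / np.sqrt(1. + v))
assert np.allclose(lhs, rhs)
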
diff --git a/GPy/likelihoods/binomial.py b/GPy/likelihoods/binomial.py
new file mode 100644
index 00000000..22009968
--- /dev/null
+++ b/GPy/likelihoods/binomial.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
+from . import link_functions
+from .likelihood import Likelihood
+from scipy import special
+
+class Binomial(Likelihood):
+ """
+ Binomial likelihood
+
+ .. math::
+        p(y_{i}|\\lambda(f_{i})) = \\binom{N_{i}}{y_{i}}\\lambda(f_{i})^{y_{i}}(1-\\lambda(f_{i}))^{N_{i}-y_{i}}
+
+ .. Note::
+        Y takes integer values in {0, 1, ..., N}, where N is the number of
+        trials carried in Y_metadata['trials'].
+        The link function should map into [0, 1], e.g. probit (default) or Heaviside
+
+ .. See also::
+ likelihood.py, for the parent class
+ """
+ def __init__(self, gp_link=None):
+ if gp_link is None:
+ gp_link = link_functions.Probit()
+
+ super(Binomial, self).__init__(gp_link, 'Binomial')
+
+ def conditional_mean(self, gp, Y_metadata):
+        return self.gp_link.transf(gp)*Y_metadata['trials']
+
+ def pdf_link(self, inv_link_f, y, Y_metadata):
+ """
+ Likelihood function given inverse link of f.
+
+ .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\binom{N_{i}}{y_{i}}\\lambda(f_{i})^{y_{i}}(1-\\lambda(f_{i}))^{N_{i}-y_{i}}
+
+ :param inv_link_f: latent variables inverse link of f.
+ :type inv_link_f: Nx1 array
+ :param y: data
+ :type y: Nx1 array
+ :param Y_metadata: Y_metadata must contain 'trials'
+ :returns: likelihood evaluated for this point
+ :rtype: float
+
+        .. Note:
+            Each y_i must be a non-negative integer no greater than its number of trials N_i
+ """
+ return np.exp(self.logpdf_link(inv_link_f, y, Y_metadata))
+
+ def logpdf_link(self, inv_link_f, y, Y_metadata=None):
+ """
+ Log Likelihood function given inverse link of f.
+
+ .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = \\ln\\binom{N_{i}}{y_{i}} + y_{i}\\log\\lambda(f_{i}) + (N_{i}-y_{i})\\log(1-\\lambda(f_{i}))
+
+ :param inv_link_f: latent variables inverse link of f.
+ :type inv_link_f: Nx1 array
+ :param y: data
+ :type y: Nx1 array
+ :param Y_metadata: Y_metadata must contain 'trials'
+ :returns: log likelihood evaluated at points inverse link of f.
+ :rtype: float
+ """
+ N = Y_metadata['trials']
+ nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1)
+
+ return nchoosey + y*np.log(inv_link_f) + (N-y)*np.log(1.-inv_link_f)
+
+ def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
+ """
+        Gradient of the log pdf at y, given the inverse link of f, w.r.t. the inverse link of f.
+
+ :param inv_link_f: latent variables inverse link of f.
+ :type inv_link_f: Nx1 array
+ :param y: data
+ :type y: Nx1 array
+ :param Y_metadata: Y_metadata must contain 'trials'
+ :returns: gradient of log likelihood evaluated at points inverse link of f.
+ :rtype: Nx1 array
+ """
+ N = Y_metadata['trials']
+ return y/inv_link_f - (N-y)/(1-inv_link_f)
+
+ def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
+ """
+        Hessian at y, given inv_link_f, w.r.t. inv_link_f. The hessian will be 0 unless i == j,
+        i.e. it is the second derivative of the logpdf at y, given the inverse link of f_i, w.r.t. the inverse link of f_i.
+
+
+ .. math::
+            \\frac{d^{2}\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)^{2}} = \\frac{-y_{i}}{\\lambda(f)^{2}} - \\frac{(N_{i}-y_{i})}{(1-\\lambda(f))^{2}}
+
+ :param inv_link_f: latent variables inverse link of f.
+ :type inv_link_f: Nx1 array
+ :param y: data
+ :type y: Nx1 array
+        :param Y_metadata: Y_metadata must contain 'trials'
+        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points inverse link of f)
+ :rtype: Nx1 array
+
+ .. Note::
+            Will return diagonal of hessian, since everywhere else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on the inverse link of f_i, not on the inverse link of f_(j!=i))
+ """
+ N = Y_metadata['trials']
+ return -y/np.square(inv_link_f) - (N-y)/np.square(1-inv_link_f)
+
+ def samples(self, gp, Y_metadata=None):
+ """
+ Returns a set of samples of observations based on a given value of the latent variable.
+
+ :param gp: latent variable
+ """
+ orig_shape = gp.shape
+ gp = gp.flatten()
+ N = Y_metadata['trials']
+ Ysim = np.random.binomial(N, self.gp_link.transf(gp))
+ return Ysim.reshape(orig_shape)
+
+ def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
+ pass
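The gammaln-based log binomial coefficient in logpdf_link above matches scipy's reference implementation; a minimal standalone check (toy numbers, not GPy API):

import numpy as np
from scipy import special, stats

N = np.array([[10.], [20.]])     # trials, as carried in Y_metadata['trials']
y = np.array([[3.], [14.]])      # observed successes
p = np.array([[0.3], [0.6]])     # inv_link_f, i.e. success probabilities

nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1)
logpdf = nchoosey + y*np.log(p) + (N-y)*np.log(1.-p)
assert np.allclose(logpdf, stats.binom.logpmf(y, N, p))
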
diff --git a/GPy/likelihoods/exponential.py b/GPy/likelihoods/exponential.py
index 8110c7d4..0a6c543d 100644
--- a/GPy/likelihoods/exponential.py
+++ b/GPy/likelihoods/exponential.py
@@ -5,8 +5,8 @@
import numpy as np
from scipy import stats,special
import scipy as sp
-import link_functions
-from likelihood import Likelihood
+from . import link_functions
+from .likelihood import Likelihood
class Exponential(Likelihood):
"""
@@ -57,9 +57,8 @@ class Exponential(Likelihood):
:rtype: float
"""
- assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
log_objective = np.log(link_f) - y*link_f
- return np.sum(log_objective)
+ return log_objective
def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
"""
@@ -77,7 +76,6 @@ class Exponential(Likelihood):
:rtype: Nx1 array
"""
- assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
grad = 1./link_f - y
#grad = y/(link_f**2) - 1./link_f
return grad
@@ -103,7 +101,6 @@ class Exponential(Likelihood):
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
"""
- assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
hess = -1./(link_f**2)
#hess = -2*y/(link_f**3) + 1/(link_f**2)
return hess
@@ -123,7 +120,6 @@ class Exponential(Likelihood):
:returns: third derivative of likelihood evaluated at points f
:rtype: Nx1 array
"""
- assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
d3lik_dlink3 = 2./(link_f**3)
#d3lik_dlink3 = 6*y/(link_f**4) - 2./(link_f**3)
return d3lik_dlink3
diff --git a/GPy/likelihoods/gamma.py b/GPy/likelihoods/gamma.py
index c79e196c..79aba4a5 100644
--- a/GPy/likelihoods/gamma.py
+++ b/GPy/likelihoods/gamma.py
@@ -6,8 +6,8 @@ import numpy as np
from scipy import stats,special
import scipy as sp
from ..core.parameterization import Param
-import link_functions
-from likelihood import Likelihood
+from . import link_functions
+from .likelihood import Likelihood
class Gamma(Likelihood):
"""
@@ -66,12 +66,11 @@ class Gamma(Likelihood):
:rtype: float
"""
- assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
#alpha = self.gp_link.transf(gp)*self.beta
#return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
alpha = link_f*self.beta
log_objective = alpha*np.log(self.beta) - np.log(special.gamma(alpha)) + (alpha - 1)*np.log(y) - self.beta*y
- return np.sum(log_objective)
+ return log_objective
def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
"""
@@ -90,7 +89,6 @@ class Gamma(Likelihood):
:rtype: Nx1 array
"""
- assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
grad = self.beta*np.log(self.beta*y) - special.psi(self.beta*link_f)*self.beta
#old
#return -self.gp_link.dtransf_df(gp)*self.beta*np.log(obs) + special.psi(self.gp_link.transf(gp)*self.beta) * self.gp_link.dtransf_df(gp)*self.beta
@@ -118,7 +116,6 @@ class Gamma(Likelihood):
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
"""
- assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
hess = -special.polygamma(1, self.beta*link_f)*(self.beta**2)
#old
#return -self.gp_link.d2transf_df2(gp)*self.beta*np.log(obs) + special.polygamma(1,self.gp_link.transf(gp)*self.beta)*(self.gp_link.dtransf_df(gp)*self.beta)**2 + special.psi(self.gp_link.transf(gp)*self.beta)*self.gp_link.d2transf_df2(gp)*self.beta
@@ -140,6 +137,5 @@ class Gamma(Likelihood):
:returns: third derivative of likelihood evaluated at points f
:rtype: Nx1 array
"""
- assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
d3lik_dlink3 = -special.polygamma(2, self.beta*link_f)*(self.beta**3)
return d3lik_dlink3
diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index a6e5b7e0..9abb8cde 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -13,8 +13,8 @@ James 11/12/13
import numpy as np
from scipy import stats, special
-import link_functions
-from likelihood import Likelihood
+from . import link_functions
+from .likelihood import Likelihood
from ..core.parameterization import Param
from ..core.parameterization.transformations import Logexp
from scipy import stats
@@ -34,7 +34,9 @@ class Gaussian(Likelihood):
if gp_link is None:
gp_link = link_functions.Identity()
- assert isinstance(gp_link, link_functions.Identity), "the likelihood only implemented for the identity link"
+ if not isinstance(gp_link, link_functions.Identity):
+ print("Warning, Exact inference is not implemeted for non-identity link functions,\
+ if you are not already, ensure Laplace inference_method is used")
super(Gaussian, self).__init__(gp_link, name=name)
@@ -130,11 +132,8 @@ class Gaussian(Likelihood):
:returns: log likelihood evaluated for this point
:rtype: float
"""
- assert np.asarray(link_f).shape == np.asarray(y).shape
- N = y.shape[0]
- ln_det_cov = N*np.log(self.variance)
-
- return -0.5*(np.sum((y-link_f)**2/self.variance) + ln_det_cov + N*np.log(2.*np.pi))
+ ln_det_cov = np.log(self.variance)
+ return -(1.0/(2*self.variance))*((y-link_f)**2) - 0.5*ln_det_cov - 0.5*np.log(2.*np.pi)
def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
"""
@@ -151,8 +150,7 @@ class Gaussian(Likelihood):
:returns: gradient of log likelihood evaluated at points link(f)
:rtype: Nx1 array
"""
- assert np.asarray(link_f).shape == np.asarray(y).shape
- s2_i = (1.0/self.variance)
+ s2_i = 1.0/self.variance
grad = s2_i*y - s2_i*link_f
return grad
@@ -178,9 +176,9 @@ class Gaussian(Likelihood):
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
"""
- assert np.asarray(link_f).shape == np.asarray(y).shape
N = y.shape[0]
- hess = -(1.0/self.variance)*np.ones((N, 1))
+ D = link_f.shape[1]
+ hess = -(1.0/self.variance)*np.ones((N, D))
return hess
def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
@@ -198,9 +196,9 @@ class Gaussian(Likelihood):
:returns: third derivative of log likelihood evaluated at points link(f)
:rtype: Nx1 array
"""
- assert np.asarray(link_f).shape == np.asarray(y).shape
N = y.shape[0]
- d3logpdf_dlink3 = np.zeros((N,1))
+ D = link_f.shape[1]
+ d3logpdf_dlink3 = np.zeros((N,D))
return d3logpdf_dlink3
def dlogpdf_link_dvar(self, link_f, y, Y_metadata=None):
@@ -218,12 +216,10 @@ class Gaussian(Likelihood):
:returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
:rtype: float
"""
- assert np.asarray(link_f).shape == np.asarray(y).shape
e = y - link_f
s_4 = 1.0/(self.variance**2)
- N = y.shape[0]
- dlik_dsigma = -0.5*N/self.variance + 0.5*s_4*np.sum(np.square(e))
- return np.sum(dlik_dsigma) # Sure about this sum?
+ dlik_dsigma = -0.5/self.variance + 0.5*s_4*np.square(e)
+ return dlik_dsigma
def dlogpdf_dlink_dvar(self, link_f, y, Y_metadata=None):
"""
@@ -240,7 +236,6 @@ class Gaussian(Likelihood):
:returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
:rtype: Nx1 array
"""
- assert np.asarray(link_f).shape == np.asarray(y).shape
s_4 = 1.0/(self.variance**2)
dlik_grad_dsigma = -s_4*y + s_4*link_f
return dlik_grad_dsigma
@@ -260,23 +255,26 @@ class Gaussian(Likelihood):
:returns: derivative of log hessian evaluated at points link(f_i) and link(f_j) w.r.t variance parameter
:rtype: Nx1 array
"""
- assert np.asarray(link_f).shape == np.asarray(y).shape
s_4 = 1.0/(self.variance**2)
N = y.shape[0]
- d2logpdf_dlink2_dvar = np.ones((N,1))*s_4
+ D = link_f.shape[1]
+ d2logpdf_dlink2_dvar = np.ones((N, D))*s_4
return d2logpdf_dlink2_dvar
def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
- dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
- return np.asarray([[dlogpdf_dvar]])
+ dlogpdf_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+ dlogpdf_dtheta[0,:,:] = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
+ return dlogpdf_dtheta
def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
- dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
- return dlogpdf_dlink_dvar
+ dlogpdf_dlink_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+ dlogpdf_dlink_dtheta[0, :, :]= self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
+ return dlogpdf_dlink_dtheta
def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
- d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
- return d2logpdf_dlink2_dvar
+ d2logpdf_dlink2_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+ d2logpdf_dlink2_dtheta[0, :, :] = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
+ return d2logpdf_dlink2_dtheta
def _mean(self, gp):
"""
@@ -309,18 +307,17 @@ class Gaussian(Likelihood):
Ysim = np.array([np.random.normal(self.gp_link.transf(gpj), scale=np.sqrt(self.variance), size=1) for gpj in gp])
return Ysim.reshape(orig_shape)
- def log_predictive_density(self, y_test, mu_star, var_star):
+ def log_predictive_density(self, y_test, mu_star, var_star, Y_metadata=None):
"""
assumes independence
"""
v = var_star + self.variance
return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v
- def variational_expectations(self, Y, m, v, gh_points=None):
+ def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
lik_var = float(self.variance)
F = -0.5*np.log(2*np.pi) -0.5*np.log(lik_var) - 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/lik_var
dF_dmu = (Y - m)/lik_var
dF_dv = np.ones_like(v)*(-0.5/lik_var)
- dF_dlik_var = np.sum(-0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2))
- dF_dtheta = [dF_dlik_var]
- return F, dF_dmu, dF_dv, dF_dtheta
+ dF_dtheta = -0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2)
+ return F, dF_dmu, dF_dv, dF_dtheta.reshape(1, Y.shape[0], Y.shape[1])
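logpdf_link above now returns the per-point log density rather than a summed scalar (summation is deferred to logpdf_sum), which the 3-D dtheta stacking relies on. A standalone check of the per-point form against scipy:

import numpy as np
from scipy import stats

variance = 0.5
y = np.array([[0.1], [1.2], [-0.3]])
link_f = np.array([[0.0], [1.0], [0.0]])

per_point = (-(1.0/(2*variance))*(y - link_f)**2
             - 0.5*np.log(variance) - 0.5*np.log(2.*np.pi))
assert np.allclose(per_point,
                   stats.norm.logpdf(y, loc=link_f, scale=np.sqrt(variance)))
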
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index 790c6ba4..5388526e 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
+# Copyright (c) 2012-2015 The GPy authors (see AUTHORS.txt)
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import stats,special
import scipy as sp
-import link_functions
-from ..util.misc import chain_1, chain_2, chain_3
+from . import link_functions
+from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian, safe_exp
from scipy.integrate import quad
import warnings
from ..core.parameterization import Parameterized
@@ -39,6 +39,7 @@ class Likelihood(Parameterized):
assert isinstance(gp_link,link_functions.GPTransformation), "gp_link is not a valid GPTransformation."
self.gp_link = gp_link
self.log_concave = False
+ self.not_block_really = False
def _gradients(self,partial):
return np.zeros(0)
@@ -69,7 +70,7 @@ class Likelihood(Parameterized):
"""
raise NotImplementedError
- def log_predictive_density(self, y_test, mu_star, var_star):
+ def log_predictive_density(self, y_test, mu_star, var_star, Y_metadata=None):
"""
Calculation of the log predictive density
@@ -86,15 +87,51 @@ class Likelihood(Parameterized):
assert y_test.shape==mu_star.shape
assert y_test.shape==var_star.shape
assert y_test.shape[1] == 1
- def integral_generator(y, m, v):
- """Generate a function which can be integrated to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
- def f(f_star):
- return self.pdf(f_star, y)*np.exp(-(1./(2*v))*np.square(m-f_star))
+
+ flat_y_test = y_test.flatten()
+ flat_mu_star = mu_star.flatten()
+ flat_var_star = var_star.flatten()
+
+ if Y_metadata is not None:
+            #Need to zip individual elements of Y_metadata as well
+ Y_metadata_flat = {}
+            for key, val in Y_metadata.items():
+                Y_metadata_flat[key] = np.atleast_1d(val).reshape(-1,1)
+
+ zipped_values = []
+
+ for i in range(y_test.shape[0]):
+ y_m = {}
+ for key, val in Y_metadata_flat.items():
+ if np.isscalar(val) or val.shape[0] == 1:
+ y_m[key] = val
+ else:
+ #Won't broadcast yet
+ y_m[key] = val[i]
+ zipped_values.append((flat_y_test[i], flat_mu_star[i], flat_var_star[i], y_m))
+ else:
+ #Otherwise just pass along None's
+ zipped_values = zip(flat_y_test, flat_mu_star, flat_var_star, [None]*y_test.shape[0])
+
+ def integral_generator(yi, mi, vi, yi_m):
+ """Generate a function which can be integrated
+ to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
+ def f(fi_star):
+ #exponent = np.exp(-(1./(2*v))*np.square(m-f_star))
+ #from GPy.util.misc import safe_exp
+ #exponent = safe_exp(exponent)
+ #return self.pdf(f_star, y, y_m)*exponent
+
+ #More stable in the log space
+ return np.exp(self.logpdf(fi_star, yi, yi_m)
+ - 0.5*np.log(2*np.pi*vi)
+ - 0.5*np.square(mi-fi_star)/vi)
return f
- scaled_p_ystar, accuracy = zip(*[quad(integral_generator(y, m, v), -np.inf, np.inf) for y, m, v in zip(y_test.flatten(), mu_star.flatten(), var_star.flatten())])
- scaled_p_ystar = np.array(scaled_p_ystar).reshape(-1,1)
- p_ystar = scaled_p_ystar/np.sqrt(2*np.pi*var_star)
+ p_ystar, _ = zip(*[quad(integral_generator(yi, mi, vi, yi_m), -np.inf, np.inf)
+ for yi, mi, vi, yi_m in zipped_values])
+ p_ystar = np.array(p_ystar).reshape(-1, 1)
return np.log(p_ystar)
def _moments_match_ep(self,obs,tau,v):
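Folding the Gaussian normaliser into the exponent above keeps the integrand finite where the old pdf-times-exponential product could under- or overflow. For a Gaussian likelihood the integral has a closed form, which makes a convenient standalone check of the log-space integrand (toy values; s2 stands in for the likelihood noise):

import numpy as np
from scipy.integrate import quad

s2, yi, mi, vi = 0.1, 0.3, 0.0, 0.5      # noise, test point, posterior mean/var
logpdf = lambda f: -0.5*np.log(2*np.pi*s2) - 0.5*(yi - f)**2/s2

integrand = lambda f: np.exp(logpdf(f)
                             - 0.5*np.log(2*np.pi*vi)
                             - 0.5*(mi - f)**2/vi)
p, _ = quad(integrand, -np.inf, np.inf)

# closed form: convolution of two Gaussians gives N(yi; mi, vi + s2)
closed = np.exp(-0.5*np.log(2*np.pi*(vi + s2)) - 0.5*(yi - mi)**2/(vi + s2))
assert np.allclose(p, closed)
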
@@ -131,7 +168,14 @@ class Likelihood(Parameterized):
return z, mean, variance
- def variational_expectations(self, Y, m, v, gh_points=None):
+ #only compute gh points if required
+ __gh_points = None
+ def _gh_points(self):
+ if self.__gh_points is None:
+ self.__gh_points = np.polynomial.hermite.hermgauss(20)
+ return self.__gh_points
+
+ def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
"""
Use Gauss-Hermite Quadrature to compute
@@ -143,10 +187,9 @@ class Likelihood(Parameterized):
if no gh_points are passed, we construct them using defualt options
"""
- #May be broken
if gh_points is None:
- gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
+ gh_x, gh_w = self._gh_points()
else:
gh_x, gh_w = gh_points
@@ -158,9 +201,9 @@ class Likelihood(Parameterized):
#evaluate the likelhood for the grid. First ax indexes the data (and mu, var) and the second indexes the grid.
# broadcast needs to be handled carefully.
- logp = self.logpdf(X,Y[:,None])
- dlogp_dx = self.dlogpdf_df(X, Y[:,None])
- d2logp_dx2 = self.d2logpdf_df2(X, Y[:,None])
+ logp = self.logpdf(X,Y[:,None], Y_metadata=Y_metadata)
+ dlogp_dx = self.dlogpdf_df(X, Y[:,None], Y_metadata=Y_metadata)
+ d2logp_dx2 = self.d2logpdf_df2(X, Y[:,None], Y_metadata=Y_metadata)
#clipping for numerical stability
#logp = np.clip(logp,-1e9,1e9)
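The Gauss-Hermite scheme above evaluates E_{f~N(m,v)}[g(f)] as sum_i w_i * g(sqrt(2v)*x_i + m) / sqrt(pi); for polynomial integrands it is exact, which gives a cheap standalone sanity check:

import numpy as np

gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
m, v = 0.7, 0.25
X = gh_x*np.sqrt(2.*v) + m          # same change of variables as above
est = gh_w.dot(X**2) / np.sqrt(np.pi)
assert np.allclose(est, m**2 + v)   # E[f^2] = m^2 + v, exactly
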
@@ -177,7 +220,12 @@ class Likelihood(Parameterized):
if np.any(np.isnan(dF_dm)) or np.any(np.isinf(dF_dm)):
stop
- dF_dtheta = None # Not yet implemented
+ if self.size:
+ dF_dtheta = self.dlogpdf_dtheta(X, Y[:,None]) # Ntheta x (orig size) x N_{quad_points}
+ dF_dtheta = np.dot(dF_dtheta, gh_w)
+ dF_dtheta = dF_dtheta.reshape(self.size, shape[0], shape[1])
+ else:
+ dF_dtheta = None # Not yet implemented
return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), dF_dtheta
def predictive_mean(self, mu, variance, Y_metadata=None):
@@ -189,28 +237,35 @@ class Likelihood(Parameterized):
"""
#conditional_mean: the edpected value of y given some f, under this likelihood
+ fmin = -np.inf
+ fmax = np.inf
def int_mean(f,m,v):
- p = np.exp(-(0.5/v)*np.square(f - m))
+ exponent = -(0.5/v)*np.square(f - m)
+            #If exponent is under -30 then exp(exponent) will be very small, so don't exp it!
#If p is zero then conditional_mean will overflow
+            assert np.all(v > 0)
+ p = safe_exp(exponent)
+
if p < 1e-10:
return 0.
else:
return self.conditional_mean(f)*p
- scaled_mean = [quad(int_mean, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
+ scaled_mean = [quad(int_mean, fmin, fmax,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
mean = np.array(scaled_mean)[:,None] / np.sqrt(2*np.pi*(variance))
return mean
def _conditional_mean(self, f):
"""Quadrature calculation of the conditional mean: E(Y_star|f)"""
- raise NotImplementedError, "implement this function to make predictions"
+ raise NotImplementedError("implement this function to make predictions")
def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
"""
Approximation to the predictive variance: V(Y_star)
The following variance decomposition is used:
- V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
+        V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
:param mu: mean of posterior
:param sigma: standard deviation of posterior
@@ -220,15 +275,22 @@ class Likelihood(Parameterized):
#sigma2 = sigma**2
normalizer = np.sqrt(2*np.pi*variance)
+ fmin_v = -np.inf
+        fmin_m = -np.inf
+ fmin = -np.inf
+ fmax = np.inf
+
+ from ..util.misc import safe_exp
# E( V(Y_star|f_star) )
def int_var(f,m,v):
- p = np.exp(-(0.5/v)*np.square(f - m))
+ exponent = -(0.5/v)*np.square(f - m)
+ p = safe_exp(exponent)
#If p is zero then conditional_variance will overflow
if p < 1e-10:
return 0.
else:
return self.conditional_variance(f)*p
- scaled_exp_variance = [quad(int_var, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
+ scaled_exp_variance = [quad(int_var, fmin_v, fmax,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
exp_var = np.array(scaled_exp_variance)[:,None] / normalizer
#V( E(Y_star|f_star) ) = E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2
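The quantity assembled here is the law of total variance, V(Y*) = E[V(Y*|f*)] + V(E[Y*|f*]). A Monte Carlo check on a toy Poisson likelihood with log-Gaussian rate (illustrative numbers only):

import numpy as np

rng = np.random.default_rng(0)
f = rng.normal(0.0, 0.5, size=200000)
lam = np.exp(f)                 # Poisson: E[Y|f] = V[Y|f] = lam
y = rng.poisson(lam)
lhs = y.var()
rhs = lam.mean() + lam.var()    # E[V(Y|f)] + V(E[Y|f])
assert np.allclose(lhs, rhs, rtol=5e-2)
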
@@ -240,14 +302,15 @@ class Likelihood(Parameterized):
#E( E(Y_star|f_star)**2 )
def int_pred_mean_sq(f,m,v,predictive_mean_sq):
- p = np.exp(-(0.5/v)*np.square(f - m))
+ exponent = -(0.5/v)*np.square(f - m)
+            p = safe_exp(exponent)
#If p is zero then conditional_mean**2 will overflow
if p < 1e-10:
return 0.
else:
return self.conditional_mean(f)**2*p
- scaled_exp_exp2 = [quad(int_pred_mean_sq, -np.inf, np.inf,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
+ scaled_exp_exp2 = [quad(int_pred_mean_sq, fmin_m, fmax,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
exp_exp2 = np.array(scaled_exp_exp2)[:,None] / normalizer
var_exp = exp_exp2 - predictive_mean_sq
@@ -295,8 +358,18 @@ class Likelihood(Parameterized):
:returns: likelihood evaluated for this point
:rtype: float
"""
- inv_link_f = self.gp_link.transf(f)
- return self.pdf_link(inv_link_f, y, Y_metadata=Y_metadata)
+ if isinstance(self.gp_link, link_functions.Identity):
+ return self.pdf_link(f, y, Y_metadata=Y_metadata)
+ else:
+ inv_link_f = self.gp_link.transf(f)
+ return self.pdf_link(inv_link_f, y, Y_metadata=Y_metadata)
+
+ def logpdf_sum(self, f, y, Y_metadata=None):
+ """
+        Convenience function that can be overridden for likelihoods where this
+        can be computed more efficiently
+ """
+ return np.sum(self.logpdf(f, y, Y_metadata=Y_metadata))
def logpdf(self, f, y, Y_metadata=None):
"""
@@ -313,8 +386,11 @@ class Likelihood(Parameterized):
:returns: log likelihood evaluated for this point
:rtype: float
"""
- inv_link_f = self.gp_link.transf(f)
- return self.logpdf_link(inv_link_f, y, Y_metadata=Y_metadata)
+ if isinstance(self.gp_link, link_functions.Identity):
+ return self.logpdf_link(f, y, Y_metadata=Y_metadata)
+ else:
+ inv_link_f = self.gp_link.transf(f)
+ return self.logpdf_link(inv_link_f, y, Y_metadata=Y_metadata)
def dlogpdf_df(self, f, y, Y_metadata=None):
"""
@@ -332,11 +408,15 @@ class Likelihood(Parameterized):
:returns: derivative of log likelihood evaluated for this point
:rtype: 1xN array
"""
- inv_link_f = self.gp_link.transf(f)
- dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
- dlink_df = self.gp_link.dtransf_df(f)
- return chain_1(dlogpdf_dlink, dlink_df)
+ if isinstance(self.gp_link, link_functions.Identity):
+ return self.dlogpdf_dlink(f, y, Y_metadata=Y_metadata)
+ else:
+ inv_link_f = self.gp_link.transf(f)
+ dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
+ dlink_df = self.gp_link.dtransf_df(f)
+ return chain_1(dlogpdf_dlink, dlink_df)
+ @blockify_hessian
def d2logpdf_df2(self, f, y, Y_metadata=None):
"""
Evaluates the link function link(f) then computes the second derivative of log likelihood using it
@@ -353,13 +433,18 @@ class Likelihood(Parameterized):
:returns: second derivative of log likelihood evaluated for this point (diagonal only)
:rtype: 1xN array
"""
- inv_link_f = self.gp_link.transf(f)
- d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
- dlink_df = self.gp_link.dtransf_df(f)
- dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
- d2link_df2 = self.gp_link.d2transf_df2(f)
- return chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)
+ if isinstance(self.gp_link, link_functions.Identity):
+ d2logpdf_df2 = self.d2logpdf_dlink2(f, y, Y_metadata=Y_metadata)
+ else:
+ inv_link_f = self.gp_link.transf(f)
+ d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
+ dlink_df = self.gp_link.dtransf_df(f)
+ dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
+ d2link_df2 = self.gp_link.d2transf_df2(f)
+ d2logpdf_df2 = chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)
+ return d2logpdf_df2
+ @blockify_third
def d3logpdf_df3(self, f, y, Y_metadata=None):
"""
Evaluates the link function link(f) then computes the third derivative of log likelihood using it
@@ -376,53 +461,85 @@ class Likelihood(Parameterized):
:returns: third derivative of log likelihood evaluated for this point
:rtype: float
"""
- inv_link_f = self.gp_link.transf(f)
- d3logpdf_dlink3 = self.d3logpdf_dlink3(inv_link_f, y, Y_metadata=Y_metadata)
- dlink_df = self.gp_link.dtransf_df(f)
- d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
- d2link_df2 = self.gp_link.d2transf_df2(f)
- dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
- d3link_df3 = self.gp_link.d3transf_df3(f)
- return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
+ if isinstance(self.gp_link, link_functions.Identity):
+ d3logpdf_df3 = self.d3logpdf_dlink3(f, y, Y_metadata=Y_metadata)
+ else:
+ inv_link_f = self.gp_link.transf(f)
+ d3logpdf_dlink3 = self.d3logpdf_dlink3(inv_link_f, y, Y_metadata=Y_metadata)
+ dlink_df = self.gp_link.dtransf_df(f)
+ d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
+ d2link_df2 = self.gp_link.d2transf_df2(f)
+ dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
+ d3link_df3 = self.gp_link.d3transf_df3(f)
+ d3logpdf_df3 = chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
+ return d3logpdf_df3
+
def dlogpdf_dtheta(self, f, y, Y_metadata=None):
"""
TODO: Doc strings
"""
if self.size > 0:
- inv_link_f = self.gp_link.transf(f)
- return self.dlogpdf_link_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+ if self.not_block_really:
+ raise NotImplementedError("Need to make a decorator for this!")
+ if isinstance(self.gp_link, link_functions.Identity):
+ return self.dlogpdf_link_dtheta(f, y, Y_metadata=Y_metadata)
+ else:
+ inv_link_f = self.gp_link.transf(f)
+ return self.dlogpdf_link_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
else:
# There are no parameters so return an empty array for derivatives
- return np.zeros([1, 0])
+ return np.zeros((0, f.shape[0], f.shape[1]))
def dlogpdf_df_dtheta(self, f, y, Y_metadata=None):
"""
TODO: Doc strings
"""
if self.size > 0:
- inv_link_f = self.gp_link.transf(f)
- dlink_df = self.gp_link.dtransf_df(f)
- dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
- return chain_1(dlogpdf_dlink_dtheta, dlink_df)
+ if self.not_block_really:
+ raise NotImplementedError("Need to make a decorator for this!")
+ if isinstance(self.gp_link, link_functions.Identity):
+ return self.dlogpdf_dlink_dtheta(f, y, Y_metadata=Y_metadata)
+ else:
+ inv_link_f = self.gp_link.transf(f)
+ dlink_df = self.gp_link.dtransf_df(f)
+ dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+
+ dlogpdf_df_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+            #Chain each parameter of the likelihood separately
+ for p in range(self.size):
+ dlogpdf_df_dtheta[p, :, :] = chain_1(dlogpdf_dlink_dtheta[p,:,:], dlink_df)
+ return dlogpdf_df_dtheta
+ #return chain_1(dlogpdf_dlink_dtheta, dlink_df)
else:
# There are no parameters so return an empty array for derivatives
- return np.zeros([f.shape[0], 0])
+ return np.zeros((0, f.shape[0], f.shape[1]))
def d2logpdf_df2_dtheta(self, f, y, Y_metadata=None):
"""
TODO: Doc strings
"""
if self.size > 0:
- inv_link_f = self.gp_link.transf(f)
- dlink_df = self.gp_link.dtransf_df(f)
- d2link_df2 = self.gp_link.d2transf_df2(f)
- d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
- dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
- return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
+ if self.not_block_really:
+ raise NotImplementedError("Need to make a decorator for this!")
+ if isinstance(self.gp_link, link_functions.Identity):
+ return self.d2logpdf_dlink2_dtheta(f, y, Y_metadata=Y_metadata)
+ else:
+ inv_link_f = self.gp_link.transf(f)
+ dlink_df = self.gp_link.dtransf_df(f)
+ d2link_df2 = self.gp_link.d2transf_df2(f)
+ d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+ dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+
+ d2logpdf_df2_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+            #Chain each parameter of the likelihood separately
+ for p in range(self.size):
+ d2logpdf_df2_dtheta[p, :, :] = chain_2(d2logpdf_dlink2_dtheta[p,:,:], dlink_df, dlogpdf_dlink_dtheta[p,:,:], d2link_df2)
+ return d2logpdf_df2_dtheta
+ #return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
else:
# There are no parameters so return an empty array for derivatives
- return np.zeros([f.shape[0], 0])
+ return np.zeros((0, f.shape[0], f.shape[1]))
def _laplace_gradients(self, f, y, Y_metadata=None):
dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, Y_metadata=Y_metadata)
@@ -431,9 +548,9 @@ class Likelihood(Parameterized):
#Parameters are stacked vertically. Must be listed in same order as 'get_param_names'
# ensure we have gradients for every parameter we want to optimize
- assert len(dlogpdf_dtheta) == self.size #1 x num_param array
- assert dlogpdf_df_dtheta.shape[1] == self.size #f x num_param matrix
- assert d2logpdf_df2_dtheta.shape[1] == self.size #f x num_param matrix
+        assert dlogpdf_dtheta.shape[0] == self.size      #num_param x f x d
+        assert dlogpdf_df_dtheta.shape[0] == self.size   #num_param x f x d
+        assert d2logpdf_df2_dtheta.shape[0] == self.size #num_param x f x d (num_param x f x f x d for a full hessian)
return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
@@ -454,19 +571,98 @@ class Likelihood(Parameterized):
def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
#compute the quantiles by sampling!!!
- N_samp = 1000
+ N_samp = 500
s = np.random.randn(mu.shape[0], N_samp)*np.sqrt(var) + mu
#ss_f = s.flatten()
#ss_y = self.samples(ss_f, Y_metadata)
+ #ss_y = self.samples(s, Y_metadata, samples=100)
ss_y = self.samples(s, Y_metadata)
#ss_y = ss_y.reshape(mu.shape[0], N_samp)
return [np.percentile(ss_y ,q, axis=1)[:,None] for q in quantiles]
- def samples(self, gp, Y_metadata=None):
+ def samples(self, gp, Y_metadata=None, samples=1):
"""
Returns a set of samples of observations based on a given value of the latent variable.
:param gp: latent variable
+ :param samples: number of samples to take for each f location
"""
- raise NotImplementedError
+ raise NotImplementedError("""May be possible to use MCMC with user-tuning, see
+ MCMC_pdf_samples in likelihood.py and write samples function
+ using this, beware this is a simple implementation
+ of Metropolis and will not work well for all likelihoods""")
+
+ def MCMC_pdf_samples(self, fNew, num_samples=1000, starting_loc=None, stepsize=0.1, burn_in=1000, Y_metadata=None):
+ """
+ Simple implementation of Metropolis sampling algorithm
+
+        Will run a parallel chain for each input dimension (treats each f independently),
+        thus assumes f*_1 is independent of f*_2 etc.
+
+ :param num_samples: Number of samples to take
+ :param fNew: f at which to sample around
+        :param starting_loc: Starting locations of the independent chains (usually will be conditional_mean of likelihood), often link_f
+        :param stepsize: Stepsize for the normal proposal distribution (will need modifying)
+        :param burn_in: number of samples to use for burn-in (will need modifying)
+ :param Y_metadata: Y_metadata for pdf
+ """
+ print("Warning, using MCMC for sampling y*, needs to be tuned!")
+ if starting_loc is None:
+ starting_loc = fNew
+ from functools import partial
+ logpdf = partial(self.logpdf, f=fNew, Y_metadata=Y_metadata)
+ pdf = lambda y_star: np.exp(logpdf(y=y_star[:, None]))
+        #The link function of f is usually a good starting point
+ #(i.e. the point before you corrupt it with the likelihood)
+ par_chains = starting_loc.shape[0]
+ chain_values = np.zeros((par_chains, num_samples))
+ chain_values[:, 0][:,None] = starting_loc
+ #Use same stepsize for all par_chains
+ stepsize = np.ones(par_chains)*stepsize
+ accepted = np.zeros((par_chains, num_samples+burn_in))
+ accept_ratio = np.zeros(num_samples+burn_in)
+        #Whilst burning in, we only need to keep the previous sample
+ burnin_cache = np.zeros(par_chains)
+ burnin_cache[:] = starting_loc.flatten()
+ burning_in = True
+        for i in range(burn_in+num_samples):
+ next_ind = i-burn_in
+ if burning_in:
+ old_y = burnin_cache
+ else:
+ old_y = chain_values[:,next_ind-1]
+
+ old_lik = pdf(old_y)
+ #Propose new y from Gaussian proposal
+ new_y = np.random.normal(loc=old_y, scale=stepsize)
+ new_lik = pdf(new_y)
+ #Accept using Metropolis (not hastings) acceptance
+ #Always accepts if new_lik > old_lik
+ accept_probability = np.minimum(1, new_lik/old_lik)
+ u = np.random.uniform(0,1,par_chains)
+ #print "Accept prob: ", accept_probability
+ accepts = u < accept_probability
+ if burning_in:
+ burnin_cache[accepts] = new_y[accepts]
+ burnin_cache[~accepts] = old_y[~accepts]
+ if i == burn_in:
+ burning_in = False
+ chain_values[:,0] = burnin_cache
+ else:
+ #If it was accepted then new_y becomes the latest sample
+ chain_values[accepts, next_ind] = new_y[accepts]
+ #Otherwise use old y as the sample
+ chain_values[~accepts, next_ind] = old_y[~accepts]
+
+ accepted[~accepts, i] = 0
+ accepted[accepts, i] = 1
+ accept_ratio[i] = np.sum(accepted[:,i])/float(par_chains)
+
+ #Show progress
+ if i % int((burn_in+num_samples)*0.1) == 0:
+ print("{}% of samples taken ({})".format((i/int((burn_in+num_samples)*0.1)*10), i))
+ print("Last run accept ratio: ", accept_ratio[i])
+
+ print("Average accept ratio: ", np.mean(accept_ratio))
+ return chain_values
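MCMC_pdf_samples above uses plain Metropolis with a symmetric Gaussian proposal, accepting with probability min(1, p_new/p_old). A minimal single-chain sketch of that rule on a standard-normal target (illustrative, not the GPy method):

import numpy as np

rng = np.random.default_rng(0)
logp = lambda x: -0.5*x**2              # unnormalised log target
x, chain = 0.0, []
for _ in range(20000):
    prop = x + rng.normal(scale=1.0)    # symmetric proposal
    if np.log(rng.uniform()) < logp(prop) - logp(x):
        x = prop                        # accept; otherwise keep the old x
    chain.append(x)
samples = np.array(chain[5000:])        # discard burn-in
print(samples.mean(), samples.std())    # should be roughly 0 and 1 for N(0,1)
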
diff --git a/GPy/likelihoods/link_functions.py b/GPy/likelihoods/link_functions.py
index a4ddc760..3d753395 100644
--- a/GPy/likelihoods/link_functions.py
+++ b/GPy/likelihoods/link_functions.py
@@ -1,13 +1,10 @@
-# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
+# Copyright (c) 2012-2015 The GPy authors (see AUTHORS.txt)
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
-from scipy import stats
+from ..util.univariate_Gaussian import std_norm_cdf, std_norm_pdf
import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf,inv_std_norm_cdf
-
-_exp_lim_val = np.finfo(np.float64).max
-_lim_val = np.log(_exp_lim_val)
+from ..util.misc import safe_exp, safe_square, safe_cube, safe_quad, safe_three_times
class GPTransformation(object):
"""
@@ -79,13 +76,10 @@ class Probit(GPTransformation):
return std_norm_pdf(f)
def d2transf_df2(self,f):
- #FIXME
return -f * std_norm_pdf(f)
def d3transf_df3(self,f):
- #FIXME
- f2 = f**2
- return -(1/(np.sqrt(2*np.pi)))*np.exp(-0.5*(f2))*(1-f2)
+ return (safe_square(f)-1.)*std_norm_pdf(f)
class Cloglog(GPTransformation):
@@ -98,22 +92,26 @@ class Cloglog(GPTransformation):
or
f = \log (-\log(1-p))
-
+
"""
def transf(self,f):
- return 1-np.exp(-np.exp(f))
+ ef = safe_exp(f)
+ return 1-np.exp(-ef)
def dtransf_df(self,f):
- return np.exp(f-np.exp(f))
+ ef = safe_exp(f)
+ return np.exp(f-ef)
def d2transf_df2(self,f):
- ef = np.exp(f)
+ ef = safe_exp(f)
return -np.exp(f-ef)*(ef-1.)
def d3transf_df3(self,f):
- ef = np.exp(f)
- return np.exp(f-ef)*(1.-3*ef + ef**2)
-
+ ef = safe_exp(f)
+ ef2 = safe_square(ef)
+ three_times_ef = safe_three_times(ef)
+ r_val = np.exp(f-ef)*(1.-three_times_ef + ef2)
+ return r_val
class Log(GPTransformation):
"""
@@ -123,16 +121,16 @@ class Log(GPTransformation):
"""
def transf(self,f):
- return np.exp(np.clip(f, -_lim_val, _lim_val))
+ return safe_exp(f)
def dtransf_df(self,f):
- return np.exp(np.clip(f, -_lim_val, _lim_val))
+ return safe_exp(f)
def d2transf_df2(self,f):
- return np.exp(np.clip(f, -_lim_val, _lim_val))
+ return safe_exp(f)
def d3transf_df3(self,f):
- return np.exp(np.clip(f, -_lim_val, _lim_val))
+ return safe_exp(f)
class Log_ex_1(GPTransformation):
"""
@@ -142,17 +140,20 @@ class Log_ex_1(GPTransformation):
"""
def transf(self,f):
- return np.log(1.+np.exp(f))
+ return np.log1p(safe_exp(f))
def dtransf_df(self,f):
- return np.exp(f)/(1.+np.exp(f))
+ ef = safe_exp(f)
+ return ef/(1.+ef)
def d2transf_df2(self,f):
- aux = np.exp(f)/(1.+np.exp(f))
+ ef = safe_exp(f)
+ aux = ef/(1.+ef)
return aux*(1.-aux)
def d3transf_df3(self,f):
- aux = np.exp(f)/(1.+np.exp(f))
+ ef = safe_exp(f)
+ aux = ef/(1.+ef)
daux_df = aux*(1.-aux)
return daux_df - (2.*aux*daux_df)
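
Log_ex_1 is the softplus link g(f) = log(1 + e^f); its first derivative is the logistic sigmoid and its second is aux*(1-aux), which a quick finite-difference check confirms:

    import numpy as np

    softplus = lambda f: np.log1p(np.exp(f))
    sigmoid = lambda f: np.exp(f) / (1. + np.exp(f))

    f, h = 0.3, 1e-6
    print((softplus(f + h) - softplus(f - h)) / (2 * h), sigmoid(f))   # dtransf_df
    aux = sigmoid(f)
    print((sigmoid(f + h) - sigmoid(f - h)) / (2 * h), aux * (1. - aux))  # d2transf_df2
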
@@ -160,21 +161,24 @@ class Reciprocal(GPTransformation):
def transf(self,f):
return 1./f
- def dtransf_df(self,f):
- return -1./(f**2)
+ def dtransf_df(self, f):
+ f2 = safe_square(f)
+ return -1./f2
- def d2transf_df2(self,f):
- return 2./(f**3)
+ def d2transf_df2(self, f):
+ f3 = safe_cube(f)
+ return 2./f3
def d3transf_df3(self,f):
- return -6./(f**4)
+ f4 = safe_quad(f)
+ return -6./f4
class Heaviside(GPTransformation):
"""
.. math::
- g(f) = I_{x \\in A}
+        g(f) = I_{f \\geq 0}
"""
def transf(self,f):
@@ -182,7 +186,7 @@ class Heaviside(GPTransformation):
return np.where(f>0, 1, 0)
def dtransf_df(self,f):
- raise NotImplementedError, "This function is not differentiable!"
+ raise NotImplementedError("This function is not differentiable!")
def d2transf_df2(self,f):
- raise NotImplementedError, "This function is not differentiable!"
+ raise NotImplementedError("This function is not differentiable!")
diff --git a/GPy/likelihoods/mixed_noise.py b/GPy/likelihoods/mixed_noise.py
index 8c56f45b..84b3001d 100644
--- a/GPy/likelihoods/mixed_noise.py
+++ b/GPy/likelihoods/mixed_noise.py
@@ -3,9 +3,9 @@
import numpy as np
from scipy import stats, special
-import link_functions
-from likelihood import Likelihood
-from gaussian import Gaussian
+from . import link_functions
+from .likelihood import Likelihood
+from .gaussian import Gaussian
from ..core.parameterization import Param
from ..core.parameterization.transformations import Logexp
from ..core.parameterization import Parameterized
diff --git a/GPy/likelihoods/poisson.py b/GPy/likelihoods/poisson.py
index ea9b2d10..5aa85a91 100644
--- a/GPy/likelihoods/poisson.py
+++ b/GPy/likelihoods/poisson.py
@@ -5,8 +5,8 @@ from __future__ import division
import numpy as np
from scipy import stats,special
import scipy as sp
-import link_functions
-from likelihood import Likelihood
+from . import link_functions
+from .likelihood import Likelihood
class Poisson(Likelihood):
"""
@@ -64,8 +64,7 @@ class Poisson(Likelihood):
:rtype: float
"""
- assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
- return np.sum(-link_f + y*np.log(link_f) - special.gammaln(y+1))
+ return -link_f + y*np.log(link_f) - special.gammaln(y+1)
def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
"""
@@ -83,7 +82,6 @@ class Poisson(Likelihood):
:rtype: Nx1 array
"""
- assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
return y/link_f - 1
def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
@@ -107,12 +105,7 @@ class Poisson(Likelihood):
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
"""
- assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
- hess = -y/(link_f**2)
- return hess
- #d2_df = self.gp_link.d2transf_df2(gp)
- #transf = self.gp_link.transf(gp)
- #return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
+ return -y/(link_f**2)
def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
"""
@@ -129,7 +122,6 @@ class Poisson(Likelihood):
:returns: third derivative of likelihood evaluated at points f
:rtype: Nx1 array
"""
- assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
d3lik_dlink3 = 2*y/(link_f)**3
return d3lik_dlink3
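
These derivatives are straightforward to verify; for the Poisson log-pdf term -g + y*log(g) - log(y!) we expect d/dg = y/g - 1 and d2/dg2 = -y/g^2 (standalone check):

    import numpy as np
    from scipy import special

    logpdf = lambda g, y: -g + y * np.log(g) - special.gammaln(y + 1)
    g, y, h = 2.0, 3.0, 1e-6
    print((logpdf(g + h, y) - logpdf(g - h, y)) / (2 * h), y / g - 1)  # first derivative
    d1 = lambda g: y / g - 1
    print((d1(g + h) - d1(g - h)) / (2 * h), -y / g**2)                # second derivative
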
diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py
index 855f6b40..b66d4c0f 100644
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@@ -4,10 +4,10 @@
import numpy as np
from scipy import stats, special
import scipy as sp
-import link_functions
+from . import link_functions
from scipy import stats, integrate
from scipy.special import gammaln, gamma
-from likelihood import Likelihood
+from .likelihood import Likelihood
from ..core.parameterization import Param
from ..core.parameterization.transformations import Logexp
@@ -35,8 +35,8 @@ class StudentT(Likelihood):
self.log_concave = False
- def parameters_changed(self):
- self.variance = (self.v / float(self.v - 2)) * self.sigma2
+ #def parameters_changed(self):
+ #self.variance = (self.v / float(self.v - 2)) * self.sigma2
def update_gradients(self, grads):
"""
@@ -86,7 +86,6 @@ class StudentT(Likelihood):
:rtype: float
"""
- assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
e = y - inv_link_f
#FIXME:
#Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?!
@@ -97,7 +96,7 @@ class StudentT(Likelihood):
- 0.5*np.log(self.sigma2 * self.v * np.pi)
- 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
)
- return np.sum(objective)
+ return objective
def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
"""
@@ -115,7 +114,6 @@ class StudentT(Likelihood):
:rtype: Nx1 array
"""
- assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
e = y - inv_link_f
grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))
return grad
@@ -141,7 +139,6 @@ class StudentT(Likelihood):
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
"""
- assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
e = y - inv_link_f
hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
return hess
@@ -161,7 +158,6 @@ class StudentT(Likelihood):
:returns: third derivative of likelihood evaluated at points f
:rtype: Nx1 array
"""
- assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
e = y - inv_link_f
d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
((e**2 + self.sigma2*self.v)**3)
@@ -183,10 +179,10 @@ class StudentT(Likelihood):
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
:rtype: float
"""
- assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
e = y - inv_link_f
- dlogpdf_dvar = self.v*(e**2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e**2))
- return np.sum(dlogpdf_dvar)
+ e2 = np.square(e)
+ dlogpdf_dvar = self.v*(e2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e2))
+ return dlogpdf_dvar
def dlogpdf_dlink_dvar(self, inv_link_f, y, Y_metadata=None):
"""
@@ -203,7 +199,6 @@ class StudentT(Likelihood):
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
:rtype: Nx1 array
"""
- assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
e = y - inv_link_f
dlogpdf_dlink_dvar = (self.v*(self.v+1)*(-e))/((self.sigma2*self.v + e**2)**2)
return dlogpdf_dlink_dvar
@@ -223,7 +218,6 @@ class StudentT(Likelihood):
:returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
:rtype: Nx1 array
"""
- assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
e = y - inv_link_f
d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
/ ((self.sigma2*self.v + (e**2))**3)
@@ -233,20 +227,21 @@ class StudentT(Likelihood):
def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
dlogpdf_dv = np.zeros_like(dlogpdf_dvar) #FIXME: Not done yet
- return np.hstack((dlogpdf_dvar, dlogpdf_dv))
+ return np.array((dlogpdf_dvar, dlogpdf_dv))
def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
dlogpdf_dlink_dv = np.zeros_like(dlogpdf_dlink_dvar) #FIXME: Not done yet
- return np.hstack((dlogpdf_dlink_dvar, dlogpdf_dlink_dv))
+ return np.array((dlogpdf_dlink_dvar, dlogpdf_dlink_dv))
def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
d2logpdf_dlink2_dv = np.zeros_like(d2logpdf_dlink2_dvar) #FIXME: Not done yet
- return np.hstack((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv))
+
+ return np.array((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv))
def predictive_mean(self, mu, sigma, Y_metadata=None):
- # The comment here confuses mean and median.
+ # The comment here confuses mean and median.
return self.gp_link.transf(mu) # only true if link is monotonic, which it is.
def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
diff --git a/GPy/mappings/__init__.py b/GPy/mappings/__init__.py
index d331c678..5193a232 100644
--- a/GPy/mappings/__init__.py
+++ b/GPy/mappings/__init__.py
@@ -1,7 +1,9 @@
# Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from kernel import Kernel
-from linear import Linear
-from mlp import MLP
-#from rbf import RBF
+from .kernel import Kernel
+from .linear import Linear
+from .mlp import MLP
+from .additive import Additive
+from .compound import Compound
+
diff --git a/GPy/mappings/additive.py b/GPy/mappings/additive.py
index 5297982b..1c86b680 100644
--- a/GPy/mappings/additive.py
+++ b/GPy/mappings/additive.py
@@ -2,8 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
-from ..core.mapping import Mapping
-import GPy
+from ..core import Mapping
class Additive(Mapping):
"""
@@ -17,45 +16,23 @@ class Additive(Mapping):
:type mapping1: GPy.mappings.Mapping
:param mapping2: second mapping to add together.
:type mapping2: GPy.mappings.Mapping
- :param tensor: whether or not to use the tensor product of input spaces
- :type tensor: bool
"""
- def __init__(self, mapping1, mapping2, tensor=False):
- if tensor:
- input_dim = mapping1.input_dim + mapping2.input_dim
- else:
- input_dim = mapping1.input_dim
- assert(mapping1.input_dim==mapping2.input_dim)
+ def __init__(self, mapping1, mapping2):
+ assert(mapping1.input_dim==mapping2.input_dim)
assert(mapping1.output_dim==mapping2.output_dim)
- output_dim = mapping1.output_dim
+ input_dim, output_dim = mapping1.input_dim, mapping1.output_dim
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
self.mapping1 = mapping1
self.mapping2 = mapping2
- self.num_params = self.mapping1.num_params + self.mapping2.num_params
- self.name = self.mapping1.name + '+' + self.mapping2.name
- def _get_param_names(self):
- return self.mapping1._get_param_names + self.mapping2._get_param_names
-
- def _get_params(self):
- return np.hstack((self.mapping1._get_params(), self.mapping2._get_params()))
-
- def _set_params(self, x):
- self.mapping1._set_params(x[:self.mapping1.num_params])
- self.mapping2._set_params(x[self.mapping1.num_params:])
-
- def randomize(self):
- self.mapping1._randomize()
- self.mapping2._randomize()
def f(self, X):
return self.mapping1.f(X) + self.mapping2.f(X)
- def df_dtheta(self, dL_df, X):
- self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T
- self._df_dbias = (dL_df.sum(0))
- return np.hstack((self._df_dA.flatten(), self._df_dbias))
+ def update_gradients(self, dL_dF, X):
+ self.mapping1.update_gradients(dL_dF, X)
+ self.mapping2.update_gradients(dL_dF, X)
- def df_dX(self, dL_df, X):
- return self.kern.dK_dX((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X)
+ def gradients_X(self, dL_dF, X):
+ return self.mapping1.gradients_X(dL_dF, X) + self.mapping2.gradients_X(dL_dF, X)
diff --git a/GPy/mappings/compound.py b/GPy/mappings/compound.py
new file mode 100644
index 00000000..5a1e8dd1
--- /dev/null
+++ b/GPy/mappings/compound.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2015, James Hensman and Alan Saul
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from ..core import Mapping
+
+class Compound(Mapping):
+ """
+ Mapping based on passing one mapping through another
+
+ .. math::
+
+ f(\mathbf{x}) = f_2(f_1(\mathbf{x}))
+
+ :param mapping1: first mapping
+ :type mapping1: GPy.mappings.Mapping
+ :param mapping2: second mapping
+ :type mapping2: GPy.mappings.Mapping
+
+ """
+
+ def __init__(self, mapping1, mapping2):
+ assert(mapping1.output_dim==mapping2.input_dim)
+ input_dim, output_dim = mapping1.input_dim, mapping2.output_dim
+ Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
+ self.mapping1 = mapping1
+ self.mapping2 = mapping2
+ self.link_parameters(self.mapping1, self.mapping2)
+
+ def f(self, X):
+ return self.mapping2.f(self.mapping1.f(X))
+
+ def update_gradients(self, dL_dF, X):
+ hidden = self.mapping1.f(X)
+ self.mapping2.update_gradients(dL_dF, hidden)
+ self.mapping1.update_gradients(self.mapping2.gradients_X(dL_dF, hidden), X)
+
+ def gradients_X(self, dL_dF, X):
+ hidden = self.mapping1.f(X)
+ return self.mapping1.gradients_X(self.mapping2.gradients_X(dL_dF, hidden), X)
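
A short usage sketch of the new Compound class, chaining the Linear and MLP mappings introduced elsewhere in this patch (names and signatures as they appear in these hunks):

    import numpy as np
    import GPy

    # f(x) = mlp(linear(x)): a 2->3 linear map fed into a 3->1 MLP map.
    lin = GPy.mappings.Linear(input_dim=2, output_dim=3)
    mlp = GPy.mappings.MLP(input_dim=3, output_dim=1, hidden_dim=5)
    comp = GPy.mappings.Compound(lin, mlp)

    X = np.random.randn(10, 2)
    print(comp.f(X).shape)  # (10, 1)
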
diff --git a/GPy/mappings/identity.py b/GPy/mappings/identity.py
new file mode 100644
index 00000000..b15e476c
--- /dev/null
+++ b/GPy/mappings/identity.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2015, James Hensman
+
+from ..core.mapping import Mapping
+from ..core import Param
+
+class Identity(Mapping):
+ """
+ A mapping that does nothing!
+ """
+ def __init__(self, input_dim, output_dim, name='identity'):
+ Mapping.__init__(self, input_dim, output_dim, name)
+
+ def f(self, X):
+ return X
+
+ def update_gradients(self, dL_dF, X):
+ pass
+
+ def gradients_X(self, dL_dF, X):
+ return dL_dF
+
diff --git a/GPy/mappings/kernel.py b/GPy/mappings/kernel.py
index 74fa344f..ea1720db 100644
--- a/GPy/mappings/kernel.py
+++ b/GPy/mappings/kernel.py
@@ -1,9 +1,10 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2015, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core.mapping import Mapping
-import GPy
+from ..core import Param
class Kernel(Mapping):
"""
@@ -11,50 +12,41 @@ class Kernel(Mapping):
.. math::
- f(\mathbf{x}*) = \mathbf{A}\mathbf{k}(\mathbf{X}, \mathbf{x}^*) + \mathbf{b}
+ f(\mathbf{x}) = \sum_i \alpha_i k(\mathbf{z}_i, \mathbf{x})
- :param X: input observations containing :math:`\mathbf{X}`
- :type X: ndarray
+    or for multiple outputs
+
+ .. math::
+
+        f_i(\mathbf{x}) = \sum_j \alpha_{j,i} k(\mathbf{z}_j, \mathbf{x})
+
+
+ :param input_dim: dimension of input.
+ :type input_dim: int
:param output_dim: dimension of output.
:type output_dim: int
+ :param Z: input observations containing :math:`\mathbf{Z}`
+ :type Z: ndarray
:param kernel: a GPy kernel, defaults to GPy.kern.RBF
:type kernel: GPy.kern.kern
"""
- def __init__(self, X, output_dim=1, kernel=None):
- Mapping.__init__(self, input_dim=X.shape[1], output_dim=output_dim)
- if kernel is None:
- kernel = GPy.kern.RBF(self.input_dim)
+ def __init__(self, input_dim, output_dim, Z, kernel, name='kernmap'):
+ Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
self.kern = kernel
- self.X = X
- self.num_data = X.shape[0]
- self.num_params = self.output_dim*(self.num_data + 1)
- self.A = np.array((self.num_data, self.output_dim))
- self.bias = np.array(self.output_dim)
- self.randomize()
- self.name = 'kernel'
- def _get_param_names(self):
- return sum([['A_%i_%i' % (n, d) for d in range(self.output_dim)] for n in range(self.num_data)], []) + ['bias_%i' % d for d in range(self.output_dim)]
-
- def _get_params(self):
- return np.hstack((self.A.flatten(), self.bias))
-
- def _set_params(self, x):
- self.A = x[:self.num_data * self.output_dim].reshape(self.num_data, self.output_dim).copy()
- self.bias = x[self.num_data*self.output_dim:].copy()
-
- def randomize(self):
- self.A = np.random.randn(self.num_data, self.output_dim)/np.sqrt(self.num_data+1)
- self.bias = np.random.randn(self.output_dim)/np.sqrt(self.num_data+1)
+ self.Z = Z
+ self.num_bases, Zdim = Z.shape
+ assert Zdim == self.input_dim
+ self.A = Param('A', np.random.randn(self.num_bases, self.output_dim))
+ self.link_parameter(self.A)
def f(self, X):
- return np.dot(self.kern.K(X, self.X),self.A) + self.bias
+ return np.dot(self.kern.K(X, self.Z), self.A)
- def df_dtheta(self, dL_df, X):
- self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T
- self._df_dbias = (dL_df.sum(0))
- return np.hstack((self._df_dA.flatten(), self._df_dbias))
+ def update_gradients(self, dL_dF, X):
+ self.kern.update_gradients_full(np.dot(dL_dF, self.A.T), X, self.Z)
+ self.A.gradient = np.dot( self.kern.K(self.Z, X), dL_dF)
- def df_dX(self, dL_df, X):
- return self.kern.gradients_X((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X)
+ def gradients_X(self, dL_dF, X):
+ return self.kern.gradients_X(np.dot(dL_dF, self.A.T), X, self.Z)
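
A usage sketch for the rewritten Kernel mapping, f(x) = K(x, Z) A, using the constructor signature shown above:

    import numpy as np
    import GPy

    Z = np.random.randn(6, 2)                 # basis locations z_j
    kmap = GPy.mappings.Kernel(input_dim=2, output_dim=1, Z=Z,
                               kernel=GPy.kern.RBF(2))

    X = np.random.randn(20, 2)
    print(kmap.f(X).shape)                    # (20, 1): K(X, Z) dot A
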
diff --git a/GPy/mappings/linear.py b/GPy/mappings/linear.py
index 315dfc0e..ee464694 100644
--- a/GPy/mappings/linear.py
+++ b/GPy/mappings/linear.py
@@ -1,43 +1,39 @@
# Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt).
+# Copyright (c) 2015, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
-from ..core.mapping import Bijective_mapping
+from ..core.mapping import Mapping
from ..core.parameterization import Param
-class Linear(Bijective_mapping):
+class Linear(Mapping):
"""
- Mapping based on a linear model.
+ A Linear mapping.
.. math::
- f(\mathbf{x}*) = \mathbf{W}\mathbf{x}^* + \mathbf{b}
+        F(\mathbf{x}) = \mathbf{A} \mathbf{x}
- :param X: input observations
- :type X: ndarray
+
+ :param input_dim: dimension of input.
+ :type input_dim: int
:param output_dim: dimension of output.
:type output_dim: int
+    :param name: name of the mapping
+    :type name: str
"""
- def __init__(self, input_dim=1, output_dim=1, name='linear'):
- Bijective_mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
- self.W = Param('W',np.array((self.input_dim, self.output_dim)))
- self.bias = Param('bias',np.array(self.output_dim))
- self.link_parameters(self.W, self.bias)
+ def __init__(self, input_dim, output_dim, name='linmap'):
+ Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
+ self.A = Param('A', np.random.randn(self.input_dim, self.output_dim))
+ self.link_parameter(self.A)
def f(self, X):
- return np.dot(X,self.W) + self.bias
+ return np.dot(X, self.A)
- def g(self, f):
- V = np.linalg.solve(np.dot(self.W.T, self.W), W.T)
- return np.dot(f-self.bias, V)
+ def update_gradients(self, dL_dF, X):
+ self.A.gradient = np.dot( X.T, dL_dF)
- def df_dtheta(self, dL_df, X):
- df_dW = (dL_df[:, :, None]*X[:, None, :]).sum(0).T
- df_dbias = (dL_df.sum(0))
- return np.hstack((df_dW.flatten(), df_dbias))
-
- def dL_dX(self, partial, X):
- """The gradient of L with respect to the inputs to the mapping, where L is a function that is dependent on the output of the mapping, f."""
- return (partial[:, None, :]*self.W[None, :, :]).sum(2)
+ def gradients_X(self, dL_dF, X):
+ return np.dot(dL_dF, self.A.T)
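
Because Linear.f is just X A, its gradients have a simple closed form, which the new update_gradients/gradients_X implement; a quick illustrative check using the API from this hunk:

    import numpy as np
    import GPy

    m = GPy.mappings.Linear(input_dim=3, output_dim=2)
    X = np.random.randn(5, 3)
    dL_dF = np.ones((5, 2))                            # as if the loss were sum(F)

    m.update_gradients(dL_dF, X)
    print(np.allclose(m.A.gradient, X.T.dot(dL_dF)))   # dL/dA = X^T dL/dF
    print(m.gradients_X(dL_dF, X).shape)               # (5, 3): dL/dX = dL/dF A^T
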
diff --git a/GPy/mappings/mlp.py b/GPy/mappings/mlp.py
index 46dbc2a9..4afc2fa1 100644
--- a/GPy/mappings/mlp.py
+++ b/GPy/mappings/mlp.py
@@ -3,128 +3,53 @@
import numpy as np
from ..core.mapping import Mapping
+from ..core import Param
class MLP(Mapping):
"""
- Mapping based on a multi-layer perceptron neural network model.
-
- .. math::
-
- f(\\mathbf{x}*) = \\mathbf{W}^0\\boldsymbol{\\phi}(\\mathbf{W}^1\\mathbf{x}+\\mathbf{b}^1)^* + \\mathbf{b}^0
-
- where
-
- .. math::
-
- \\phi(\\cdot) = \\text{tanh}(\\cdot)
-
- :param X: input observations
- :type X: ndarray
- :param output_dim: dimension of output.
- :type output_dim: int
- :param hidden_dim: dimension of hidden layer. If it is an int, there is one hidden layer of the given dimension. If it is a list of ints there are as manny hidden layers as the length of the list, each with the given number of hidden nodes in it.
- :type hidden_dim: int or list of ints.
-
+ Mapping based on a multi-layer perceptron neural network model, with a single hidden layer
"""
- def __init__(self, input_dim=1, output_dim=1, hidden_dim=3):
- Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
- self.name = 'mlp'
- if isinstance(hidden_dim, int):
- hidden_dim = [hidden_dim]
+ def __init__(self, input_dim=1, output_dim=1, hidden_dim=3, name='mlpmap'):
+ super(MLP, self).__init__(input_dim=input_dim, output_dim=output_dim, name=name)
self.hidden_dim = hidden_dim
- self.activation = [None]*len(self.hidden_dim)
- self.W = []
- self._dL_dW = []
- self.bias = []
- self._dL_dbias = []
- self.W.append(np.zeros((self.input_dim, self.hidden_dim[0])))
- self._dL_dW.append(np.zeros((self.input_dim, self.hidden_dim[0])))
- self.bias.append(np.zeros(self.hidden_dim[0]))
- self._dL_dbias.append(np.zeros(self.hidden_dim[0]))
- self.num_params = self.hidden_dim[0]*(self.input_dim+1)
- for h1, h0 in zip(hidden_dim[1:], hidden_dim[0:-1]):
- self.W.append(np.zeros((h0, h1)))
- self._dL_dW.append(np.zeros((h0, h1)))
- self.bias.append(np.zeros(h1))
- self._dL_dbias.append(np.zeros(h1))
- self.num_params += h1*(h0+1)
- self.W.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
- self._dL_dW.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
- self.bias.append(np.zeros(self.output_dim))
- self._dL_dbias.append(np.zeros(self.output_dim))
- self.num_params += self.output_dim*(self.hidden_dim[-1]+1)
- self.randomize()
+ self.W1 = Param('W1', np.random.randn(self.input_dim, self.hidden_dim))
+ self.b1 = Param('b1', np.random.randn(self.hidden_dim))
+ self.W2 = Param('W2', np.random.randn(self.hidden_dim, self.output_dim))
+ self.b2 = Param('b2', np.random.randn(self.output_dim))
+ self.link_parameters(self.W1, self.b1, self.W2, self.b2)
- def _get_param_names(self):
- return sum([['W%i_%i_%i' % (i, n, d) for n in range(self.W[i].shape[0]) for d in range(self.W[i].shape[1])] + ['bias%i_%i' % (i, d) for d in range(self.W[i].shape[1])] for i in range(len(self.W))], [])
-
- def _get_params(self):
- param = np.array([])
- for W, bias in zip(self.W, self.bias):
- param = np.hstack((param, W.flatten(), bias))
- return param
-
- def _set_params(self, x):
- start = 0
- for W, bias in zip(self.W, self.bias):
- end = W.shape[0]*W.shape[1]+start
- W[:] = x[start:end].reshape(W.shape[0], W.shape[1]).copy()
- start = end
- end = W.shape[1]+end
- bias[:] = x[start:end].copy()
- start = end
-
- def randomize(self):
- for W, bias in zip(self.W, self.bias):
- W[:] = np.random.randn(W.shape[0], W.shape[1])/np.sqrt(W.shape[0]+1)
- bias[:] = np.random.randn(W.shape[1])/np.sqrt(W.shape[0]+1)
def f(self, X):
- self._f_computations(X)
- return np.dot(np.tanh(self.activation[-1]), self.W[-1]) + self.bias[-1]
+ layer1 = np.dot(X, self.W1) + self.b1
+ activations = np.tanh(layer1)
+ return np.dot(activations, self.W2) + self.b2
- def _f_computations(self, X):
- W = self.W[0]
- bias = self.bias[0]
- self.activation[0] = np.dot(X,W) + bias
- for W, bias, index in zip(self.W[1:-1], self.bias[1:-1], range(1, len(self.activation))):
- self.activation[index] = np.dot(np.tanh(self.activation[index-1]), W)+bias
+ def update_gradients(self, dL_dF, X):
+ layer1 = np.dot(X,self.W1) + self.b1
+ activations = np.tanh(layer1)
+
+ #Evaluate second-layer gradients.
+ self.W2.gradient = np.dot(activations.T, dL_dF)
+ self.b2.gradient = np.sum(dL_dF, 0)
+
+ # Backpropagation to hidden layer.
+ dL_dact = np.dot(dL_dF, self.W2.T)
+ dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
+
+ # Finally, evaluate the first-layer gradients.
+ self.W1.gradient = np.dot(X.T,dL_dlayer1)
+ self.b1.gradient = np.sum(dL_dlayer1, 0)
+
+ def gradients_X(self, dL_dF, X):
+ layer1 = np.dot(X,self.W1) + self.b1
+ activations = np.tanh(layer1)
+
+ # Backpropagation to hidden layer.
+ dL_dact = np.dot(dL_dF, self.W2.T)
+ dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
+
+ return np.dot(dL_dlayer1, self.W1.T)
- def df_dtheta(self, dL_df, X):
- self._df_computations(dL_df, X)
- g = np.array([])
- for gW, gbias in zip(self._dL_dW, self._dL_dbias):
- g = np.hstack((g, gW.flatten(), gbias))
- return g
- def _df_computations(self, dL_df, X):
- self._f_computations(X)
- a0 = self.activation[-1]
- W = self.W[-1]
- self._dL_dW[-1] = (dL_df[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
- dL_dta=(dL_df[:, None, :]*W[None, :, :]).sum(2)
- self._dL_dbias[-1] = (dL_df.sum(0))
- for dL_dW, dL_dbias, W, bias, a0, a1 in zip(self._dL_dW[-2:0:-1],
- self._dL_dbias[-2:0:-1],
- self.W[-2:0:-1],
- self.bias[-2:0:-1],
- self.activation[-2::-1],
- self.activation[-1:0:-1]):
- ta = np.tanh(a1)
- dL_da = dL_dta*(1-ta*ta)
- dL_dW[:] = (dL_da[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
- dL_dbias[:] = (dL_da.sum(0))
- dL_dta = (dL_da[:, None, :]*W[None, :, :]).sum(2)
- ta = np.tanh(self.activation[0])
- dL_da = dL_dta*(1-ta*ta)
- W = self.W[0]
- self._dL_dW[0] = (dL_da[:, :, None]*X[:, None, :]).sum(0).T
- self._dL_dbias[0] = (dL_da.sum(0))
- self._dL_dX = (dL_da[:, None, :]*W[None, :, :]).sum(2)
-
- def df_dX(self, dL_df, X):
- self._df_computations(dL_df, X)
- return self._dL_dX
-
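
The backpropagation above divides by cosh^2(layer1), using d tanh(a)/da = 1/cosh^2(a), which equals the perhaps more familiar 1 - tanh^2(a):

    import numpy as np

    a = np.linspace(-3, 3, 7)
    print(np.allclose(1. / np.cosh(a)**2, 1. - np.tanh(a)**2))  # True
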
diff --git a/GPy/mappings/piecewise_linear.py b/GPy/mappings/piecewise_linear.py
new file mode 100644
index 00000000..8bdee81e
--- /dev/null
+++ b/GPy/mappings/piecewise_linear.py
@@ -0,0 +1,94 @@
+from GPy.core.mapping import Mapping
+from GPy.core import Param
+import numpy as np
+
+class PiecewiseLinear(Mapping):
+ """
+ A piecewise-linear mapping.
+
+ The parameters of this mapping are the positions and values of the function where it is broken (self.breaks, self.values).
+
+ Outside the range of the breaks, the function is assumed to have gradient 1
+ """
+ def __init__(self, input_dim, output_dim, values, breaks, name='piecewise_linear'):
+
+ assert input_dim==1
+ assert output_dim==1
+
+ Mapping.__init__(self, input_dim, output_dim, name)
+
+ values, breaks = np.array(values).flatten(), np.array(breaks).flatten()
+ assert values.size == breaks.size
+ self.values = Param('values', values)
+ self.breaks = Param('breaks', breaks)
+ self.link_parameter(self.values)
+ self.link_parameter(self.breaks)
+
+ def parameters_changed(self):
+ self.order = np.argsort(self.breaks)*1
+ self.reverse_order = np.zeros_like(self.order)
+ self.reverse_order[self.order] = np.arange(self.order.size)
+
+ self.sorted_breaks = self.breaks[self.order]
+ self.sorted_values = self.values[self.order]
+
+ self.grads = np.diff(self.sorted_values)/np.diff(self.sorted_breaks)
+
+ def f(self, X):
+ x = X.flatten()
+ y = x.copy()
+
+        #first adjust the points below the first break
+        y[x<self.sorted_breaks[0]] = x[x<self.sorted_breaks[0]] + self.sorted_values[0] - self.sorted_breaks[0]
+        #and the points above the last break
+        y[x>self.sorted_breaks[-1]] = x[x>self.sorted_breaks[-1]] + self.sorted_values[-1] - self.sorted_breaks[-1]
+
+        #loop through the pairs of breakpoints
+        for low, up, g, v in zip(self.sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1]):
+            i = np.logical_and(x>low, x<=up)
+            y[i] = v + g*(x[i]-low)
+        return y.reshape(X.shape)
+
+    def update_gradients(self, dL_dF, X):
+        x, dL_dy = X.flatten(), dL_dF.flatten()
+        dL_db, dL_dv = np.zeros(self.breaks.size), np.zeros(self.values.size)
+        #below the first break y = x - b_0 + v_0; above the last, y = x - b_K + v_K
+        below, above = x<self.sorted_breaks[0], x>self.sorted_breaks[-1]
+        dL_dv[0] += np.sum(dL_dy[below])
+        dL_db[0] -= np.sum(dL_dy[below])
+        dL_dv[-1] += np.sum(dL_dy[above])
+        dL_db[-1] -= np.sum(dL_dy[above])
+        #in segment n, y = (1-t)*v_n + t*v_{n+1} with t = (x-b_n)/(b_{n+1}-b_n)
+        for n, (low, up, g) in enumerate(zip(self.sorted_breaks[:-1], self.sorted_breaks[1:], self.grads)):
+            i = np.logical_and(x>low, x<=up)
+            t = (x[i]-low)/(up-low)
+            dL_dv[n] += np.sum(dL_dy[i]*(1.-t))
+            dL_dv[n+1] += np.sum(dL_dy[i]*t)
+            dL_db[n] += np.sum(dL_dy[i]*g*(t-1.))
+            dL_db[n+1] -= np.sum(dL_dy[i]*g*t)
+
+        #now put the gradients back in the correct order!
+        self.breaks.gradient = dL_db[self.reverse_order]
+        self.values.gradient = dL_dv[self.reverse_order]
+
+    def gradients_X(self, dL_dF, X):
+        x = X.flatten()
+        #outside the range of the breakpoints, the function is just offset by a constant, so the partial derivative is 1.
+        dL_dX = dL_dF.copy().flatten()
+        #inside the breakpoints, the partial derivative is self.grads
+        for low, up, g, v in zip(self.sorted_breaks[:-1], self.sorted_breaks[1:], self.grads, self.sorted_values[:-1]):
+            i = np.logical_and(x>low, x<=up)
+            dL_dX[i] = dL_dX[i]*g
+        return dL_dX.reshape(X.shape)
diff --git a/GPy/models/gradient_checker.py b/GPy/models/gradient_checker.py
--- a/GPy/models/gradient_checker.py
+++ b/GPy/models/gradient_checker.py
+class HessianChecker(GradientChecker):
+
+    def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3, block_indices=None, plot=False):
+        """
+        Check the analytic Hessian against a numerical estimate. The gradient
+        is considered correct if the ratio of the analytical and numerical
+        gradients is within <tolerance> of unity.
+        """
+ try:
+ import numdifftools as nd
+        except ImportError:
+ raise ImportError("Don't have numdifftools package installed, it is not a GPy dependency as of yet, it is only used for hessian tests")
+
+ if target_param:
+ raise NotImplementedError('Only basic functionality is provided with this gradchecker')
+
+ #Repeat for each parameter, not the nicest but shouldn't be many cases where there are many
+ #variables
+ current_index = 0
+ for name, shape in zip(self.names, self.shapes):
+ current_size = numpy.prod(shape)
+ x = self.optimizer_array.copy()
+ #x = self._get_params_transformed().copy()
+ x = x[current_index:current_index + current_size].reshape(shape)
+
+ # Check gradients
+ analytic_hess = self._ddf(x)
+ if analytic_hess.shape[1] == 1:
+ analytic_hess = numpy.diagflat(analytic_hess)
+
+ #From the docs:
+ #x0 : vector location
+ #at which to differentiate fun
+ #If x0 is an N x M array, then fun is assumed to be a function
+ #of N*M variables., thus we must have it flat, not (N,1), but just (N,)
+ #numeric_hess_partial = nd.Hessian(self._f, vectorized=False)
+ numeric_hess_partial = nd.Jacobian(self._df, vectorized=False)
+ #numeric_hess_partial = nd.Derivative(self._df, vectorized=True)
+ numeric_hess = numeric_hess_partial(x)
+
+ check_passed = self.checkgrad_block(analytic_hess, numeric_hess, verbose=verbose, step=step, tolerance=tolerance, block_indices=block_indices, plot=plot)
+ current_index += current_size
+ return check_passed
+
+ def checkgrad_block(self, analytic_hess, numeric_hess, verbose=False, step=1e-6, tolerance=1e-3, block_indices=None, plot=False):
+ """
+ Checkgrad a block matrix
+ """
+ if analytic_hess.dtype is np.dtype('object'):
+ #Make numeric hessian also into a block matrix
+ real_size = get_block_shapes(analytic_hess)
+ num_elements = np.sum(real_size)
+ if (num_elements, num_elements) == numeric_hess.shape:
+ #If the sizes are the same we assume they are the same
+ #(we have not fixed any values so the numeric is the whole hessian)
+ numeric_hess = get_blocks(numeric_hess, real_size)
+ else:
+ #Make a fake empty matrix and fill out the correct block
+ tmp_numeric_hess = get_blocks(np.zeros((num_elements, num_elements)), real_size)
+ tmp_numeric_hess[block_indices] = numeric_hess.copy()
+ numeric_hess = tmp_numeric_hess
+
+ if block_indices is not None:
+ #Extract the right block
+ analytic_hess = analytic_hess[block_indices]
+ numeric_hess = numeric_hess[block_indices]
+ else:
+ #Unblock them if they are in blocks and you aren't checking a single block (checking whole hessian)
+ if analytic_hess.dtype is np.dtype('object'):
+ analytic_hess = unblock(analytic_hess)
+ numeric_hess = unblock(numeric_hess)
+
+ ratio = numeric_hess / (numpy.where(analytic_hess==0, 1e-10, analytic_hess))
+ difference = numpy.abs(analytic_hess - numeric_hess)
+
+ check_passed = numpy.all((numpy.abs(1 - ratio)) < tolerance) or numpy.allclose(numeric_hess, analytic_hess, atol = tolerance)
+
+ if verbose:
+            if block_indices:
+                print("\nBlock {}".format(block_indices))
+            else:
+                print("\nAll blocks")
+
+            header = ['Checked', 'Max-Ratio', 'Min-Ratio', 'Min-Difference', 'Max-Difference']
+            header_string = ' | '.join(header)
+            separator = '-' * len(header_string)
+            print('\n'.join([header_string, separator]))
+ min_r = '%.6f' % float(numpy.min(ratio))
+ max_r = '%.6f' % float(numpy.max(ratio))
+ max_d = '%.6f' % float(numpy.max(difference))
+ min_d = '%.6f' % float(numpy.min(difference))
+ cols = [max_r, min_r, min_d, max_d]
+
+ if check_passed:
+ checked = "\033[92m True \033[0m"
+ else:
+ checked = "\033[91m False \033[0m"
+
+ grad_string = "{} | {} | {} | {} | {} ".format(checked, cols[0], cols[1], cols[2], cols[3])
+            print(grad_string)
+
+ if plot:
+ import pylab as pb
+ fig, axes = pb.subplots(2, 2)
+ max_lim = numpy.max(numpy.vstack((analytic_hess, numeric_hess)))
+ min_lim = numpy.min(numpy.vstack((analytic_hess, numeric_hess)))
+ msa = axes[0,0].matshow(analytic_hess, vmin=min_lim, vmax=max_lim)
+ axes[0,0].set_title('Analytic hessian')
+ axes[0,0].xaxis.set_ticklabels([None])
+ axes[0,0].yaxis.set_ticklabels([None])
+ axes[0,0].xaxis.set_ticks([None])
+ axes[0,0].yaxis.set_ticks([None])
+ msn = axes[0,1].matshow(numeric_hess, vmin=min_lim, vmax=max_lim)
+ pb.colorbar(msn, ax=axes[0,1])
+ axes[0,1].set_title('Numeric hessian')
+ axes[0,1].xaxis.set_ticklabels([None])
+ axes[0,1].yaxis.set_ticklabels([None])
+ axes[0,1].xaxis.set_ticks([None])
+ axes[0,1].yaxis.set_ticks([None])
+ msr = axes[1,0].matshow(ratio)
+ pb.colorbar(msr, ax=axes[1,0])
+ axes[1,0].set_title('Ratio')
+ axes[1,0].xaxis.set_ticklabels([None])
+ axes[1,0].yaxis.set_ticklabels([None])
+ axes[1,0].xaxis.set_ticks([None])
+ axes[1,0].yaxis.set_ticks([None])
+ msd = axes[1,1].matshow(difference)
+ pb.colorbar(msd, ax=axes[1,1])
+ axes[1,1].set_title('difference')
+ axes[1,1].xaxis.set_ticklabels([None])
+ axes[1,1].yaxis.set_ticklabels([None])
+ axes[1,1].xaxis.set_ticks([None])
+ axes[1,1].yaxis.set_ticks([None])
+ if block_indices:
+ fig.suptitle("Block: {}".format(block_indices))
+ pb.show()
+
+ return check_passed
+
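
Stripped of the blocking and plotting machinery, the pass criterion used by checkgrad_block reduces to the following (a standalone sketch for illustration):

    import numpy as np

    def gradients_match(analytic, numeric, tolerance=1e-3):
        # Avoid division by zero in the ratio test.
        ratio = numeric / np.where(analytic == 0, 1e-10, analytic)
        # Pass if every ratio is within `tolerance` of unity, or if the two
        # arrays agree absolutely to within `tolerance`.
        return np.all(np.abs(1 - ratio) < tolerance) or np.allclose(numeric, analytic, atol=tolerance)

    print(gradients_match(np.array([1., 2.]), np.array([1.0001, 2.0002])))  # True
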
+class SkewChecker(HessianChecker):
+
+ def __init__(self, df, ddf, dddf, x0, names=None, *args, **kwargs):
+ """
+ :param df: gradient of function
+ :param ddf: Gradient of function to check (hessian)
+ :param dddf: Analytical gradient function (third derivative)
+ :param x0:
+ Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
+        Can be a list of arrays if the function takes a list of arrays. This list will be passed
+ to f and df in the same order as given here.
+ If only one argument, make sure not to pass a list!!!
+
+ :type x0: [array-like] | array-like | float | int
+ :param names:
+ Names to print, when performing gradcheck. If a list was passed to x0
+ a list of names with the same length is expected.
+ :param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
+
+ """
+ super(SkewChecker, self).__init__(df, ddf, dddf, x0, names=names, *args, **kwargs)
+
+ def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3, block_indices=None, plot=False, super_plot=False):
+ """
+ Gradient checker that just checks each hessian individually
+
+ super_plot will plot the hessian wrt every parameter, plot will just do the first one
+ """
+ try:
+ import numdifftools as nd
+        except ImportError:
+ raise ImportError("Don't have numdifftools package installed, it is not a GPy dependency as of yet, it is only used for hessian tests")
+
+ if target_param:
+ raise NotImplementedError('Only basic functionality is provided with this gradchecker')
+
+ #Repeat for each parameter, not the nicest but shouldn't be many cases where there are many
+ #variables
+ current_index = 0
+ for name, n_shape in zip(self.names, self.shapes):
+ current_size = numpy.prod(n_shape)
+ x = self.optimizer_array.copy()
+ #x = self._get_params_transformed().copy()
+ x = x[current_index:current_index + current_size].reshape(n_shape)
+
+ # Check gradients
+ #Actually the third derivative
+ analytic_hess = self._ddf(x)
+
+ #Can only calculate jacobian for one variable at a time
+ #From the docs:
+ #x0 : vector location
+ #at which to differentiate fun
+ #If x0 is an N x M array, then fun is assumed to be a function
+ #of N*M variables., thus we must have it flat, not (N,1), but just (N,)
+ #numeric_hess_partial = nd.Hessian(self._f, vectorized=False)
+ #Actually _df is already the hessian
+ numeric_hess_partial = nd.Jacobian(self._df, vectorized=True)
+ numeric_hess = numeric_hess_partial(x)
+
+            print("Done making numerical hessian")
+            if analytic_hess.dtype is np.dtype('object'):
+                #Blockify numeric_hess as well
+ blocksizes, pagesizes = get_block_shapes_3d(analytic_hess)
+ #HACK
+ real_block_size = np.sum(blocksizes)
+ numeric_hess = numeric_hess.reshape(real_block_size, real_block_size, pagesizes)
+ #numeric_hess = get_blocks_3d(numeric_hess, blocksizes)#, pagesizes)
+ else:
+ numeric_hess = numeric_hess.reshape(*analytic_hess.shape)
+
+ #Check every block individually (for ease)
+ check_passed = [False]*numeric_hess.shape[2]
+            for block_ind in range(numeric_hess.shape[2]):
+                #Unless super_plot is set, just plot the last one
+                p = (plot and block_ind == numeric_hess.shape[2]-1) or super_plot
+                if verbose:
+                    print("Checking derivative of hessian wrt parameter number {}".format(block_ind))
+ check_passed[block_ind] = self.checkgrad_block(analytic_hess[:,:,block_ind], numeric_hess[:,:,block_ind], verbose=verbose, step=step, tolerance=tolerance, block_indices=block_indices, plot=p)
+
+ current_index += current_size
+ return np.all(check_passed)
+
diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py
index f3e643c9..be01b769 100644
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@@ -74,6 +74,8 @@ class MRD(BayesianGPLVMMiniBatch):
self.logger.debug("creating observable arrays")
self.Ylist = [ObsAr(Y) for Y in Ylist]
+ #The next line is a fix for Python 3. It replicates the python 2 behaviour from the above comprehension
+ Y = Ylist[-1]
if Ynames is None:
self.logger.debug("creating Ynames")
@@ -82,7 +84,7 @@ class MRD(BayesianGPLVMMiniBatch):
assert len(self.names) == len(self.Ylist), "one name per dataset, or None if Ylist is a dict"
if inference_method is None:
- self.inference_method = InferenceMethodList([VarDTC() for _ in xrange(len(self.Ylist))])
+ self.inference_method = InferenceMethodList([VarDTC() for _ in range(len(self.Ylist))])
else:
assert isinstance(inference_method, InferenceMethodList), "please provide one inference method per Y in the list and provide it as InferenceMethodList, inference_method given: {}".format(inference_method)
self.inference_method = inference_method
@@ -137,7 +139,7 @@ class MRD(BayesianGPLVMMiniBatch):
self.bgplvms = []
- for i, n, k, l, Y, im, bs in itertools.izip(itertools.count(), Ynames, kernels, likelihoods, Ylist, self.inference_method, batchsize):
+ for i, n, k, l, Y, im, bs in zip(itertools.count(), Ynames, kernels, likelihoods, Ylist, self.inference_method, batchsize):
assert Y.shape[0] == self.num_data, "All datasets need to share the number of datapoints, and those have to correspond to one another"
md = np.isnan(Y).any()
spgp = BayesianGPLVMMiniBatch(Y, input_dim, X, X_variance,
@@ -164,7 +166,7 @@ class MRD(BayesianGPLVMMiniBatch):
self._log_marginal_likelihood = 0
self.Z.gradient[:] = 0.
self.X.gradient[:] = 0.
- for b, i in itertools.izip(self.bgplvms, self.inference_method):
+ for b, i in zip(self.bgplvms, self.inference_method):
self._log_marginal_likelihood += b._log_marginal_likelihood
self.logger.info('working on im <{}>'.format(hex(id(i))))
@@ -195,7 +197,7 @@ class MRD(BayesianGPLVMMiniBatch):
elif init in "PCA_single":
X = np.zeros((Ylist[0].shape[0], self.input_dim))
fracs = []
- for qs, Y in itertools.izip(np.array_split(np.arange(self.input_dim), len(Ylist)), Ylist):
+ for qs, Y in zip(np.array_split(np.arange(self.input_dim), len(Ylist)), Ylist):
x,frcs = initialize_latent('PCA', len(qs), Y)
X[:, qs] = x
fracs.append(frcs)
@@ -327,9 +329,9 @@ class MRD(BayesianGPLVMMiniBatch):
def __getstate__(self):
state = super(MRD, self).__getstate__()
- if state.has_key('kern'):
+ if 'kern' in state:
del state['kern']
- if state.has_key('likelihood'):
+ if 'likelihood' in state:
del state['likelihood']
return state
@@ -338,4 +340,4 @@ class MRD(BayesianGPLVMMiniBatch):
super(MRD, self).__setstate__(state)
self.kern = self.bgplvms[0].kern
self.likelihood = self.bgplvms[0].likelihood
- self.parameters_changed()
\ No newline at end of file
+ self.parameters_changed()
diff --git a/GPy/models/one_vs_all_sparse_classification.py b/GPy/models/one_vs_all_sparse_classification.py
index 3bdd2647..7528ffd2 100644
--- a/GPy/models/one_vs_all_sparse_classification.py
+++ b/GPy/models/one_vs_all_sparse_classification.py
@@ -30,7 +30,7 @@ class OneVsAllSparseClassification(object):
self.results = {}
for yj in labels:
- print 'Class %s vs all' %yj
+ print('Class %s vs all' %yj)
Ynew = Y.copy()
Ynew[Y.flatten()!=yj] = 0
Ynew[Y.flatten()==yj] = 1
diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py
index e827bb70..ad62043a 100644
--- a/GPy/models/sparse_gp_minibatch.py
+++ b/GPy/models/sparse_gp_minibatch.py
@@ -1,6 +1,7 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
+from __future__ import print_function
import numpy as np
from ..core.parameterization.param import Param
from ..core.sparse_gp import SparseGP
@@ -43,14 +44,15 @@ class SparseGPMiniBatch(SparseGP):
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
name='sparse gp', Y_metadata=None, normalizer=False,
missing_data=False, stochastic=False, batchsize=1):
- #pick a sensible inference method
+
+ # pick a sensible inference method
if inference_method is None:
if isinstance(likelihood, likelihoods.Gaussian):
- inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
+ inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1])
else:
#inference_method = ??
- raise NotImplementedError, "what to do what to do?"
- print "defaulting to ", inference_method, "for latent function inference"
+ raise NotImplementedError("what to do what to do?")
+ print("defaulting to ", inference_method, "for latent function inference")
self.kl_factr = 1.
self.Z = Param('inducing inputs', Z)
@@ -80,13 +82,13 @@ class SparseGPMiniBatch(SparseGP):
overall = self.Y_normalized.shape[1]
m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall)
message = m_f(-1)
- print message,
- for d in xrange(overall):
+ print(message, end=' ')
+ for d in range(overall):
self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None])
- print ' '*(len(message)+1) + '\r',
+ print(' '*(len(message)+1) + '\r', end=' ')
message = m_f(d)
- print message,
- print ''
+ print(message, end=' ')
+ print('')
self.posterior = None
@@ -181,11 +183,11 @@ class SparseGPMiniBatch(SparseGP):
full_values[key][value_indices[key]] += current_values[key]
"""
for key in current_values.keys():
- if value_indices is not None and value_indices.has_key(key):
+ if value_indices is not None and key in value_indices:
index = value_indices[key]
else:
index = slice(None)
- if full_values.has_key(key):
+ if key in full_values:
full_values[key][index] += current_values[key]
else:
full_values[key] = current_values[key]
@@ -241,15 +243,15 @@ class SparseGPMiniBatch(SparseGP):
if not self.stochastics:
m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim)
message = m_f(-1)
- print message,
+ print(message, end=' ')
for d in self.stochastics.d:
ninan = self.ninan[:, d]
if not self.stochastics:
- print ' '*(len(message)) + '\r',
+ print(' '*(len(message)) + '\r', end=' ')
message = m_f(d)
- print message,
+ print(message, end=' ')
posterior, log_marginal_likelihood, \
grad_dict, current_values, value_indices = self._inner_parameters_changed(
@@ -268,7 +270,7 @@ class SparseGPMiniBatch(SparseGP):
woodbury_vector[:, d:d+1] = posterior.woodbury_vector
self._log_marginal_likelihood += log_marginal_likelihood
if not self.stochastics:
- print ''
+ print('')
if self.posterior is None:
self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index a61ad2a0..0f3b8fdd 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -39,7 +39,10 @@ class SSGPLVM(SparseGP_MPI):
X_variance = np.random.uniform(0,.1,X.shape)
if Gamma is None:
- gamma = np.random.randn(X.shape[0], input_dim)
+ gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+ gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
+ gamma[gamma>1.-1e-9] = 1.-1e-9
+ gamma[gamma<1e-9] = 1e-9
else:
gamma = Gamma.copy()
@@ -71,7 +74,7 @@ class SSGPLVM(SparseGP_MPI):
self.link_parameter(self.X, index=0)
if self.group_spike:
- [self.X.gamma[:,i].tie('tieGamma'+str(i)) for i in xrange(self.X.gamma.shape[1])] # Tie columns together
+ [self.X.gamma[:,i].tie('tieGamma'+str(i)) for i in range(self.X.gamma.shape[1])] # Tie columns together
def set_X_gradients(self, X, X_grad):
"""Set the gradients of the posterior distribution of X in its specific form."""
diff --git a/GPy/models/ss_mrd.py b/GPy/models/ss_mrd.py
index 036ac095..bd2efce0 100644
--- a/GPy/models/ss_mrd.py
+++ b/GPy/models/ss_mrd.py
@@ -19,10 +19,10 @@ class SSMRD(Model):
name='model_'+str(i)) for i,y in enumerate(Ylist)]
self.add_parameters(*(self.models))
- [[[self.models[m].X.mean[i,j:j+1].tie('mean_'+str(i)+'_'+str(j)) for m in xrange(len(self.models))] for j in xrange(self.models[0].X.mean.shape[1])]
- for i in xrange(self.models[0].X.mean.shape[0])]
- [[[self.models[m].X.variance[i,j:j+1].tie('var_'+str(i)+'_'+str(j)) for m in xrange(len(self.models))] for j in xrange(self.models[0].X.variance.shape[1])]
- for i in xrange(self.models[0].X.variance.shape[0])]
+ [[[self.models[m].X.mean[i,j:j+1].tie('mean_'+str(i)+'_'+str(j)) for m in range(len(self.models))] for j in range(self.models[0].X.mean.shape[1])]
+ for i in range(self.models[0].X.mean.shape[0])]
+ [[[self.models[m].X.variance[i,j:j+1].tie('var_'+str(i)+'_'+str(j)) for m in range(len(self.models))] for j in range(self.models[0].X.variance.shape[1])]
+ for i in range(self.models[0].X.variance.shape[0])]
self.updates = True
@@ -31,4 +31,4 @@ class SSMRD(Model):
self._log_marginal_likelihood = sum([m._log_marginal_likelihood for m in self.models])
def log_likelihood(self):
- return self._log_marginal_likelihood
\ No newline at end of file
+ return self._log_marginal_likelihood
diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 4b982ed2..5bc9a417 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -1,7 +1,6 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
import numpy as np
from ..util.warping_functions import *
from ..core import GP
@@ -10,14 +9,16 @@ from GPy.util.warping_functions import TanhWarpingFunction_d
from GPy import kern
class WarpedGP(GP):
- def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False):
+ def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3):
if kernel is None:
- kernel = kern.rbf(X.shape[1])
+ kernel = kern.RBF(X.shape[1])
if warping_function == None:
self.warping_function = TanhWarpingFunction_d(warping_terms)
self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
+ else:
+ self.warping_function = warping_function
self.scale_data = False
if self.scale_data:
@@ -25,10 +26,10 @@ class WarpedGP(GP):
self.has_uncertain_inputs = False
self.Y_untransformed = Y.copy()
self.predict_in_warped_space = False
- likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y)
+ likelihood = likelihoods.Gaussian()
- GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
- self._set_params(self._get_params())
+ GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel)
+ self.link_parameter(self.warping_function)
def _scale_data(self, Y):
self._Ymax = Y.max()
@@ -38,62 +39,55 @@ class WarpedGP(GP):
def _unscale_data(self, Y):
return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin
- def _set_params(self, x):
- self.warping_params = x[:self.warping_function.num_parameters]
- Y = self.transform_data()
- self.likelihood.set_data(Y)
- GP._set_params(self, x[self.warping_function.num_parameters:].copy())
+ def parameters_changed(self):
+ self.Y[:] = self.transform_data()
+ super(WarpedGP, self).parameters_changed()
- def _get_params(self):
- return np.hstack((self.warping_params.flatten().copy(), GP._get_params(self).copy()))
+ Kiy = self.posterior.woodbury_vector.flatten()
- def _get_param_names(self):
- warping_names = self.warping_function._get_param_names()
- param_names = GP._get_param_names(self)
- return warping_names + param_names
-
- def transform_data(self):
- Y = self.warping_function.f(self.Y_untransformed.copy(), self.warping_params).copy()
- return Y
-
- def log_likelihood(self):
- ll = GP.log_likelihood(self)
- jacobian = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
- return ll + np.log(jacobian).sum()
-
- def _log_likelihood_gradients(self):
- ll_grads = GP._log_likelihood_gradients(self)
- alpha = np.dot(self.Ki, self.likelihood.Y.flatten())
- warping_grads = self.warping_function_gradients(alpha)
-
- warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1])
- return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
-
- def warping_function_gradients(self, Kiy):
- grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
- grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params,
+ grad_y = self.warping_function.fgrad_y(self.Y_untransformed)
+ grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed,
return_covar_chain=True)
djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
- return -dquad_dpsi + djac_dpsi
+ warping_grads = -dquad_dpsi + djac_dpsi
+
+ self.warping_function.psi.gradient[:] = warping_grads[:, :-1]
+ self.warping_function.d.gradient[:] = warping_grads[0, -1]
+
+
+ def transform_data(self):
+ Y = self.warping_function.f(self.Y_untransformed.copy()).copy()
+ return Y
+
+ def log_likelihood(self):
+ ll = GP.log_likelihood(self)
+ jacobian = self.warping_function.fgrad_y(self.Y_untransformed)
+ return ll + np.log(jacobian).sum()
def plot_warping(self):
- self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max())
+ self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max())
- def predict(self, Xnew, which_parts='all', full_cov=False, pred_init=None):
+ def predict(self, Xnew, which_parts='all', pred_init=None):
# normalize X values
- Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
- mu, var = GP._raw_predict(self, Xnew, full_cov=full_cov, which_parts=which_parts)
+ # Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
+ mu, var = GP._raw_predict(self, Xnew)
# now push through likelihood
- mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
+ mean, var = self.likelihood.predictive_values(mu, var)
if self.predict_in_warped_space:
- mean = self.warping_function.f_inv(mean, self.warping_params, y=pred_init)
- var = self.warping_function.f_inv(var, self.warping_params)
+ mean = self.warping_function.f_inv(mean, y=pred_init)
+ var = self.warping_function.f_inv(var)
if self.scale_data:
mean = self._unscale_data(mean)
-
- return mean, var, _025pm, _975pm
+
+ return mean, var
+
+if __name__ == '__main__':
+ X = np.random.randn(100, 1)
+ Y = np.sin(X) + np.random.randn(100, 1)*0.05
+
+ m = WarpedGP(X, Y)
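
The __main__ block above only constructs the model; a typical follow-on, assuming the standard GPy model API, would be:

    m.optimize()                 # fit kernel, noise and warping parameters jointly
    print(m.log_likelihood())    # includes the log-Jacobian of the warp
    m.plot_warping()             # visualise the learned warping function
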
diff --git a/GPy/old_tests/mapping_tests.py b/GPy/old_tests/mapping_tests.py
index d501df1d..8e4f250d 100644
--- a/GPy/old_tests/mapping_tests.py
+++ b/GPy/old_tests/mapping_tests.py
@@ -5,6 +5,30 @@ import unittest
import numpy as np
import GPy
+class MappingGradChecker(GPy.core.Model):
+ """
+ This class has everything we need to check the gradient of a mapping. It
+ implement a simple likelihood which is the sum of the outputs of the
+ mapping. the gradients are checked against the parameters of the mapping
+ and the input.
+ """
+ def __init__(self, mapping, X, name):
+        super(MappingGradChecker, self).__init__(name)
+        self.mapping = mapping
+        self.add_parameter(self.mapping)
+        self.X = GPy.core.Param('X', X)
+        self.add_parameter(self.X)
+        self.dL_dY = np.ones((self.X.shape[0], self.mapping.output_dim))
+    def log_likelihood(self):
+        return np.sum(self.mapping.f(self.X))
+ def parameters_changed(self):
+ self.X.gradient = self.mapping.gradients_X(self.dL_dY, self.X)
+ self.mapping.update_gradients(self.dL_dY, self.X)
+
+
class MappingTests(unittest.TestCase):
diff --git a/GPy/plotting/__init__.py b/GPy/plotting/__init__.py
index d3a96914..9dd84441 100644
--- a/GPy/plotting/__init__.py
+++ b/GPy/plotting/__init__.py
@@ -2,6 +2,6 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
try:
- import matplot_dep
+ from . import matplot_dep
except (ImportError, NameError):
- print 'Fail to load GPy.plotting.matplot_dep.'
\ No newline at end of file
+ print('Fail to load GPy.plotting.matplot_dep.')
\ No newline at end of file
diff --git a/GPy/plotting/matplot_dep/__init__.py b/GPy/plotting/matplot_dep/__init__.py
index 4c4402ce..a60b52c2 100644
--- a/GPy/plotting/matplot_dep/__init__.py
+++ b/GPy/plotting/matplot_dep/__init__.py
@@ -15,4 +15,4 @@ import netpbmfile
import inference_plots
import maps
import img_plots
-from ssgplvm import SSGPLVM_plot
+from ssgplvm import SSGPLVM_plot
\ No newline at end of file
diff --git a/GPy/plotting/matplot_dep/base_plots.py b/GPy/plotting/matplot_dep/base_plots.py
index b4142342..f25aee49 100644
--- a/GPy/plotting/matplot_dep/base_plots.py
+++ b/GPy/plotting/matplot_dep/base_plots.py
@@ -133,7 +133,7 @@ def x_frame1D(X,plot_limits=None,resolution=None):
elif len(plot_limits)==2:
xmin, xmax = plot_limits
else:
- raise ValueError, "Bad limits for plotting"
+ raise ValueError("Bad limits for plotting")
Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
return Xnew, xmin, xmax
@@ -149,7 +149,7 @@ def x_frame2D(X,plot_limits=None,resolution=None):
elif len(plot_limits)==2:
xmin, xmax = plot_limits
else:
- raise ValueError, "Bad limits for plotting"
+ raise ValueError("Bad limits for plotting")
resolution = resolution or 50
xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py
index 1398b40c..2c243e13 100644
--- a/GPy/plotting/matplot_dep/dim_reduction_plots.py
+++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py
@@ -27,7 +27,7 @@ def most_significant_input_dimensions(model, which_indices):
try:
input_1, input_2 = np.argsort(model.input_sensitivity())[::-1][:2]
except:
- raise ValueError, "cannot automatically determine which dimensions to plot, please pass 'which_indices'"
+ raise ValueError("cannot automatically determine which dimensions to plot, please pass 'which_indices'")
else:
input_1, input_2 = which_indices
return input_1, input_2
@@ -62,7 +62,7 @@ def plot_latent(model, labels=None, which_indices=None,
if X.shape[0] > 1000:
- print "Warning: subsampling X, as it has more samples then 1000. X.shape={!s}".format(X.shape)
+ print("Warning: subsampling X, as it has more samples then 1000. X.shape={!s}".format(X.shape))
subsample = np.random.choice(X.shape[0], size=1000, replace=False)
X = X[subsample]
labels = labels[subsample]
@@ -133,7 +133,7 @@ def plot_latent(model, labels=None, which_indices=None,
try:
xmin, xmax, ymin, ymax = plot_limits
except (TypeError, ValueError) as e:
- raise e.__class__, "Wrong plot limits: {} given -> need (xmin, xmax, ymin, ymax)".format(plot_limits)
+ raise e.__class__("Wrong plot limits: {} given -> need (xmin, xmax, ymin, ymax)".format(plot_limits))
view = ImshowController(ax, plot_function,
(xmin, ymin, xmax, ymax),
resolution, aspect=aspect, interpolation='bilinear',
@@ -187,14 +187,14 @@ def plot_latent(model, labels=None, which_indices=None,
fig.tight_layout()
fig.canvas.draw()
except Exception as e:
- print "Could not invoke tight layout: {}".format(e)
+ print("Could not invoke tight layout: {}".format(e))
pass
if updates:
try:
ax.figure.canvas.show()
except Exception as e:
- print "Could not invoke show: {}".format(e)
+ print("Could not invoke show: {}".format(e))
raw_input('Enter to continue')
view.deactivate()
return ax
diff --git a/GPy/plotting/matplot_dep/img_plots.py b/GPy/plotting/matplot_dep/img_plots.py
index 453a904d..5346545d 100644
--- a/GPy/plotting/matplot_dep/img_plots.py
+++ b/GPy/plotting/matplot_dep/img_plots.py
@@ -50,8 +50,8 @@ def plot_2D_images(figure, arr, symmetric=False, pad=None, zoom=None, mode=None,
buf = np.ones((y_size*fig_nrows+pad*(fig_nrows-1), x_size*fig_ncols+pad*(fig_ncols-1), 3),dtype=arr.dtype)
- for y in xrange(fig_nrows):
- for x in xrange(fig_ncols):
+ for y in range(fig_nrows):
+ for x in range(fig_ncols):
             if y*fig_ncols+x < arr.shape[0]:
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
     if len(failing_models.keys()) > 0:
-        print "Failing models: "
-        print failing_models
+        print("Failing models: ")
+        print(failing_models)
if len(failing_models.keys()) > 0:
- print failing_models
+ print(failing_models)
raise Exception(failing_models)
if __name__ == "__main__":
- print "Running unit tests, please be (very) patient..."
+ print("Running unit tests, please be (very) patient...")
# unittest.main()
test_models()
diff --git a/GPy/testing/index_operations_tests.py b/GPy/testing/index_operations_tests.py
index e5c2011a..a97f1beb 100644
--- a/GPy/testing/index_operations_tests.py
+++ b/GPy/testing/index_operations_tests.py
@@ -121,14 +121,16 @@ class Test(unittest.TestCase):
self.assertListEqual(removed.tolist(), [0, 2])
def test_misc(self):
- for k,v in self.param_index.copy()._properties.iteritems():
+ #py3 fix
+ #for k,v in self.param_index.copy()._properties.iteritems():
+ for k,v in self.param_index.copy()._properties.items():
self.assertListEqual(self.param_index[k].tolist(), v.tolist())
self.assertEqual(self.param_index.size, 8)
self.assertEqual(self.view.size, 5)
def test_print(self):
- print self.param_index
- print self.view
+ print(self.param_index)
+ print(self.view)
if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.test_index_view']
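The `_properties.iteritems()` fix is the standard dictionary-iteration port: Python 3 dropped iteritems()/iterkeys()/itervalues(), and items() now returns a lazy view, so the py3 spelling is also fine (if slightly more eager) on py2. Illustration:

    d = {"a": 1, "b": 2}
    # d.iteritems()         # Python 2 only -- AttributeError on Python 3
    for k, v in d.items():  # lazy view on py3, list on py2; correct on both
        print(k, v)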
diff --git a/GPy/testing/inference_tests.py b/GPy/testing/inference_tests.py
index ac92c519..e09df1fe 100644
--- a/GPy/testing/inference_tests.py
+++ b/GPy/testing/inference_tests.py
@@ -11,39 +11,38 @@ import GPy
class InferenceXTestCase(unittest.TestCase):
-
+
def genData(self):
D1,D2,N = 12,12,50
- np.random.seed(1234)
-
+
x = np.linspace(0, 4 * np.pi, N)[:, None]
s1 = np.vectorize(lambda x: np.sin(x))
s2 = np.vectorize(lambda x: np.cos(x)**2)
s3 = np.vectorize(lambda x:-np.exp(-np.cos(2 * x)))
sS = np.vectorize(lambda x: np.cos(x))
-
+
s1 = s1(x)
s2 = s2(x)
s3 = s3(x)
sS = sS(x)
-
+
s1 -= s1.mean(); s1 /= s1.std(0)
s2 -= s2.mean(); s2 /= s2.std(0)
s3 -= s3.mean(); s3 /= s3.std(0)
sS -= sS.mean(); sS /= sS.std(0)
-
+
S1 = np.hstack([s1, sS])
S2 = np.hstack([s3, sS])
-
+
P1 = np.random.randn(S1.shape[1], D1)
P2 = np.random.randn(S2.shape[1], D2)
-
+
Y1 = S1.dot(P1)
Y2 = S2.dot(P2)
-
+
Y1 += .01 * np.random.randn(*Y1.shape)
Y2 += .01 * np.random.randn(*Y2.shape)
-
+
Y1 -= Y1.mean(0)
Y2 -= Y2.mean(0)
Y1 /= Y1.std(0)
@@ -52,33 +51,34 @@ class InferenceXTestCase(unittest.TestCase):
slist = [s1, s2, s3, sS]
slist_names = ["s1", "s2", "s3", "sS"]
Ylist = [Y1, Y2]
-
+
return Ylist
-
+
def test_inferenceX_BGPLVM(self):
Ys = self.genData()
m = GPy.models.BayesianGPLVM(Ys[0],5,kernel=GPy.kern.Linear(5,ARD=True))
-
+
x,mi = m.infer_newX(m.Y, optimize=False)
self.assertTrue(mi.checkgrad())
-
- m.optimize(max_iters=10000)
- x,mi = m.infer_newX(m.Y)
- self.assertTrue(np.allclose(m.X.mean, mi.X.mean))
- self.assertTrue(np.allclose(m.X.variance, mi.X.variance))
+ m.optimize(max_iters=10000)
+ x, mi = m.infer_newX(m.Y)
+
+ print(m.X.mean - mi.X.mean)
+ self.assertTrue(np.allclose(m.X.mean, mi.X.mean, rtol=1e-4, atol=1e-4))
+ self.assertTrue(np.allclose(m.X.variance, mi.X.variance, rtol=1e-4, atol=1e-4))
def test_inferenceX_GPLVM(self):
Ys = self.genData()
m = GPy.models.GPLVM(Ys[0],3,kernel=GPy.kern.RBF(3,ARD=True))
-
+
x,mi = m.infer_newX(m.Y, optimize=False)
self.assertTrue(mi.checkgrad())
-
+
# m.optimize(max_iters=10000)
# x,mi = m.infer_newX(m.Y)
# self.assertTrue(np.allclose(m.X, x))
-
+
if __name__ == "__main__":
unittest.main()
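The loosened assertion in test_inferenceX_BGPLVM is worth noting: after a stochastic optimisation the inferred X only matches to a few significant figures, so the test now passes explicit tolerances instead of relying on np.allclose's defaults (rtol=1e-5, atol=1e-8). For example:

    import numpy as np
    a = np.array([1.0, 2.0])
    b = np.array([1.00004, 1.99996])                 # off by 4e-5
    assert not np.allclose(a, b)                     # fails the default tolerances
    assert np.allclose(a, b, rtol=1e-4, atol=1e-4)   # passes the loosened ones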
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index c1bb9265..f3d82216 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -37,7 +37,7 @@ class Kern_check_model(GPy.core.Model):
def is_positive_semi_definite(self):
v = np.linalg.eig(self.kernel.K(self.X))[0]
if any(v.real<=-1e-10):
- print v.real.min()
+ print(v.real.min())
return False
else:
return True
@@ -126,7 +126,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb
if result and verbose:
print("Check passed.")
if not result:
- print("Positive definite check failed for " + kern.name + " covariance function.")
+ print(("Positive definite check failed for " + kern.name + " covariance function."))
pass_checks = False
assert(result)
return False
@@ -137,7 +137,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb
if result and verbose:
print("Check passed.")
if not result:
- print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
+ print(("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:"))
Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True)
pass_checks = False
assert(result)
@@ -149,7 +149,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb
if result and verbose:
print("Check passed.")
if not result:
- print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
+ print(("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:"))
Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True)
pass_checks = False
assert(result)
@@ -162,11 +162,11 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb
except NotImplementedError:
result=True
if verbose:
- print("update_gradients_diag not implemented for " + kern.name)
+ print(("update_gradients_diag not implemented for " + kern.name))
if result and verbose:
print("Check passed.")
if not result:
- print("Gradient of Kdiag(X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
+ print(("Gradient of Kdiag(X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:"))
Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True)
pass_checks = False
assert(result)
@@ -182,13 +182,12 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb
except NotImplementedError:
result=True
if verbose:
- print("gradients_X not implemented for " + kern.name)
+ print(("gradients_X not implemented for " + kern.name))
if result and verbose:
print("Check passed.")
if not result:
- print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
+ print(("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:"))
testmodel.checkgrad(verbose=True)
- import ipdb;ipdb.set_trace()
assert(result)
pass_checks = False
return False
@@ -203,11 +202,11 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb
except NotImplementedError:
result=True
if verbose:
- print("gradients_X not implemented for " + kern.name)
+ print(("gradients_X not implemented for " + kern.name))
if result and verbose:
print("Check passed.")
if not result:
- print("Gradient of K(X, X2) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
+ print(("Gradient of K(X, X2) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:"))
testmodel.checkgrad(verbose=True)
assert(result)
pass_checks = False
@@ -223,11 +222,11 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb
except NotImplementedError:
result=True
if verbose:
- print("gradients_X not implemented for " + kern.name)
+ print(("gradients_X not implemented for " + kern.name))
if result and verbose:
print("Check passed.")
if not result:
- print("Gradient of Kdiag(X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
+ print(("Gradient of Kdiag(X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:"))
Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True)
pass_checks = False
assert(result)
@@ -256,13 +255,23 @@ class KernelGradientTestsContinuous(unittest.TestCase):
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+ def test_Prod1(self):
+ k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D)
+ k.randomize()
+ self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
def test_Prod2(self):
- k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
+ k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D)
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
def test_Prod3(self):
- k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
+ k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D) * GPy.kern.Bias(self.D)
+ k.randomize()
+ self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
+ def test_Prod4(self):
+ k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D) * GPy.kern.Matern32(2, active_dims=[0,1])
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
@@ -282,7 +291,7 @@ class KernelGradientTestsContinuous(unittest.TestCase):
try:
k.K(self.X)
except AssertionError:
- raise AssertionError, "k.K(X) should run on self.D-1 dimension"
+ raise AssertionError("k.K(X) should run on self.D-1 dimension")
def test_Matern52(self):
k = GPy.kern.Matern52(self.D)
@@ -401,11 +410,27 @@ class Coregionalize_weave_test(unittest.TestCase):
GPy.util.config.config.set('weave', 'working', 'False')
+class KernelTestsProductWithZeroValues(unittest.TestCase):
+
+ def setUp(self):
+ self.X = np.array([[0,1],[1,0]])
+ self.k = GPy.kern.Linear(2) * GPy.kern.Bias(2)
+
+ def test_zero_valued_kernel_full(self):
+ self.k.update_gradients_full(1, self.X)
+ self.assertFalse(np.isnan(self.k['linear.variances'].gradient),
+ "Gradient resulted in NaN")
+
+ def test_zero_valued_kernel_gradients_X(self):
+ target = self.k.gradients_X(1, self.X)
+ self.assertFalse(np.any(np.isnan(target)),
+ "Gradient resulted in NaN")
if __name__ == "__main__":
- print "Running unit tests, please be (very) patient..."
+ print("Running unit tests, please be (very) patient...")
unittest.main()
+
# np.random.seed(0)
# N0 = 3
# N1 = 9
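KernelTestsProductWithZeroValues pins down a regression where product-kernel gradients went NaN when an input column contained exact zeros. A minimal sketch of the same check outside the unittest harness, assuming the GPy kernel API used in the hunk above:

    import numpy as np
    import GPy

    X = np.array([[0., 1.], [1., 0.]])           # rows containing exact zeros
    k = GPy.kern.Linear(2) * GPy.kern.Bias(2)
    k.update_gradients_full(np.ones((2, 2)), X)  # dL_dK of all ones
    assert not np.any(np.isnan(k['linear.variances'].gradient))
    assert not np.any(np.isnan(k.gradients_X(np.ones((2, 2)), X)))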
diff --git a/GPy/testing/likelihood_tests.py b/GPy/testing/likelihood_tests.py
index 95929098..7fa5886f 100644
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
@@ -10,7 +10,7 @@ from GPy.likelihoods import link_functions
from GPy.core.parameterization import Param
from functools import partial
#np.random.seed(300)
-#np.random.seed(7)
+#np.random.seed(4)
#np.seterr(divide='raise')
def dparam_partial(inst_func, *args):
@@ -27,9 +27,9 @@ def dparam_partial(inst_func, *args):
param
"""
def param_func(param_val, param_name, inst_func, args):
- #inst_func.im_self._set_params(param)
- #inst_func.im_self.add_parameter(Param(param_name, param_val))
- inst_func.im_self[param_name] = param_val
+ #inst_func.__self__._set_params(param)
+ #inst_func.__self__.add_parameter(Param(param_name, param_val))
+ inst_func.__self__[param_name] = param_val
return inst_func(*args)
return functools.partial(param_func, inst_func=inst_func, args=args)
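dparam_partial needs to reach the likelihood instance that owns a bound method; the port swaps the py2-only `im_self` attribute for `__self__`, which exists on Python 2.6+ as well as Python 3. Illustration:

    class A(object):
        def f(self):
            return 42

    a = A()
    bound = a.f
    assert bound.__self__ is a  # works on Python 2.6+ and Python 3
    # bound.im_self             # Python 2 only -- AttributeError on Python 3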
@@ -44,38 +44,51 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None,
The number of parameters and N is the number of data
Need to take a slice out from f and a slice out of df
"""
- print "\n{} likelihood: {} vs {}".format(func.im_self.__class__.__name__,
- func.__name__, dfunc.__name__)
+ print("\n{} likelihood: {} vs {}".format(func.__self__.__class__.__name__,
+ func.__name__, dfunc.__name__))
partial_f = dparam_partial(func, *args)
partial_df = dparam_partial(dfunc, *args)
gradchecking = True
zipped_params = zip(params, params_names)
for param_ind, (param_val, param_name) in enumerate(zipped_params):
#Check one parameter at a time, make sure it is 2d (as some gradients only return arrays) then strip out the parameter
- fnum = np.atleast_2d(partial_f(param_val, param_name))[:, param_ind].shape[0]
- dfnum = np.atleast_2d(partial_df(param_val, param_name))[:, param_ind].shape[0]
+ f_ = partial_f(param_val, param_name)
+ df_ = partial_df(param_val, param_name)
+ #Reshape into a 3d array: we want (?, N, D) regardless of whether ? is num_params or not
+ fnum_, fdim_ = f_.shape
+ f_ = f_.reshape(-1, fnum_, fdim_)
+ df_ = df_.reshape(-1, fnum_, fdim_)
+
+ #Get the number of f and number of dimensions
+ fnum = f_.shape[-2]
+ fdim = f_.shape[-1]
+ dfnum = df_.shape[-2]
+
for fixed_val in range(dfnum):
#dlik and dlik_dvar gives back 1 value for each
f_ind = min(fnum, fixed_val+1) - 1
- print "fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val)
+ print("fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val))
#Make grad checker with this param moving, note that set_params is NOT being called
#The parameter is being set directly with __setattr__
#Check only the parameter and function value we wish to check at a time
- grad = GradientChecker(lambda p_val: np.atleast_2d(partial_f(p_val, param_name))[f_ind, param_ind],
- lambda p_val: np.atleast_2d(partial_df(p_val, param_name))[fixed_val, param_ind],
- param_val, [param_name])
+ #func = lambda p_val, fnum, fdim, param_ind, f_ind, param_ind: partial_f(p_val, param_name).reshape(-1, fnum, fdim)[param_ind, f_ind, :]
+ #dfunc_dparam = lambda d_val, fnum, fdim, param_ind, fixed_val: partial_df(d_val, param_name).reshape(-1, fnum, fdim)[param_ind, fixed_val, :]
+
+ #First we reshape the output such that it is (num_params, N, D), then we pull out the relevant parameter/f-index and checkgrad just that index at a time
+ func = lambda p_val: partial_f(p_val, param_name).reshape(-1, fnum, fdim)[param_ind, f_ind, :]
+ dfunc_dparam = lambda d_val: partial_df(d_val, param_name).reshape(-1, fnum, fdim)[param_ind, fixed_val, :]
+ grad = GradientChecker(func, dfunc_dparam, param_val, [param_name])
if constraints is not None:
for constrain_param, constraint in constraints:
if grad.grep_param_names(constrain_param):
constraint(constrain_param, grad)
else:
- print "parameter didn't exist"
- print constrain_param, " ", constraint
+ print("parameter didn't exist")
+ print(constrain_param, " ", constraint)
if randomize:
grad.randomize()
if verbose:
- print grad
+ print(grad)
grad.checkgrad(verbose=1)
if not grad.checkgrad(verbose=True):
gradchecking = False
@@ -104,37 +117,9 @@ class TestNoiseModels(object):
self.var = 0.2
- self.var = np.random.rand(1)
-
#Make a bigger step as lower bound can be quite curved
self.step = 1e-4
- def tearDown(self):
- self.Y = None
- self.f = None
- self.X = None
-
- def test_scale2_models(self):
- self.setUp()
-
- ####################################################
- # Constraint wrappers so we can just list them off #
- ####################################################
- def constrain_fixed(regex, model):
- model[regex].constrain_fixed()
-
- def constrain_negative(regex, model):
- model[regex].constrain_negative()
-
- def constrain_positive(regex, model):
- model[regex].constrain_positive()
-
- def constrain_bounded(regex, model, lower, upper):
- """
- Used like: partial(constrain_bounded, lower=0, upper=1)
- """
- model[regex].constrain_bounded(lower, upper)
-
"""
Dictionary where we nest models we would like to check
Name: {
@@ -149,136 +134,170 @@ class TestNoiseModels(object):
"link_f_constraints": [constraint_wrappers, listed_here]
}
"""
- noise_models = {"Student_t_default": {
- "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
- "grad_params": {
- "names": [".*t_scale2"],
- "vals": [self.var],
- "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
- #"constraints": [("t_scale2", constrain_positive), ("deg_free", partial(constrain_fixed, value=5))]
- },
- "laplace": True
- },
- "Student_t_1_var": {
- "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
- "grad_params": {
- "names": [".*t_scale2"],
- "vals": [1.0],
- "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
- },
- "laplace": True
- },
- "Student_t_small_deg_free": {
- "model": GPy.likelihoods.StudentT(deg_free=1.5, sigma2=self.var),
- "grad_params": {
- "names": [".*t_scale2"],
- "vals": [self.var],
- "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
- },
- "laplace": True
- },
- "Student_t_small_var": {
- "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
- "grad_params": {
- "names": [".*t_scale2"],
- "vals": [0.001],
- "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
- },
- "laplace": True
- },
- "Student_t_large_var": {
- "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
- "grad_params": {
- "names": [".*t_scale2"],
- "vals": [10.0],
- "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
- },
- "laplace": True
- },
- "Student_t_approx_gauss": {
- "model": GPy.likelihoods.StudentT(deg_free=1000, sigma2=self.var),
- "grad_params": {
- "names": [".*t_scale2"],
- "vals": [self.var],
- "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
- },
- "laplace": True
- },
- "Student_t_log": {
- "model": GPy.likelihoods.StudentT(gp_link=link_functions.Log(), deg_free=5, sigma2=self.var),
- "grad_params": {
- "names": [".*t_scale2"],
- "vals": [self.var],
- "constraints": [(".*t_scale2", constrain_positive), (".*deg_free", constrain_fixed)]
- },
- "laplace": True
- },
- "Gaussian_default": {
- "model": GPy.likelihoods.Gaussian(variance=self.var),
- "grad_params": {
- "names": [".*variance"],
- "vals": [self.var],
- "constraints": [(".*variance", constrain_positive)]
- },
- "laplace": True,
- "ep": False # FIXME: Should be True when we have it working again
- },
- #"Gaussian_log": {
- #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log(), variance=self.var, D=self.D, N=self.N),
- #"grad_params": {
- #"names": ["noise_model_variance"],
- #"vals": [self.var],
- #"constraints": [constrain_positive]
- #},
- #"laplace": True
- #},
- #"Gaussian_probit": {
- #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N),
- #"grad_params": {
- #"names": ["noise_model_variance"],
- #"vals": [self.var],
- #"constraints": [constrain_positive]
- #},
- #"laplace": True
- #},
- #"Gaussian_log_ex": {
- #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
- #"grad_params": {
- #"names": ["noise_model_variance"],
- #"vals": [self.var],
- #"constraints": [constrain_positive]
- #},
- #"laplace": True
- #},
- "Bernoulli_default": {
- "model": GPy.likelihoods.Bernoulli(),
- "link_f_constraints": [partial(constrain_bounded, lower=0, upper=1)],
- "laplace": True,
- "Y": self.binary_Y,
- "ep": False # FIXME: Should be True when we have it working again
- },
- "Exponential_default": {
- "model": GPy.likelihoods.Exponential(),
- "link_f_constraints": [constrain_positive],
- "Y": self.positive_Y,
- "laplace": True,
- },
- "Poisson_default": {
- "model": GPy.likelihoods.Poisson(),
- "link_f_constraints": [constrain_positive],
- "Y": self.integer_Y,
- "laplace": True,
- "ep": False #Should work though...
- }#,
- #GAMMA needs some work!"Gamma_default": {
- #"model": GPy.likelihoods.Gamma(),
- #"link_f_constraints": [constrain_positive],
- #"Y": self.positive_Y,
- #"laplace": True
- #}
- }
+ self.noise_models = {"Student_t_default": {
+ "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
+ "grad_params": {
+ "names": [".*t_scale2"],
+ "vals": [self.var],
+ "constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+ },
+ "laplace": True
+ },
+ "Student_t_1_var": {
+ "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
+ "grad_params": {
+ "names": [".*t_scale2"],
+ "vals": [1.0],
+ "constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+ },
+ "laplace": True
+ },
+ "Student_t_small_deg_free": {
+ "model": GPy.likelihoods.StudentT(deg_free=1.5, sigma2=self.var),
+ "grad_params": {
+ "names": [".*t_scale2"],
+ "vals": [self.var],
+ "constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+ },
+ "laplace": True
+ },
+ "Student_t_small_var": {
+ "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
+ "grad_params": {
+ "names": [".*t_scale2"],
+ "vals": [0.001],
+ "constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+ },
+ "laplace": True
+ },
+ "Student_t_large_var": {
+ "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
+ "grad_params": {
+ "names": [".*t_scale2"],
+ "vals": [10.0],
+ "constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+ },
+ "laplace": True
+ },
+ "Student_t_approx_gauss": {
+ "model": GPy.likelihoods.StudentT(deg_free=1000, sigma2=self.var),
+ "grad_params": {
+ "names": [".*t_scale2"],
+ "vals": [self.var],
+ "constraints": [(".*t_scale2", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+ },
+ "laplace": True
+ },
+ #"Student_t_log": {
+ #"model": GPy.likelihoods.StudentT(gp_link=link_functions.Log(), deg_free=5, sigma2=self.var),
+ #"grad_params": {
+ #"names": [".*t_noise"],
+ #"vals": [self.var],
+ #"constraints": [(".*t_noise", self.constrain_positive), (".*deg_free", self.constrain_fixed)]
+ #},
+ #"laplace": True
+ #},
+ "Gaussian_default": {
+ "model": GPy.likelihoods.Gaussian(variance=self.var),
+ "grad_params": {
+ "names": [".*variance"],
+ "vals": [self.var],
+ "constraints": [(".*variance", self.constrain_positive)]
+ },
+ "laplace": True,
+ "ep": False # FIXME: Should be True when we have it working again
+ },
+ "Gaussian_log": {
+ "model": GPy.likelihoods.Gaussian(gp_link=link_functions.Log(), variance=self.var),
+ "grad_params": {
+ "names": [".*variance"],
+ "vals": [self.var],
+ "constraints": [(".*variance", self.constrain_positive)]
+ },
+ "laplace": True
+ },
+ #"Gaussian_probit": {
+ #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N),
+ #"grad_params": {
+ #"names": ["noise_model_variance"],
+ #"vals": [self.var],
+ #"constraints": [constrain_positive]
+ #},
+ #"laplace": True
+ #},
+ #"Gaussian_log_ex": {
+ #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
+ #"grad_params": {
+ #"names": ["noise_model_variance"],
+ #"vals": [self.var],
+ #"constraints": [constrain_positive]
+ #},
+ #"laplace": True
+ #},
+ "Bernoulli_default": {
+ "model": GPy.likelihoods.Bernoulli(),
+ "link_f_constraints": [partial(self.constrain_bounded, lower=0, upper=1)],
+ "laplace": True,
+ "Y": self.binary_Y,
+ "ep": False # FIXME: Should be True when we have it working again
+ },
+ "Exponential_default": {
+ "model": GPy.likelihoods.Exponential(),
+ "link_f_constraints": [self.constrain_positive],
+ "Y": self.positive_Y,
+ "laplace": True,
+ },
+ "Poisson_default": {
+ "model": GPy.likelihoods.Poisson(),
+ "link_f_constraints": [self.constrain_positive],
+ "Y": self.integer_Y,
+ "laplace": True,
+ "ep": False #Should work though...
+ },
+ #,
+ #GAMMA needs some work!"Gamma_default": {
+ #"model": GPy.likelihoods.Gamma(),
+ #"link_f_constraints": [constrain_positive],
+ #"Y": self.positive_Y,
+ #"laplace": True
+ #}
+ }
- for name, attributes in noise_models.iteritems():
+
+ ####################################################
+ # Constraint wrappers so we can just list them off #
+ ####################################################
+ def constrain_fixed(self, regex, model):
+ model[regex].constrain_fixed()
+
+ def constrain_negative(self, regex, model):
+ model[regex].constrain_negative()
+
+ def constrain_positive(self, regex, model):
+ model[regex].constrain_positive()
+
+ def constrain_fixed_below(self, regex, model, up_to):
+ model[regex][0:up_to].constrain_fixed()
+
+ def constrain_fixed_above(self, regex, model, above):
+ model[regex][above:].constrain_fixed()
+
+ def constrain_bounded(self, regex, model, lower, upper):
+ """
+ Used like: partial(constrain_bounded, lower=0, upper=1)
+ """
+ model[regex].constrain_bounded(lower, upper)
+
+
+ def tearDown(self):
+ self.Y = None
+ self.f = None
+ self.X = None
+
+ def test_scale2_models(self):
+ self.setUp()
+
+ for name, attributes in self.noise_models.items():
model = attributes["model"]
if "grad_params" in attributes:
params = attributes["grad_params"]
@@ -290,7 +309,7 @@ class TestNoiseModels(object):
param_vals = []
param_names = []
constrain_positive = []
- param_constraints = [] # ??? TODO: Saul to Fix.
+ param_constraints = []
if "link_f_constraints" in attributes:
link_f_constraints = attributes["link_f_constraints"]
else:
@@ -303,6 +322,10 @@ class TestNoiseModels(object):
f = attributes["f"].copy()
else:
f = self.f.copy()
+ if "Y_metadata" in attributes:
+ Y_metadata = attributes["Y_metadata"].copy()
+ else:
+ Y_metadata = None
if "laplace" in attributes:
laplace = attributes["laplace"]
else:
@@ -317,30 +340,30 @@ class TestNoiseModels(object):
#Required by all
#Normal derivatives
- yield self.t_logpdf, model, Y, f
- yield self.t_dlogpdf_df, model, Y, f
- yield self.t_d2logpdf_df2, model, Y, f
+ yield self.t_logpdf, model, Y, f, Y_metadata
+ yield self.t_dlogpdf_df, model, Y, f, Y_metadata
+ yield self.t_d2logpdf_df2, model, Y, f, Y_metadata
#Link derivatives
- yield self.t_dlogpdf_dlink, model, Y, f, link_f_constraints
- yield self.t_d2logpdf_dlink2, model, Y, f, link_f_constraints
+ yield self.t_dlogpdf_dlink, model, Y, f, Y_metadata, link_f_constraints
+ yield self.t_d2logpdf_dlink2, model, Y, f, Y_metadata, link_f_constraints
if laplace:
#Laplace only derivatives
- yield self.t_d3logpdf_df3, model, Y, f
- yield self.t_d3logpdf_dlink3, model, Y, f, link_f_constraints
+ yield self.t_d3logpdf_df3, model, Y, f, Y_metadata
+ yield self.t_d3logpdf_dlink3, model, Y, f, Y_metadata, link_f_constraints
#Params
- yield self.t_dlogpdf_dparams, model, Y, f, param_vals, param_names, param_constraints
- yield self.t_dlogpdf_df_dparams, model, Y, f, param_vals, param_names, param_constraints
- yield self.t_d2logpdf2_df2_dparams, model, Y, f, param_vals, param_names, param_constraints
+ yield self.t_dlogpdf_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
+ yield self.t_dlogpdf_df_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
+ yield self.t_d2logpdf2_df2_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
#Link params
- yield self.t_dlogpdf_link_dparams, model, Y, f, param_vals, param_names, param_constraints
- yield self.t_dlogpdf_dlink_dparams, model, Y, f, param_vals, param_names, param_constraints
- yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, param_vals, param_names, param_constraints
+ yield self.t_dlogpdf_link_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
+ yield self.t_dlogpdf_dlink_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
+ yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, Y_metadata, param_vals, param_names, param_constraints
#laplace likelihood gradcheck
- yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
+ yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, Y_metadata, self.step, param_vals, param_names, param_constraints
if ep:
#ep likelihood gradcheck
- yield self.t_ep_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
+ yield self.t_ep_fit_rbf_white, model, self.X, Y, f, Y_metadata, self.step, param_vals, param_names, param_constraints
self.tearDown()
@@ -349,76 +372,76 @@ class TestNoiseModels(object):
# dpdf_df's #
#############
@with_setup(setUp, tearDown)
- def t_logpdf(self, model, Y, f):
- print "\n{}".format(inspect.stack()[0][3])
- print model
+ def t_logpdf(self, model, Y, f, Y_metadata):
+ print("\n{}".format(inspect.stack()[0][3]))
+ print(model)
#print model._get_params()
np.testing.assert_almost_equal(
- model.pdf(f.copy(), Y.copy()).prod(),
- np.exp(model.logpdf(f.copy(), Y.copy()).sum())
+ model.pdf(f.copy(), Y.copy(), Y_metadata=Y_metadata).prod(),
+ np.exp(model.logpdf(f.copy(), Y.copy(), Y_metadata=Y_metadata).sum())
)
@with_setup(setUp, tearDown)
- def t_dlogpdf_df(self, model, Y, f):
- print "\n{}".format(inspect.stack()[0][3])
+ def t_dlogpdf_df(self, model, Y, f, Y_metadata):
+ print("\n{}".format(inspect.stack()[0][3]))
self.description = "\n{}".format(inspect.stack()[0][3])
- logpdf = functools.partial(model.logpdf, y=Y)
- dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
+ logpdf = lambda f: np.sum(model.logpdf(f, y=Y, Y_metadata=Y_metadata))
+ dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y, Y_metadata=Y_metadata)
grad = GradientChecker(logpdf, dlogpdf_df, f.copy(), 'g')
grad.randomize()
- print model
+ print(model)
assert grad.checkgrad(verbose=1)
@with_setup(setUp, tearDown)
- def t_d2logpdf_df2(self, model, Y, f):
- print "\n{}".format(inspect.stack()[0][3])
- dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
- d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
+ def t_d2logpdf_df2(self, model, Y, f, Y_metadata):
+ print("\n{}".format(inspect.stack()[0][3]))
+ dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y, Y_metadata=Y_metadata)
+ d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y, Y_metadata=Y_metadata)
grad = GradientChecker(dlogpdf_df, d2logpdf_df2, f.copy(), 'g')
grad.randomize()
- print model
+ print(model)
assert grad.checkgrad(verbose=1)
@with_setup(setUp, tearDown)
- def t_d3logpdf_df3(self, model, Y, f):
- print "\n{}".format(inspect.stack()[0][3])
- d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
- d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=Y)
+ def t_d3logpdf_df3(self, model, Y, f, Y_metadata):
+ print("\n{}".format(inspect.stack()[0][3]))
+ d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y, Y_metadata=Y_metadata)
+ d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=Y, Y_metadata=Y_metadata)
grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, f.copy(), 'g')
grad.randomize()
- print model
+ print(model)
assert grad.checkgrad(verbose=1)
##############
# df_dparams #
##############
@with_setup(setUp, tearDown)
- def t_dlogpdf_dparams(self, model, Y, f, params, params_names, param_constraints):
- print "\n{}".format(inspect.stack()[0][3])
- print model
+ def t_dlogpdf_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints):
+ print("\n{}".format(inspect.stack()[0][3]))
+ print(model)
assert (
dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
- params, params_names, args=(f, Y), constraints=param_constraints,
+ params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints,
randomize=False, verbose=True)
)
@with_setup(setUp, tearDown)
- def t_dlogpdf_df_dparams(self, model, Y, f, params, params_names, param_constraints):
- print "\n{}".format(inspect.stack()[0][3])
- print model
+ def t_dlogpdf_df_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints):
+ print("\n{}".format(inspect.stack()[0][3]))
+ print(model)
assert (
dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
- params, params_names, args=(f, Y), constraints=param_constraints,
+ params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints,
randomize=False, verbose=True)
)
@with_setup(setUp, tearDown)
- def t_d2logpdf2_df2_dparams(self, model, Y, f, params, params_names, param_constraints):
- print "\n{}".format(inspect.stack()[0][3])
- print model
+ def t_d2logpdf2_df2_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints):
+ print("\n{}".format(inspect.stack()[0][3]))
+ print(model)
assert (
dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
- params, params_names, args=(f, Y), constraints=param_constraints,
+ params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints,
randomize=False, verbose=True)
)
@@ -426,10 +449,10 @@ class TestNoiseModels(object):
# dpdf_dlink's #
################
@with_setup(setUp, tearDown)
- def t_dlogpdf_dlink(self, model, Y, f, link_f_constraints):
- print "\n{}".format(inspect.stack()[0][3])
- logpdf = functools.partial(model.logpdf_link, y=Y)
- dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
+ def t_dlogpdf_dlink(self, model, Y, f, Y_metadata, link_f_constraints):
+ print("\n{}".format(inspect.stack()[0][3]))
+ logpdf = functools.partial(model.logpdf_link, y=Y, Y_metadata=Y_metadata)
+ dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y, Y_metadata=Y_metadata)
grad = GradientChecker(logpdf, dlogpdf_dlink, f.copy(), 'g')
#Apply constraints to link_f values
@@ -437,15 +460,15 @@ class TestNoiseModels(object):
constraint('g', grad)
grad.randomize()
- print grad
- print model
+ print(grad)
+ print(model)
assert grad.checkgrad(verbose=1)
@with_setup(setUp, tearDown)
- def t_d2logpdf_dlink2(self, model, Y, f, link_f_constraints):
- print "\n{}".format(inspect.stack()[0][3])
- dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
- d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
+ def t_d2logpdf_dlink2(self, model, Y, f, Y_metadata, link_f_constraints):
+ print("\n{}".format(inspect.stack()[0][3]))
+ dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y, Y_metadata=Y_metadata)
+ d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y, Y_metadata=Y_metadata)
grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, f.copy(), 'g')
#Apply constraints to link_f values
@@ -453,15 +476,15 @@ class TestNoiseModels(object):
constraint('g', grad)
grad.randomize()
- print grad
- print model
+ print(grad)
+ print(model)
assert grad.checkgrad(verbose=1)
@with_setup(setUp, tearDown)
- def t_d3logpdf_dlink3(self, model, Y, f, link_f_constraints):
- print "\n{}".format(inspect.stack()[0][3])
- d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
- d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=Y)
+ def t_d3logpdf_dlink3(self, model, Y, f, Y_metadata, link_f_constraints):
+ print("\n{}".format(inspect.stack()[0][3]))
+ d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y, Y_metadata=Y_metadata)
+ d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=Y, Y_metadata=Y_metadata)
grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, f.copy(), 'g')
#Apply constraints to link_f values
@@ -469,40 +492,40 @@ class TestNoiseModels(object):
constraint('g', grad)
grad.randomize()
- print grad
- print model
+ print(grad)
+ print(model)
assert grad.checkgrad(verbose=1)
#################
# dlink_dparams #
#################
@with_setup(setUp, tearDown)
- def t_dlogpdf_link_dparams(self, model, Y, f, params, param_names, param_constraints):
- print "\n{}".format(inspect.stack()[0][3])
- print model
+ def t_dlogpdf_link_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints):
+ print("\n{}".format(inspect.stack()[0][3]))
+ print(model)
assert (
dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta,
- params, param_names, args=(f, Y), constraints=param_constraints,
+ params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints,
randomize=False, verbose=True)
)
@with_setup(setUp, tearDown)
- def t_dlogpdf_dlink_dparams(self, model, Y, f, params, param_names, param_constraints):
- print "\n{}".format(inspect.stack()[0][3])
- print model
+ def t_dlogpdf_dlink_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints):
+ print("\n{}".format(inspect.stack()[0][3]))
+ print(model)
assert (
dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta,
- params, param_names, args=(f, Y), constraints=param_constraints,
+ params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints,
randomize=False, verbose=True)
)
@with_setup(setUp, tearDown)
- def t_d2logpdf2_dlink2_dparams(self, model, Y, f, params, param_names, param_constraints):
- print "\n{}".format(inspect.stack()[0][3])
- print model
+ def t_d2logpdf2_dlink2_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints):
+ print("\n{}".format(inspect.stack()[0][3]))
+ print(model)
assert (
dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta,
- params, param_names, args=(f, Y), constraints=param_constraints,
+ params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints,
randomize=False, verbose=True)
)
@@ -510,21 +533,23 @@ class TestNoiseModels(object):
# laplace test #
################
@with_setup(setUp, tearDown)
- def t_laplace_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
- print "\n{}".format(inspect.stack()[0][3])
+ def t_laplace_fit_rbf_white(self, model, X, Y, f, Y_metadata, step, param_vals, param_names, constraints):
+ print("\n{}".format(inspect.stack()[0][3]))
#Normalize
Y = Y/Y.max()
- white_var = 1e-6
+ white_var = 1e-5
kernel = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
laplace_likelihood = GPy.inference.latent_function_inference.Laplace()
- m = GPy.core.GP(X.copy(), Y.copy(), kernel, likelihood=model, inference_method=laplace_likelihood)
+
+ m = GPy.core.GP(X.copy(), Y.copy(), kernel, likelihood=model, Y_metadata=Y_metadata, inference_method=laplace_likelihood)
m['.*white'].constrain_fixed(white_var)
#Set constraints
for constrain_param, constraint in constraints:
constraint(constrain_param, m)
- print m
+ print(m)
m.randomize()
#Set params
@@ -533,7 +558,7 @@ class TestNoiseModels(object):
m[name] = param_vals[param_num]
#m.optimize(max_iters=8)
- print m
+ print(m)
#if not m.checkgrad(step=step):
#m.checkgrad(verbose=1, step=step)
#NOTE this test appears to be stochastic for some likelihoods (student t?)
@@ -545,14 +570,15 @@ class TestNoiseModels(object):
# EP test #
###########
@with_setup(setUp, tearDown)
- def t_ep_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
- print "\n{}".format(inspect.stack()[0][3])
+ def t_ep_fit_rbf_white(self, model, X, Y, f, Y_metadata, step, param_vals, param_names, constraints):
+ print("\n{}".format(inspect.stack()[0][3]))
#Normalize
Y = Y/Y.max()
white_var = 1e-6
kernel = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
ep_inf = GPy.inference.latent_function_inference.EP()
- m = GPy.core.GP(X.copy(), Y.copy(), kernel=kernel, likelihood=model, inference_method=ep_inf)
+
+ m = GPy.core.GP(X.copy(), Y.copy(), kernel=kernel, likelihood=model, Y_metadata=Y_metadata, inference_method=ep_inf)
m['.*white'].constrain_fixed(white_var)
for param_num in range(len(param_names)):
@@ -561,7 +587,7 @@ class TestNoiseModels(object):
constraints[param_num](name, m)
m.randomize()
- print m
+ print(m)
assert m.checkgrad(verbose=1, step=step)
@@ -571,8 +597,8 @@ class LaplaceTests(unittest.TestCase):
"""
def setUp(self):
- self.N = 5
- self.D = 3
+ self.N = 15
+ self.D = 1
self.X = np.random.rand(self.N, self.D)*10
self.real_std = 0.1
@@ -598,7 +624,7 @@ class LaplaceTests(unittest.TestCase):
self.X = None
def test_gaussian_d2logpdf_df2_2(self):
- print "\n{}".format(inspect.stack()[0][3])
+ print("\n{}".format(inspect.stack()[0][3]))
self.Y = None
self.N = 2
@@ -636,28 +662,28 @@ class LaplaceTests(unittest.TestCase):
exact_inf = GPy.inference.latent_function_inference.ExactGaussianInference()
m1 = GPy.core.GP(X, Y.copy(), kernel=kernel1, likelihood=gauss_distr1, inference_method=exact_inf)
m1['.*white'].constrain_fixed(1e-6)
- m1['.*rbf.variance'] = initial_var_guess
- m1['.*rbf.variance'].constrain_bounded(1e-4, 10)
+ m1['.*Gaussian_noise.variance'].constrain_bounded(1e-4, 10)
m1.randomize()
gauss_distr2 = GPy.likelihoods.Gaussian(variance=initial_var_guess)
laplace_inf = GPy.inference.latent_function_inference.Laplace()
m2 = GPy.core.GP(X, Y.copy(), kernel=kernel2, likelihood=gauss_distr2, inference_method=laplace_inf)
m2['.*white'].constrain_fixed(1e-6)
- m2['.*rbf.variance'].constrain_bounded(1e-4, 10)
+ m2['.*Gaussian_noise.variance'].constrain_bounded(1e-4, 10)
m2.randomize()
if debug:
- print m1
- print m2
+ print(m1)
+ print(m2)
+
optimizer = 'scg'
- print "Gaussian"
- m1.optimize(optimizer, messages=debug)
- print "Laplace Gaussian"
- m2.optimize(optimizer, messages=debug)
+ print("Gaussian")
+ m1.optimize(optimizer, messages=debug, ipython_notebook=False)
+ print("Laplace Gaussian")
+ m2.optimize(optimizer, messages=debug, ipython_notebook=False)
if debug:
- print m1
- print m2
+ print(m1)
+ print(m2)
m2[:] = m1[:]
@@ -687,8 +713,6 @@ class LaplaceTests(unittest.TestCase):
pb.scatter(X, m1.likelihood.Y, c='g')
pb.scatter(X, m2.likelihood.Y, c='r', marker='x')
-
-
#Check Y's are the same
np.testing.assert_almost_equal(m1.Y, m2.Y, decimal=5)
#Check marginals are the same
@@ -706,5 +730,5 @@ class LaplaceTests(unittest.TestCase):
self.assertTrue(m2.checkgrad(verbose=True))
if __name__ == "__main__":
- print "Running unit tests"
+ print("Running unit tests")
unittest.main()
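A recurring pattern in the rewritten likelihood tests is freezing y and the new Y_metadata keyword with functools.partial so the gradient checker only varies f. A self-contained sketch with a stand-in logpdf (the real signature lives on the GPy likelihood objects):

    import functools
    import numpy as np

    def logpdf(f, y, Y_metadata=None):          # stand-in for model.logpdf
        return -0.5 * (f - y) ** 2

    partial_logpdf = functools.partial(logpdf, y=np.ones((3, 1)), Y_metadata=None)
    print(partial_logpdf(np.zeros((3, 1))))     # only the free argument f remains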
diff --git a/GPy/testing/link_function_tests.py b/GPy/testing/link_function_tests.py
new file mode 100644
index 00000000..fb8fba99
--- /dev/null
+++ b/GPy/testing/link_function_tests.py
@@ -0,0 +1,143 @@
+import numpy as np
+import scipy as sp
+from scipy.special import cbrt
+from GPy.models import GradientChecker
+_lim_val = np.finfo(np.float64).max
+_lim_val_exp = np.log(_lim_val)
+_lim_val_square = np.sqrt(_lim_val)
+_lim_val_cube = cbrt(_lim_val)
+from GPy.likelihoods.link_functions import Identity, Probit, Cloglog, Log, Log_ex_1, Reciprocal, Heaviside
+
+class LinkFunctionTests(np.testing.TestCase):
+ def setUp(self):
+ self.small_f = np.array([[-1e-4]])
+ self.zero_f = np.array([[1e-4]])
+ self.mid_f = np.array([[5.0]])
+ self.large_f = np.array([[1e4]])
+ self.f_lower_lim = np.array(-np.inf)
+ self.f_upper_lim = np.array(np.inf)
+
+ def check_gradient(self, link_func, lim_of_inf, test_lim=False):
+ grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.mid_f)
+ self.assertTrue(grad.checkgrad(verbose=True))
+ grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=self.mid_f)
+ self.assertTrue(grad2.checkgrad(verbose=True))
+ grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=self.mid_f)
+ self.assertTrue(grad3.checkgrad(verbose=True))
+
+ grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.small_f)
+ self.assertTrue(grad.checkgrad(verbose=True))
+ grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=self.small_f)
+ self.assertTrue(grad2.checkgrad(verbose=True))
+ grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=self.small_f)
+ self.assertTrue(grad3.checkgrad(verbose=True))
+
+ grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=self.zero_f)
+ self.assertTrue(grad.checkgrad(verbose=True))
+ grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=self.zero_f)
+ self.assertTrue(grad2.checkgrad(verbose=True))
+ grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=self.zero_f)
+ self.assertTrue(grad3.checkgrad(verbose=True))
+
+ #Do a limit test if the large f value is too large
+ large_f = np.clip(self.large_f, -np.inf, lim_of_inf-1e-3)
+ grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=large_f)
+ self.assertTrue(grad.checkgrad(verbose=True))
+ grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=large_f)
+ self.assertTrue(grad2.checkgrad(verbose=True))
+ grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=large_f)
+ self.assertTrue(grad3.checkgrad(verbose=True))
+
+ if test_lim:
+ print "Testing limits"
+ #Remove some otherwise we are too close to the limit for gradcheck to work effectively
+ lim_of_inf = lim_of_inf - 1e-4
+ grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=lim_of_inf)
+ self.assertTrue(grad.checkgrad(verbose=True))
+ grad2 = GradientChecker(link_func.dtransf_df, link_func.d2transf_df2, x0=lim_of_inf)
+ self.assertTrue(grad2.checkgrad(verbose=True))
+ grad3 = GradientChecker(link_func.d2transf_df2, link_func.d3transf_df3, x0=lim_of_inf)
+ self.assertTrue(grad3.checkgrad(verbose=True))
+
+ def check_overflow(self, link_func, lim_of_inf):
+ #Check that it does something sensible beyond this limit,
+ #note this is not checking the value is correct, just that it isn't nan
+ beyond_lim_of_inf = lim_of_inf + 100.0
+ self.assertFalse(np.isinf(link_func.transf(beyond_lim_of_inf)))
+ self.assertFalse(np.isinf(link_func.dtransf_df(beyond_lim_of_inf)))
+ self.assertFalse(np.isinf(link_func.d2transf_df2(beyond_lim_of_inf)))
+
+ self.assertFalse(np.isnan(link_func.transf(beyond_lim_of_inf)))
+ self.assertFalse(np.isnan(link_func.dtransf_df(beyond_lim_of_inf)))
+ self.assertFalse(np.isnan(link_func.d2transf_df2(beyond_lim_of_inf)))
+
+ def test_log_overflow(self):
+ link = Log()
+ lim_of_inf = _lim_val_exp
+
+ np.testing.assert_almost_equal(np.exp(self.mid_f), link.transf(self.mid_f))
+ assert np.isinf(np.exp(np.log(self.f_upper_lim)))
+ #Check the clipping works
+ np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
+ #Need to look at most significant figures here rather than the decimals
+ np.testing.assert_approx_equal(link.transf(self.f_upper_lim), _lim_val, significant=5)
+ self.check_overflow(link, lim_of_inf)
+
+ #Check that it would otherwise fail
+ beyond_lim_of_inf = lim_of_inf + 10.0
+ old_err_state = np.seterr(over='ignore')
+ self.assertTrue(np.isinf(np.exp(beyond_lim_of_inf)))
+ np.seterr(**old_err_state)
+
+ def test_log_ex_1_overflow(self):
+ link = Log_ex_1()
+ lim_of_inf = _lim_val_exp
+
+ np.testing.assert_almost_equal(np.log1p(np.exp(self.mid_f)), link.transf(self.mid_f))
+ assert np.isinf(np.log1p(np.exp(np.log(self.f_upper_lim))))
+ #Check the clipping works
+ np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
+ #Need to look at most significant figures here rather than the decimals
+ np.testing.assert_approx_equal(link.transf(self.f_upper_lim), np.log1p(_lim_val), significant=5)
+ self.check_overflow(link, lim_of_inf)
+
+ #Check that it would otherwise fail
+ beyond_lim_of_inf = lim_of_inf + 10.0
+ old_err_state = np.seterr(over='ignore')
+ self.assertTrue(np.isinf(np.log1p(np.exp(beyond_lim_of_inf))))
+ np.seterr(**old_err_state)
+
+
+ def test_log_gradients(self):
+ # transf dtransf_df d2transf_df2 d3transf_df3
+ link = Log()
+ lim_of_inf = _lim_val_exp
+ self.check_gradient(link, lim_of_inf, test_lim=True)
+
+ def test_identity_gradients(self):
+ link = Identity()
+ lim_of_inf = _lim_val
+ #FIXME: Should be able to think of a way to test the limits of this
+ self.check_gradient(link, lim_of_inf, test_lim=False)
+
+ def test_probit_gradients(self):
+ link = Probit()
+ lim_of_inf = _lim_val
+ self.check_gradient(link, lim_of_inf, test_lim=True)
+
+ def test_Cloglog_gradients(self):
+ link = Cloglog()
+ lim_of_inf = _lim_val_exp
+ self.check_gradient(link, lim_of_inf, test_lim=True)
+
+ def test_Log_ex_1_gradients(self):
+ link = Log_ex_1()
+ lim_of_inf = _lim_val_exp
+ self.check_gradient(link, lim_of_inf, test_lim=True)
+ self.check_overflow(link, lim_of_inf)
+
+ def test_reciprocal_gradients(self):
+ link = Reciprocal()
+ lim_of_inf = _lim_val
+ #Does not work with much smaller values, and values closer to zero than 1e-5
+ self.check_gradient(link, lim_of_inf, test_lim=True)
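The overflow tests above deliberately provoke np.exp overflow while silencing the warning, then restore whatever error state was active before. The save/restore idiom in isolation (wrapping the restore in try/finally, which the tests could also do, guards against a failing assertion leaving the global state changed):

    import numpy as np

    old_err_state = np.seterr(over='ignore')  # returns the previous settings
    try:
        assert np.isinf(np.exp(1000.0))       # overflows to inf, no warning shown
    finally:
        np.seterr(**old_err_state)            # restore the prior behaviour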
diff --git a/GPy/testing/mapping_tests.py b/GPy/testing/mapping_tests.py
new file mode 100644
index 00000000..2ff0e2d8
--- /dev/null
+++ b/GPy/testing/mapping_tests.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import unittest
+import numpy as np
+import GPy
+
+class MappingGradChecker(GPy.core.Model):
+ """
+ This class has everything we need to check the gradient of a mapping. It
+ implements a simple likelihood which is a weighted sum of the outputs of the
+ mapping. The gradients are checked against the parameters of the mapping
+ and the input.
+ """
+ def __init__(self, mapping, X, name='map_grad_check'):
+ super(MappingGradChecker, self).__init__(name)
+ self.mapping = mapping
+ self.link_parameter(self.mapping)
+ self.X = GPy.core.Param('X',X)
+ self.link_parameter(self.X)
+ self.dL_dY = np.random.randn(self.X.shape[0], self.mapping.output_dim)
+ def log_likelihood(self):
+ return np.sum(self.mapping.f(self.X) * self.dL_dY)
+ def parameters_changed(self):
+ self.X.gradient = self.mapping.gradients_X(self.dL_dY, self.X)
+ self.mapping.update_gradients(self.dL_dY, self.X)
+
+
+class MappingTests(unittest.TestCase):
+
+ def test_kernelmapping(self):
+ X = np.random.randn(100,3)
+ Z = np.random.randn(10,3)
+ mapping = GPy.mappings.Kernel(3, 2, Z, GPy.kern.RBF(3))
+ self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+ def test_linearmapping(self):
+ mapping = GPy.mappings.Linear(3, 2)
+ X = np.random.randn(100,3)
+ self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+ def test_mlpmapping(self):
+ mapping = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
+ X = np.random.randn(100,3)
+ self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+ def test_addmapping(self):
+ m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
+ m2 = GPy.mappings.Linear(input_dim=3, output_dim=2)
+ mapping = GPy.mappings.Additive(m1, m2)
+ X = np.random.randn(100,3)
+ self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+ def test_compoundmapping(self):
+ m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
+ Z = np.random.randn(10,2)
+ m2 = GPy.mappings.Kernel(2, 4, Z, GPy.kern.RBF(2))
+ mapping = GPy.mappings.Compound(m1, m2)
+ X = np.random.randn(100,3)
+ self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+
+
+
+if __name__ == "__main__":
+ print("Running unit tests, please be (very) patient...")
+ unittest.main()
diff --git a/GPy/testing/meanfunc_tests.py b/GPy/testing/meanfunc_tests.py
new file mode 100644
index 00000000..1d875377
--- /dev/null
+++ b/GPy/testing/meanfunc_tests.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2015, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import unittest
+import numpy as np
+import GPy
+
+class MFtests(unittest.TestCase):
+ def test_simple_mean_function(self):
+ """
+ The simplest possible mean function. No parameters, just a simple Sinusoid.
+ """
+ #create simple mean function
+ mf = GPy.core.Mapping(1,1)
+ mf.f = np.sin
+ mf.update_gradients = lambda a,b: None
+
+ X = np.linspace(0,10,50).reshape(-1,1)
+ Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
+
+ k = GPy.kern.RBF(1)
+ lik = GPy.likelihoods.Gaussian()
+ m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+ self.assertTrue(m.checkgrad())
+
+ def test_parametric_mean_function(self):
+ """
+ A linear mean function with parameters that we'll learn alongside the kernel
+ """
+
+ X = np.linspace(0,10,50).reshape(-1,1)
+ Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X
+
+ mf = GPy.mappings.Linear(1,1)
+
+ k = GPy.kern.RBF(1)
+ lik = GPy.likelihoods.Gaussian()
+ m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+ self.assertTrue(m.checkgrad())
+
+ def test_svgp_mean_function(self):
+
+ # an instance of the SVGP with a mean function
+ X = np.linspace(0,10,500).reshape(-1,1)
+ Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
+ Y = np.where(Y>0, 1,0) # make aclassificatino problem
+
+ mf = GPy.mappings.Linear(1,1)
+ Z = np.linspace(0,10,50).reshape(-1,1)
+ lik = GPy.likelihoods.Bernoulli()
+ k = GPy.kern.RBF(1) + GPy.kern.White(1, 1e-4)
+ m = GPy.core.SVGP(X, Y, Z=Z, kernel=k, likelihood=lik, mean_function=mf)
+ self.assertTrue(m.checkgrad())
+
+
+
diff --git a/GPy/testing/misc_tests.py b/GPy/testing/misc_tests.py
new file mode 100644
index 00000000..e620fa7e
--- /dev/null
+++ b/GPy/testing/misc_tests.py
@@ -0,0 +1,18 @@
+import numpy as np
+import scipy as sp
+import GPy
+
+class MiscTests(np.testing.TestCase):
+ """
+ Testing some utilities of misc
+ """
+ def setUp(self):
+ self._lim_val = np.finfo(np.float64).max
+ self._lim_val_exp = np.log(self._lim_val)
+
+ def test_safe_exp_upper(self):
+ assert np.exp(self._lim_val_exp + 1) == np.inf
+ assert GPy.util.misc.safe_exp(self._lim_val_exp + 1) < np.inf
+
+ def test_safe_exp_lower(self):
+ assert GPy.util.misc.safe_exp(1e-10) < np.inf
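The new misc tests only pin down safe_exp's contract: it must stay finite where a bare np.exp overflows. A sketch of one way to satisfy that contract by clipping the argument (an illustrative assumption, not necessarily GPy's actual implementation):

    import numpy as np

    _lim_val = np.finfo(np.float64).max
    _lim_val_exp = np.log(_lim_val)

    def safe_exp_sketch(f):
        # Clip the exponent so np.exp can never overflow to inf.
        return np.exp(np.clip(f, -np.inf, _lim_val_exp))

    assert safe_exp_sketch(_lim_val_exp + 1) < np.inf
    assert safe_exp_sketch(1e-10) < np.inf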
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 559014f7..ce78ee88 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -153,19 +153,19 @@ class MiscTests(unittest.TestCase):
def test_big_model(self):
m = GPy.examples.dimensionality_reduction.mrd_simulation(optimize=0, plot=0, plot_sim=0)
m.X.fix()
- print m
+ print(m)
m.unfix()
m.checkgrad()
- print m
+ print(m)
m.fix()
- print m
+ print(m)
m.inducing_inputs.unfix()
- print m
+ print(m)
m.checkgrad()
m.unfix()
m.checkgrad()
m.checkgrad()
- print m
+ print(m)
def test_model_set_params(self):
m = GPy.models.GPRegression(self.X, self.Y)
@@ -176,7 +176,7 @@ class MiscTests(unittest.TestCase):
m['.*var'] -= .1
np.testing.assert_equal(m.kern.lengthscale, lengthscale)
m.optimize()
- print m
+ print(m)
def test_model_updates(self):
Y1 = np.random.normal(0, 1, (40, 13))
@@ -201,7 +201,7 @@ class MiscTests(unittest.TestCase):
Y = np.sin(X) + np.random.randn(20, 1) * 0.05
m = GPy.models.GPRegression(X, Y)
m.optimize()
- print m
+ print(m)
class GradientTests(np.testing.TestCase):
def setUp(self):
@@ -476,7 +476,7 @@ class GradientTests(np.testing.TestCase):
likelihood = GPy.likelihoods.MixedNoise(likelihoods_list=likelihoods_list)
m = GPy.core.SparseGP(X, Y, X[np.random.choice(num_obs, 10)],
kern, likelihood,
- GPy.inference.latent_function_inference.VarDTC(),
+ inference_method=GPy.inference.latent_function_inference.VarDTC(),
Y_metadata=Y_metadata)
self.assertTrue(m.checkgrad())
@@ -523,5 +523,5 @@ class GradientTests(np.testing.TestCase):
if __name__ == "__main__":
- print "Running unit tests, please be (very) patient..."
+ print("Running unit tests, please be (very) patient...")
unittest.main()
diff --git a/GPy/testing/mpi_tests.py b/GPy/testing/mpi_tests.py
index 5c489032..28a23288 100644
--- a/GPy/testing/mpi_tests.py
+++ b/GPy/testing/mpi_tests.py
@@ -84,7 +84,7 @@ except:
if __name__ == "__main__":
- print "Running unit tests, please be (very) patient..."
+ print("Running unit tests, please be (very) patient...")
try:
import mpi4py
unittest.main()
diff --git a/GPy/testing/parameterized_tests.py b/GPy/testing/parameterized_tests.py
index 7c4f4ce2..0fb129ff 100644
--- a/GPy/testing/parameterized_tests.py
+++ b/GPy/testing/parameterized_tests.py
@@ -12,6 +12,7 @@ from GPy.core.parameterization.transformations import NegativeLogexp, Logistic
from GPy.core.parameterization.parameterized import Parameterized
from GPy.core.parameterization.param import Param
from GPy.core.parameterization.index_operations import ParameterIndexOperations
+from functools import reduce
class ArrayCoreTest(unittest.TestCase):
def setUp(self):
@@ -107,7 +108,7 @@ class ParameterizedTest(unittest.TestCase):
self.assertListEqual(self.white._fixes_.tolist(), [FIXED])
self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops)
self.assertIs(self.test1.constraints, self.param.constraints._param_index_ops)
- self.assertListEqual(self.test1.constraints[Logexp()].tolist(), range(self.param.size, self.param.size+self.rbf.size))
+ self.assertListEqual(self.test1.constraints[Logexp()].tolist(), list(range(self.param.size, self.param.size+self.rbf.size)))
def test_remove_parameter_param_array_grad_array(self):
val = self.test1.kern.param_array.copy()
@@ -120,15 +121,15 @@ class ParameterizedTest(unittest.TestCase):
def test_default_constraints(self):
self.assertIs(self.rbf.variance.constraints._param_index_ops, self.rbf.constraints._param_index_ops)
self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops)
- self.assertListEqual(self.rbf.constraints.indices()[0].tolist(), range(2))
+ self.assertListEqual(self.rbf.constraints.indices()[0].tolist(), list(range(2)))
from GPy.core.parameterization.transformations import Logexp
kern = self.test1.kern
self.test1.unlink_parameter(kern)
- self.assertListEqual(kern.constraints[Logexp()].tolist(), range(3))
+ self.assertListEqual(kern.constraints[Logexp()].tolist(), list(range(3)))
def test_constraints(self):
self.rbf.constrain(GPy.transformations.Square(), False)
- self.assertListEqual(self.test1.constraints[GPy.transformations.Square()].tolist(), range(self.param.size, self.param.size+self.rbf.size))
+ self.assertListEqual(self.test1.constraints[GPy.transformations.Square()].tolist(), list(range(self.param.size, self.param.size+self.rbf.size)))
self.assertListEqual(self.test1.constraints[GPy.transformations.Logexp()].tolist(), [self.param.size+self.rbf.size])
self.test1.kern.unlink_parameter(self.rbf)
@@ -181,8 +182,8 @@ class ParameterizedTest(unittest.TestCase):
def test_add_parameter_in_hierarchy(self):
self.test1.kern.rbf.link_parameter(Param("NEW", np.random.rand(2), NegativeLogexp()), 1)
- self.assertListEqual(self.test1.constraints[NegativeLogexp()].tolist(), range(self.param.size+1, self.param.size+1 + 2))
- self.assertListEqual(self.test1.constraints[GPy.transformations.Logistic(0,1)].tolist(), range(self.param.size))
+ self.assertListEqual(self.test1.constraints[NegativeLogexp()].tolist(), list(range(self.param.size+1, self.param.size+1 + 2)))
+ self.assertListEqual(self.test1.constraints[GPy.transformations.Logistic(0,1)].tolist(), list(range(self.param.size)))
self.assertListEqual(self.test1.constraints[GPy.transformations.Logexp(0,1)].tolist(), np.r_[50, 53:55].tolist())
def test_regular_expression_misc(self):
@@ -240,7 +241,7 @@ class ParameterizedTest(unittest.TestCase):
self.p2.constrain_positive()
m = TestLikelihood()
- print m
+ print(m)
val = m.p1.values.copy()
self.assert_(m.p1.is_fixed)
self.assert_(m.constraints[GPy.constraints.Logexp()].tolist(), [1])
@@ -248,9 +249,9 @@ class ParameterizedTest(unittest.TestCase):
self.assertEqual(m.p1, val)
def test_printing(self):
- print self.test1
- print self.param
- print self.test1['']
+ print(self.test1)
+ print(self.param)
+ print(self.test1[''])
if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.test_add_parameter']
diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py
index c79e9914..fd1bf93c 100644
--- a/GPy/testing/pickle_tests.py
+++ b/GPy/testing/pickle_tests.py
@@ -19,6 +19,7 @@ from GPy.kern._src.static import Bias, White
from GPy.examples.dimensionality_reduction import mrd_simulation
from GPy.core.parameterization.variational import NormalPosterior
from GPy.models.gp_regression import GPRegression
+from functools import reduce
def toy_model():
X = np.linspace(0,1,50)[:, None]
@@ -28,18 +29,25 @@ def toy_model():
class ListDictTestCase(unittest.TestCase):
def assertListDictEquals(self, d1, d2, msg=None):
- for k,v in d1.iteritems():
+ #py3 fix
+ #for k,v in d1.iteritems():
+ for k,v in d1.items():
self.assertListEqual(list(v), list(d2[k]), msg)
def assertArrayListEquals(self, l1, l2):
- for a1, a2 in itertools.izip(l1,l2):
+ for a1, a2 in zip(l1,l2):
np.testing.assert_array_equal(a1, a2)
class Test(ListDictTestCase):
def test_parameter_index_operations(self):
pio = ParameterIndexOperations(dict(test1=np.array([4,3,1,6,4]), test2=np.r_[2:130]))
piov = ParameterIndexOperationsView(pio, 20, 250)
- self.assertListDictEquals(dict(piov.items()), dict(piov.copy().iteritems()))
- self.assertListDictEquals(dict(pio.iteritems()), dict(pio.copy().items()))
+ #py3 fix
+ #self.assertListDictEquals(dict(piov.items()), dict(piov.copy().iteritems()))
+ self.assertListDictEquals(dict(piov.items()), dict(piov.copy().items()))
+
+ #py3 fix
+ #self.assertListDictEquals(dict(pio.iteritems()), dict(pio.copy().items()))
+ self.assertListDictEquals(dict(pio.items()), dict(pio.copy().items()))
self.assertArrayListEquals(pio.copy().indices(), pio.indices())
self.assertArrayListEquals(piov.copy().indices(), piov.indices())
@@ -54,7 +62,9 @@ class Test(ListDictTestCase):
pickle.dump(piov, f)
f.seek(0)
pio2 = pickle.load(f)
- self.assertListDictEquals(dict(piov.items()), dict(pio2.iteritems()))
+ #py3 fix
+ #self.assertListDictEquals(dict(piov.items()), dict(pio2.iteritems()))
+ self.assertListDictEquals(dict(piov.items()), dict(pio2.items()))
def test_param(self):
param = Param('test', np.arange(4*2).reshape(4,2))
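For reference, the behaviour test_param exercises is a plain pickle round-trip; a minimal standalone sketch (assuming only the public Param API shown above):

    import pickle
    import numpy as np
    from GPy.core.parameterization.param import Param

    # round-trip a Param through pickle and check the values survive
    p = Param('test', np.arange(4*2).reshape(4, 2))
    p2 = pickle.loads(pickle.dumps(p))
    assert np.all(np.asarray(p) == np.asarray(p2))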
diff --git a/GPy/testing/prior_tests.py b/GPy/testing/prior_tests.py
index 6a61fbb5..ca03ad93 100644
--- a/GPy/testing/prior_tests.py
+++ b/GPy/testing/prior_tests.py
@@ -110,5 +110,5 @@ class PriorTests(unittest.TestCase):
if __name__ == "__main__":
- print "Running unit tests, please be (very) patient..."
+ print("Running unit tests, please be (very) patient...")
unittest.main()
diff --git a/GPy/testing/svgp_tests.py b/GPy/testing/svgp_tests.py
new file mode 100644
index 00000000..beb9c00d
--- /dev/null
+++ b/GPy/testing/svgp_tests.py
@@ -0,0 +1,54 @@
+import numpy as np
+import scipy as sp
+import GPy
+
+class SVGP_nonconvex(np.testing.TestCase):
+ """
+ Inference in the SVGP with a student-T likelihood
+ """
+ def setUp(self):
+ X = np.linspace(0,10,100).reshape(-1,1)
+ Z = np.linspace(0,10,10).reshape(-1,1)
+ Y = np.sin(X) + np.random.randn(*X.shape)*0.1
+ Y[50] += 3
+
+ lik = GPy.likelihoods.StudentT(deg_free=2)
+ k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
+ self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
+ def test_grad(self):
+ assert self.m.checkgrad(step=1e-4)
+
+class SVGP_classification(np.testing.TestCase):
+ """
+ Inference in the SVGP with a Bernoulli likelihood
+ """
+ def setUp(self):
+ X = np.linspace(0,10,100).reshape(-1,1)
+ Z = np.linspace(0,10,10).reshape(-1,1)
+ Y = np.where((np.sin(X) + np.random.randn(*X.shape)*0.1)>0, 1,0)
+
+ lik = GPy.likelihoods.Bernoulli()
+ k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
+ self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
+ def test_grad(self):
+ assert self.m.checkgrad(step=1e-4)
+
+class SVGP_Poisson_with_meanfunction(np.testing.TestCase):
+ """
+ Inference in the SVGP with a Poisson likelihood and a linear mean function
+ """
+ def setUp(self):
+ X = np.linspace(0,10,100).reshape(-1,1)
+ Z = np.linspace(0,10,10).reshape(-1,1)
+ latent_f = np.exp(0.1*X + 0.05*X**2)
+ Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1,1)
+
+ mf = GPy.mappings.Linear(1,1)
+
+ lik = GPy.likelihoods.Poisson()
+ k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
+ self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
+ def test_grad(self):
+ assert self.m.checkgrad(step=1e-4)
+
+
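All three test classes rely on m.checkgrad(step=1e-4), which compares the model's analytic gradients against finite differences at the given step. A self-contained sketch of that comparison on a toy objective (not GPy's internals):

    import numpy as np

    def numeric_grad(f, x, step=1e-4):
        # central finite differences, one coordinate at a time
        g = np.zeros_like(x)
        for i in range(x.size):
            e = np.zeros_like(x)
            e[i] = step
            g[i] = (f(x + e) - f(x - e)) / (2 * step)
        return g

    f = lambda x: np.sum(x**2)          # toy objective with known gradient 2x
    x = np.random.randn(3)
    assert np.allclose(numeric_grad(f, x), 2*x, atol=1e-4)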
diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index c3edfc48..e8d2456e 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -2,18 +2,18 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-import linalg
-import misc
-import squashers
-import warping_functions
-import datasets
-import mocap
-import decorators
-import classification
-import subarray_and_sorting
-import caching
-import diag
-import initialization
-import multioutput
-import linalg_gpu
+from . import linalg
+from . import misc
+from . import squashers
+from . import warping_functions
+from . import datasets
+from . import mocap
+from . import decorators
+from . import classification
+from . import subarray_and_sorting
+from . import caching
+from . import diag
+from . import initialization
+from . import multioutput
+from . import linalg_gpu
diff --git a/GPy/util/block_matrices.py b/GPy/util/block_matrices.py
index 95920868..e1e04aaa 100644
--- a/GPy/util/block_matrices.py
+++ b/GPy/util/block_matrices.py
@@ -1,9 +1,37 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2014-2015, Alan Saul
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
+def get_blocks_3d(A, blocksizes, pagesizes=None):
+ """
+ Given a 3d matrix, make a block matrix, where the first and second dimensions are blocked according
+ to blocksizes, and the pages are blocked using pagesizes
+ """
+ assert (A.shape[0]==A.shape[1]) and len(A.shape)==3, "can't blockify this non-square matrix, may need to use 2d version"
+ N = np.sum(blocksizes)
+ assert A.shape[0] == N, "bad blocksizes"
+ num_blocks = len(blocksizes)
+ if pagesizes is None:
+ #Assume each page of A should be its own dimension
+ pagesizes = range(A.shape[2])#[0]*A.shape[2]
+ num_pages = len(pagesizes)
+ B = np.empty(shape=(num_blocks, num_blocks, num_pages), dtype=np.object)
+ count_k = 0
+ #for Bk, k in enumerate(pagesizes):
+ for Bk in pagesizes:
+ count_i = 0
+ for Bi, i in enumerate(blocksizes):
+ count_j = 0
+ for Bj, j in enumerate(blocksizes):
+ #We would like to use count_k:count_k + k here, but it is annoying as it makes an NxNx1 array if page sizes are set to 1
+ B[Bi, Bj, Bk] = A[count_i:count_i + i, count_j:count_j + j, Bk]
+ count_j += j
+ count_i += i
+ #count_k += k
+ return B
+
def get_blocks(A, blocksizes):
- assert (A.shape[0]==A.shape[1]) and len(A.shape)==2, "can;t blockify this non-square matrix"
+ assert (A.shape[0]==A.shape[1]) and len(A.shape)==2, "can't blockify this non-square matrix"
N = np.sum(blocksizes)
assert A.shape[0] == N, "bad blocksizes"
num_blocks = len(blocksizes)
@@ -17,10 +45,74 @@ def get_blocks(A, blocksizes):
count_i += i
return B
+def get_block_shapes_3d(B):
+ assert B.dtype is np.dtype('object'), "Must be a block matrix"
+ #FIXME: This isn't general AT ALL...
+ return get_block_shapes(B[:,:,0]), B.shape[2]
+
+def get_block_shapes(B):
+ assert B.dtype is np.dtype('object'), "Must be a block matrix"
+ return [B[b,b].shape[0] for b in range(0, B.shape[0])]
+
+def unblock(B):
+ assert B.dtype is np.dtype('object'), "Must be a block matrix"
+ block_shapes = get_block_shapes(B)
+ num_elements = np.sum(block_shapes)
+ A = np.empty(shape=(num_elements, num_elements))
+ count_i = 0
+ for Bi, i in enumerate(block_shapes):
+ count_j = 0
+ for Bj, j in enumerate(block_shapes):
+ A[count_i:count_i + i, count_j:count_j + j] = B[Bi, Bj]
+ count_j += j
+ count_i += i
+ return A
+
+def block_dot(A, B, diagonal=False):
+ """
+ Element-wise dot product on block matrices
+
+ +------+------+ +------+------+ +-------+-------+
+ | | | | | | |A11.B11|A12.B12|
+ | A11 | A12 | | B11 | B12 | | | |
+ +------+------+ o +------+------+ = +-------+-------+
+ | | | | | | |A21.B21|A22.B22|
+ | A21 | A22 | | B21 | B22 | | | |
+ +------+------+ +------+------+ +-------+-------+
+
+ ..Note
+ If any block of either A or B is stored as a 1d vector, it is assumed to
+ denote a diagonal matrix, and a more efficient element-wise product using
+ numpy broadcasting (i.e. A11*B11) is used in place of the full dot product.
+ """
+ #Must have same number of blocks and be a block matrix
+ assert A.dtype is np.dtype('object'), "Must be a block matrix"
+ assert B.dtype is np.dtype('object'), "Must be a block matrix"
+ assert A.shape == B.shape
+ def f(C,D):
+ """
+ C is an element of A, D is the associated element of B
+ """
+ Cshape = C.shape
+ Dshape = D.shape
+ if diagonal and (len(Cshape) == 1 or len(Dshape) == 1\
+ or C.shape[0] != C.shape[1] or D.shape[0] != D.shape[1]):
+ print("Broadcasting, C: {} D: {}".format(C.shape, D.shape))
+ return C*D
+ else:
+ print("Dotting, C: {} D: {}".format(C.shape, D.shape))
+ return np.dot(C,D)
+ dot = np.vectorize(f, otypes = [np.object])
+ return dot(A,B)
if __name__=='__main__':
A = np.zeros((5,5))
B = get_blocks(A,[2,3])
B[0,0] += 7
- print B
+ print(B)
+
+ assert np.all(unblock(B) == A)
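A quick round-trip of the helpers above, using equal block sizes so the blockwise dot is defined (a sketch, not part of the test suite):

    import numpy as np
    from GPy.util.block_matrices import get_blocks, unblock, block_dot

    A = np.arange(16.).reshape(4, 4)
    B = get_blocks(A, [2, 2])              # 2x2 object array of sub-blocks
    assert np.all(unblock(B) == A)         # unblock inverts get_blocks

    C = block_dot(B, B)                    # blockwise: C[i,j] = B[i,j].dot(B[i,j])
    assert np.allclose(C[0, 1], B[0, 1].dot(B[0, 1]))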
diff --git a/GPy/util/caching.py b/GPy/util/caching.py
index 16adc320..196ce343 100644
--- a/GPy/util/caching.py
+++ b/GPy/util/caching.py
@@ -2,6 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from ..core.parameterization.observable import Observable
import collections, weakref
+from functools import reduce
class Cacher(object):
def __init__(self, operation, limit=5, ignore_args=(), force_kwargs=()):
@@ -148,10 +149,10 @@ class Cacher(object):
return Cacher(self.operation, self.limit, self.ignore_args, self.force_kwargs)
def __getstate__(self, memo=None):
- raise NotImplementedError, "Trying to pickle Cacher object with function {}, pickling functions not possible.".format(str(self.operation))
+ raise NotImplementedError("Trying to pickle Cacher object with function {}, pickling functions not possible.".format(str(self.operation)))
def __setstate__(self, memo=None):
- raise NotImplementedError, "Trying to pickle Cacher object with function {}, pickling functions not possible.".format(str(self.operation))
+ raise NotImplementedError("Trying to pickle Cacher object with function {}, pickling functions not possible.".format(str(self.operation)))
@property
def __name__(self):
diff --git a/GPy/util/choleskies.py b/GPy/util/choleskies.py
index 3f37fc3f..37ac7211 100644
--- a/GPy/util/choleskies.py
+++ b/GPy/util/choleskies.py
@@ -2,23 +2,28 @@
# Licensed under the GNU GPL version 3.0
import numpy as np
-from scipy import weave
-import linalg
+from . import linalg
+from .config import config
+try:
+ from scipy import weave
+except ImportError:
+ config.set('weave', 'working', 'False')
def safe_root(N):
i = np.sqrt(N)
j = int(i)
if i != j:
- raise ValueError, "N is not square!"
+ raise ValueError("N is not square!")
return j
-def flat_to_triang(flat):
+def _flat_to_triang_weave(flat):
"""take a matrix N x D and return a M X M x D array where
N = M(M+1)/2
the lower triangular portion of the d'th slice of the result is filled by the d'th column of flat.
+ This is the weave implementation
"""
N, D = flat.shape
M = (-1 + safe_root(8*N+1))/2
@@ -42,7 +47,24 @@ def flat_to_triang(flat):
weave.inline(code, ['flat', 'ret', 'D', 'M'])
return ret
-def triang_to_flat(L):
+def _flat_to_triang_pure(flat_mat):
+ N, D = flat_mat.shape
+ M = (-1 + safe_root(8*N+1))//2
+ ret = np.zeros((M, M, D))
+ count = 0
+ for m in range(M):
+ for mm in range(m+1):
+ for d in range(D):
+ ret.flat[d + m*D*M + mm*D] = flat_mat.flat[count]
+ count += 1
+ return ret
+
+if config.getboolean('weave', 'working'):
+ flat_to_triang = _flat_to_triang_weave
+else:
+ flat_to_triang = _flat_to_triang_pure
+
+def _triang_to_flat_weave(L):
M, _, D = L.shape
L = np.ascontiguousarray(L) # should do nothing if L was created by flat_to_triang
@@ -66,13 +88,31 @@ def triang_to_flat(L):
weave.inline(code, ['flat', 'L', 'D', 'M'])
return flat
+def _triang_to_flat_pure(L):
+ M, _, D = L.shape
+
+ N = M*(M+1)//2
+ flat = np.empty((N, D))
+ count = 0
+ for m in range(M):
+ for mm in range(m+1):
+ for d in range(D):
+ flat.flat[count] = L.flat[d + m*D*M + mm*D]
+ count += 1
+ return flat
+
+if config.getboolean('weave', 'working'):
+ triang_to_flat = _triang_to_flat_weave
+else:
+ triang_to_flat = _triang_to_flat_pure
+
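Both code paths store the M(M+1)/2 free entries of each lower triangle as one column of the flat matrix, so flat_to_triang and triang_to_flat are exact inverses. A minimal round-trip check (assuming the pure-Python fallbacks above are the active implementations):

    import numpy as np
    from GPy.util.choleskies import flat_to_triang, triang_to_flat

    # M=3 lower triangles have M(M+1)/2 = 6 free entries; stack D=2 of them
    flat = np.random.randn(6, 2)
    L = flat_to_triang(flat)               # shape (3, 3, 2), lower-triangular slices
    assert L.shape == (3, 3, 2)
    assert np.allclose(triang_to_flat(L), flat)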
def triang_to_cov(L):
- return np.dstack([np.dot(L[:,:,i], L[:,:,i].T) for i in xrange(L.shape[-1])])
+ return np.dstack([np.dot(L[:,:,i], L[:,:,i].T) for i in range(L.shape[-1])])
def multiple_dpotri_old(Ls):
M, _, D = Ls.shape
Kis = np.rollaxis(Ls, -1).copy()
- [dpotri(Kis[i,:,:], overwrite_c=1, lower=1) for i in xrange(D)]
+ [dpotri(Kis[i,:,:], overwrite_c=1, lower=1) for i in range(D)]
code = """
for(int d=0; d", save_name
+ print("Downloading ", url, "->", save_name)
if not os.path.exists(dir_name):
os.makedirs(dir_name)
try:
- response = urllib2.urlopen(url+suffix)
- except urllib2.URLError, e:
+ response = urlopen(url+suffix)
+ except URLError as e:
if not hasattr(e, "code"):
raise
response = e
@@ -150,7 +162,7 @@ def download_url(url, store_directory, save_name=None, messages=True, suffix='')
sys.stdout.write(status)
sys.stdout.flush()
sys.stdout.write(" "*(len(status)) + "\r")
- print status
+ print(status)
# if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
#with open(save_name, 'wb') as f:
# f.write(response.read())
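The urlopen/URLError names used in these hunks come from an import block that is not visible here; the standard urllib2 -> urllib migration pattern assumed is:

    # Python 2/3 compatible imports (sketch of the assumed replacement for urllib2)
    try:
        from urllib.request import urlopen   # Python 3
        from urllib.error import URLError
    except ImportError:
        from urllib2 import urlopen, URLError  # Python 2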
@@ -159,32 +171,32 @@ def download_url(url, store_directory, save_name=None, messages=True, suffix='')
def authorize_download(dataset_name=None):
"""Check with the user that the are happy with terms and conditions for the data set."""
- print('Acquiring resource: ' + dataset_name)
+ print('Acquiring resource: ' + dataset_name)
# TODO, check resource is in dictionary!
print('')
dr = data_resources[dataset_name]
print('Details of data: ')
- print(dr['details'])
+ print(dr['details'])
print('')
if dr['citation']:
print('Please cite:')
- print(dr['citation'])
+ print(dr['citation'])
print('')
if dr['size']:
- print('After downloading the data will take up ' + str(dr['size']) + ' bytes of space.')
+ print('After downloading the data will take up ' + str(dr['size']) + ' bytes of space.')
print('')
- print('Data will be stored in ' + os.path.join(data_path, dataset_name) + '.')
+ print('Data will be stored in ' + os.path.join(data_path, dataset_name) + '.')
print('')
if overide_manual_authorize:
if dr['license']:
print('You have agreed to the following license:')
- print(dr['license'])
+ print(dr['license'])
print('')
return True
else:
if dr['license']:
print('You must also agree to the following license:')
- print(dr['license'])
+ print(dr['license'])
print('')
return prompt_user('Do you wish to proceed with the download? [yes/no]')
@@ -495,18 +507,18 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'],
file = 'data.csv'
file_name = os.path.join(dir_path,file)
if not os.path.exists(file_name) or refresh_data:
- print "Accessing Google trends to acquire the data. Note that repeated accesses will result in a block due to a google terms of service violation. Failure at this point may be due to such blocks."
+ print("Accessing Google trends to acquire the data. Note that repeated accesses will result in a block due to a google terms of service violation. Failure at this point may be due to such blocks.")
# quote the query terms.
quoted_terms = []
for term in query_terms:
quoted_terms.append(urllib2.quote(term))
- print "Query terms: ", ', '.join(query_terms)
+ print("Query terms: ", ', '.join(query_terms))
- print "Fetching query:"
+ print("Fetching query:")
query = 'http://www.google.com/trends/fetchComponent?q=%s&cid=TIMESERIES_GRAPH_0&export=3' % ",".join(quoted_terms)
- data = urllib2.urlopen(query).read()
- print "Done."
+ data = urlopen(query).read()
+ print("Done.")
# In the notebook they did some data cleaning: remove Javascript header+footer, and translate new Date(....,..,..) into YYYY-MM-DD.
header = """// Data table response\ngoogle.visualization.Query.setResponse("""
data = data[len(header):-2]
@@ -520,8 +532,8 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'],
df.to_csv(file_name)
else:
- print "Reading cached data for google trends. To refresh the cache set 'refresh_data=True' when calling this function."
- print "Query terms: ", ', '.join(query_terms)
+ print("Reading cached data for google trends. To refresh the cache set 'refresh_data=True' when calling this function.")
+ print("Query terms: ", ', '.join(query_terms))
df = pandas.read_csv(file_name, parse_dates=[0])
@@ -679,11 +691,11 @@ def ripley_synth(data_set='ripley_prnn_data'):
def global_average_temperature(data_set='global_temperature', num_train=1000, refresh_data=False):
path = os.path.join(data_path, data_set)
if data_available(data_set) and not refresh_data:
- print 'Using cached version of the data set, to use latest version set refresh_data to True'
+ print('Using cached version of the data set, to use latest version set refresh_data to True')
else:
download_data(data_set)
data = np.loadtxt(os.path.join(data_path, data_set, 'GLBTS.long.data'))
- print 'Most recent data observation from month ', data[-1, 1], ' in year ', data[-1, 0]
+ print('Most recent data observation from month ', data[-1, 1], ' in year ', data[-1, 0])
allX = data[data[:, 3]!=-99.99, 2:3]
allY = data[data[:, 3]!=-99.99, 3:4]
X = allX[:num_train, 0:1]
@@ -695,11 +707,11 @@ def global_average_temperature(data_set='global_temperature', num_train=1000, re
def mauna_loa(data_set='mauna_loa', num_train=545, refresh_data=False):
path = os.path.join(data_path, data_set)
if data_available(data_set) and not refresh_data:
- print 'Using cached version of the data set, to use latest version set refresh_data to True'
+ print('Using cached version of the data set, to use latest version set refresh_data to True')
else:
download_data(data_set)
data = np.loadtxt(os.path.join(data_path, data_set, 'co2_mm_mlo.txt'))
- print 'Most recent data observation from month ', data[-1, 1], ' in year ', data[-1, 0]
+ print('Most recent data observation from month ', data[-1, 1], ' in year ', data[-1, 0])
allX = data[data[:, 3]!=-99.99, 2:3]
allY = data[data[:, 3]!=-99.99, 3:4]
X = allX[:num_train, 0:1]
@@ -784,7 +796,7 @@ def hapmap3(data_set='hapmap3'):
from sys import stdout
import bz2
except ImportError as i:
- raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset"
+ raise i("Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset")
dir_path = os.path.join(data_path,'hapmap3')
hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
@@ -802,10 +814,10 @@ def hapmap3(data_set='hapmap3'):
if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)):
if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB "
"of memory and can take a (very) long time, continue? [Y/n]"):
- print "Preprocessing required for further usage."
+ print("Preprocessing required for further usage.")
return
status = "Preprocessing data, please be patient..."
- print status
+ print(status)
def write_status(message, progress, status):
stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush()
status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20,
@@ -873,13 +885,13 @@ def hapmap3(data_set='hapmap3'):
inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1])
inandf.to_pickle(preprocessed_data_paths[2])
status=write_status('done :)', 100, status)
- print ''
+ print('')
else:
- print "loading snps..."
+ print("loading snps...")
snpsdf = read_pickle(preprocessed_data_paths[0])
- print "loading metainfo..."
+ print("loading metainfo...")
metadf = read_pickle(preprocessed_data_paths[1])
- print "loading nan entries..."
+ print("loading nan entries...")
inandf = read_pickle(preprocessed_data_paths[2])
snps = snpsdf.values
populations = metadf.population.values.astype('S3')
@@ -1001,7 +1013,7 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'):
# Extract the tar file
filename = os.path.join(dir_path, 'GSE45719_Raw.tar')
with tarfile.open(filename, 'r') as files:
- print "Extracting Archive {}...".format(files.name)
+ print("Extracting Archive {}...".format(files.name))
data = None
gene_info = None
message = ''
@@ -1010,9 +1022,9 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'):
for i, file_info in enumerate(members):
f = files.extractfile(file_info)
inner = read_csv(f, sep='\t', header=0, compression='gzip', index_col=0)
- print ' '*(len(message)+1) + '\r',
+ print(' '*(len(message)+1) + '\r', end=' ')
message = "{: >7.2%}: Extracting: {}".format(float(i+1)/overall, file_info.name[:20]+"...txt.gz")
- print message,
+ print(message, end=' ')
if data is None:
data = inner.RPKM.to_frame()
data.columns = [file_info.name[:-18]]
@@ -1035,8 +1047,8 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'):
sys.stdout.write(' '*len(message) + '\r')
sys.stdout.flush()
- print
- print "Read Archive {}".format(files.name)
+ print()
+ print("Read Archive {}".format(files.name))
return data_details_return({'Y': data,
'series_info': info,
diff --git a/GPy/util/debug.py b/GPy/util/debug.py
index 00107f5e..d691ad82 100644
--- a/GPy/util/debug.py
+++ b/GPy/util/debug.py
@@ -13,7 +13,7 @@ def checkFinite(arr, name=None):
if np.any(np.logical_not(np.isfinite(arr))):
idx = np.where(np.logical_not(np.isfinite(arr)))[0]
- print name+' at indices '+str(idx)+' have not finite values: '+str(arr[idx])+'!'
+ print(name+' at indices '+str(idx)+' have non-finite values: '+str(arr[idx])+'!')
return False
return True
@@ -23,13 +23,13 @@ def checkFullRank(m, tol=1e-10, name=None, force_check=False):
assert len(m.shape)==2 and m.shape[0]==m.shape[1], 'The input of checkFullRank has to be a square matrix!'
if not force_check and m.shape[0]>=10000:
- print 'The size of '+name+'is too big to check (>=10000)!'
+ print('The size of '+name+' is too big to check (>=10000)!')
return True
s = np.real(np.linalg.eigvals(m))
if s.min()/s.max()<tol:
...
if gpuid>=pycuda.driver.Device.count():
- print '['+MPI.Get_processor_name()+'] more processes than the GPU numbers!'
+ print('['+MPI.Get_processor_name()+'] more processes than available GPUs!')
#MPI.COMM_WORLD.Abort()
raise
gpu_device = pycuda.driver.Device(gpuid)
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index b148f2f4..26c4b774 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -6,16 +6,22 @@
# http://homepages.inf.ed.ac.uk/imurray2/code/tdot/tdot.py
import numpy as np
-from scipy import linalg, weave
+from scipy import linalg
import types
import ctypes
from ctypes import byref, c_char, c_int, c_double # TODO
import scipy
import warnings
import os
-from config import config
+from .config import config
import logging
+try:
+ from scipy import weave
+except ImportError:
+ config.set('weave', 'working', 'False')
+
+
_scipyversion = np.float64((scipy.__version__).split('.')[:2])
_fix_dpotri_scipy_bug = True
if np.all(_scipyversion >= np.array([0, 14])):
@@ -34,7 +40,7 @@ if config.getboolean('anaconda', 'installed') and config.getboolean('anaconda',
dsyrk = mkl_rt.dsyrk
dsyr = mkl_rt.dsyr
_blas_available = True
- print 'anaconda installed and mkl is loaded'
+ print('anaconda installed and mkl is loaded')
except:
_blas_available = False
else:
@@ -64,7 +70,7 @@ def force_F_ordered(A):
"""
if A.flags['F_CONTIGUOUS']:
return A
- print "why are your arrays not F order?"
+ print("why are your arrays not F order?")
return np.asfortranarray(A)
# def jitchol(A, maxtries=5):
@@ -91,21 +97,24 @@ def jitchol(A, maxtries=5):
else:
diagA = np.diag(A)
if np.any(diagA <= 0.):
- raise linalg.LinAlgError, "not pd: non-positive diagonal elements"
+ raise linalg.LinAlgError("not pd: non-positive diagonal elements")
jitter = diagA.mean() * 1e-6
num_tries = 1
while num_tries <= maxtries and np.isfinite(jitter):
try:
L = linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
- logging.warning('Added {} rounds of jitter, jitter of {:.10e}\n'.format(num_tries, jitter))
return L
except:
jitter *= 10
+ finally:
num_tries += 1
+ raise linalg.LinAlgError("not positive definite, even with jitter.")
import traceback
- logging.warning('\n'.join(['Added {} rounds of jitter, jitter of {:.10e}'.format(num_tries-1, jitter),
- ' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
- raise linalg.LinAlgError, "not positive definite, even with jitter."
+ try: raise
+ except:
+ logging.warning('\n'.join(['Added jitter of {:.10e}'.format(jitter),
+ ' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
+ return L
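The reworked retry loop can be exercised directly: a rank-deficient PSD matrix makes the plain Cholesky fail, and jitchol recovers by scaling up the diagonal jitter (a minimal sketch):

    import numpy as np
    from GPy.util.linalg import jitchol

    A = np.ones((3, 3))                  # rank-1, PSD but singular
    L = jitchol(A)                       # succeeds after adding diagonal jitter
    assert np.allclose(L.dot(L.T), A, atol=1e-4)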
# def dtrtri(L, lower=1):
# """
@@ -208,12 +217,12 @@ def mdot(*args):
def _mdot_r(a, b):
"""Recursive helper for mdot"""
- if type(a) == types.TupleType:
+ if type(a) == tuple:
if len(a) > 1:
a = mdot(*a)
else:
a = a[0]
- if type(b) == types.TupleType:
+ if type(b) == tuple:
if len(b) > 1:
b = mdot(*b)
else:
@@ -288,7 +297,7 @@ def pca(Y, input_dim):
"""
if not np.allclose(Y.mean(axis=0), 0.0):
- print "Y is not zero mean, centering it locally (GPy.util.linalg.pca)"
+ print("Y is not zero mean, centering it locally (GPy.util.linalg.pca)")
# Y -= Y.mean(axis=0)
@@ -347,16 +356,16 @@ def tdot_blas(mat, out=None):
# of C order. However, I tried that and had errors with large matrices:
# http://homepages.inf.ed.ac.uk/imurray2/code/tdot/tdot_broken.py
mat = np.asfortranarray(mat)
- TRANS = c_char('n')
+ TRANS = c_char('n'.encode('ascii'))
N = c_int(mat.shape[0])
K = c_int(mat.shape[1])
LDA = c_int(mat.shape[0])
- UPLO = c_char('l')
+ UPLO = c_char('l'.encode('ascii'))
ALPHA = c_double(1.0)
A = mat.ctypes.data_as(ctypes.c_void_p)
BETA = c_double(0.0)
C = out.ctypes.data_as(ctypes.c_void_p)
- LDC = c_int(np.max(out.strides) / 8)
+ LDC = c_int(np.max(out.strides) // 8)
dsyrk(byref(UPLO), byref(TRANS), byref(N), byref(K),
byref(ALPHA), A, byref(LDA), byref(BETA), C, byref(LDC))
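The .encode('ascii') calls above are needed because on Python 3 ctypes.c_char accepts only a single byte, not a one-character str:

    from ctypes import c_char

    # on Python 3, c_char requires bytes of length 1; a str raises TypeError
    assert 'n'.encode('ascii') == b'n'
    assert c_char(b'n').value == b'n'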
@@ -383,7 +392,7 @@ def DSYR_blas(A, x, alpha=1.):
"""
N = c_int(A.shape[0])
LDA = c_int(A.shape[0])
- UPLO = c_char('l')
+ UPLO = c_char('l'.encode('ascii'))
ALPHA = c_double(alpha)
A_ = A.ctypes.data_as(ctypes.c_void_p)
x_ = x.ctypes.data_as(ctypes.c_void_p)
@@ -423,7 +432,7 @@ def symmetrify(A, upper=False):
try:
symmetrify_weave(A, upper)
except:
- print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
+ print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
config.set('weave', 'working', 'False')
symmetrify_numpy(A, upper)
else:
@@ -489,34 +498,35 @@ def symmetrify_numpy(A, upper=False):
else:
A[triu] = A.T[triu]
-def cholupdate(L, x):
- """
- update the LOWER cholesky factor of a pd matrix IN PLACE
-
- if L is the lower chol. of K, then this function computes L\_
- where L\_ is the lower chol of K + x*x^T
-
- """
- support_code = """
- #include
- """
- code = """
- double r,c,s;
- int j,i;
- for(j=0; j
+# """
+# code = """
+# double r,c,s;
+# int j,i;
+# for(j=0; j 1 and len(df_dg.shape)>1 and df_dg.shape[-1] > 1:
+ #import ipdb; ipdb.set_trace() # XXX BREAKPOINT
+ raise NotImplementedError('Not implemented for matrices yet')
return df_dg * dg_dx
def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
@@ -20,7 +54,13 @@ def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
.. math::
\\frac{d^{2}(f . g)}{dx^{2}} = \\frac{d^{2}f}{dg^{2}}(\\frac{dg}{dx})^{2} + \\frac{df}{dg}\\frac{d^{2}g}{dx^{2}}
"""
- return d2f_dg2*(dg_dx**2) + df_dg*d2g_dx2
+ if np.all(dg_dx==1.) and np.all(d2g_dx2 == 0):
+ return d2f_dg2
+ if len(d2f_dg2) > 1 and len(d2f_dg2.shape)>1 and d2f_dg2.shape[-1] > 1:
+ raise NotImplementedError('Not implemented for matrices yet')
+ dg_dx_2 = np.clip(dg_dx, -np.inf, _lim_val_square)**2
+ #dg_dx_2 = dg_dx**2
+ return d2f_dg2*(dg_dx_2) + df_dg*d2g_dx2
def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
"""
@@ -29,11 +69,18 @@ def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
.. math::
\\frac{d^{3}(f . g)}{dx^{3}} = \\frac{d^{3}f}{dg^{3}}(\\frac{dg}{dx})^{3} + 3\\frac{d^{2}f}{dg^{2}}\\frac{dg}{dx}\\frac{d^{2}g}{dx^{2}} + \\frac{df}{dg}\\frac{d^{3}g}{dx^{3}}
"""
- return d3f_dg3*(dg_dx**3) + 3*d2f_dg2*dg_dx*d2g_dx2 + df_dg*d3g_dx3
+ if np.all(dg_dx==1.) and np.all(d2g_dx2==0) and np.all(d3g_dx3==0):
+ return d3f_dg3
+ if ( (len(d2f_dg2) > 1 and d2f_dg2.shape[-1] > 1)
+ or (len(d3f_dg3) > 1 and d3f_dg3.shape[-1] > 1)):
+ raise NotImplementedError('Not implemented for matrices yet')
+ dg_dx_3 = np.clip(dg_dx, -np.inf, _lim_val_cube)**3
+ #dg_dx_3 = dg_dx**3
+ return d3f_dg3*(dg_dx_3) + 3*d2f_dg2*dg_dx*d2g_dx2 + df_dg*d3g_dx3
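A quick numeric sanity check of the chain_2 formula above, written out standalone (f = exp, g = x**2, so the composite second derivative is exp(x**2)*(4x**2 + 2)):

    import numpy as np

    x = np.linspace(-1., 1., 5)
    g, dg_dx, d2g_dx2 = x**2, 2*x, 2*np.ones_like(x)
    df_dg = d2f_dg2 = np.exp(g)          # f = exp, so all derivatives equal exp(g)

    chain2 = d2f_dg2*dg_dx**2 + df_dg*d2g_dx2
    assert np.allclose(chain2, np.exp(x**2)*(4*x**2 + 2))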
def opt_wrapper(m, **kwargs):
"""
- This function just wraps the optimization procedure of a GPy
+ This function just wraps the optimization procedure of a GPy
object so that optimize() pickleable (necessary for multiprocessing).
"""
m.optimize(**kwargs)
@@ -96,3 +143,47 @@ from :class:ndarray)"""
if len(param) == 1:
return param[0].view(np.ndarray)
return [x.view(np.ndarray) for x in param]
+
+def blockify_hessian(func):
+ def wrapper_func(self, *args, **kwargs):
+ # Invoke the wrapped function first
+ retval = func(self, *args, **kwargs)
+ # Now do something here with retval and/or action
+ if self.not_block_really and (retval.shape[0] != retval.shape[1]):
+ return np.diagflat(retval)
+ else:
+ return retval
+ return wrapper_func
+
+def blockify_third(func):
+ def wrapper_func(self, *args, **kwargs):
+ # Invoke the wrapped function first
+ retval = func(self, *args, **kwargs)
+ # Now do something here with retval and/or action
+ if self.not_block_really and (len(retval.shape) < 3):
+ num_data = retval.shape[0]
+ d3_block_cache = np.zeros((num_data, num_data, num_data))
+ diag_slice = range(num_data)
+ d3_block_cache[diag_slice, diag_slice, diag_slice] = np.squeeze(retval)
+ return d3_block_cache
+ else:
+ return retval
+ return wrapper_func
+
+def blockify_dhess_dtheta(func):
+ def wrapper_func(self, *args, **kwargs):
+ # Invoke the wrapped function first
+ retval = func(self, *args, **kwargs)
+ # Now do something here with retval and/or action
+ if self.not_block_really and (len(retval.shape) < 3):
+ num_data = retval.shape[0]
+ num_params = retval.shape[-1]
+ dhess_dtheta = np.zeros((num_data, num_data, num_params))
+ diag_slice = range(num_data)
+ for param_ind in range(num_params):
+ dhess_dtheta[diag_slice, diag_slice, param_ind] = np.squeeze(retval[:,param_ind])
+ return dhess_dtheta
+ else:
+ return retval
+ return wrapper_func
+
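What the blockify decorators do in the not_block_really case: expand a diagonal result into the dense block shape the block inference code expects. A sketch with a hypothetical DummyLik class (assuming blockify_hessian is importable from GPy.util.misc once this diff is applied):

    import numpy as np
    from GPy.util.misc import blockify_hessian

    class DummyLik(object):
        not_block_really = True

        @blockify_hessian
        def hessian(self, diag):
            return diag.reshape(-1, 1)   # (N, 1) column of diagonal entries

    d = np.array([1., 2., 3.])
    H = DummyLik().hessian(d)
    assert H.shape == (3, 3) and np.allclose(H, np.diag(d))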
diff --git a/GPy/util/mocap.py b/GPy/util/mocap.py
index 58662cf9..4f6336c5 100644
--- a/GPy/util/mocap.py
+++ b/GPy/util/mocap.py
@@ -2,7 +2,6 @@ import os
import numpy as np
import math
from GPy.util import datasets as dat
-import urllib2
class vertex:
def __init__(self, name, id, parents=[], children=[], meta = {}):
@@ -174,7 +173,7 @@ class skeleton(tree):
return connection
def to_xyz(self, channels):
- raise NotImplementedError, "this needs to be implemented to use the skeleton class"
+ raise NotImplementedError("this needs to be implemented to use the skeleton class")
def finalize(self):
diff --git a/GPy/util/multioutput.py b/GPy/util/multioutput.py
index cc9af29e..2233dbb6 100644
--- a/GPy/util/multioutput.py
+++ b/GPy/util/multioutput.py
@@ -51,7 +51,7 @@ def ICM(input_dim, num_outputs, kernel, W_rank=1,W=None,kappa=None,name='ICM'):
:param W_rank: number tuples of the corregionalization parameters 'W'
:type W_rank: integer
"""
- if kernel.input_dim <> input_dim:
+ if kernel.input_dim != input_dim:
kernel.input_dim = input_dim
warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
diff --git a/GPy/util/parallel.py b/GPy/util/parallel.py
index fab43936..880dae58 100644
--- a/GPy/util/parallel.py
+++ b/GPy/util/parallel.py
@@ -27,7 +27,7 @@ def divide_data(datanum, rank, size):
residue = (datanum)%size
datanum_list = np.empty((size),dtype=np.int32)
- for i in xrange(size):
+ for i in range(size):
if i<residue:
...
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ ... @@ class TanhWarpingFunction_d(WarpingFunction):
while (... > 1e-10 and it < max_iterations):
- update = (self.f(y, psi) - z)/self.fgrad_y(y, psi)
+ update = (self.f(y) - z)/self.fgrad_y(y)
y -= update
it += 1
if it == max_iterations:
- print "WARNING!!! Maximum number of iterations reached in f_inv "
+ print("WARNING!!! Maximum number of iterations reached in f_inv ")
return y
- def fgrad_y(self, y, psi, return_precalc = False):
+ def fgrad_y(self, y, return_precalc=False):
"""
gradient of f w.r.t to y ([N x 1])
@@ -221,9 +234,8 @@ class TanhWarpingFunction_d(WarpingFunction):
"""
- mpsi = psi.copy()
- d = psi[-1]
- mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
+ d = self.d
+ mpsi = self.psi
# vectorized version
@@ -240,7 +252,7 @@ class TanhWarpingFunction_d(WarpingFunction):
return GRAD
- def fgrad_y_psi(self, y, psi, return_covar_chain = False):
+ def fgrad_y_psi(self, y, return_covar_chain = False):
"""
gradient of f w.r.t to y and psi
@@ -248,10 +260,10 @@ class TanhWarpingFunction_d(WarpingFunction):
"""
- mpsi = psi.copy()
- mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
- w, s, r, d = self.fgrad_y(y, psi, return_precalc = True)
+ mpsi = self.psi
+
+ w, s, r, d = self.fgrad_y(y, return_precalc = True)
gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
for i in range(len(mpsi)):
diff --git a/README.md b/README.md
index 5e98af85..60dcbe24 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,5 @@
# GPy
-
A Gaussian processes framework in Python.
* [GPy homepage](http://sheffieldml.github.io/GPy/)
@@ -11,6 +10,27 @@ A Gaussian processes framework in Python.
Continuous integration status: 
+### Python 3 Compatibility
+Work is underway to make GPy run on Python 3.
+
+* Python 2.x compatibility is currently broken in this fork
+* All tests in the test suite now run on Python 3.
+
+To see this for yourself on Ubuntu 14.04, you can run
+
+ git clone https://github.com/mikecroucher/GPy.git
+ cd GPy
+ git checkout devel
+ nosetests3 GPy/testing
+
+nosetests3 is Ubuntu's name for the Python 3 version of nosetests. You can install it with
+
+ sudo apt-get install python3-nose
+
+* Test coverage is below 100%, so more work remains; we need more tests and examples to try out.
+* All weave functions not covered by the test suite are *simply commented out*; equivalents can be added later as tests become available.
+* A set of benchmarks would be useful!
+
### Citation
@Misc{gpy2014,
@@ -105,14 +125,12 @@ Ensure nose is installed via pip:
Run nosetests from the root directory of the repository:
- nosetests -v
+ nosetests -v GPy/testing
or from within IPython
import GPy; GPy.tests()
-
-
-
+
## Funding Acknowledgements
diff --git a/benchmarks/boston_housing.py b/benchmarks/boston_housing.py
new file mode 100644
index 00000000..0dcff082
--- /dev/null
+++ b/benchmarks/boston_housing.py
@@ -0,0 +1,44 @@
+import numpy as np
+import GPy
+
+def load_housing_data():
+ X = np.loadtxt('housing.data')
+ X, Y = X[:,:-1], X[:,-1:]
+
+ #scale the X data
+ xmax, xmin = X.max(0), X.min(0)
+ X = (X-xmin)/(xmax-xmin)
+
+ #log the response
+ Y = np.log(Y)
+ return X, Y
+
+def fit_full_GP():
+ X, Y = load_housing_data()
+ k = GPy.kern.RBF(X.shape[1], ARD=True) + GPy.kern.Linear(X.shape[1])
+ m = GPy.models.GPRegression(X, Y, kernel=k)
+ m.optimize('bfgs', max_iters=400, gtol=0)
+ return m
+
+def fit_svgp_st():
+ np.random.seed(0)
+ X, Y = load_housing_data()
+
+ Z = X[np.random.permutation(X.shape[0])[:100]]
+ k = GPy.kern.RBF(X.shape[1], ARD=True) + GPy.kern.Linear(X.shape[1], ARD=True) + GPy.kern.White(1,0.01) + GPy.kern.Bias(1)
+
+ lik = GPy.likelihoods.StudentT(deg_free=3.)
+ m = GPy.core.SVGP(X, Y, Z=Z, kernel=k, likelihood=lik)
+ [m.optimize('scg', max_iters=40, gtol=0, messages=1, xtol=0, ftol=0) for i in range(10)]
+ m.optimize('bfgs', max_iters=4000, gtol=0, messages=1, xtol=0, ftol=0)
+ return m
+
+
+if __name__=="__main__":
+ import timeit
+ # minimal timing harness (sketch): time a single fit of each model
+ print("full GP: {:.1f}s".format(timeit.timeit(fit_full_GP, number=1)))
+ print("SVGP (Student-t): {:.1f}s".format(timeit.timeit(fit_svgp_st, number=1)))
diff --git a/benchmarks/housing.data b/benchmarks/housing.data
new file mode 100644
index 00000000..f83ac564
--- /dev/null
+++ b/benchmarks/housing.data
@@ -0,0 +1,506 @@
+ 0.00632 18.00 2.310 0 0.5380 6.5750 65.20 4.0900 1 296.0 15.30 396.90 4.98 24.00
+ 0.02731 0.00 7.070 0 0.4690 6.4210 78.90 4.9671 2 242.0 17.80 396.90 9.14 21.60
+ 0.02729 0.00 7.070 0 0.4690 7.1850 61.10 4.9671 2 242.0 17.80 392.83 4.03 34.70
+ 0.03237 0.00 2.180 0 0.4580 6.9980 45.80 6.0622 3 222.0 18.70 394.63 2.94 33.40
+ 0.06905 0.00 2.180 0 0.4580 7.1470 54.20 6.0622 3 222.0 18.70 396.90 5.33 36.20
+ 0.02985 0.00 2.180 0 0.4580 6.4300 58.70 6.0622 3 222.0 18.70 394.12 5.21 28.70
+ 0.08829 12.50 7.870 0 0.5240 6.0120 66.60 5.5605 5 311.0 15.20 395.60 12.43 22.90
+ 0.14455 12.50 7.870 0 0.5240 6.1720 96.10 5.9505 5 311.0 15.20 396.90 19.15 27.10
+ 0.21124 12.50 7.870 0 0.5240 5.6310 100.00 6.0821 5 311.0 15.20 386.63 29.93 16.50
+ 0.17004 12.50 7.870 0 0.5240 6.0040 85.90 6.5921 5 311.0 15.20 386.71 17.10 18.90
+ 0.22489 12.50 7.870 0 0.5240 6.3770 94.30 6.3467 5 311.0 15.20 392.52 20.45 15.00
+ 0.11747 12.50 7.870 0 0.5240 6.0090 82.90 6.2267 5 311.0 15.20 396.90 13.27 18.90
+ 0.09378 12.50 7.870 0 0.5240 5.8890 39.00 5.4509 5 311.0 15.20 390.50 15.71 21.70
+ 0.62976 0.00 8.140 0 0.5380 5.9490 61.80 4.7075 4 307.0 21.00 396.90 8.26 20.40
+ 0.63796 0.00 8.140 0 0.5380 6.0960 84.50 4.4619 4 307.0 21.00 380.02 10.26 18.20
+ 0.62739 0.00 8.140 0 0.5380 5.8340 56.50 4.4986 4 307.0 21.00 395.62 8.47 19.90
+ 1.05393 0.00 8.140 0 0.5380 5.9350 29.30 4.4986 4 307.0 21.00 386.85 6.58 23.10
+ 0.78420 0.00 8.140 0 0.5380 5.9900 81.70 4.2579 4 307.0 21.00 386.75 14.67 17.50
+ 0.80271 0.00 8.140 0 0.5380 5.4560 36.60 3.7965 4 307.0 21.00 288.99 11.69 20.20
+ 0.72580 0.00 8.140 0 0.5380 5.7270 69.50 3.7965 4 307.0 21.00 390.95 11.28 18.20
+ 1.25179 0.00 8.140 0 0.5380 5.5700 98.10 3.7979 4 307.0 21.00 376.57 21.02 13.60
+ 0.85204 0.00 8.140 0 0.5380 5.9650 89.20 4.0123 4 307.0 21.00 392.53 13.83 19.60
+ 1.23247 0.00 8.140 0 0.5380 6.1420 91.70 3.9769 4 307.0 21.00 396.90 18.72 15.20
+ 0.98843 0.00 8.140 0 0.5380 5.8130 100.00 4.0952 4 307.0 21.00 394.54 19.88 14.50
+ 0.75026 0.00 8.140 0 0.5380 5.9240 94.10 4.3996 4 307.0 21.00 394.33 16.30 15.60
+ 0.84054 0.00 8.140 0 0.5380 5.5990 85.70 4.4546 4 307.0 21.00 303.42 16.51 13.90
+ 0.67191 0.00 8.140 0 0.5380 5.8130 90.30 4.6820 4 307.0 21.00 376.88 14.81 16.60
+ 0.95577 0.00 8.140 0 0.5380 6.0470 88.80 4.4534 4 307.0 21.00 306.38 17.28 14.80
+ 0.77299 0.00 8.140 0 0.5380 6.4950 94.40 4.4547 4 307.0 21.00 387.94 12.80 18.40
+ 1.00245 0.00 8.140 0 0.5380 6.6740 87.30 4.2390 4 307.0 21.00 380.23 11.98 21.00
+ 1.13081 0.00 8.140 0 0.5380 5.7130 94.10 4.2330 4 307.0 21.00 360.17 22.60 12.70
+ 1.35472 0.00 8.140 0 0.5380 6.0720 100.00 4.1750 4 307.0 21.00 376.73 13.04 14.50
+ 1.38799 0.00 8.140 0 0.5380 5.9500 82.00 3.9900 4 307.0 21.00 232.60 27.71 13.20
+ 1.15172 0.00 8.140 0 0.5380 5.7010 95.00 3.7872 4 307.0 21.00 358.77 18.35 13.10
+ 1.61282 0.00 8.140 0 0.5380 6.0960 96.90 3.7598 4 307.0 21.00 248.31 20.34 13.50
+ 0.06417 0.00 5.960 0 0.4990 5.9330 68.20 3.3603 5 279.0 19.20 396.90 9.68 18.90
+ 0.09744 0.00 5.960 0 0.4990 5.8410 61.40 3.3779 5 279.0 19.20 377.56 11.41 20.00
+ 0.08014 0.00 5.960 0 0.4990 5.8500 41.50 3.9342 5 279.0 19.20 396.90 8.77 21.00
+ 0.17505 0.00 5.960 0 0.4990 5.9660 30.20 3.8473 5 279.0 19.20 393.43 10.13 24.70
+ 0.02763 75.00 2.950 0 0.4280 6.5950 21.80 5.4011 3 252.0 18.30 395.63 4.32 30.80
+ 0.03359 75.00 2.950 0 0.4280 7.0240 15.80 5.4011 3 252.0 18.30 395.62 1.98 34.90
+ 0.12744 0.00 6.910 0 0.4480 6.7700 2.90 5.7209 3 233.0 17.90 385.41 4.84 26.60
+ 0.14150 0.00 6.910 0 0.4480 6.1690 6.60 5.7209 3 233.0 17.90 383.37 5.81 25.30
+ 0.15936 0.00 6.910 0 0.4480 6.2110 6.50 5.7209 3 233.0 17.90 394.46 7.44 24.70
+ 0.12269 0.00 6.910 0 0.4480 6.0690 40.00 5.7209 3 233.0 17.90 389.39 9.55 21.20
+ 0.17142 0.00 6.910 0 0.4480 5.6820 33.80 5.1004 3 233.0 17.90 396.90 10.21 19.30
+ 0.18836 0.00 6.910 0 0.4480 5.7860 33.30 5.1004 3 233.0 17.90 396.90 14.15 20.00
+ 0.22927 0.00 6.910 0 0.4480 6.0300 85.50 5.6894 3 233.0 17.90 392.74 18.80 16.60
+ 0.25387 0.00 6.910 0 0.4480 5.3990 95.30 5.8700 3 233.0 17.90 396.90 30.81 14.40
+ 0.21977 0.00 6.910 0 0.4480 5.6020 62.00 6.0877 3 233.0 17.90 396.90 16.20 19.40
+ 0.08873 21.00 5.640 0 0.4390 5.9630 45.70 6.8147 4 243.0 16.80 395.56 13.45 19.70
+ 0.04337 21.00 5.640 0 0.4390 6.1150 63.00 6.8147 4 243.0 16.80 393.97 9.43 20.50
+ 0.05360 21.00 5.640 0 0.4390 6.5110 21.10 6.8147 4 243.0 16.80 396.90 5.28 25.00
+ 0.04981 21.00 5.640 0 0.4390 5.9980 21.40 6.8147 4 243.0 16.80 396.90 8.43 23.40
+ 0.01360 75.00 4.000 0 0.4100 5.8880 47.60 7.3197 3 469.0 21.10 396.90 14.80 18.90
+ 0.01311 90.00 1.220 0 0.4030 7.2490 21.90 8.6966 5 226.0 17.90 395.93 4.81 35.40
+ 0.02055 85.00 0.740 0 0.4100 6.3830 35.70 9.1876 2 313.0 17.30 396.90 5.77 24.70
+ 0.01432 100.00 1.320 0 0.4110 6.8160 40.50 8.3248 5 256.0 15.10 392.90 3.95 31.60
+ 0.15445 25.00 5.130 0 0.4530 6.1450 29.20 7.8148 8 284.0 19.70 390.68 6.86 23.30
+ 0.10328 25.00 5.130 0 0.4530 5.9270 47.20 6.9320 8 284.0 19.70 396.90 9.22 19.60
+ 0.14932 25.00 5.130 0 0.4530 5.7410 66.20 7.2254 8 284.0 19.70 395.11 13.15 18.70
+ 0.17171 25.00 5.130 0 0.4530 5.9660 93.40 6.8185 8 284.0 19.70 378.08 14.44 16.00
+ 0.11027 25.00 5.130 0 0.4530 6.4560 67.80 7.2255 8 284.0 19.70 396.90 6.73 22.20
+ 0.12650 25.00 5.130 0 0.4530 6.7620 43.40 7.9809 8 284.0 19.70 395.58 9.50 25.00
+ 0.01951 17.50 1.380 0 0.4161 7.1040 59.50 9.2229 3 216.0 18.60 393.24 8.05 33.00
+ 0.03584 80.00 3.370 0 0.3980 6.2900 17.80 6.6115 4 337.0 16.10 396.90 4.67 23.50
+ 0.04379 80.00 3.370 0 0.3980 5.7870 31.10 6.6115 4 337.0 16.10 396.90 10.24 19.40
+ 0.05789 12.50 6.070 0 0.4090 5.8780 21.40 6.4980 4 345.0 18.90 396.21 8.10 22.00
+ 0.13554 12.50 6.070 0 0.4090 5.5940 36.80 6.4980 4 345.0 18.90 396.90 13.09 17.40
+ 0.12816 12.50 6.070 0 0.4090 5.8850 33.00 6.4980 4 345.0 18.90 396.90 8.79 20.90
+ 0.08826 0.00 10.810 0 0.4130 6.4170 6.60 5.2873 4 305.0 19.20 383.73 6.72 24.20
+ 0.15876 0.00 10.810 0 0.4130 5.9610 17.50 5.2873 4 305.0 19.20 376.94 9.88 21.70
+ 0.09164 0.00 10.810 0 0.4130 6.0650 7.80 5.2873 4 305.0 19.20 390.91 5.52 22.80
+ 0.19539 0.00 10.810 0 0.4130 6.2450 6.20 5.2873 4 305.0 19.20 377.17 7.54 23.40
+ 0.07896 0.00 12.830 0 0.4370 6.2730 6.00 4.2515 5 398.0 18.70 394.92 6.78 24.10
+ 0.09512 0.00 12.830 0 0.4370 6.2860 45.00 4.5026 5 398.0 18.70 383.23 8.94 21.40
+ 0.10153 0.00 12.830 0 0.4370 6.2790 74.50 4.0522 5 398.0 18.70 373.66 11.97 20.00
+ 0.08707 0.00 12.830 0 0.4370 6.1400 45.80 4.0905 5 398.0 18.70 386.96 10.27 20.80
+ 0.05646 0.00 12.830 0 0.4370 6.2320 53.70 5.0141 5 398.0 18.70 386.40 12.34 21.20
+ 0.08387 0.00 12.830 0 0.4370 5.8740 36.60 4.5026 5 398.0 18.70 396.06 9.10 20.30
+ 0.04113 25.00 4.860 0 0.4260 6.7270 33.50 5.4007 4 281.0 19.00 396.90 5.29 28.00
+ 0.04462 25.00 4.860 0 0.4260 6.6190 70.40 5.4007 4 281.0 19.00 395.63 7.22 23.90
+ 0.03659 25.00 4.860 0 0.4260 6.3020 32.20 5.4007 4 281.0 19.00 396.90 6.72 24.80
+ 0.03551 25.00 4.860 0 0.4260 6.1670 46.70 5.4007 4 281.0 19.00 390.64 7.51 22.90
+ 0.05059 0.00 4.490 0 0.4490 6.3890 48.00 4.7794 3 247.0 18.50 396.90 9.62 23.90
+ 0.05735 0.00 4.490 0 0.4490 6.6300 56.10 4.4377 3 247.0 18.50 392.30 6.53 26.60
+ 0.05188 0.00 4.490 0 0.4490 6.0150 45.10 4.4272 3 247.0 18.50 395.99 12.86 22.50
+ 0.07151 0.00 4.490 0 0.4490 6.1210 56.80 3.7476 3 247.0 18.50 395.15 8.44 22.20
+ 0.05660 0.00 3.410 0 0.4890 7.0070 86.30 3.4217 2 270.0 17.80 396.90 5.50 23.60
+ 0.05302 0.00 3.410 0 0.4890 7.0790 63.10 3.4145 2 270.0 17.80 396.06 5.70 28.70
+ 0.04684 0.00 3.410 0 0.4890 6.4170 66.10 3.0923 2 270.0 17.80 392.18 8.81 22.60
+ 0.03932 0.00 3.410 0 0.4890 6.4050 73.90 3.0921 2 270.0 17.80 393.55 8.20 22.00
+ 0.04203 28.00 15.040 0 0.4640 6.4420 53.60 3.6659 4 270.0 18.20 395.01 8.16 22.90
+ 0.02875 28.00 15.040 0 0.4640 6.2110 28.90 3.6659 4 270.0 18.20 396.33 6.21 25.00
+ 0.04294 28.00 15.040 0 0.4640 6.2490 77.30 3.6150 4 270.0 18.20 396.90 10.59 20.60
+ 0.12204 0.00 2.890 0 0.4450 6.6250 57.80 3.4952 2 276.0 18.00 357.98 6.65 28.40
+ 0.11504 0.00 2.890 0 0.4450 6.1630 69.60 3.4952 2 276.0 18.00 391.83 11.34 21.40
+ 0.12083 0.00 2.890 0 0.4450 8.0690 76.00 3.4952 2 276.0 18.00 396.90 4.21 38.70
+ 0.08187 0.00 2.890 0 0.4450 7.8200 36.90 3.4952 2 276.0 18.00 393.53 3.57 43.80
+ 0.06860 0.00 2.890 0 0.4450 7.4160 62.50 3.4952 2 276.0 18.00 396.90 6.19 33.20
+ 0.14866 0.00 8.560 0 0.5200 6.7270 79.90 2.7778 5 384.0 20.90 394.76 9.42 27.50
+ 0.11432 0.00 8.560 0 0.5200 6.7810 71.30 2.8561 5 384.0 20.90 395.58 7.67 26.50
+ 0.22876 0.00 8.560 0 0.5200 6.4050 85.40 2.7147 5 384.0 20.90 70.80 10.63 18.60
+ 0.21161 0.00 8.560 0 0.5200 6.1370 87.40 2.7147 5 384.0 20.90 394.47 13.44 19.30
+ 0.13960 0.00 8.560 0 0.5200 6.1670 90.00 2.4210 5 384.0 20.90 392.69 12.33 20.10
+ 0.13262 0.00 8.560 0 0.5200 5.8510 96.70 2.1069 5 384.0 20.90 394.05 16.47 19.50
+ 0.17120 0.00 8.560 0 0.5200 5.8360 91.90 2.2110 5 384.0 20.90 395.67 18.66 19.50
+ 0.13117 0.00 8.560 0 0.5200 6.1270 85.20 2.1224 5 384.0 20.90 387.69 14.09 20.40
+ 0.12802 0.00 8.560 0 0.5200 6.4740 97.10 2.4329 5 384.0 20.90 395.24 12.27 19.80
+ 0.26363 0.00 8.560 0 0.5200 6.2290 91.20 2.5451 5 384.0 20.90 391.23 15.55 19.40
+ 0.10793 0.00 8.560 0 0.5200 6.1950 54.40 2.7778 5 384.0 20.90 393.49 13.00 21.70
+ 0.10084 0.00 10.010 0 0.5470 6.7150 81.60 2.6775 6 432.0 17.80 395.59 10.16 22.80
+ 0.12329 0.00 10.010 0 0.5470 5.9130 92.90 2.3534 6 432.0 17.80 394.95 16.21 18.80
+ 0.22212 0.00 10.010 0 0.5470 6.0920 95.40 2.5480 6 432.0 17.80 396.90 17.09 18.70
+ 0.14231 0.00 10.010 0 0.5470 6.2540 84.20 2.2565 6 432.0 17.80 388.74 10.45 18.50
+ 0.17134 0.00 10.010 0 0.5470 5.9280 88.20 2.4631 6 432.0 17.80 344.91 15.76 18.30
+ 0.13158 0.00 10.010 0 0.5470 6.1760 72.50 2.7301 6 432.0 17.80 393.30 12.04 21.20
+ 0.15098 0.00 10.010 0 0.5470 6.0210 82.60 2.7474 6 432.0 17.80 394.51 10.30 19.20
+ 0.13058 0.00 10.010 0 0.5470 5.8720 73.10 2.4775 6 432.0 17.80 338.63 15.37 20.40
+ 0.14476 0.00 10.010 0 0.5470 5.7310 65.20 2.7592 6 432.0 17.80 391.50 13.61 19.30
+ 0.06899 0.00 25.650 0 0.5810 5.8700 69.70 2.2577 2 188.0 19.10 389.15 14.37 22.00
+ 0.07165 0.00 25.650 0 0.5810 6.0040 84.10 2.1974 2 188.0 19.10 377.67 14.27 20.30
+ 0.09299 0.00 25.650 0 0.5810 5.9610 92.90 2.0869 2 188.0 19.10 378.09 17.93 20.50
+ 0.15038 0.00 25.650 0 0.5810 5.8560 97.00 1.9444 2 188.0 19.10 370.31 25.41 17.30
+ 0.09849 0.00 25.650 0 0.5810 5.8790 95.80 2.0063 2 188.0 19.10 379.38 17.58 18.80
+ 0.16902 0.00 25.650 0 0.5810 5.9860 88.40 1.9929 2 188.0 19.10 385.02 14.81 21.40
+ 0.38735 0.00 25.650 0 0.5810 5.6130 95.60 1.7572 2 188.0 19.10 359.29 27.26 15.70
+ 0.25915 0.00 21.890 0 0.6240 5.6930 96.00 1.7883 4 437.0 21.20 392.11 17.19 16.20
+ 0.32543 0.00 21.890 0 0.6240 6.4310 98.80 1.8125 4 437.0 21.20 396.90 15.39 18.00
+ 0.88125 0.00 21.890 0 0.6240 5.6370 94.70 1.9799 4 437.0 21.20 396.90 18.34 14.30
+ 0.34006 0.00 21.890 0 0.6240 6.4580 98.90 2.1185 4 437.0 21.20 395.04 12.60 19.20
+ 1.19294 0.00 21.890 0 0.6240 6.3260 97.70 2.2710 4 437.0 21.20 396.90 12.26 19.60
+ 0.59005 0.00 21.890 0 0.6240 6.3720 97.90 2.3274 4 437.0 21.20 385.76 11.12 23.00
+ 0.32982 0.00 21.890 0 0.6240 5.8220 95.40 2.4699 4 437.0 21.20 388.69 15.03 18.40
+ 0.97617 0.00 21.890 0 0.6240 5.7570 98.40 2.3460 4 437.0 21.20 262.76 17.31 15.60
+ 0.55778 0.00 21.890 0 0.6240 6.3350 98.20 2.1107 4 437.0 21.20 394.67 16.96 18.10
+ 0.32264 0.00 21.890 0 0.6240 5.9420 93.50 1.9669 4 437.0 21.20 378.25 16.90 17.40
+ 0.35233 0.00 21.890 0 0.6240 6.4540 98.40 1.8498 4 437.0 21.20 394.08 14.59 17.10
+ 0.24980 0.00 21.890 0 0.6240 5.8570 98.20 1.6686 4 437.0 21.20 392.04 21.32 13.30
+ 0.54452 0.00 21.890 0 0.6240 6.1510 97.90 1.6687 4 437.0 21.20 396.90 18.46 17.80
+ 0.29090 0.00 21.890 0 0.6240 6.1740 93.60 1.6119 4 437.0 21.20 388.08 24.16 14.00
+ 1.62864 0.00 21.890 0 0.6240 5.0190 100.00 1.4394 4 437.0 21.20 396.90 34.41 14.40
+ 3.32105 0.00 19.580 1 0.8710 5.4030 100.00 1.3216 5 403.0 14.70 396.90 26.82 13.40
+ 4.09740 0.00 19.580 0 0.8710 5.4680 100.00 1.4118 5 403.0 14.70 396.90 26.42 15.60
+ 2.77974 0.00 19.580 0 0.8710 4.9030 97.80 1.3459 5 403.0 14.70 396.90 29.29 11.80
+ 2.37934 0.00 19.580 0 0.8710 6.1300 100.00 1.4191 5 403.0 14.70 172.91 27.80 13.80
+ 2.15505 0.00 19.580 0 0.8710 5.6280 100.00 1.5166 5 403.0 14.70 169.27 16.65 15.60
+ 2.36862 0.00 19.580 0 0.8710 4.9260 95.70 1.4608 5 403.0 14.70 391.71 29.53 14.60
+ 2.33099 0.00 19.580 0 0.8710 5.1860 93.80 1.5296 5 403.0 14.70 356.99 28.32 17.80
+ 2.73397 0.00 19.580 0 0.8710 5.5970 94.90 1.5257 5 403.0 14.70 351.85 21.45 15.40
+ 1.65660 0.00 19.580 0 0.8710 6.1220 97.30 1.6180 5 403.0 14.70 372.80 14.10 21.50
+ 1.49632 0.00 19.580 0 0.8710 5.4040 100.00 1.5916 5 403.0 14.70 341.60 13.28 19.60
+ 1.12658 0.00 19.580 1 0.8710 5.0120 88.00 1.6102 5 403.0 14.70 343.28 12.12 15.30
+ 2.14918 0.00 19.580 0 0.8710 5.7090 98.50 1.6232 5 403.0 14.70 261.95 15.79 19.40
+ 1.41385 0.00 19.580 1 0.8710 6.1290 96.00 1.7494 5 403.0 14.70 321.02 15.12 17.00
+ 3.53501 0.00 19.580 1 0.8710 6.1520 82.60 1.7455 5 403.0 14.70 88.01 15.02 15.60
+ 2.44668 0.00 19.580 0 0.8710 5.2720 94.00 1.7364 5 403.0 14.70 88.63 16.14 13.10
+ 1.22358 0.00 19.580 0 0.6050 6.9430 97.40 1.8773 5 403.0 14.70 363.43 4.59 41.30
+ 1.34284 0.00 19.580 0 0.6050 6.0660 100.00 1.7573 5 403.0 14.70 353.89 6.43 24.30
+ 1.42502 0.00 19.580 0 0.8710 6.5100 100.00 1.7659 5 403.0 14.70 364.31 7.39 23.30
+ 1.27346 0.00 19.580 1 0.6050 6.2500 92.60 1.7984 5 403.0 14.70 338.92 5.50 27.00
+ 1.46336 0.00 19.580 0 0.6050 7.4890 90.80 1.9709 5 403.0 14.70 374.43 1.73 50.00
+ 1.83377 0.00 19.580 1 0.6050 7.8020 98.20 2.0407 5 403.0 14.70 389.61 1.92 50.00
+ 1.51902 0.00 19.580 1 0.6050 8.3750 93.90 2.1620 5 403.0 14.70 388.45 3.32 50.00
+ 2.24236 0.00 19.580 0 0.6050 5.8540 91.80 2.4220 5 403.0 14.70 395.11 11.64 22.70
+ 2.92400 0.00 19.580 0 0.6050 6.1010 93.00 2.2834 5 403.0 14.70 240.16 9.81 25.00
+ 2.01019 0.00 19.580 0 0.6050 7.9290 96.20 2.0459 5 403.0 14.70 369.30 3.70 50.00
+ 1.80028 0.00 19.580 0 0.6050 5.8770 79.20 2.4259 5 403.0 14.70 227.61 12.14 23.80
+ 2.30040 0.00 19.580 0 0.6050 6.3190 96.10 2.1000 5 403.0 14.70 297.09 11.10 23.80
+ 2.44953 0.00 19.580 0 0.6050 6.4020 95.20 2.2625 5 403.0 14.70 330.04 11.32 22.30
+ 1.20742 0.00 19.580 0 0.6050 5.8750 94.60 2.4259 5 403.0 14.70 292.29 14.43 17.40
+ 2.31390 0.00 19.580 0 0.6050 5.8800 97.30 2.3887 5 403.0 14.70 348.13 12.03 19.10
+ 0.13914 0.00 4.050 0 0.5100 5.5720 88.50 2.5961 5 296.0 16.60 396.90 14.69 23.10
+ 0.09178 0.00 4.050 0 0.5100 6.4160 84.10 2.6463 5 296.0 16.60 395.50 9.04 23.60
+ 0.08447 0.00 4.050 0 0.5100 5.8590 68.70 2.7019 5 296.0 16.60 393.23 9.64 22.60
+ 0.06664 0.00 4.050 0 0.5100 6.5460 33.10 3.1323 5 296.0 16.60 390.96 5.33 29.40
+ 0.07022 0.00 4.050 0 0.5100 6.0200 47.20 3.5549 5 296.0 16.60 393.23 10.11 23.20
+ 0.05425 0.00 4.050 0 0.5100 6.3150 73.40 3.3175 5 296.0 16.60 395.60 6.29 24.60
+ 0.06642 0.00 4.050 0 0.5100 6.8600 74.40 2.9153 5 296.0 16.60 391.27 6.92 29.90
+ 0.05780 0.00 2.460 0 0.4880 6.9800 58.40 2.8290 3 193.0 17.80 396.90 5.04 37.20
+ 0.06588 0.00 2.460 0 0.4880 7.7650 83.30 2.7410 3 193.0 17.80 395.56 7.56 39.80
+ 0.06888 0.00 2.460 0 0.4880 6.1440 62.20 2.5979 3 193.0 17.80 396.90 9.45 36.20
+ 0.09103 0.00 2.460 0 0.4880 7.1550 92.20 2.7006 3 193.0 17.80 394.12 4.82 37.90
+ 0.10008 0.00 2.460 0 0.4880 6.5630 95.60 2.8470 3 193.0 17.80 396.90 5.68 32.50
+ 0.08308 0.00 2.460 0 0.4880 5.6040 89.80 2.9879 3 193.0 17.80 391.00 13.98 26.40
+ 0.06047 0.00 2.460 0 0.4880 6.1530 68.80 3.2797 3 193.0 17.80 387.11 13.15 29.60
+ 0.05602 0.00 2.460 0 0.4880 7.8310 53.60 3.1992 3 193.0 17.80 392.63 4.45 50.00
+ 0.07875 45.00 3.440 0 0.4370 6.7820 41.10 3.7886 5 398.0 15.20 393.87 6.68 32.00
+ 0.12579 45.00 3.440 0 0.4370 6.5560 29.10 4.5667 5 398.0 15.20 382.84 4.56 29.80
+ 0.08370 45.00 3.440 0 0.4370 7.1850 38.90 4.5667 5 398.0 15.20 396.90 5.39 34.90
+ 0.09068 45.00 3.440 0 0.4370 6.9510 21.50 6.4798 5 398.0 15.20 377.68 5.10 37.00
+ 0.06911 45.00 3.440 0 0.4370 6.7390 30.80 6.4798 5 398.0 15.20 389.71 4.69 30.50
+ 0.08664 45.00 3.440 0 0.4370 7.1780 26.30 6.4798 5 398.0 15.20 390.49 2.87 36.40
+ 0.02187 60.00 2.930 0 0.4010 6.8000 9.90 6.2196 1 265.0 15.60 393.37 5.03 31.10
+ 0.01439 60.00 2.930 0 0.4010 6.6040 18.80 6.2196 1 265.0 15.60 376.70 4.38 29.10
+ 0.01381 80.00 0.460 0 0.4220 7.8750 32.00 5.6484 4 255.0 14.40 394.23 2.97 50.00
+ 0.04011 80.00 1.520 0 0.4040 7.2870 34.10 7.3090 2 329.0 12.60 396.90 4.08 33.30
+ 0.04666 80.00 1.520 0 0.4040 7.1070 36.60 7.3090 2 329.0 12.60 354.31 8.61 30.30
+ 0.03768 80.00 1.520 0 0.4040 7.2740 38.30 7.3090 2 329.0 12.60 392.20 6.62 34.60
+ 0.03150 95.00 1.470 0 0.4030 6.9750 15.30 7.6534 3 402.0 17.00 396.90 4.56 34.90
+ 0.01778 95.00 1.470 0 0.4030 7.1350 13.90 7.6534 3 402.0 17.00 384.30 4.45 32.90
+ 0.03445 82.50 2.030 0 0.4150 6.1620 38.40 6.2700 2 348.0 14.70 393.77 7.43 24.10
+ 0.02177 82.50 2.030 0 0.4150 7.6100 15.70 6.2700 2 348.0 14.70 395.38 3.11 42.30
+ 0.03510 95.00 2.680 0 0.4161 7.8530 33.20 5.1180 4 224.0 14.70 392.78 3.81 48.50
+ 0.02009 95.00 2.680 0 0.4161 8.0340 31.90 5.1180 4 224.0 14.70 390.55 2.88 50.00
+ 0.13642 0.00 10.590 0 0.4890 5.8910 22.30 3.9454 4 277.0 18.60 396.90 10.87 22.60
+ 0.22969 0.00 10.590 0 0.4890 6.3260 52.50 4.3549 4 277.0 18.60 394.87 10.97 24.40
+ 0.25199 0.00 10.590 0 0.4890 5.7830 72.70 4.3549 4 277.0 18.60 389.43 18.06 22.50
+ 0.13587 0.00 10.590 1 0.4890 6.0640 59.10 4.2392 4 277.0 18.60 381.32 14.66 24.40
+ 0.43571 0.00 10.590 1 0.4890 5.3440 100.00 3.8750 4 277.0 18.60 396.90 23.09 20.00
+ 0.17446 0.00 10.590 1 0.4890 5.9600 92.10 3.8771 4 277.0 18.60 393.25 17.27 21.70
+ 0.37578 0.00 10.590 1 0.4890 5.4040 88.60 3.6650 4 277.0 18.60 395.24 23.98 19.30
+ 0.21719 0.00 10.590 1 0.4890 5.8070 53.80 3.6526 4 277.0 18.60 390.94 16.03 22.40
+ 0.14052 0.00 10.590 0 0.4890 6.3750 32.30 3.9454 4 277.0 18.60 385.81 9.38 28.10
+ 0.28955 0.00 10.590 0 0.4890 5.4120 9.80 3.5875 4 277.0 18.60 348.93 29.55 23.70
+ 0.19802 0.00 10.590 0 0.4890 6.1820 42.40 3.9454 4 277.0 18.60 393.63 9.47 25.00
+ 0.04560 0.00 13.890 1 0.5500 5.8880 56.00 3.1121 5 276.0 16.40 392.80 13.51 23.30
+ 0.07013 0.00 13.890 0 0.5500 6.6420 85.10 3.4211 5 276.0 16.40 392.78 9.69 28.70
+ 0.11069 0.00 13.890 1 0.5500 5.9510 93.80 2.8893 5 276.0 16.40 396.90 17.92 21.50
+ 0.11425 0.00 13.890 1 0.5500 6.3730 92.40 3.3633 5 276.0 16.40 393.74 10.50 23.00
+ 0.35809 0.00 6.200 1 0.5070 6.9510 88.50 2.8617 8 307.0 17.40 391.70 9.71 26.70
+ 0.40771 0.00 6.200 1 0.5070 6.1640 91.30 3.0480 8 307.0 17.40 395.24 21.46 21.70
+ 0.62356 0.00 6.200 1 0.5070 6.8790 77.70 3.2721 8 307.0 17.40 390.39 9.93 27.50
+ 0.61470 0.00 6.200 0 0.5070 6.6180 80.80 3.2721 8 307.0 17.40 396.90 7.60 30.10
+ 0.31533 0.00 6.200 0 0.5040 8.2660 78.30 2.8944 8 307.0 17.40 385.05 4.14 44.80
+ 0.52693 0.00 6.200 0 0.5040 8.7250 83.00 2.8944 8 307.0 17.40 382.00 4.63 50.00
+ 0.38214 0.00 6.200 0 0.5040 8.0400 86.50 3.2157 8 307.0 17.40 387.38 3.13 37.60
+ 0.41238 0.00 6.200 0 0.5040 7.1630 79.90 3.2157 8 307.0 17.40 372.08 6.36 31.60
+ 0.29819 0.00 6.200 0 0.5040 7.6860 17.00 3.3751 8 307.0 17.40 377.51 3.92 46.70
+ 0.44178 0.00 6.200 0 0.5040 6.5520 21.40 3.3751 8 307.0 17.40 380.34 3.76 31.50
+ 0.53700 0.00 6.200 0 0.5040 5.9810 68.10 3.6715 8 307.0 17.40 378.35 11.65 24.30
+ 0.46296 0.00 6.200 0 0.5040 7.4120 76.90 3.6715 8 307.0 17.40 376.14 5.25 31.70
+ 0.57529 0.00 6.200 0 0.5070 8.3370 73.30 3.8384 8 307.0 17.40 385.91 2.47 41.70
+ 0.33147 0.00 6.200 0 0.5070 8.2470 70.40 3.6519 8 307.0 17.40 378.95 3.95 48.30
+ 0.44791 0.00 6.200 1 0.5070 6.7260 66.50 3.6519 8 307.0 17.40 360.20 8.05 29.00
+ 0.33045 0.00 6.200 0 0.5070 6.0860 61.50 3.6519 8 307.0 17.40 376.75 10.88 24.00
+ 0.52058 0.00 6.200 1 0.5070 6.6310 76.50 4.1480 8 307.0 17.40 388.45 9.54 25.10
+ 0.51183 0.00 6.200 0 0.5070 7.3580 71.60 4.1480 8 307.0 17.40 390.07 4.73 31.50
+ 0.08244 30.00 4.930 0 0.4280 6.4810 18.50 6.1899 6 300.0 16.60 379.41 6.36 23.70
+ 0.09252 30.00 4.930 0 0.4280 6.6060 42.20 6.1899 6 300.0 16.60 383.78 7.37 23.30
+ 0.11329 30.00 4.930 0 0.4280 6.8970 54.30 6.3361 6 300.0 16.60 391.25 11.38 22.00
+ 0.10612 30.00 4.930 0 0.4280 6.0950 65.10 6.3361 6 300.0 16.60 394.62 12.40 20.10
+ 0.10290 30.00 4.930 0 0.4280 6.3580 52.90 7.0355 6 300.0 16.60 372.75 11.22 22.20
+ 0.12757 30.00 4.930 0 0.4280 6.3930 7.80 7.0355 6 300.0 16.60 374.71 5.19 23.70
+ 0.20608 22.00 5.860 0 0.4310 5.5930 76.50 7.9549 7 330.0 19.10 372.49 12.50 17.60
+ 0.19133 22.00 5.860 0 0.4310 5.6050 70.20 7.9549 7 330.0 19.10 389.13 18.46 18.50
+ 0.33983 22.00 5.860 0 0.4310 6.1080 34.90 8.0555 7 330.0 19.10 390.18 9.16 24.30
+ 0.19657 22.00 5.860 0 0.4310 6.2260 79.20 8.0555 7 330.0 19.10 376.14 10.15 20.50
+ 0.16439 22.00 5.860 0 0.4310 6.4330 49.10 7.8265 7 330.0 19.10 374.71 9.52 24.50
+ 0.19073 22.00 5.860 0 0.4310 6.7180 17.50 7.8265 7 330.0 19.10 393.74 6.56 26.20
+ 0.14030 22.00 5.860 0 0.4310 6.4870 13.00 7.3967 7 330.0 19.10 396.28 5.90 24.40
+ 0.21409 22.00 5.860 0 0.4310 6.4380 8.90 7.3967 7 330.0 19.10 377.07 3.59 24.80
+ 0.08221 22.00 5.860 0 0.4310 6.9570 6.80 8.9067 7 330.0 19.10 386.09 3.53 29.60
+ 0.36894 22.00 5.860 0 0.4310 8.2590 8.40 8.9067 7 330.0 19.10 396.90 3.54 42.80
+ 0.04819 80.00 3.640 0 0.3920 6.1080 32.00 9.2203 1 315.0 16.40 392.89 6.57 21.90
+ 0.03548 80.00 3.640 0 0.3920 5.8760 19.10 9.2203 1 315.0 16.40 395.18 9.25 20.90
+ 0.01538 90.00 3.750 0 0.3940 7.4540 34.20 6.3361 3 244.0 15.90 386.34 3.11 44.00
+ 0.61154 20.00 3.970 0 0.6470 8.7040 86.90 1.8010 5 264.0 13.00 389.70 5.12 50.00
+ 0.66351 20.00 3.970 0 0.6470 7.3330 100.00 1.8946 5 264.0 13.00 383.29 7.79 36.00
+ 0.65665 20.00 3.970 0 0.6470 6.8420 100.00 2.0107 5 264.0 13.00 391.93 6.90 30.10
+ 0.54011 20.00 3.970 0 0.6470 7.2030 81.80 2.1121 5 264.0 13.00 392.80 9.59 33.80
+ 0.53412 20.00 3.970 0 0.6470 7.5200 89.40 2.1398 5 264.0 13.00 388.37 7.26 43.10
+ 0.52014 20.00 3.970 0 0.6470 8.3980 91.50 2.2885 5 264.0 13.00 386.86 5.91 48.80
+ 0.82526 20.00 3.970 0 0.6470 7.3270 94.50 2.0788 5 264.0 13.00 393.42 11.25 31.00
+ 0.55007 20.00 3.970 0 0.6470 7.2060 91.60 1.9301 5 264.0 13.00 387.89 8.10 36.50
+ 0.76162 20.00 3.970 0 0.6470 5.5600 62.80 1.9865 5 264.0 13.00 392.40 10.45 22.80
+ 0.78570 20.00 3.970 0 0.6470 7.0140 84.60 2.1329 5 264.0 13.00 384.07 14.79 30.70
+ 0.57834 20.00 3.970 0 0.5750 8.2970 67.00 2.4216 5 264.0 13.00 384.54 7.44 50.00
+ 0.54050 20.00 3.970 0 0.5750 7.4700 52.60 2.8720 5 264.0 13.00 390.30 3.16 43.50
+ 0.09065 20.00 6.960 1 0.4640 5.9200 61.50 3.9175 3 223.0 18.60 391.34 13.65 20.70
+ 0.29916 20.00 6.960 0 0.4640 5.8560 42.10 4.4290 3 223.0 18.60 388.65 13.00 21.10
+ 0.16211 20.00 6.960 0 0.4640 6.2400 16.30 4.4290 3 223.0 18.60 396.90 6.59 25.20
+ 0.11460 20.00 6.960 0 0.4640 6.5380 58.70 3.9175 3 223.0 18.60 394.96 7.73 24.40
+ 0.22188 20.00 6.960 1 0.4640 7.6910 51.80 4.3665 3 223.0 18.60 390.77 6.58 35.20
+ 0.05644 40.00 6.410 1 0.4470 6.7580 32.90 4.0776 4 254.0 17.60 396.90 3.53 32.40
+ 0.09604 40.00 6.410 0 0.4470 6.8540 42.80 4.2673 4 254.0 17.60 396.90 2.98 32.00
+ 0.10469 40.00 6.410 1 0.4470 7.2670 49.00 4.7872 4 254.0 17.60 389.25 6.05 33.20
+ 0.06127 40.00 6.410 1 0.4470 6.8260 27.60 4.8628 4 254.0 17.60 393.45 4.16 33.10
+ 0.07978 40.00 6.410 0 0.4470 6.4820 32.10 4.1403 4 254.0 17.60 396.90 7.19 29.10
+ 0.21038 20.00 3.330 0 0.4429 6.8120 32.20 4.1007 5 216.0 14.90 396.90 4.85 35.10
+ 0.03578 20.00 3.330 0 0.4429 7.8200 64.50 4.6947 5 216.0 14.90 387.31 3.76 45.40
+ 0.03705 20.00 3.330 0 0.4429 6.9680 37.20 5.2447 5 216.0 14.90 392.23 4.59 35.40
+ 0.06129 20.00 3.330 1 0.4429 7.6450 49.70 5.2119 5 216.0 14.90 377.07 3.01 46.00
+ 0.01501 90.00 1.210 1 0.4010 7.9230 24.80 5.8850 1 198.0 13.60 395.52 3.16 50.00
+ 0.00906 90.00 2.970 0 0.4000 7.0880 20.80 7.3073 1 285.0 15.30 394.72 7.85 32.20
+ 0.01096 55.00 2.250 0 0.3890 6.4530 31.90 7.3073 1 300.0 15.30 394.72 8.23 22.00
+ 0.01965 80.00 1.760 0 0.3850 6.2300 31.50 9.0892 1 241.0 18.20 341.60 12.93 20.10
+ 0.03871 52.50 5.320 0 0.4050 6.2090 31.30 7.3172 6 293.0 16.60 396.90 7.14 23.20
+ 0.04590 52.50 5.320 0 0.4050 6.3150 45.60 7.3172 6 293.0 16.60 396.90 7.60 22.30
+ 0.04297 52.50 5.320 0 0.4050 6.5650 22.90 7.3172 6 293.0 16.60 371.72 9.51 24.80
+ 0.03502 80.00 4.950 0 0.4110 6.8610 27.90 5.1167 4 245.0 19.20 396.90 3.33 28.50
+ 0.07886 80.00 4.950 0 0.4110 7.1480 27.70 5.1167 4 245.0 19.20 396.90 3.56 37.30
+ 0.03615 80.00 4.950 0 0.4110 6.6300 23.40 5.1167 4 245.0 19.20 396.90 4.70 27.90
+ 0.08265 0.00 13.920 0 0.4370 6.1270 18.40 5.5027 4 289.0 16.00 396.90 8.58 23.90
+ 0.08199 0.00 13.920 0 0.4370 6.0090 42.30 5.5027 4 289.0 16.00 396.90 10.40 21.70
+ 0.12932 0.00 13.920 0 0.4370 6.6780 31.10 5.9604 4 289.0 16.00 396.90 6.27 28.60
+ 0.05372 0.00 13.920 0 0.4370 6.5490 51.00 5.9604 4 289.0 16.00 392.85 7.39 27.10
+ 0.14103 0.00 13.920 0 0.4370 5.7900 58.00 6.3200 4 289.0 16.00 396.90 15.84 20.30
+ 0.06466 70.00 2.240 0 0.4000 6.3450 20.10 7.8278 5 358.0 14.80 368.24 4.97 22.50
+ 0.05561 70.00 2.240 0 0.4000 7.0410 10.00 7.8278 5 358.0 14.80 371.58 4.74 29.00
+ 0.04417 70.00 2.240 0 0.4000 6.8710 47.40 7.8278 5 358.0 14.80 390.86 6.07 24.80
+ 0.03537 34.00 6.090 0 0.4330 6.5900 40.40 5.4917 7 329.0 16.10 395.75 9.50 22.00
+ 0.09266 34.00 6.090 0 0.4330 6.4950 18.40 5.4917 7 329.0 16.10 383.61 8.67 26.40
+ 0.10000 34.00 6.090 0 0.4330 6.9820 17.70 5.4917 7 329.0 16.10 390.43 4.86 33.10
+ 0.05515 33.00 2.180 0 0.4720 7.2360 41.10 4.0220 7 222.0 18.40 393.68 6.93 36.10
+ 0.05479 33.00 2.180 0 0.4720 6.6160 58.10 3.3700 7 222.0 18.40 393.36 8.93 28.40
+ 0.07503 33.00 2.180 0 0.4720 7.4200 71.90 3.0992 7 222.0 18.40 396.90 6.47 33.40
+ 0.04932 33.00 2.180 0 0.4720 6.8490 70.30 3.1827 7 222.0 18.40 396.90 7.53 28.20
+ 0.49298 0.00 9.900 0 0.5440 6.6350 82.50 3.3175 4 304.0 18.40 396.90 4.54 22.80
+ 0.34940 0.00 9.900 0 0.5440 5.9720 76.70 3.1025 4 304.0 18.40 396.24 9.97 20.30
+ 2.63548 0.00 9.900 0 0.5440 4.9730 37.80 2.5194 4 304.0 18.40 350.45 12.64 16.10
+ 0.79041 0.00 9.900 0 0.5440 6.1220 52.80 2.6403 4 304.0 18.40 396.90 5.98 22.10
+ 0.26169 0.00 9.900 0 0.5440 6.0230 90.40 2.8340 4 304.0 18.40 396.30 11.72 19.40
+ 0.26938 0.00 9.900 0 0.5440 6.2660 82.80 3.2628 4 304.0 18.40 393.39 7.90 21.60
+ 0.36920 0.00 9.900 0 0.5440 6.5670 87.30 3.6023 4 304.0 18.40 395.69 9.28 23.80
+ 0.25356 0.00 9.900 0 0.5440 5.7050 77.70 3.9450 4 304.0 18.40 396.42 11.50 16.20
+ 0.31827 0.00 9.900 0 0.5440 5.9140 83.20 3.9986 4 304.0 18.40 390.70 18.33 17.80
+ 0.24522 0.00 9.900 0 0.5440 5.7820 71.70 4.0317 4 304.0 18.40 396.90 15.94 19.80
+ 0.40202 0.00 9.900 0 0.5440 6.3820 67.20 3.5325 4 304.0 18.40 395.21 10.36 23.10
+ 0.47547 0.00 9.900 0 0.5440 6.1130 58.80 4.0019 4 304.0 18.40 396.23 12.73 21.00
+ 0.16760 0.00 7.380 0 0.4930 6.4260 52.30 4.5404 5 287.0 19.60 396.90 7.20 23.80
+ 0.18159 0.00 7.380 0 0.4930 6.3760 54.30 4.5404 5 287.0 19.60 396.90 6.87 23.10
+ 0.35114 0.00 7.380 0 0.4930 6.0410 49.90 4.7211 5 287.0 19.60 396.90 7.70 20.40
+ 0.28392 0.00 7.380 0 0.4930 5.7080 74.30 4.7211 5 287.0 19.60 391.13 11.74 18.50
+ 0.34109 0.00 7.380 0 0.4930 6.4150 40.10 4.7211 5 287.0 19.60 396.90 6.12 25.00
+ 0.19186 0.00 7.380 0 0.4930 6.4310 14.70 5.4159 5 287.0 19.60 393.68 5.08 24.60
+ 0.30347 0.00 7.380 0 0.4930 6.3120 28.90 5.4159 5 287.0 19.60 396.90 6.15 23.00
+ 0.24103 0.00 7.380 0 0.4930 6.0830 43.70 5.4159 5 287.0 19.60 396.90 12.79 22.20
+ 0.06617 0.00 3.240 0 0.4600 5.8680 25.80 5.2146 4 430.0 16.90 382.44 9.97 19.30
+ 0.06724 0.00 3.240 0 0.4600 6.3330 17.20 5.2146 4 430.0 16.90 375.21 7.34 22.60
+ 0.04544 0.00 3.240 0 0.4600 6.1440 32.20 5.8736 4 430.0 16.90 368.57 9.09 19.80
+ 0.05023 35.00 6.060 0 0.4379 5.7060 28.40 6.6407 1 304.0 16.90 394.02 12.43 17.10
+ 0.03466 35.00 6.060 0 0.4379 6.0310 23.30 6.6407 1 304.0 16.90 362.25 7.83 19.40
+ 0.05083 0.00 5.190 0 0.5150 6.3160 38.10 6.4584 5 224.0 20.20 389.71 5.68 22.20
+ 0.03738 0.00 5.190 0 0.5150 6.3100 38.50 6.4584 5 224.0 20.20 389.40 6.75 20.70
+ 0.03961 0.00 5.190 0 0.5150 6.0370 34.50 5.9853 5 224.0 20.20 396.90 8.01 21.10
+ 0.03427 0.00 5.190 0 0.5150 5.8690 46.30 5.2311 5 224.0 20.20 396.90 9.80 19.50
+ 0.03041 0.00 5.190 0 0.5150 5.8950 59.60 5.6150 5 224.0 20.20 394.81 10.56 18.50
+ 0.03306 0.00 5.190 0 0.5150 6.0590 37.30 4.8122 5 224.0 20.20 396.14 8.51 20.60
+ 0.05497 0.00 5.190 0 0.5150 5.9850 45.40 4.8122 5 224.0 20.20 396.90 9.74 19.00
+ 0.06151 0.00 5.190 0 0.5150 5.9680 58.50 4.8122 5 224.0 20.20 396.90 9.29 18.70
+ 0.01301 35.00 1.520 0 0.4420 7.2410 49.30 7.0379 1 284.0 15.50 394.74 5.49 32.70
+ 0.02498 0.00 1.890 0 0.5180 6.5400 59.70 6.2669 1 422.0 15.90 389.96 8.65 16.50
+ 0.02543 55.00 3.780 0 0.4840 6.6960 56.40 5.7321 5 370.0 17.60 396.90 7.18 23.90
+ 0.03049 55.00 3.780 0 0.4840 6.8740 28.10 6.4654 5 370.0 17.60 387.97 4.61 31.20
+ 0.03113 0.00 4.390 0 0.4420 6.0140 48.50 8.0136 3 352.0 18.80 385.64 10.53 17.50
+ 0.06162 0.00 4.390 0 0.4420 5.8980 52.30 8.0136 3 352.0 18.80 364.61 12.67 17.20
+ 0.01870 85.00 4.150 0 0.4290 6.5160 27.70 8.5353 4 351.0 17.90 392.43 6.36 23.10
+ 0.01501 80.00 2.010 0 0.4350 6.6350 29.70 8.3440 4 280.0 17.00 390.94 5.99 24.50
+ 0.02899 40.00 1.250 0 0.4290 6.9390 34.50 8.7921 1 335.0 19.70 389.85 5.89 26.60
+ 0.06211 40.00 1.250 0 0.4290 6.4900 44.40 8.7921 1 335.0 19.70 396.90 5.98 22.90
+ 0.07950 60.00 1.690 0 0.4110 6.5790 35.90 10.7103 4 411.0 18.30 370.78 5.49 24.10
+ 0.07244 60.00 1.690 0 0.4110 5.8840 18.50 10.7103 4 411.0 18.30 392.33 7.79 18.60
+ 0.01709 90.00 2.020 0 0.4100 6.7280 36.10 12.1265 5 187.0 17.00 384.46 4.50 30.10
+ 0.04301 80.00 1.910 0 0.4130 5.6630 21.90 10.5857 4 334.0 22.00 382.80 8.05 18.20
+ 0.10659 80.00 1.910 0 0.4130 5.9360 19.50 10.5857 4 334.0 22.00 376.04 5.57 20.60
+ 8.98296 0.00 18.100 1 0.7700 6.2120 97.40 2.1222 24 666.0 20.20 377.73 17.60 17.80
+ 3.84970 0.00 18.100 1 0.7700 6.3950 91.00 2.5052 24 666.0 20.20 391.34 13.27 21.70
+ 5.20177 0.00 18.100 1 0.7700 6.1270 83.40 2.7227 24 666.0 20.20 395.43 11.48 22.70
+ 4.26131 0.00 18.100 0 0.7700 6.1120 81.30 2.5091 24 666.0 20.20 390.74 12.67 22.60
+ 4.54192 0.00 18.100 0 0.7700 6.3980 88.00 2.5182 24 666.0 20.20 374.56 7.79 25.00
+ 3.83684 0.00 18.100 0 0.7700 6.2510 91.10 2.2955 24 666.0 20.20 350.65 14.19 19.90
+ 3.67822 0.00 18.100 0 0.7700 5.3620 96.20 2.1036 24 666.0 20.20 380.79 10.19 20.80
+ 4.22239 0.00 18.100 1 0.7700 5.8030 89.00 1.9047 24 666.0 20.20 353.04 14.64 16.80
+ 3.47428 0.00 18.100 1 0.7180 8.7800 82.90 1.9047 24 666.0 20.20 354.55 5.29 21.90
+ 4.55587 0.00 18.100 0 0.7180 3.5610 87.90 1.6132 24 666.0 20.20 354.70 7.12 27.50
+ 3.69695 0.00 18.100 0 0.7180 4.9630 91.40 1.7523 24 666.0 20.20 316.03 14.00 21.90
+13.52220 0.00 18.100 0 0.6310 3.8630 100.00 1.5106 24 666.0 20.20 131.42 13.33 23.10
+ 4.89822 0.00 18.100 0 0.6310 4.9700 100.00 1.3325 24 666.0 20.20 375.52 3.26 50.00
+ 5.66998 0.00 18.100 1 0.6310 6.6830 96.80 1.3567 24 666.0 20.20 375.33 3.73 50.00
+ 6.53876 0.00 18.100 1 0.6310 7.0160 97.50 1.2024 24 666.0 20.20 392.05 2.96 50.00
+ 9.23230 0.00 18.100 0 0.6310 6.2160 100.00 1.1691 24 666.0 20.20 366.15 9.53 50.00
+ 8.26725 0.00 18.100 1 0.6680 5.8750 89.60 1.1296 24 666.0 20.20 347.88 8.88 50.00
+11.10810 0.00 18.100 0 0.6680 4.9060 100.00 1.1742 24 666.0 20.20 396.90 34.77 13.80
+18.49820 0.00 18.100 0 0.6680 4.1380 100.00 1.1370 24 666.0 20.20 396.90 37.97 13.80
+19.60910 0.00 18.100 0 0.6710 7.3130 97.90 1.3163 24 666.0 20.20 396.90 13.44 15.00
+15.28800 0.00 18.100 0 0.6710 6.6490 93.30 1.3449 24 666.0 20.20 363.02 23.24 13.90
+ 9.82349 0.00 18.100 0 0.6710 6.7940 98.80 1.3580 24 666.0 20.20 396.90 21.24 13.30
+23.64820 0.00 18.100 0 0.6710 6.3800 96.20 1.3861 24 666.0 20.20 396.90 23.69 13.10
+17.86670 0.00 18.100 0 0.6710 6.2230 100.00 1.3861 24 666.0 20.20 393.74 21.78 10.20
+88.97620 0.00 18.100 0 0.6710 6.9680 91.90 1.4165 24 666.0 20.20 396.90 17.21 10.40
+15.87440 0.00 18.100 0 0.6710 6.5450 99.10 1.5192 24 666.0 20.20 396.90 21.08 10.90
+ 9.18702 0.00 18.100 0 0.7000 5.5360 100.00 1.5804 24 666.0 20.20 396.90 23.60 11.30
+ 7.99248 0.00 18.100 0 0.7000 5.5200 100.00 1.5331 24 666.0 20.20 396.90 24.56 12.30
+20.08490 0.00 18.100 0 0.7000 4.3680 91.20 1.4395 24 666.0 20.20 285.83 30.63 8.80
+16.81180 0.00 18.100 0 0.7000 5.2770 98.10 1.4261 24 666.0 20.20 396.90 30.81 7.20
+24.39380 0.00 18.100 0 0.7000 4.6520 100.00 1.4672 24 666.0 20.20 396.90 28.28 10.50
+22.59710 0.00 18.100 0 0.7000 5.0000 89.50 1.5184 24 666.0 20.20 396.90 31.99 7.40
+14.33370 0.00 18.100 0 0.7000 4.8800 100.00 1.5895 24 666.0 20.20 372.92 30.62 10.20
+ 8.15174 0.00 18.100 0 0.7000 5.3900 98.90 1.7281 24 666.0 20.20 396.90 20.85 11.50
+ 6.96215 0.00 18.100 0 0.7000 5.7130 97.00 1.9265 24 666.0 20.20 394.43 17.11 15.10
+ 5.29305 0.00 18.100 0 0.7000 6.0510 82.50 2.1678 24 666.0 20.20 378.38 18.76 23.20
+11.57790 0.00 18.100 0 0.7000 5.0360 97.00 1.7700 24 666.0 20.20 396.90 25.68 9.70
+ 8.64476 0.00 18.100 0 0.6930 6.1930 92.60 1.7912 24 666.0 20.20 396.90 15.17 13.80
+13.35980 0.00 18.100 0 0.6930 5.8870 94.70 1.7821 24 666.0 20.20 396.90 16.35 12.70
+ 8.71675 0.00 18.100 0 0.6930 6.4710 98.80 1.7257 24 666.0 20.20 391.98 17.12 13.10
+ 5.87205 0.00 18.100 0 0.6930 6.4050 96.00 1.6768 24 666.0 20.20 396.90 19.37 12.50
+ 7.67202 0.00 18.100 0 0.6930 5.7470 98.90 1.6334 24 666.0 20.20 393.10 19.92 8.50
+38.35180 0.00 18.100 0 0.6930 5.4530 100.00 1.4896 24 666.0 20.20 396.90 30.59 5.00
+ 9.91655 0.00 18.100 0 0.6930 5.8520 77.80 1.5004 24 666.0 20.20 338.16 29.97 6.30
+25.04610 0.00 18.100 0 0.6930 5.9870 100.00 1.5888 24 666.0 20.20 396.90 26.77 5.60
+14.23620 0.00 18.100 0 0.6930 6.3430 100.00 1.5741 24 666.0 20.20 396.90 20.32 7.20
+ 9.59571 0.00 18.100 0 0.6930 6.4040 100.00 1.6390 24 666.0 20.20 376.11 20.31 12.10
+24.80170 0.00 18.100 0 0.6930 5.3490 96.00 1.7028 24 666.0 20.20 396.90 19.77 8.30
+41.52920 0.00 18.100 0 0.6930 5.5310 85.40 1.6074 24 666.0 20.20 329.46 27.38 8.50
+67.92080 0.00 18.100 0 0.6930 5.6830 100.00 1.4254 24 666.0 20.20 384.97 22.98 5.00
+20.71620 0.00 18.100 0 0.6590 4.1380 100.00 1.1781 24 666.0 20.20 370.22 23.34 11.90
+11.95110 0.00 18.100 0 0.6590 5.6080 100.00 1.2852 24 666.0 20.20 332.09 12.13 27.90
+ 7.40389 0.00 18.100 0 0.5970 5.6170 97.90 1.4547 24 666.0 20.20 314.64 26.40 17.20
+14.43830 0.00 18.100 0 0.5970 6.8520 100.00 1.4655 24 666.0 20.20 179.36 19.78 27.50
+51.13580 0.00 18.100 0 0.5970 5.7570 100.00 1.4130 24 666.0 20.20 2.60 10.11 15.00
+14.05070 0.00 18.100 0 0.5970 6.6570 100.00 1.5275 24 666.0 20.20 35.05 21.22 17.20
+18.81100 0.00 18.100 0 0.5970 4.6280 100.00 1.5539 24 666.0 20.20 28.79 34.37 17.90
+28.65580 0.00 18.100 0 0.5970 5.1550 100.00 1.5894 24 666.0 20.20 210.97 20.08 16.30
+45.74610 0.00 18.100 0 0.6930 4.5190 100.00 1.6582 24 666.0 20.20 88.27 36.98 7.00
+18.08460 0.00 18.100 0 0.6790 6.4340 100.00 1.8347 24 666.0 20.20 27.25 29.05 7.20
+10.83420 0.00 18.100 0 0.6790 6.7820 90.80 1.8195 24 666.0 20.20 21.57 25.79 7.50
+25.94060 0.00 18.100 0 0.6790 5.3040 89.10 1.6475 24 666.0 20.20 127.36 26.64 10.40
+73.53410 0.00 18.100 0 0.6790 5.9570 100.00 1.8026 24 666.0 20.20 16.45 20.62 8.80
+11.81230 0.00 18.100 0 0.7180 6.8240 76.50 1.7940 24 666.0 20.20 48.45 22.74 8.40
+11.08740 0.00 18.100 0 0.7180 6.4110 100.00 1.8589 24 666.0 20.20 318.75 15.02 16.70
+ 7.02259 0.00 18.100 0 0.7180 6.0060 95.30 1.8746 24 666.0 20.20 319.98 15.70 14.20
+12.04820 0.00 18.100 0 0.6140 5.6480 87.60 1.9512 24 666.0 20.20 291.55 14.10 20.80
+ 7.05042 0.00 18.100 0 0.6140 6.1030 85.10 2.0218 24 666.0 20.20 2.52 23.29 13.40
+ 8.79212 0.00 18.100 0 0.5840 5.5650 70.60 2.0635 24 666.0 20.20 3.65 17.16 11.70
+15.86030 0.00 18.100 0 0.6790 5.8960 95.40 1.9096 24 666.0 20.20 7.68 24.39 8.30
+12.24720 0.00 18.100 0 0.5840 5.8370 59.70 1.9976 24 666.0 20.20 24.65 15.69 10.20
+37.66190 0.00 18.100 0 0.6790 6.2020 78.70 1.8629 24 666.0 20.20 18.82 14.52 10.90
+ 7.36711 0.00 18.100 0 0.6790 6.1930 78.10 1.9356 24 666.0 20.20 96.73 21.52 11.00
+ 9.33889 0.00 18.100 0 0.6790 6.3800 95.60 1.9682 24 666.0 20.20 60.72 24.08 9.50
+ 8.49213 0.00 18.100 0 0.5840 6.3480 86.10 2.0527 24 666.0 20.20 83.45 17.64 14.50
+10.06230 0.00 18.100 0 0.5840 6.8330 94.30 2.0882 24 666.0 20.20 81.33 19.69 14.10
+ 6.44405 0.00 18.100 0 0.5840 6.4250 74.80 2.2004 24 666.0 20.20 97.95 12.03 16.10
+ 5.58107 0.00 18.100 0 0.7130 6.4360 87.90 2.3158 24 666.0 20.20 100.19 16.22 14.30
+13.91340 0.00 18.100 0 0.7130 6.2080 95.00 2.2222 24 666.0 20.20 100.63 15.17 11.70
+11.16040 0.00 18.100 0 0.7400 6.6290 94.60 2.1247 24 666.0 20.20 109.85 23.27 13.40
+14.42080 0.00 18.100 0 0.7400 6.4610 93.30 2.0026 24 666.0 20.20 27.49 18.05 9.60
+15.17720 0.00 18.100 0 0.7400 6.1520 100.00 1.9142 24 666.0 20.20 9.32 26.45 8.70
+13.67810 0.00 18.100 0 0.7400 5.9350 87.90 1.8206 24 666.0 20.20 68.95 34.02 8.40
+ 9.39063 0.00 18.100 0 0.7400 5.6270 93.90 1.8172 24 666.0 20.20 396.90 22.88 12.80
+22.05110 0.00 18.100 0 0.7400 5.8180 92.40 1.8662 24 666.0 20.20 391.45 22.11 10.50
+ 9.72418 0.00 18.100 0 0.7400 6.4060 97.20 2.0651 24 666.0 20.20 385.96 19.52 17.10
+ 5.66637 0.00 18.100 0 0.7400 6.2190 100.00 2.0048 24 666.0 20.20 395.69 16.59 18.40
+ 9.96654 0.00 18.100 0 0.7400 6.4850 100.00 1.9784 24 666.0 20.20 386.73 18.85 15.40
+12.80230 0.00 18.100 0 0.7400 5.8540 96.60 1.8956 24 666.0 20.20 240.52 23.79 10.80
+10.67180 0.00 18.100 0 0.7400 6.4590 94.80 1.9879 24 666.0 20.20 43.06 23.98 11.80
+ 6.28807 0.00 18.100 0 0.7400 6.3410 96.40 2.0720 24 666.0 20.20 318.01 17.79 14.90
+ 9.92485 0.00 18.100 0 0.7400 6.2510 96.60 2.1980 24 666.0 20.20 388.52 16.44 12.60
+ 9.32909 0.00 18.100 0 0.7130 6.1850 98.70 2.2616 24 666.0 20.20 396.90 18.13 14.10
+ 7.52601 0.00 18.100 0 0.7130 6.4170 98.30 2.1850 24 666.0 20.20 304.21 19.31 13.00
+ 6.71772 0.00 18.100 0 0.7130 6.7490 92.60 2.3236 24 666.0 20.20 0.32 17.44 13.40
+ 5.44114 0.00 18.100 0 0.7130 6.6550 98.20 2.3552 24 666.0 20.20 355.29 17.73 15.20
+ 5.09017 0.00 18.100 0 0.7130 6.2970 91.80 2.3682 24 666.0 20.20 385.09 17.27 16.10
+ 8.24809 0.00 18.100 0 0.7130 7.3930 99.30 2.4527 24 666.0 20.20 375.87 16.74 17.80
+ 9.51363 0.00 18.100 0 0.7130 6.7280 94.10 2.4961 24 666.0 20.20 6.68 18.71 14.90
+ 4.75237 0.00 18.100 0 0.7130 6.5250 86.50 2.4358 24 666.0 20.20 50.92 18.13 14.10
+ 4.66883 0.00 18.100 0 0.7130 5.9760 87.90 2.5806 24 666.0 20.20 10.48 19.01 12.70
+ 8.20058 0.00 18.100 0 0.7130 5.9360 80.30 2.7792 24 666.0 20.20 3.50 16.94 13.50
+ 7.75223 0.00 18.100 0 0.7130 6.3010 83.70 2.7831 24 666.0 20.20 272.21 16.23 14.90
+ 6.80117 0.00 18.100 0 0.7130 6.0810 84.40 2.7175 24 666.0 20.20 396.90 14.70 20.00
+ 4.81213 0.00 18.100 0 0.7130 6.7010 90.00 2.5975 24 666.0 20.20 255.23 16.42 16.40
+ 3.69311 0.00 18.100 0 0.7130 6.3760 88.40 2.5671 24 666.0 20.20 391.43 14.65 17.70
+ 6.65492 0.00 18.100 0 0.7130 6.3170 83.00 2.7344 24 666.0 20.20 396.90 13.99 19.50
+ 5.82115 0.00 18.100 0 0.7130 6.5130 89.90 2.8016 24 666.0 20.20 393.82 10.29 20.20
+ 7.83932 0.00 18.100 0 0.6550 6.2090 65.40 2.9634 24 666.0 20.20 396.90 13.22 21.40
+ 3.16360 0.00 18.100 0 0.6550 5.7590 48.20 3.0665 24 666.0 20.20 334.40 14.13 19.90
+ 3.77498 0.00 18.100 0 0.6550 5.9520 84.70 2.8715 24 666.0 20.20 22.01 17.15 19.00
+ 4.42228 0.00 18.100 0 0.5840 6.0030 94.50 2.5403 24 666.0 20.20 331.29 21.32 19.10
+15.57570 0.00 18.100 0 0.5800 5.9260 71.00 2.9084 24 666.0 20.20 368.74 18.13 19.10
+13.07510 0.00 18.100 0 0.5800 5.7130 56.70 2.8237 24 666.0 20.20 396.90 14.76 20.10
+ 4.34879 0.00 18.100 0 0.5800 6.1670 84.00 3.0334 24 666.0 20.20 396.90 16.29 19.90
+ 4.03841 0.00 18.100 0 0.5320 6.2290 90.70 3.0993 24 666.0 20.20 395.33 12.87 19.60
+ 3.56868 0.00 18.100 0 0.5800 6.4370 75.00 2.8965 24 666.0 20.20 393.37 14.36 23.20
+ 4.64689 0.00 18.100 0 0.6140 6.9800 67.60 2.5329 24 666.0 20.20 374.68 11.66 29.80
+ 8.05579 0.00 18.100 0 0.5840 5.4270 95.40 2.4298 24 666.0 20.20 352.58 18.14 13.80
+ 6.39312 0.00 18.100 0 0.5840 6.1620 97.40 2.2060 24 666.0 20.20 302.76 24.10 13.30
+ 4.87141 0.00 18.100 0 0.6140 6.4840 93.60 2.3053 24 666.0 20.20 396.21 18.68 16.70
+15.02340 0.00 18.100 0 0.6140 5.3040 97.30 2.1007 24 666.0 20.20 349.48 24.91 12.00
+10.23300 0.00 18.100 0 0.6140 6.1850 96.70 2.1705 24 666.0 20.20 379.70 18.03 14.60
+14.33370 0.00 18.100 0 0.6140 6.2290 88.00 1.9512 24 666.0 20.20 383.32 13.11 21.40
+ 5.82401 0.00 18.100 0 0.5320 6.2420 64.70 3.4242 24 666.0 20.20 396.90 10.74 23.00
+ 5.70818 0.00 18.100 0 0.5320 6.7500 74.90 3.3317 24 666.0 20.20 393.07 7.74 23.70
+ 5.73116 0.00 18.100 0 0.5320 7.0610 77.00 3.4106 24 666.0 20.20 395.28 7.01 25.00
+ 2.81838 0.00 18.100 0 0.5320 5.7620 40.30 4.0983 24 666.0 20.20 392.92 10.42 21.80
+ 2.37857 0.00 18.100 0 0.5830 5.8710 41.90 3.7240 24 666.0 20.20 370.73 13.34 20.60
+ 3.67367 0.00 18.100 0 0.5830 6.3120 51.90 3.9917 24 666.0 20.20 388.62 10.58 21.20
+ 5.69175 0.00 18.100 0 0.5830 6.1140 79.80 3.5459 24 666.0 20.20 392.68 14.98 19.10
+ 4.83567 0.00 18.100 0 0.5830 5.9050 53.20 3.1523 24 666.0 20.20 388.22 11.45 20.60
+ 0.15086 0.00 27.740 0 0.6090 5.4540 92.70 1.8209 4 711.0 20.10 395.09 18.06 15.20
+ 0.18337 0.00 27.740 0 0.6090 5.4140 98.30 1.7554 4 711.0 20.10 344.05 23.97 7.00
+ 0.20746 0.00 27.740 0 0.6090 5.0930 98.00 1.8226 4 711.0 20.10 318.43 29.68 8.10
+ 0.10574 0.00 27.740 0 0.6090 5.9830 98.80 1.8681 4 711.0 20.10 390.11 18.07 13.60
+ 0.11132 0.00 27.740 0 0.6090 5.9830 83.50 2.1099 4 711.0 20.10 396.90 13.35 20.10
+ 0.17331 0.00 9.690 0 0.5850 5.7070 54.00 2.3817 6 391.0 19.20 396.90 12.01 21.80
+ 0.27957 0.00 9.690 0 0.5850 5.9260 42.60 2.3817 6 391.0 19.20 396.90 13.59 24.50
+ 0.17899 0.00 9.690 0 0.5850 5.6700 28.80 2.7986 6 391.0 19.20 393.29 17.60 23.10
+ 0.28960 0.00 9.690 0 0.5850 5.3900 72.90 2.7986 6 391.0 19.20 396.90 21.14 19.70
+ 0.26838 0.00 9.690 0 0.5850 5.7940 70.60 2.8927 6 391.0 19.20 396.90 14.10 18.30
+ 0.23912 0.00 9.690 0 0.5850 6.0190 65.30 2.4091 6 391.0 19.20 396.90 12.92 21.20
+ 0.17783 0.00 9.690 0 0.5850 5.5690 73.50 2.3999 6 391.0 19.20 395.77 15.10 17.50
+ 0.22438 0.00 9.690 0 0.5850 6.0270 79.70 2.4982 6 391.0 19.20 396.90 14.33 16.80
+ 0.06263 0.00 11.930 0 0.5730 6.5930 69.10 2.4786 1 273.0 21.00 391.99 9.67 22.40
+ 0.04527 0.00 11.930 0 0.5730 6.1200 76.70 2.2875 1 273.0 21.00 396.90 9.08 20.60
+ 0.06076 0.00 11.930 0 0.5730 6.9760 91.00 2.1675 1 273.0 21.00 396.90 5.64 23.90
+ 0.10959 0.00 11.930 0 0.5730 6.7940 89.30 2.3889 1 273.0 21.00 393.45 6.48 22.00
+ 0.04741 0.00 11.930 0 0.5730 6.0300 80.80 2.5050 1 273.0 21.00 396.90 7.88 11.90