merge with commit of dgplvm

2026-06-02 14:45:15 +02:00 · 2015-04-20 16:02:19 +01:00 · 2015-04-20 16:02:19 +01:00 · 401374d068
commit 401374d068
parent 17fa8a0ada 440d7b6478
152 changed files with 4272 additions and 1875 deletions
--- a/GPy/core/init.py
+++ b/GPy/core/init.py
@ -1,12 +1,12 @@
 # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from model import *
-from parameterization.parameterized import adjust_name_for_printing, Parameterizable
-from parameterization.param import Param, ParamConcatenation
-from parameterization.observable_array import ObsAr
+from .model import *
+from .parameterization.parameterized import adjust_name_for_printing, Parameterizable
+from .parameterization.param import Param, ParamConcatenation
+from .parameterization.observable_array import ObsAr

-from gp import GP
-from svgp import SVGP
-from sparse_gp import SparseGP
-from mapping import *
+from .gp import GP
+from .svgp import SVGP
+from .sparse_gp import SparseGP
+from .mapping import *
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@ -4,13 +4,15 @@
 import numpy as np
 import sys
 from .. import kern
-from model import Model
-from parameterization import ObsAr
+from .model import Model
+from .parameterization import ObsAr
+from .mapping import Mapping
 from .. import likelihoods
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
-from parameterization.variational import VariationalPosterior
+from .parameterization.variational import VariationalPosterior

 import logging
+import warnings
 from GPy.util.normalizer import MeanNorm
 logger = logging.getLogger("GP")

@ -34,7 +36,7 @@ class GP(Model):


    """
-    def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
+    def __init__(self, X, Y, kernel, likelihood, mean_function=None, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
        super(GP, self).__init__(name)

        assert X.ndim == 2
@ -62,10 +64,14 @@ class GP(Model):
            self.Y = ObsAr(Y)
            self.Y_normalized = self.Y

-        assert Y.shape[0] == self.num_data
+        if Y.shape[0] != self.num_data:
+            #There can be cases where we want more inputs than outputs, for example if we have multiple latent
+            #function values
+            warnings.warn("There are more rows in your input data X, \
+                         than in your output data Y, be VERY sure this is what you want")
        _, self.output_dim = self.Y.shape

-        #TODO: check the type of this is okay?
+        assert ((Y_metadata is None) or isinstance(Y_metadata, dict))
        self.Y_metadata = Y_metadata

        assert isinstance(kernel, kern.Kern)
@ -75,6 +81,15 @@ class GP(Model):
        assert isinstance(likelihood, likelihoods.Likelihood)
        self.likelihood = likelihood

+        #handle the mean function
+        self.mean_function = mean_function
+        if mean_function is not None:
+            assert isinstance(self.mean_function, Mapping)
+            assert mean_function.input_dim == self.input_dim
+            assert mean_function.output_dim == self.output_dim
+            self.link_parameter(mean_function)
+
+
        #find a sensible inference method
        logger.info("initializing inference method")
        if inference_method is None:
@ -82,14 +97,16 @@ class GP(Model):
                inference_method = exact_gaussian_inference.ExactGaussianInference()
            else:
                inference_method = expectation_propagation.EP()
-                print "defaulting to ", inference_method, "for latent function inference"
+                print("defaulting to ", inference_method, "for latent function inference")
        self.inference_method = inference_method

        logger.info("adding kernel and likelihood as parameters")
        self.link_parameter(self.kern)
        self.link_parameter(self.likelihood)
+        self.posterior = None

-    def set_XY(self, X=None, Y=None):
+
+    def set_XY(self, X=None, Y=None, trigger_update=True):
        """
        Set the input / output data of the model
        This is useful if we wish to change our existing data but maintain the same model
@ -99,7 +116,7 @@ class GP(Model):
        :param Y: output observations
        :type Y: np.ndarray
        """
-        self.update_model(False)
+        if trigger_update: self.update_model(False)
        if Y is not None:
            if self.normalizer is not None:
                self.normalizer.scale_by(Y)
@ -123,26 +140,26 @@ class GP(Model):
                    self.link_parameters(self.X)
            else:
                self.X = ObsAr(X)
-        self.update_model(True)
-        self._trigger_params_changed()
+        if trigger_update: self.update_model(True)
+        if trigger_update: self._trigger_params_changed()

-    def set_X(self,X):
+    def set_X(self,X, trigger_update=True):
        """
        Set the input data of the model

        :param X: input observations
        :type X: np.ndarray
        """
-        self.set_XY(X=X)
+        self.set_XY(X=X, trigger_update=trigger_update)

-    def set_Y(self,Y):
+    def set_Y(self,Y, trigger_update=True):
        """
        Set the output data of the model

        :param X: output observations
        :type X: np.ndarray
        """
-        self.set_XY(Y=Y)
+        self.set_XY(Y=Y, trigger_update=trigger_update)

    def parameters_changed(self):
        """
@ -153,9 +170,11 @@ class GP(Model):
            This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call
            this method yourself, there may be unexpected consequences.
        """
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata)
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.mean_function, self.Y_metadata)
        self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
        self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X)
+        if self.mean_function is not None:
+            self.mean_function.update_gradients(self.grad_dict['dL_dm'], self.X)

    def log_likelihood(self):
        """
@ -192,6 +211,10 @@ class GP(Model):

        #force mu to be a column vector
        if len(mu.shape)==1: mu = mu[:,None]
+
+        #add the mean function in
+        if not self.mean_function is None:
+            mu += self.mean_function.f(_Xnew)
        return mu, var

    def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
@ -241,12 +264,14 @@ class GP(Model):

    def predictive_gradients(self, Xnew):
        """
-        Compute the derivatives of the latent function with respect to X*
+        Compute the derivatives of the predicted latent function with respect to X*

        Given a set of points at which to predict X* (size [N*,Q]), compute the
        derivatives of the mean and variance. Resulting arrays are sized:
         dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).

+        Note that this is not the same as computing the mean and variance of the derivative of the function!
+
         dv_dX*  -- [N*, Q],    (since all outputs have the same variance)
        :param X: The points at which to get the predictive gradients
        :type X: np.ndarray (Xnew x self.input_dim)
@ -276,7 +301,7 @@ class GP(Model):
        :type size: int.
        :param full_cov: whether to return the full covariance matrix, or just the diagonal.
        :type full_cov: bool.
-        :returns: Ysim: set of simulations
+        :returns: fsim: set of simulations
        :rtype: np.ndarray (N x samples)
        """
        m, v = self._raw_predict(X,  full_cov=full_cov)
@ -284,11 +309,11 @@ class GP(Model):
            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
        v = v.reshape(m.size,-1) if len(v.shape)==3 else v
        if not full_cov:
-            Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
+            fsim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
        else:
-            Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
+            fsim = np.random.multivariate_normal(m.flatten(), v, size).T

-        return Ysim
+        return fsim

    def posterior_samples(self, X, size=10, full_cov=False, Y_metadata=None):
        """
@ -304,16 +329,16 @@ class GP(Model):
        :type noise_model: integer.
        :returns: Ysim: set of simulations, a Numpy array (N x samples).
        """
-        Ysim = self.posterior_samples_f(X, size, full_cov=full_cov)
-        Ysim = self.likelihood.samples(Ysim, Y_metadata)
-
+        fsim = self.posterior_samples_f(X, size, full_cov=full_cov)
+        Ysim = self.likelihood.samples(fsim, Y_metadata)
        return Ysim

    def plot_f(self, plot_limits=None, which_data_rows='all',
        which_data_ycols='all', fixed_inputs=[],
        levels=20, samples=0, fignum=None, ax=None, resolution=None,
        plot_raw=True,
-        linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
+        linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx',
+        apply_link=False):
        """
        Plot the GP's view of the world, where the data is normalized and before applying a likelihood.
        This is a call to plot with plot_raw=True.
@ -350,6 +375,8 @@ class GP(Model):
        :type Y_metadata: dict
        :param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
        :type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
+        :param apply_link: if there is a link function of the likelihood, plot the link(f*) rather than f*
+        :type apply_link: boolean
        """
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ..plotting.matplot_dep import models_plots
@ -362,7 +389,7 @@ class GP(Model):
                                     which_data_ycols, fixed_inputs,
                                     levels, samples, fignum, ax, resolution,
                                     plot_raw=plot_raw, Y_metadata=Y_metadata,
-                                     data_symbol=data_symbol, **kw)
+                                     data_symbol=data_symbol, apply_link=apply_link, **kw)

    def plot(self, plot_limits=None, which_data_rows='all',
        which_data_ycols='all', fixed_inputs=[],
@ -441,7 +468,7 @@ class GP(Model):
        try:
            super(GP, self).optimize(optimizer, start, **kwargs)
        except KeyboardInterrupt:
-            print "KeyboardInterrupt caught, calling on_optimization_end() to round things up"
+            print("KeyboardInterrupt caught, calling on_optimization_end() to round things up")
            self.inference_method.on_optimization_end()
            raise

--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@ -1,13 +1,14 @@
 # Copyright (c) 2013,2014, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2015, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import sys
-from parameterization import Parameterized
+from .parameterization import Parameterized
 import numpy as np

 class Mapping(Parameterized):
    """
-    Base model for shared behavior between models that can act like a mapping.
+    Base model for shared mapping behaviours
    """

    def __init__(self, input_dim, output_dim, name='mapping'):
@ -18,49 +19,12 @@ class Mapping(Parameterized):
    def f(self, X):
        raise NotImplementedError

-    def df_dX(self, dL_df, X):
-        """Evaluate derivatives of mapping outputs with respect to inputs.
-
-        :param dL_df: gradient of the objective with respect to the function.
-        :type dL_df: ndarray (num_data x output_dim)
-        :param X: the input locations where derivatives are to be evaluated.
-        :type X: ndarray (num_data x input_dim)
-        :returns: matrix containing gradients of the function with respect to the inputs.
-        """
+    def gradients_X(self, dL_dF, X):
        raise NotImplementedError

-    def df_dtheta(self, dL_df, X):
-        """The gradient of the outputs of the mapping with respect to each of the parameters.
-
-        :param dL_df: gradient of the objective with respect to the function.
-        :type dL_df: ndarray (num_data x output_dim)
-        :param X: input locations where the function is evaluated.
-        :type X: ndarray (num_data x input_dim)
-        :returns: Matrix containing gradients with respect to parameters of each output for each input data.
-        :rtype: ndarray (num_params length)
-        """
-
+    def update_gradients(self, dL_dF, X):
        raise NotImplementedError

-    def plot(self, *args):
-        """
-        Plots the mapping associated with the model.
-          - In one dimension, the function is plotted.
-          - In two dimensions, a contour-plot shows the function
-          - In higher dimensions, we've not implemented this yet !TODO!
-
-        Can plot only part of the data and part of the posterior functions
-        using which_data and which_functions
-
-        This is a convenience function: arguments are passed to
-        GPy.plotting.matplot_dep.models_plots.plot_mapping
-        """
-
-        if "matplotlib" in sys.modules:
-            from ..plotting.matplot_dep import models_plots
-            mapping_plots.plot_mapping(self,*args)
-        else:
-            raise NameError, "matplotlib package has not been imported."

 class Bijective_mapping(Mapping):
    """
@ -74,72 +38,4 @@ class Bijective_mapping(Mapping):
        """Inverse mapping from output domain of the function to the inputs."""
        raise NotImplementedError

-from model import Model
-
-class Mapping_check_model(Model):
-    """
-    This is a dummy model class used as a base class for checking that the
-    gradients of a given mapping are implemented correctly. It enables
-    checkgradient() to be called independently on each mapping.
-    """
-    def __init__(self, mapping=None, dL_df=None, X=None):
-        num_samples = 20
-        if mapping==None:
-            mapping = GPy.mapping.linear(1, 1)
-        if X==None:
-            X = np.random.randn(num_samples, mapping.input_dim)
-        if dL_df==None:
-            dL_df = np.ones((num_samples, mapping.output_dim))
-
-        self.mapping=mapping
-        self.X = X
-        self.dL_df = dL_df
-        self.num_params = self.mapping.num_params
-        Model.__init__(self)
-
-
-    def _get_params(self):
-        return self.mapping._get_params()
-
-    def _get_param_names(self):
-        return self.mapping._get_param_names()
-
-    def _set_params(self, x):
-        self.mapping._set_params(x)
-
-    def log_likelihood(self):
-        return (self.dL_df*self.mapping.f(self.X)).sum()
-
-    def _log_likelihood_gradients(self):
-        raise NotImplementedError, "This needs to be implemented to use the Mapping_check_model class."
-
-class Mapping_check_df_dtheta(Mapping_check_model):
-    """This class allows gradient checks for the gradient of a mapping with respect to parameters. """
-    def __init__(self, mapping=None, dL_df=None, X=None):
-        Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X)
-
-    def _log_likelihood_gradients(self):
-        return self.mapping.df_dtheta(self.dL_df, self.X)
-
-
-class Mapping_check_df_dX(Mapping_check_model):
-    """This class allows gradient checks for the gradient of a mapping with respect to X. """
-    def __init__(self, mapping=None, dL_df=None, X=None):
-        Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X)
-
-        if dL_df==None:
-            dL_df = np.ones((self.X.shape[0],self.mapping.output_dim))
-        self.num_params = self.X.shape[0]*self.mapping.input_dim
-
-    def _log_likelihood_gradients(self):
-        return self.mapping.df_dX(self.dL_df, self.X).flatten()
-
-    def _get_param_names(self):
-        return ['X_'  +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
-
-    def _get_params(self):
-        return self.X.flatten()
-
-    def _set_params(self, x):
-        self.X=x.reshape(self.X.shape)

--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -5,7 +5,7 @@
 from .. import likelihoods
 from ..inference import optimization
 from ..util.misc import opt_wrapper
-from parameterization import Parameterized
+from .parameterization import Parameterized
 import multiprocessing as mp
 import numpy as np
 from numpy.linalg.linalg import LinAlgError
@ -13,6 +13,7 @@ import itertools
 import sys
 from .verbose_optimization import VerboseOptimization
 # import numdifftools as ndt
+from functools import reduce

 class Model(Parameterized):
    _fail_count = 0  # Count of failed optimization steps (see objective)
@ -30,7 +31,7 @@ class Model(Parameterized):
        self.add_observer(self.tie, self.tie._parameters_changed_notification, priority=-500)

    def log_likelihood(self):
-        raise NotImplementedError, "this needs to be implemented to use the model class"
+        raise NotImplementedError("this needs to be implemented to use the model class")
    def _log_likelihood_gradients(self):
        return self.gradient.copy()

@ -82,7 +83,7 @@ class Model(Parameterized):
                pool.close()  # signal that no more data coming in
                pool.join()  # wait for all the tasks to complete
            except KeyboardInterrupt:
-                print "Ctrl+c received, terminating and joining pool."
+                print("Ctrl+c received, terminating and joining pool.")
                pool.terminate()
                pool.join()

@ -95,10 +96,10 @@ class Model(Parameterized):
                    self.optimization_runs.append(jobs[i].get())

                if verbose:
-                    print("Optimization restart {0}/{1}, f = {2}".format(i + 1, num_restarts, self.optimization_runs[-1].f_opt))
+                    print(("Optimization restart {0}/{1}, f = {2}".format(i + 1, num_restarts, self.optimization_runs[-1].f_opt)))
            except Exception as e:
                if robust:
-                    print("Warning - optimization restart {0}/{1} failed".format(i + 1, num_restarts))
+                    print(("Warning - optimization restart {0}/{1} failed".format(i + 1, num_restarts)))
                else:
                    raise e

@ -119,7 +120,7 @@ class Model(Parameterized):

        DEPRECATED.
        """
-        raise DeprecationWarning, 'parameters now have default constraints'
+        raise DeprecationWarning('parameters now have default constraints')

    def objective_function(self):
        """
@ -213,14 +214,14 @@ class Model(Parameterized):
            self.obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e10, 1e10)
        return obj_f, self.obj_grads

-    def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, **kwargs):
+    def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, clear_after_finish=False, **kwargs):
        """
        Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.

        kwargs are passed to the optimizer. They can be:

-        :param max_f_eval: maximum number of function evaluations
-        :type max_f_eval: int
+        :param max_iters: maximum number of function evaluations
+        :type max_iters: int
        :messages: True: Display messages during optimisation, "ipython_notebook":
        :type messages: bool"string
        :param optimizer: which optimizer to use (defaults to self.preferred optimizer)
@ -237,10 +238,10 @@ class Model(Parameterized):

        """
        if self.is_fixed or self.size == 0:
-            print 'nothing to optimize'
+            print('nothing to optimize')

        if not self.update_model():
-            print "updates were off, setting updates on again"
+            print("updates were off, setting updates on again")
            self.update_model(True)

        if start == None:
@ -255,7 +256,7 @@ class Model(Parameterized):
        else:
            optimizer = optimization.get_optimizer(optimizer)
            opt = optimizer(start, model=self, max_iters=max_iters, **kwargs)
-                        
+
        with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook) as vo:
            opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
            vo.finish(opt)
@ -305,7 +306,7 @@ class Model(Parameterized):
                    transformed_index = (indices - (~self._fixes_).cumsum())[transformed_index[which[0]]]

                if transformed_index.size == 0:
-                    print "No free parameters to check"
+                    print("No free parameters to check")
                    return

            # just check the global ratio
@ -340,9 +341,9 @@ class Model(Parameterized):
            cols.extend([max(float_len, len(header[i])) for i in range(1, len(header))])
            cols = np.array(cols) + 5
            header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
-            header_string = map(lambda x: '|'.join(x), [header_string])
+            header_string = list(map(lambda x: '|'.join(x), [header_string]))
            separator = '-' * len(header_string[0])
-            print '\n'.join([header_string[0], separator])
+            print('\n'.join([header_string[0], separator]))
            if target_param is None:
                param_index = range(len(x))
                transformed_index = param_index
@ -358,19 +359,24 @@ class Model(Parameterized):
                    transformed_index = param_index

                if param_index.size == 0:
-                    print "No free parameters to check"
+                    print("No free parameters to check")
                    return

            gradient = self._grads(x).copy()
            np.where(gradient == 0, 1e-312, gradient)
            ret = True
-            for nind, xind in itertools.izip(param_index, transformed_index):
+            for nind, xind in zip(param_index, transformed_index):
                xx = x.copy()
                xx[xind] += step
                f1 = self._objective(xx)
                xx[xind] -= 2.*step
                f2 = self._objective(xx)
-                df_ratio = np.abs((f1 - f2) / min(f1, f2))
+                #Avoid divide by zero: compute the ratio only if any of the values exceeds 1e-15 in magnitude, otherwise both values are essentially
+                #the same
+                if f1 > 1e-15 or f1 < -1e-15 or f2 > 1e-15 or f2 < -1e-15:
+                    df_ratio = np.abs((f1 - f2) / min(f1, f2))
+                else:
+                    df_ratio = 1.0
                df_unstable = df_ratio < df_tolerance
                numerical_gradient = (f1 - f2) / (2 * step)
                if np.all(gradient[xind] == 0): ratio = (f1 - f2) == gradient[xind]
@ -392,7 +398,7 @@ class Model(Parameterized):
                ng = '%.6f' % float(numerical_gradient)
                df = '%1.e' % float(df_ratio)
                grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}|{5:^{c5}}".format(formatted_name, r, d, g, ng, df, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4], c5=cols[5])
-                print grad_string
+                print(grad_string)

            self.optimizer_array = x
            return ret
@ -402,6 +408,7 @@ class Model(Parameterized):
        model_details = [['<b>Model</b>', self.name + '<br>'],
                         ['<b>Log-likelihood</b>', '{}<br>'.format(float(self.log_likelihood()))],
                         ["<b>Number of Parameters</b>", '{}<br>'.format(self.size)],
+                         ["<b>Number of Optimization Parameters</b>", '{}<br>'.format(self._size_transformed())],
                         ["<b>Updates</b>", '{}<br>'.format(self._update_on)],
                         ]
        from operator import itemgetter
@ -419,6 +426,7 @@ class Model(Parameterized):
        model_details = [['Name', self.name],
                         ['Log-likelihood', '{}'.format(float(self.log_likelihood()))],
                         ["Number of Parameters", '{}'.format(self.size)],
+                         ["Number of Optimization Parameters", '{}'.format(self._size_transformed())],
                         ["Updates", '{}'.format(self._update_on)],
                         ]
        from operator import itemgetter
--- a/GPy/core/parameterization/init.py
+++ b/GPy/core/parameterization/init.py
@ -1,5 +1,5 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from param import Param, ObsAr
-from parameterized import Parameterized
+from .param import Param, ObsAr
+from .parameterized import Parameterized
--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@ -3,7 +3,8 @@

 import numpy
 from numpy.lib.function_base import vectorize
-from lists_and_dicts import IntArrayDict
+from .lists_and_dicts import IntArrayDict
+from functools import reduce

 def extract_properties_to_index(index, props):
    prop_index = dict()
@ -62,12 +63,15 @@ class ParameterIndexOperations(object):
    def __init__(self, constraints=None):
        self._properties = IntArrayDict()
        if constraints is not None:
-            for t, i in constraints.iteritems():
+            #python 3 fix
+            #for t, i in constraints.iteritems():
+            for t, i in constraints.items():
                self.add(t, i)

-    def iteritems(self):
-        return self._properties.iteritems()
-
+    #iteritems has gone in python 3
+    #def iteritems(self):
+    #    return self._properties.iteritems()
+        
    def items(self):
        return self._properties.items()

@ -75,7 +79,7 @@ class ParameterIndexOperations(object):
        return self._properties.keys()

    def iterproperties(self):
-        return self._properties.iterkeys()
+        return iter(self._properties)

    def shift_right(self, start, size):
        for ind in self.iterindices():
@ -83,7 +87,7 @@ class ParameterIndexOperations(object):
            ind[toshift] += size

    def shift_left(self, start, size):
-        for v, ind in self.items():
+        for v, ind in list(self.items()):
            todelete = (ind>=start) * (ind<start+size)
            if todelete.size != 0:
                ind = ind[~todelete]
@ -101,7 +105,11 @@ class ParameterIndexOperations(object):
        return reduce(lambda a,b: a+b.size, self.iterindices(), 0)

    def iterindices(self):
-        return self._properties.itervalues()
+        try:
+            return self._properties.itervalues()
+        except AttributeError:
+            #Changed this from itervalues to values for Py3 compatibility. It didn't break the test suite.
+            return self._properties.values()

    def indices(self):
        return self._properties.values()
@ -150,14 +158,18 @@ class ParameterIndexOperations(object):
        return numpy.array([]).astype(int)

    def update(self, parameter_index_view, offset=0):
-        for i, v in parameter_index_view.iteritems():
+        #py3 fix
+        #for i, v in parameter_index_view.iteritems():
+        for i, v in parameter_index_view.items():
            self.add(i, v+offset)

    def copy(self):
        return self.__deepcopy__(None)

    def __deepcopy__(self, memo):
-        return ParameterIndexOperations(dict(self.iteritems()))
+        #py3 fix
+        #return ParameterIndexOperations(dict(self.iteritems()))
+        return ParameterIndexOperations(dict(self.items()))

    def __getitem__(self, prop):
        return self._properties[prop]
@ -195,22 +207,26 @@ class ParameterIndexOperationsView(object):
    def _filter_index(self, ind):
        return ind[(ind >= self._offset) * (ind < (self._offset + self._size))] - self._offset

-
-    def iteritems(self):
-        for i, ind in self._param_index_ops.iteritems():
+    #iteritems has gone in python 3. It has been renamed items()
+    def items(self):
+        _items_list = list(self._param_index_ops.items())
+        for i, ind in _items_list:
            ind2 = self._filter_index(ind)
            if ind2.size > 0:
                yield i, ind2
-
-    def items(self):
-        return [[i,v] for i,v in self.iteritems()]
+    
+    #Python 3 items() is now implemented as per py2 iteritems
+    #def items(self):
+    #    return [[i,v] for i,v in self.iteritems()]

    def properties(self):
        return [i for i in self.iterproperties()]


    def iterproperties(self):
-        for i, _ in self.iteritems():
+        #py3 fix
+        #for i, _ in self.iteritems():
+        for i, _ in self.items():
            yield i


@ -230,7 +246,9 @@ class ParameterIndexOperationsView(object):


    def iterindices(self):
-        for _, ind in self.iteritems():
+        #py3 fix
+        #for _, ind in self.iteritems():
+        for _, ind in self.items():
            yield ind


@ -286,10 +304,14 @@ class ParameterIndexOperationsView(object):

    def __str__(self, *args, **kwargs):
        import pprint
-        return pprint.pformat(dict(self.iteritems()))
+        #py3 fixes
+        #return pprint.pformat(dict(self.iteritems()))
+        return pprint.pformat(dict(self.items()))

    def update(self, parameter_index_view, offset=0):
-        for i, v in parameter_index_view.iteritems():
+        #py3 fixes
+        #for i, v in parameter_index_view.iteritems():
+        for i, v in parameter_index_view.items():
            self.add(i, v+offset)


@ -297,6 +319,8 @@ class ParameterIndexOperationsView(object):
        return self.__deepcopy__(None)

    def __deepcopy__(self, memo):
-        return ParameterIndexOperations(dict(self.iteritems()))
+        #py3 fix
+        #return ParameterIndexOperations(dict(self.iteritems()))
+        return ParameterIndexOperations(dict(self.items()))
    pass

--- a/GPy/core/parameterization/lists_and_dicts.py
+++ b/GPy/core/parameterization/lists_and_dicts.py
@ -32,7 +32,7 @@ class ArrayList(list):
            if el is item:
                return index
            index += 1
-        raise ValueError, "{} is not in list".format(item)
+        raise ValueError("{} is not in list".format(item))
    pass

 class ObserverList(object):
@ -75,7 +75,7 @@ class ObserverList(object):

    def __str__(self):
        from . import ObsAr, Param
-        from parameter_core import Parameterizable
+        from .parameter_core import Parameterizable
        ret = []
        curr_p = None
        
--- a/GPy/core/parameterization/observable.py
+++ b/GPy/core/parameterization/observable.py
@ -12,7 +12,7 @@ class Observable(object):
    """
    def __init__(self, *args, **kwargs):
        super(Observable, self).__init__()
-        from lists_and_dicts import ObserverList
+        from .lists_and_dicts import ObserverList
        self.observers = ObserverList()
        self._update_on = True

--- a/GPy/core/parameterization/observable_array.py
+++ b/GPy/core/parameterization/observable_array.py
@ -3,8 +3,8 @@


 import numpy as np
-from parameter_core import Pickleable
-from observable import Observable
+from .parameter_core import Pickleable
+from .observable import Observable

 class ObsAr(np.ndarray, Pickleable, Observable):
    """
@ -39,7 +39,7 @@ class ObsAr(np.ndarray, Pickleable, Observable):
        return self.view(np.ndarray)

    def copy(self):
-        from lists_and_dicts import ObserverList
+        from .lists_and_dicts import ObserverList
        memo = {}
        memo[id(self)] = self
        memo[id(self.observers)] = ObserverList()
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@ -4,8 +4,9 @@
 import itertools
 import numpy
 np = numpy
-from parameter_core import Parameterizable, adjust_name_for_printing, Pickleable
-from observable_array import ObsAr
+from .parameter_core import Parameterizable, adjust_name_for_printing, Pickleable
+from .observable_array import ObsAr
+from functools import reduce

 ###### printing
 __constraints_name__ = "Constraint"
@ -156,7 +157,7 @@ class Param(Parameterizable, ObsAr):
    #===========================================================================
    @property
    def is_fixed(self):
-        from transformations import __fixed__
+        from .transformations import __fixed__
        return self.constraints[__fixed__].size == self.size

    def _get_original(self, param):
@ -207,10 +208,14 @@ class Param(Parameterizable, ObsAr):
        return 0
    @property
    def _constraints_str(self):
-        return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.iteritems()))]
+        #py3 fix
+        #return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.iteritems()))]
+        return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.items()))]
    @property
    def _priors_str(self):
-        return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.iteritems()))]
+        #py3 fix
+        #return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.iteritems()))]
+        return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.items()))]
    @property
    def _ties_str(self):
        return ['']
@ -279,7 +284,7 @@ class Param(Parameterizable, ObsAr):
 .tg th{font-family:"Courier New", Courier, monospace !important;font-weight:normal;color:#fff;background-color:#26ADE4;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
 .tg .tg-left{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:left;}
 .tg .tg-right{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:right;}
-</style>"""] + ['<table class="tg">'] + [header] + ["<tr><td class=tg-left>{i}</td><td  class=tg-right>{x}</td><td class=tg-left>{c}</td><td class=tg-left>{p}</td><td class=tg-left>{t}</td></tr>".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)] + ["</table>"])
+</style>"""] + ['<table class="tg">'] + [header] + ["<tr><td class=tg-left>{i}</td><td  class=tg-right>{x}</td><td class=tg-left>{c}</td><td class=tg-left>{p}</td><td class=tg-left>{t}</td></tr>".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in zip(indices, vals, constr_matrix, ties, prirs)] + ["</table>"])

    def __str__(self, constr_matrix=None, indices=None, prirs=None, ties=None, lc=None, lx=None, li=None, lp=None, lt=None, only_name=False):
        filter_ = self._current_slice_
@ -300,7 +305,7 @@ class Param(Parameterizable, ObsAr):
        if only_name: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hierarchy_name(), c=sep*lc, i=sep*li, t=sep*lt, p=sep*lp)  # nice header for printing
        else: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hierarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__)  # nice header for printing
        if not ties: ties = itertools.cycle([''])
-        return "\n".join([header] + ["  {i!s:^{3}s}  |  {x: >{1}.{2}g}  |  {c:^{0}s}  |  {p:^{5}s}  |  {t:^{4}s}  ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)])  # return all the constraints with right indices
+        return "\n".join([header] + ["  {i!s:^{3}s}  |  {x: >{1}.{2}g}  |  {c:^{0}s}  |  {p:^{5}s}  |  {t:^{4}s}  ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in zip(indices, vals, constr_matrix, ties, prirs)])  # return all the constraints with right indices
        # except: return super(Param, self).__str__()

 class ParamConcatenation(object):
@ -313,7 +318,7 @@ class ParamConcatenation(object):
        See :py:class:`GPy.core.parameter.Param` for more details on constraining.
        """
        # self.params = params
-        from lists_and_dicts import ArrayList
+        from .lists_and_dicts import ArrayList
        self.params = ArrayList([])
        for p in params:
            for p in p.flattened_parameters:
@ -336,7 +341,9 @@ class ParamConcatenation(object):
                    level += 1
                    parent = parent._parent_
        import operator
-        self.parents = map(lambda x: x[0], sorted(parents.iteritems(), key=operator.itemgetter(1)))
+        #py3 fix: map() returns a one-shot iterator on Python 3, so materialise the list explicitly
+        #self.parents = map(lambda x: x[0], sorted(parents.iteritems(), key=operator.itemgetter(1)))
+        self.parents = list(map(lambda x: x[0], sorted(parents.items(), key=operator.itemgetter(1))))
    #===========================================================================
    # Get/set items, enable broadcasting
    #===========================================================================
@ -429,14 +436,14 @@ class ParamConcatenation(object):
        params = self.params
        constr_matrices, ties_matrices, prior_matrices = zip(*map(f, params))
        indices = [p._indices() for p in params]
-        lc = max([p._max_len_names(cm, __constraints_name__) for p, cm in itertools.izip(params, constr_matrices)])
+        lc = max([p._max_len_names(cm, __constraints_name__) for p, cm in zip(params, constr_matrices)])
        lx = max([p._max_len_values() for p in params])
-        li = max([p._max_len_index(i) for p, i in itertools.izip(params, indices)])
-        lt = max([p._max_len_names(tm, __tie_name__) for p, tm in itertools.izip(params, ties_matrices)])
-        lp = max([p._max_len_names(pm, __constraints_name__) for p, pm in itertools.izip(params, prior_matrices)])
+        li = max([p._max_len_index(i) for p, i in zip(params, indices)])
+        lt = max([p._max_len_names(tm, __tie_name__) for p, tm in zip(params, ties_matrices)])
+        lp = max([p._max_len_names(pm, __constraints_name__) for p, pm in zip(params, prior_matrices)])
        strings = []
        start = True
-        for p, cm, i, tm, pm in itertools.izip(params,constr_matrices,indices,ties_matrices,prior_matrices):
+        for p, cm, i, tm, pm in zip(params,constr_matrices,indices,ties_matrices,prior_matrices):
            strings.append(p.__str__(constr_matrix=cm, indices=i, prirs=pm, ties=tm, lc=lc, lx=lx, li=li, lp=lp, lt=lt, only_name=(1-start)))
            start = False
        return "\n".join(strings)
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@ -13,11 +13,12 @@ Observable Pattern for patameterization

 """

-from transformations import Transformation,Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED
+from .transformations import Transformation,Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED
 import numpy as np
 import re
 import logging
-from updateable import Updateable
+from .updateable import Updateable
+from functools import reduce

 class HierarchyError(Exception):
    """
@ -36,7 +37,7 @@ def adjust_name_for_printing(name):
        name = name.replace("/", "_l_").replace("@", '_at_')
        name = name.replace("(", "_of_").replace(")", "")
        if re.match(r'^[a-zA-Z_][a-zA-Z0-9-_]*$', name) is None:
-            raise NameError, "name {} converted to {} cannot be further converted to valid python variable name!".format(name2, name)
+            raise NameError("name {} converted to {} cannot be further converted to valid python variable name!".format(name2, name))
        return name
    return ''

@ -65,13 +66,13 @@ class Parentable(object):
        Gets called, when the parent changed, so we can adjust our
        inner attributes according to the new parent.
        """
-        raise NotImplementedError, "shouldnt happen, Parentable objects need to be able to change their parent"
+        raise NotImplementedError("shouldnt happen, Parentable objects need to be able to change their parent")

    def _disconnect_parent(self, *args, **kw):
        """
        Disconnect this object from its parent
        """
-        raise NotImplementedError, "Abstract superclass"
+        raise NotImplementedError("Abstract superclass")

    @property
    def _highest_parent_(self):
@ -109,7 +110,10 @@ class Pickleable(object):
                  it properly.
        :param protocol: pickling protocol to use, python-pickle for details.
        """
-        import cPickle as pickle
+        try: #Py2
+            import cPickle as pickle
+        except ImportError: #Py3
+            import pickle
        if isinstance(f, str):
            with open(f, 'wb') as f:
                pickle.dump(self, f, protocol)
@ -138,9 +142,9 @@ class Pickleable(object):
            which = self
        which.traverse_parents(parents.append) # collect parents
        for p in parents:
-            if not memo.has_key(id(p)):memo[id(p)] = None # set all parents to be None, so they will not be copied
-        if not memo.has_key(id(self.gradient)):memo[id(self.gradient)] = None # reset the gradient
-        if not memo.has_key(id(self._fixes_)):memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent
+            if not id(p) in memo :memo[id(p)] = None # set all parents to be None, so they will not be copied
+        if not id(self.gradient) in memo:memo[id(self.gradient)] = None # reset the gradient
+        if not id(self._fixes_) in memo :memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent
        copy = copy.deepcopy(self, memo) # and start the copy
        copy._parent_index_ = None
        copy._trigger_params_changed()
@ -163,14 +167,16 @@ class Pickleable(object):
                       '_Cacher_wrap__cachers', # never pickle cachers
                       ]
        dc = dict()
-        for k,v in self.__dict__.iteritems():
+        #py3 fix
+        #for k,v in self.__dict__.iteritems():
+        for k,v in self.__dict__.items():
            if k not in ignore_list:
                dc[k] = v
        return dc

    def __setstate__(self, state):
        self.__dict__.update(state)
-        from lists_and_dicts import ObserverList
+        from .lists_and_dicts import ObserverList
        self.observers = ObserverList()
        self._setup_observers()
        self._optimizer_copy_transformed = False
@ -214,7 +220,7 @@ class Gradcheckable(Pickleable, Parentable):
        Perform the checkgrad on the model.
        TODO: this can be done more efficiently, when doing it inside here
        """
-        raise HierarchyError, "This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!"
+        raise HierarchyError("This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!")

 class Nameable(Gradcheckable):
    """
@ -268,7 +274,7 @@ class Indexable(Nameable, Updateable):
    def __init__(self, name, default_constraint=None, *a, **kw):
        super(Indexable, self).__init__(name=name, *a, **kw)
        self._default_constraint_ = default_constraint
-        from index_operations import ParameterIndexOperations
+        from .index_operations import ParameterIndexOperations
        self.constraints = ParameterIndexOperations()
        self.priors = ParameterIndexOperations()
        if self._default_constraint_ is not None:
@ -310,7 +316,7 @@ class Indexable(Nameable, Updateable):
        that is an int array, containing the indexes for the flattened
        param inside this parameterized logic.
        """
-        from param import ParamConcatenation
+        from .param import ParamConcatenation
        if isinstance(param, ParamConcatenation):
            return np.hstack((self._raveled_index_for(p) for p in param.params))
        return param._raveled_index() + self._offset_for(param)
@ -407,7 +413,7 @@ class Indexable(Nameable, Updateable):
        repriorized = self.unset_priors()
        self._add_to_index_operations(self.priors, repriorized, prior, warning)

-        from domains import _REAL, _POSITIVE, _NEGATIVE
+        from .domains import _REAL, _POSITIVE, _NEGATIVE
        if prior.domain is _POSITIVE:
            self.constrain_positive(warning)
        elif prior.domain is _NEGATIVE:
@ -426,7 +432,9 @@ class Indexable(Nameable, Updateable):
        """evaluate the prior"""
        if self.priors.size > 0:
            x = self.param_array
-            return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()), 0)
+            #py3 fix
+            #return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()), 0)
+            return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.items()), 0)
        return 0.

    def _log_prior_gradients(self):
@ -434,7 +442,9 @@ class Indexable(Nameable, Updateable):
        if self.priors.size > 0:
            x = self.param_array
            ret = np.zeros(x.size)
-            [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.iteritems()]
+            #py3 fix
+            #[np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.iteritems()]
+            [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.items()]
            return ret
        return 0.

@ -536,7 +546,7 @@ class Indexable(Nameable, Updateable):
        update the constraints and priors view, so that
        constraining is automized for the parent.
        """
-        from index_operations import ParameterIndexOperationsView
+        from .index_operations import ParameterIndexOperationsView
        #if getattr(self, "_in_init_"):
            #import ipdb;ipdb.set_trace()
            #self.constraints.update(param.constraints, start)
@ -558,7 +568,7 @@ class Indexable(Nameable, Updateable):
        """
        if warning and reconstrained.size > 0:
            # TODO: figure out which parameters have changed and only print those
-            print "WARNING: reconstraining parameters {}".format(self.hierarchy_name() or self.name)
+            print("WARNING: reconstraining parameters {}".format(self.hierarchy_name() or self.name))
        index = self._raveled_index()
        which.add(what, index)
        return index
@ -571,7 +581,7 @@ class Indexable(Nameable, Updateable):
        if len(transforms) == 0:
            transforms = which.properties()
        removed = np.empty((0,), dtype=int)
-        for t in transforms:
+        for t in list(transforms):
            unconstrained = which.remove(t, self._raveled_index())
            removed = np.union1d(removed, unconstrained)
            if t is __fixed__:
@ -612,7 +622,9 @@ class OptimizationHandlable(Indexable):

        if not self._optimizer_copy_transformed:
            self._optimizer_copy_.flat = self.param_array.flat
-            [np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
+            #py3 fix
+            #[np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
+            [np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.items() if c != __fixed__]
            if self.has_parent() and (self.constraints[__fixed__].size != 0 or self._has_ties()):
                fixes = np.ones(self.size).astype(bool)
                fixes[self.constraints[__fixed__]] = FIXED
@ -641,21 +653,25 @@ class OptimizationHandlable(Indexable):
        if f is None:
            self.param_array.flat = p
            [np.put(self.param_array, ind, c.f(self.param_array.flat[ind]))
-             for c, ind in self.constraints.iteritems() if c != __fixed__]
+             #py3 fix
+             #for c, ind in self.constraints.iteritems() if c != __fixed__]
+             for c, ind in self.constraints.items() if c != __fixed__]
        else:
            self.param_array.flat[f] = p
            [np.put(self.param_array, ind[f[ind]], c.f(self.param_array.flat[ind[f[ind]]]))
-             for c, ind in self.constraints.iteritems() if c != __fixed__]
+             #py3 fix
+             #for c, ind in self.constraints.iteritems() if c != __fixed__]
+             for c, ind in self.constraints.items() if c != __fixed__]
        #self._highest_parent_.tie.propagate_val()

        self._optimizer_copy_transformed = False
        self.trigger_update()

    def _get_params_transformed(self):
-        raise DeprecationWarning, "_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!"
+        raise DeprecationWarning("_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!")
 #
    def _set_params_transformed(self, p):
-        raise DeprecationWarning, "_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!"
+        raise DeprecationWarning("_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!")

    def _trigger_params_changed(self, trigger_parent=True):
        """
@ -680,7 +696,9 @@ class OptimizationHandlable(Indexable):
        constraint to it.
        """
        self._highest_parent_.tie.collate_gradient()
-        [np.put(g, i, c.gradfactor(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
+        #py3 fix
+        #[np.put(g, i, c.gradfactor(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
+        [np.put(g, i, c.gradfactor(self.param_array[i], g[i])) for c, i in self.constraints.items() if c != __fixed__]
        if self._has_fixes(): return g[self._fixes_]
        return g

@ -690,7 +708,9 @@ class OptimizationHandlable(Indexable):
        constraint to it.
        """
        self._highest_parent_.tie.collate_gradient()
-        [np.put(g, i, c.gradfactor_non_natural(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
+        #py3 fix
+        #[np.put(g, i, c.gradfactor_non_natural(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
+        [np.put(g, i, c.gradfactor_non_natural(self.param_array[i], g[i])) for c, i in self.constraints.items() if c != __fixed__]
        if self._has_fixes(): return g[self._fixes_]
        return g

@ -701,7 +721,7 @@ class OptimizationHandlable(Indexable):
        Return the number of parameters of this parameter_handle.
        Param objects will always return 0.
        """
-        raise NotImplemented, "Abstract, please implement in respective classes"
+        raise NotImplementedError("Abstract, please implement in respective classes")

    def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True):
        """
@ -750,7 +770,9 @@ class OptimizationHandlable(Indexable):
        self.optimizer_array = x  # makes sure all of the tied parameters get the same init (since there's only one prior object...)
        # now draw from prior where possible
        x = self.param_array.copy()
-        [np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.iteritems() if not p is None]
+        #Py3 fix
+        #[np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.iteritems() if not p is None]
+        [np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.items() if not p is None]
        unfixlist = np.ones((self.size,),dtype=np.bool)
        unfixlist[self.constraints[__fixed__]] = False
        self.param_array.flat[unfixlist] = x.view(np.ndarray).ravel()[unfixlist]
@ -947,7 +969,7 @@ class Parameterizable(OptimizationHandlable):
            self._add_parameter_name(param, ignore_added_names)
        # and makes sure to not delete programmatically added parameters
        for other in self.parameters[::-1]:
-            if other is not param and other.name.startswith(param.name):
+            if other is not param and other.name == param.name:
                warn_and_retry(param, _name_digit.match(other.name))
                return
        if pname not in dir(self):
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@ -1,12 +1,12 @@
 # Copyright (c) 2014, Max Zwiessele, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-
+import six # For metaclass support in Python 2 and 3 simultaneously
 import numpy; np = numpy
 import itertools
 from re import compile, _pattern_type
-from param import ParamConcatenation
-from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
+from .param import ParamConcatenation
+from .parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing

 import logging
 from GPy.core.parameterization.index_operations import ParameterIndexOperationsView
@ -27,6 +27,7 @@ class ParametersChangedMeta(type):
        self.parameters_changed()
        return self

+@six.add_metaclass(ParametersChangedMeta)
 class Parameterized(Parameterizable):
    """
    Parameterized class
@ -73,7 +74,9 @@ class Parameterized(Parameterizable):
    # Metaclass for parameters changed after init.
    # This makes sure, that parameters changed will always be called after __init__
    # **Never** call parameters_changed() yourself
-    __metaclass__ = ParametersChangedMeta
+    #This is ignored in Python 3 -- you need to put the metaclass in the class definition.
+    #__metaclass__ = ParametersChangedMeta
+    #The six module is used to support both Python 2 and 3 simultaneously
    #===========================================================================
    def __init__(self, name=None, parameters=[], *a, **kw):
        super(Parameterized, self).__init__(name=name, *a, **kw)
@ -131,7 +134,7 @@ class Parameterized(Parameterizable):
            if param.has_parent():
                def visit(parent, self):
                    if parent is self:
-                        raise HierarchyError, "You cannot add a parameter twice into the hierarchy"
+                        raise HierarchyError("You cannot add a parameter twice into the hierarchy")
                param.traverse_parents(visit, self)
                param._parent_.unlink_parameter(param)
            # make sure the size is set
@ -173,7 +176,7 @@ class Parameterized(Parameterizable):
                self._highest_parent_._connect_fixes()

        else:
-            raise HierarchyError, """Parameter exists already, try making a copy"""
+            raise HierarchyError("""Parameter exists already, try making a copy""")


    def link_parameters(self, *parameters):
@ -189,9 +192,9 @@ class Parameterized(Parameterizable):
        """
        if not param in self.parameters:
            try:
-                raise RuntimeError, "{} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name)
+                raise RuntimeError("{} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name))
            except AttributeError:
-                raise RuntimeError, "{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param))
+                raise RuntimeError("{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param)))

        start = sum([p.size for p in self.parameters[:param._parent_index_]])
        self._remove_parameter_name(param)
@ -215,9 +218,9 @@ class Parameterized(Parameterizable):
        self._highest_parent_._notify_parent_change()

    def add_parameter(self, *args, **kwargs):
-        raise DeprecationWarning, "add_parameter was renamed to link_parameter to avoid confusion of setting variables, use link_parameter instead"
+        raise DeprecationWarning("add_parameter was renamed to link_parameter to avoid confusion of setting variables, use link_parameter instead")
    def remove_parameter(self, *args, **kwargs):
-        raise DeprecationWarning, "remove_parameter was renamed to unlink_parameter to avoid confusion of setting variables, use unlink_parameter instead"
+        raise DeprecationWarning("remove_parameter was renamed to unlink_parameter to avoid confusion of setting variables, use unlink_parameter instead")

    def _connect_parameters(self, ignore_added_names=False):
        # connect parameterlist to this parameterized object
@ -237,7 +240,7 @@ class Parameterized(Parameterizable):
        self._param_slices_ = []
        for i, p in enumerate(self.parameters):
            if not p.param_array.flags['C_CONTIGUOUS']:
-                raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS"
+                raise ValueError("This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS")

            p._parent_ = self
            p._parent_index_ = i
@ -268,7 +271,7 @@ class Parameterized(Parameterizable):
        """
        if not isinstance(regexp, _pattern_type): regexp = compile(regexp)
        found_params = []
-        for n, p in itertools.izip(self.parameter_names(False, False, True), self.flattened_parameters):
+        for n, p in zip(self.parameter_names(False, False, True), self.flattened_parameters):
            if regexp.match(n) is not None:
                found_params.append(p)
        return found_params
@ -279,7 +282,7 @@ class Parameterized(Parameterizable):
        else:
            if paramlist is None:
                paramlist = self.grep_param_names(name)
-            if len(paramlist) < 1: raise AttributeError, name
+            if len(paramlist) < 1: raise AttributeError(name)
            if len(paramlist) == 1:
                if isinstance(paramlist[-1], Parameterized):
                    paramlist = paramlist[-1].flattened_parameters
@ -295,7 +298,7 @@ class Parameterized(Parameterizable):
            try:
                self.param_array[name] = value
            except:
-                raise ValueError, "Setting by slice or index only allowed with array-like"
+                raise ValueError("Setting by slice or index only allowed with array-like")
            self.trigger_update()
        else:
            try: param = self.__getitem__(name, paramlist)
@ -325,7 +328,7 @@ class Parameterized(Parameterizable):
            self._notify_parent_change()
            self.parameters_changed()
        except Exception as e:
-            print "WARNING: caught exception {!s}, trying to continue".format(e)
+            print("WARNING: caught exception {!s}, trying to continue".format(e))

    def copy(self, memo=None):
        if memo is None:
@ -379,7 +382,7 @@ class Parameterized(Parameterizable):
        pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]])
        format_spec = "<tr><td class=tg-left>{{name:<{0}s}}</td><td class=tg-right>{{desc:>{1}s}}</td><td class=tg-left>{{const:^{2}s}}</td><td class=tg-left>{{pri:^{3}s}}</td><td class=tg-left>{{t:^{4}s}}</td></tr>".format(nl, sl, cl, pl, tl)
        to_print = []
-        for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
+        for n, d, c, t, p in zip(names, desc, constrs, ts, prirs):
            to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
        sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3)
        if header:
@ -414,7 +417,7 @@ class Parameterized(Parameterizable):
        pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]])
        format_spec = "  \033[1m{{name:<{0}s}}\033[0;0m  |  {{desc:>{1}s}}  |  {{const:^{2}s}}  |  {{pri:^{3}s}}  |  {{t:^{4}s}}".format(nl, sl, cl, pl, tl)
        to_print = []
-        for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
+        for n, d, c, t, p in zip(names, desc, constrs, ts, prirs):
            to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
        sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3)
        if header:
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@ -5,7 +5,7 @@
 import numpy as np
 from scipy.special import gammaln, digamma
 from ...util.linalg import pdinv
-from domains import _REAL, _POSITIVE
+from .domains import _REAL, _POSITIVE
 import warnings
 import weakref

@ -15,8 +15,12 @@ class Prior(object):
    _instance = None
    def __new__(cls, *args, **kwargs):
        if not cls._instance or cls._instance.__class__ is not cls:
-            cls._instance = super(Prior, cls).__new__(cls, *args, **kwargs)
-        return cls._instance
+            newfunc = super(Prior, cls).__new__
+            if newfunc is object.__new__:
+                cls._instance = newfunc(cls)  # object.__new__ rejects extra arguments on Python 3
+            else:
+                cls._instance = newfunc(cls, *args, **kwargs)
+        return cls._instance  # outside the `if`, so the cached singleton is returned as well

    def pdf(self, x):
        return np.exp(self.lnpdf(x))
@ -52,7 +56,11 @@ class Gaussian(Prior):
            for instance in cls._instances:
                if instance().mu == mu and instance().sigma == sigma:
                    return instance()
-        o = super(Prior, cls).__new__(cls, mu, sigma)
+        newfunc = super(Prior, cls).__new__
+        if newfunc is object.__new__:
+            o = newfunc(cls)  
+        else:
+            o = newfunc(cls, mu, sigma)            
        cls._instances.append(weakref.ref(o))
        return cls._instances[-1]()

@ -140,7 +148,11 @@ class LogGaussian(Gaussian):
            for instance in cls._instances:
                if instance().mu == mu and instance().sigma == sigma:
                    return instance()
-        o = super(Prior, cls).__new__(cls, mu, sigma)
+        newfunc = super(Prior, cls).__new__
+        if newfunc is object.__new__:
+            o = newfunc(cls)  
+        else:
+            o = newfunc(cls, mu, sigma)
        cls._instances.append(weakref.ref(o))
        return cls._instances[-1]()

@ -258,7 +270,11 @@ class Gamma(Prior):
            for instance in cls._instances:
                if instance().a == a and instance().b == b:
                    return instance()
-        o = super(Prior, cls).__new__(cls, a, b)
+        newfunc = super(Prior, cls).__new__
+        if newfunc is object.__new__:
+            o = newfunc(cls)  
+        else:
+            o = newfunc(cls, a, b)
        cls._instances.append(weakref.ref(o))
        return cls._instances[-1]()

@ -398,7 +414,7 @@ class DGPLVM_KFDA(Prior):
    def compute_cls(self, x):
        cls = {}
        # Appending each data point to its proper class
-        for j in xrange(self.datanum):
+        for j in range(self.datanum):
            class_label = self.get_class_label(self.lbl[j])
            if class_label not in cls:
                cls[class_label] = []
@ -537,7 +553,7 @@ class DGPLVM(Prior):
    def compute_cls(self, x):
        cls = {}
        # Appending each data point to its proper class
-        for j in xrange(self.datanum):
+        for j in range(self.datanum):
            class_label = self.get_class_label(self.lbl[j])
            if class_label not in cls:
                cls[class_label] = []
@ -549,14 +565,14 @@ class DGPLVM(Prior):
        M_i = np.zeros((self.classnum, self.dim))
        for i in cls:
            # Mean of each class
-	    class_i = cls[i]
+            class_i = cls[i]
            M_i[i] = np.mean(class_i, axis=0)
        return M_i

    # Adding data points as tuple to the dictionary so that we can access indices
    def compute_indices(self, x):
        data_idx = {}
-        for j in xrange(self.datanum):
+        for j in range(self.datanum):
            class_label = self.get_class_label(self.lbl[j])
            if class_label not in data_idx:
                data_idx[class_label] = []
@ -575,7 +591,7 @@ class DGPLVM(Prior):
            else:
                lst_idx = []
            # Here we put indices of each class in to the list called lst_idx_all
-            for m in xrange(len(data_idx[i])):
+            for m in range(len(data_idx[i])):
                lst_idx.append(data_idx[i][m][0])
            lst_idx_all.append(lst_idx)
        return lst_idx_all
@ -611,7 +627,7 @@ class DGPLVM(Prior):
            # pdb.set_trace()
            # Calculating Bi
            B_i[i] = (M_i[i] - M_0).reshape(1, self.dim)
-        for k in xrange(self.datanum):
+        for k in range(self.datanum):
            for i in data_idx:
                N_i = float(len(data_idx[i]))
                if k in lst_idx_all[i]:
@ -663,7 +679,7 @@ class DGPLVM(Prior):
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-	Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))

    # This function calculates derivative of the log of prior function
@ -684,7 +700,7 @@ class DGPLVM(Prior):
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-	Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)
@ -990,7 +1006,7 @@ class DGPLVM_T(Prior):
        self.datanum = lbl.shape[0]
        self.x_shape = x_shape
        self.dim = x_shape[1]
-	self.vec = vec
+        self.vec = vec


    def get_class_label(self, y):
@ -1004,7 +1020,7 @@ class DGPLVM_T(Prior):
    def compute_cls(self, x):
        cls = {}
        # Appending each data point to its proper class
-        for j in xrange(self.datanum):
+        for j in range(self.datanum):
            class_label = self.get_class_label(self.lbl[j])
            if class_label not in cls:
                cls[class_label] = []
@ -1024,7 +1040,7 @@ class DGPLVM_T(Prior):
    # Adding data points as tuple to the dictionary so that we can access indices
    def compute_indices(self, x):
        data_idx = {}
-        for j in xrange(self.datanum):
+        for j in range(self.datanum):
            class_label = self.get_class_label(self.lbl[j])
            if class_label not in data_idx:
                data_idx[class_label] = []
@ -1043,7 +1059,7 @@ class DGPLVM_T(Prior):
            else:
                lst_idx = []
            # Here we put indices of each class in to the list called lst_idx_all
-            for m in xrange(len(data_idx[i])):
+            for m in range(len(data_idx[i])):
                lst_idx.append(data_idx[i][m][0])
            lst_idx_all.append(lst_idx)
        return lst_idx_all
@ -1079,7 +1095,7 @@ class DGPLVM_T(Prior):
            # pdb.set_trace()
            # Calculating Bi
            B_i[i] = (M_i[i] - M_0).reshape(1, self.dim)
-        for k in xrange(self.datanum):
+        for k in range(self.datanum):
            for i in data_idx:
                N_i = float(len(data_idx[i]))
                if k in lst_idx_all[i]:
@ -1135,7 +1151,7 @@ class DGPLVM_T(Prior):
        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
 	#print 'SB_inv: ', Sb_inv_N
        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-	Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))

    # This function calculates derivative of the log of prior function
@ -1160,7 +1176,7 @@ class DGPLVM_T(Prior):
        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
 	#print 'SB_inv: ',Sb_inv_N
        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-	Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
+        Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)
--- a/GPy/core/parameterization/ties_and_remappings.py
+++ b/GPy/core/parameterization/ties_and_remappings.py
@ -2,8 +2,8 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from parameterized import Parameterized
-from param import Param
+from .parameterized import Parameterized
+from .param import Param

 class Remapping(Parameterized):
    def mapping(self):
@ -98,7 +98,7 @@ class Tie(Parameterized):
            if np.all(self.label_buf[idx]==0):
                # None of p has been tied before.
                tie_idx = self._expandTieParam(1)
-                print tie_idx
+                print(tie_idx)
                tie_id = self.label_buf.max()+1
                self.label_buf[tie_idx] = tie_id
            else:
@ -185,18 +185,18 @@ class Tie(Parameterized):
    def _check_change(self):
        changed = False
        if self.tied_param is not None:
-            for i in xrange(self.tied_param.size):
+            for i in range(self.tied_param.size):
                b0 = self.label_buf==self.label_buf[self.buf_idx[i]]
                b = self._highest_parent_.param_array[b0]!=self.tied_param[i]
                if b.sum()==0:
-                    print 'XXX'
+                    print('XXX')
                    continue
                elif b.sum()==1:
-                    print '!!!'
+                    print('!!!')
                    val = self._highest_parent_.param_array[b0][b][0]
                    self._highest_parent_.param_array[b0] = val
                else:
-                    print '@@@'
+                    print('@@@')
                    self._highest_parent_.param_array[b0] = self.tied_param[i]
                changed = True
        return changed
@ -212,11 +212,11 @@ class Tie(Parameterized):
        if self.tied_param is not None:
            self.tied_param.gradient = 0.
            [np.put(self.tied_param.gradient, i, self._highest_parent_.gradient[self.label_buf==self.label_buf[self.buf_idx[i]]].sum()) 
-                for i in xrange(self.tied_param.size)]
+                for i in range(self.tied_param.size)]
    
    def propagate_val(self):
        if self.tied_param is not None:
-            for i in xrange(self.tied_param.size):
+            for i in range(self.tied_param.size):
                self._highest_parent_.param_array[self.label_buf==self.label_buf[self.buf_idx[i]]] = self.tied_param[i]


--- a/GPy/core/parameterization/transformations.py
+++ b/GPy/core/parameterization/transformations.py
@ -3,7 +3,7 @@


 import numpy as np
-from domains import _POSITIVE,_NEGATIVE, _BOUNDED
+from .domains import _POSITIVE,_NEGATIVE, _BOUNDED
 import weakref

 import sys
@ -72,7 +72,7 @@ class Logexp(Transformation):
        return np.einsum('i,i->i', df, np.where(f>_lim_val, 1., 1. - np.exp(-f)))
    def initialize(self, f):
        if np.any(f < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
        return np.abs(f)
    def __str__(self):
        return '+ve'
@ -130,7 +130,7 @@ class NormalTheta(Transformation):

    def initialize(self, f):
        if np.any(f[self.var_indices] < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
            f[self.var_indices] = np.abs(f[self.var_indices])
        return f

@ -177,7 +177,7 @@ class NormalNaturalAntti(NormalTheta):

    def initialize(self, f):
        if np.any(f[self.var_indices] < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
            f[self.var_indices] = np.abs(f[self.var_indices])
        return f

@ -220,7 +220,7 @@ class NormalEta(Transformation):

    def initialize(self, f):
        if np.any(f[self.var_indices] < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
            f[self.var_indices] = np.abs(f[self.var_indices])
        return f

@ -360,7 +360,7 @@ class LogexpNeg(Transformation):
        return np.einsum('i,i->i', df, np.where(f>_lim_val, -1, -1 + np.exp(-f)))
    def initialize(self, f):
        if np.any(f < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
        return np.abs(f)
    def __str__(self):
        return '+ve'
@ -412,7 +412,7 @@ class LogexpClipped(Logexp):
        return np.einsum('i,i->i', df, gf) # np.where(f < self.lower, 0, gf)
    def initialize(self, f):
        if np.any(f < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
        return np.abs(f)
    def __str__(self):
        return '+ve_c'
@ -428,7 +428,7 @@ class Exponent(Transformation):
        return np.einsum('i,i->i', df, f)
    def initialize(self, f):
        if np.any(f < 0.):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
        return np.abs(f)
    def __str__(self):
        return '+ve'
@ -468,7 +468,11 @@ class Logistic(Transformation):
            for instance in cls._instances:
                if instance().lower == lower and instance().upper == upper:
                    return instance()
-        o = super(Transformation, cls).__new__(cls, lower, upper, *args, **kwargs)
+        newfunc = super(Transformation, cls).__new__
+        if newfunc is object.__new__:
+            o = newfunc(cls)  
+        else:
+            o = newfunc(cls, lower, upper, *args, **kwargs)
        cls._instances.append(weakref.ref(o))
        return cls._instances[-1]()
    def __init__(self, lower, upper):
@ -486,7 +490,7 @@ class Logistic(Transformation):
        return np.einsum('i,i->i', df, (f - self.lower) * (self.upper - f) / self.difference)
    def initialize(self, f):
        if np.any(np.logical_or(f < self.lower, f > self.upper)):
-            print "Warning: changing parameters to satisfy constraints"
+            print("Warning: changing parameters to satisfy constraints")
        #return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f)
        #FIXME: Max, zeros_like right?
        return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(np.zeros_like(f)), f)
--- a/GPy/core/parameterization/updateable.py
+++ b/GPy/core/parameterization/updateable.py
@ -3,7 +3,7 @@ Created on 11 Nov 2014

@author: maxz
 '''
-from observable import Observable
+from .observable import Observable


 class Updateable(Observable):
@ -35,7 +35,7 @@ class Updateable(Observable):
        self.trigger_update()

    def toggle_update(self):
-        print "deprecated: toggle_update was renamed to update_toggle for easier access"
+        print("deprecated: toggle_update was renamed to update_toggle for easier access")
        self.update_toggle()
    def update_toggle(self):
        self.update_model(not self.update_model())
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@ -5,9 +5,9 @@ Created on 6 Nov 2013
 '''

 import numpy as np
-from parameterized import Parameterized
-from param import Param
-from transformations import Logexp, Logistic,__fixed__
+from .parameterized import Parameterized
+from .param import Param
+from .transformations import Logexp, Logistic,__fixed__
 from GPy.util.misc import param_to_array
 from GPy.util.caching import Cache_this

@ -16,13 +16,13 @@ class VariationalPrior(Parameterized):
        super(VariationalPrior, self).__init__(name=name, **kw)

    def KL_divergence(self, variational_posterior):
-        raise NotImplementedError, "override this for variational inference of latent space"
+        raise NotImplementedError("override this for variational inference of latent space")

    def update_gradients_KL(self, variational_posterior):
        """
        updates the gradients for mean and variance **in place**
        """
-        raise NotImplementedError, "override this for variational inference of latent space"
+        raise NotImplementedError("override this for variational inference of latent space")

 class NormalPrior(VariationalPrior):
    def KL_divergence(self, variational_posterior):
@ -50,31 +50,29 @@ class SpikeAndSlabPrior(VariationalPrior):
    def KL_divergence(self, variational_posterior):
+        """
+        Evaluate the KL term of this prior for the given variational posterior.
+
+        Reads ``mean``, ``variance`` and ``gamma.values`` from
+        ``variational_posterior``. When ``self.pi`` is two-dimensional only the
+        rows of ``pi`` addressed by the posterior's raveled gamma indices are used.
+
+        :param variational_posterior: posterior providing ``mean``, ``variance``
+            and ``gamma`` (presumably a SpikeAndSlabPosterior -- confirm with callers)
+        :returns: the binary (gamma vs. pi) term plus half of the gamma-weighted
+            sum of ``log(variance) - 1 + mu**2/variance + S/variance - log(S)``
+        """
        mu = variational_posterior.mean
        S = variational_posterior.variance
-        gamma,gamma1 = variational_posterior.gamma_probabilities()
-        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+        gamma = variational_posterior.gamma.values
        if len(self.pi.shape)==2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            # Floor division: with Python 3 semantics `/` is true division and
+            # would produce float values that cannot be used to index self.pi.
+            idx = np.unique(variational_posterior.gamma._raveled_index()//gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi
            
        var_mean = np.square(mu)/self.variance
        var_S = (S/self.variance - np.log(S))
-        var_gamma = (gamma*(log_gamma-np.log(pi))).sum()+(gamma1*(log_gamma1-np.log(1-pi))).sum()
+        var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
        return var_gamma+ (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.

    def update_gradients_KL(self, variational_posterior):
        mu = variational_posterior.mean
        S = variational_posterior.variance
-        gamma,gamma1 = variational_posterior.gamma_probabilities()
-        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+        gamma = variational_posterior.gamma.values
        if len(self.pi.shape)==2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            idx = np.unique(variational_posterior.gamma._raveled_index()/gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi

-        variational_posterior.binary_prob.gradient -= (np.log((1-pi)/pi)+log_gamma-log_gamma1+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.)*gamma*gamma1
+        variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
        mu.gradient -= gamma*mu/self.variance
        S.gradient -= (1./self.variance - 1./S) * gamma /2.
        if self.learnPi:
@ -141,7 +139,7 @@ class NormalPosterior(VariationalPosterior):
    holds the means and variances for a factorizing multivariate normal distribution
    '''

-    def plot(self, *args):
+    def plot(self, *args, **kwargs):
        """
        Plot latent space X in 1D:

@ -150,8 +148,7 @@ class NormalPosterior(VariationalPosterior):
        import sys
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ...plotting.matplot_dep import variational_plots
-        import matplotlib
-        return variational_plots.plot(self,*args)
+        return variational_plots.plot(self, *args, **kwargs)

 class SpikeAndSlabPosterior(VariationalPosterior):
    '''
@ -162,24 +159,8 @@ class SpikeAndSlabPosterior(VariationalPosterior):
        binary_prob : the probability of the distribution on the slab part.
        """
        super(SpikeAndSlabPosterior, self).__init__(means, variances, name)
-        self.gamma = Param("binary_prob",binary_prob)
+        self.gamma = Param("binary_prob",binary_prob,Logistic(0.,1.))
        self.link_parameter(self.gamma)
-        
-    @Cache_this(limit=5)
-    def gamma_probabilities(self):
-        prob = np.zeros_like(param_to_array(self.gamma))
-        prob[self.gamma>-710] = 1./(1.+np.exp(-self.gamma[self.gamma>-710]))
-        prob1 = -np.zeros_like(param_to_array(self.gamma))
-        prob1[self.gamma<710] = 1./(1.+np.exp(self.gamma[self.gamma<710]))
-        return prob, prob1
-    
-    @Cache_this(limit=5)
-    def gamma_log_prob(self):
-        loggamma = param_to_array(self.gamma).copy()
-        loggamma[loggamma>-40] = -np.log1p(np.exp(-loggamma[loggamma>-40]))
-        loggamma1 = -param_to_array(self.gamma).copy()
-        loggamma1[loggamma1>-40] = -np.log1p(np.exp(-loggamma1[loggamma1>-40]))
-        return loggamma,loggamma1

    def set_gradients(self, grad):
        self.mean.gradient, self.variance.gradient, self.gamma.gradient = grad
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -2,19 +2,15 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from gp import GP
-from parameterization.param import Param
+from .gp import GP
+from .parameterization.param import Param
 from ..inference.latent_function_inference import var_dtc
 from .. import likelihoods
-from parameterization.variational import VariationalPosterior, NormalPosterior
+from .parameterization.variational import VariationalPosterior, NormalPosterior
 from ..util.linalg import mdot

 import logging
-from GPy.inference.latent_function_inference.posterior import Posterior
-from GPy.inference.optimization.stochastics import SparseGPStochastics,\
-    SparseGPMissing
-#no stochastics.py file added! from GPy.inference.optimization.stochastics import SparseGPStochastics,\
-    #SparseGPMissing
+import itertools
 logger = logging.getLogger("sparse gp")

 class SparseGP(GP):
@ -25,6 +21,10 @@ class SparseGP(GP):
    (Gaussian likelihoods) as well as non-conjugate sparse methods based on
    these.

+    This is not for missing data, as the implementation for missing data involves
+    some inefficient optimization routine decisions.
+    See the missing data SparseGP implementation in :py:class:`~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch`.
+
    :param X: inputs
    :type X: np.ndarray (num_data x input_dim)
    :param likelihood: a likelihood instance, containing the observed data
@ -40,7 +40,7 @@ class SparseGP(GP):

    """

-    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, inference_method=None,
                 name='sparse gp', Y_metadata=None, normalizer=False):
        #pick a sensible inference method
        if inference_method is None:
@ -48,13 +48,13 @@ class SparseGP(GP):
                inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
            else:
                #inference_method = ??
-                raise NotImplementedError, "what to do what to do?"
-            print "defaulting to ", inference_method, "for latent function inference"
+                raise NotImplementedError("what to do what to do?")
+            print("defaulting to ", inference_method, "for latent function inference")

        self.Z = Param('inducing inputs', Z)
        self.num_inducing = Z.shape[0]

-        GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
+        GP.__init__(self, X, Y, kernel, likelihood, mean_function, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)

        logger.info("Adding Z as parameter")
        self.link_parameter(self.Z, index=0)
@ -63,6 +63,14 @@ class SparseGP(GP):
    def has_uncertain_inputs(self):
        return isinstance(self.X, VariationalPosterior)

+    def set_Z(self, Z, trigger_update=True):
+        """
+        Replace the inducing inputs of this model with ``Z``.
+
+        The currently linked ``self.Z`` is unlinked, ``Z`` is wrapped in a new
+        ``Param`` named 'inducing inputs' and linked back in at index 0.
+
+        :param Z: the new inducing inputs
+        :param bool trigger_update: when true, ``update_model(False)`` is called
+            before the swap, and ``update_model(True)`` followed by
+            ``_trigger_params_changed()`` after it; when false only the swap
+            itself is performed.
+        """
+        if trigger_update:
+            self.update_model(False)
+        self.unlink_parameter(self.Z)
+        self.Z = Param('inducing inputs',Z)
+        self.link_parameter(self.Z, index=0)
+        if trigger_update:
+            self.update_model(True)
+            self._trigger_params_changed()
+
    def parameters_changed(self):
        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)

@ -103,15 +111,15 @@ class SparseGP(GP):

    def _raw_predict(self, Xnew, full_cov=False, kern=None):
        """
-        Make a prediction for the latent function values. 
-    
+        Make a prediction for the latent function values.
+
        For certain inputs we give back a full_cov of shape NxN,
        if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is off, 
        we take only the diagonal elements across N.
        
        For uncertain inputs, the SparseGP bound produces a full covariance structure across D, so for full_cov we 
        return a NxDxD matrix and in the not full_cov case, we return the diagonal elements across D (NxD).
-        This is for both with and without missing data.
+        This is for both with and without missing data. For the missing data SparseGP implementation see :py:class:`~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch`.
        """

        if kern is None: kern = self.kern
@ -128,7 +136,16 @@ class SparseGP(GP):
                var = var
            else:
                Kxx = kern.Kdiag(Xnew)
-                var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
+                if self.posterior.woodbury_inv.ndim == 2:
+                    var = Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0)
+                elif self.posterior.woodbury_inv.ndim == 3:
+                    var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
+                    for i in range(var.shape[1]):
+                        var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
+                var = var
+            #add in the mean function
+            if self.mean_function is not None:
+                mu += self.mean_function.f(Xnew)
        else:
            psi0_star = self.kern.psi0(self.Z, Xnew)
            psi1_star = self.kern.psi1(self.Z, Xnew)
@ -158,4 +175,5 @@ class SparseGP(GP):
                    var[i] = var_
                else:
                    var[i] = np.diag(var_)+p0-t2
+
        return mu, var
--- a/GPy/core/sparse_gp_mpi.py
+++ b/GPy/core/sparse_gp_mpi.py
@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from sparse_gp import SparseGP
+from .sparse_gp import SparseGP
 from numpy.linalg.linalg import LinAlgError
 from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch

@ -56,7 +56,7 @@ class SparseGP_MPI(SparseGP):
            self.N_range = (N_start, N_end)
            self.N_list = np.array(N_list)
            self.Y_local = self.Y[N_start:N_end]
-            print 'MPI RANK '+str(self.mpi_comm.rank)+' with the data range '+str(self.N_range)
+            print('MPI RANK '+str(self.mpi_comm.rank)+' with the data range '+str(self.N_range))
            mpi_comm.Bcast(self.param_array, root=0)
        self.update_model(True)

--- a/GPy/core/svgp.py
+++ b/GPy/core/svgp.py
@ -3,13 +3,13 @@

 import numpy as np
 from ..util import choleskies
-from sparse_gp import SparseGP
-from parameterization.param import Param
+from .sparse_gp import SparseGP
+from .parameterization.param import Param
 from ..inference.latent_function_inference import SVGP as svgp_inf


 class SVGP(SparseGP):
-    def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None, batchsize=None):
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None):
        """
        Stochastic Variational GP.

@ -25,25 +25,20 @@ class SVGP(SparseGP):

        Hensman, Matthews and Ghahramani, Scalable Variational GP Classification, ArXiv 1411.2005
        """
-        if batchsize is None:
-            batchsize = X.shape[0]
-
-        self.X_all, self.Y_all = X, Y
-        # how to rescale the batch likelihood in case of minibatches
        self.batchsize = batchsize
-        batch_scale = float(self.X_all.shape[0])/float(self.batchsize)
-        #KL_scale = 1./np.float64(self.mpi_comm.size)
-        KL_scale = 1.0
-
-        import climin.util
-        #Make a climin slicer to make drawing minibatches much quicker. Annoyingly, this doesn;t pickle.
-        self.slicer = climin.util.draw_mini_slices(self.X_all.shape[0], self.batchsize)
-        X_batch, Y_batch = self.new_batch()
+        self.X_all, self.Y_all = X, Y
+        if batchsize is None:
+            X_batch, Y_batch = X, Y
+        else:
+            import climin.util
+            #Make a climin slicer to make drawing minibatches much quicker
+            self.slicer = climin.util.draw_mini_slices(self.X_all.shape[0], self.batchsize)
+            X_batch, Y_batch = self.new_batch()

        #create the SVI inference method
        inf_method = svgp_inf()

-        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
+        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method,
                 name=name, Y_metadata=Y_metadata, normalizer=False)

        self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
@ -53,23 +48,31 @@ class SVGP(SparseGP):
        self.link_parameter(self.m)

    def parameters_changed(self):
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.mean_function, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))

        #update the kernel gradients
        self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
        grad = self.kern.gradient.copy()
        self.kern.update_gradients_full(self.grad_dict['dL_dKmn'], self.Z, self.X)
-        grad += self.kern.gradient
+        grad += self.kern.gradient.copy()
        self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
        self.kern.gradient += grad
        if not self.Z.is_fixed:# only compute these expensive gradients if we need them
            self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) + self.kern.gradients_X(self.grad_dict['dL_dKmn'], self.Z, self.X)

+
        self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
        #update the variational parameter gradients:
        self.m.gradient = self.grad_dict['dL_dm']
        self.chol.gradient = self.grad_dict['dL_dchol']

+        if self.mean_function is not None:
+            self.mean_function.update_gradients(self.grad_dict['dL_dmfX'], self.X)
+            g = self.mean_function.gradient[:].copy()
+            self.mean_function.update_gradients(self.grad_dict['dL_dmfZ'], self.Z)
+            self.mean_function.gradient[:] += g
+            self.Z.gradient[:] += self.mean_function.gradients_X(self.grad_dict['dL_dmfZ'], self.Z)
+
    def set_data(self, X, Y):
        """
        Set the data without calling parameters_changed to avoid wasted computation
--- a/GPy/core/symbolic.py
+++ b/GPy/core/symbolic.py
@ -223,7 +223,7 @@ class Symbolic_core():

    def code_gradients_cacheable(self, function, variable):
        if variable not in self.cacheable:
-            raise RuntimeError, variable + ' must be a cacheable.'
+            raise RuntimeError(variable + ' must be a cacheable.')
        lcode = 'gradients_' + variable + ' = np.zeros_like(' + variable + ')\n'
        lcode += 'self.update_cache(' + ', '.join(self.cacheable) + ')\n'
        for i, theta in enumerate(self.variables[variable]):
--- a/GPy/core/verbose_optimization.py
+++ b/GPy/core/verbose_optimization.py
@ -1,7 +1,7 @@
 # Copyright (c) 2012-2014, Max Zwiessele.
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-
+from __future__ import print_function
 import numpy as np
 import sys
 import time
@ -11,7 +11,7 @@ def exponents(fnow, current_grad):
    return np.sign(exps) * np.log10(exps).astype(int)

 class VerboseOptimization(object):
-    def __init__(self, model, opt, maxiters, verbose=False, current_iteration=0, ipython_notebook=True):
+    def __init__(self, model, opt, maxiters, verbose=False, current_iteration=0, ipython_notebook=True, clear_after_finish=False):
        self.verbose = verbose
        if self.verbose:
            self.model = model
@ -22,55 +22,59 @@ class VerboseOptimization(object):
            self.opt_name = opt.opt_name
            self.model.add_observer(self, self.print_status)
            self.status = 'running'
+            self.clear = clear_after_finish

            self.update()

            try:
                from IPython.display import display
-                from IPython.html.widgets import FloatProgressWidget, HTMLWidget, ContainerWidget
-                self.text = HTMLWidget()
-                self.progress = FloatProgressWidget()
-                self.model_show = HTMLWidget()
+                from IPython.html.widgets import IntProgress, HTML, Box, VBox, HBox, FlexBox
+                self.text = HTML(width='100%')
+                self.progress = IntProgress(min=0, max=maxiters)
+                #self.progresstext = Text(width='100%', disabled=True, value='0/{}'.format(maxiters))
+                self.model_show = HTML()
                self.ipython_notebook = ipython_notebook
            except:
                # Not in Ipython notebook
                self.ipython_notebook = False

            if self.ipython_notebook:
-                self.text.set_css('width', '100%')
-                #self.progress.set_css('width', '100%')
+                left_col = VBox(children=[self.progress, self.text], padding=2, width='40%')
+                right_col = Box(children=[self.model_show], padding=2, width='60%')
+                self.hor_align = FlexBox(children = [left_col, right_col], width='100%', orientation='horizontal')

-                left_col = ContainerWidget(children = [self.progress, self.text])
-                right_col = ContainerWidget(children = [self.model_show])
-                hor_align = ContainerWidget(children = [left_col, right_col])
+                display(self.hor_align)
+                                
+                try:
+                    self.text.set_css('width', '100%')
+                    left_col.set_css({
+                             'padding': '2px',
+                             'width': "100%",
+                             })
+    
+                    right_col.set_css({
+                             'padding': '2px',
+                             })
+    
+                    self.hor_align.set_css({
+                             'width': "100%",
+                             })

-                display(hor_align)
+                    self.hor_align.remove_class('vbox')
+                    self.hor_align.add_class('hbox')
+    
+                    left_col.add_class("box-flex1")
+                    right_col.add_class('box-flex0')

-                left_col.set_css({
-                         'padding': '2px',
-                         'width': "100%",
-                         })
-
-                right_col.set_css({
-                         'padding': '2px',
-                         })
-
-                hor_align.set_css({
-                         'width': "100%",
-                         })
-
-                hor_align.remove_class('vbox')
-                hor_align.add_class('hbox')
-
-                left_col.add_class("box-flex1")
-                right_col.add_class('box-flex0')
+                except Exception:  # set_css/add_class styling is best-effort; ignore failures without trapping KeyboardInterrupt/SystemExit
+                    pass

                #self.text.add_class('box-flex2')
                #self.progress.add_class('box-flex1')
            else:
                self.exps = exponents(self.fnow, self.current_gradient)
-                print 'Running {} Code:'.format(self.opt_name)
-                print ' {3:7s}   {0:{mi}s}   {1:11s}    {2:11s}'.format("i", "f", "|g|", "secs", mi=self.len_maxiters)
+                print('Running {} Code:'.format(self.opt_name))
+                print(' {3:7s}   {0:{mi}s}   {1:11s}    {2:11s}'.format("i", "f", "|g|", "secs", mi=self.len_maxiters))

    def __enter__(self):
        self.start = time.time()
@ -102,7 +106,8 @@ class VerboseOptimization(object):
                html_body += "<td class='tg-right'>{}</td>".format(val)
                html_body += "</tr>"
            self.text.value = html_begin + html_body + html_end
-            self.progress.value = 100*(self.iteration+1)/self.maxiters
+            self.progress.value = (self.iteration+1)
+            #self.progresstext.value = '0/{}'.format((self.iteration+1))
            self.model_show.value = self.model._repr_html_()
        else:
            n_exps = exponents(self.fnow, self.current_gradient)
@ -111,11 +116,11 @@ class VerboseOptimization(object):
                b = np.any(n_exps < self.exps)
                if a or b:
                    self.p_iter = self.iteration
-                    print ''
+                    print('')
                if b:
                    self.exps = n_exps
-            print '\r',
-            print '{3:> 7.2g}  {0:>0{mi}g}  {1:> 12e}  {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), time.time()-self.start, mi=self.len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta, '\r',
+            print('\r', end=' ')
+            print('{3:> 7.2g}  {0:>0{mi}g}  {1:> 12e}  {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), time.time()-self.start, mi=self.len_maxiters), end=' ') # print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta, '\r',
            sys.stdout.flush()

    def print_status(self, me, which=None):
@ -144,7 +149,9 @@ class VerboseOptimization(object):
            self.print_out()

            if not self.ipython_notebook:
-                print ''
-                print 'Optimization finished in {0:.5g} Seconds'.format(self.stop-self.start)
-                print 'Optimization status: {0:.5g}'.format(self.status)
-                print
+                print()
+                print('Optimization finished in {0:.5g} Seconds'.format(self.stop-self.start))
+                print('Optimization status: {0}'.format(self.status))             
+                print()
+            elif self.clear:
+                self.hor_align.close()